def insert_compounds(filename, lib_name, namekey, idkey, lib_ver=0):
    """Read an SDF file and insert every compound record into a library.

    The library is obtained with get_or_create_library(lib_name, lib_ver).
    If you are updating an existing library, use update_compounds() instead.

    The file is decoded as latin1 and split into records at every line that
    starts with '$$$$'.  Each record is parsed with parse_annotation() and
    stored with insert_single_compound().  Any text after the last '$$$$'
    line is ignored.
    """
    library = get_or_create_library(lib_name, lib_ver)
    count = 0
    record = []
    f = open(filename)
    try:
        for line in f:
            line = unicode(line, 'latin1')
            record.append(line)
            if line.startswith('$$$$'):
                # join once per record instead of growing a string per line
                sdffile = u''.join(record)
                moldata = parse_annotation(sdffile, namekey, idkey)
                insert_single_compound(moldata, sdffile, library, namekey,
                                       idkey)
                record = []
                count += 1
    finally:
        # close the input even when a record fails to parse or insert
        f.close()
    root.warning(' --> count:%s' % count)
def gen_joelib_property(sdf):
    """Run JOELib on one SDF record and return the tags of its output.

    The record is written UTF-8 encoded to a temporary .sdf file, converted
    by JOELib's moleculeConversion.sh into a second temporary .sdf file, and
    that output is decoded as UTF-8 and parsed with get_sdf_tags().

    Raises RuntimeError when the conversion command exits non-zero.
    """
    # input file; NamedTemporaryFile removes it when it is closed
    t = tempfile.NamedTemporaryFile(suffix='.sdf')
    try:
        t.write(codecs.encode(sdf, 'utf-8'))
        t.flush()

        # output file; only the path is needed, JOELib writes to it
        (f, out) = tempfile.mkstemp(suffix='.sdf')
        os.close(f)
        try:
            cmd = (
                'JAVA_HOME=/opt/jre/ '
                'JOELIB2=/opt/JOELib2-alpha-20070303/ '
                '/opt/JOELib2-alpha-20070303/moleculeConversion.sh '
                '+d +h -iSDF -osdf "%s" "%s" > /dev/null'
            ) % (t.name, out)
            root.warning(' --> running:%s' % cmd)
            if os.system(cmd) != 0:
                # string exceptions are not valid; raise a real exception
                raise RuntimeError('cannot run JOELib')

            # read raw bytes and decode explicitly
            f = open(out, 'rb')
            try:
                tags = get_sdf_tags(codecs.decode(f.read(), 'utf-8'))
            finally:
                f.close()
        finally:
            # remove the output file on success and on every failure path
            os.unlink(out)
    finally:
        t.close()
    return tags
def parse_annotation(sdf, namekey, idkey):
    """Parse the tags of one SDF record and add derived structure fields.

    Returns the dict produced by get_sdf_tags(sdf) with spaces in the id
    value replaced by '-', plus 'inchi', 'formula', 'weight' and 'smiles'
    entries computed through OpenBabel.  The name entry is set to '' when
    namekey is '__NA__' or is missing from the record.  Records whose id
    contains '(noMol)' are not read as a molecule and get an empty 'smiles'.

    Raises KeyError when idkey is missing from the record and Exception
    when the structure cannot be read.
    """
    moldata = get_sdf_tags(sdf)

    # The id has to be validated before it is indexed; otherwise the lookup
    # below fails first and this diagnostic is never printed.
    if idkey not in moldata:
        print('compound data error: the idkey is not found')
        raise KeyError(idkey)
    moldata[idkey] = moldata[idkey].replace(' ', '-')
    has_structure = '(noMol)' not in moldata[idkey]

    # --- inchi
    inchiconv.SetInAndOutFormats('sdf', 'Inchi')
    mol = openbabel.OBMol()
    if has_structure:
        inchiconv.ReadString(mol, codecs.encode(sdf, 'utf-8'))
        # NOTE(review): the emptiness check is assumed to apply only to
        # records that were actually read -- confirm against the original
        # indentation.
        if mol.Empty():
            root.warning(' --> ERROR on sdf')
            raise Exception

    # --- inchi/formula/weight (formula is taken before hydrogens are added)
    moldata['inchi'] = inchiconv.WriteString(mol).strip()
    moldata['formula'] = mol.GetFormula()
    mol.AddHydrogens()
    moldata['weight'] = str(mol.GetMolWt())

    # --- smiles
    inchiconv.SetInAndOutFormats('sdf', 'smi')
    mol = openbabel.OBMol()
    if has_structure:
        inchiconv.ReadString(mol, codecs.encode(sdf, 'utf-8'))
        moldata['smiles'] = inchiconv.WriteString(mol).strip()
    else:
        moldata['smiles'] = ''

    # --- name: blank when declared unavailable or absent from the record
    if namekey == '__NA__' or namekey not in moldata:
        moldata[namekey] = ''

    return moldata
def do_process(compounds, filep, tags):
    """Generate and store JOELib properties for each compound.

    For every compound the stored SDF text is fetched, passed through
    gen_joelib_property() and the result handed to _insert_property().
    Any failure is logged with the compound id and then re-raised.
    `filep` is accepted but not used here.
    """
    for compound in compounds:
        sdf_text = SDFFile.objects.get(compound=compound).sdffile
        print(' running compound:%s' % compound.id)
        try:
            _insert_property(compound, gen_joelib_property(sdf_text), tags)
        except:
            # log which compound failed, then let the error propagate
            root.warning(' --> cannot run property on %s' % compound.id)
            raise
def insert_single_compound(moldata, sdf, library, namekey, idkey,
                           username=''):
    """Store one parsed compound, its SDF text and its annotations.

    Creates a Compound from `moldata` (only the first line of the name is
    kept), attaches it to `library`, saves the raw SDF as an SDFFile and
    saves every remaining entry of `moldata` -- everything except the name
    and id entries -- as an Annotation.  Always returns False.
    """
    compound_cid = moldata[idkey]
    display_name = moldata[namekey]
    if '\n' in display_name:
        display_name = display_name[:display_name.index('\n')]

    # compound
    compound = Compound(
        cid=compound_cid,
        name=display_name,
        formula=moldata['formula'],
        weight=moldata['weight'],
        inchi=moldata['inchi'],
        smiles=moldata['smiles'],
        username=username,
    )
    compound.save()
    compound.library.add(library)
    root.warning(' -->new compound inserted: c_id=%s, cid=%s'
                 % (compound.id, compound_cid))

    # sdf file
    SDFFile(sdffile=sdf, compound=compound).save()

    # annotation: one row per tag other than the name and id tags
    for (tag, value) in moldata.items():
        if tag != namekey and tag != idkey:
            Annotation(name=tag, value=value, compound=compound).save()
    return False
def insert_plate_from_db(library_name, library_ver, format, plate_key,
                         plate_base, well_key, row_key, col_key):
    """Create Plate rows from annotations already stored in the database.

    Use this when there is no separate plate-mapping file and the plate
    information came in with the SDF annotations.

    'library_name', 'library_ver': identify the library
    'format': 24/48/96/384
    'plate_key': required, name of the annotation holding the plate
    'plate_base': '__NA__', or a prefix such as 'LAT' to be removed from the
        plate value before it is converted to an integer
    'well_key': name of the annotation holding the well, or '__NA__'
    'row_key', 'col_key': used only when well_key is '__NA__'; the well is
        then built as row + two-character column

    e.g.:
    insert_plate_from_db('Latca', 0, 96, 'PLATE', 'LAT', '__NA__',
                         'ROW', 'COL')
    """
    library = get_library(library_name, library_ver)
    compounds = Compound.objects.filter(library=library)
    format = int(format)
    count = 0
    for compound in compounds:
        plate = Annotation.objects.get(compound=compound,
                                       name=plate_key).value
        # compare by value: identity of equal strings is not guaranteed
        if plate_base != '__NA__':
            try:
                # remove plate_base as a prefix; str.lstrip() would strip
                # any leading characters found in plate_base instead
                stripped = plate
                if stripped.startswith(plate_base):
                    stripped = stripped[len(plate_base):]
                plate = int(stripped)
            except (AttributeError, TypeError, ValueError):
                # best effort: keep the raw value, int() below decides
                root.warning(' --> plate:%s, base:%s' % (plate, plate_base))

        if well_key != '__NA__':
            well = Annotation.objects.get(compound=compound,
                                          name=well_key).value
        else:
            row = Annotation.objects.get(compound=compound,
                                         name=row_key).value
            col = Annotation.objects.get(compound=compound,
                                         name=col_key).value
            if len(col) == 1:
                col = '0%s' % col
            well = '%s%s' % (row, col)

        try:
            p = Plate(format=format, plate=int(plate), well=well,
                      compound=compound)
            p.save()
        except Exception:
            # report where the import stopped before propagating the error
            root.warning(' --> count:%s, compound:%s' % (count, compound))
            raise
        count += 1
        root.warning(' --> count:%s, compound:%s' % (count, compound))
    root.warning(' --> total count:%s' % count)
    return
def parse_annotation(sdf, namekeys):
    """Parse one SDF record into a dict of tags plus derived fields.

    On top of the tags returned by get_sdf_tags(), the result carries
    'inchi', 'formula', 'id' (the molecule title, or 'unspecified_' plus
    six random digits when the title is empty), 'weight', 'name' (the value
    of the first key of `namekeys` present in the record, '' when no name
    is available) and 'smiles'.

    Raises Exception when OpenBabel reads an empty molecule.
    """
    moldata = get_sdf_tags(sdf)

    # --- first pass: sdf -> InChI, also yields formula, title and weight
    inchiconv.SetInAndOutFormats('sdf', 'Inchi')
    inchi_mol = openbabel.OBMol()
    inchiconv.ReadString(inchi_mol, sdf)
    if inchi_mol.Empty():
        root.warning(' --> ERROR on sdf')
        raise Exception

    moldata['inchi'] = inchiconv.WriteString(inchi_mol).strip()
    moldata['formula'] = inchi_mol.GetFormula()
    title = inchi_mol.GetTitle()
    if title == '':
        title = 'unspecified_%s' % ''.join(random.sample(string.digits, 6))
    moldata['id'] = title
    inchi_mol.AddHydrogens()
    moldata['weight'] = str(inchi_mol.GetMolWt())

    # --- display name: first candidate key found in the record wins
    found = [key for key in namekeys if key in moldata]
    if found:
        moldata['name'] = moldata[found[0]]
    if 'name' not in moldata or moldata['name'] is None:
        moldata['name'] = ''

    # --- second pass: sdf -> SMILES
    inchiconv.SetInAndOutFormats('sdf', 'smi')
    smiles_mol = openbabel.OBMol()
    inchiconv.ReadString(smiles_mol, sdf)
    if smiles_mol.Empty():
        root.warning(' --> ERROR on sdf')
        raise Exception
    moldata['smiles'] = inchiconv.WriteString(smiles_mol).strip()
    return moldata
def fast_insert_plate_from_file(library_name, library_ver, platefile,
                                format):
    """Create Plate rows from a comma-separated plate-mapping file.

    'library_name', 'library_ver': identify the library
    'platefile': one mapping per line:
        1st column: compound id
        2nd column: plate (integer)
        3rd column: well
    'format': 24/48/96/384

    Lines whose compound id is 'NA' and blank lines are skipped.  A failed
    insert prints the current count and compound id, then re-raises.
    """
    format = int(format)
    library = get_library(library_name, library_ver)
    count = 0
    fp = open(platefile)
    try:
        for line in fp:
            line = line.strip()
            if not line:
                # tolerate empty lines instead of failing on the unpack
                continue
            (cid, plate, well) = line.split(',')[0:3]
            if cid == 'NA':
                continue
            plate = int(plate)
            compound = Compound.objects.get(library=library, cid=cid)
            try:
                p = Plate(format=format, plate=plate, well=well,
                          compound=compound)
                p.save()
            except Exception:
                print(' --> count:%s, cid:%s' % (count, cid))
                raise
            count += 1
    finally:
        # close the mapping file on every exit path
        fp.close()
    root.warning(' --> count:%s' % (count, ))
    return
def parse_annotation(sdf, namekey):
    """Parse one SDF record into a dict of tags plus derived fields.

    Adds 'inchi', 'formula', 'id' (the molecule title, or 'unspecified_'
    plus six random digits when the title is empty), 'weight' and 'smiles'
    to the tags returned by get_sdf_tags(), and makes sure `namekey` is
    present (as '' when the record does not carry it).

    Raises Exception when OpenBabel reads an empty molecule.
    """
    moldata = get_sdf_tags(sdf)

    # --- InChI conversion; the same molecule gives formula/title/weight
    inchiconv.SetInAndOutFormats('sdf', 'Inchi')
    parsed = openbabel.OBMol()
    inchiconv.ReadString(parsed, sdf)
    if parsed.Empty():
        root.warning(' --> ERROR on sdf')
        raise Exception

    moldata['inchi'] = inchiconv.WriteString(parsed).strip()
    moldata['formula'] = parsed.GetFormula()
    compound_id = parsed.GetTitle()
    if compound_id == '':
        random_digits = ''.join(random.sample(string.digits, 6))
        compound_id = 'unspecified_' + random_digits
    moldata['id'] = compound_id
    parsed.AddHydrogens()
    moldata['weight'] = str(parsed.GetMolWt())

    # --- name falls back to an empty string
    moldata.setdefault(namekey, '')

    # --- SMILES conversion on a fresh molecule
    inchiconv.SetInAndOutFormats('sdf', 'smi')
    parsed = openbabel.OBMol()
    inchiconv.ReadString(parsed, sdf)
    if parsed.Empty():
        root.warning(' --> ERROR on sdf')
        raise Exception
    moldata['smiles'] = inchiconv.WriteString(parsed).strip()
    return moldata
def insert_single_property_field(name, description, is_integer, source_tag):
    """Create and save one PropertyField row and log its id.

    `is_integer` is reduced to a boolean by truthiness before it is stored.
    Always returns False.
    """
    # NOTE(review): any non-empty string is truthy, including '0' and
    # 'False' -- confirm what callers pass here.
    field = PropertyField(
        name=name,
        description=description,
        is_integer=bool(is_integer),
        source_tag=source_tag,
    )
    field.save()
    root.warning(' --> id=%s' % field.id)
    return False
def parse_annotation(sdf, namekey):
    """Parse one SDF record into a dict of tags plus derived fields.

    The record is UTF-8 encoded before it is handed to OpenBabel.  The
    result holds the tags from get_sdf_tags() together with 'inchi',
    'formula', 'id' (the molecule title, or 'unspecified_' plus six random
    digits when the title is empty), 'weight' and 'smiles'; `namekey` is
    set to '' when the record has no such tag.

    Raises Exception when OpenBabel reads an empty molecule.
    """
    moldata = get_sdf_tags(sdf)

    # --- inchi / formula / id / weight
    inchiconv.SetInAndOutFormats('sdf', 'Inchi')
    structure = openbabel.OBMol()
    inchiconv.ReadString(structure, codecs.encode(sdf, 'utf-8'))
    if structure.Empty():
        root.warning(' --> ERROR on sdf')
        raise Exception

    moldata['inchi'] = inchiconv.WriteString(structure).strip()
    moldata['formula'] = structure.GetFormula()
    moldata['id'] = structure.GetTitle()
    if moldata['id'] == '':
        suffix = ''.join(random.sample(string.digits, 6))
        moldata['id'] = 'unspecified_%s' % suffix
    structure.AddHydrogens()
    moldata['weight'] = str(structure.GetMolWt())

    # --- the name tag is optional in the record
    if namekey not in moldata:
        moldata[namekey] = ''

    # --- smiles
    inchiconv.SetInAndOutFormats('sdf', 'smi')
    structure = openbabel.OBMol()
    inchiconv.ReadString(structure, codecs.encode(sdf, 'utf-8'))
    if structure.Empty():
        root.warning(' --> ERROR on sdf')
        raise Exception
    moldata['smiles'] = inchiconv.WriteString(structure).strip()
    return moldata
def update_compounds(filename, library_name, version, namekey, idkey):
    """Update an existing library from an SDF file when it has changes.

    check_or_update_compounds() is first run with check_only=True; only if
    it reports an update is it run again with check_only=False.
    """
    library = get_library(library_name, version)
    has_update = check_or_update_compounds(filename, library, namekey,
                                           idkey, check_only=True)
    if not has_update:
        root.warning(' --> : No compound update found.')
        return
    root.warning(' --> : compound update found. Will update now.')
    check_or_update_compounds(filename, library, namekey, idkey,
                              check_only=False)
def create_library_w_header(header):
    """Create, save and return a new Library for `header`.

    The version is 0 when the header has no library yet, otherwise the
    version of the latest existing library plus one.
    """
    existing = header.library_set.all()
    if len(existing) > 0:
        next_version = existing.latest().version + 1
    else:
        # first library under this header
        next_version = 0
    created = Library(header=header, version=next_version)
    created.save()
    root.warning(' --> : New library created: %s' % created)
    return created
def insert_property_fields(jfile):
    """Create PropertyField rows from a comma-separated definition file.

    file format: NAME, DESCRIPTION, IS_INTEGER?, SOURCE_TAG

    Each line is passed to insert_single_property_field().  Blank lines are
    skipped.  A line with fewer than four columns logs its line number and
    raises ValueError.  Always returns False.
    """
    fp = open(jfile)
    try:
        counter = 0
        for line in fp:
            counter += 1
            line = line.strip()
            if not line:
                continue
            try:
                (name, description, is_integer, source_tag) = \
                    line.split(',')[0:4]
            except ValueError:
                # too few columns: report the offending line number
                root.warning(' --> line %s' % counter)
                raise
            insert_single_property_field(name, description, is_integer,
                                         source_tag)
    finally:
        # close the definition file on every exit path
        fp.close()
    return False
def gen_joelib_property(sdf):
    """Run JOELib on one SDF record and return the tags of its output.

    The record is written UTF-8 encoded to a temporary .sdf file, converted
    by the moleculeConversion.sh script under cur_dir/support into a second
    temporary .sdf file, and that output is decoded as UTF-8 and parsed
    with get_sdf_tags().  JAVA_HOME depends on the host name: dedicated
    paths on 'chemmineweb' and 'biocluster', the JRE under cur_dir/support
    everywhere else.

    Raises RuntimeError when the conversion command exits non-zero.
    """
    # pick the Java runtime for this host
    hostname = os.uname()[1]
    if hostname == 'chemmineweb':
        java_home = '/usr/lib/jvm/java-6-sun'
    elif hostname == 'biocluster':
        java_home = '/usr/lib/jvm/java-1.5.0-sun-1.5.0.14/'
    else:
        java_home = '%s/support/jre1.6.0_06/' % cur_dir
    joelib_dir = '%s/support/JOELib2-alpha-20070303' % cur_dir

    # input file; NamedTemporaryFile removes it when it is closed
    t = tempfile.NamedTemporaryFile(suffix='.sdf')
    try:
        t.write(codecs.encode(sdf, 'utf-8'))
        t.flush()

        # output file; only the path is needed, JOELib writes to it
        (f, out) = tempfile.mkstemp(suffix='.sdf')
        os.close(f)
        try:
            cmd = (
                'JAVA_HOME=%s JOELIB2=%s/ %s/moleculeConversion.sh '
                '+d +h -iSDF -osdf "%s" "%s" > /dev/null'
            ) % (java_home, joelib_dir, joelib_dir, t.name, out)
            root.warning(' --> running:%s' % cmd)
            if os.system(cmd) != 0:
                # string exceptions are not valid; raise a real exception
                raise RuntimeError('cannot run JOELib')

            # read raw bytes and decode explicitly
            f = open(out, 'rb')
            try:
                tags = get_sdf_tags(codecs.decode(f.read(), 'utf-8'))
            finally:
                f.close()
        finally:
            # remove the output file on success and on every failure path
            os.unlink(out)
    finally:
        t.close()
    return tags
def insert_single_compound(moldata, sdf, namekey, idkey, user):
    """Store one parsed compound and its SDF text; return the compound id.

    The Compound is built from `moldata` (only the first line of the name
    is kept) and owned by `user`; the raw SDF is saved as an SDFFile that
    points at the new compound.
    """
    compound_cid = moldata[idkey]
    display_name = moldata[namekey]
    if '\n' in display_name:
        display_name = display_name[:display_name.index('\n')]

    # compound
    compound = Compound(
        cid=compound_cid,
        name=display_name,
        formula=moldata['formula'],
        weight=moldata['weight'],
        inchi=moldata['inchi'],
        smiles=moldata['smiles'],
        user=user,
    )
    compound.save()
    root.warning(' -->new compound inserted: c_id=%s, cid=%s'
                 % (compound.id, compound_cid))

    # sdf file
    SDFFile(sdffile=sdf, compound=compound).save()
    return compound.id
plates = [] plate = compound.plate_set.all() for p in plate: plates.append(dict(format=p.format, plate=p.plate, well=p.well)) # --- get duplicates via fingerprint try: fpt = Fingerprint.objects.get(compound=compound) dup_fpts = \ Fingerprint.objects.filter(fingerprint=fpt.fingerprint) if len(dup_fpts) == 1: dup_fpts = [] except: root.warning('failed to get duplicates for %s' % compound) dup_fpts = [] if 'addWorkbench' in request.POST: input_mode = 'view' addToWorkbench(compound=compound, username=request.user.username) matches = None request.user.message_set.create(message='Compound added to workbench' ) return render_to_response('compound.html', dict( libname=library, compound=compound, annotations=annotations, annotations_head=annotations.items()[:11],
def insert_property(library_name, library_ver, offset, limit, filename):
    """Compute and store JOELib properties for a slice of a library.

    Good for small libraries, as it is slow.  For a library with a large
    number of compounds:
        1. use extra scripts to generate Property files
        2. use insert_property_from_file()

    Compounds `offset` .. `offset + limit` of the library are processed in
    batches of at most 1000.  Before each batch the range being worked on
    is written to `filename` + '.progress'.  `filename` itself is opened
    for writing and handed to the per-batch worker.  Always returns False.
    """

    def do_process(compounds, filep, tags):
        # run JOELib on each compound and store the resulting properties
        for cmp in compounds:
            sdf = SDFFile.objects.get(compound=cmp).sdffile
            print(' running compound:%s' % cmp.id)
            try:
                propertydata = gen_joelib_property(sdf)
                _insert_property(cmp, propertydata, tags)
            except:
                root.warning(' --> cannot run property on %s' % cmp.id)
                raise

    library = get_library(library_name, library_ver)
    offset = int(offset)
    limit = int(limit)
    batch_size = 1000

    filep = open(filename, 'w')
    try:
        compounds = library.compound_set.all()
        root.warning(' --> %s compounds to be processed' % len(compounds))
        tags = [row[0] for row in
                PropertyField.objects.values_list('source_tag')]

        end = offset + limit
        while offset < end:
            # never run past the requested end: the last batch is shortened
            cur_limit = min(batch_size, end - offset)
            root.warning(' --> cur_limit:%s' % cur_limit)
            filep_progress = open(filename + '.progress', 'w')
            try:
                filep_progress.write('working on %s - %s\n'
                                     % (offset, offset + cur_limit))
                filep_progress.flush()
                do_process(compounds[offset:offset + cur_limit], filep, tags)
            finally:
                filep_progress.close()
            offset += cur_limit
    finally:
        # close the output file even when a batch fails
        filep.close()
    return False