def insert_property_from_file(
    library_name,
    library_ver,
    ID_tag,
    filename,
):
    """Store the properties found in an SDF-style file as Property rows.

    Written for JOELib property output: every record in ``filename`` is a
    set of name-value pairs in SDF format that holds the compound ID and
    the property values only.

    :param library_name: library name handed to ``get_library``.
    :param library_ver: library version handed to ``get_library``.
    :param ID_tag: tag whose value is the compound's ``cid``.
    :param filename: input handed to ``sdfiterator.sdf_iter``.

    Nothing is caught here: an error raised while reading the ID tag,
    looking up the compound or looking up the ``PropertyField`` for a tag
    propagates to the caller. Start time, end time and elapsed time are
    printed to stdout.
    """
    import datetime

    begin = datetime.datetime.now()
    print('starts at: %s' % begin)

    library = get_library(library_name, library_ver)
    for sdf in sdfiterator.sdf_iter(filename):
        tagdict = get_sdf_tags(sdf)
        # Popping the ID leaves only the property tags in tagdict.
        cid = tagdict.pop(ID_tag)
        compound = Compound.objects.get(library=library, cid=cid)
        for (tag, value) in tagdict.items():
            field = PropertyField.objects.get(source_tag=tag)
            Property(field=field, value=value, compound=compound).save()

    end = datetime.datetime.now()
    print('finished at: %s' % end)
    print('time lapsed: %s' % (end - begin))
    return
def format_qsar_property_file(
    lib_name,
    lib_ver,
    input,
    output,
    sdffile,
    id_tag,
):
    """Prefix each data row of a QSAR property table with its compound ID.

    Cerius2 does not report the correct cid for some libraries, so the ID
    is taken from the SDF record at the same position instead: row *k*
    (after the first line) of ``input`` is paired with record *k* of
    ``sdffile``.

    :param lib_name: library name handed to ``get_library``.
    :param lib_ver: library version handed to ``get_library``.
    :param input: path of the property table to read.
    :param output: path of the table to write (overwritten).
    :param sdffile: SDF input handed to ``sdfiterator.sdf_iter``.
    :param id_tag: SDF tag whose value is the compound ID.

    ``input`` and ``sdffile`` must hold the same number of compounds. If
    ``input`` runs out first, the remaining rows consist of the ID and a
    tab only, with no line terminator.
    """
    # The result is not used below; the call is kept so that whatever
    # get_library does for an unknown library still happens first.
    get_library(lib_name, lib_ver)

    with open(input) as table:
        with open(output, 'w') as result:
            # The first line of the table is copied through unchanged.
            result.write(table.readline())
            for sdf in sdfiterator.sdf_iter(sdffile):
                cid = get_sdf_tags(sdf)[id_tag]
                # The whole original row is kept; the ID becomes a new
                # leading column rather than replacing the first one.
                result.write('%s\t%s' % (cid, table.readline()))
    return
def list_all_cid_from_sdf(sdffile, ID_tag, outfile):
    """Write the ID of every record in an SDF file, one per line.

    :param sdffile: SDF input handed to ``sdfiterator.sdf_iter``.
    :param ID_tag: SDF tag whose value is the compound ID.
    :param outfile: path of the text file to write (overwritten).

    A record without ``ID_tag`` makes the tag lookup fail; the output file
    is closed before the error propagates.
    """
    with open(outfile, 'w') as fp:
        for sdf in sdfiterator.sdf_iter(sdffile):
            fp.write('%s\n' % get_sdf_tags(sdf)[ID_tag])
    return
def prepare_property_file(
    library_name,
    library_ver,
    ID,
    input,
    output,
):
    """Write one SQL INSERT statement per property found in an SDF-style file.

    Written for JOELib property output: every record in ``input`` is a set
    of name-value pairs in SDF format that holds the compound ID and the
    property values only.

    :param library_name: library name handed to ``get_library``.
    :param library_ver: library version handed to ``get_library``.
    :param ID: tag whose value is the compound's ``cid``.
    :param input: SDF input handed to ``sdfiterator.sdf_iter``.
    :param output: path of the SQL file to write (overwritten).

    A record whose compound cannot be fetched is reported on stdout and
    skipped. A tag with no known ``PropertyField`` is reported and the
    ``KeyError`` is re-raised. Start time, end time and elapsed time are
    printed to stdout.
    """
    import datetime

    begin = datetime.datetime.now()
    print('now begin at: %s' % begin)

    library = get_library(library_name, library_ver)
    # Load every field id once so the inner loop needs no per-tag query.
    fields = dict((f.source_tag, f.id) for f in PropertyField.objects.all())

    with open(output, 'w') as fp:
        for sdf in sdfiterator.sdf_iter(input):
            tagdict = get_sdf_tags(sdf)
            # Popping the ID leaves only the property tags in tagdict.
            cid = tagdict.pop(ID)
            try:
                c_id = Compound.objects.get(library=library, cid=cid).id
            except Exception:
                # Best effort: show what the query matches and move on.
                # Unlike a bare except, Ctrl-C and SystemExit still get out.
                print('-------exception!!!-----------')
                print(Compound.objects.filter(library=library, cid=cid))
                continue
            for (tag, value) in tagdict.items():
                try:
                    field_id = fields[tag]
                except KeyError:
                    print('-------field exception!!!-----------')
                    print('cid:%s, c_id:%s' % (cid, c_id))
                    print('tag:%s ' % tag)
                    raise
                # NOTE(review): value is interpolated without quoting or
                # escaping, so the statement is only valid SQL when the
                # value is a bare numeric literal - confirm for all inputs.
                query = \
                    """INSERT INTO compounddb_property (compound_id, value, field_id) VALUES (%s, %s, %s);""" \
                    % (c_id, value, field_id)
                fp.write('%s\n' % query)

    end = datetime.datetime.now()
    print('finished at: %s' % end)
    print('time lapsed: %s' % (end - begin))
    return
def batch_sdf_to_smiles(sdfs):
    """Convert every record of an SDF string with ``sdf_to_smiles``.

    :param sdfs: string holding the SDF records; it is wrapped in a
        ``StringIO`` and split by ``sdf_iter``.
    :returns: ``(text, err)`` where ``text`` is the concatenation of the
        ``sdf_to_smiles`` results in input order and ``err`` is the number
        of records for which it raised ``InputError``.
    """
    from sdfiterator import sdf_iter
    from cStringIO import StringIO

    # Collect the pieces and join once instead of growing a string with +=.
    pieces = []
    err = 0
    for sdf in sdf_iter(StringIO(sdfs)):
        try:
            pieces.append(sdf_to_smiles(sdf))
        except InputError:
            err += 1
    return (''.join(pieces), err)
def check_or_update_compounds(
    filename,
    library,
    namekey,
    idkey,
    check_only=True,
):
    """Compare an SDF file with a library and optionally build an updated one.

    Meant to be run in check-only mode before updating a library, to find
    out whether any compound has changed.

    :param filename: path of the SDF file; it is read as latin1.
    :param library: existing library whose ``compound_set`` is compared.
    :param namekey: name key handed to ``parse_annotation`` and
        ``insert_single_compound``.
    :param idkey: key of the compound ID in the parsed annotation.
    :param check_only: when true nothing is written and the function
        returns at the first difference. When false a new library is
        created from ``library.header``; changed or new compounds are
        inserted into it and unchanged compounds are linked to it.
    :returns: ``True`` in check-only mode as soon as a record differs from
        the stored SDF or has no compound in ``library``; ``False``
        otherwise. Update mode always returns ``False``.
    """
    new_lib = None
    if not check_only:
        # Updating never touches ``library``; everything goes to a new one.
        new_lib = create_library_w_header(header=library.header)

    with codecs.open(filename, 'r', 'latin1') as handle:
        for sdf in sdfiterator.sdf_iter(handle):
            moldata = parse_annotation(sdf, namekey, idkey)
            cid = moldata[idkey]

            # Only the compound lookup is guarded, so a DoesNotExist raised
            # by one of the later calls is not mistaken for a new compound.
            try:
                compound = library.compound_set.get(cid=cid)
            except Compound.DoesNotExist:
                if check_only:
                    # There is no new library to insert into in this mode;
                    # a compound missing from the library is an update.
                    return True
                insert_single_compound(moldata, sdf, new_lib, namekey, idkey)
                continue

            sdf_old = SDFFile.objects.get(compound=compound).sdffile
            if sdf == sdf_old:
                # Same compound: link it to the new library.
                if not check_only:
                    link_compound_to_library(compound, new_lib)
            elif check_only:
                return True
            else:
                # Changed compound: it is inserted again, not edited.
                insert_single_compound(moldata, sdf, new_lib, namekey, idkey)
    return False
def newdb(self, filepath, dbpath=None):
    """Store every SDF record of ``filepath`` in a new database.

    Records are keyed by their 1-based position, as a string. ``dbpath``
    defaults to ``filepath + '.db'``. When the instance already holds a
    truthy ``self.db`` it is closed before being replaced by the new one.
    """
    target = (filepath + '.db') if dbpath is None else dbpath

    info('opening %s for writing' % target)
    # NOTE(review): ``open`` is whatever this module binds that name to; the
    # 'n' flag suggests a dbm-style opener rather than the builtin - confirm.
    store = open(target, 'n')
    for position, record in enumerate(sdf_iter(filepath), 1):
        key = str(position)
        info(key)
        store[key] = record
    info('writing %s finished' % target)

    if not self.db:
        info('binding to %s' % target)
    else:
        info('rebinding to %s' % target)
        self.db.close()
    self.db = store
def format_sdf_for_qsar(sdffile, output, ID_tag):
    """Rewrite an SDF file so that every record starts with its compound ID.

    Cerius2 uses the first line of a record as its ID, and some SDF files
    leave that line blank. Each output record consists of the ID as first
    line, the rest of the original record up to the ``M  END`` terminator,
    a single data item ``ID_tag`` holding the ID, and the ``$$$$``
    delimiter. All other data items of the input record are dropped.

    :param sdffile: SDF input handed to ``sdfiterator.sdf_iter``.
    :param output: path of the SDF file to write (overwritten).
    :param ID_tag: SDF tag whose value is the compound ID.
    """
    with open(output, 'w') as fp:
        for sdf in sdfiterator.sdf_iter(sdffile):
            cid = get_sdf_tags(sdf)[ID_tag]
            # Drop the original first line and everything from the
            # terminator on. The terminator is 'M', two spaces, 'END'.
            molblock = sdf.split('\n', 1)[1].split('M  END')[0]
            fp.write('%s\n' % cid)
            fp.write(molblock)
            fp.write('M  END\n')
            # A data item is a header line, the value on its own line and
            # a blank line closing the item.
            fp.write('> <%s>\n%s\n\n' % (ID_tag, cid))
            fp.write('$$$$\n')
    return
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Make an SDF smaller by keeping only the MOL part of each record.

Compounds with fewer bonds than the limit are skipped as well.

Arguments: INPUT OUTPUT [LIMIT]
    INPUT   SDF input handed to ``sdf_iter``
    OUTPUT  path of the file to write, or ``-`` for stdout
    LIMIT   minimum number of bonds a record needs to be kept (default 0)
"""
import sys

from sdfiterator import sdf_iter

limit = 0
inp = sys.argv[1]
# Parse the limit before opening the output, so that a malformed limit
# does not leave a truncated output file behind.
if len(sys.argv) == 4:
    limit = int(sys.argv[3])
if sys.argv[2] == '-':
    outp = sys.stdout
else:
    outp = open(sys.argv[2], 'w')

try:
    # NOTE(review): the second argument presumably makes sdf_iter yield the
    # MOL part only, as the module docstring says - confirm in sdfiterator.
    for cnt, record in enumerate(sdf_iter(inp, True)):
        # The bond count is read from columns 3-5 of the fourth line.
        bonds_cnt = int(record.split('\n')[3][3:6])
        if bonds_cnt >= limit:
            outp.write(record)
        else:
            # cnt is the 0-based position of the record in the input.
            sys.stderr.write('skipping %s\n' % cnt)
finally:
    if outp is not sys.stdout:
        outp.close()
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Remove the records with 0 bonds from an SDF file.

Arguments: INPUT OUTPUT
    INPUT   SDF input handed to ``sdf_iter``
    OUTPUT  path of the file to write (overwritten)
"""
import sys

from sdfiterator import sdf_iter

inp = sys.argv[1]
outp = open(sys.argv[2], 'w')
try:
    for cnt, record in enumerate(sdf_iter(inp)):
        # The bond count is read from columns 3-5 of the fourth line.
        bonds_cnt = int(record.split('\n')[3][3:6])
        if bonds_cnt != 0:
            outp.write(record)
        else:
            # cnt is an int, so it is formatted rather than concatenated:
            # 'skipping ' + cnt raised TypeError on the first such record.
            print('skipping %s' % cnt)
finally:
    outp.close()