def Arch2SQL(database, sqlfile, verbose):
    """Dump PDBarch loop data into per-PDB gzipped SQL files under *sqlfile*.

    database: root directory holding 'archobj' and 'superobj' subdirs of
              .archObj files; sqlfile: output root directory;
    verbose:  progress messages to stderr.
    """
    if verbose:
        sys.stderr.write("Retrieving data from {0} ...\n".format(database))

    # The Source row gets its own 00/0000.sql.gz file.
    newsource = Source(name='PDBarch', source="http://www-pdb.org/")
    outdir = os.path.join(os.path.abspath(sqlfile), '00')  # fixed: redundant nested join
    Path.mkdir(outdir)
    sql_fd = gzip.open(os.path.join(outdir, '0000.sql.gz'), 'wb')
    sql_fd.write(start_transaction())
    sql_fd.write(newsource.toSQL())
    sql_fd.write(end_transaction())
    sql_fd.close()

    # Index every .archObj file by the tuple of name fields after the
    # second '_' of its base name (first two fields identify the PDB).
    files_list_by_pdb = {}
    for subdir in ('archobj', 'superobj'):
        for archobjfile in Path.list_files(os.path.join(database, subdir)):
            if archobjfile.endswith('.archObj'):
                base = os.path.splitext(os.path.split(archobjfile)[-1])[0]
                files_list_by_pdb[tuple(base.split('_')[2:])] = archobjfile

    old_pdb = None
    newArchSet = None
    for dofdata in sorted(files_list_by_pdb):
        pdb = dofdata[0] + '_' + dofdata[1]
        if pdb != old_pdb:
            # Flush the previous PDB's file before opening the next one.
            if old_pdb is not None:
                sql_fd.write(newArchSet.toSQL())
                sql_fd.write(end_transaction())
                sql_fd.close()
            outdir = os.path.join(os.path.abspath(sqlfile),
                                  dofdata[0][1:3].lower())
            Path.mkdir(outdir)
            if verbose:
                sys.stderr.write("Retrieving loops from {0} ...\n".format(pdb))
            sql_fd = gzip.open(os.path.join(outdir, pdb + '.sql.gz'), 'wb')
            sql_fd.write(start_transaction())
            if verbose:
                sys.stderr.write("Printing data from {0} ...\n".format(pdb))
            old_pdb = pdb
            newArchSet = Arch(pdb)
        newArchSet.archs = SSpair.load(files_list_by_pdb[dofdata])

    # Flush the last PDB's file. Fixed: the original wrote unconditionally,
    # which crashed (None.toSQL() on an already-closed fd) when no .archObj
    # files were found.
    if newArchSet is not None:
        sql_fd.write(newArchSet.toSQL())
        sql_fd.write(end_transaction())
        sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def TaxID2SQL(database, sqlfile, skip_download, verbose):
    """Convert the (optionally freshly downloaded) TaxID database into a
    single gzipped SQL dump at *sqlfile*."""
    taxid_connect = TaxIDlink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Downloading TaxID database to {0} ...\n".format(database))
        taxid_connect.download()
        newsource = Source(name='taxid', source=taxid_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    elif verbose:
        sys.stderr.write("Using previously downloaded database.\n")

    deferred = []  # rows flagged has_new are emitted after all regular rows
    if verbose:
        sys.stderr.write("Parsing TaxID.\n")
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    for tax_line in taxid_connect.localTaxIDs:
        record = TaxID(inline=tax_line)
        if record.has_new:
            deferred.append(record.toSQL())
        else:
            sql_fd.write(record.toSQL() + "\n")
    sql_fd.write("\n".join(deferred) + "\n")
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def DrugBank2SQL(database, sqlfile, skip_download, verbose):
    """Turn the local DrugBank database into a gzipped SQL dump at *sqlfile*."""
    drugbank_connect = DrugBanklink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write("Downloading drugBank database to {0} ...\n".format(database))
        # NOTE(review): the actual download call is disabled here; only the
        # Source row is registered. Confirm this is intentional.
        # drugbank_connect.download()
        newsource = Source(name='DrugBank', source=drugbank_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    elif verbose:
        sys.stderr.write("Using previously downloaded database.\n")

    if verbose:
        sys.stderr.write("Parsing drugBank.\n")
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    # Uniprot-dependent rows are bracketed by pre/after-delete statements.
    sql_fd.write(Drug.preuniprotdeleted())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    for drg_line in drugbank_connect.localDrugs:
        sql_fd.write(Drug(inline=drg_line).toSQL())
    sql_fd.write(Drug.afteruniprotdeleted())
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def Enzyme2SQL(database, sqlfile, skip_download, verbose):
    """Dump the Enzyme database into a gzipped SQL file; transfer statements
    are appended after every enzyme row."""
    enzyme_connect = Enzymelink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Downloading Enzyme database to {0} ...\n".format(database))
        enzyme_connect.download()
        newsource = Source(name='enzyme', source=enzyme_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    elif verbose:
        sys.stderr.write("Using previously downloaded database.\n")

    if verbose:
        sys.stderr.write("Parsing Enzyme.\n")
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    pending_transfers = []
    for enz_line in enzyme_connect.localEnzymes:
        entry = Enzyme(inline=enz_line)
        sql_fd.write(entry.toSQL())
        # Defer transfer rows until every enzyme row has been written.
        if entry.has_transfers:
            pending_transfers.append(entry.transfered2SQL())
    sql_fd.write("".join(pending_transfers))
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def PDBeChem2SQL(database, sqlfile, skip_download, verbose):
    """Dump the PDBeChem component dictionary (plus the element table) into a
    gzipped SQL file at *sqlfile*."""
    pdbechem_connect = PDBeChemlink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write("Downloading PDBeChem database to {0} ...\n".format(database))
        pdbechem_connect.download()
        newsource = Source(name='PDBeChem', source=pdbechem_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    elif verbose:
        sys.stderr.write("Using previously downloaded database.\n")

    # Parent-less components must be inserted before those referencing a parent.
    noparent_chems = []
    parent_chems = []
    if verbose:
        sys.stderr.write("Parsing PDBeChem.\n")
    for chem_file in pdbechem_connect.localPDBeChems:
        if verbose:
            sys.stderr.write("\tReading {0} ....\n".format(chem_file))
        chem = PDBeChem(chem_file)
        bucket = noparent_chems if chem.parent is None else parent_chems
        bucket.append(chem.toSQL())
    if verbose:
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    for e in element_dic.values():
        sql_fd.write(Element(e.number, e.symbol, e.name).toSQL() + "\n")
    sql_fd.write("\n".join(noparent_chems) + "\n")
    sql_fd.write("\n".join(parent_chems) + "\n")
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def PDBTM2SQL(database, sqlfile, skip_download, verbose):
    """Dump the PDBTM transmembrane database into a gzipped SQL file."""
    pdbtm_connect = PDBTMlink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Downloading PDBTM database to {0} ...\n".format(database))
        # NOTE(review): the download call is disabled here; only the Source
        # row is registered. Confirm this is intentional.
        # pdbtm_connect.download()
        # Fixed: source was mislabeled 'enzyme' (copy-paste from Enzyme2SQL).
        newsource = Source(name='PDBTM', source=pdbtm_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    else:
        if verbose:
            sys.stderr.write("Using previously downloaded database.\n")

    if verbose:
        sys.stderr.write("Parsing PDBTM.\n")
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    # PDB-dependent rows are bracketed by pre/after-delete statements.
    sql_fd.write(TM.prepdbdeleted())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    sql_fd.write(TM.regions2SQL())
    for line in pdbtm_connect.localTM:
        tmdata = TM(inline=line)
        sql_fd.write(tmdata.toSQL())
    sql_fd.write(TM.afterpdbdeleted())
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def SCOP2SQL(database, sqlfile, skip_download, verbose):
    """Dump the SCOP classification (descriptions + relations) into a gzipped
    SQL file at *sqlfile*."""
    scop_connect = SCOPlink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write("Downloading SCOP database to {0} ...\n".format(database))
        scop_connect.download()
        # Fixed: source was mislabeled 'enzyme' (copy-paste from Enzyme2SQL).
        newsource = Source(name='SCOP', source=scop_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    else:
        if verbose:
            sys.stderr.write("Using previously downloaded database.\n")

    if verbose:
        sys.stderr.write("Parsing SCOP.\n")
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    # (Removed an unused 'transfers' list left over from Enzyme2SQL.)
    scop_obj = SCOP()
    for line in scop_connect.descriptions:
        scop_obj.add_description(line.strip())
    for line in scop_connect.relations:
        scop_obj.add_relation(line.strip())
    # PDB-dependent rows are bracketed by pre/after-delete statements.
    sql_fd.write(SCOP.prepdbdeleted())
    sql_fd.write(scop_obj.toSQL())
    sql_fd.write(SCOP.afterpdbdeleted())
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def Uniprot2SQL(database, sqlfile, skip_download, verbose):
    """Dump the Uniprot database into numbered gzipped SQL files, rolling over
    to a new file every 500k entries.

    *sqlfile* is expected to contain a '_' that is replaced by a zero-padded
    3-digit file counter.
    """
    uniprot_connect = Uniprotlink(local=database)
    newsource = None
    # Fixed: the original tested 'options.skip_download', but 'options' is not
    # defined in this scope (NameError) — the parameter is 'skip_download'.
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Downloading Uniprot database to {0} ...\n".format(database))
        uniprot_connect.download()
        newsource = Source(name='uniprot', source=uniprot_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    else:
        if verbose:
            sys.stderr.write("Using previously downloaded database.\n")

    file_counter = 1
    file_sequence = 0
    # '_' in the requested name becomes the zero-padded file counter.
    file_sql_name = sqlfile.replace('_', '{0:03}')
    if verbose:
        sys.stderr.write("Parsing Uniprot.\n")
        sys.stderr.write("Writing {0} ....\n".format(
            file_sql_name.format(file_counter)))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(file_sql_name.format(file_counter), 'wb')
    sql_fd.write(start_transaction())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    for uni_line in uniprot_connect.localUniprots:
        newuni = Uniprot(inline=uni_line)
        if file_sequence > 500000:
            # Roll over to the next numbered output file.
            sql_fd.write(end_transaction())
            sql_fd.close()
            file_sequence = 0
            file_counter += 1
            if verbose:
                sys.stderr.write("Writing {0} ....\n".format(
                    file_sql_name.format(file_counter)))
            sql_fd = gzip.open(file_sql_name.format(file_counter), 'wb')
            sql_fd.write(start_transaction())
        sql_fd.write(newuni.toSQL())
        file_sequence += 1
    # Fixed: the last file was left open without its closing transaction,
    # producing a truncated/invalid final gzip.
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def CDhit2SQL(database, sqlfile, verbose):
    """Convert a CD-HIT clustering output file into a gzipped SQL dump."""
    if verbose:
        sys.stderr.write("Retrieving data from {0} ...\n".format(database))
    clustering = CDhit(database)
    if verbose:
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    sql_fd.write(clustering.toSQL())
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def enrichment2SQL(infile, table, rel, verbose):
    """Print INSERT statements for *table* built from whitespace-separated
    rows of *infile*, wrapped in a transaction, to stdout.

    Column 1 is left unquoted when *rel* is 'scop' or 'go' (numeric ids
    there), otherwise it is single-quoted. Each input row must have at least
    11 columns.
    """
    if verbose:
        sys.stderr.write("Retrieving data from {0} ...\n".format(infile))
        sys.stderr.write("\tTo table {0} ...\n".format(table))
    start_command = "INSERT INTO {0} VALUES ".format(table)
    sys.stdout.write(start_transaction() + "\n")
    # Fixed: the file handle was opened but never closed.
    with open(infile) as fd:
        for line in fd:
            d = line.strip().split()
            d[1] = d[1] if rel in ['scop', 'go'] else "'" + d[1] + "'"
            sys.stdout.write(
                "{0} ({1[0]},{1[1]},'{1[7]}',{1[2]},'{1[8]}','{1[9]}','{1[10]}');\n"
                .format(start_command, d))
    sys.stdout.write(end_transaction())
    if verbose:
        sys.stderr.write("End execution.\n")
def GO2SQL(database, sqlfile, skip_download, verbose):
    """Dump the GO database (terms first, then relation rows, then parent
    rows) into a gzipped SQL file at *sqlfile*."""
    go_connect = GOlink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Downloading GO database to {0} ...\n".format(database))
        go_connect.download()
        newsource = Source(name='GO', source=go_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    elif verbose:
        sys.stderr.write("Using previously downloaded database.\n")

    terms_with_parents = []
    terms_with_relations = []
    if verbose:
        sys.stderr.write("Parsing GO.\n")
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    for go_line in go_connect.localGOs:
        term = GOterm(inline=go_line)
        sql_fd.write(term.toSQL() + "\n")
        # Relation/parent rows are deferred until every term row exists.
        if len(term.relations) > 0:
            terms_with_relations.append(term)
        if len(term.parents) > 0:
            terms_with_parents.append(term)
    for term in terms_with_relations:
        sql_fd.write(term.relations2SQL() + "\n")
    for term in terms_with_parents:
        sql_fd.write(term.parents2SQL() + "\n")
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def PDB2SQL(database, seqdatabase, listfiles, sqlfile, skip_download, verbose):
    """Dump every local PDB entry into a per-PDB gzipped SQL file.

    database:    local PDB mirror directory.
    seqdatabase: PDBseq location (created when downloading).
    listfiles:   optional file listing the subset of PDB files to process
                 (one per line); failures are logged to '<listfiles>.log',
                 or 'PDB2SQL.log' when no list is given.
    sqlfile:     output root directory.

    Per-file parsing errors are logged and skipped so one bad entry does not
    abort the whole run.
    """
    pdb_connect = PDBlink(local=database, PDBseq=seqdatabase)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Syncronizing PDB database to {0} ...\n".format(database))
        pdb_connect.sync_PDB(log_file=os.path.join(database, 'PDB.sync.log'))
        newsource = Source(name='PDB', source=pdb_connect.source)
        if verbose:
            sys.stderr.write(
                "Creating PDBseq in {0} ...\n".format(seqdatabase))
        pdb_connect.make_PDBseq(
            log_file=os.path.join(seqdatabase, 'PDB.seq.log'))
        if verbose:
            sys.stderr.write("Download Finished.\n")
        # The Source row gets its own 00/0000.sql.gz file.
        outdir = os.path.abspath(os.path.join(sqlfile, '00'))
        Path.mkdir(outdir)
        sql_fd = gzip.open(os.path.join(outdir, '0000.sql.gz'), 'wb')
        sql_fd.write(start_transaction())
        sql_fd.write(newsource.toSQL())
        sql_fd.write(end_transaction())
        sql_fd.close()
    else:
        if verbose:
            sys.stderr.write("Using previously downloaded database.\n")

    # Optional subset of files to process.
    files2check = set()
    if listfiles is not None:
        with open(listfiles) as fd:
            for line in fd:
                files2check.add(line.strip())
        logfd = open(listfiles + ".log", "w")
    else:
        logfd = open("PDB2SQL.log", "w")

    import traceback
    try:
        for pdbfile in pdb_connect.localPDBs:
            try:
                if listfiles is not None and pdbfile not in files2check:
                    if len(files2check) == 0:
                        break
                    continue
                if verbose:
                    sys.stderr.write("Working file {0}\n".format(pdbfile))
                newPDB = PDB(pdb_file=pdbfile)
                outsqldir = os.path.join(sqlfile, newPDB.id[1:3].lower())
                Path.mkdir(outsqldir)
                outsqlfile = os.path.join(outsqldir, newPDB.id + '.sql.gz')
                if verbose:
                    sys.stderr.write(
                        "\tOutput SQL file is {0}.\n".format(outsqlfile))
                sql_fd = gzip.open(outsqlfile, 'wb')
                sql_fd.write(start_transaction())
                # Uniprot-dependent rows are bracketed by pre/after-delete
                # statements.
                sql_fd.write(PDB.preuniprotdeleted())
                sql_fd.write(newPDB.toSQL())
                sql_fd.write(PDB.afteruniprotdeleted())
                sql_fd.write(end_transaction())
                sql_fd.close()
            except KeyboardInterrupt:
                raise
            except Exception:
                # Fixed: was a bare 'except:', which also swallowed SystemExit.
                if verbose:
                    sys.stderr.write("\tAn error occurred. Check log file\n")
                SBIglobals.alert(
                    'error', None,
                    '\tAn error occurred for {0} . Check log file'.format(pdbfile))
                logfd.write("FILE {0}\n".format(pdbfile))
                logfd.write(traceback.format_exc())
                logfd.write("\n")
    finally:
        # Fixed: the log file handle was never closed.
        logfd.close()
def DS2SQL(database, looplist, sqlfile, verbose):
    """Parse ArchDB 'DS' classification text files in *database* into one
    gzipped SQL dump per subclass under *sqlfile*.

    looplist: source for readlist(), filtered per subclass.
    The input is a fixed-layout report: section headers switch the parser
    between sequence ('P'), surface ('E'), ramachandran ('R') and secondary
    structure ('S') modes; indented lines under a mode are data rows.
    """
    Path.mkdir(sqlfile)
    for dsfile in Path.list_files(database):
        # Subclass id is the second dot-separated token of the file name.
        subclasstype = os.path.split(dsfile)[-1].split('.')[1]
        classification = Cclass(subclasstype)
        if verbose:
            sys.stderr.write(
                "Retrieving data for subclass {0} ...\n".format(subclasstype))
        loops = readlist(looplist, subclasstype)
        sql_fd = gzip.open(os.path.join(sqlfile, subclasstype + '.sql.gz'),
                           'wb')
        sql_fd.write(start_transaction())
        sql_in = open(dsfile)
        read = False
        for line in sql_in:
            dataline = line.rstrip('\n')
            # Skip separators, blank lines and the protein-code banner.
            if (line.startswith('==') or line.startswith('***')
                    or len(line.strip()) == 0
                    or line.startswith('---- P R O T E I N C O D E ----')):
                continue
            if line.startswith('CONSENSUS & MULTIPLE ALIGNEMENT IN THE'):
                # New subclass block begins; enable data reading.
                data = line.split(':')[-1].strip().split()
                classification.subclasses = Subclass(
                    (data[0].strip(), data[3].strip()), data[4])
                workscls = classification.lastsubclass
                read = True
                continue
            if line.startswith('GLOBAL STATISTICS'):
                read = False
                continue
            if read:
                if line.startswith(' SEQUENCE ALIGNEMENT :'):
                    parse_mode, counter = 'P', 0
                elif line.startswith(' ACCESSIBLE SURFACE ALIGNEMENT :'):
                    parse_mode, counter = 'E', 0
                elif line.startswith(' RAMACHANDRAN :'):
                    parse_mode, counter = 'R', 0
                elif line.startswith(' SECONDARY STRUCTURE :'):
                    parse_mode, counter = 'S', 0
                elif line.startswith('--------- CONSENSUS THORNTON :'):
                    workscls.add_consensus(dataline, 'DS', loops)
                elif line.startswith('--------- CONSENSUS TOPOLOGY'):
                    workscls.add_topology(dataline, 'DS')
                # NOTE(review): the original literal here was split mid-string
                # by formatting loss ("CENTROIDE POLAR COORD. ... :"); matching
                # on the stable prefix — confirm against a real DS file.
                elif line.startswith('CENTROIDE POLAR COORD.'):
                    workscls.add_coordinates(dataline)
                elif line.startswith('--------- RAMACHANDRAN PATTERN :'):
                    workscls.ram_pat = re.sub(
                        r'\(X\)', '', dataline.split(':')[1].strip().strip('.'))
                elif line.startswith('--------- SEQUENCE PATTERN :'):
                    workscls.seq_pat = re.sub(
                        r'\(X\)', '', dataline.split(':')[1].strip().strip('.'))
                elif line.startswith('--------- BURIAL PATTERN :'):
                    workscls.exp_pat = re.sub(
                        r'\(X\)', '', dataline.split(':')[1].strip().strip('.'))
                elif line.startswith(' ') and len(dataline) < 400:
                    # Data row for the current parse_mode ('P' rows create the
                    # loops; the other modes annotate loop #counter in order).
                    if parse_mode == 'P':
                        workscls.loops = Loop(info=dataline)
                    if parse_mode == 'E':
                        workscls.loops[counter].add_surface(info=dataline)
                        counter += 1
                    if parse_mode == 'R':
                        workscls.loops[counter].add_ramachandran(info=dataline)
                        counter += 1
                    if parse_mode == 'S':
                        workscls.loops[counter].add_secondary_str(
                            info=dataline)
                        counter += 1
        sql_fd.write(classification.toSQL('DS'))
        sql_in.close()
        sql_fd.write(end_transaction())
        sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")