def builtArchDB(pdblist, pdbdir, outdir):
    pdb_connect = PDBlink(local=pdbdir)
    for pdbinfo in pdblist:
        pdb, chain = pdbinfo
        # PDB files are stored in subdirectories named after the two middle
        # characters of the PDB identifier.
        subdir = pdb[1:3].lower()
        pdbfile = pdb_connect.get_PDB(pdb)
        SBIglobals.alert('verbose', None,
                         'Processing file: {0}'.format(pdbfile))
        archs = archer.build_archs(sourcepdb=pdbfile, chain=chain,
                                   limit_distance=25)
        for archkey in archs:
            if len(archs[archkey]) > 0:
                Path.mkdir(os.path.join(outdir[archkey], subdir))
                Path.mkdir(os.path.join(outdir['STRUC'], subdir))
                for arch in archs[archkey]:
                    pyobjName = "_".join([str(arch.aminoacid_distance),
                                          arch.type,
                                          arch.identifier]) + '.archObj'
                    # Dump the python object and write the structure both in
                    # PDB and JS formats, centered at the origin.
                    arch.dump(os.path.join(outdir[archkey], subdir,
                                           pyobjName))
                    arch.format2file(filename=os.path.join(outdir['STRUC'],
                                                           subdir,
                                                           arch.identifier),
                                     extension='pdb', center=True)
                    arch.format2file(filename=os.path.join(outdir['STRUC'],
                                                           subdir,
                                                           arch.identifier),
                                     extension='js', center=True)
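# Usage sketch for builtArchDB. The paths and (pdb, chain) tuples below are
# hypothetical placeholders; 'outdir' is assumed to map each arch key
# returned by archer.build_archs (e.g. the 'archobj'/'superobj' names used
# elsewhere in this package), plus the 'STRUC' key consumed above, to an
# output directory.
def _example_builtArchDB():
    pdblist = [('1ABC', 'A'), ('2XYZ', 'B')]
    outdir = {'archobj': '/data/archdb/archobj',
              'superobj': '/data/archdb/superobj',
              'STRUC': '/data/archdb/struct'}
    builtArchDB(pdblist, pdbdir='/data/pdb', outdir=outdir)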
def Enzyme2SQL(database, sqlfile, skip_download, verbose):
    enzyme_connect = Enzymelink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Downloading Enzyme database to {0} ...\n".format(database))
        enzyme_connect.download()
        newsource = Source(name='enzyme', source=enzyme_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    else:
        if verbose:
            sys.stderr.write("Using previously downloaded database.\n")
    if verbose:
        sys.stderr.write("Parsing Enzyme.\n")
    if verbose:
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    transfers = []
    for enz_line in enzyme_connect.localEnzymes:
        newenz = Enzyme(inline=enz_line)
        sql_fd.write(newenz.toSQL())
        if newenz.has_transfers:
            transfers.append(newenz.transfered2SQL())
    sql_fd.write("".join(transfers))
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
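# Usage sketch for the *2SQL drivers (hypothetical paths). Most drivers in
# this module share the (database, sqlfile, skip_download, verbose)
# signature: they sync a local copy of their source database unless
# skip_download is set, then write a gzipped SQL transaction.
def _example_enzyme2sql():
    Enzyme2SQL(database='/data/enzyme',
               sqlfile='/data/sql/enzyme.sql.gz',
               skip_download=False,
               verbose=True)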
def SCOP2SQL(database, sqlfile, skip_download, verbose):
    scop_connect = SCOPlink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Downloading SCOP database to {0} ...\n".format(database))
        scop_connect.download()
        newsource = Source(name='scop', source=scop_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    else:
        if verbose:
            sys.stderr.write("Using previously downloaded database.\n")
    if verbose:
        sys.stderr.write("Parsing SCOP.\n")
    if verbose:
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    scop_obj = SCOP()
    for line in scop_connect.descriptions:
        scop_obj.add_description(line.strip())
    for line in scop_connect.relations:
        scop_obj.add_relation(line.strip())
    sql_fd.write(SCOP.prepdbdeleted())
    sql_fd.write(scop_obj.toSQL())
    sql_fd.write(SCOP.afterpdbdeleted())
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def sortarchs(inputdir, outputdir):
    archsdir = outputdir
    Path.mkdir(archsdir)
    sorted_archs = {}
    loop_file_name = os.path.join(archsdir, 'ArchDB.{0}.db')
    loop_split_file_name = os.path.join(archsdir,
                                        'ArchDB.{0}.{1:02d}-{2:02d}.db')
    # Loop-length bins: 0-4, 4-6, 7-13, 14-20 and 21 or more
    # (an end value of 0 means "no upper limit").
    sections_ini = [0, 4, 7, 14, 21]
    sections_end = [4, 6, 13, 20, 0]
    # Group the arch files by type and loop length; file names follow the
    # <length>_<type>_... pattern produced by builtArchDB.
    for archfile in Path.list_files(root=inputdir, pattern='*.archObj'):
        filename = os.path.basename(archfile)
        data = filename.split('_')
        length = int(data[0])
        archtype = data[1]
        sorted_archs.setdefault(archtype, {}).setdefault(length, [])
        sorted_archs[archtype][length].append(archfile)

    for archtype in sorted_archs:
        SBIglobals.alert('verbose', None, "ARCHS: " + archtype + "\n")
        fd = File(loop_file_name.format(archtype), 'w')
        fdp = []
        for x in range(len(sections_ini)):
            fdp.append(File(loop_split_file_name.format(archtype,
                                                        sections_ini[x],
                                                        sections_end[x]),
                            'w'))
        for length in sorted(sorted_archs[archtype]):
            SBIglobals.alert('verbose', None, '\t{0}'.format(length))
            for archfile in sorted_archs[archtype][length]:
                SBIglobals.alert('verbose', None,
                                 '\t\t{0}\n'.format(archfile))
                nsp = Arch.load(archfile)
                fd.descriptor.write(nsp.archtype_format() + "\n")
                # Write the arch into every length bin it belongs to.
                for x in range(len(fdp)):
                    if length >= sections_ini[x] and \
                       (sections_end[x] == 0 or length <= sections_end[x]):
                        fdp[x].descriptor.write(nsp.archtype_format() + "\n")
        fd.close()
        for x in range(len(fdp)):
            fdp[x].close()
def DrugBank2SQL(database, sqlfile, skip_download, verbose):
    drugbank_connect = DrugBanklink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Downloading drugBank database to {0} ...\n".format(database))
        # NOTE: the download call is disabled; a pre-existing local copy of
        # drugBank is expected in `database`.
        # drugbank_connect.download()
        newsource = Source(name='DrugBank', source=drugbank_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    else:
        if verbose:
            sys.stderr.write("Using previously downloaded database.\n")
    if verbose:
        sys.stderr.write("Parsing drugBank.\n")
    if verbose:
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    sql_fd.write(Drug.preuniprotdeleted())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    for drg_line in drugbank_connect.localDrugs:
        newdrg = Drug(inline=drg_line)
        sql_fd.write(newdrg.toSQL())
    sql_fd.write(Drug.afteruniprotdeleted())
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def TaxID2SQL(database, sqlfile, skip_download, verbose):
    taxid_connect = TaxIDlink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Downloading TaxID database to {0} ...\n".format(database))
        taxid_connect.download()
        newsource = Source(name='taxid', source=taxid_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    else:
        if verbose:
            sys.stderr.write("Using previously downloaded database.\n")
    has_new = []
    if verbose:
        sys.stderr.write("Parsing TaxID.\n")
    if verbose:
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    for tax_line in taxid_connect.localTaxIDs:
        newtax = TaxID(inline=tax_line)
        if newtax.has_new:
            has_new.append(newtax.toSQL())
        else:
            sql_fd.write(newtax.toSQL() + "\n")
    sql_fd.write("\n".join(has_new) + "\n")
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def PDBTM2SQL(database, sqlfile, skip_download, verbose):
    pdbtm_connect = PDBTMlink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Downloading PDBTM database to {0} ...\n".format(database))
        # NOTE: the download call is disabled; a pre-existing local copy of
        # PDBTM is expected in `database`.
        # pdbtm_connect.download()
        newsource = Source(name='PDBTM', source=pdbtm_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    else:
        if verbose:
            sys.stderr.write("Using previously downloaded database.\n")
    if verbose:
        sys.stderr.write("Parsing PDBTM.\n")
    if verbose:
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    sql_fd.write(TM.prepdbdeleted())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    sql_fd.write(TM.regions2SQL())
    for line in pdbtm_connect.localTM:
        tmdata = TM(inline=line)
        sql_fd.write(tmdata.toSQL())
    sql_fd.write(TM.afterpdbdeleted())
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def PDBeChem2SQL(database, sqlfile, skip_download, verbose):
    pdbechem_connect = PDBeChemlink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Downloading PDBeChem database to {0} ...\n".format(database))
        pdbechem_connect.download()
        newsource = Source(name='PDBeChem', source=pdbechem_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    else:
        if verbose:
            sys.stderr.write("Using previously downloaded database.\n")
    noparent_chems = []
    parent_chems = []
    if verbose:
        sys.stderr.write("Parsing PDBeChem.\n")
    for chem_file in pdbechem_connect.localPDBeChems:
        if verbose:
            sys.stderr.write("\tReading {0} ....\n".format(chem_file))
        newchem = PDBeChem(chem_file)
        if newchem.parent is None:
            noparent_chems.append(newchem.toSQL())
        else:
            parent_chems.append(newchem.toSQL())
    if verbose:
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    for e in element_dic.values():
        newelement = Element(e.number, e.symbol, e.name)
        sql_fd.write(newelement.toSQL() + "\n")
    sql_fd.write("\n".join(noparent_chems) + "\n")
    sql_fd.write("\n".join(parent_chems) + "\n")
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def __init__(self, local):
    '''
    @param: local
    @pdef:  local directory to store the database. Created if it does not
            exist.
    @ptype: {String}
    '''
    self._local = local
    Path.mkdir(self._local)
def download(self):
    if not self.has_local:
        raise NameError('A local SCOP database directory must be defined.')
    Path.mkdir(self.local)
    urllib.urlretrieve(SCOPftp['desc'], self._desc)
    urllib.urlretrieve(SCOPftp['rel'], self._rel)
    return True
def execute_query(self, query_file, blast_output_file=None,
                  work_directory=os.getcwd()):
    '''
    Execute BLAST given a query sequence.

    @param: query_file
    @pdef:  Fasta file with the query sequence.
    @pdefault: 'QuerySequence'
    @ptype: {String} or {File} or {Fasta}

    @param: blast_output_file
    @pdef:  name of the temporary BLAST output file.
    @pdefault: query_file.prefix + job.pid + .blast.xml.out
    @ptype: {String}

    @param: work_directory
    @pdef:  directory in which the temporary files will be created.
    @pdefault: Current working directory.
    @ptype: {String}

    @raises: {AttributeError} if query_file is multi-fasta.
    @raises: {BlastError} in BLAST execution or output parsing errors.
    @returns: {BlastResult}
    '''
    if isinstance(query_file, (basestring, File)):
        newFasta = Fasta(fasta_file=query_file)
    elif isinstance(query_file, Fasta):
        newFasta = query_file
    else:
        # Guard against unsupported types; otherwise newFasta would be
        # referenced before assignment below.
        raise AttributeError('query_file must be a path, File or Fasta.')
    if newFasta.is_multifasta:
        msg = 'Blasts can only be executed one at a time due to XML ' \
              'output restrictions.'
        raise AttributeError(msg)
    # If the whole sequence is unknown, BLAST would crash.
    newFasta.load()
    query_sequence = newFasta.sequence
    if len(re.sub(r'[Xx]', '', query_sequence.sequence)) == 0:
        SBIg.warn(self, 'Created an empty BlastResult.')
        return BlastResult(query_name=query_sequence.id,
                           query_sequence=query_sequence.sequence)
    Path.mkdir(work_directory)
    file_prefixes = ".".join([newFasta.file.prefix, str(os.getpid())])
    file_prefixes = os.path.join(work_directory, file_prefixes)
    tmp_output = file_prefixes + ".blast.xml.out"
    tmp_output = tmp_output if blast_output_file is None else \
        os.path.join(work_directory, blast_output_file)
    self._execute(input_file=newFasta, output_file=tmp_output)
    blast_result = self._parse_blast(newFasta.sequence.sequence, tmp_output)
    self._clean([tmp_output, ])
    return blast_result
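# Usage sketch for execute_query (hypothetical: assumes 'blast' is an
# already-configured instance of this class pointing at a formatted
# database, and that 'query.fa' holds a single sequence):
#
#   blast_result = blast.execute_query(query_file='query.fa',
#                                      work_directory='/tmp/blast_jobs')
#
# The returned {BlastResult} holds the query name/sequence and the parsed
# hits; see BlastResult for the accessors it actually exposes.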
def download(self):
    if not self.has_local:
        raise NameError('A local GO database directory must be defined.')
    Path.mkdir(self.local)
    destination = os.path.join(self.local, self._gfile)
    urllib.urlretrieve(GOftp['source'], destination)
    self._process()
    return True
def download(self):
    if not self.has_local:
        raise NameError(
            'A local drugBank database directory must be defined.')
    Path.mkdir(self.local)
    urllib.urlretrieve(drugBankftp['targets'], self._target)
    urllib.urlretrieve(drugBankftp['main'], self._main)
    self._process()
    return True
def download(self):
    if not self.has_local:
        raise NameError(
            'A local Enzyme database directory must be defined.')
    Path.mkdir(self.local)
    urllib.urlretrieve(Enzymeftp['dat'], self._dfile)
    urllib.urlretrieve(Enzymeftp['cls'], self._cfile)
    self._process()
    return True
def download(self):
    if not self.has_local:
        raise NameError(
            'A local PDBTM database directory must be defined.')
    Path.mkdir(self.local)
    here = os.getcwd()
    os.chdir(self.local)
    os.system("svn export {0}".format(PDBTMftp['svn']))
    # Return to the original working directory once exported.
    os.chdir(here)
    self._process()
    return True
def Arch2SQL(database, sqlfile, verbose):
    if verbose:
        sys.stderr.write("Retrieving data from {0} ...\n".format(database))
    newsource = Source(name='PDBarch', source="http://www-pdb.org/")
    outdir = os.path.join(os.path.abspath(sqlfile), '00')
    Path.mkdir(outdir)
    sql_fd = gzip.open(os.path.join(outdir, '0000.sql.gz'), 'wb')
    sql_fd.write(start_transaction())
    sql_fd.write(newsource.toSQL())
    sql_fd.write(end_transaction())
    sql_fd.close()

    files_list_by_pdb = {}
    subdirs = ['archobj', 'superobj']
    for subdir in subdirs:
        for archobjfile in Path.list_files(os.path.join(database, subdir)):
            if archobjfile.endswith('.archObj'):
                data = tuple(os.path.splitext(
                    os.path.split(archobjfile)[-1])[0].split('_')[2:])
                files_list_by_pdb[data] = archobjfile

    old_pdb = None
    newArchSet = None
    for dofdata in sorted(files_list_by_pdb):
        pdb = dofdata[0] + '_' + dofdata[1]
        if pdb != old_pdb:
            # Flush the previous PDB before starting a new one.
            if old_pdb is not None:
                sql_fd.write(newArchSet.toSQL())
                sql_fd.write(end_transaction())
                sql_fd.close()
            outdir = os.path.join(os.path.abspath(sqlfile),
                                  dofdata[0][1:3].lower())
            Path.mkdir(outdir)
            if verbose:
                sys.stderr.write(
                    "Retrieving loops from {0} ...\n".format(pdb))
            sql_fd = gzip.open(os.path.join(outdir, pdb + '.sql.gz'), 'wb')
            sql_fd.write(start_transaction())
            if verbose:
                sys.stderr.write("Printing data from {0} ...\n".format(pdb))
            old_pdb = pdb
            newArchSet = Arch(pdb)
        newArchSet.archs = SSpair.load(files_list_by_pdb[dofdata])
    # Flush the last PDB, if any.
    if newArchSet is not None:
        sql_fd.write(newArchSet.toSQL())
        sql_fd.write(end_transaction())
        sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def make_PDBseq(self, log_file, resolution_threshold=None):
    if not self.has_local:
        raise NameError(
            'A local PDB database must be defined to create a PDBseq '
            'database.')
    outdir = self.PDBseq if self.PDBseq is not None else os.curdir
    Path.mkdir(outdir)
    fasta_file = File(file_name=os.path.join(outdir, 'PDBseq.fa'),
                      action='w', overwrite=True)
    fasta_fd = fasta_file.descriptor
    idx_file = File(file_name=os.path.join(outdir, 'PDBseq.fa.idx'),
                    action='w', overwrite=True)
    idx_fd = idx_file.descriptor
    # The filtered file and the resolution map are required below whenever
    # a resolution_threshold is given.
    if resolution_threshold is not None:
        filtered_file_name = self.get_PDBseq_filtered(resolution_threshold)
        filtered_file = File(file_name=filtered_file_name, action='w',
                             overwrite=True)
        filtered_fd = filtered_file.descriptor
        resolutions = self.get_resolutions(
            resolution_threshold=resolution_threshold)
    log_file = File(file_name=log_file, action='w', overwrite=True)
    log_idx = log_file.descriptor
    for pdb_file in self.localPDBs:
        log_idx.write("Reading File: {0}\n".format(pdb_file))
        newPDB = PDB(pdb_file=pdb_file, dehydrate=True)
        fasta_idx = newPDB.FASTA_IDX(nucleotide=False)
        if len(fasta_idx['FASTA']) != len(fasta_idx['IDX']):
            log_idx.write('ERROR!!!!! Number of fastas and indexes are '
                          'different for pdb {0}!!\n'.format(newPDB.id))
        if len(fasta_idx['FASTA']) > 0:
            log_idx.write('\tPrinting FASTA and IDX...\n')
        else:
            log_idx.write('\tProbably just a nucleotide PDB...\n')
        for c in range(len(fasta_idx['FASTA'])):
            sequence = fasta_idx['FASTA'][c].split('\n')[1]
            sequence = sequence.replace('X', '').replace('x', '')
            if len(sequence) > 0:
                fasta_fd.write(fasta_idx['FASTA'][c] + "\n")
                if resolution_threshold is not None and \
                   newPDB.id in resolutions and not newPDB.is_all_ca:
                    filtered_fd.write(fasta_idx['FASTA'][c] + "\n")
                idx_fd.write(fasta_idx['IDX'][c] + "\n")
        del newPDB
    # Close & end.
    fasta_file.close()
    idx_file.close()
    if resolution_threshold is not None:
        filtered_file.close()
def download(self):
    if not self.has_local:
        raise NameError('A local TaxID database directory must be defined.')
    Path.mkdir(self.local)
    destination = os.path.join(self.local, 'taxdmp.zip')
    urllib.urlretrieve(taxIDftp['global'], destination)
    command = ['unzip', '-o', destination, '-d', self.local]
    p = subprocess.Popen(command, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    out, err = p.communicate()
    self._process()
    return True
def download(self):
    if not self.has_local:
        raise NameError(
            'A local PDBeChem database directory must be defined.')
    Path.mkdir(self.local)
    destination = os.path.join(self.local, 'mmcif.tar.gz')
    try:
        urllib.urlretrieve(PDBeChemftp['global'], destination)
    except Exception:
        # Avoid a bare except that would also swallow KeyboardInterrupt.
        return False
    command = ['tar', 'zxvf', destination, '-C', self.local]
    p = subprocess.Popen(command, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    out, err = p.communicate()
    return True
def CDhit2SQL(database, sqlfile, verbose):
    if verbose:
        sys.stderr.write("Retrieving data from {0} ...\n".format(database))
    cdhit = CDhit(database)
    if verbose:
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    sql_fd.write(cdhit.toSQL())
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def Uniprot2SQL(database, sqlfile, skip_download, verbose):
    uniprot_connect = Uniprotlink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Downloading Uniprot database to {0} ...\n".format(database))
        uniprot_connect.download()
        newsource = Source(name='uniprot', source=uniprot_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    else:
        if verbose:
            sys.stderr.write("Using previously downloaded database.\n")
    # The output is split into numbered files of at most 500000 entries.
    file_counter = 1
    file_sequence = 0
    file_sql_name = sqlfile.replace('_', '{0:03}')
    if verbose:
        sys.stderr.write("Parsing Uniprot.\n")
    if verbose:
        sys.stderr.write("Writing {0} ....\n".format(
            file_sql_name.format(file_counter)))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(file_sql_name.format(file_counter), 'wb')
    sql_fd.write(start_transaction())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    for uni_line in uniprot_connect.localUniprots:
        newuni = Uniprot(inline=uni_line)
        if file_sequence > 500000:
            # Roll over to the next output file.
            sql_fd.write(end_transaction())
            sql_fd.close()
            file_sequence = 0
            file_counter += 1
            if verbose:
                sys.stderr.write("Writing {0} ....\n".format(
                    file_sql_name.format(file_counter)))
            sql_fd = gzip.open(file_sql_name.format(file_counter), 'wb')
            sql_fd.write(start_transaction())
        sql_fd.write(newuni.toSQL())
        file_sequence += 1
    # Close the last open file.
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def GO2SQL(database, sqlfile, skip_download, verbose):
    go_connect = GOlink(local=database)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Downloading GO database to {0} ...\n".format(database))
        go_connect.download()
        newsource = Source(name='GO', source=go_connect.source)
        if verbose:
            sys.stderr.write("Download Finished.\n")
    else:
        if verbose:
            sys.stderr.write("Using previously downloaded database.\n")
    with_parents = []
    with_relations = []
    if verbose:
        sys.stderr.write("Parsing GO.\n")
    if verbose:
        sys.stderr.write("Writing {0} ....\n".format(sqlfile))
    Path.mkdir(os.path.split(os.path.abspath(sqlfile))[0])
    sql_fd = gzip.open(sqlfile, 'wb')
    sql_fd.write(start_transaction())
    if newsource is not None:
        sql_fd.write(newsource.toSQL())
    for go_line in go_connect.localGOs:
        newGO = GOterm(inline=go_line)
        sql_fd.write(newGO.toSQL() + "\n")
        if len(newGO.relations) > 0:
            with_relations.append(newGO)
        if len(newGO.parents) > 0:
            with_parents.append(newGO)
    for GO in with_relations:
        sql_fd.write(GO.relations2SQL() + "\n")
    for GO in with_parents:
        sql_fd.write(GO.parents2SQL() + "\n")
    sql_fd.write(end_transaction())
    sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")
def sync_PDB(self, log_file=None):
    if not self.has_local:
        raise NameError(
            'A local PDB database must be defined to sync with.')
    Path.mkdir(self.local)
    command = ['rsync', '-rlpt', '-v', '-z', '--port=' + PDBrsync['port'],
               PDBrsync['address'], self.local]
    p = subprocess.Popen(command,
                         stdout=open(log_file, 'w') if log_file is not None
                         else subprocess.PIPE,
                         stderr=subprocess.PIPE)
    SBIglobals.alert('verbose', self,
                     'Executing: {0}'.format(" ".join(command)))
    out, err = p.communicate()
    if err.strip() != '':
        raise SystemError('{0}'.format(err))
def get_PDB(self, pdbID):
    if self.has_local:
        rootdir = os.path.join(self.local, pdbID.lower()[1:3])
        for pdb_file in Path.list_files(root=rootdir, pattern='*.ent.gz'):
            newfile = File(file_name=pdb_file, action='r')
            if newfile.prefix.lstrip('pdb').upper() == pdbID.upper():
                return pdb_file
    # If it is not found locally (or there is no local copy), fetch it
    # from the PDB FTP.
    pdb_file = 'pdb' + pdbID.lower() + '.ent.gz'
    source = 'ftp://' + PDBftp['address'] + os.path.join(
        PDBftp['structures'], pdbID[1:3].lower(), pdb_file)
    try:
        urllib.urlretrieve(source, pdb_file)
    except Exception:
        return False
    return os.path.abspath(pdb_file)
def _process(self):
    tmoFile = File(self._pdbtmfile, 'w', True)
    for xmlfile in Path.list_files(
            os.path.join(self._local, 'pdbtm/database/'), '*.xml'):
        xmldata = TM(
            pdb=os.path.splitext(os.path.split(xmlfile)[1])[0].upper())
        skip_chains = set()
        read = False
        fdxml = open(xmlfile)
        for line in fdxml:
            if line.startswith(' <TMRES>'):
                xmldata.tmres = line
            elif line.startswith(' <TMTYPE'):
                xmldata.tmtype = line
            elif line.startswith(' <PDBKWRES'):
                xmldata.kwres = line
            elif line.startswith(' <SIDEDEFINITION'):
                m = re.search('Side1="(\S+)"', line)
                xmldata.side = m.group(1)
            elif line.startswith(' <APPLY_TO_CHAIN'):
                m = re.search('NEW_CHAINID=\"(\S{1})\"', line)
                if m:
                    skip_chains.add(m.group(1))
            elif line.startswith(' <CHAIN '):
                m = re.search(
                    'CHAINID=\"(\S{1})\" NUM_TM=\"(\d{1})\" TYPE=\"(\S+)\"',
                    line)
                if m:
                    chain, num, tmtype = m.group(1), m.group(2), m.group(3)
                    if chain not in skip_chains:
                        cdata = tuple([chain, num, tmtype])
                        xmldata.set_chain(cdata)
                        read = True
            elif line.startswith(' <REGION ') and read:
                m = re.search(
                    'pdb_beg=\"(\-*\d+\w*)\"[\s\S]+pdb_end=\"(\-*\d+\w*)\"\s+type=\"(\w{1})\"',
                    line)
                ini, end, tmtype = m.group(1), m.group(2), m.group(3)
                xmldata.set_chain(cdata, tuple([ini, end, tmtype]))
            elif line.startswith(' </CHAIN>'):
                read = False
        fdxml.close()
        if len(xmldata.chains) > 0:
            tmoFile.write(str(xmldata) + "\n")
    tmoFile.close()
def localPDBs(self):
    for pdb_file in Path.list_files(root=self.local, pattern='*.ent.gz'):
        yield pdb_file
def PDB2SQL(database, seqdatabase, listfiles, sqlfile, skip_download,
            verbose):
    pdb_connect = PDBlink(local=database, PDBseq=seqdatabase)
    newsource = None
    if not skip_download:
        if verbose:
            sys.stderr.write(
                "Syncronizing PDB database to {0} ...\n".format(database))
        pdb_connect.sync_PDB(log_file=os.path.join(database, 'PDB.sync.log'))
        newsource = Source(name='PDB', source=pdb_connect.source)
        if verbose:
            sys.stderr.write(
                "Creating PDBseq in {0} ...\n".format(seqdatabase))
        pdb_connect.make_PDBseq(
            log_file=os.path.join(seqdatabase, 'PDB.seq.log'))
        if verbose:
            sys.stderr.write("Download Finished.\n")
        outdir = os.path.abspath(os.path.join(sqlfile, '00'))
        Path.mkdir(outdir)
        sql_fd = gzip.open(os.path.join(outdir, '0000.sql.gz'), 'wb')
        sql_fd.write(start_transaction())
        sql_fd.write(newsource.toSQL())
        sql_fd.write(end_transaction())
        sql_fd.close()
    else:
        if verbose:
            sys.stderr.write("Using previously downloaded database.\n")
    # An optional list of files restricts which PDBs are processed.
    files2check = set()
    if listfiles is not None:
        fd = open(listfiles)
        for line in fd:
            files2check.add(line.strip())
        fd.close()
        logfd = open(listfiles + ".log", "w")
    else:
        logfd = open("PDB2SQL.log", "w")

    import traceback
    for pdbfile in pdb_connect.localPDBs:
        try:
            if listfiles is not None and pdbfile not in files2check:
                if len(files2check) == 0:
                    break
                continue
            if verbose:
                sys.stderr.write("Working file {0}\n".format(pdbfile))
            newPDB = PDB(pdb_file=pdbfile)
            outsqldir = os.path.join(sqlfile, newPDB.id[1:3].lower())
            Path.mkdir(outsqldir)
            outsqlfile = os.path.join(outsqldir, newPDB.id + '.sql.gz')
            if verbose:
                sys.stderr.write(
                    "\tOutput SQL file is {0}.\n".format(outsqlfile))
            sql_fd = gzip.open(outsqlfile, 'wb')
            sql_fd.write(start_transaction())
            sql_fd.write(PDB.preuniprotdeleted())
            sql_fd.write(newPDB.toSQL())
            sql_fd.write(PDB.afteruniprotdeleted())
            sql_fd.write(end_transaction())
            sql_fd.close()
        except KeyboardInterrupt:
            raise
        except Exception:
            # Log the failing file and keep processing the rest.
            if verbose:
                sys.stderr.write("\tAn error occurred. Check log file\n")
            SBIglobals.alert(
                'error', None,
                '\tAn error occurred for {0} . Check log file'.format(
                    pdbfile))
            logfd.write("FILE {0}\n".format(pdbfile))
            logfd.write(traceback.format_exc())
            logfd.write("\n")
def localPDBeChems(self):
    for chem_file in Path.list_files(root=self.local, pattern='*.cif'):
        yield chem_file
def execute_query_seq(self, sequenceID=None, sequence=None,
                      blast_input_file=None, blast_output_file=None,
                      work_directory=os.getcwd()):
    '''
    Execute BLAST given a query sequence.

    @param: sequenceID
    @pdef:  name of the query sequence.
    @pdefault: 'QuerySequence'
    @pclash: If sequence is not provided, it assumes that the sequenceID
             belongs to a protein in the database and, thus, it searches
             for it. Either sequenceID or sequence needs to be provided.
    @ptype: {String}

    @param: sequence
    @pdef:  query sequence.
    @pdefault: _None_
    @pclash: Either sequenceID or sequence needs to be provided.
    @ptype: {String}

    @param: blast_input_file
    @pdef:  name of the temporary fasta file to use as BLAST input.
    @pdefault: job.pid + clock + .tmp.fa
    @ptype: {String}

    @param: blast_output_file
    @pdef:  name of the temporary BLAST output file.
    @pdefault: job.pid + clock + .blast.xml.out
    @ptype: {String}

    @param: work_directory
    @pdef:  directory in which the temporary files will be created.
    @pdefault: Current working directory.
    @ptype: {String}

    @raises: {AttributeError} if neither sequenceID nor sequence are
             provided or if sequenceID is a list of sequence names.
    @raises: {BlastError} in BLAST execution or output parsing errors.
    @returns: {BlastResult}
    '''
    if sequenceID is None and sequence is None:
        msg = 'Either a sequence or sequenceID is needed to perform ' \
              'the blast.'
        raise AttributeError(msg)
    if isinstance(sequenceID, (list, set, tuple)):
        msg = 'Blasts can only be executed one at a time due to XML ' \
              'output restrictions.'
        raise AttributeError(msg)
    sequenceID = 'QuerySequence' if sequenceID is None else sequenceID
    # Given only a code implies that the protein of interest is in the
    # database itself.
    if sequence is None:
        grabbedSequence = self._database.retrieve(sequenceID)
        sequenceID = grabbedSequence[0].id
        sequence = grabbedSequence[0].sequence
    # If the whole sequence is unknown, BLAST would crash.
    if len(re.sub(r'[Xx]', '', sequence)) == 0:
        SBIg.warn(self, 'Created an empty BlastResult.')
        return BlastResult(query_name=sequenceID, query_sequence=sequence)
    Path.mkdir(work_directory)
    file_prefixes = ".".join([str(os.getpid()),
                              str(int(time.clock() * 100000))])
    file_prefixes = os.path.join(work_directory, file_prefixes)
    tmp_input = file_prefixes + ".tmp.fa"
    tmp_output = file_prefixes + ".blast.xml.out"
    tmp_input = tmp_input if blast_input_file is None else \
        os.path.join(work_directory, blast_input_file)
    tmp_output = tmp_output if blast_output_file is None else \
        os.path.join(work_directory, blast_output_file)
    QueryFasta = Fasta.build(file_name=tmp_input, sequence_id=sequenceID,
                             sequence=sequence, force=True)
    self._execute(input_file=QueryFasta, output_file=tmp_output)
    blast_result = self._parse_blast(sequence, tmp_output)
    self._clean([tmp_input, tmp_output])
    return blast_result
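# Usage sketch for execute_query_seq (hypothetical sequence and instance;
# 'blast' is assumed to be a configured instance of this class):
#
#   blast_result = blast.execute_query_seq(sequenceID='mySeq',
#                                          sequence='MSEQKLVI',
#                                          work_directory='/tmp/blast_jobs')
#
# Passing only sequenceID instead would make the method retrieve the
# sequence from the attached database, as documented above.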
def DS2SQL(database, looplist, sqlfile, verbose):
    Path.mkdir(sqlfile)
    for dsfile in Path.list_files(database):
        subclasstype = os.path.split(dsfile)[-1].split('.')[1]
        classification = Cclass(subclasstype)
        if verbose:
            sys.stderr.write(
                "Retrieving data for subclass {0} ...\n".format(
                    subclasstype))
        loops = readlist(looplist, subclasstype)
        sql_fd = gzip.open(os.path.join(sqlfile, subclasstype + '.sql.gz'),
                           'wb')
        sql_fd.write(start_transaction())
        sql_in = open(dsfile)
        read = False
        for line in sql_in:
            dataline = line.rstrip('\n')
            # Skip separators and empty lines.
            if line.startswith('==') or line.startswith('***') or \
               len(line.strip()) == 0 or \
               line.startswith('---- P R O T E I N C O D E ----'):
                continue
            # A consensus header opens a new subclass.
            if line.startswith('CONSENSUS & MULTIPLE ALIGNEMENT IN THE'):
                data = line.split(':')[-1].strip().split()
                classification.subclasses = Subclass(
                    tuple([data[0].strip(), data[3].strip()]), data[4])
                workscls = classification.lastsubclass
                read = True
                continue
            if line.startswith('GLOBAL STATISTICS'):
                read = False
                continue
            if read:
                if line.startswith(' SEQUENCE ALIGNEMENT :'):
                    parse_mode, counter = 'P', 0
                elif line.startswith(' ACCESSIBLE SURFACE ALIGNEMENT :'):
                    parse_mode, counter = 'E', 0
                elif line.startswith(' RAMACHANDRAN :'):
                    parse_mode, counter = 'R', 0
                elif line.startswith(' SECONDARY STRUCTURE :'):
                    parse_mode, counter = 'S', 0
                elif line.startswith('--------- CONSENSUS THORNTON :'):
                    workscls.add_consensus(dataline, 'DS', loops)
                elif line.startswith('--------- CONSENSUS TOPOLOGY'):
                    workscls.add_topology(dataline, 'DS')
                elif line.startswith('CENTROIDE POLAR COORD. :'):
                    workscls.add_coordinates(dataline)
                elif line.startswith('--------- RAMACHANDRAN PATTERN :'):
                    workscls.ram_pat = re.sub(
                        '\(X\)', '',
                        dataline.split(':')[1].strip().strip('.'))
                elif line.startswith('--------- SEQUENCE PATTERN :'):
                    workscls.seq_pat = re.sub(
                        '\(X\)', '',
                        dataline.split(':')[1].strip().strip('.'))
                elif line.startswith('--------- BURIAL PATTERN :'):
                    workscls.exp_pat = re.sub(
                        '\(X\)', '',
                        dataline.split(':')[1].strip().strip('.'))
                elif line.startswith(' ') and len(dataline) < 400:
                    # Data lines are routed according to the last section
                    # header seen (parse_mode).
                    if parse_mode == 'P':
                        workscls.loops = Loop(info=dataline)
                    if parse_mode == 'E':
                        workscls.loops[counter].add_surface(info=dataline)
                        counter += 1
                    if parse_mode == 'R':
                        workscls.loops[counter].add_ramachandran(
                            info=dataline)
                        counter += 1
                    if parse_mode == 'S':
                        workscls.loops[counter].add_secondary_str(
                            info=dataline)
                        counter += 1
        sql_fd.write(classification.toSQL('DS'))
        sql_in.close()
        sql_fd.write(end_transaction())
        sql_fd.close()
    if verbose:
        sys.stderr.write("End execution.\n")