def make_PDBseq(self, log_file, resolution_threshold=None):
    """
    Writes the protein sequences of every local PDB file into 'PDBseq.fa'
    and the matching index lines into 'PDBseq.fa.idx'.

    Both files are created in self.PDBseq, or in the current directory when
    self.PDBseq is None.

    @type  log_file: String
    @param log_file: Name of the file where the progress is logged
    @param resolution_threshold: Kept for backward compatibility. The
                                 resolution-filtered output is disabled, so
                                 the value is only reported in the log.
    @raise NameError if no local PDB database is defined
    """
    if not self.has_local:
        raise NameError(
            'A local PDB database must be defined to do create a PDBseq database.'
        )
    outdir = self.PDBseq if self.PDBseq is not None else os.curdir
    # Create the directory that is actually used (self.PDBseq may be None)
    Path.mkdir(outdir)

    fasta_file = File(file_name=os.path.join(outdir, 'PDBseq.fa'),
                      action='w', overwrite=True)
    idx_file = File(file_name=os.path.join(outdir, 'PDBseq.fa.idx'),
                    action='w', overwrite=True)
    log_file = File(file_name=log_file, action='w', overwrite=True)
    try:
        fasta_fd = fasta_file.descriptor
        idx_fd = idx_file.descriptor
        log_idx = log_file.descriptor

        if resolution_threshold is not None:
            # The filtered FASTA used to rely on names that are no longer
            # defined (resolutions, filtered_fd); report instead of crashing.
            log_idx.write(
                'WARNING: resolution filtering is disabled; '
                'resolution_threshold is ignored\n')

        for pdb_file in self.localPDBs:
            log_idx.write("Reading File: {0}\n".format(pdb_file))
            newPDB = PDB(pdb_file=pdb_file, dehydrate=True)
            fasta_idx = newPDB.FASTA_IDX(nucleotide=False)
            fastas = fasta_idx['FASTA']
            indexes = fasta_idx['IDX']
            if len(fastas) != len(indexes):
                log_idx.write(
                    'ERROR!!!!! Number of fastas and indexes are different for pdb {0}!!\n'
                    .format(newPDB.id))
            if len(fastas) > 0:
                log_idx.write('\tPrinting FASTA and IDX...\n')
            else:
                log_idx.write('\tProblably just a nucleotide PDB...\n')
            # zip stops at the shorter list, so a length mismatch (already
            # logged above) cannot raise an IndexError
            for fasta, index in zip(fastas, indexes):
                sequence = fasta.split('\n')[1]
                sequence = sequence.replace('X', '').replace('x', '')
                # Skip chains made only of unknown residues
                if len(sequence) > 0:
                    fasta_fd.write(fasta + "\n")
                    idx_fd.write(index + "\n")
            del newPDB
    finally:
        # CLOSE & END
        fasta_file.close()
        idx_file.close()
        log_file.close()
def format2file(self, filename, extension='pdb', center=False):
    """
    Writes the object to '<filename>.<extension>'.

    @type  filename: String
    @param filename: Name of the output file, without extension
    @type  extension: String
    @param extension: Either 'pdb' or 'js'; selects the output format
    @param center: Forwarded to the format function
    @raise AttributeError if the extension is neither 'pdb' nor 'js'
    """
    if extension != 'pdb' and extension != 'js':
        raise AttributeError('Not accepted extension')

    target = File('.'.join((filename, extension)), 'w')
    # Pick the function that produces the text for the requested extension
    renderer = self.pdb_format if extension == 'pdb' else self.js_format
    target.write(renderer(center=center))
    target.close()
def build_multifasta(file_name, sequenceList, force=False):
    """
    Creates a FASTA file with every sequence of the list and returns the
    Fasta object that wraps it.

    @type  file_name: String
    @param file_name: Name of the FASTA file to create
    @param sequenceList: Iterable of objects exposing format('FASTA')
    @type  force: Boolean
    @param force: Passed to File as its overwrite flag
    """
    fasta_out = File(file_name, 'w', overwrite=force)
    emit = fasta_out.descriptor.write
    for record in sequenceList:
        emit(record.format('FASTA') + "\n")
    fasta_out.close()
    return Fasta(fasta_file=fasta_out.full)
def build(file_name, sequenceID, sequence, force=False):
    """
    Creates a FASTA file holding a single sequence and returns the Fasta
    object that wraps it.

    @type  file_name: String
    @param file_name: Name of the FASTA file to create
    @param sequenceID: Identifier given to the new Sequence
    @param sequence: Sequence content given to the new Sequence
    @type  force: Boolean
    @param force: Passed to File as its overwrite flag
    """
    fasta_out = File(file_name, 'w', overwrite=force)
    record = Sequence(seqID=sequenceID, sequence=sequence)
    # No trailing newline is appended here
    fasta_out.descriptor.write(record.format('FASTA'))
    fasta_out.close()
    return Fasta(fasta_file=fasta_out.full)
def __init__(self, database, search_type = 'prot'):
    """
    Configures the blast executable against the given database.

    @type  database: String
    @param database: Path to the blast database
    @type  search_type: String
    @param search_type: Either 'prot' or 'nucl'
    @raise BE(-10) if search_type is not one of the accepted values
    """
    # Search type check
    if search_type not in set(['prot','nucl']):
        raise BE(-10)
    self._search_type = search_type

    # Blast executable configuration, read from the file named by the
    # SBI_CONFIG_FILE environment variable or from the default one
    config_path = os.getenv('SBI_CONFIG_FILE', default_configuration_file)
    self._configurator = ConfigParser.RawConfigParser(allow_no_value=True)
    self._configurator.read(config_path)
    settings = self._configurator
    self._exe = Executable(executable    = settings.get('blast','executable'),
                           path          = settings.get('blast','path'),
                           variable_path = settings.get('blast','variable_path'))

    # Database configuration; the index file is optional
    self._database = self._check_database(os.path.abspath(database))
    db_full = self._database.file.full
    idx_name = db_full + ".idx"
    self._idx = File(file_name = idx_name, action = 'r') if os.path.isfile(idx_name) else None

    # Fixed blast parameters
    self._exe.add_attribute(db_full, '-db')
    self._exe.add_attribute('5', '-outfmt')
    self._exe.add_parameter('-lcase_masking')

    SBIglobals.alert('debug', self, 'New Blast Executable created.\nBlast executable at {0}\n'.format(self._exe.full_executable))

    # Default run options
    self._selfHit     = False
    self._hitIDformat = 'single'
    self._overwritte  = False
    self._clean_files = True
def _process(self):
    """
    Parses every '*.xml' file under '<self._local>/pdbtm/database/' and
    writes one line per entry that has at least one chain to
    self._pdbtmfile.
    """
    # Compiled once; raw strings avoid invalid escape sequences
    side_re = re.compile(r'Side1="(\S+)"')
    new_chain_re = re.compile(r'NEW_CHAINID="(\S{1})"')
    # NUM_TM accepts several digits so chains with 10 or more
    # transmembrane segments are not silently dropped
    chain_re = re.compile(r'CHAINID="(\S{1})" NUM_TM="(\d+)" TYPE="(\S+)"')
    region_re = re.compile(
        r'pdb_beg="(\-*\d+\w*)"[\s\S]+pdb_end="(\-*\d+\w*)"\s+type="(\w{1})"')

    tmoFile = File(self._pdbtmfile, 'w', True)
    try:
        for xmlfile in Path.list_files(os.path.join(self._local, 'pdbtm/database/'), '*.xml'):
            xmldata = TM(pdb=os.path.splitext(os.path.split(xmlfile)[1])[0].upper())
            skip_chains = set()
            read = False    # True while inside a chain that is being kept
            cdata = None
            with open(xmlfile) as fdxml:
                for line in fdxml:
                    if line.startswith(' <TMRES>'):
                        xmldata.tmres = line
                    elif line.startswith(' <TMTYPE'):
                        xmldata.tmtype = line
                    elif line.startswith(' <PDBKWRES'):
                        xmldata.kwres = line
                    elif line.startswith(' <SIDEDEFINITION'):
                        m = side_re.search(line)
                        if m:
                            xmldata.side = m.group(1)
                    elif line.startswith(' <APPLY_TO_CHAIN'):
                        # Chains generated from another chain are skipped
                        m = new_chain_re.search(line)
                        if m:
                            skip_chains.add(m.group(1))
                    elif line.startswith(' <CHAIN '):
                        m = chain_re.search(line)
                        if m:
                            chain, num, tmtype = m.group(1), m.group(2), m.group(3)
                            if chain not in skip_chains:
                                cdata = (chain, num, tmtype)
                                xmldata.set_chain(cdata)
                                read = True
                    elif line.startswith(' <REGION ') and read:
                        m = region_re.search(line)
                        if m:
                            ini, end, tmtype = m.group(1), m.group(2), m.group(3)
                            xmldata.set_chain(cdata, (ini, end, tmtype))
                    elif line.startswith(' </CHAIN>'):
                        read = False
            if len(xmldata.chains) > 0:
                tmoFile.write(str(xmldata) + "\n")
    finally:
        tmoFile.close()
def _process(self):
    """
    Gathers the enzymes parsed by _parse_enzclass and _parse_enzymedat,
    sorts them and writes their repr, one per line, to self._enzfile.
    """
    ordered = sorted(self._parse_enzclass() + self._parse_enzymedat())
    destination = File(self._enzfile, 'w', True)
    for enzyme in ordered:
        destination.write(repr(enzyme) + "\n")
    destination.close()
def get_FASTA_IDX_by_names_to_file(self, names, outfile):
    """
    Extracts the requested sequences from the PDBseq FASTA file and their
    index lines from the PDBseq '.idx' file.

    The sequences are written to outfile and the index lines to
    outfile + '.idx'.

    @param names: Identifiers to select
    @type  outfile: String
    @param outfile: Name of the FASTA file to create
    """
    # The retrieval works on a deep copy of names
    wanted = Fasta(self.PDBseq).retrieve(copy.deepcopy(names))
    fasta_out = File(outfile, 'w')
    for record in wanted:
        fasta_out.write(record.format('FASTA') + "\n")
    fasta_out.close()

    idx_out = File(outfile + '.idx', 'w')
    idx_in = File(self.PDBseq + '.idx', 'r')
    for entry in idx_in.descriptor:
        # The identifier is the first field without its leading character
        identifier = entry.split()[0][1:]
        if identifier in names:
            idx_out.write(entry)
    idx_in.close()
    idx_out.close()
def _process(self):
    """
    Processes the targets, builds the drugs from them and writes the repr
    of every drug, one per line, to self._drugfile.
    """
    processed = self._process_drugs(self._process_targets())
    destination = File(self._drugfile, 'w', True)
    for drug in processed:
        destination.write(repr(drug) + "\n")
    destination.close()
def _process(self):
    """
    Parses the ontology file and writes one line per term to self._gofile.

    Terms are read until the first '[Typedef]' line or the end of the file.
    The parents of every term are expanded with _search_parents, and each
    term is added to its own parent set before being written.
    """
    go_dic = {}
    go = None
    parseFile = File(os.path.join(self.local, self._gfile), 'r')
    try:
        for line in parseFile.descriptor:
            # Single quotes are backslash-escaped before any field is read
            line = re.sub('\'', '\\\'', line)
            if line.startswith('[Term]'):
                # A new stanza starts: store the term read so far
                if go is not None:
                    go_dic[go.id] = go
            elif line.startswith('id:'):
                go = GOterm(id=line.split()[1].strip())
            elif line.startswith('name:'):
                go.name = " ".join(line.split()[1:]).strip()
            elif line.startswith('namespace:'):
                go.namespace = line.split()[1].strip()
            elif line.startswith('alt_id:'):
                go.alt_id.append(line.split()[1].strip())
            elif line.startswith('is_obsolete:'):
                go.obsolete = True
            elif line.startswith('is_a:'):
                go.parents.add(line.split()[1].strip())
            elif line.startswith('relationship:'):
                go.relations.append(
                    (line.split()[1].strip(), line.split()[2].strip()))
            elif line.startswith('[Typedef]'):
                # Nothing after the first Typedef stanza is parsed
                break
    finally:
        parseFile.close()

    # Store the last term, whether the loop ended at '[Typedef]' or at the
    # end of the file; nothing to store when no term was read at all
    if go is not None:
        go_dic[go.id] = go

    for go in go_dic:
        go_dic[go].parents = self._search_parents(go_dic, go)

    goFile = File(self._gofile, 'w', True)
    try:
        for go in go_dic:
            go_dic[go].parents.add(go)
            goFile.write(str(go_dic[go]) + "\n")
    finally:
        goFile.close()
def write(self, output_file=None, format='PDB', force=False, clean=False):
    """
    Writes the object to a file in the requested format.

    The output file is opened for any format, but content is only written
    when format is 'PDB'.

    @type  output_file: String
    @param output_file: File to write
    @type  format: String
    @param format: Format of the file to print
    @type  force: Boolean
    @param force: Passed to File as its overwrite flag
    @param clean: Forwarded to _write_PDB_file
    """
    destination = File(file_name=output_file, action='w', overwrite=force)
    if format != 'PDB':
        return
    self._write_PDB_file(pdb_file=destination, clean=clean)
def _process(self):
    """
    Builds the TaxID table from the nodes, names, deleted and merged files
    and writes one line per taxid to self._taxid.

    Every input line is split on '|'. Entries found in the deleted and
    merged files replace any entry with the same identifier and are
    flagged as old.
    """
    inh = {}

    # Nodes: identifier | parent | rank
    nodes = File(file_name=self._nodes, action='r')
    for line in nodes.descriptor:
        fields = re.sub('\'', '\\\'', line).split('|')
        taxid = fields[0].strip()
        inh[taxid] = TaxID(taxid)
        inh[taxid].parent = fields[1].strip()
        inh[taxid].rank = fields[2].strip()
    nodes.close()

    # Names: only the 'scientific name' rows are used
    names = File(file_name=self._names, action='r')
    for line in names.descriptor:
        fields = re.sub('\'', '\\\'', line).split('|')
        if fields[3].strip() == 'scientific name':
            inh[fields[0].strip()].name = fields[1].strip()
    names.close()

    # Deleted identifiers
    deleted = File(file_name=self._delet, action='r')
    for line in deleted.descriptor:
        taxid = line.split('|')[0].strip()
        inh[taxid] = TaxID(taxid)
        inh[taxid].old = True
    deleted.close()

    # Merged identifiers: old identifier | new identifier
    merged = File(file_name=self._merged, action='r')
    for line in merged.descriptor:
        fields = line.split('|')
        taxid = fields[0].strip()
        inh[taxid] = TaxID(taxid)
        inh[taxid].old = True
        inh[taxid].new = fields[1].strip()
    merged.close()

    output = File(self._taxid, 'w', True)
    for taxid in inh:
        output.write(str(inh[taxid]) + "\n")
    output.close()
def pdb_file(self, value):
    """
    Sets a PDB file if none has been given.

    @param value: A File object, or the name of the file to open for reading
    @raise AttributeError if the object has already been loaded from a file
    """
    if self._pdb_file is not None:
        raise AttributeError(
            "The PDB object is loaded from file {0}. To load the new file {1} create a new PDB object"
            .format(self._pdb_file.full, value))

    if isinstance(value, File):
        self._pdb_file = value
    else:
        # File takes the open mode through 'action', as everywhere else
        self._pdb_file = File(file_name=value, action='r')
def __init__(self, fasta_file):
    """
    @param fasta_file: Name of the FASTA file, or a File object that wraps
                       it. A File object is switched to action 'r'.
    @raise AttributeError if fasta_file is neither a string nor a File
    """
    if not isinstance(fasta_file, (basestring, File)):
        raise AttributeError('Check the input of the Fasta object')

    if isinstance(fasta_file, basestring):
        self._file = File(file_name=fasta_file, action='r')
    else:
        self._file = fasta_file
        self._file.action = 'r'

    self._is_multifasta = self._check_multifasta()
    self._sequences = []
    self._seqfinder = {}
def _parse_uniprot_file(self, source, destination, fasta, code):
    """
    Reads a flat file line by line and writes every record found to two
    output files.

    A record starts at an 'ID' line and is written when a '//' line is
    reached: str(record) goes to destination and repr(record) to fasta.

    @type  source: String
    @param source: Name of the file to parse
    @type  destination: String
    @param destination: Name of the file that receives str() of each record
    @type  fasta: String
    @param fasta: Name of the file that receives repr() of each record
    @param code: Passed to every Uniprot object that is created
    """
    sourceFile = File(source, 'r')
    destinFile = File(destination, 'w', True)
    fastaFile = File(fasta, 'w', True)
    try:
        protein = None
        for line in sourceFile.descriptor:
            if line.startswith('ID'):
                protein = Uniprot(line.split()[1].strip(), code)
            if line.startswith('AC'):
                protein.accession = line.split()[1:]
            if line.startswith('OX'):
                protein.taxid = line.split()[1]
            if line.startswith('OH'):
                protein.hosts = line.split()[1]
            if line.startswith('DR'):
                protein.databases = line.split()[1:3]
            if line.startswith(' '):
                # Lines starting with a blank carry sequence data
                protein.sequence = line.strip().replace(' ', '')
            if line.startswith('//'):
                destinFile.write(str(protein) + "\n")
                fastaFile.write(repr(protein) + "\n")
    finally:
        # All three files are closed, including the fasta output
        sourceFile.close()
        destinFile.close()
        fastaFile.close()
def get_PDBeChem(self, chemID):
    """
    Locates the file of a PDBeChem entry.

    The local files are searched first, when a local database exists. If
    the entry is not found there, '<CHEMID>.cif' is downloaded into the
    current directory.

    @type  chemID: String
    @param chemID: Identifier of the chemical
    @return: The local file, the absolute path of the downloaded file, or
             False when the download fails
    """
    if self.has_local:
        wanted = chemID.upper()
        for chem_file in self.localPDBeChems:
            newfile = File(file_name=chem_file, action='r')
            if newfile.prefix.upper() == wanted:
                return chem_file

    # If we do not find it in local (or we do not have a local) we search it on the FTP
    chem_file = chemID.upper() + '.cif'
    source = PDBeChemftp['single'] + chem_file
    try:
        urllib.urlretrieve(source, chem_file)
    except Exception:
        # Best effort: a failed download is reported as False, while
        # KeyboardInterrupt and SystemExit still propagate
        return False
    return os.path.abspath(chem_file)
def __init__(self, cif_file):
    """
    @type  cif_file: String
    @param cif_file: Name of the cif file to read
    """
    self._file = File(file_name=cif_file, action='r')
    # This must be included in every class for the SBIglobals.alert()
    self.__name__ = 'databases.PDBeChem'

    # Fields start empty; _parse and _decompose_formula run right after
    self._id = self._name = self._type = self._formula = None
    self._parent = self._weight = self._fcharge = self._code1l = None
    self._flformula = {}

    self._parse()
    self._decompose_formula()
def get_PDBs(self, pdbIDset):
    """
    Yields the file of every requested PDB.

    With a local database, the local files whose identifier is in the set
    are yielded. Otherwise get_PDB is called for each identifier.

    @param pdbIDset: Iterable of PDB identifiers. A single string is
                     accepted too, but get_PDB is recommended for that.
    """
    if isinstance(pdbIDset, str):
        warnings.warn(
            'For single PDB search the get_PDB function is recomended.')
        yield self.get_PDB(pdbIDset)
    else:
        pdbIDset = set([x.upper() for x in pdbIDset])
        if self.has_local:
            for pdb_file in self.localPDBs:
                newfile = File(file_name=pdb_file, action='r')
                # Drops the leading 'pdb' of the file prefix
                if newfile.prefix.lstrip('pdb').upper() in pdbIDset:
                    yield pdb_file
        else:
            for pdbID in pdbIDset:
                yield self.get_PDB(pdbID)
def get_PDB(self, pdbID):
    """
    Locates the file of a PDB entry.

    The local database is searched first, when it exists, inside the
    directory named after characters 2-3 of the identifier. If the entry
    is not found there, 'pdb<id>.ent.gz' is downloaded into the current
    directory.

    @type  pdbID: String
    @param pdbID: PDB identifier
    @return: The local file, the absolute path of the downloaded file, or
             False when the download fails
    """
    if self.has_local:
        rootdir = os.path.join(self.local, pdbID.lower()[1:3])
        wanted = pdbID.upper()
        for pdb_file in Path.list_files(root=rootdir, pattern='*.ent.gz'):
            newfile = File(file_name=pdb_file, action='r')
            if newfile.prefix.lstrip('pdb').upper() == wanted:
                return pdb_file

    # If we do not find it in local (or we do not have a local) we search it on the FTP
    pdb_file = 'pdb' + pdbID.lower() + '.ent.gz'
    source = 'ftp://' + PDBftp['address'] + os.path.join(
        PDBftp['structures'], pdbID[1:3].lower(), pdb_file)
    try:
        urllib.urlretrieve(source, pdb_file)
    except Exception:
        # Best effort: a failed download is reported as False, while
        # KeyboardInterrupt and SystemExit still propagate
        return False
    return os.path.abspath(pdb_file)
def get_PDBeChems(self, chemIDset):
    """
    Yields the file of every requested PDBeChem entry.

    With a local database, the local files whose identifier is in the set
    are yielded. Otherwise get_PDBeChem is called for each identifier.

    @param chemIDset: Iterable of chemical identifiers. A single string is
                      accepted too, but get_PDBeChem is recommended for that.
    """
    if isinstance(chemIDset, str):
        warnings.warn(
            'For single PDBeChem search the get_PDBeChem function is recomended.'
        )
        yield self.get_PDBeChem(chemIDset)
    else:
        chemIDset = set([x.upper() for x in chemIDset])
        if self.has_local:
            for chem_file in self.localPDBeChems:
                newfile = File(file_name=chem_file, action='r')
                # The prefix is compared as in get_PDBeChem; stripping the
                # characters 'p', 'd' and 'b' would mangle some identifiers
                if newfile.prefix.upper() in chemIDset:
                    yield chem_file
        else:
            for chemID in chemIDset:
                yield self.get_PDBeChem(chemID)
def get_resolutions(self):
    """
    Builds a dictionary that maps PDB identifiers to their resolution.

    The values come from the resolution index downloaded from the PDB FTP.
    Local PDB files that are missing from that index get the resolution
    found in their own header.

    @rtype: Dictionary
    """
    # Download the index, one list item per line
    index_lines = []
    connection = ftplib.FTP(PDBftp['address'])
    connection.login()
    connection.cwd(PDBftp['derived'])
    connection.retrlines('RETR ' + PDBftp['resolution'], index_lines.append)
    connection.quit()

    SBIglobals.alert('debug', self,
                     'Retrieving resolution data from PDB FTP...')

    # resolutions (-1) are for methods that do not define resolution
    resolutions = {}
    in_table = False    # data starts after the first line beginning with '-'
    for line in index_lines:
        if line.startswith('-'):
            in_table = True
            continue
        if not in_table or len(line.strip()) == 0:
            continue
        data = [x.strip() for x in line.split(';')]
        if len(data[1]) > 0:
            SBIglobals.alert(
                'debug', self,
                '\tResolution for {0[0]} is {0[1]}...'.format(data))
            resolutions[data[0]] = data[1]

    # The local copy may hold structures that are no longer in the index,
    # so every local file is checked
    for pdb_file in self.localPDBs:
        pdbid = File(file_name=pdb_file, action='r').prefix.lstrip('pdb').upper()
        if pdbid in resolutions:
            continue
        pdbobj = PDB(pdb_file=pdb_file, header=True, onlyheader=True)
        SBIglobals.alert(
            'debug', self,
            '\tGrabbing Resolution for {0} is {1}...'.format(
                pdbid, pdbobj.header.resolution))
        resolutions[pdbid] = pdbobj.header.resolution

    return resolutions
def __init__(self, pdb_file=None, dehydrate=False, header=False,
             onlyheader=False, biomolecule=False):
    """
    @type  pdb_file: String
    @param pdb_file: PDB formated file to read

    @type  dehydrate: Boolean
    @param dehydrate: When set, self.dehydrate() is called at the end

    @type  header: Boolean
    @param header: Forwarded to the file reader; forced to True when
                   onlyheader or biomolecule is set

    @type  onlyheader: Boolean
    @param onlyheader: Forwarded to the file reader

    @type  biomolecule: Boolean
    @param biomolecule: Forwarded to the file reader

    @raise IOError if pdb_file does not exist and it is not an empty object
    """
    header = True if (biomolecule or onlyheader) else header

    self._pdb_file = pdb_file

    # Chain containers
    self._chains = []
    self._chain_id = set()
    self._NMR = False
    self._NMR_chains = []

    # -1 -> original
    #  0 -> symmetry
    # >0 -> biomolecule
    self._biomol_id = -1

    self._header = None
    self._has_prot = False
    self._has_nucl = False
    self._COMPND = None

    # Without a file the object stays empty
    if self.pdb_file is not None:
        self._pdb_file = File(file_name=self._pdb_file, action='r')
        self._read_PDB_file(header=header,
                            onlyheader=onlyheader,
                            biomolecule=biomolecule)

    if dehydrate:
        self.dehydrate()
def localTrembls(self):
    """
    Yields, one by one, the lines of the file named by self._trbfile.

    The file is closed when the iteration ends or the generator is closed.
    """
    tblFile = File(self._trbfile, 'r')
    try:
        for uni_line in tblFile.descriptor:
            yield uni_line
    finally:
        tblFile.close()
def localTM(self):
    """
    Yields, one by one, the lines of the file named by self._pdbtmfile.

    The file is closed when the iteration ends or the generator is closed.
    """
    tmoFile = File(self._pdbtmfile, 'r')
    try:
        for tm_line in tmoFile.descriptor:
            yield tm_line
    finally:
        tmoFile.close()
def localEnzymes(self):
    """
    Yields, one by one, the lines of the file named by self._enzfile.

    The file is closed when the iteration ends or the generator is closed.
    """
    enzFile = File(self._enzfile, 'r')
    try:
        for enz_line in enzFile.descriptor:
            yield enz_line
    finally:
        enzFile.close()
def relations(self):
    """
    Yields the lines of the file named by self._rel, skipping the lines
    that start with '#'.

    The file is closed when the iteration ends or the generator is closed.
    """
    relFile = File(self._rel, 'r')
    try:
        for rel_line in relFile.descriptor:
            if not rel_line.startswith('#'):
                yield rel_line
    finally:
        relFile.close()
def localSwissprots(self):
    """
    Yields, one by one, the lines of the file named by self._swsfile.

    The file is closed when the iteration ends or the generator is closed.
    """
    swsFile = File(self._swsfile, 'r')
    try:
        for uni_line in swsFile.descriptor:
            yield uni_line
    finally:
        swsFile.close()
def localGOs(self):
    """
    Yields, one by one, the lines of the file named by self._gofile.

    The file is closed when the iteration ends or the generator is closed.
    """
    goFile = File(self._gofile, 'r')
    try:
        for go_line in goFile.descriptor:
            yield go_line
    finally:
        goFile.close()
def __init__(self, cdhitfile):
    """
    @type  cdhitfile: String
    @param cdhitfile: Name of the file that is wrapped and then parsed
    """
    self._file = File(file_name=cdhitfile)
    # Containers are created before _parse_file is called
    self._allseqids = {}
    self._clusters = []
    self._parse_file()
def descriptions(self):
    """
    Yields the lines of the file named by self._desc, skipping the lines
    that start with '#'.

    The file is closed when the iteration ends or the generator is closed.
    """
    dscFile = File(self._desc, 'r')
    try:
        for dsc_line in dscFile.descriptor:
            if not dsc_line.startswith('#'):
                yield dsc_line
    finally:
        dscFile.close()