def __init__(self, fasta_file, auto_load=10):
    '''
    @param:    fasta_file
    @pdef:     name of the FASTA file.
    @ptype:    {String} or {File}

    @param:    auto_load
    @pdef:     maximum number of sequences to autoload.
    @pdefault: 10
    @ptype:    {Integer}
    '''
    if isinstance(fasta_file, basestring):
        self._file = File(file_name=fasta_file, action='r')
    elif isinstance(fasta_file, File):
        self._file = File(file_name=fasta_file.full, action='r')
    else:
        raise AttributeError('Check the input of the Fasta object')

    self._sequences       = []
    self._sequenceID      = {}
    self._total_sequences = 0
    self._loaded          = False
    self._auto_load       = auto_load
    self._check_multifasta()

    self._index_file = None
    self._check_index()
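# A minimal usage sketch (not part of the original source; the file name is
# hypothetical, `File` is this package's file wrapper):
#
#     fasta = Fasta('sequences.fa')                          # autoload up to 10 sequences
#     fasta = Fasta(File('sequences.fa', 'r'), auto_load=0)  # defer loading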
def subset(self, sequence_ids, new_fasta_file, all_but=False,
           prefix_size=None, index=False, force=None):
    '''
    Creates a new {Fasta} with the requested subset of sequences.

    @param:    sequence_ids
    @pdef:     sequence identifier(s)
    @ptype:    {String}, {List} or {Set}

    @param:    new_fasta_file
    @pdef:     name of the new fasta file
    @ptype:    {String}

    @param:    all_but
    @pdef:     Flag. Instead of retrieving the given ids, we retrieve all
               except the given ids.
    @pdefault: _False_
    @ptype:    {Boolean}

    @param:    prefix_size
    @pdef:     maximum characters for the prefix. If _None_, all the
               characters are included.
    @pdefault: _None_
    @ptype:    {Integer}

    @param:    index
    @pdef:     also create the index file, if one exists for the source.
    @pdefault: _False_
    @ptype:    {Boolean}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @raises: {AttributeError} if sequence_ids is not a valid type.
    @return: {Fasta}
    '''
    sequences  = self.retrieve(sequence_ids, all_but, prefix_size)
    fasta_file = Fasta.build_multifasta(new_fasta_file, sequences, force)

    if self.has_index and index:
        idxfile = File(self.index_file)
        newidx  = File(fasta_file.file.full + '.idx', 'w')
        seqids  = set(fasta_file.sequence_identifiers)
        for idx in idxfile.read():
            if idx.split()[0].strip('>') in seqids:
                newidx.write(idx)
        idxfile.close()
        newidx.close()
        fasta_file.index_file = newidx.full

    return fasta_file
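# Usage sketch (hypothetical identifiers and output names), assuming `fasta`
# is a loaded Fasta instance:
#
#     picked = fasta.subset(['seq1', 'seq2'], 'picked.fa', index=True)
#     others = fasta.subset(['seq1', 'seq2'], 'others.fa', all_but=True)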
def _process(self, update=False):
    '''
    Transform the source files into the final local db files.

    @param:    update
    @pdef:     toggles between create and update processing
    @pdefault: _False_
    @ptype:    {Boolean}
    '''
    if update:
        old = self._RELEASE['total_items'].copy()
    j = 0
    for i in range(len(self._SOURCES)):
        dfilen = os.path.join(self.local, self._SOURCES[i])
        ofilen = os.path.join(self.local, self._MANDATORY_FILES[j])
        ffilen = os.path.join(self.local, self._MANDATORY_FILES[j + 1])
        if not os.path.isfile(dfilen):
            continue
        SBIg.alert('verbose', self, 'Parsing: {0}'.format(dfilen))
        SBIg.alert('verbose', self, 'DB file to: {0}'.format(ofilen))
        SBIg.alert('verbose', self, 'Fasta file to: {0}'.format(ffilen))
        dfile = File(dfilen)
        ofile = File(ofilen, 'w', update)
        ffile = File(ffilen, 'w', update)
        protein = None
        for protein in Connect._parse_uniprot(dfile):
            pname = protein.entry_name
            pvers = protein.version
            SBIg.alert('verbose', self, 'Protein: {0}'.format(pname))
            if not update:
                self._RELEASE['total_items'][pname] = pvers
            else:
                if pname not in self._RELEASE['total_items']:
                    self._RELEASE['new_items'][pname] = pvers
                else:
                    del old[pname]
                    if self._RELEASE['total_items'][pname] != pvers:
                        self._RELEASE['update_items'][pname] = pvers
            ffile.write(protein.sequence.format('FASTA') + '\n')
            ofile.write(protein.json() + '\n')
        j += 2
        dfile.close()
        ofile.close()
        ffile.close()
    if update:
        self._RELEASE['total_items'].update(self._RELEASE['new_items'])
        self._RELEASE['total_items'].update(self._RELEASE['update_items'])
        self._RELEASE['deleted_items'] = old
        for k in self._RELEASE['deleted_items']:
            del self._RELEASE['total_items'][k]
def make_PDBseq(self, log_file, resolution_threshold=None):
    if not self.has_local:
        raise NameError('A local PDB database must be defined to create a PDBseq database.')
    outdir = self.PDBseq if self.PDBseq is not None else os.curdir
    Path.mkdir(self.PDBseq)
    fasta_file = File(file_name=os.path.join(outdir, 'PDBseq.fa'),
                      action='w', overwrite=True)
    fasta_fd   = fasta_file.descriptor
    idx_file   = File(file_name=os.path.join(outdir, 'PDBseq.fa.idx'),
                      action='w', overwrite=True)
    idx_fd     = idx_file.descriptor
    if resolution_threshold is not None:
        # The filtered file and the resolution data are required below
        # whenever a resolution threshold is requested.
        filtered_file_name = self.get_PDBseq_filtered(resolution_threshold)
        filtered_file = File(file_name=filtered_file_name, action='w', overwrite=True)
        filtered_fd   = filtered_file.descriptor
        resolutions   = self.get_resolutions(resolution_threshold=resolution_threshold)
    log_file = File(file_name=log_file, action='w', overwrite=True)
    log_idx  = log_file.descriptor

    for pdb_file in self.localPDBs:
        log_idx.write("Reading File: {0}\n".format(pdb_file))
        newPDB    = PDB(pdb_file=pdb_file, dehydrate=True)
        fasta_idx = newPDB.FASTA_IDX(nucleotide=False)
        if len(fasta_idx['FASTA']) != len(fasta_idx['IDX']):
            log_idx.write('ERROR!!!!! Number of fastas and indexes are different for pdb {0}!!\n'.format(newPDB.id))
        if len(fasta_idx['FASTA']) > 0:
            log_idx.write('\tPrinting FASTA and IDX...\n')
        else:
            log_idx.write('\tProbably just a nucleotide PDB...\n')
        for c in range(len(fasta_idx['FASTA'])):
            sequence = fasta_idx['FASTA'][c].split('\n')[1]
            sequence = sequence.replace('X', '').replace('x', '')
            if len(sequence) > 0:
                fasta_fd.write(fasta_idx['FASTA'][c] + "\n")
                if resolution_threshold is not None and newPDB.id in resolutions and not newPDB.is_all_ca:
                    filtered_fd.write(fasta_idx['FASTA'][c] + "\n")
                idx_fd.write(fasta_idx['IDX'][c] + "\n")
        del newPDB

    # CLOSE & END
    fasta_file.close()
    idx_file.close()
    if resolution_threshold is not None:
        filtered_file.close()
def build(file_name, sequence_id, sequence, force=None):
    '''
    Creates a Fasta object and a FASTA file from a sequence.

    @param:    file_name
    @pdef:     name of the fasta file (with path, if necessary)
    @ptype:    {String}

    @param:    sequence_id
    @pdef:     name of the sequence
    @ptype:    {String}

    @param:    sequence
    @pdef:     sequence
    @ptype:    {String} or {List}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @return: {Fasta}
    '''
    newFasta = File(file_name, 'w', overwrite=force)
    newSeq   = Sequence(sequence_id=sequence_id, sequence=sequence)
    newFasta.write(newSeq.format('FASTA'))
    newFasta.close()
    return Fasta(fasta_file=newFasta.full, auto_load=0)
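# Usage sketch (hypothetical file name and sequence), assuming `build` is
# exposed as a static method of Fasta:
#
#     fasta = Fasta.build('single.fa', 'seq1', 'MARNDCEQGHILKF', force=True)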
def build_multifasta(file_name, sequence_list, force=None):
    '''
    Creates a Fasta object and a FASTA file from multiple sequences.

    @param:    file_name
    @pdef:     name of the fasta file (with path, if necessary)
    @ptype:    {String}

    @param:    sequence_list
    @pdef:     list of sequences to create the FASTA from.
    @ptype:    {List} or {Set} of {Sequence}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @return: {Fasta}
    '''
    newFasta = File(file_name, 'w', overwrite=force)
    for sequence in sequence_list:
        newFasta.write(sequence.format('FASTA') + '\n')
    newFasta.close()
    fasta_file = Fasta(fasta_file=newFasta.full, auto_load=0)
    return fasta_file
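# Usage sketch (hypothetical data), assuming this package's Sequence class:
#
#     seqs  = [Sequence(sequence_id='s1', sequence='MARND'),
#              Sequence(sequence_id='s2', sequence='CEQGH')]
#     fasta = Fasta.build_multifasta('multi.fa', seqs, force=True)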
def _parse(self):
    file_fd    = File(self._dsspfile)
    read       = False
    continuity = -1000
    readline   = 0
    for line in file_fd.read():
        if line.startswith("  #  RESIDUE AA STRUCTURE BP1 BP2  ACC"):
            read = True
            continue
        if read:
            if line[13:14] != '!':
                res_num = int(line[6:10].strip())
                ss      = line[16:17] if line[16:17] != ' ' else '-'
                buried  = int(line[35:38].strip())
                aa      = line[13:15].strip()
                self._dsspdata.append(DSSP(secondary_structure=ss,
                                           accessibility=buried,
                                           amino=aa))
                self._dsspdata[-1].add_hydrogen_links(line[39:50],
                                                      line[50:61],
                                                      line[61:72],
                                                      line[72:84])
                if readline > 0:
                    if res_num != continuity + 1:
                        self._gapped = True
                continuity = res_num
                readline  += 1
            else:
                msg = "truncated chain!{0}\n".format(self._dsspfile)
                sys.stderr.write(msg)
                SBIg.warn(self, msg)
                self._gapped = True
    file_fd.close()
def _save_release(self):
    '''
    Store the release data into a file.
    '''
    f = File(os.path.join(self.local, self._CONTROL_FILE), 'w', True)
    f.write(json.dumps(self._RELEASE))
    f.close()
def _process(self):
    enzymes = self._parse_enzclass() + self._parse_enzymedat()
    enzymes.sort()

    enzFile = File(self._enzfile, 'w', True)
    for e in enzymes:
        enzFile.write(repr(e) + "\n")
    enzFile.close()
def get_FASTA_IDX_by_names_to_file(self, names, outfile):
    fastafile     = Fasta(self.PDBseq)
    selectedfasta = fastafile.retrieve(copy.deepcopy(names))
    output_fasta  = File(outfile, 'w')
    for sequence in selectedfasta:
        output_fasta.write(sequence.format('FASTA') + "\n")
    output_fasta.close()

    idxfile    = self.PDBseq + '.idx'
    output_idx = File(outfile + '.idx', 'w')
    input_idx  = File(idxfile, 'r')
    for line in input_idx.descriptor:
        info    = line.split()
        pdbname = info[0][1:]
        if pdbname in names:
            output_idx.write(line)
    input_idx.close()
    output_idx.close()
def format2file(self, filename, extension='pdb', center=False):
    if extension not in ('pdb', 'js'):
        raise AttributeError('Not an accepted extension')
    structure = File('.'.join([filename, extension]), 'w')
    if extension == 'pdb':
        structure.write(self.pdb_format(center=center))
    elif extension == 'js':
        structure.write(self.js_format(center=center))
    structure.close()
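# Usage sketch (hypothetical base name), assuming `obj` is an instance of the
# enclosing structure class; this writes 'model.pdb' or 'model.js':
#
#     obj.format2file('model', extension='pdb', center=True)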
def _process(self):
    targets = self._process_targets()
    drugs   = self._process_drugs(targets)

    drugFile = File(self._drugfile, 'w', True)
    for d in drugs:
        drugFile.write(repr(d) + "\n")
    drugFile.close()
def _process(self):
    go_dic = {}

    parseFile = File(os.path.join(self.local, self._gfile), 'r')
    go = None
    for line in parseFile.descriptor:
        line = re.sub('\'', '\\\'', line)
        if line.startswith('[Term]'):
            if go is not None:
                go_dic[go.id] = go
        if line.startswith('id:'):
            go = GOterm(id=line.split()[1].strip())
            continue
        if line.startswith('name:'):
            go.name = " ".join(line.split()[1:]).strip()
            continue
        if line.startswith('namespace:'):
            go.namespace = line.split()[1].strip()
            continue
        if line.startswith('alt_id:'):
            go.alt_id.append(line.split()[1].strip())
            continue
        if line.startswith('is_obsolete:'):
            go.obsolete = True
            continue
        if line.startswith('is_a:'):
            go.parents.add(line.split()[1].strip())
            continue
        if line.startswith('relationship:'):
            go.relations.append((line.split()[1].strip(),
                                 line.split()[2].strip()))
            continue
        if line.startswith('[Typedef]'):
            go_dic[go.id] = go
            break
    parseFile.close()

    for go in go_dic:
        go_dic[go].parents = self._search_parents(go_dic, go)

    goFile = File(self._gofile, 'w', True)
    for go in go_dic:
        go_dic[go].parents.add(go)
        goFile.write(str(go_dic[go]) + "\n")
    goFile.close()
def read_compacted_blast(compacted_blast_file):
    '''
    Read data from a printed compacted blast into {BlastResult}.
    Not all options will be available in that new object.

    @param:    compacted_blast_file
    @pdef:     file of the compacted blast print
    @ptype:    {String}

    @return: {BlastResult}
    '''
    from BlastHit import BlastHit
    query_name, query_sequence     = None, None
    version, matrix, database      = None, None, None
    gap_open, gap_extend, self_hit = None, None, None
    br  = None
    cbf = File(compacted_blast_file)
    for line in cbf.read():
        if line.startswith('#'):
            if line.startswith('#Query:'):
                query_name = line.strip().split()[-1]
            if line.startswith('#Query Sequence:'):
                query_sequence = line.strip().split()[-1]
            if line.startswith('#Blast Version:'):
                version = line.strip().split()[-1]
            if line.startswith('#Search on matrix:'):
                matrix = line.strip().split()[-1]
            if line.startswith('#Gap open penalty:'):
                gap_open = line.strip().split()[-1]
            if line.startswith('#Gap extension penalty:'):
                gap_extend = line.strip().split()[-1]
            if line.startswith('#Database searched:'):
                database = line.strip().split()[-1]
            if line.startswith('#Self Hit is omitted:'):
                self_hit = line.strip().split()[-1]
        else:
            if br is None:
                if version is None:
                    bh = None
                else:
                    bh = BlastHeader(version, matrix, gap_open, gap_extend,
                                     database, self_hit)
                br = BlastResult(query_name, query_sequence, bh)
            d   = line.strip().split()
            hit = BlastHit([d[2], d[3]], [d[8], d[9]],
                           [int(x) for x in d[10].split(',')[0].split(':')],
                           1, [d[4], d[5], d[6], d[7]])
            br.add_hit(hit)
    cbf.close()
    return br
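# Usage sketch (hypothetical file name); the input is expected to be a file
# previously written by print_compacted_blast():
#
#     br = read_compacted_blast('query.compacted.blast')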
def pdb_file(self, value):
    """
    Sets a PDB file if none has been given.

    @raise AttributeError
    """
    if self._pdb_file is not None:
        raise AttributeError(
            "The PDB object is loaded from file {0}. "
            "To load the new file {1} create a new PDB object.".format(
                self._pdb_file.full, value))
    if isinstance(value, File):
        self._pdb_file = value
    else:
        self._pdb_file = File(file_name=value, action='r')
def _process(self):
    inh = {}

    nodefile = File(file_name=self._nodes, action='r')
    for line in nodefile.descriptor:
        line      = re.sub('\'', '\\\'', line)
        line_data = line.split('|')
        inh[line_data[0].strip()]        = TaxID(line_data[0].strip())
        inh[line_data[0].strip()].parent = line_data[1].strip()
        inh[line_data[0].strip()].rank   = line_data[2].strip()
    nodefile.close()

    namefile = File(file_name=self._names, action='r')
    for line in namefile.descriptor:
        line      = re.sub('\'', '\\\'', line)
        line_data = line.split('|')
        if line_data[3].strip() == 'scientific name':
            inh[line_data[0].strip()].name = line_data[1].strip()
    namefile.close()

    delefile = File(file_name=self._delet, action='r')
    for line in delefile.descriptor:
        data = line.split('|')
        inh[data[0].strip()]     = TaxID(data[0].strip())
        inh[data[0].strip()].old = True
    delefile.close()

    mrgefile = File(file_name=self._merged, action='r')
    for line in mrgefile.descriptor:
        data = line.split('|')
        inh[data[0].strip()]     = TaxID(data[0].strip())
        inh[data[0].strip()].old = True
        inh[data[0].strip()].new = data[1].strip()
    mrgefile.close()

    taxFile = File(self._taxid, 'w', True)
    for taxid in inh:
        taxFile.write(str(inh[taxid]) + "\n")
    taxFile.close()
def release(self):
    '''
    Retrieves release data for the database. This refers not to the DB's own
    release, but to when we downloaded it.

    @returns: {Dictionary}
    '''
    if os.path.isfile(os.path.join(self.local, self._CONTROL_FILE)):
        f = File(os.path.join(self.local, self._CONTROL_FILE))
        data = json.loads(f.read())
        f.close()
    else:
        data = self._RELEASE
    return data
def write(self, output_file=None, format='PDB', force=None, clean=False):
    """
    Writes the object in a specific format.

    @type  output_file: String
    @param output_file: File to write

    @type  format: String
    @param format: Format of the file to print
    """
    outfile = File(file_name=output_file, action='w',
                   overwrite=SBIg.decide_overwrite(force))
    if format == 'PDB':
        self._write_PDB_file(pdb_file=outfile, clean=clean)
def print_compacted_blast(self, out_file=None):
    '''
    Print the compacted format of the blast hit.

    @param:    out_file
    @pdef:     file to print the blast data into.
    @pdefault: _None_
    @ptype:    {String}
    '''
    if out_file is not None:
        output = File(out_file, 'w')
        output.write("%s\n" % self.str_compacted_blast())
        output.close()
    else:
        print self.str_compacted_blast()
def get_PDBeChem(self, chemID):
    if self.has_local:
        for chem_file in self.localPDBeChems:
            newfile = File(file_name=chem_file, action='r')
            if newfile.prefix.upper() == chemID.upper():
                return chem_file
    # If we do not find it in local (or we do not have a local)
    # we search for it on the FTP.
    chem_file = chemID.upper() + '.cif'
    source    = PDBeChemftp['single'] + chem_file
    try:
        urllib.urlretrieve(source, chem_file)
    except:
        return False
    return os.path.abspath(chem_file)
def get_PDBeChems(self, chemIDset):
    if isinstance(chemIDset, str):
        warnings.warn('For a single PDBeChem search, the get_PDBeChem function is recommended.')
        yield self.get_PDBeChem(chemIDset)
    else:
        chemIDset = set([x.upper() for x in chemIDset])
        if self.has_local:
            for chem_file in self.localPDBeChems:
                newfile = File(file_name=chem_file, action='r')
                if newfile.prefix.lstrip('pdb').upper() in chemIDset:
                    yield chem_file
        else:
            for chemID in chemIDset:
                yield self.get_PDBeChem(chemID)
def items(self):
    '''
    Loops through the items of the database.

    @yields: Object depending on the database.
    '''
    if not self.has_local:
        SBIg.throw(self, 'A local database needs to be built first', IOError)

    for ifile in self._ITEM_FILES:
        ifile = os.path.join(self.local, ifile)
        f = File(ifile)
        for line in f.read():
            yield self._DBOBJECT.grab(line.strip())
        f.close()
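# Usage sketch, assuming `db` is an instance of the enclosing database class
# with a built local copy:
#
#     for entry in db.items():
#         do_something(entry)   # `do_something` is a placeholder for user code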
def get_PDBs(self, pdbIDset):
    if isinstance(pdbIDset, str):
        warnings.warn('For a single PDB search, the get_PDB function is recommended.')
        yield self.get_PDB(pdbIDset)
    else:
        pdbIDset = set([x.upper() for x in pdbIDset])
        if self.has_local:
            for pdb_file in self.localPDBs:
                newfile = File(file_name=pdb_file, action='r')
                if newfile.prefix.lstrip('pdb').upper() in pdbIDset:
                    yield pdb_file
        else:
            for pdbID in pdbIDset:
                yield self.get_PDB(pdbID)
def __init__(self, cif_file):
    self._file = File(file_name=cif_file, action='r')
    # This must be included in every class for the SBIglobals.alert()
    self.__name__ = 'databases.PDBeChem'

    self._id      = None
    self._name    = None
    self._type    = None
    self._formula = None
    self._parent  = None
    self._weight  = None
    self._fcharge = None
    self._code1l  = None

    self._flformula = {}

    self._parse()
    self._decompose_formula()
def get_PDB(self, pdbID):
    if self.has_local:
        rootdir = os.path.join(self.local, pdbID.lower()[1:3])
        for pdb_file in Path.list_files(root=rootdir, pattern='*.ent.gz'):
            newfile = File(file_name=pdb_file, action='r')
            if newfile.prefix.lstrip('pdb').upper() == pdbID.upper():
                return pdb_file
    # If we do not find it in local (or we do not have a local)
    # we search for it on the FTP.
    pdb_file = 'pdb' + pdbID.lower() + '.ent.gz'
    source   = 'ftp://' + PDBftp['address'] + os.path.join(PDBftp['structures'],
                                                           pdbID[1:3].lower(),
                                                           pdb_file)
    try:
        urllib.urlretrieve(source, pdb_file)
    except:
        return False
    return os.path.abspath(pdb_file)
def __init__(self, cdhit_file=None):
    '''
    @param:    cdhit_file
    @pdef:     name of the cd-hit output file
    @pdefault: _None_. Create an empty list
    @ptype:    {String}
    '''
    self._clusters  = []
    self._allseqids = {}

    if cdhit_file is not None:
        self._file = File(file_name=cdhit_file)
    else:
        self._file = None

    if self._file is not None:
        self._parse_file()
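# Usage sketch (hypothetical file name; `CDhit` stands in for the enclosing
# class, which parses cd-hit cluster output):
#
#     clusters = CDhit(cdhit_file='db90.clstr')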
def print_representation(self, line_split=160, out_file=None):
    '''
    Print the alignment representation of the blast hit.

    @param:    line_split
    @pdef:     number of characters per line
    @pdefault: 160
    @ptype:    {Integer}

    @param:    out_file
    @pdef:     file to print the blast data into.
    @pdefault: _None_
    @ptype:    {String}
    '''
    if out_file is not None:
        output = File(out_file, 'w')
        output.write("%s\n" % self.str_representation(line_split))
        output.close()
    else:
        print self.str_representation(line_split)
def get_resolutions(self):
    # resolutions of -1 are for methods that do not define resolution
    resolutions = {}
    ftp = ftplib.FTP(PDBftp['address'])
    ftp.login()
    ftp.cwd(PDBftp['derived'])
    resoluIDX = []
    ftp.retrlines('RETR ' + PDBftp['resolution'], resoluIDX.append)
    ftp.quit()
    SBIglobals.alert('debug', self, 'Retrieving resolution data from PDB FTP...')
    active = False
    for line in resoluIDX:
        if line.startswith('-'):
            active = True
            continue
        if active and len(line.strip()) > 0:
            data = [x.strip() for x in line.split(';')]
            if len(data[1]) > 0:
                SBIglobals.alert('debug', self,
                                 '\tResolution for {0[0]} is {0[1]}...'.format(data))
                resolutions[data[0]] = data[1]
    # rsync is cumulative; we might have structures that are no longer in the
    # resolution index, so those must be checked directly.
    for pdb_file in self.localPDBs:
        newfile = File(file_name=pdb_file, action='r')
        pdbid   = newfile.prefix.lstrip('pdb').upper()
        if pdbid not in resolutions:
            pdbobj = PDB(pdb_file=pdb_file, header=True, onlyheader=True)
            SBIglobals.alert('debug', self,
                             '\tGrabbing Resolution for {0} is {1}...'.format(
                                 pdbid, pdbobj.header.resolution))
            resolutions[pdbid] = pdbobj.header.resolution
    return resolutions
def _process(self):
    tmoFile = File(self._pdbtmfile, 'w', True)
    for xmlfile in Path.list_files(os.path.join(self._local, 'pdbtm/database/'), '*.xml'):
        xmldata = TM(pdb=os.path.splitext(os.path.split(xmlfile)[1])[0].upper())
        skip_chains = set()
        read  = False
        fdxml = open(xmlfile)
        for line in fdxml:
            if line.startswith(' <TMRES>'):
                xmldata.tmres = line
            elif line.startswith(' <TMTYPE'):
                xmldata.tmtype = line
            elif line.startswith(' <PDBKWRES'):
                xmldata.kwres = line
            elif line.startswith(' <SIDEDEFINITION'):
                m = re.search('Side1="(\S+)"', line)
                xmldata.side = m.group(1)
            elif line.startswith(' <APPLY_TO_CHAIN'):
                m = re.search('NEW_CHAINID=\"(\S{1})\"', line)
                if m:
                    skip_chains.add(m.group(1))
            elif line.startswith(' <CHAIN '):
                m = re.search('CHAINID=\"(\S{1})\" NUM_TM=\"(\d{1})\" TYPE=\"(\S+)\"', line)
                if m:
                    chain, num, tmtype = m.group(1), m.group(2), m.group(3)
                    if chain not in skip_chains:
                        cdata = tuple([chain, num, tmtype])
                        xmldata.set_chain(cdata)
                        read = True
            elif line.startswith(' <REGION ') and read:
                m = re.search('pdb_beg=\"(\-*\d+\w*)\"[\s\S]+pdb_end=\"(\-*\d+\w*)\"\s+type=\"(\w{1})\"', line)
                ini, end, tmtype = m.group(1), m.group(2), m.group(3)
                xmldata.set_chain(cdata, tuple([ini, end, tmtype]))
            elif line.startswith(' </CHAIN>'):
                read = False
        fdxml.close()
        if len(xmldata.chains) > 0:
            tmoFile.write(str(xmldata) + "\n")
    tmoFile.close()
def reduce(self, new_fasta_file, list_file, force=None):
    '''
    Reduces the {Fasta} by removing identical sequences.

    @param:    new_fasta_file
    @pdef:     name of the new fasta file
    @ptype:    {String}

    @param:    list_file
    @pdef:     name of the repetition list file
    @ptype:    {String}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @return: {Fasta} and {File} with the list of identical sequences.
    '''
    seq_md5   = {}
    sequences = []
    for seq in self.live_show():
        md5 = seq.md5
        if md5 not in seq_md5:
            sequences.append(seq)
            seq_md5.setdefault(md5, [])
        else:
            SBIg.alert('debug', self,
                       '{0} repeats of {1}'.format(seq.id, seq_md5[md5][0]))
        seq_md5[md5].append(seq.id)
    fasta = Fasta.build_multifasta(new_fasta_file, sequences, force)

    listfile = File(list_file, 'w')
    for md5 in seq_md5:
        listfile.write('\t'.join(seq_md5[md5]) + '\n')
    listfile.close()
    return fasta, listfile
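# Usage sketch (hypothetical output names), assuming `fasta` is a loaded
# Fasta instance:
#
#     nr_fasta, repeats_list = fasta.reduce('nr.fa', 'nr.groups.txt', force=True)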