def _parse(self):
    file_fd = File(self._dsspfile)
    read = False
    continuity = -1000
    readline = 0
    for line in file_fd.read():
        # Header line of the DSSP residue table.
        if line.startswith("  #  RESIDUE AA STRUCTURE BP1 BP2  ACC"):
            read = True
            continue
        if read:
            if line[13:14] != '!':
                res_num = int(line[6:10].strip())
                ss = line[16:17] if line[16:17] != ' ' else '-'
                buried = int(line[35:38].strip())
                aa = line[13:15].strip()
                self._dsspdata.append(DSSP(secondary_structure=ss,
                                           accessibility=buried,
                                           amino=aa))
                self._dsspdata[-1].add_hydrogen_links(line[39:50], line[50:61],
                                                      line[61:72], line[72:84])
                if readline > 0:
                    if res_num != continuity + 1:
                        self._gapped = True
                continuity = res_num
                readline += 1
            else:
                msg = "truncated chain!{0}\n".format(self._dsspfile)
                sys.stderr.write(msg)
                SBIg.warn(self, msg)
                self._gapped = True
    file_fd.close()
def sortarchs(inputdir, outputdir):
    archsdir = outputdir
    Path.mkdir(archsdir)
    sorted_archs = {}
    loop_file_name = os.path.join(archsdir, 'ArchDB.{0}.db')
    loop_split_file_name = os.path.join(archsdir, 'ArchDB.{0}.{1:02d}-{2:02d}.db')
    sections_ini = [0, 4, 7, 14, 21]
    sections_end = [4, 6, 13, 20, 0]
    for archfile in Path.list_files(root=inputdir, pattern='*.archObj'):
        filename = os.path.basename(archfile)
        data = filename.split('_')
        length = int(data[0])
        archtype = data[1]
        sorted_archs.setdefault(archtype, {}).setdefault(length, [])
        sorted_archs[archtype][length].append(archfile)

    for archtype in sorted_archs:
        SBIglobals.alert('verbose', None, "ARCHS: " + archtype + "\n")
        fd = File(loop_file_name.format(archtype), 'w')
        fdp = []
        for x in range(len(sections_ini)):
            fdp.append(File(loop_split_file_name.format(archtype,
                                                        sections_ini[x],
                                                        sections_end[x]), 'w'))
        for length in sorted(sorted_archs[archtype]):
            SBIglobals.alert('verbose', None, '\t{0}'.format(length))
            for archfile in sorted_archs[archtype][length]:
                SBIglobals.alert('verbose', None, '\t\t{0}\n'.format(archfile))
                nsp = Arch.load(archfile)
                fd.descriptor.write(nsp.archtype_format() + "\n")
                for x in range(len(fdp)):
                    if length >= sections_ini[x] and (sections_end[x] == 0 or length <= sections_end[x]):
                        fdp[x].descriptor.write(nsp.archtype_format() + "\n")
        fd.close()
        for x in range(len(fdp)):
            fdp[x].close()
def __init__(self, fasta_file, auto_load=10):
    '''
    @param:    fasta_file
    @pdef:     name of the FASTA file.
    @ptype:    {String} or {File}

    @param:    auto_load
    @pdef:     maximum number of sequences to autoload.
    @pdefault: 10
    @ptype:    {Integer}
    '''
    if isinstance(fasta_file, basestring):
        self._file = File(file_name=fasta_file, action='r')
    elif isinstance(fasta_file, File):
        self._file = File(file_name=fasta_file.full, action='r')
    else:
        raise AttributeError('Check the input of the Fasta object')

    self._sequences       = []
    self._sequenceID      = {}
    self._total_sequences = 0
    self._loaded          = False
    self._auto_load       = auto_load
    self._check_multifasta()

    self._index_file = None
    self._check_index()
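# Usage sketch (not from the original module): the file name below is a
# hypothetical placeholder; it assumes this __init__ belongs to the Fasta
# class and that Fasta and File are importable from this package.
#
#   fasta = Fasta('proteins.fa', auto_load=10)                  # from a path
#   fasta = Fasta(File(file_name='proteins.fa', action='r'))    # from a {File}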
def read_compacted_blast(compacted_blast_file):
    '''
    Read data from a printed compacted blast into {BlastResult}.
    Not all options will be available in that new object.

    @param:  compacted_blast_file
    @pdef:   file of the compacted blast print
    @ptype:  {String}

    @return: {BlastResult}
    '''
    from BlastHit import BlastHit
    query_name, query_sequence     = None, None
    version, matrix, database      = None, None, None
    gap_open, gap_extend, self_hit = None, None, None
    br  = None
    cbf = File(compacted_blast_file)
    for line in cbf.read():
        if line.startswith('#'):
            if line.startswith('#Query:'):
                query_name = line.strip().split()[-1]
            if line.startswith('#Query Sequence:'):
                query_sequence = line.strip().split()[-1]
            if line.startswith('#Blast Version:'):
                version = line.strip().split()[-1]
            if line.startswith('#Search on matrix:'):
                matrix = line.strip().split()[-1]
            if line.startswith('#Gap open penalty:'):
                gap_open = line.strip().split()[-1]
            if line.startswith('#Gap extension penalty:'):
                gap_extend = line.strip().split()[-1]
            if line.startswith('#Database searched:'):
                database = line.strip().split()[-1]
            if line.startswith('#Self Hit is omitted:'):
                self_hit = line.strip().split()[-1]
        else:
            if br is None:
                if version is None:
                    bh = None
                else:
                    bh = BlastHeader(version, matrix, gap_open, gap_extend,
                                     database, self_hit)
                br = BlastResult(query_name, query_sequence, bh)
            d = line.strip().split()
            hit = BlastHit([d[2], d[3]], [d[8], d[9]],
                           [int(x) for x in d[10].split(',')[0].split(':')],
                           1, [d[4], d[5], d[6], d[7]])
            br.add_hit(hit)
    cbf.close()
    return br
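# Usage sketch (hypothetical file name; assumes this helper is reachable as a
# module-level function or a static method of BlastResult):
#
#   br = read_compacted_blast('query.compacted.blast.out')
#   print br.query               # query name recovered from the header lines
#   for hit in br.raw_hits:      # hits rebuilt from the tabulated lines
#       print hit.sequenceID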
def pdb_file(self, value):
    """
    Sets a PDB file if none has been given.
    @raise AttributeError
    """
    if self._pdb_file is not None:
        raise AttributeError("The PDB object is loaded from file {0}. To load the new file {1} create a new PDB object".format(self._pdb_file.full, value))

    if isinstance(value, File):
        self._pdb_file = value
    else:
        self._pdb_file = File(file_name=value, action='r')
def release(self):
    '''
    Retrieves release data for the database. Not according to the DB release,
    but to when we downloaded it.

    @returns: {Dictionary}
    '''
    if os.path.isfile(os.path.join(self.local, self._CONTROL_FILE)):
        f = File(os.path.join(self.local, self._CONTROL_FILE))
        data = json.loads(f.read())
        f.close()
    else:
        data = self._RELEASE
    return data
def __init__(self, cif_file):
    self._file = File(file_name=cif_file, action='r')

    # This must be included in every class for the SBIglobals.alert()
    self.__name__ = 'databases.PDBeChem'

    self._id      = None
    self._name    = None
    self._type    = None
    self._formula = None
    self._parent  = None
    self._weight  = None
    self._fcharge = None
    self._code1l  = None

    self._flformula = {}

    self._parse()
    self._decompose_formula()
def items(self):
    '''
    Loops through the items of the database.

    @yields: Object depending on the database.
    '''
    if not self.has_local:
        SBIg.throw(self, 'A local database needs to be built first', IOError)

    for ifile in self._ITEM_FILES:
        ifile = os.path.join(self.local, ifile)
        f = File(ifile)
        for line in f.read():
            yield self._DBOBJECT.grab(line.strip())
        f.close()
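# Usage sketch (hypothetical subclass and path; assumes a concrete database
# link class built on top of this base, with a local copy already created):
#
#   db = SomeDBlink(local='/databases/somedb')   # SomeDBlink is a placeholder
#   for entry in db.items():                     # yields one parsed object per line
#       print entry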
def format2file(self, filename, extension='pdb', center=False):
    if extension not in ('pdb', 'js'):
        raise AttributeError('Not accepted extension')
    structure = File('.'.join([filename, extension]), 'w')
    if extension == 'pdb':
        structure.write(self.pdb_format(center=center))
    elif extension == 'js':
        structure.write(self.js_format(center=center))
    structure.close()
def make_PDBseq(self, log_file, resolution_threshold=None):
    if not self.has_local:
        raise NameError('A local PDB database must be defined to create a PDBseq database.')
    outdir = self.PDBseq if self.PDBseq is not None else os.curdir
    Path.mkdir(self.PDBseq)
    fasta_file = File(file_name=os.path.join(outdir, 'PDBseq.fa'),
                      action='w', overwrite=True)
    fasta_fd = fasta_file.descriptor
    idx_file = File(file_name=os.path.join(outdir, 'PDBseq.fa.idx'),
                    action='w', overwrite=True)
    idx_fd = idx_file.descriptor
    filtered_fd = None
    resolutions = {}
    if resolution_threshold is not None:
        # The filtered output and the resolution map are only needed when a
        # resolution threshold is requested.
        filtered_file_name = self.get_PDBseq_filtered(resolution_threshold)
        filtered_file = File(file_name=filtered_file_name, action='w',
                             overwrite=True)
        filtered_fd = filtered_file.descriptor
        resolutions = self.get_resolutions()
    log_file = File(file_name=log_file, action='w', overwrite=True)
    log_idx = log_file.descriptor

    for pdb_file in self.localPDBs:
        log_idx.write("Reading File: {0}\n".format(pdb_file))
        newPDB = PDB(pdb_file=pdb_file, dehydrate=True)
        fasta_idx = newPDB.FASTA_IDX(nucleotide=False)
        if len(fasta_idx['FASTA']) != len(fasta_idx['IDX']):
            log_idx.write('ERROR!!!!! Number of fastas and indexes are different for pdb {0}!!\n'.format(newPDB.id))
        if len(fasta_idx['FASTA']) > 0:
            log_idx.write('\tPrinting FASTA and IDX...\n')
        else:
            log_idx.write('\tProbably just a nucleotide PDB...\n')
        for c in range(len(fasta_idx['FASTA'])):
            sequence = fasta_idx['FASTA'][c].split('\n')[1]
            sequence = sequence.replace('X', '').replace('x', '')
            if len(sequence) > 0:
                fasta_fd.write(fasta_idx['FASTA'][c] + "\n")
                if resolution_threshold is not None and newPDB.id in resolutions and not newPDB.is_all_ca:
                    filtered_fd.write(fasta_idx['FASTA'][c] + "\n")
                idx_fd.write(fasta_idx['IDX'][c] + "\n")
        del(newPDB)

    # CLOSE & END
    fasta_file.close()
    idx_file.close()
    if resolution_threshold is not None:
        filtered_fd.close()
def __init__(self, cdhit_file=None):
    '''
    @param:    cdhit_file
    @pdef:     name of the cd-hit output file
    @pdefault: _None_. Create an empty list
    @ptype:    {String}
    '''
    self._clusters  = []
    self._allseqids = {}

    if cdhit_file is not None:
        self._file = File(file_name=cdhit_file)
    else:
        self._file = None

    if self._file is not None:
        self._parse_file()
def _process(self):
    go_dic = {}

    parseFile = File(os.path.join(self.local, self._gfile), 'r')

    go = None
    for line in parseFile.descriptor:
        line = re.sub('\'', '\\\'', line)
        if line.startswith('[Term]'):
            if go is not None:
                go_dic[go.id] = go
        if line.startswith('id:'):
            go = GOterm(id=line.split()[1].strip())
            continue
        if line.startswith('name:'):
            go.name = " ".join(line.split()[1:]).strip()
            continue
        if line.startswith('namespace:'):
            go.namespace = line.split()[1].strip()
            continue
        if line.startswith('alt_id:'):
            go.alt_id.append(line.split()[1].strip())
            continue
        if line.startswith('is_obsolete:'):
            go.obsolete = True
            continue
        if line.startswith('is_a:'):
            go.parents.add(line.split()[1].strip())
            continue
        if line.startswith('relationship:'):
            go.relations.append((line.split()[1].strip(), line.split()[2].strip()))
            continue
        if line.startswith('[Typedef]'):
            go_dic[go.id] = go
            break
    parseFile.close()

    for go in go_dic:
        go_dic[go].parents = self._search_parents(go_dic, go)

    goFile = File(self._gofile, 'w', True)
    for go in go_dic:
        go_dic[go].parents.add(go)
        goFile.write(str(go_dic[go]) + "\n")
    goFile.close()
def _save_release(self):
    '''
    Store the release data into a file.
    '''
    f = File(os.path.join(self.local, self._CONTROL_FILE), 'w', True)
    f.write(json.dumps(self._RELEASE))
    f.close()
def build_multifasta(file_name, sequence_list, force=None):
    '''
    Creates a Fasta object and a FASTA file. For multiple sequences.

    @param:    file_name
    @pdef:     name of the fasta file (with path, if necessary)
    @ptype:    {String}

    @param:    sequence_list
    @pdef:     list of sequences to create the FASTA from.
    @ptype:    {List} or {Set} of {Sequence}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @return: {Fasta}
    '''
    newFasta = File(file_name, 'w', overwrite=force)
    for sequence in sequence_list:
        newFasta.write(sequence.format('FASTA') + '\n')
    newFasta.close()
    fasta_file = Fasta(fasta_file=newFasta.full, auto_load=0)
    return fasta_file
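# Usage sketch (hypothetical sequences and output name; assumes
# build_multifasta is exposed as a static method of Fasta and that Sequence
# is available in this module):
#
#   seqs = [Sequence(sequence_id='seqA', sequence='MKTAYIAKQR'),
#           Sequence(sequence_id='seqB', sequence='GAVLIMFWPS')]
#   multi = Fasta.build_multifasta('all_sequences.fa', seqs, force=True)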
def build(file_name, sequence_id, sequence, force=None):
    '''
    Creates a Fasta object and a FASTA file from a sequence.

    @param:    file_name
    @pdef:     name of the fasta file (with path, if necessary)
    @ptype:    {String}

    @param:    sequence_id
    @pdef:     name of the sequence
    @ptype:    {String}

    @param:    sequence
    @pdef:     sequence
    @ptype:    {String} or {List}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @return: {Fasta}
    '''
    newFasta = File(file_name, 'w', overwrite=force)
    newSeq = Sequence(sequence_id=sequence_id, sequence=sequence)
    newFasta.write(newSeq.format('FASTA'))
    newFasta.close()
    return Fasta(fasta_file=newFasta.full, auto_load=0)
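# Usage sketch (hypothetical names; assumes build is exposed as a static
# method of Fasta):
#
#   single = Fasta.build('single_sequence.fa', 'seqA', 'MKTAYIAKQR', force=True)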
def _process(self):
    enzymes = self._parse_enzclass() + self._parse_enzymedat()
    enzymes.sort()

    enzFile = File(self._enzfile, 'w', True)
    for e in enzymes:
        enzFile.write(repr(e) + "\n")
    enzFile.close()
def write(self, output_file=None, format='PDB', force=None, clean=False):
    """
    Writes the object in a specific format.

    @type  output_file: String
    @param output_file: File to write

    @type  format: String
    @param format: Format of the file to print
    """
    outfile = File(file_name=output_file, action='w',
                   overwrite=SBIg.decide_overwrite(force))

    if format == 'PDB':
        self._write_PDB_file(pdb_file=outfile, clean=clean)
def _process(self):
    targets = self._process_targets()
    drugs = self._process_drugs(targets)

    drugFile = File(self._drugfile, 'w', True)
    for d in drugs:
        drugFile.write(repr(d) + "\n")
    drugFile.close()
def get_PDBeChems(self, chemIDset):
    if isinstance(chemIDset, str):
        warnings.warn('For single PDBeChem search the get_PDBeChem function is recommended.')
        yield self.get_PDBeChem(chemIDset)
    else:
        chemIDset = set([x.upper() for x in chemIDset])
        if self.has_local:
            for chem_file in self.localPDBeChems:
                newfile = File(file_name=chem_file, action='r')
                if newfile.prefix.lstrip('pdb').upper() in chemIDset:
                    yield chem_file
        else:
            for chemID in chemIDset:
                yield self.get_PDBeChem(chemID)
def get_PDBeChem(self, chemID):
    if self.has_local:
        for chem_file in self.localPDBeChems:
            newfile = File(file_name=chem_file, action='r')
            if newfile.prefix.upper() == chemID.upper():
                return chem_file
    # If we do not find it locally (or we do not have a local copy),
    # we search for it on the FTP.
    chem_file = chemID.upper() + '.cif'
    source = PDBeChemftp['single'] + chem_file
    try:
        urllib.urlretrieve(source, chem_file)
    except:
        return False
    return os.path.abspath(chem_file)
def __init__(self, pdb_file=None, dehydrate=False, header=False,
             onlyheader=False, biomolecule=False):
    """
    @type  pdb_file: String
    @param pdb_file: PDB formatted file to read

    @raise IOError if pdb_file does not exist and it is not an empty object
    """
    if biomolecule or onlyheader:
        header = True

    self._pdb_file   = pdb_file
    self._chains     = []
    self._NMR        = False
    self._NMR_chains = []
    self._chain_id   = set()

    self._biomol_id  = -1   # -1 -> original
                            #  0 -> symmetry
                            # >0 -> biomolecule
    self._header     = None

    self._has_prot   = False
    self._has_nucl   = False

    self._COMPND     = None

    if self.pdb_file is not None:
        self._pdb_file = File(file_name=self._pdb_file, action='r')
        self._read_PDB_file(header=header,
                            onlyheader=onlyheader,
                            biomolecule=biomolecule)

    if dehydrate:
        self.dehydrate()
def get_PDBs(self, pdbIDset):
    if isinstance(pdbIDset, str):
        warnings.warn('For single PDB search the get_PDB function is recommended.')
        yield self.get_PDB(pdbIDset)
    else:
        pdbIDset = set([x.upper() for x in pdbIDset])
        if self.has_local:
            for pdb_file in self.localPDBs:
                newfile = File(file_name=pdb_file, action='r')
                if newfile.prefix.lstrip('pdb').upper() in pdbIDset:
                    yield pdb_file
        else:
            for pdbID in pdbIDset:
                yield self.get_PDB(pdbID)
def get_PDB(self, pdbID):
    if self.has_local:
        rootdir = os.path.join(self.local, pdbID.lower()[1:3])
        for pdb_file in Path.list_files(root=rootdir, pattern='*.ent.gz'):
            newfile = File(file_name=pdb_file, action='r')
            if newfile.prefix.lstrip('pdb').upper() == pdbID.upper():
                return pdb_file
    # If we do not find it locally (or we do not have a local copy),
    # we search for it on the FTP.
    pdb_file = 'pdb' + pdbID.lower() + '.ent.gz'
    source = 'ftp://' + PDBftp['address'] + os.path.join(PDBftp['structures'],
                                                         pdbID[1:3].lower(),
                                                         pdb_file)
    try:
        urllib.urlretrieve(source, pdb_file)
    except:
        return False
    return os.path.abspath(pdb_file)
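# Usage sketch (hypothetical PDB identifier; pdbdb stands for an instance of
# this PDB database link class, with or without a local mirror):
#
#   path = pdbdb.get_PDB('1A2B')      # local hit or freshly downloaded .ent.gz
#   if path:
#       structure = PDB(pdb_file=path, dehydrate=True)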
def print_compacted_blast(self, out_file=None):
    '''
    Print the compacted format of the blast hit.

    @param:    out_file
    @pdef:     file to print the blast data into.
    @pdefault: _None_
    @ptype:    {String}
    '''
    if out_file is not None:
        output = File(out_file, 'w')
        output.write("%s\n" % self.str_compacted_blast())
        output.close()
    else:
        print self.str_compacted_blast()
def get_resolutions(self):
    # Resolutions of -1 belong to methods that do not define resolution.
    resolutions = {}
    ftp = ftplib.FTP(PDBftp['address'])
    ftp.login()
    ftp.cwd(PDBftp['derived'])
    resoluIDX = []
    ftp.retrlines('RETR ' + PDBftp['resolution'], resoluIDX.append)
    ftp.quit()
    SBIglobals.alert('debug', self, 'Retrieving resolution data from PDB FTP...')
    active = False
    for line in resoluIDX:
        if line.startswith('-'):
            active = True
            continue
        if active and len(line.strip()) > 0:
            data = [x.strip() for x in line.split(';')]
            if len(data[1]) > 0:
                SBIglobals.alert('debug', self,
                                 '\tResolution for {0[0]} is {0[1]}...'.format(data))
                resolutions[data[0]] = data[1]
    # rsync is accumulative; we might have local structures that are no longer
    # in the resolution index, so we must check them too.
    for pdb_file in self.localPDBs:
        newfile = File(file_name=pdb_file, action='r')
        pdbid = newfile.prefix.lstrip('pdb').upper()
        if pdbid not in resolutions:
            pdbobj = PDB(pdb_file=pdb_file, header=True, onlyheader=True)
            SBIglobals.alert('debug', self,
                             '\tGrabbing Resolution for {0} is {1}...'.format(
                                 pdbid, pdbobj.header.resolution))
            resolutions[pdbid] = pdbobj.header.resolution
    return resolutions
def print_representation(self, line_split=160, out_file=None):
    '''
    Print the alignment representation of the blast hit.

    @param:    line_split
    @pdef:     number of characters per line
    @pdefault: 160
    @ptype:    {Integer}

    @param:    out_file
    @pdef:     file to print the blast data into.
    @pdefault: _None_
    @ptype:    {String}
    '''
    if out_file is not None:
        output = File(out_file, 'w')
        output.write("%s\n" % self.str_representation(line_split))
        output.close()
    else:
        print self.str_representation(line_split)
def _process(self):
    tmoFile = File(self._pdbtmfile, 'w', True)
    for xmlfile in Path.list_files(os.path.join(self._local, 'pdbtm/database/'), '*.xml'):
        xmldata = TM(pdb=os.path.splitext(os.path.split(xmlfile)[1])[0].upper())
        skip_chains = set()
        read = False
        fdxml = open(xmlfile)
        for line in fdxml:
            if line.startswith(' <TMRES>'):
                xmldata.tmres = line
            elif line.startswith(' <TMTYPE'):
                xmldata.tmtype = line
            elif line.startswith(' <PDBKWRES'):
                xmldata.kwres = line
            elif line.startswith(' <SIDEDEFINITION'):
                m = re.search('Side1="(\S+)"', line)
                xmldata.side = m.group(1)
            elif line.startswith(' <APPLY_TO_CHAIN'):
                m = re.search('NEW_CHAINID=\"(\S{1})\"', line)
                if m:
                    skip_chains.add(m.group(1))
            elif line.startswith(' <CHAIN '):
                m = re.search('CHAINID=\"(\S{1})\" NUM_TM=\"(\d{1})\" TYPE=\"(\S+)\"', line)
                if m:
                    chain, num, tmtype = m.group(1), m.group(2), m.group(3)
                    if chain not in skip_chains:
                        cdata = tuple([chain, num, tmtype])
                        xmldata.set_chain(cdata)
                        read = True
            elif line.startswith(' <REGION ') and read:
                m = re.search('pdb_beg=\"(\-*\d+\w*)\"[\s\S]+pdb_end=\"(\-*\d+\w*)\"\s+type=\"(\w{1})\"', line)
                ini, end, tmtype = m.group(1), m.group(2), m.group(3)
                xmldata.set_chain(cdata, tuple([ini, end, tmtype]))
            elif line.startswith(' </CHAIN>'):
                read = False
        fdxml.close()
        if len(xmldata.chains) > 0:
            tmoFile.write(str(xmldata) + "\n")
    tmoFile.close()
def reduce(self, new_fasta_file, list_file, force=None):
    '''
    Reduces the {Fasta} by removing identical sequences.

    @param:    new_fasta_file
    @pdef:     name of the new fasta file
    @ptype:    {String}

    @param:    list_file
    @pdef:     name of the repetition list file
    @ptype:    {String}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @return: {Fasta} and {File} with the list of identical sequences.
    '''
    seq_md5 = {}
    sequences = []
    for seq in self.live_show():
        md5 = seq.md5
        if md5 not in seq_md5:
            sequences.append(seq)
            seq_md5.setdefault(md5, [])
        else:
            SBIg.alert('debug', self,
                       '{0} repeats of {1}'.format(seq.id, seq_md5[md5][0]))
        seq_md5[md5].append(seq.id)
    fasta = Fasta.build_multifasta(new_fasta_file, sequences, force)
    listfile = File(list_file, 'w')
    for md5 in seq_md5:
        listfile.write('\t'.join(seq_md5[md5]) + '\n')
    listfile.close()
    return fasta, listfile
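# Usage sketch (hypothetical output names; fasta is an already created Fasta
# instance):
#
#   nr_fasta, repeats = fasta.reduce('nr.fa', 'nr.repeats.tsv', force=True)
#   # nr.fa keeps one sequence per md5; each line of nr.repeats.tsv lists the
#   # identifiers that share an identical sequence.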
def correct_hit_count(self, count_hit_file=None, count_query_file=None,
                      return_correction_dict=False):
    '''
    Corrects the starting point of the hits and the query, if needed.

    Why?
    When blasting vs. PDB (for example), sometimes the hit positions given by
    blast are wrong, as blast always considers the first position of the hit
    sequence as 1 and PDB does not. Even more, the position reference does not
    even need to be a number. As the specific location in the PDB is important,
    we need to adapt our blasts so that we can read that data.

    Keep in mind that hits and query must be corrected together in this step,
    as this function cannot be called twice for a same instance.

    @param:    count_hit_file
    @pdef:     file containing the index data for the query database.
               Each sequence in this file will have a format such as:
                 >3K2K_A    -7 ;-6 ;-5 ;-4 ;-3 ;-2 ;-1 ;0 ;1 ;2 ;3 ;4 ;5 ;6 ;7 ...
    @ptype:    {String}

    @param:    count_query_file
    @pdef:     sometimes we might also need to correct the query (if PDB vs.
               PDB). Same format as count_hit_file. They might be the same file.
    @ptype:    {String}

    @param:    return_correction_dict
    @pdef:     instead of actually executing the correction, it only returns
               the dictionary for further use.
    @pdefault: _False_
    @ptype:    {Boolean}

    @raises: {IOError} if the correction index file does not exist.
    @raises: {AttributeError} if the BlastResult does not contain any BlastHit.
    @raises: {BlastError} if it has been called before for this instance.
    '''
    if not self.has_hits:
        SBIg.warn(self, "BlastResult of {0} has no hits to correct".format(self.query))
        return

    if self.are_hits_corrected:
        be = BlastExe.BlastError()
        raise be.corrected_hits()

    SBIg.alert('debug', self, 'Correcting indexes for {0}'.format(self.query))

    cfile = File(count_hit_file)
    cq = False
    codes_of_interest = set([hit.sequenceID for hit in self.raw_hits])
    if count_query_file == count_hit_file:
        codes_of_interest.add(self.query)
        count_query_file = None
        cq = True
    start_index_dic = {}
    for line in cfile.read():
        if len(line.strip()) > 0:
            k = line.split('\t')
            if k[0].lstrip('>') in codes_of_interest:
                start_index_dic[k[0].lstrip('>')] = k[1].strip().split(';')
    cfile.close()

    if count_query_file is not None:
        cfile = File(count_query_file)
        for line in cfile.read():
            if len(line.strip()) > 0:
                k = line.split('\t')
                if k[0].lstrip('>') == self.query:
                    start_index_dic[k[0].lstrip('>')] = k[1].strip().split(';')
        cfile.close()
        cq = True

    if cq:
        SBIg.alert('debug', self, '\tFixing Query {0}'.format(self.query))
        self._query_index = start_index_dic[self.query]

    if return_correction_dict:
        return start_index_dic

    for hit in self._hits:
        # This tests between the options PDB/PDB_ID or PDB_ID in case
        # the TAB file has different codification.
        h = hit.sequenceID
        hit_ID = h if h in start_index_dic else h.split("/")[-1]
        SBIg.alert('debug', self, '\tFixing {0}'.format(hit_ID))
        hit.correct_hit_count(new_index=start_index_dic[hit_ID])
        if cq:
            SBIg.alert('debug', self, '\tFixing Query {0}'.format(self.query))
            hit.correct_query_count(new_index=start_index_dic[self.query])

    self._correctedHits = True
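# Usage sketch (hypothetical index file name; br is a BlastResult with hits,
# and the index file follows the ">ID<TAB>pos;pos;..." format described above):
#
#   br.correct_hit_count(count_hit_file='PDBseq.fa.idx')
#   # query and hits indexed by the same database:
#   br.correct_hit_count(count_hit_file='PDBseq.fa.idx',
#                        count_query_file='PDBseq.fa.idx')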
def relations(self):
    relFile = File(self._rel, 'r')
    for rel_line in relFile.descriptor:
        if not rel_line.startswith('#'):
            yield rel_line
def descriptions(self):
    dscFile = File(self._desc, 'r')
    for dsc_line in dscFile.descriptor:
        if not dsc_line.startswith('#'):
            yield dsc_line