def build_multifasta(file_name, sequence_list, force=None):
    '''
    Dump several sequences into one FASTA file and wrap that file in a {Fasta}.

    @param:    file_name
    @pdef:     name of the fasta file (with path, if necessary)
    @ptype:    {String}

    @param:    sequence_list
    @pdef:     sequences to write; each one is rendered through
               its format('FASTA') method.
    @ptype:    {List} or {Set} of {Sequence}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @return: {Fasta}
    '''
    out = File(file_name, 'w', overwrite=force)
    for entry in sequence_list:
        # One record per sequence, each terminated by a newline.
        record = entry.format('FASTA')
        out.write(record + '\n')
    out.close()

    # The Fasta object is built on the 'full' attribute of the written File.
    return Fasta(fasta_file=out.full, auto_load=0)
def sortarchs(inputdir, outputdir):
    '''
    Collect the *.archObj files of inputdir, group them by arch type and
    length, and write their archtype_format() into database files in outputdir.

    File names are split on '_': the first field is read as the integer
    length, the second one as the arch type.

    For every arch type one 'ArchDB.<type>.db' file receives all archs
    (ordered by length) and one 'ArchDB.<type>.<ini>-<end>.db' file per length
    section receives the archs whose length falls in that section.

    @param:    inputdir
    @pdef:     directory searched for *.archObj files
    @ptype:    {String}

    @param:    outputdir
    @pdef:     directory where the database files are written
    @ptype:    {String}
    '''
    archsdir = outputdir
    Path.mkdir(archsdir)

    whole_db_name = os.path.join(archsdir, 'ArchDB.{0}.db')
    split_db_name = os.path.join(archsdir, 'ArchDB.{0}.{1:02d}-{2:02d}.db')
    # Length sections are inclusive on both ends; an end of 0 means "no upper
    # limit".
    sections_ini = [0, 4, 7, 14, 21]
    sections_end = [4, 6, 13, 20, 0]

    # archtype -> length -> [arch files]
    by_type = {}
    for archfile in Path.list_files(root=inputdir, pattern='*.archObj'):
        fields = os.path.basename(archfile).split('_')
        length = int(fields[0])
        archtype = fields[1]
        by_type.setdefault(archtype, {}).setdefault(length, []).append(archfile)

    for archtype in by_type:
        SBIglobals.alert('verbose', None, "ARCHS: " + archtype + "\n")
        whole_db = File(whole_db_name.format(archtype), 'w')
        split_dbs = []
        for first, last in zip(sections_ini, sections_end):
            split_dbs.append(File(split_db_name.format(archtype, first, last), 'w'))

        for length in sorted(by_type[archtype]):
            SBIglobals.alert('verbose', None, '\t{0}'.format(length))
            for archfile in by_type[archtype][length]:
                SBIglobals.alert('verbose', None, '\t\t{0}\n'.format(archfile))
                nsp = Arch.load(archfile)
                whole_db.descriptor.write(nsp.archtype_format() + "\n")
                for split_db, first, last in zip(split_dbs, sections_ini, sections_end):
                    if length < first:
                        continue
                    if last != 0 and length > last:
                        continue
                    split_db.descriptor.write(nsp.archtype_format() + "\n")

        whole_db.close()
        for split_db in split_dbs:
            split_db.close()
def build(file_name, sequence_id, sequence, force=None):
    '''
    Write a single sequence into a FASTA file and wrap that file in a {Fasta}.

    @param:    file_name
    @pdef:     name of the fasta file (with path, if necessary)
    @ptype:    {String}

    @param:    sequence_id
    @pdef:     name of the sequence
    @ptype:    {String}

    @param:    sequence
    @pdef:     sequence
    @ptype:    {String} or {List}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @return: {Fasta}
    '''
    handle = File(file_name, 'w', overwrite=force)
    record = Sequence(sequence_id=sequence_id, sequence=sequence)
    # The record is written exactly as format('FASTA') returns it; no extra
    # newline is appended here.
    handle.write(record.format('FASTA'))
    handle.close()

    fasta = Fasta(fasta_file=handle.full, auto_load=0)
    return fasta
def _parse(self):
    '''
    Read the DSSP file in self._dsspfile into self._dsspdata.

    Lines are ignored until the residue table header is found. Afterwards
    every line produces one {DSSP} entry, except the lines with '!' at
    column 13, which are reported as a truncated chain. self._gapped is set
    when such a marker is found or when two consecutive residue numbers are
    not correlative.
    '''
    dssp_fd = File(self._dsspfile)
    in_table = False
    previous = -1000
    parsed = 0
    for line in dssp_fd.read():
        # NOTE(review): header marker reproduced as found in this file;
        # confirm its spacing against real DSSP output.
        if line.startswith(" # RESIDUE AA STRUCTURE BP1 BP2 ACC"):
            in_table = True
            continue
        if not in_table:
            continue

        if line[13:14] == '!':
            msg = "truncated chain!{0}\n".format(self._dsspfile)
            sys.stderr.write(msg)
            SBIg.warn(self, msg)
            self._gapped = True
            continue

        res_num = int(line[6:10].strip())
        ss = '-' if line[16:17] == ' ' else line[16:17]
        buried = int(line[35:38].strip())
        aa = line[13:15].strip()

        self._dsspdata.append(DSSP(secondary_structure = ss,
                                   accessibility       = buried,
                                   amino               = aa))
        self._dsspdata[-1].add_hydrogen_links(line[39:50], line[50:61],
                                              line[61:72], line[72:84])

        # The first parsed residue has nothing to be compared with.
        if parsed > 0 and res_num != previous + 1:
            self._gapped = True
        previous = res_num
        parsed += 1
    dssp_fd.close()
def _save_release(self):
    '''
    Dump self._RELEASE as JSON into the control file of the local database
    (self._CONTROL_FILE inside self.local).
    '''
    control_path = os.path.join(self.local, self._CONTROL_FILE)
    handle = File(control_path, 'w', True)
    serialized = json.dumps(self._RELEASE)
    handle.write(serialized)
    handle.close()
def _process(self):
    '''
    Merge the enzymes parsed by _parse_enzclass() and _parse_enzymedat(),
    sort them and write one repr() per line into self._enzfile.
    '''
    class_entries = self._parse_enzclass()
    dat_entries = self._parse_enzymedat()
    enzymes = class_entries + dat_entries
    enzymes.sort()

    out = File(self._enzfile, 'w', True)
    for enzyme in enzymes:
        record = repr(enzyme) + "\n"
        out.write(record)
    out.close()
def _process(self):
    '''
    Build the drugs from the processed targets and write one repr() per line
    into self._drugfile.
    '''
    drugs = self._process_drugs(self._process_targets())

    out = File(self._drugfile, 'w', True)
    for drug in drugs:
        record = repr(drug) + "\n"
        out.write(record)
    out.close()
def format2file(self, filename, extension = 'pdb', center = False):
    '''
    Write the structure into '<filename>.<extension>'.

    @param:    filename
    @pdef:     name of the output file, without the extension
    @ptype:    {String}

    @param:    extension
    @pdef:     output format; 'pdb' writes pdb_format(), 'js' writes js_format()
    @pdefault: 'pdb'
    @ptype:    {String}

    @param:    center
    @pdef:     forwarded as-is to the selected format function
    @pdefault: _False_
    @ptype:    {Boolean}

    @raises: {AttributeError} if extension is neither 'pdb' nor 'js'.
    '''
    if extension != 'pdb' and extension != 'js':
        raise AttributeError('Not accepted extension')

    target = File(filename + '.' + extension, 'w')
    if extension == 'pdb':
        content = self.pdb_format(center = center)
    else:
        content = self.js_format(center = center)
    target.write(content)
    target.close()
def subset(self, sequence_ids, new_fasta_file, all_but=False, prefix_size=None, index=False, force=None):
    '''
    Write the requested sequences into a new FASTA file and return its {Fasta}.

    @param:    sequence_ids
    @pdef:     sequence identifier(s)
    @ptype:    {String}, {List} or {Set}

    @param:    new_fasta_file
    @pdef:     name of the new fasta file
    @ptype:    {String}

    @param:    all_but
    @pdef:     Flag. Instead of retrieving the given ids, retrieve every
               sequence except the given ids.
    @pdefault: _False_
    @ptype:    {Boolean}

    @param:    prefix_size
    @pdef:     maximum characters for the prefix. If _None_, all the
               characters are included.
    @pdefault: _None_
    @ptype:    {Integer}

    @param:    index
    @pdef:     also create the index file of the subset; only done when this
               {Fasta} has an index itself.
    @pdefault: _False_
    @ptype:    {Boolean}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @raises: {AttributeError} if sequence_ids is not a valid type.
    @return: {Fasta}
    '''
    selected = self.retrieve(sequence_ids, all_but, prefix_size)
    fasta_file = Fasta.build_multifasta(new_fasta_file, selected, force)

    if not (self.has_index and index):
        return fasta_file

    source_idx = File(self.index_file)
    target_idx = File(fasta_file.file.full + '.idx', 'w')
    wanted = set(fasta_file.sequence_identifiers)
    for entry in source_idx.read():
        # The first token of an index line is the identifier, optionally
        # wrapped in '>'.
        identifier = entry.split()[0].strip('>')
        if identifier in wanted:
            target_idx.write(entry)
    source_idx.close()
    target_idx.close()

    fasta_file.index_file = target_idx.full
    return fasta_file
def read_compacted_blast(compacted_blast_file):
    '''
    Rebuild a {BlastResult} from a file written in compacted blast format.
    Not all options will be available in the new object.

    Lines starting with '#' are header lines; from the recognised ones only
    the last whitespace-separated token is kept. Every other line is read as
    one hit.

    @param:    compacted_blast_file
    @pdef:     file of the compacted blast print
    @ptype:    {String}

    @return: {BlastResult}, or _None_ if the file holds no hit line.
    '''
    from BlastHit import BlastHit

    # Header prefix -> name of the value it provides.
    prefixes = (('#Query:',                 'query_name'),
                ('#Query Sequence:',        'query_sequence'),
                ('#Blast Version:',         'version'),
                ('#Search on matrix:',      'matrix'),
                ('#Gap open penalty:',      'gap_open'),
                ('#Gap extension penalty:', 'gap_extend'),
                ('#Database searched:',     'database'),
                ('#Self Hit is omitted:',   'self_hit'))
    header = dict.fromkeys([name for _, name in prefixes])

    br = None
    cbf = File(compacted_blast_file)
    for line in cbf.read():
        if line.startswith('#'):
            for prefix, name in prefixes:
                if line.startswith(prefix):
                    header[name] = line.strip().split()[-1]
            continue

        if br is None:
            # The result object is created with the first hit line, once the
            # header is expected to be complete.
            if header['version'] is None:
                bh = None
            else:
                bh = BlastHeader(header['version'], header['matrix'],
                                 header['gap_open'], header['gap_extend'],
                                 header['database'], header['self_hit'])
            br = BlastResult(header['query_name'], header['query_sequence'], bh)

        d = line.strip().split()
        hit = BlastHit([d[2], d[3]],
                       [d[8], d[9]],
                       [int(x) for x in d[10].split(',')[0].split(':')],
                       1,
                       [d[4], d[5], d[6], d[7]])
        br.add_hit(hit)
    cbf.close()
    return br
def make_PDBseq(self, log_file, resolution_threshold=None):
    '''
    Create the PDBseq FASTA file and its index from the local PDB files.

    'PDBseq.fa' and 'PDBseq.fa.idx' are written into self.PDBseq (or into the
    current directory when self.PDBseq is _None_). Sequences that are empty
    after removing 'X'/'x' are skipped.

    @param:    log_file
    @pdef:     name of the file where the progress is logged
    @ptype:    {String}

    @param:    resolution_threshold
    @pdef:     if given, a second FASTA file is written with the sequences of
               the PDBs listed by get_resolutions() that are not all-CA.
    @pdefault: _None_

    @raises: {NameError} if no local PDB database is defined.
    '''
    if not self.has_local:
        raise NameError(
            'A local PDB database must be defined to do create a PDBseq database.'
        )

    outdir = self.PDBseq if self.PDBseq is not None else os.curdir
    # Create the directory that is actually written to (self.PDBseq can be
    # None, in which case outdir falls back to the current directory).
    Path.mkdir(outdir)

    fasta_file = File(file_name=os.path.join(outdir, 'PDBseq.fa'), action='w', overwrite=True)
    idx_file = File(file_name=os.path.join(outdir, 'PDBseq.fa.idx'), action='w', overwrite=True)

    # The filtered output was previously commented out while the loop below
    # still used it, which raised NameError for any threshold. It is restored
    # here as it was written in that commented block.
    # NOTE(review): confirm get_PDBseq_filtered()/get_resolutions() are still
    # available on this class.
    filtered_file = None
    resolutions = ()
    if resolution_threshold is not None:
        filtered_file_name = self.get_PDBseq_filtered(resolution_threshold)
        filtered_file = File(file_name=filtered_file_name, action='w', overwrite=True)
        resolutions = self.get_resolutions(resolution_threshold=resolution_threshold)

    log = File(file_name=log_file, action='w', overwrite=True)

    try:
        fasta_fd = fasta_file.descriptor
        idx_fd = idx_file.descriptor
        log_idx = log.descriptor
        filtered_fd = filtered_file.descriptor if filtered_file is not None else None

        for pdb_file in self.localPDBs:
            log_idx.write("Reading File: {0}\n".format(pdb_file))
            newPDB = PDB(pdb_file=pdb_file, dehydrate=True)
            fasta_idx = newPDB.FASTA_IDX(nucleotide=False)
            if len(fasta_idx['FASTA']) != len(fasta_idx['IDX']):
                log_idx.write(
                    'ERROR!!!!! Number of fastas and indexes are different for pdb {0}!!\n'
                    .format(newPDB.id))
            if len(fasta_idx['FASTA']) > 0:
                log_idx.write('\tPrinting FASTA and IDX...\n')
            else:
                log_idx.write('\tProblably just a nucleotide PDB...\n')
            for c in range(len(fasta_idx['FASTA'])):
                # Second line of the FASTA record is the sequence itself.
                sequence = fasta_idx['FASTA'][c].split('\n')[1]
                sequence = sequence.replace('X', '').replace('x', '')
                if len(sequence) > 0:
                    fasta_fd.write(fasta_idx['FASTA'][c] + "\n")
                    if filtered_fd is not None and newPDB.id in resolutions and not newPDB.is_all_ca:
                        filtered_fd.write(fasta_idx['FASTA'][c] + "\n")
                    idx_fd.write(fasta_idx['IDX'][c] + "\n")
            del (newPDB)
    finally:
        # Every output is closed, also when a PDB fails to parse.
        fasta_file.close()
        idx_file.close()
        log.close()
        if filtered_file is not None:
            filtered_file.close()
def release(self):
    '''
    Retrieves release data for the database. It does not describe the
    release of the database itself, but the moment it was downloaded.

    The data is read from the control file (self._CONTROL_FILE inside
    self.local) when that file exists; otherwise self._RELEASE is returned.

    @returns: {Dictionary}
    '''
    control_path = os.path.join(self.local, self._CONTROL_FILE)
    if not os.path.isfile(control_path):
        return self._RELEASE

    handle = File(control_path)
    # NOTE(review): assumes File.read() returns something json.loads
    # accepts -- confirm.
    data = json.loads(handle.read())
    handle.close()
    return data
def print_compacted_blast(self, out_file=None):
    '''
    Print the compacted format of the blast hit.

    @param:    out_file
    @pdef:     file to print the blast data into. If _None_, the data is
               printed to the standard output.
    @pdefault: _None_
    @ptype:    {String}
    '''
    if out_file is not None:
        output = File(out_file, 'w')
        output.write("%s\n" % self.str_compacted_blast())
        output.close()
    else:
        # Parenthesized single-argument form: same output under the Python 2
        # print statement, and valid as the Python 3 print function.
        print(self.str_compacted_blast())
def items(self):
    '''
    Loops through the items of the database, reading every file listed in
    self._ITEM_FILES line by line.

    @yields: whatever self._DBOBJECT.grab() builds from each stripped line.
    '''
    if not self.has_local:
        SBIg.throw(self, 'A local database needs to be build first', IOError)

    for name in self._ITEM_FILES:
        handle = File(os.path.join(self.local, name))
        for line in handle.read():
            yield self._DBOBJECT.grab(line.strip())
        handle.close()
def _process(self):
    '''
    Parse the GO term file (self._gfile inside self.local) into {GOterm}
    objects, complete their parents through _search_parents() and write one
    term per line into self._gofile.

    Parsing stops at the first '[Typedef]' stanza.
    '''
    go_dic = {}
    source = File(os.path.join(self.local, self._gfile), 'r')
    go = None
    for line in source.descriptor:
        # Single quotes are backslash-escaped before the line is read.
        line = re.sub('\'', '\\\'', line)
        if line.startswith('[Term]'):
            # A new stanza starts: store the term read so far.
            if go is not None:
                go_dic[go.id] = go
        elif line.startswith('id:'):
            go = GOterm(id = line.split()[1].strip())
        elif line.startswith('name:'):
            go.name = " ".join(line.split()[1:]).strip()
        elif line.startswith('namespace:'):
            go.namespace = line.split()[1].strip()
        elif line.startswith('alt_id:'):
            go.alt_id.append(line.split()[1].strip())
        elif line.startswith('is_obsolete:'):
            go.obsolete = True
        elif line.startswith('is_a:'):
            go.parents.add(line.split()[1].strip())
        elif line.startswith('relationship:'):
            fields = line.split()
            go.relations.append((fields[1].strip(), fields[2].strip()))
        elif line.startswith('[Typedef]'):
            # NOTE(review): the last term is only stored here; a file without
            # a [Typedef] stanza would lose it -- confirm this is expected.
            go_dic[go.id] = go
            break
    source.close()

    for go_id in go_dic:
        go_dic[go_id].parents = self._search_parents(go_dic, go_id)

    out = File(self._gofile, 'w', True)
    for go_id in go_dic:
        term = go_dic[go_id]
        # Every term is listed among its own parents in the output.
        term.parents.add(go_id)
        out.write(str(term) + "\n")
    out.close()
def print_representation(self, line_split=160, out_file=None):
    '''
    Print the alignment representation of the blast hit.

    @param:    line_split
    @pdef:     number of characters per line
    @pdefault: 160
    @ptype:    {Integer}

    @param:    out_file
    @pdef:     file to print the blast data into. If _None_, the data is
               printed to the standard output.
    @pdefault: _None_
    @ptype:    {String}
    '''
    if out_file is not None:
        output = File(out_file, 'w')
        output.write("%s\n" % self.str_representation(line_split))
        output.close()
    else:
        # Parenthesized single-argument form: same output under the Python 2
        # print statement, and valid as the Python 3 print function.
        print(self.str_representation(line_split))
def _process(self):
    '''
    Read every PDBTM xml file found in 'pdbtm/database/' (inside self._local)
    and write one {TM} per line into self._pdbtmfile. Only the entries that
    end up with at least one chain are written.
    '''
    tmoFile = File(self._pdbtmfile, 'w', True)
    try:
        for xmlfile in Path.list_files(
                os.path.join(self._local, 'pdbtm/database/'), '*.xml'):
            # The PDB identifier is the upper-cased file name without extension.
            xmldata = TM(
                pdb=os.path.splitext(os.path.split(xmlfile)[1])[0].upper())
            skip_chains = set()
            read = False
            # NOTE(review): the line prefixes below are reproduced as found in
            # this file; confirm their leading spaces against real PDBTM xml.
            with open(xmlfile) as fdxml:
                for line in fdxml:
                    if line.startswith(' <TMRES>'):
                        xmldata.tmres = line
                    elif line.startswith(' <TMTYPE'):
                        xmldata.tmtype = line
                    elif line.startswith(' <PDBKWRES'):
                        xmldata.kwres = line
                    elif line.startswith(' <SIDEDEFINITION'):
                        m = re.search('Side1="(\S+)"', line)
                        xmldata.side = m.group(1)
                    elif line.startswith(' <APPLY_TO_CHAIN'):
                        # Chains declared as NEW_CHAINID are not stored.
                        m = re.search('NEW_CHAINID=\"(\S{1})\"', line)
                        if m:
                            skip_chains.add(m.group(1))
                    elif line.startswith(' <CHAIN '):
                        # NUM_TM accepts several digits; a single-digit
                        # pattern silently dropped chains with 10 or more
                        # TM segments.
                        m = re.search(
                            'CHAINID=\"(\S{1})\" NUM_TM=\"(\d+)\" TYPE=\"(\S+)\"',
                            line)
                        if m:
                            chain, num, tmtype = m.group(1), m.group(2), m.group(3)
                            if not chain in skip_chains:
                                cdata = tuple([chain, num, tmtype])
                                xmldata.set_chain(cdata)
                                read = True
                    elif line.startswith(' <REGION ') and read:
                        # Regions are attached to the chain stored last.
                        m = re.search(
                            'pdb_beg=\"(\-*\d+\w*)\"[\s\S]+pdb_end=\"(\-*\d+\w*)\"\s+type=\"(\w{1})\"',
                            line)
                        ini, end, tmtype = m.group(1), m.group(2), m.group(3)
                        xmldata.set_chain(cdata, tuple([ini, end, tmtype]))
                    elif line.startswith(' </CHAIN>'):
                        read = False
            if len(xmldata.chains) > 0:
                tmoFile.write(str(xmldata) + "\n")
    finally:
        # The output is closed even if one of the xml files cannot be parsed.
        tmoFile.close()
def reduce(self, new_fasta_file, list_file, force=None):
    '''
    Reduces the {Fasta} by removing identical sequences (same md5). The first
    sequence found for each md5 is the one kept.

    @param:    new_fasta_file
    @pdef:     name of the new fasta file
    @ptype:    {String}

    @param:    list_file
    @pdef:     name of the repetition list file; each line holds the
               tab-separated identifiers that share one sequence.
    @ptype:    {String}

    @param:    force
    @pdef:     overwrite previous files with the same name
    @pdefault: _SBIglobals.overwrite_
    @ptype:    {Boolean}

    @return: {Fasta} and {File} with the list of identical sequences.
    '''
    ids_by_md5 = {}
    unique = []
    for seq in self.live_show():
        md5 = seq.md5
        if md5 in ids_by_md5:
            SBIg.alert(
                'debug', self,
                '{0} repeats of {1}'.format(seq.id, ids_by_md5[md5][0]))
        else:
            unique.append(seq)
            ids_by_md5[md5] = []
        ids_by_md5[md5].append(seq.id)

    fasta = Fasta.build_multifasta(new_fasta_file, unique, force)

    listfile = File(list_file, 'w')
    for md5 in ids_by_md5:
        listfile.write('\t'.join(ids_by_md5[md5]) + '\n')
    listfile.close()

    return fasta, listfile
def _process(self):
    '''
    Build the {TaxID} table from the nodes, names, deleted and merged files
    ('|'-separated) and write one {TaxID} per line into self._taxid.

    Entries from the deleted and merged files replace any entry previously
    created with the same identifier and are flagged as old.
    '''
    inh = {}

    # Nodes: identifier | parent | rank
    nodefile = File(file_name = self._nodes, action = 'r')
    for line in nodefile.descriptor:
        fields = re.sub('\'', '\\\'', line).split('|')
        taxid = fields[0].strip()
        inh[taxid] = TaxID(taxid)
        inh[taxid].parent = fields[1].strip()
        inh[taxid].rank = fields[2].strip()
    nodefile.close()

    # Names: only the 'scientific name' of each identifier is kept.
    namefile = File(file_name = self._names, action = 'r')
    for line in namefile.descriptor:
        fields = re.sub('\'', '\\\'', line).split('|')
        if fields[3].strip() != 'scientific name':
            continue
        inh[fields[0].strip()].name = fields[1].strip()
    namefile.close()

    # Deleted identifiers.
    delefile = File(file_name = self._delet, action = 'r')
    for line in delefile.descriptor:
        taxid = line.split('|')[0].strip()
        inh[taxid] = TaxID(taxid)
        inh[taxid].old = True
    delefile.close()

    # Merged identifiers: old identifier | new identifier
    mrgefile = File(file_name = self._merged, action = 'r')
    for line in mrgefile.descriptor:
        fields = line.split('|')
        taxid = fields[0].strip()
        inh[taxid] = TaxID(taxid)
        inh[taxid].old = True
        inh[taxid].new = fields[1].strip()
    mrgefile.close()

    taxFile = File(self._taxid, 'w', True)
    for taxid in inh:
        taxFile.write(str(inh[taxid]) + "\n")
    taxFile.close()
def _process(self, update=False):
    '''
    Transform the source files into the final local db files.

    Each source found in self.local is parsed into two files taken, in pairs,
    from self._MANDATORY_FILES: the db file (one JSON per line) and the fasta
    file. The pair index only advances when a source has been processed.

    self._RELEASE is filled on the way: in create mode every protein goes to
    'total_items'; in update mode 'new_items', 'update_items' and
    'deleted_items' are filled and 'total_items' is brought up to date.

    @param:    update
    @pdef:     toggles between create and update processing
    @pdefault: _False_
    @ptype:    {Boolean}
    '''
    if update:
        # Whatever is left here after parsing is no longer in the sources.
        old = self._RELEASE['total_items'].copy()

    j = 0
    for source in self._SOURCES:
        dfilen = os.path.join(self.local, source)
        ofilen = os.path.join(self.local, self._MANDATORY_FILES[j])
        ffilen = os.path.join(self.local, self._MANDATORY_FILES[j + 1])
        if not os.path.isfile(dfilen):
            continue

        SBIg.alert('verbose', self, 'Parsing: {0}'.format(dfilen))
        SBIg.alert('verbose', self, 'DB file to: {0}'.format(ofilen))
        SBIg.alert('verbose', self, 'Fasta file to: {0}'.format(ffilen))
        dfile = File(dfilen)
        ofile = File(ofilen, 'w', update)
        ffile = File(ffilen, 'w', update)

        for protein in Connect._parse_uniprot(dfile):
            pname = protein.entry_name
            pvers = protein.version
            SBIg.alert('verbose', self, 'Protein: {0}'.format(pname))
            if not update:
                self._RELEASE['total_items'][pname] = pvers
            elif pname not in self._RELEASE['total_items']:
                self._RELEASE['new_items'][pname] = pvers
            else:
                del old[pname]
                if self._RELEASE['total_items'][pname] != pvers:
                    self._RELEASE['update_items'][pname] = pvers
            ffile.write(protein.sequence.format('FASTA') + '\n')
            ofile.write(protein.json() + '\n')

        j += 2
        dfile.close()
        ofile.close()
        ffile.close()

    if update:
        self._RELEASE['total_items'].update(self._RELEASE['new_items'])
        self._RELEASE['total_items'].update(self._RELEASE['update_items'])
        self._RELEASE['deleted_items'] = old
        for gone in self._RELEASE['deleted_items']:
            del self._RELEASE['total_items'][gone]
def get_FASTA_IDX_by_names_to_file(self, names, outfile):
    '''
    Write the PDBseq sequences called as given in names into outfile, and
    their index lines into '<outfile>.idx'.

    @param:    names
    @pdef:     identifiers of the requested sequences. A deep copy is handed
               to the {Fasta} retrieval, the original is used to filter the
               index.

    @param:    outfile
    @pdef:     name of the output fasta file
    @ptype:    {String}
    '''
    source_fasta = Fasta(self.PDBseq)
    selected = source_fasta.retrieve(copy.deepcopy(names))

    fasta_out = File(outfile, 'w')
    for sequence in selected:
        fasta_out.write(sequence.format('FASTA') + "\n")
    fasta_out.close()

    idx_out = File(outfile + '.idx', 'w')
    idx_in = File(self.PDBseq + '.idx', 'r')
    for line in idx_in.descriptor:
        # First token of the line without its leading character.
        pdbname = line.split()[0][1:]
        if pdbname in names:
            idx_out.write(line)
    idx_in.close()
    idx_out.close()
def localTaxIDs(self):
    '''
    Loops through the lines of the local taxid file (self._taxid).

    @yields: {String}, each line as read from the file.
    '''
    source = File(self._taxid, 'r')
    lines = source.descriptor
    for entry in lines:
        yield entry
    # Only reached once the generator has been consumed completely.
    source.close()
def correct_hit_count(self, count_hit_file=None, count_query_file=None, return_correction_dict=False):
    '''
    Corrects the starting point of the hits and the query, if needed.

    Why? When blasting vs. PDB (for example), sometimes the hit positions
    given by blast are wrong, as blast always considers the first position of
    the hit sequence as 1 and PDB does not. Even more, the position reference
    does not even need to be a number. As the specific location in the PDB is
    important, the blasts need to be adapted so that data can be read.

    Keep in mind that hits and query must be corrected together in this step,
    as this function cannot be called twice for a same instance.

    @param:    count_hit_file
    @pdef:     file containing the index data for the query database.
               Each line holds the sequence identifier (optionally preceded
               by '>'), a tab, and the ';'-separated positions:
               >3K2K_A<TAB>-7 ;-6 ;-5 ;-4 ;-3 ;-2 ;-1 ;0 ;1 ;2 ;3 ...
    @ptype:    {String}

    @param:    count_query_file
    @pdef:     sometimes the query also needs to be corrected (if PDB vs.
               PDB). Same format as count_hit_file. They might be the same
               file.
    @ptype:    {String}

    @param:    return_correction_dict
    @pdef:     instead of actually executing the correction, it only returns
               the dictionary for further use.
    @pdefault: _False_
    @ptype:    {Boolean}

    @raises: {BlastError} if it has been called before for this instance.
    @return: _None_; the correction {Dictionary} when return_correction_dict
             is set. If there are no hits, a warning is issued and nothing
             else is done.
    '''
    if not self.has_hits:
        SBIg.warn(
            self,
            "BlastResult of {0} has no hits to correct".format(self.query))
        return
    if self.are_hits_corrected:
        be = BlastExe.BlastError()
        raise be.corrected_hits()

    SBIg.alert('debug', self,
               'Correcting indexes for {0}'.format(self.query))
    cfile = File(count_hit_file)
    cq = False  # whether the query has to be corrected too

    codes_of_interest = set([hit.sequenceID for hit in self.raw_hits])
    if count_query_file == count_hit_file:
        # Same file for both: pick the query index in the same pass.
        codes_of_interest.add(self.query)
        count_query_file = None
        cq = True

    start_index_dic = {}
    for line in cfile.read():
        if len(line.strip()) > 0:
            k = line.split('\t')
            if k[0].lstrip('>') in codes_of_interest:
                start_index_dic[k[0].lstrip('>')] = k[1].strip().split(';')
    cfile.close()

    if count_query_file is not None:
        cfile = File(count_query_file)
        for line in cfile.read():
            if len(line.strip()) > 0:
                k = line.split('\t')
                if k[0].lstrip('>') == self.query:
                    start_index_dic[k[0].lstrip('>')] = k[1].strip().split(';')
        # Close the File itself, as done for the hit file above; calling
        # close() on the value returned by read() left this file open.
        cfile.close()
        cq = True

    if cq:
        SBIg.alert('debug', self, '\tFixing Query {0}'.format(self.query))
        self._query_index = start_index_dic[self.query]

    if return_correction_dict:
        return start_index_dic

    for hit in self._hits:
        # This tests between the options PDB/PDB_ID or PDB_ID in case
        # the TAB file has different codification
        h = hit.sequenceID
        hit_ID = h if h in start_index_dic else h.split("/")[-1]
        SBIg.alert('debug', self, '\tFixing {0}'.format(hit_ID))
        hit.correct_hit_count(new_index=start_index_dic[hit_ID])
        if cq:
            SBIg.alert('debug', self,
                       '\tFixing Query {0}'.format(self.query))
            hit.correct_query_count(new_index=start_index_dic[self.query])

    self._correctedHits = True
class PDBeChem(object):
    """
    Chemical component read from the '_chem_comp.' fields of a CIF file.

    The file is parsed when the object is created and closed right after.
    """
    def __init__(self, cif_file):
        """
        @param:    cif_file
        @pdef:     name of the CIF file to read
        @ptype:    {String}
        """
        self._file = File(file_name = cif_file, action = 'r')
        self.__name__ = 'databases.PDBeChem'  # This must be included in every class for the SBIglobals.alert()
        self._id = None
        self._name = None
        self._type = None
        self._formula = None
        self._parent = None
        self._weight = None
        self._fcharge = None
        self._code1l = None
        self._flformula = {}
        self._parse()
        self._decompose_formula()

    # ATTRIBUTES
    @property
    def id(self):
        return self._id

    @property
    def name(self):
        return self._name

    @property
    def type(self):
        return self._type

    @property
    def formula(self):
        return self._formula

    @property
    def full_formula(self):
        return self._flformula

    @property
    def parent(self):
        return self._parent

    @property
    def weight(self):
        return self._weight

    @property
    def formal_charge(self):
        return self._fcharge

    @property
    def code1(self):
        return self._code1l

    @property
    def code3(self):
        # The three letter code is the identifier itself.
        return self._id

    # PRIVATE METHODS
    def _parse(self):
        """
        Fill the attributes from the '_chem_comp.' lines of the file.

        The value is whatever follows column 35 of the line once the
        '_chem_comp.' prefix is removed; a value of '?' is stored as _None_.
        A name left empty on its own line is completed from the following
        lines that start with ';'.
        """
        for line in self._file.descriptor:
            if line.startswith('_chem_comp.'):
                line = line.replace('_chem_comp.', '')
                value = line[35:].strip().strip('"')
                value = value.replace(' (NON-PREFERRED NAME)', '')
                value = value if value != '?' else None
                if line.startswith('id'):
                    self._id = value
                if line.startswith('pdbx_type'):
                    self._type = value
                # Trailing space keeps 'formula_weight' out of this branch.
                if line.startswith('formula '):
                    self._formula = value
                if line.startswith('formula_weight'):
                    self._weight = value
                if line.startswith('pdbx_formal_charge'):
                    self._fcharge = value
                if line.startswith('one_letter_code'):
                    self._code1l = value
                if line.startswith('name'):
                    # An unknown name ('?') is kept as None instead of
                    # failing on None.upper().
                    self._name = value.upper() if value is not None else None
                if line.startswith('mon_nstd_parent_comp_id'):
                    self._parent = set([x.strip() for x in value.split(',')]) if value is not None else None
            if line.startswith(';') and self._name == '':
                self._name += line.strip().lstrip(';').upper()
        self._file.close()

    def _decompose_formula(self):
        """
        Split the formula into self._flformula: element -> count.

        Only the symbols found in element_dic are stored. The count is kept
        as the {String} read from the formula, or the {Integer} 1 when the
        formula gives no number for the element.
        """
        if self.formula is not None:
            data = self.formula.split()
            atregex = re.compile('(\D+)(\d*)')
            for atom in data:
                m = atregex.search(atom)
                if m is None:
                    # Token without any non-digit character: nothing to store.
                    continue
                if m.group(1) in element_dic:
                    self._flformula[m.group(1)] = m.group(2) if m.group(2) != '' else 1

    # OVERWRITE INHERITED FUNCTIONS
    def __str__(self):
        if self.code1 is not None and self.parent is not None:
            return "[{0.id} - {0.code1} from {0.parent}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(self)
        elif self.code1 is not None:
            return "[{0.id} - {0.code1}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(self)
        elif self.parent is not None:
            return "[{0.id} from {0.parent}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(self)
        else:
            return "[{0.id}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(self)
class CDhitList(StorableObject):
    '''
    Ordered collection of cd-hit clusters, with a lookup from sequence name
    to the position of its cluster.
    '''
    def __init__(self, cdhit_file=None):
        '''
        @param:    cdhit_file
        @pdef:     name of the cd-hit output file
        @pdefault: _None_. Create an empty list
        @ptype:    {String}
        '''
        self._clusters = []
        self._allseqids = {}
        if cdhit_file is None:
            self._file = None
        else:
            self._file = File(file_name=cdhit_file)
            self._parse_file()

    ##############
    # ATTRIBUTES #
    ##############
    @property
    def clusters(self):
        '''
        List of cd-hit clusters.

        @return: {List} of {CDhit}
        '''
        return self._clusters

    ###########
    # METHODS #
    ###########
    def get_cluster4sequence(self, sequence):
        '''
        Retrieve the cluster a sequence belongs to.

        @param:    sequence
        @pdef:     name of the query sequence
        @ptype:    {String}

        @return: {CDhit}, or _None_ if the sequence is not found.
        '''
        position = self._allseqids.get(sequence)
        if position is None:
            return None
        return self._clusters[position]

    def is_in_cluster(self, sequence):
        '''
        Evaluate if the sequence is in a cluster.

        @param:    sequence
        @pdef:     name of the query sequence
        @ptype:    {String}

        @return: {String} as 'N' if no, 'H' if yes and 'M' if cluster master
        '''
        cluster = self.get_cluster4sequence(sequence)
        if cluster is None:
            return 'N'
        if cluster.is_master(sequence):
            return 'M'
        return 'H'

    def add_cluster(self, cluster):
        '''
        Append a cd-hit cluster to the list.

        @param:    cluster
        @pdef:     new cd-hit cluster to add
        @ptype:    {CDhit}
        '''
        self._clusters.append(cluster)

    def add_sequence2cluster(self, sequence, cluster_id=None):
        '''
        Add a new sequence to a given cluster and register its name.
        Nothing is done if no cluster has the requested identifier.

        @param:    sequence
        @pdef:     sequence to add; its 'name' attribute is the key under
                   which it is registered.

        @param:    cluster_id
        @pdef:     identifier of the cluster
        @pdefault: _None_. Refers to the last added cluster.
        @ptype:    {String}
        '''
        if cluster_id is None:
            self.clusters[-1].add_sequence(sequence)
            self._allseqids[sequence.name] = len(self.clusters) - 1
            return

        # Only the first cluster with that identifier gets the sequence.
        for position, cluster in enumerate(self._clusters):
            if cluster.identifier == cluster_id:
                cluster.add_sequence(sequence)
                self._allseqids[sequence.name] = position
                break

    def dictionary_role_summary(self):
        '''
        Creates a dictionary separating master sequences and homolog
        sequences.

        @return: {Dictionary} with the keys 'master' and 'homolog'
        '''
        masters = []
        homologs = []
        for cluster in self.clusters:
            masters.append(cluster.master.name)
            homologs.extend(cluster.sequences)
        return {'master': masters, 'homolog': homologs}

    def merge_clusters(self, cluster_file):
        '''
        When using an intermediate state to cluster by homology, the result
        of the second clustering is a clustering of clusters. This
        transforms it back into clusters of the original sequences, which
        replace the content of this object.

        The sequences of the second step are expected to be named after the
        clusters of this object ('Cluster <number>'); the number is used as
        the position in self.clusters.

        @param:    cluster_file
        @pdef:     name of the second-step cluster output
        @ptype:    {String}
        '''
        second_step = CDhitList(cluster_file)
        merged = CDhitList()
        cluster_re = re.compile('Cluster\s+(\d+)')

        for cl in second_step.clusters:
            merged.add_cluster(CDhit(cluster_id=cl.identifier))

            # The cluster behind the second-step master goes in untouched.
            cnum = int(cluster_re.search(cl.master.name).group(1))
            oldclust = self.clusters[cnum]
            merged.add_sequence2cluster(sequence=oldclust.master)
            for name in oldclust.sequences:
                merged.add_sequence2cluster(sequence=oldclust.sequences[name])

            # The clusters behind the second-step homologs get their
            # homology adjusted with that of the second step.
            for key in cl.sequences:
                idclust = cl.sequences[key]
                cnum = int(cluster_re.search(idclust.name).group(1))
                oldclust = self.clusters[cnum]
                master = oldclust.master
                master.homology = idclust.homology
                merged.add_sequence2cluster(sequence=master)
                for name in oldclust.sequences:
                    h = oldclust.sequences[name]
                    h.homology = int(h.homology * float(idclust.homology) / 10)
                    merged.add_sequence2cluster(sequence=h)

        self._clusters = merged._clusters
        self._allseqids = merged._allseqids

    ###################
    # PRIVATE METHODS #
    ###################
    def _parse_file(self):
        '''
        Read the cd-hit output file into a {CDhitList}.

        Lines starting with '>' open a new cluster; every other line is a
        sequence of the cluster opened last.
        '''
        homolog_re = re.compile('(\d+)aa,\s+\>([\s\w]+)\.{3}')
        for line in self._file.read():
            if line.startswith('>'):
                self.add_cluster(CDhit(cluster_id=line.split()[-1].strip()))
                continue
            data = homolog_re.search(line)
            tokens = line.split()
            # The homology is the last token of the line.
            homolog = CDhitHomolog(name=data.group(2), length=data.group(1),
                                   homology=tokens[-1])
            self.add_sequence2cluster(sequence=homolog)
        self._file.close()

    def __len__(self):
        return len(self._clusters)

    def __repr__(self):
        return '\n'.join('{0}'.format(cluster) for cluster in self.clusters)
class PDB(StorableObject):
    """
    A {PDB} is a collection of {Chain}
    """

    def __init__(self, pdb_file=None, dehydrate=False, header=False,
                 onlyheader=False, biomolecule=False):
        """
        @type  pdb_file: String
        @param pdb_file: PDB formated file to read. None creates an empty
                         object.
        @type  dehydrate: Boolean
        @param dehydrate: call dehydrate() on the chains once loaded
        @type  header: Boolean
        @param header: also parse the header; forced to True when
                       biomolecule or onlyheader is requested
        @type  onlyheader: Boolean
        @param onlyheader: parse the header and skip the coordinates
        @type  biomolecule: Boolean
        @param biomolecule: forwarded to the file reader

        @raise IOError if pdb_file does not exist and it is not an empty object
        """
        if biomolecule or onlyheader:
            header = True

        self._pdb_file = pdb_file
        self._chains = []
        self._NMR = False
        self._NMR_chains = []
        self._chain_id = set()
        # -1 -> original
        #  0 -> symmetry
        # >0 -> biomolecule
        self._biomol_id = -1
        self._header = None

        self._has_prot = False
        self._has_nucl = False

        self._COMPND = None

        if self.pdb_file is not None:
            self._pdb_file = File(file_name=self._pdb_file, action='r')
            self._read_PDB_file(header=header,
                                onlyheader=onlyheader,
                                biomolecule=biomolecule)

        if dehydrate:
            self.dehydrate()

    #
    # ATTRIBUTES
    #
    @property
    def pdb_file(self):
        """
        PDB file the object was loaded from: the {File} wrapper built by the
        constructor or the setter, or None for an empty object
        @rtype: {File}
        """
        return self._pdb_file

    @pdb_file.setter
    def pdb_file(self, value):
        """
        Sets a PDB file if none has been given
        @type  value: {File} or String
        @raise AttributeError if a file is already assigned
        """
        if self._pdb_file is not None:
            raise AttributeError(
                "The PDB object is loaded from file {0}. To load the new file {1} create a new PDB object".format(self._pdb_file.full, value))

        if isinstance(value, File):
            self._pdb_file = value
        else:
            # Same keyword as every other File(...) call in this file.
            self._pdb_file = File(file_name=value, action='r')

    @property
    def chain_identifiers(self):
        """
        Set of chain identifiers registered through add_chain
        @rtype: Set
        """
        return self._chain_id

    @property
    def id(self):
        """
        PDB identifier as reported by the first chain (fails with IndexError
        on an object without chains)
        """
        return self._chains[0].pdb

    @property
    def chains(self):
        """
        List of {Chain} contained in the PDB w/out NMR replicas
        @rtype: List of {Chain}
        """
        return self._chains

    @property
    def proteins(self):
        """
        List of {ProteinChain} contained in the PDB w/out NMR replicas
        @rtype: List of {ProteinChain} (iterator)
        """
        for chain in self.chains:
            if isinstance(chain, ChainOfProtein):
                yield chain

    @property
    def nucleotides(self):
        """
        List of {NucleotideChain} contained in the PDB w/out NMR replicas
        @rtype: List of {NucleotideChain} (iterator)
        """
        for chain in self.chains:
            if isinstance(chain, ChainOfNucleotide):
                yield chain

    @property
    def non_standard_chains(self):
        """
        List of non {NucleotideChain}/ non {ProteinChain} contained in the PDB w/out NMR replicas
        @rtype: List of non {NucleotideChain}/ non {ProteinChain} (iterator)
        """
        for chain in self.chains:
            if not isinstance(chain, (ChainOfNucleotide, ChainOfProtein)):
                yield chain

    @property
    def all_models(self):
        """
        List of {Chain} contained in the PDB w/ NMR replicas
        @rtype: List of {Chain}
        """
        return self._chains + self._NMR_chains

    @property
    def header(self):
        """
        Parsed header, or an empty string when no header has been read
        """
        if self._header is None:
            return ''
        return self._header

    @property
    def biomolecule_identifier(self):
        """
        -1 for the original structure, 0 for symmetry, >0 for a biomolecule
        @rtype: Integer
        """
        return self._biomol_id

    #
    # COMPLEX GETTERS & SETTERS
    #
    def get_chain_by_id(self, id):
        """
        Returns a chain according to its id or None if no chain with that id is found
        @rtype: {Chain}
        """
        for chain in self._chains:
            if chain.chain == id:
                return chain
        return None

    def add_chain(self, chain, NMR=False):
        """
        Adds a new chain to the PDB.
        A chain flagged as NMR replica is only stored when the PDB itself is
        marked as NMR; its identifier is registered in either case.
        """
        if not NMR:
            self._chains.append(chain)
        elif self._NMR:
            self._NMR_chains.append(chain)
        self._chain_id.add(chain.chain)

    def add_chains(self, chains, NMR=False):
        """
        Adds a new chains to the PDB
        """
        for chain in chains:
            self.add_chain(chain=chain, NMR=NMR)

    def _get_chain_position_by_id(self, id):
        """
        Returns the position in the chain array where the chain is
        (None when no chain has that id)
        @rtype: Integer
        """
        for position, chain in enumerate(self._chains):
            if chain.chain == id:
                return position
        return None

    #
    # BOOLEANS
    #
    @property
    def is_NMR(self):
        """
        Identifies if the PDB contains NMRs
        @rtype: Boolean
        """
        return self._NMR

    def chain_exists(self, chain):
        """
        Confirms if a given chain exists in the PDB
        @rtype: Boolean
        """
        return chain in self._chain_id

    @property
    def has_protein(self):
        """
        Checks if the PDB contains a protein (not only)
        @rtype: Boolean
        """
        return self._has_prot

    @property
    def has_nucleotide(self):
        """
        Checks if the PDB contains a nucleotide chain (not only)
        @rtype: Boolean
        """
        return self._has_nucl

    @property
    def repeated_chain_ids(self):
        """
        Checks if more than one {Chain} has the same assigned ID
        @rtype: Boolean
        """
        return len(self._chain_id) < len(self._chains)

    @property
    def is_all_ca(self):
        """
        True as soon as one protein chain reports is_only_ca()
        @rtype: Boolean
        """
        for p in self.proteins:
            if p.is_only_ca():
                return True
        return False

    #
    # METHODS
    #
    def dehydrate(self):
        """
        Dehydrates every chain and drops the chains left empty by it
        """
        recheck_chains = False
        for c in self.chains:
            c.dehydrate()
            if c.is_empty:
                recheck_chains = True

        if not recheck_chains:
            return

        kept = [ch for ch in self._chains if not ch.is_empty]
        surviving_ids = set(ch.chain for ch in kept)
        for ch in self._chains:
            # An identifier is only released when no remaining chain carries
            # it; discard() tolerates two empty chains sharing one ID.
            if ch.is_empty and ch.chain not in surviving_ids:
                self._chain_id.discard(ch.chain)
        self._chains = kept

    def duplicate(self, hetero=True, water=False, NMR=False):
        """
        Returns a {PDB} identical to the original but as a new object
        @type  hetero: Boolean
        @param hetero: forwarded to {Chain}.duplicate
        @type  water: Boolean
        @param water: forwarded to {Chain}.duplicate
        @type  NMR: Boolean
        @param NMR: also duplicate the NMR replicas
        @rtype: {PDB}
        """
        new_PDB = PDB()
        # An object built without a file stays file-less, as in __init__.
        if self._pdb_file is not None:
            new_PDB.pdb_file = self.pdb_file

        # Flags go first: add_chain(NMR=True) only stores a replica when the
        # receiving PDB is already marked as NMR.
        new_PDB._NMR = self._NMR
        new_PDB._has_prot = self._has_prot
        new_PDB._has_nucl = self._has_nucl

        for chain in self.chains:
            new_PDB.add_chain(
                chain=chain.duplicate(hetero=hetero, water=water))

        if NMR:
            for chain in self._NMR_chains:
                new_PDB.add_chain(
                    chain=chain.duplicate(hetero=hetero, water=water),
                    NMR=True)

        return new_PDB

    def apply_symmetry_matrices(self):
        """
        Only works if the PDB file is an original PDB file or
        the matrices have been added in the correct PDB format
        @rtype: {PDB}
        """
        if self._header is None:
            self._read_PDB_file(header=True, onlyheader=True)
        return self._apply_matrix(matrix=self.header.symmetry_matrix)

    def apply_biomolecule_matrices(self, keepchains=False, water=True):
        """
        Only works if the PDB file is an original PDB file or
        the matrices have been added in the correct PDB format
        @rtype: List of {PDB}, one per biomolecule in the header
        """
        if self._header is None:
            self._read_PDB_file(header=True, onlyheader=True)
        return [self._apply_matrix(matrix=matrix,
                                   keepchains=keepchains,
                                   realchains=self._chain_id,
                                   water=water)
                for matrix in self.header.biomolecules]

    def _apply_matrix(self, matrix, keepchains=False, realchains=None, water=True):
        """
        Builds a new {PDB} with one repositioned copy of each affected chain
        per transformation in the given matrix set.
        Unless keepchains is set, the copies are re-identified with tmpclean,
        avoiding the identifiers listed in realchains.
        @rtype: {PDB}
        """
        new_PDB = PDB()
        new_PDB._biomol_id = matrix.identifier

        for chain in self.chains:
            if chain.chain not in matrix.chains:
                continue
            for mat in matrix.matrices:
                new_chain = chain.duplicate(water=water)
                new_chain.reposition(matrix=mat.matrix, vector=mat.vector)
                # Copies left without content are not added.
                if len(new_chain) >= 1:
                    new_PDB.add_chain(chain=new_chain)

        if not keepchains:
            new_PDB.tmpclean(cluster_by_alternative_id=True,
                             exclude_chains=realchains)
        return new_PDB

    def clean(self):
        """
        Cleans every chain in order, chaining the atom numbering so each
        chain starts right after the last atom of the previous one
        """
        first_atom = 1
        for c in self.chains:
            c.clean(initatom=first_atom)
            first_atom = c.last_residue.last_atom_number + 1

    def tmpclean(self, cluster_by_alternative_id=False, exclude_chains=None):
        """
        Makes a clean version of the PDB, rechaining in order and renumerating atoms.
        Renumbering residues is optional

        @param cluster_by_alternative_id: record each renamed chain in
                                          _COMPND under its alternative_id
        @param exclude_chains: identifiers that must not be handed out
        """
        pchainsIDs = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890"
        chainsIDs = ""    # identifiers free to be assigned
        chainsNIDs = ""   # identifiers excluded or already in use
        chainID = 0
        atom_count = 1

        for candidate in pchainsIDs:
            if exclude_chains is not None and candidate in exclude_chains:
                chainsNIDs += candidate
            elif not self.chain_exists(chain=candidate):
                chainsIDs += candidate
            else:
                chainsNIDs += candidate

        # Renaming is only attempted when there are enough free identifiers
        # to cover every chain.
        chain_change = len(self) <= len(chainsIDs)

        # NOTE(review): the nesting of this loop was reconstructed from a
        # whitespace-collapsed source -- confirm against the original layout.
        for chain in self.chains:
            if (chain.chain not in chainsNIDs) and chain_change:
                self._chain_id.add(chain.chain)
                chain.chain = chainsIDs[chainID]
                chainID += 1
                self._chain_id.add(chain.chain)
                if cluster_by_alternative_id:
                    if self._COMPND is None:
                        self._COMPND = {}
                    if chain.alternative_id not in self._COMPND:
                        # The first entry of each group is the alternative
                        # id itself.
                        self._COMPND.setdefault(
                            chain.alternative_id, []).append(chain.alternative_id)
                    self._COMPND[chain.alternative_id].append(chain.chain)
            else:
                # The chain keeps its identifier; removing it from the
                # blocked list makes a later chain with the same identifier
                # take the renaming branch.
                chainsNIDs = chainsNIDs.replace(chain.chain, '')

            chain.renumerate_atoms(init=atom_count)
            atom_count += (chain.atom_length)

    def fuse_chains(self, chains_ids):
        """
        Fuses several chains into the first one.
        It will not allow to fuse different structural chains.
        It does not alter the {PDB}, but provides a new one
        @rtype: {PDB} holding the single fused chain

        @raise AttributeError if:
            a) A given chain ID is not present
            b) Try to fuse different structural chains
        """
        if len(self._chain_id.intersection(set(chains_ids))) < len(chains_ids):
            raise AttributeError(
                "Some of the given chains to fues do not exist")

        error_counter = 0
        error_control = [False, False]  # [protein seen, nucleotide seen]
        new_PDB = PDB()
        for c in chains_ids:
            chain = self.get_chain_by_id(id=c)
            new_PDB.add_chain(chain=chain.duplicate())
            if isinstance(chain, ChainOfProtein) and not error_control[0]:
                error_counter += 1
                error_control[0] = True
            elif isinstance(chain, ChainOfNucleotide) and not error_control[1]:
                error_counter += 1
                error_control[1] = True
            if error_counter == 2:
                raise AttributeError(
                    "Fuse different kinds of structural chain is not possible\n")

        init_chain_num = new_PDB.chains[0].last_residue.number
        for x in range(1, len(new_PDB.chains)):
            new_PDB.chains[x].renumerate_residues(init=init_chain_num + 1)
            new_PDB.chains[0].fuse(chain=new_PDB.chains[x])
            # Refresh after the fuse so the next chain continues from the
            # new last residue, not from the original one.
            init_chain_num = new_PDB.chains[0].last_residue.number

        return_PDB = PDB()
        return_PDB.add_chain(chain=new_PDB.chains[0])
        return return_PDB

    # def calculate_dssp(self, out_dir = None, store = True):
    #     """
    #     Executes DSSP and assigns the prediction to each chain
    #     @param out_dir: directory to save the output
    #     @defaut out_dir: None
    #     @param store: Save the dssp output(?)
    #     """
    #     for chain in self.proteins:
    #         if out_dir is None:
    #             pdb_file = chain.globalID + ".pdb2dssp"
    #             dssp_file = chain.globalID + ".dssp"
    #         else:
    #             Path.mkdir(newdir = out_dir)
    #             pdb_file = os.path.join(os.path.abspath(out_dir), chain.globalID + ".pdb2dssp")
    #             dssp_file = os.path.join(os.path.abspath(out_dir), chain.globalID + ".dssp")
    #         pdb_fd = open(pdb_file, 'w')
    #         pdb_fd.write(chain.PDB_format())
    #         pdb_fd.close()
    #         dssp_calc = DSSPexec(pdb_file = pdb_file, dssp_file = dssp_file,
    #                              chain = chain, store = store)

    def rotate(self, matrix=None):
        """
        Rotates each {Chain} according to a given matrix
        (identity when no matrix is given)

        @type matrix: numpy.matrix
        """
        if matrix is None:
            matrix = numpy.identity(3, float)
        for chain in self.all_models:
            chain.rotate(matrix=matrix)

    def translate(self, vector=None):
        """
        Translates each {Chain} according to a translational vector
        (zero vector when none is given)

        @type vector: numpy.array
        """
        if vector is None:
            vector = numpy.zeros(3, float)
        for chain in self.all_models:
            chain.translate(vector=vector)

    def reposition(self, matrix=None, vector=None):
        """
        Rotates and Translates each {Chain} according to a matrix and a translational vector
        (identity / zero vector when not given)

        @type matrix: numpy.matrix

        @type vector: numpy.array
        """
        if matrix is None:
            matrix = numpy.identity(3, float)
        if vector is None:
            vector = numpy.zeros(3, float)
        for chain in self.all_models:
            chain.reposition(matrix=matrix, vector=vector)

    # def calculate_protein_heteroatom_contacts(self, distance = 6):
    #     """
    #     Returns a {HeteroatomContacts} list with the contacts between a protein and its heteroatoms
    #     at a maximum given distance
    #     @type distance: Integer
    #     @rtype: list of {HeteroatomContacts}
    #     """
    #     data = []
    #     for protein in self.proteins:
    #         data.append(HeteroatomContacts(chain = protein, max_distance = distance))
    #     return data

    #
    # OVERRIDE PARENT'S FUNCTIONS
    #
    @staticmethod
    def read(input_file, format='PDB'):
        """
        Reads a file of data in a specific format and returns the object

        @type  input_file: String
        @param input_file: File to read

        @type  format: String
        @param format: Format of the file to read
        """
        if format == 'PDB':
            pdb = PDB(pdb_file=input_file)
            return pdb

    def write(self, output_file=None, format='PDB', force=None, clean=False):
        """
        Writes the object in a specific format

        @type  output_file: String
        @param output_file: File to write

        @type  format: String
        @param format: Format of the file to print

        @type  force: Boolean
        @param force: overwrite request, resolved through SBIg.decide_overwrite

        @type  clean: Boolean
        @param clean: call clean() before formatting
        """
        outfile = File(file_name=output_file, action='w',
                       overwrite=SBIg.decide_overwrite(force))
        if format == 'PDB':
            self._write_PDB_file(pdb_file=outfile, clean=clean)

    #
    # IO
    #
    def _read_PDB_file(self, header=False, onlyheader=False, biomolecule=False):
        """
        Process and load crystal data from a PDB formated file
        """
        from parse_pdb import read_PDB_file, read_PDB_header
        if header:
            read_PDB_header(self)
            # Close and reopen the file before the coordinates pass.
            self._pdb_file.close()
            self._pdb_file.open()
        if not onlyheader:
            # read_PDB_file(self, biomolecule=biomolecule)
            read_PDB_file(self)
        self._pdb_file.close()

    # def _represent_COMPND(self):
    #     if self._COMPND is None: return ''
    #     data = []
    #     mol_counter = 1
    #     for chain in self._COMPND:
    #         data.append("COMPND    MOL_ID: %d;" %mol_counter)
    #         data.append("COMPND   2 CHAIN: " + ",".join(self._COMPND[chain]) + ";")
    #         if len(self._biomolecA) > 0:
    #             matrices = []
    #             for mat in self._biomolecA:
    #                 if mat[1] == chain: matrices.append(str(mat[0]))
    #             data.append("COMPND   3 MATRICES: " + ",".join(sorted(matrices)))
    #         mol_counter += 1
    #     return "\n".join(data) + "\n"

    def _write_PDB_file(self, pdb_file, clean=False):
        """
        Print a crystal into a PDB formated file
        """
        # out_fd = pdb_file.descriptor
        # out_fd.write(self._represent_COMPND())
        pdb_file.write(self.PDB_format(clean=clean) + "\n")
        pdb_file.close()

    def PDB_format(self, clean=False, terminal=True):
        """
        Strings a {PDB} in PDB format
        (NMR replicas are not included; the text ends with an END record)
        @rtype: String
        """
        if clean:
            self.clean()
        lines = [chain.PDB_format(terminal=terminal) for chain in self._chains]
        lines.append("END")
        return "\n".join(lines)

    def FASTA_format(self, gapped=True, protein=True, nucleotide=False):
        """
        Strings the sequences of the selected chain types in FASTA format
        (empty string when no chain qualifies)
        @rtype: String
        """
        # TODO: return fasta object
        lines = []
        for c in self.chains:
            if isinstance(c, ChainOfProtein) and protein:
                lines.append(
                    ">{0}\t{1}".format(c.globalID, c.aminoacids[0].identifier))
                if gapped:
                    lines.append("{0}".format(c.gapped_protein_sequence))
                else:
                    lines.append("{0}".format(c.protein_sequence))
            if isinstance(c, ChainOfNucleotide) and nucleotide:
                lines.append(
                    ">{0}\t{1}".format(c.globalID, c.nucleotides[0].identifier))
                if gapped:
                    lines.append("{0}".format(c.gapped_nucleotide_sequence()))
                else:
                    lines.append("{0}".format(c.nucleotide_sequence()))
        if not lines:
            return ""
        return "\n".join(lines) + "\n"

    def IDX_format(self, protein=True, nucleotide=False):
        """
        Strings the index of the selected chain types, one line per chain
        (empty string when no chain qualifies)
        @rtype: String
        """
        lines = []
        for c in self.chains:
            if isinstance(c, ChainOfProtein) and protein:
                lines.append(">{0}\t{1}".format(c.globalID, c.protein_idx))
            if isinstance(c, ChainOfNucleotide) and nucleotide:
                lines.append(
                    ">{0}\t{1}".format(c.globalID, c.nucleotide_idx()))
        if not lines:
            return ""
        return "\n".join(lines) + "\n"

    def FASTA_IDX(self, protein=True, nucleotide=False):
        """
        Collects the gapped FASTA entry and the index line of each selected
        chain
        @rtype: Dictionary with keys 'FASTA' and 'IDX', each a List of String
        """
        data = {'FASTA': [], 'IDX': []}
        for c in self.chains:
            if isinstance(c, ChainOfProtein) and protein:
                data['FASTA'].append(
                    ">{0}\n{1}".format(c.globalID, c.gapped_protein_sequence))
                data['IDX'].append(
                    ">{0}\t{1}".format(c.globalID, c.protein_idx))
            if isinstance(c, ChainOfNucleotide) and nucleotide:
                data['FASTA'].append(
                    ">{0}\n{1}".format(c.globalID, c.gapped_nucleotide_sequence()))
                data['IDX'].append(
                    ">{0}\t{1}".format(c.globalID, c.nucleotide_idx()))
        return data

    #
    # OVERRIDE DEFAULT METHODS
    #
    def __len__(self):
        """
        Number of chains, NMR replicas not counted
        """
        return len(self._chains)