def build_multifasta(file_name, sequenceList, force=False):
    """
    Writes all the given sequences, FASTA formatted, into one file and
    returns that file wrapped in a {Fasta}.

    @type  file_name: String
    @param file_name: Name of the multi-FASTA file to create
    @type  sequenceList: Iterable of objects exposing format('FASTA')
    @param sequenceList: Sequences to write, one record after the other
    @type  force: Boolean
    @param force: Forwarded to {File} as its overwrite flag
    @rtype: {Fasta}
    """
    multifasta = File(file_name, 'w', overwrite=force)
    out_fd = multifasta.descriptor
    for entry in sequenceList:
        record = entry.format('FASTA')
        out_fd.write(record + "\n")
    multifasta.close()
    return Fasta(fasta_file=multifasta.full)
def _process(self):
    """
    Builds the PDBTM summary file from the local XML entries.

    Every '*.xml' file under <self._local>/pdbtm/database/ is scanned line
    by line and its data gathered in a {TM} object. Entries that end up
    with at least one chain are written, one per line, to self._pdbtmfile.

    @raise AttributeError if a SIDEDEFINITION or an active REGION line does
           not match the expected attribute layout (the match is used
           unguarded, as before).
    """
    # Raw strings keep the regex escapes away from Python's own string
    # escape processing; compiling once avoids re-parsing them per line.
    side_re = re.compile(r'Side1="(\S+)"')
    new_chain_re = re.compile(r'NEW_CHAINID="(\S{1})"')
    # NOTE(review): NUM_TM only accepts a single digit, so a chain declaring
    # 10 or more TM segments never matches and is skipped -- confirm intended.
    chain_re = re.compile(r'CHAINID="(\S{1})" NUM_TM="(\d{1})" TYPE="(\S+)"')
    region_re = re.compile(
        r'pdb_beg="(\-*\d+\w*)"[\s\S]+pdb_end="(\-*\d+\w*)"\s+type="(\w{1})"')

    tmoFile = File(self._pdbtmfile, 'w', True)
    try:
        xml_dir = os.path.join(self._local, 'pdbtm/database/')
        for xmlfile in Path.list_files(xml_dir, '*.xml'):
            # The PDB code is the upper-cased file name without extension.
            pdb_code = os.path.splitext(os.path.split(xmlfile)[1])[0].upper()
            xmldata = TM(pdb=pdb_code)
            skip_chains = set()  # chain ids created through APPLY_TO_CHAIN
            read = False         # True while inside an accepted <CHAIN>
            with open(xmlfile) as fdxml:
                for line in fdxml:
                    if line.startswith(' <TMRES>'):
                        xmldata.tmres = line
                    elif line.startswith(' <TMTYPE'):
                        xmldata.tmtype = line
                    elif line.startswith(' <PDBKWRES'):
                        xmldata.kwres = line
                    elif line.startswith(' <SIDEDEFINITION'):
                        m = side_re.search(line)
                        xmldata.side = m.group(1)
                    elif line.startswith(' <APPLY_TO_CHAIN'):
                        m = new_chain_re.search(line)
                        if m:
                            skip_chains.add(m.group(1))
                    elif line.startswith(' <CHAIN '):
                        m = chain_re.search(line)
                        if m:
                            chain, num, tmtype = m.group(1), m.group(2), m.group(3)
                            if chain not in skip_chains:
                                cdata = (chain, num, tmtype)
                                xmldata.set_chain(cdata)
                                read = True
                    elif line.startswith(' <REGION ') and read:
                        m = region_re.search(line)
                        ini, end, tmtype = m.group(1), m.group(2), m.group(3)
                        # Regions are attached to the last accepted chain.
                        xmldata.set_chain(cdata, (ini, end, tmtype))
                    elif line.startswith(' </CHAIN>'):
                        read = False
            if len(xmldata.chains) > 0:
                tmoFile.write(str(xmldata) + "\n")
    finally:
        # Close the summary file even if one XML entry fails to parse.
        tmoFile.close()
def format2file(self, filename, extension='pdb', center=False):
    """
    Writes the structure into '<filename>.<extension>'.

    @type  filename: String
    @param filename: Output file name without extension
    @type  extension: String
    @param extension: Either 'pdb' (uses pdb_format) or 'js' (uses js_format)
    @param center: Forwarded to the selected formatter
    @raise AttributeError if the extension is not one of the accepted ones
    """
    accepted = ('pdb', 'js')
    if not extension in accepted:
        raise AttributeError('Not accepted extension')

    target = File('.'.join((filename, extension)), 'w')
    # Only the two accepted extensions reach this point.
    formatter = self.pdb_format if extension == 'pdb' else self.js_format
    target.write(formatter(center=center))
    target.close()
def build(file_name, sequenceID, sequence, force=False):
    """
    Creates a FASTA file holding a single sequence and returns it wrapped
    in a {Fasta}.

    @type  file_name: String
    @param file_name: Name of the FASTA file to create
    @param sequenceID: Identifier given to the new {Sequence}
    @param sequence: Sequence given to the new {Sequence}
    @type  force: Boolean
    @param force: Forwarded to {File} as its overwrite flag
    @rtype: {Fasta}
    """
    single_fasta = File(file_name, 'w', overwrite=force)
    record = Sequence(seqID=sequenceID, sequence=sequence)
    # Unlike the multi-FASTA writer, no trailing newline is appended here.
    single_fasta.descriptor.write(record.format('FASTA'))
    single_fasta.close()
    return Fasta(fasta_file=single_fasta.full)
def _process(self):
    """
    Merges the entries parsed from the enzyme class and enzyme data sources,
    sorts them and writes their repr(), one per line, to self._enzfile.
    """
    all_enzymes = sorted(self._parse_enzclass() + self._parse_enzymedat())
    out = File(self._enzfile, 'w', True)
    for enzyme in all_enzymes:
        out.write(repr(enzyme) + "\n")
    out.close()
def _process(self):
    """
    Processes the targets first, then the drugs (which need the targets),
    and writes the repr() of every drug, one per line, to self._drugfile.
    """
    processed_drugs = self._process_drugs(self._process_targets())
    out = File(self._drugfile, 'w', True)
    for drug in processed_drugs:
        out.write(repr(drug) + "\n")
    out.close()
def make_PDBseq(self, log_file, resolution_threshold=None):
    """
    Builds the PDBseq database (PDBseq.fa and PDBseq.fa.idx) from every
    local PDB file, logging the progress.

    Sequences that are empty once every 'X'/'x' is removed are skipped.

    @type  log_file: String
    @param log_file: Name of the log file to (over)write
    @param resolution_threshold: When given, sequences of PDBs listed by
           get_resolutions() that are not CA-only are also written to the
           filtered FASTA file.
    @raise NameError if no local PDB database is defined
    """
    if not self.has_local:
        raise NameError(
            'A local PDB database must be defined to do create a PDBseq database.'
        )

    if self.PDBseq is not None:
        outdir = self.PDBseq
        Path.mkdir(outdir)
    else:
        # Previously mkdir was called with None here; the current directory
        # already exists, so there is nothing to create.
        outdir = os.curdir

    fasta_file = File(file_name=os.path.join(outdir, 'PDBseq.fa'),
                      action='w', overwrite=True)
    idx_file = File(file_name=os.path.join(outdir, 'PDBseq.fa.idx'),
                    action='w', overwrite=True)
    fasta_fd = fasta_file.descriptor
    idx_fd = idx_file.descriptor

    # The filtered output was referenced below but its setup was commented
    # out, so any threshold ended in a NameError. Setup restored as written.
    # NOTE(review): relies on self.get_PDBseq_filtered / self.get_resolutions
    # being defined on this class -- confirm.
    filtered_file = None
    filtered_fd = None
    resolutions = ()
    if resolution_threshold is not None:
        filtered_file = File(
            file_name=self.get_PDBseq_filtered(resolution_threshold),
            action='w', overwrite=True)
        filtered_fd = filtered_file.descriptor
        resolutions = self.get_resolutions(
            resolution_threshold=resolution_threshold)

    log = File(file_name=log_file, action='w', overwrite=True)
    log_fd = log.descriptor
    try:
        for pdb_file in self.localPDBs:
            log_fd.write("Reading File: {0}\n".format(pdb_file))
            newPDB = PDB(pdb_file=pdb_file, dehydrate=True)
            fasta_idx = newPDB.FASTA_IDX(nucleotide=False)
            fastas, indexes = fasta_idx['FASTA'], fasta_idx['IDX']
            if len(fastas) != len(indexes):
                log_fd.write(
                    'ERROR!!!!! Number of fastas and indexes are different for pdb {0}!!\n'
                    .format(newPDB.id))
            if len(fastas) > 0:
                log_fd.write('\tPrinting FASTA and IDX...\n')
            else:
                log_fd.write('\tProblably just a nucleotide PDB...\n')

            keep_filtered = (filtered_fd is not None
                             and newPDB.id in resolutions
                             and not newPDB.is_all_ca)
            for c, fasta in enumerate(fastas):
                # Second line of the record is the sequence itself.
                sequence = fasta.split('\n')[1]
                sequence = sequence.replace('X', '').replace('x', '')
                if len(sequence) > 0:
                    fasta_fd.write(fasta + "\n")
                    if keep_filtered:
                        filtered_fd.write(fasta + "\n")
                    idx_fd.write(indexes[c] + "\n")
            del newPDB  # release the structure before loading the next one
    finally:
        # Close every output, including the log, which was left open before.
        fasta_file.close()
        idx_file.close()
        log.close()
        if filtered_file is not None:
            filtered_file.close()
def _process(self):
    """
    Parses the local GO term file into {GOterm} objects, expands the parents
    of every term through _search_parents() and writes one term per line to
    self._gofile.

    Reading stops at the first '[Typedef]' stanza. The term being read is
    stored both there and at the end of the file, so the last term is kept
    even when no '[Typedef]' stanza follows it.
    """
    go_dic = {}
    go = None
    parseFile = File(os.path.join(self.local, self._gfile), 'r')
    try:
        for line in parseFile.descriptor:
            # Single quotes are backslash-escaped before any field is read.
            line = line.replace("'", "\\'")
            if line.startswith('[Term]'):
                # A new stanza starts: store the term read so far.
                if go is not None:
                    go_dic[go.id] = go
            elif line.startswith('id:'):
                go = GOterm(id=line.split()[1].strip())
            elif line.startswith('name:'):
                go.name = " ".join(line.split()[1:]).strip()
            elif line.startswith('namespace:'):
                go.namespace = line.split()[1].strip()
            elif line.startswith('alt_id:'):
                go.alt_id.append(line.split()[1].strip())
            elif line.startswith('is_obsolete:'):
                # The tag's presence alone marks the term as obsolete.
                go.obsolete = True
            elif line.startswith('is_a:'):
                go.parents.add(line.split()[1].strip())
            elif line.startswith('relationship:'):
                fields = line.split()
                go.relations.append((fields[1].strip(), fields[2].strip()))
            elif line.startswith('[Typedef]'):
                break
    finally:
        parseFile.close()

    # Store the last term, whether the loop stopped at '[Typedef]' or at the
    # end of the file; a file without any term leaves the dictionary empty.
    if go is not None:
        go_dic[go.id] = go

    for go in go_dic:
        go_dic[go].parents = self._search_parents(go_dic, go)

    goFile = File(self._gofile, 'w', True)
    try:
        for go in go_dic:
            # Every term is written as a parent of itself.
            go_dic[go].parents.add(go)
            goFile.write(str(go_dic[go]) + "\n")
    finally:
        goFile.close()
def _process(self):
    """
    Builds the taxonomy summary file out of the four '|'-separated source
    files (nodes, names, deleted, merged) and writes one {TaxID} per line
    to self._taxid.
    """
    inh = {}

    # nodes: field 0 is the taxid, field 1 its parent, field 2 its rank.
    nodefile = File(file_name=self._nodes, action='r')
    for raw in nodefile.descriptor:
        fields = re.sub('\'', '\\\'', raw).split('|')
        taxid = fields[0].strip()
        inh[taxid] = TaxID(taxid)
        inh[taxid].parent = fields[1].strip()
        inh[taxid].rank = fields[2].strip()
    nodefile.close()

    # names: only the 'scientific name' rows are kept.
    namefile = File(file_name=self._names, action='r')
    for raw in namefile.descriptor:
        fields = re.sub('\'', '\\\'', raw).split('|')
        if fields[3].strip() == 'scientific name':
            inh[fields[0].strip()].name = fields[1].strip()
    namefile.close()

    # deleted: the taxid is (re)created and flagged as old.
    delefile = File(file_name=self._delet, action='r')
    for raw in delefile.descriptor:
        fields = raw.split('|')
        taxid = fields[0].strip()
        inh[taxid] = TaxID(taxid)
        inh[taxid].old = True
    delefile.close()

    # merged: flagged as old and pointing to the taxid that replaces it.
    mrgefile = File(file_name=self._merged, action='r')
    for raw in mrgefile.descriptor:
        fields = raw.split('|')
        taxid = fields[0].strip()
        inh[taxid] = TaxID(taxid)
        inh[taxid].old = True
        inh[taxid].new = fields[1].strip()
    mrgefile.close()

    taxFile = File(self._taxid, 'w', True)
    for entry in inh.values():
        taxFile.write(str(entry) + "\n")
    taxFile.close()
def _parse_uniprot_file(self, source, destination, fasta, code):
    """
    Parses a Uniprot flat file into {Uniprot} objects, writing str(protein)
    to the destination file and repr(protein) to the fasta file each time a
    record terminator ('//') is found.

    @type  source: String
    @param source: Uniprot flat file to read
    @type  destination: String
    @param destination: Summary file to (over)write
    @type  fasta: String
    @param fasta: FASTA file to (over)write
    @param code: Passed as second argument to every {Uniprot} created
    """
    sourceFile = File(source, 'r')
    destinFile = File(destination, 'w', True)
    fastaFile = File(fasta, 'w', True)
    protein = None
    try:
        for line in sourceFile.descriptor:
            # The prefixes are mutually exclusive, so one branch per line.
            if line.startswith('ID'):
                protein = Uniprot(line.split()[1].strip(), code)
            elif line.startswith('AC'):
                protein.accession = line.split()[1:]
            elif line.startswith('OX'):
                protein.taxid = line.split()[1]
            elif line.startswith('OH'):
                protein.hosts = line.split()[1]
            elif line.startswith('DR'):
                protein.databases = line.split()[1:3]
            elif line.startswith(' '):
                # Sequence lines: inner blanks are removed.
                # NOTE(review): assigned once per line, so the Uniprot
                # setter presumably accumulates -- confirm.
                protein.sequence = line.strip().replace(' ', '')
            elif line.startswith('//'):
                destinFile.write(str(protein) + "\n")
                fastaFile.write(repr(protein) + "\n")
    finally:
        sourceFile.close()
        destinFile.close()
        # The FASTA output was never closed before.
        fastaFile.close()
def get_FASTA_IDX_by_names_to_file(self, names, outfile):
    """
    Extracts from the PDBseq database the FASTA records and the index lines
    of the requested names.

    The FASTA records go to <outfile>, the index lines to <outfile>.idx.

    @param names: Names to select; a deep copy is handed to retrieve(), the
           original is used to filter the index lines
    @type  outfile: String
    @param outfile: Name of the FASTA file to write
    """
    # FASTA part
    selection = Fasta(self.PDBseq).retrieve(copy.deepcopy(names))
    fasta_out = File(outfile, 'w')
    for record in selection:
        fasta_out.write(record.format('FASTA') + "\n")
    fasta_out.close()

    # IDX part
    source_idx_name = self.PDBseq + '.idx'
    idx_out = File(outfile + '.idx', 'w')
    idx_in = File(source_idx_name, 'r')
    for idx_line in idx_in.descriptor:
        # First column without its leading character is the sequence name.
        first_column = idx_line.split()[0]
        if first_column[1:] in names:
            idx_out.write(idx_line)
    idx_in.close()
    idx_out.close()
class PDB(StorableObject):
    """
    A {PDB} is a collection of {Chain}
    """

    def __init__(self, pdb_file=None, dehydrate=False, header=False,
                 onlyheader=False, biomolecule=False):
        """
        @type  pdb_file: String
        @param pdb_file: PDB formated file to read
        @type  dehydrate: Boolean
        @param dehydrate: Call dehydrate() once the file is loaded
        @param header: Read the header of the file
        @param onlyheader: Read only the header; implies header
        @param biomolecule: Implies header

        @raise IOError if pdb_file does not exist and it is not an empty object
        """
        if biomolecule or onlyheader:
            header = True

        self._pdb_file = pdb_file
        self._chains = []
        self._NMR = False
        self._NMR_chains = []
        self._chain_id = set()
        # -1 -> original; 0 -> symmetry; >0 -> biomolecule
        self._biomol_id = -1
        self._header = None

        self._has_prot = False
        self._has_nucl = False

        self._COMPND = None

        if self.pdb_file is not None:
            self._pdb_file = File(file_name=self._pdb_file, action='r')
            self._read_PDB_file(header=header, onlyheader=onlyheader,
                                biomolecule=biomolecule)

        if dehydrate:
            self.dehydrate()

    #
    # ATTRIBUTES
    #
    @property
    def pdb_file(self):
        """
        PDB file the object has been loaded from (None for an empty object)
        """
        return self._pdb_file

    @pdb_file.setter
    def pdb_file(self, value):
        """
        Sets a PDB file if none has been given

        @raise AttributeError if the object is already linked to a file
        """
        if self._pdb_file is not None:
            raise AttributeError(
                "The PDB object is loaded from file {0}. "
                "To load the new file {1} create a new PDB object"
                .format(self._pdb_file.full, value))

        if isinstance(value, File):
            self._pdb_file = value
        else:
            # Was type='r'; every other File call names this parameter
            # 'action'.
            self._pdb_file = File(file_name=value, action='r')

    @property
    def chain_identifiers(self):
        return self._chain_id

    @property
    def id(self):
        return self._chains[0].pdb

    @property
    def chains(self):
        """
        List of {Chain} contained in the PDB w/out NMR replicas
        @rtype: List of {Chain}
        """
        return self._chains

    @property
    def proteins(self):
        """
        {ChainOfProtein} contained in the PDB w/out NMR replicas
        @rtype: iterator of {ChainOfProtein}
        """
        for chain in self.chains:
            if isinstance(chain, ChainOfProtein):
                yield chain

    @property
    def nucleotides(self):
        """
        {ChainOfNucleotide} contained in the PDB w/out NMR replicas
        @rtype: iterator of {ChainOfNucleotide}
        """
        for chain in self.chains:
            if isinstance(chain, ChainOfNucleotide):
                yield chain

    @property
    def non_standard_chains(self):
        """
        Chains that are neither {ChainOfNucleotide} nor {ChainOfProtein},
        w/out NMR replicas
        @rtype: iterator of {Chain}
        """
        for chain in self.chains:
            if not isinstance(chain, (ChainOfNucleotide, ChainOfProtein)):
                yield chain

    @property
    def all_models(self):
        """
        List of {Chain} contained in the PDB w/ NMR replicas
        @rtype: List of {Chain}
        """
        return self._chains + self._NMR_chains

    @property
    def header(self):
        """
        Header of the PDB, or an empty string if none has been read
        """
        if self._header is None:
            return ''
        return self._header

    @property
    def biomolecule_identifier(self):
        return self._biomol_id

    #
    # COMPLEX GETTERS & SETTERS
    #
    def get_chain_by_id(self, id):
        """
        Returns the first chain with the given id or None if no chain with
        that id is found
        @rtype: {Chain}
        """
        for chain in self._chains:
            if chain.chain == id:
                return chain
        return None

    def add_chain(self, chain, NMR=False):
        """
        Adds a new chain to the PDB.

        A chain flagged as NMR is only stored when the PDB itself is flagged
        as NMR; its identifier is registered in any case.
        """
        if not NMR:
            self._chains.append(chain)
        elif self._NMR:
            self._NMR_chains.append(chain)
        self._chain_id.add(chain.chain)

    def add_chains(self, chains, NMR=False):
        """
        Adds several chains to the PDB
        """
        for chain in chains:
            self.add_chain(chain=chain, NMR=NMR)

    def _get_chain_position_by_id(self, id):
        """
        Returns the position in the chain array where the chain is, or None
        @rtype: Integer
        """
        for position, chain in enumerate(self._chains):
            if chain.chain == id:
                return position
        return None

    #
    # BOOLEANS
    #
    @property
    def is_NMR(self):
        """
        Identifies if the PDB contains NMRs
        @rtype: Boolean
        """
        return self._NMR

    def chain_exists(self, chain):
        """
        Confirms if a given chain identifier exists in the PDB
        @rtype: Boolean
        """
        return chain in self._chain_id

    @property
    def has_protein(self):
        """
        Checks if the PDB contains a protein (not only)
        @rtype: Boolean
        """
        return self._has_prot

    @property
    def has_nucleotide(self):
        """
        Checks if the PDB contains a nucleotide chain (not only)
        @rtype: Boolean
        """
        return self._has_nucl

    @property
    def repeated_chain_ids(self):
        """
        Checks if more than one {Chain} has the same assigned ID
        @rtype: Boolean
        """
        return len(self._chain_id) < len(self._chains)

    @property
    def is_all_ca(self):
        """
        True as soon as one protein chain reports is_only_ca()
        @rtype: Boolean
        """
        for p in self.proteins:
            if p.is_only_ca():
                return True
        return False

    #
    # METHODS
    #
    def dehydrate(self):
        """
        Dehydrates every chain and drops the chains left empty by it.
        """
        found_empty = False
        for c in self.chains:
            c.dehydrate()
            if c.is_empty:
                found_empty = True

        if found_empty:
            kept = [ch for ch in self.chains if not ch.is_empty]
            kept_ids = set(ch.chain for ch in kept)
            for ch in self.chains:
                # Forget an identifier only when no surviving chain uses it;
                # discard() tolerates several empty chains sharing one id,
                # which made remove() raise KeyError.
                if ch.is_empty and ch.chain not in kept_ids:
                    self._chain_id.discard(ch.chain)
            self._chains = kept

    def duplicate(self, hetero=True, water=False, NMR=False):
        """
        Returns a {PDB} identical to the original but as a new object
        @rtype: {PDB}
        """
        new_PDB = PDB()
        # An empty source has no file; the setter cannot take None.
        if self.pdb_file is not None:
            new_PDB.pdb_file = self.pdb_file

        # add_chain() only stores NMR chains when the PDB is flagged as NMR,
        # so the flag has to be copied before the replicas are added.
        new_PDB._NMR = self._NMR
        new_PDB._has_prot = self._has_prot
        new_PDB._has_nucl = self._has_nucl

        for chain in self.chains:
            new_PDB.add_chain(
                chain=chain.duplicate(hetero=hetero, water=water))

        if NMR:
            for chain in self._NMR_chains:
                new_PDB.add_chain(
                    chain=chain.duplicate(hetero=hetero, water=water),
                    NMR=True)

        return new_PDB

    def apply_symmetry_matrices(self):
        """
        Only works if the PDB file is an original PDB file
        or the matrices have been added in the correct PDB format
        @rtype: {PDB}
        """
        if self._header is None:
            self._read_PDB_file(header=True, onlyheader=True)
        return self._apply_matrix(matrix=self.header.symmetry_matrix)

    def apply_biomolecule_matrices(self, keepchains=False, water=True):
        """
        Only works if the PDB file is an original PDB file
        or the matrices have been added in the correct PDB format
        @rtype: List of {PDB}, one per biomolecule of the header
        """
        if self._header is None:
            self._read_PDB_file(header=True, onlyheader=True)
        return [self._apply_matrix(matrix=matrix, keepchains=keepchains,
                                   water=water)
                for matrix in self.header.biomolecules]

    def _apply_matrix(self, matrix, keepchains=False, water=True):
        """
        Builds a new {PDB} with one repositioned copy of every chain listed
        by the matrix set for each of its matrices. Unless keepchains is
        set, the result goes through tmpclean().
        @rtype: {PDB}
        """
        new_PDB = PDB()
        new_PDB._biomol_id = matrix.identifier
        for chain in self.chains:
            if chain.chain in matrix.chains:
                for mat in matrix.matrices:
                    new_chain = chain.duplicate(water=water)
                    new_chain.reposition(matrix=mat.matrix,
                                         vector=mat.vector)
                    if len(new_chain) >= 1:
                        new_PDB.add_chain(chain=new_chain)
        if not keepchains:
            new_PDB.tmpclean(cluster_by_alternative_id=True)
        return new_PDB

    def clean(self):
        """
        Cleans every chain, numbering the atoms consecutively across chains.
        """
        first_atom = 1
        for c in self.chains:
            c.clean(initatom=first_atom)
            first_atom = c.last_residue.last_atom_number + 1

    def tmpclean(self, cluster_by_alternative_id=False):
        """
        Makes a clean version of the PDB, rechaining in order and
        renumerating atoms.

        The first chain found with a given identifier keeps it; later chains
        repeating it get the next free identifier, as long as there are
        enough free identifiers for all the chains.
        """
        pchainsIDs = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890"
        chainsIDs = ""   # identifiers still free
        chainsNIDs = ""  # identifiers in use, not yet claimed by a chain
        chainID = 0
        atom_count = 1

        for candidate in pchainsIDs:
            if not self.chain_exists(chain=candidate):
                chainsIDs += candidate
            else:
                chainsNIDs += candidate

        chain_change = len(self) <= len(chainsIDs)
        for chain in self.chains:
            if (chain.chain not in chainsNIDs) and chain_change:
                self._chain_id.add(chain.chain)
                chain.chain = chainsIDs[chainID]
                chainID += 1
                self._chain_id.add(chain.chain)
                if cluster_by_alternative_id:
                    if self._COMPND is None:
                        self._COMPND = {}
                    # 'in' replaces dict.has_key(), removed in Python 3.
                    if chain.alternative_id not in self._COMPND:
                        self._COMPND[chain.alternative_id] = [
                            chain.alternative_id]
                    self._COMPND[chain.alternative_id].append(chain.chain)
            else:
                # First chain with this id: it keeps it, repeats get renamed.
                chainsNIDs = chainsNIDs.replace(chain.chain, '')
            chain.renumerate_atoms(init=atom_count)
            atom_count += (chain.atom_length)

    def fuse_chains(self, chains_ids):
        """
        Fuses several chains into the first one.
        It will not allow to fuse different structural chains.
        It does not alter the {PDB}, but provides a new one

        @rtype: {PDB} holding the single fused chain

        @raise AttributeError if:
            a) A given chain ID is not present
            b) Try to fuse different structural chains
        """
        if len(self._chain_id.intersection(set(chains_ids))) < len(chains_ids):
            raise AttributeError(
                "Some of the given chains to fues do not exist")

        seen_protein = False
        seen_nucleotide = False
        new_PDB = PDB()
        for c in chains_ids:
            chain = self.get_chain_by_id(id=c)
            new_PDB.add_chain(chain=chain.duplicate())
            if isinstance(chain, ChainOfProtein):
                seen_protein = True
            elif isinstance(chain, ChainOfNucleotide):
                seen_nucleotide = True
            if seen_protein and seen_nucleotide:
                raise AttributeError(
                    "Fuse different kinds of structural chain is not possible\n"
                )

        target = new_PDB.chains[0]
        init_chain_num = target.last_residue.number
        for x in range(1, len(new_PDB.chains)):
            new_PDB.chains[x].renumerate_residues(init=init_chain_num + 1)
            target.fuse(chain=new_PDB.chains[x])
            # Refresh the offset after fusing; reading it before left it at
            # the first chain's end, so a third chain overlapped the second.
            init_chain_num = target.last_residue.number

        return_PDB = PDB()
        return_PDB.add_chain(chain=target)
        return return_PDB

    def rotate(self, matrix=None):
        """
        Rotates each {Chain} according to a given matrix
        (identity when none is given)

        @type matrix: numpy.matrix
        """
        if matrix is None:
            matrix = numpy.identity(3, float)
        for chain in self.all_models:
            chain.rotate(matrix=matrix)

    def translate(self, vector=None):
        """
        Translates each {Chain} according to a translational vector
        (zeros when none is given)

        @type vector: numpy.array
        """
        if vector is None:
            vector = numpy.zeros(3, float)
        for chain in self.all_models:
            chain.translate(vector=vector)

    def reposition(self, matrix=None, vector=None):
        """
        Rotates and Translates each {Chain} according to a matrix and a
        translational vector

        @type matrix: numpy.matrix
        @type vector: numpy.array
        """
        if matrix is None:
            matrix = numpy.identity(3, float)
        if vector is None:
            vector = numpy.zeros(3, float)
        for chain in self.all_models:
            chain.reposition(matrix=matrix, vector=vector)

    #
    # OVERRIDE PARENT'S FUNCTIONS
    #
    @staticmethod
    def read(input_file, format='PDB'):
        """
        Reads a file of data in a specific format and returns the object

        @type  input_file: String
        @param input_file: File to read
        @type  format: String
        @param format: Format of the file to read; only 'PDB' is handled
        """
        if format == 'PDB':
            return PDB(pdb_file=input_file)

    def write(self, output_file=None, format='PDB', force=False, clean=False):
        """
        Writes the object in a specific format

        @type  output_file: String
        @param output_file: File to write
        @type  format: String
        @param format: Format of the file to print; only 'PDB' is handled
        @param force: Forwarded to {File} as its overwrite flag
        @param clean: Call clean() before printing
        """
        outfile = File(file_name=output_file, action='w', overwrite=force)
        if format == 'PDB':
            self._write_PDB_file(pdb_file=outfile, clean=clean)
        else:
            # Nothing is written for other formats; do not leave it open.
            outfile.close()

    #
    # IO
    #
    def _read_PDB_file(self, header=False, onlyheader=False,
                       biomolecule=False):
        """
        Process and load crystal data from a PDB formated file

        NOTE: biomolecule is accepted but not forwarded to read_PDB_file.
        """
        from parse_pdb import read_PDB_file, read_PDB_header
        if header:
            read_PDB_header(self)
            self._pdb_file.close()
        if not onlyheader:
            read_PDB_file(self)
            self._pdb_file.close()

    def _write_PDB_file(self, pdb_file, clean=False):
        """
        Print a crystal into a PDB formated file and closes it
        """
        out_fd = pdb_file.descriptor
        out_fd.write(self.PDB_format(clean=clean) + "\n")
        pdb_file.close()

    def PDB_format(self, clean=False, terminal=True):
        """
        Strings a {PDB} in PDB format
        @rtype: String
        """
        if clean:
            self.clean()
        lines = [chain.PDB_format(terminal=terminal)
                 for chain in self._chains]
        lines.append("END")
        return "\n".join(lines)

    def FASTA_format(self, gapped=True, protein=True, nucleotide=False):
        """
        Strings the selected chains as FASTA records whose header holds the
        chain globalID and the identifier of its first residue.
        @rtype: String (empty if no chain is selected)
        """
        lines = []
        for c in self.chains:
            if isinstance(c, ChainOfProtein) and protein:
                lines.append(">{0}\t{1}".format(c.globalID,
                                                c.aminoacids[0].identifier))
                if gapped:
                    lines.append("{0}".format(c.gapped_protein_sequence))
                else:
                    lines.append("{0}".format(c.protein_sequence))
            if isinstance(c, ChainOfNucleotide) and nucleotide:
                lines.append(">{0}\t{1}".format(c.globalID,
                                                c.nucleotides[0].identifier))
                if gapped:
                    lines.append("{0}".format(c.gapped_nucleotide_sequence()))
                else:
                    lines.append("{0}".format(c.nucleotide_sequence()))
        if len(lines) == 0:
            return ""
        return "\n".join(lines) + "\n"

    def IDX_format(self, protein=True, nucleotide=False):
        """
        Strings the index of the selected chains, one line per chain.
        @rtype: String (empty if no chain is selected)
        """
        lines = []
        for c in self.chains:
            if isinstance(c, ChainOfProtein) and protein:
                lines.append(">{0}\t{1}".format(c.globalID, c.protein_idx))
            if isinstance(c, ChainOfNucleotide) and nucleotide:
                lines.append(">{0}\t{1}".format(c.globalID,
                                                c.nucleotide_idx()))
        if len(lines) == 0:
            return ""
        return "\n".join(lines) + "\n"

    def FASTA_IDX(self, protein=True, nucleotide=False):
        """
        Returns the gapped FASTA records and the index lines of the selected
        chains as two parallel lists.
        @rtype: Dictionary with keys 'FASTA' and 'IDX'
        """
        data = {'FASTA': [], 'IDX': []}
        for c in self.chains:
            if isinstance(c, ChainOfProtein) and protein:
                data['FASTA'].append(">{0}\n{1}".format(
                    c.globalID, c.gapped_protein_sequence))
                data['IDX'].append(">{0}\t{1}".format(c.globalID,
                                                      c.protein_idx))
            if isinstance(c, ChainOfNucleotide) and nucleotide:
                data['FASTA'].append(">{0}\n{1}".format(
                    c.globalID, c.gapped_nucleotide_sequence()))
                data['IDX'].append(">{0}\t{1}".format(c.globalID,
                                                      c.nucleotide_idx()))
        return data

    #
    # OVERRIDE DEFAULT METHODS
    #
    def __len__(self):
        return len(self._chains)
class PDBeChem(object):
    """
    Description of a chemical component, parsed from the '_chem_comp.'
    lines of a PDBeChem CIF file.
    """

    def __init__(self, cif_file):
        """
        @type  cif_file: String
        @param cif_file: CIF file to read; it is parsed and closed here
        """
        self._file = File(file_name=cif_file, action='r')
        # This must be included in every class for the SBIglobals.alert()
        self.__name__ = 'databases.PDBeChem'
        self._id = None
        self._name = None
        self._type = None
        self._formula = None
        self._parent = None
        self._weight = None
        self._fcharge = None
        self._code1l = None
        self._flformula = {}
        self._parse()
        self._decompose_formula()

    #
    # ATTRIBUTES
    #
    @property
    def id(self):
        return self._id

    @property
    def name(self):
        return self._name

    @property
    def type(self):
        return self._type

    @property
    def formula(self):
        return self._formula

    @property
    def full_formula(self):
        return self._flformula

    @property
    def parent(self):
        return self._parent

    @property
    def weight(self):
        return self._weight

    @property
    def formal_charge(self):
        return self._fcharge

    @property
    def code1(self):
        return self._code1l

    @property
    def code3(self):
        return self._id

    #
    # PRIVATE METHODS
    #
    def _parse(self):
        """
        Reads the '_chem_comp.' fields of the file and closes it.

        The value of a field is whatever follows column 35 once the
        '_chem_comp.' prefix is removed; a value of '?' is stored as None.
        """
        try:
            for line in self._file.descriptor:
                if line.startswith('_chem_comp.'):
                    line = line.replace('_chem_comp.', '')
                    value = line[35:].strip().strip('"')
                    value = value.replace(' (NON-PREFERRED NAME)', '')
                    value = value if value != '?' else None
                    if line.startswith('id'):
                        self._id = value
                    if line.startswith('pdbx_type'):
                        self._type = value
                    # Trailing blank keeps 'formula_weight' out.
                    if line.startswith('formula '):
                        self._formula = value
                    if line.startswith('formula_weight'):
                        self._weight = value
                    if line.startswith('pdbx_formal_charge'):
                        self._fcharge = value
                    if line.startswith('one_letter_code'):
                        self._code1l = value
                    if line.startswith('name'):
                        # An unknown name ('?') is None and has no upper().
                        self._name = value.upper() if value is not None else None
                    if line.startswith('mon_nstd_parent_comp_id'):
                        self._parent = set(
                            [x.strip() for x in value.split(',')]
                        ) if value is not None else None
                # An empty name means it is given on the following
                # ';'-prefixed line.
                if line.startswith(';') and self._name == '':
                    self._name += line.strip().lstrip(';').upper()
        finally:
            self._file.close()

    def _decompose_formula(self):
        """
        Splits the formula into element -> count, keeping only the symbols
        found in element_dic.
        """
        if self.formula is not None:
            atregex = re.compile(r'(\D+)(\d*)')
            for atom in self.formula.split():
                m = atregex.search(atom)
                if m is None:
                    # Token without any non-digit character: nothing to store.
                    continue
                if m.group(1) in element_dic:
                    # NOTE(review): explicit counts are kept as strings while
                    # the implicit count is the integer 1 -- left untouched
                    # because callers may rely on it.
                    self._flformula[m.group(1)] = m.group(
                        2) if m.group(2) != '' else 1

    #
    # OVERWRITE INHERITED FUNCTIONS
    #
    def __str__(self):
        if self.code1 is not None and self.parent is not None:
            return "[{0.id} - {0.code1} from {0.parent}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(
                self)
        elif self.code1 is not None:
            return "[{0.id} - {0.code1}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(
                self)
        elif self.parent is not None:
            return "[{0.id} from {0.parent}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(
                self)
        else:
            return "[{0.id}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(
                self)
class CDhitList(StorableObject):
    """
    Collection of the clusters read from a CD-HIT cluster file, with a
    lookup from sequence name to the position of its cluster.
    """

    def __init__(self, cdhitfile):
        """
        @type  cdhitfile: String
        @param cdhitfile: Cluster file to read; it is parsed and closed here
        """
        self._clusters = []
        self._allseqids = {}
        self._file = File(file_name=cdhitfile)
        self._parse_file()

    @property
    def clusters(self):
        return self._clusters

    def get_cluster4sequence(self, sequence):
        """
        Returns the cluster a sequence name belongs to, or None if unknown
        """
        position = self._allseqids.get(sequence)
        if position is None:
            return None
        return self._clusters[position]

    def is_in_cluster(self, sequence):
        """
        Returns 'N' if the sequence is in no cluster, 'M' if it is the
        master of its cluster and 'H' otherwise
        """
        cluster = self.get_cluster4sequence(sequence)
        if cluster is None:
            return 'N'
        if cluster.is_master(sequence):
            return 'M'
        return 'H'

    def add_cluster(self, cluster):
        self._clusters.append(cluster)

    def add_sequence2cluster(self, sequence, clusterid=None):
        """
        Adds the sequence to the cluster with the given identifier, or to
        the last cluster added when no identifier is given
        """
        if clusterid is None:
            self.clusters[-1].add_sequence(sequence)
            self._allseqids[sequence.name] = len(self.clusters) - 1
            return
        for position, cluster in enumerate(self._clusters):
            if cluster.identifier == clusterid:
                cluster.add_sequence(sequence)
                self._allseqids[sequence.name] = position
                break

    def dictionary_role_summary(self):
        """
        Returns {'master': [...], 'homolog': [...]} gathering, over all the
        clusters, the master names and whatever iterating over the
        sequences of each cluster yields
        """
        summary = {'master': [], 'homolog': []}
        for cluster in self.clusters:
            summary['master'].append(cluster.master.name)
            for member in cluster.sequences:
                summary['homolog'].append(member)
        return summary

    def _parse_file(self):
        """
        Reads the cluster file: a line starting with '>' opens a cluster
        named after its last word; any other line is a member of the last
        cluster opened.
        """
        for line in self._file.descriptor:
            if line.startswith('>'):
                self.add_cluster(CDhit(clusterid=line.split()[-1].strip()))
            else:
                # The first column is dropped; then come length, name, ...
                # and, last, the homology.
                columns = line.split()[1:]
                member = CDhitHomolog(name=columns[1],
                                      length=columns[0],
                                      homology=columns[-1])
                self.add_sequence2cluster(sequence=member)
        self._file.close()

    def __repr__(self):
        return '\n'.join(['{0}'.format(cluster) for cluster in self.clusters])
def localTaxIDs(self):
    """
    Iterates over the lines of the local taxonomy summary file
    (self._taxid).

    The file is closed when the iteration ends, and also when the caller
    stops early and the generator is closed or garbage collected.

    @rtype: iterator of String
    """
    taxFile = File(self._taxid, 'r')
    try:
        for tax_line in taxFile.descriptor:
            yield tax_line
    finally:
        taxFile.close()