def build_multifasta(file_name, sequenceList, force=False):
    """
    Writes several sequences into one FASTA file and returns it as a {Fasta}.

    @type  file_name: String
    @param file_name: Name of the FASTA file to create
    @type  sequenceList: Iterable
    @param sequenceList: Objects exposing format('FASTA')
    @type  force: Boolean
    @param force: Forwarded to {File} as its overwrite flag

    @rtype: {Fasta}
    """
    newFasta = File(file_name, 'w', overwrite=force)
    file_dsc = newFasta.descriptor
    try:
        for sequence in sequenceList:
            file_dsc.write(sequence.format('FASTA') + "\n")
    finally:
        # Release the handle even when a sequence fails to format or write.
        newFasta.close()
    return Fasta(fasta_file=newFasta.full)
def build(file_name, sequenceID, sequence, force=False):
    """
    Writes a single sequence into a FASTA file and returns it as a {Fasta}.

    @type  file_name: String
    @param file_name: Name of the FASTA file to create
    @type  sequenceID: String
    @param sequenceID: Identifier given to the new {Sequence}
    @type  sequence: String
    @param sequence: Sequence content given to the new {Sequence}
    @type  force: Boolean
    @param force: Forwarded to {File} as its overwrite flag

    @rtype: {Fasta}
    """
    # Build the record first so a bad sequence never leaves an open file.
    newSeq = Sequence(seqID=sequenceID, sequence=sequence)
    newFasta = File(file_name, 'w', overwrite=force)
    file_dsc = newFasta.descriptor
    try:
        file_dsc.write(newSeq.format('FASTA'))
    finally:
        newFasta.close()
    return Fasta(fasta_file=newFasta.full)
def format2file(self, filename, extension='pdb', center=False):
    """
    Writes the object to '<filename>.<extension>'.

    @type  filename: String
    @param filename: Output file name without extension
    @type  extension: String
    @param extension: Either 'pdb' or 'js'; selects pdb_format or js_format
    @type  center: Boolean
    @param center: Forwarded to the selected formatter

    @raise AttributeError if the extension is not 'pdb' or 'js'
    """
    if extension not in ('pdb', 'js'):
        raise AttributeError('Not accepted extension')

    # Render before touching the disk: a formatter failure must not leave
    # an open or half-written file behind.
    if extension == 'pdb':
        content = self.pdb_format(center=center)
    else:
        content = self.js_format(center=center)

    structure = File('.'.join([filename, extension]), 'w')
    try:
        structure.write(content)
    finally:
        structure.close()
def pdb_file(self, value):
    """
    Sets a PDB file if none has been given

    @type  value: String or {File}
    @param value: File name, or an already built {File}, to attach

    @raise AttributeError if a PDB file is already assigned
    """
    if self._pdb_file is not None:
        raise AttributeError(
            "The PDB object is loaded from file {0}. To load the new file {1} create a new PDB object"
            .format(self._pdb_file.full, value))

    if isinstance(value, File):
        self._pdb_file = value
    else:
        # Every other File(...) call passes the mode as 'action';
        # the previous 'type' keyword matched none of them.
        self._pdb_file = File(file_name=value, action='r')
def __init__(self, cif_file):
    """
    Loads a chemical component description from a cif file.

    @type  cif_file: String
    @param cif_file: Name of the cif file, opened for reading through {File}
    """
    self._file = File(file_name=cif_file, action='r')
    # This must be included in every class for the SBIglobals.alert()
    self.__name__ = 'databases.PDBeChem'

    # Scalar fields start empty and are filled in by _parse().
    self._id = self._name = self._type = self._formula = None
    self._parent = self._weight = self._fcharge = self._code1l = None
    # Filled in by _decompose_formula().
    self._flformula = {}

    self._parse()
    self._decompose_formula()
def __init__(self, database, search_type = 'prot'):
    """
    Configures a blast executable against a given database.

    @type  database: String
    @param database: Path of the database; made absolute and passed to
                     _check_database
    @type  search_type: String
    @param search_type: Either 'prot' or 'nucl'

    @raise BE(-10) if search_type is neither 'prot' nor 'nucl'
    """
    # Search type check
    allowed_types = set(['prot', 'nucl'])
    if not search_type in allowed_types:
        raise BE(-10)
    self._search_type = search_type

    # Blast executable configuration
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    self._configurator = config
    config.read(os.getenv('SBI_CONFIG_FILE', default_configuration_file))
    exe_name = config.get('blast', 'executable')
    exe_path = config.get('blast', 'path')
    exe_variable = config.get('blast', 'variable_path')
    self._exe = Executable(executable=exe_name,
                           path=exe_path,
                           variable_path=exe_variable)

    # Database configuration
    self._database = self._check_database(os.path.abspath(database))
    index_name = self._database.file.full + ".idx"
    if not os.path.isfile(index_name):
        self._idx = None
    else:
        self._idx = File(file_name=index_name, action='r')

    # Adding fixed blast parameters
    self._exe.add_attribute(self._database.file.full, '-db')
    self._exe.add_attribute('5', '-outfmt')
    self._exe.add_parameter('-lcase_masking')

    message = 'New Blast Executable created.\nBlast executable at {0}\n'.format(self._exe.full_executable)
    SBIglobals.alert('debug', self, message)

    # Run-time options
    self._selfHit = False
    self._hitIDformat = 'single'
    self._overwritte = False
    self._clean_files = True
def make_PDBseq(self, log_file, resolution_threshold=None):
    """
    Builds the PDBseq.fa / PDBseq.fa.idx pair from every local PDB file.

    @type  log_file: String
    @param log_file: Name of the log file to (over)write
    @type  resolution_threshold: Number
    @param resolution_threshold: Reserved for the resolution-filtered output,
                                 which is currently disabled

    @raise NameError if no local PDB database is defined
    @raise NotImplementedError if resolution_threshold is given
    """
    if not self.has_local:
        raise NameError(
            'A local PDB database must be defined to do create a PDBseq database.'
        )
    if resolution_threshold is not None:
        # The code that prepared the filtered file and the resolution table
        # is disabled, so a threshold used to end in a NameError after the
        # output files had already been truncated. Fail up front instead.
        raise NotImplementedError(
            'Resolution-filtered PDBseq output is currently disabled.')

    outdir = self.PDBseq if self.PDBseq is not None else os.curdir
    # Create the directory actually written to (was self.PDBseq, which can
    # be None when the current directory is the fallback).
    Path.mkdir(outdir)

    opened = []
    try:
        fasta_file = File(file_name=os.path.join(outdir, 'PDBseq.fa'),
                          action='w', overwrite=True)
        fasta_fd = fasta_file.descriptor
        opened.append(fasta_file)

        idx_file = File(file_name=os.path.join(outdir, 'PDBseq.fa.idx'),
                        action='w', overwrite=True)
        idx_fd = idx_file.descriptor
        opened.append(idx_file)

        log = File(file_name=log_file, action='w', overwrite=True)
        log_fd = log.descriptor
        opened.append(log)

        for pdb_file in self.localPDBs:
            log_fd.write("Reading File: {0}\n".format(pdb_file))
            newPDB = PDB(pdb_file=pdb_file, dehydrate=True)
            fasta_idx = newPDB.FASTA_IDX(nucleotide=False)
            fastas = fasta_idx['FASTA']
            indexes = fasta_idx['IDX']

            if len(fastas) != len(indexes):
                log_fd.write(
                    'ERROR!!!!! Number of fastas and indexes are different for pdb {0}!!\n'
                    .format(newPDB.id))
            if len(fastas) > 0:
                log_fd.write('\tPrinting FASTA and IDX...\n')
            else:
                log_fd.write('\tProblably just a nucleotide PDB...\n')

            for c in range(len(fastas)):
                # Second line of the record is the sequence; chains made
                # only of unknown residues (X) are skipped.
                sequence = fastas[c].split('\n')[1]
                sequence = sequence.replace('X', '').replace('x', '')
                if len(sequence) > 0:
                    fasta_fd.write(fastas[c] + "\n")
                    idx_fd.write(indexes[c] + "\n")
            # Drop the parsed structure before loading the next one.
            del newPDB
    finally:
        # Close whatever was opened, the log file included.
        for handle in opened:
            handle.close()
def _process(self):
    """
    Parses the downloaded GO file and writes one line per term to the
    local GO file, with parents resolved through _search_parents.
    """
    go_dic = {}
    parseFile = File(os.path.join(self.local, self._gfile), 'r')
    go_lines = parseFile.descriptor
    go = None
    try:
        for line in go_lines:
            # Escape single quotes (same result as re.sub('\'', '\\\'', line)).
            line = line.replace("'", "\\'")
            if line.startswith('[Term]'):
                # A new stanza starts: store the term read so far.
                if go is not None:
                    go_dic[go.id] = go
            if line.startswith('id:'):
                go = GOterm(id=line.split()[1].strip())
            elif line.startswith('name:'):
                go.name = " ".join(line.split()[1:]).strip()
            elif line.startswith('namespace:'):
                go.namespace = line.split()[1].strip()
            elif line.startswith('alt_id:'):
                go.alt_id.append(line.split()[1].strip())
            elif line.startswith('is_obsolete:'):
                go.obsolete = True
            elif line.startswith('is_a:'):
                go.parents.add(line.split()[1].strip())
            elif line.startswith('relationship:'):
                fields = line.split()
                go.relations.append((fields[1].strip(), fields[2].strip()))
            elif line.startswith('[Typedef]'):
                # Terms are over once the type definitions start.
                break
    finally:
        parseFile.close()

    # Store the last term. This also covers a file without any [Typedef]
    # stanza (the term used to be dropped) and a [Typedef] found before any
    # term (used to fail on go.id with go still None).
    if go is not None:
        go_dic[go.id] = go

    for go_id in go_dic:
        go_dic[go_id].parents = self._search_parents(go_dic, go_id)

    goFile = File(self._gofile, 'w', True)
    try:
        for go_id in go_dic:
            # Each term is written with itself among its parents.
            go_dic[go_id].parents.add(go_id)
            goFile.write(str(go_dic[go_id]) + "\n")
    finally:
        goFile.close()
def __init__(self, pdb_file=None, dehydrate=False, header=False,
             onlyheader=False, biomolecule=False):
    """
    @type  pdb_file: String
    @param pdb_file: PDB formated file to read

    @raise IOError if pdb_file does not exist and it is not an empty object
    """
    # Either flag forces the header to be read.
    if onlyheader or biomolecule:
        header = True

    self._pdb_file = pdb_file
    self._chains = []
    self._NMR_chains = []
    self._chain_id = set()
    self._NMR = self._has_prot = self._has_nucl = False
    self._header = self._COMPND = None
    # -1 -> original
    #  0 -> symmetry
    # >0 -> biomolecule
    self._biomol_id = -1

    if self.pdb_file is not None:
        source = File(file_name=self._pdb_file, action='r')
        self._pdb_file = source
        self._read_PDB_file(header=header,
                            onlyheader=onlyheader,
                            biomolecule=biomolecule)

    if dehydrate:
        self.dehydrate()
def _process(self):
    """
    Reads every '*.xml' file under '<local>/pdbtm/database/' and writes one
    line per entry that has at least one chain to the local PDBTM file.
    """
    # Compiled once, as raw strings, instead of once per matching line.
    side_re = re.compile(r'Side1="(\S+)"')
    newchain_re = re.compile(r'NEW_CHAINID="(\S{1})"')
    chain_re = re.compile(r'CHAINID="(\S{1})" NUM_TM="(\d{1})" TYPE="(\S+)"')
    region_re = re.compile(
        r'pdb_beg="(\-*\d+\w*)"[\s\S]+pdb_end="(\-*\d+\w*)"\s+type="(\w{1})"')

    tmoFile = File(self._pdbtmfile, 'w', True)
    try:
        xml_dir = os.path.join(self._local, 'pdbtm/database/')
        for xmlfile in Path.list_files(xml_dir, '*.xml'):
            pdb_id = os.path.splitext(os.path.split(xmlfile)[1])[0].upper()
            xmldata = TM(pdb=pdb_id)
            skip_chains = set()
            read = False
            # The with-block closes the XML file even if a line fails to parse.
            with open(xmlfile) as fdxml:
                for line in fdxml:
                    if line.startswith(' <TMRES>'):
                        xmldata.tmres = line
                    elif line.startswith(' <TMTYPE'):
                        xmldata.tmtype = line
                    elif line.startswith(' <PDBKWRES'):
                        xmldata.kwres = line
                    elif line.startswith(' <SIDEDEFINITION'):
                        m = side_re.search(line)
                        xmldata.side = m.group(1)
                    elif line.startswith(' <APPLY_TO_CHAIN'):
                        m = newchain_re.search(line)
                        if m:
                            skip_chains.add(m.group(1))
                    elif line.startswith(' <CHAIN '):
                        m = chain_re.search(line)
                        if m:
                            chain, num, tmtype = m.group(1), m.group(2), m.group(3)
                            if chain not in skip_chains:
                                cdata = (chain, num, tmtype)
                                xmldata.set_chain(cdata)
                                # Regions that follow belong to this chain.
                                read = True
                    elif line.startswith(' <REGION ') and read:
                        m = region_re.search(line)
                        ini, end, tmtype = m.group(1), m.group(2), m.group(3)
                        xmldata.set_chain(cdata, (ini, end, tmtype))
                    elif line.startswith(' </CHAIN>'):
                        read = False
            if len(xmldata.chains) > 0:
                tmoFile.write(str(xmldata) + "\n")
    finally:
        tmoFile.close()
def write(self, output_file=None, format='PDB', force=False, clean=False):
    """
    Writes the object in a specific format

    @type  output_file: String
    @param output_file: File to write

    @type  format: String
    @param format: Format of the file to print
    """
    target = File(file_name=output_file, action='w', overwrite=force)
    # 'PDB' is the only format acted upon; anything else writes nothing.
    if format != 'PDB':
        return
    self._write_PDB_file(pdb_file=target, clean=clean)
def _process(self):
    """
    Merges the entries from _parse_enzclass and _parse_enzymedat, sorts them
    and writes their repr(), one per line, to the local enzyme file.
    """
    enzymes = sorted(self._parse_enzclass() + self._parse_enzymedat())
    enzFile = File(self._enzfile, 'w', True)
    try:
        for e in enzymes:
            enzFile.write(repr(e) + "\n")
    finally:
        # Close the output even if an entry fails to serialise.
        enzFile.close()
def __init__(self, fasta_file):
    """
    @type  fasta_file: String or {File}
    @param fasta_file: FASTA file name, or a {File} that is switched to
                       read mode

    @raise AttributeError if fasta_file is neither a string nor a {File}
    """
    if not isinstance(fasta_file, (basestring, File)):
        raise AttributeError('Check the input of the Fasta object')

    if isinstance(fasta_file, basestring):
        self._file = File(file_name=fasta_file, action='r')
    else:
        self._file = fasta_file
        self._file.action = 'r'

    self._is_multifasta = self._check_multifasta()
    self._sequences = []
    self._seqfinder = {}
def _process(self):
    """
    Builds the drug entries from the processed targets and writes their
    repr(), one per line, to the local drug file.
    """
    targets = self._process_targets()
    drugs = self._process_drugs(targets)
    drugFile = File(self._drugfile, 'w', True)
    try:
        for d in drugs:
            drugFile.write(repr(d) + "\n")
    finally:
        # Close the output even if an entry fails to serialise.
        drugFile.close()
def get_PDBeChem(self, chemID):
    """
    Locates the cif file of a chemical component, locally or by download.

    @type  chemID: String
    @param chemID: Identifier of the component (case insensitive)

    @rtype: String, the local file found or the absolute path of the
            downloaded file; False if the download fails
    """
    wanted = chemID.upper()
    if self.has_local:
        for chem_file in self.localPDBeChems:
            newfile = File(file_name=chem_file, action='r')
            if newfile.prefix.upper() == wanted:
                return chem_file

    # If we do not find it in local (or we do not have a local) we search it on the FTP
    chem_file = wanted + '.cif'
    source = PDBeChemftp['single'] + chem_file
    try:
        urllib.urlretrieve(source, chem_file)
    except Exception:
        # Was a bare except, which also swallowed KeyboardInterrupt/SystemExit.
        return False
    return os.path.abspath(chem_file)
def get_PDBs(self, pdbIDset):
    """
    Yields the file of each requested PDB.

    @type  pdbIDset: Iterable of String (a single String is accepted)
    @param pdbIDset: PDB identifiers, compared case-insensitively

    @rtype: iterator of file names (or whatever get_PDB returns)
    """
    # Was spelled 'isintance', so every call died with a NameError.
    if isinstance(pdbIDset, str):
        warnings.warn(
            'For single PDB search the get_PDB function is recomended.')
        yield self.get_PDB(pdbIDset)
    else:
        pdbIDset = set([x.upper() for x in pdbIDset])
        if self.has_local:
            # One pass over the local files rather than one search per ID.
            for pdb_file in self.localPDBs:
                newfile = File(file_name=pdb_file, action='r')
                if newfile.prefix.lstrip('pdb').upper() in pdbIDset:
                    yield pdb_file
        else:
            for pdbID in pdbIDset:
                yield self.get_PDB(pdbID)
def get_PDB(self, pdbID):
    """
    Locates the file of a PDB entry, locally or by download.

    @type  pdbID: String
    @param pdbID: PDB identifier (case insensitive)

    @rtype: String, the local file found or the absolute path of the
            downloaded file; False if the download fails
    """
    if self.has_local:
        # Local files are grouped in folders named after characters 2-3 of the ID.
        rootdir = os.path.join(self.local, pdbID.lower()[1:3])
        for pdb_file in Path.list_files(root=rootdir, pattern='*.ent.gz'):
            newfile = File(file_name=pdb_file, action='r')
            if newfile.prefix.lstrip('pdb').upper() == pdbID.upper():
                return pdb_file

    # If we do not find it in local (or we do not have a local) we search it on the FTP
    pdb_file = 'pdb' + pdbID.lower() + '.ent.gz'
    source = 'ftp://' + PDBftp['address'] + os.path.join(
        PDBftp['structures'], pdbID[1:3].lower(), pdb_file)
    try:
        urllib.urlretrieve(source, pdb_file)
    except Exception:
        # Was a bare except, which also swallowed KeyboardInterrupt/SystemExit.
        return False
    return os.path.abspath(pdb_file)
def get_PDBeChems(self, chemIDset):
    """
    Yields the cif file of each requested chemical component.

    @type  chemIDset: Iterable of String (a single String is accepted)
    @param chemIDset: Component identifiers, compared case-insensitively

    @rtype: iterator of file names (or whatever get_PDBeChem returns)
    """
    # Was spelled 'isintance', so every call died with a NameError.
    if isinstance(chemIDset, str):
        warnings.warn(
            'For single PDBeChem search the get_PDBeChem function is recomended.'
        )
        yield self.get_PDBeChem(chemIDset)
    else:
        chemIDset = set([x.upper() for x in chemIDset])
        if self.has_local:
            for chem_file in self.localPDBeChems:
                newfile = File(file_name=chem_file, action='r')
                # Compare the bare prefix, as get_PDBeChem does. The former
                # lstrip('pdb') was copied from the PDB lookup and ate
                # leading p/d/b characters of lowercase component names.
                if newfile.prefix.upper() in chemIDset:
                    yield chem_file
        else:
            for chemID in chemIDset:
                yield self.get_PDBeChem(chemID)
def get_resolutions(self):
    """
    Maps each PDB identifier to its resolution.

    Values come from the resolution index on the PDB FTP; local files that
    are missing from the index are read from their own header.

    @rtype: Dictionary {pdbID: resolution}
    """
    # resolutions (-1) are for methods that do not define resolution
    resolutions = {}
    resoluIDX = []

    SBIglobals.alert('debug', self,
                     'Retrieving resolution data from PDB FTP...')
    ftp = ftplib.FTP(PDBftp['address'])
    try:
        ftp.login()
        ftp.cwd(PDBftp['derived'])
        ftp.retrlines('RETR ' + PDBftp['resolution'], resoluIDX.append)
        ftp.quit()
    except Exception:
        # Do not leave the connection open when the transfer fails.
        ftp.close()
        raise

    # Data rows start after the first line beginning with '-'.
    active = False
    for line in resoluIDX:
        if line.startswith('-'):
            active = True
            continue
        if active and len(line.strip()) > 0:
            data = [x.strip() for x in line.split(';')]
            # Rows without a ';' separator carry no resolution: skip them.
            if len(data) > 1 and len(data[1]) > 0:
                SBIglobals.alert(
                    'debug', self,
                    '\tResolution for {0[0]} is {0[1]}...'.format(data))
                resolutions[data[0]] = data[1]

    # rsync is accumulative, we might have structures that are not in the
    # index anymore, so they must be checked one by one.
    for pdb_file in self.localPDBs:
        newfile = File(file_name=pdb_file, action='r')
        pdbid = newfile.prefix.lstrip('pdb').upper()
        if pdbid not in resolutions:
            pdbobj = PDB(pdb_file=pdb_file, header=True, onlyheader=True)
            SBIglobals.alert(
                'debug', self,
                '\tGrabbing Resolution for {0} is {1}...'.format(
                    pdbid, pdbobj.header.resolution))
            resolutions[pdbid] = pdbobj.header.resolution

    return resolutions
class PDBeChem(object):
    """
    Chemical component description read from the '_chem_comp.' fields of a
    cif file.
    """

    def __init__(self, cif_file):
        """
        @type  cif_file: String
        @param cif_file: Name of the cif file to read
        """
        self._file = File(file_name=cif_file, action='r')
        # This must be included in every class for the SBIglobals.alert()
        self.__name__ = 'databases.PDBeChem'
        self._id = None
        self._name = None
        self._type = None
        self._formula = None
        self._parent = None
        self._weight = None
        self._fcharge = None
        self._code1l = None
        self._flformula = {}

        self._parse()
        self._decompose_formula()

    #
    # ATTRIBUTES
    #
    @property
    def id(self):
        return self._id

    @property
    def name(self):
        return self._name

    @property
    def type(self):
        return self._type

    @property
    def formula(self):
        return self._formula

    @property
    def full_formula(self):
        return self._flformula

    @property
    def parent(self):
        return self._parent

    @property
    def weight(self):
        return self._weight

    @property
    def formal_charge(self):
        return self._fcharge

    @property
    def code1(self):
        return self._code1l

    @property
    def code3(self):
        return self._id

    #
    # PRIVATE METHODS
    #
    def _parse(self):
        """
        Fills the attributes from the '_chem_comp.' lines of the file.
        A field whose value is '?' is stored as None.
        """
        cif_lines = self._file.descriptor
        try:
            for line in cif_lines:
                if line.startswith('_chem_comp.'):
                    line = line.replace('_chem_comp.', '')
                    # The value starts at column 35 once the prefix is removed.
                    value = line[35:].strip().strip('"')
                    value = value.replace(' (NON-PREFERRED NAME)', '')
                    value = value if value != '?' else None
                    if line.startswith('id'):
                        self._id = value
                    elif line.startswith('pdbx_type'):
                        self._type = value
                    elif line.startswith('formula '):
                        self._formula = value
                    elif line.startswith('formula_weight'):
                        self._weight = value
                    elif line.startswith('pdbx_formal_charge'):
                        self._fcharge = value
                    elif line.startswith('one_letter_code'):
                        self._code1l = value
                    elif line.startswith('name'):
                        # An unknown name ('?') used to fail on None.upper().
                        self._name = value.upper() if value is not None else None
                    elif line.startswith('mon_nstd_parent_comp_id'):
                        self._parent = set([x.strip() for x in value.split(',')
                                            ]) if value is not None else None
                # An empty name is completed from the ';' lines that follow.
                if line.startswith(';') and self._name == '':
                    self._name += line.strip().lstrip(';').upper()
        finally:
            self._file.close()

    def _decompose_formula(self):
        """
        Splits the formula into {element: count}, keeping only the symbols
        present in element_dic.
        """
        if self.formula is None:
            return
        atregex = re.compile(r'(\D+)(\d*)')
        for atom in self.formula.split():
            m = atregex.search(atom)
            if m is None:
                # Token without any non-digit part: nothing to record.
                continue
            if m.group(1) in element_dic:
                # Explicit counts stay as read (strings); a missing count is 1.
                self._flformula[m.group(1)] = m.group(
                    2) if m.group(2) != '' else 1

    #
    # OVERWRITE INHERITED FUNCTIONS
    #
    def __str__(self):
        if self.code1 is not None and self.parent is not None:
            return "[{0.id} - {0.code1} from {0.parent}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(
                self)
        elif self.code1 is not None:
            return "[{0.id} - {0.code1}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(
                self)
        elif self.parent is not None:
            return "[{0.id} from {0.parent}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(
                self)
        else:
            return "[{0.id}: {0.weight} - {0.formula} - {0.formal_charge}] {0.name} - {0.type}".format(
                self)
def localTM(self):
    """
    Yields the lines of the local PDBTM file.

    @rtype: iterator of String
    """
    tmoFile = File(self._pdbtmfile, 'r')
    tm_lines = tmoFile.descriptor
    try:
        for tm_line in tm_lines:
            yield tm_line
    finally:
        # The handle used to stay open; close it when the iteration ends
        # or the generator is discarded.
        tmoFile.close()
def localEnzymes(self):
    """
    Yields the lines of the local enzyme file.

    @rtype: iterator of String
    """
    enzFile = File(self._enzfile, 'r')
    enz_lines = enzFile.descriptor
    try:
        for enz_line in enz_lines:
            yield enz_line
    finally:
        # The handle used to stay open; close it when the iteration ends
        # or the generator is discarded.
        enzFile.close()
def localTrembls(self):
    """
    Yields the lines of the local TrEMBL file.

    @rtype: iterator of String
    """
    tblFile = File(self._trbfile, 'r')
    uni_lines = tblFile.descriptor
    try:
        for uni_line in uni_lines:
            yield uni_line
    finally:
        # The handle used to stay open; close it when the iteration ends
        # or the generator is discarded.
        tblFile.close()
def descriptions(self):
    """
    Yields the lines of the description file, skipping those that start
    with '#'.

    @rtype: iterator of String
    """
    dscFile = File(self._desc, 'r')
    dsc_lines = dscFile.descriptor
    try:
        for dsc_line in dsc_lines:
            if not dsc_line.startswith('#'):
                yield dsc_line
    finally:
        # The handle used to stay open; close it when the iteration ends
        # or the generator is discarded.
        dscFile.close()
def localGOs(self):
    """
    Yields the lines of the local GO file.

    @rtype: iterator of String
    """
    goFile = File(self._gofile, 'r')
    go_lines = goFile.descriptor
    try:
        for go_line in go_lines:
            yield go_line
    finally:
        # The handle used to stay open; close it when the iteration ends
        # or the generator is discarded.
        goFile.close()
def relations(self):
    """
    Yields the lines of the relations file, skipping those that start
    with '#'.

    @rtype: iterator of String
    """
    relFile = File(self._rel, 'r')
    rel_lines = relFile.descriptor
    try:
        for rel_line in rel_lines:
            if not rel_line.startswith('#'):
                yield rel_line
    finally:
        # The handle used to stay open; close it when the iteration ends
        # or the generator is discarded.
        relFile.close()
class PDB(StorableObject):
    """
    A {PDB} is a collection of {Chain}
    """

    def __init__(self, pdb_file=None, dehydrate=False, header=False,
                 onlyheader=False, biomolecule=False):
        """
        @type  pdb_file: String
        @param pdb_file: PDB formated file to read

        @raise IOError if pdb_file does not exist and it is not an empty object
        """
        if biomolecule or onlyheader:
            header = True

        self._pdb_file = pdb_file
        self._chains = []
        self._NMR = False
        self._NMR_chains = []
        self._chain_id = set()
        # -1 -> original
        #  0 -> symmetry
        # >0 -> biomolecule
        self._biomol_id = -1
        self._header = None

        self._has_prot = False
        self._has_nucl = False

        self._COMPND = None

        if self.pdb_file is not None:
            self._pdb_file = File(file_name=self._pdb_file, action='r')
            self._read_PDB_file(header=header,
                                onlyheader=onlyheader,
                                biomolecule=biomolecule)

        if dehydrate:
            self.dehydrate()

    #
    # ATTRIBUTES
    #
    @property
    def pdb_file(self):
        """
        PDB file name
        @rtype: String
        """
        return self._pdb_file

    @pdb_file.setter
    def pdb_file(self, value):
        """
        Sets a PDB file if none has been given
        @raise AttributeError if a PDB file is already assigned
        """
        if self._pdb_file is not None:
            raise AttributeError(
                "The PDB object is loaded from file {0}. To load the new file {1} create a new PDB object"
                .format(self._pdb_file.full, value))

        if isinstance(value, File):
            self._pdb_file = value
        else:
            # Every other File(...) call passes the mode as 'action';
            # the previous 'type' keyword matched none of them.
            self._pdb_file = File(file_name=value, action='r')

    @property
    def chain_identifiers(self):
        return self._chain_id

    @property
    def id(self):
        return self._chains[0].pdb

    @property
    def chains(self):
        """
        List of {Chain} contained in the PDB w/out NMR replicas
        @rtype: List of {Chain}
        """
        return self._chains

    @property
    def proteins(self):
        """
        List of {ProteinChain} contained in the PDB w/out NMR replicas
        @rtype: List of {ProteinChain} (iterator)
        """
        for chain in self.chains:
            if isinstance(chain, ChainOfProtein):
                yield chain

    @property
    def nucleotides(self):
        """
        List of {NucleotideChain} contained in the PDB w/out NMR replicas
        @rtype: List of {NucleotideChain} (iterator)
        """
        for chain in self.chains:
            if isinstance(chain, ChainOfNucleotide):
                yield chain

    @property
    def non_standard_chains(self):
        """
        List of non {NucleotideChain}/ non {ProteinChain} contained in the PDB w/out NMR replicas
        @rtype: List of non {NucleotideChain}/ non {ProteinChain} (iterator)
        """
        for chain in self.chains:
            if not isinstance(chain, (ChainOfNucleotide, ChainOfProtein)):
                yield chain

    @property
    def all_models(self):
        """
        List of {Chain} contained in the PDB w/ NMR replicas
        @rtype: List of {Chain}
        """
        return self._chains + self._NMR_chains

    @property
    def header(self):
        """
        Header object, or an empty string while no header has been read
        """
        if self._header is None:
            return ''
        else:
            return self._header

    @property
    def biomolecule_identifier(self):
        return self._biomol_id

    #
    # COMPLEX GETTERS & SETTERS
    #
    def get_chain_by_id(self, id):
        """
        Returns a chain according to its id or None if no chain with that id is found
        @rtype: {Chain}
        """
        for chain in self._chains:
            if chain.chain == id:
                return chain
        return None

    def add_chain(self, chain, NMR=False):
        """
        Adds a new chain to the PDB.
        NMR replicas are only stored when the PDB is flagged as NMR.
        """
        if not NMR:
            self._chains.append(chain)
        elif NMR and self._NMR:
            self._NMR_chains.append(chain)
        self._chain_id.add(chain.chain)

    def add_chains(self, chains, NMR=False):
        """
        Adds a new chains to the PDB
        """
        for chain in chains:
            self.add_chain(chain=chain, NMR=NMR)

    def _get_chain_position_by_id(self, id):
        """
        Returns the position in the chain array where the chain is
        @rtype: Integer
        """
        for position, chain in enumerate(self._chains):
            if chain.chain == id:
                return position
        return None

    #
    # BOOLEANS
    #
    @property
    def is_NMR(self):
        """
        Identifies if the PDB contains NMRs
        @rtype: Boolean
        """
        return self._NMR

    def chain_exists(self, chain):
        """
        Confirms if a given chain exists in the PDB
        @rtype: Boolean
        """
        return chain in self._chain_id

    @property
    def has_protein(self):
        """
        Checks if the PDB contains a protein (not only)
        @rtype: Boolean
        """
        return self._has_prot

    @property
    def has_nucleotide(self):
        """
        Checks if the PDB contains a nucleotide chain (not only)
        @rtype: Boolean
        """
        return self._has_nucl

    @property
    def repeated_chain_ids(self):
        """
        Checks if more than one {Chain} has the same assigned ID
        @rtype: Boolean
        """
        return len(self._chain_id) < len(self._chains)

    @property
    def is_all_ca(self):
        """
        True as soon as one protein chain reports is_only_ca()
        @rtype: Boolean
        """
        for p in self.proteins:
            if p.is_only_ca():
                return True
        return False

    #
    # METHODS
    #
    def dehydrate(self):
        """
        Dehydrates every chain and drops the chains left empty
        """
        recheck_chains = False
        for c in self.chains:
            c.dehydrate()
            if c.is_empty:
                recheck_chains = True

        if recheck_chains:
            kept = []
            for ch in self.chains:
                if not ch.is_empty:
                    kept.append(ch)
                else:
                    self._chain_id.remove(ch.chain)
            self._chains = kept

    def duplicate(self, hetero=True, water=False, NMR=False):
        """
        Returns a {PDB} identical to the original but as a new object
        @rtype: {PDB}
        """
        new_PDB = PDB()
        # A PDB built in memory has no file to hand over; pushing None
        # through the setter would try to build a File out of it.
        if self.pdb_file is not None:
            new_PDB.pdb_file = self.pdb_file

        # add_chain only stores NMR replicas on a PDB already flagged as
        # NMR, so the flag must be copied before the replicas (they were
        # silently dropped when it was set afterwards).
        new_PDB._NMR = self._NMR

        for chain in self.chains:
            new_PDB.add_chain(
                chain=chain.duplicate(hetero=hetero, water=water))

        if NMR:
            for chain in self._NMR_chains:
                new_PDB.add_chain(chain=chain.duplicate(hetero=hetero,
                                                        water=water),
                                  NMR=True)

        new_PDB._has_prot = self._has_prot
        new_PDB._has_nucl = self._has_nucl

        return new_PDB

    def apply_symmetry_matrices(self):
        """
        Only works if the PDB file is an original PDB file
        or the matrices have been added in the correct PDB format
        @rtype: {PDB}
        """
        if self._header is None:
            self._read_PDB_file(header=True, onlyheader=True)
        return self._apply_matrix(matrix=self.header.symmetry_matrix)

    def apply_biomolecule_matrices(self, keepchains=False, water=True):
        """
        Only works if the PDB file is an original PDB file or
        the matrices have been added in the correct PDB format
        @rtype: List of {PDB}
        """
        if self._header is None:
            self._read_PDB_file(header=True, onlyheader=True)
        PDB_list = []
        for matrix in self.header.biomolecules:
            PDB_list.append(
                self._apply_matrix(matrix=matrix,
                                   keepchains=keepchains,
                                   water=water))
        return PDB_list

    def _apply_matrix(self, matrix, keepchains=False, water=True):
        """
        Builds a new {PDB} holding one repositioned copy of each affected
        chain per transformation in the matrix set
        @rtype: {PDB}
        """
        new_PDB = PDB()
        new_PDB._biomol_id = matrix.identifier

        for chain in self.chains:
            if chain.chain in matrix.chains:
                for mat in matrix.matrices:
                    new_chain = chain.duplicate(water=water)
                    new_chain.reposition(matrix=mat.matrix, vector=mat.vector)
                    if len(new_chain) >= 1:
                        new_PDB.add_chain(chain=new_chain)

        if not keepchains:
            new_PDB.tmpclean(cluster_by_alternative_id=True)
        return new_PDB

    def clean(self):
        """
        Cleans every chain, numbering atoms consecutively across chains
        """
        first_atom = 1
        for c in self.chains:
            c.clean(initatom=first_atom)
            first_atom = c.last_residue.last_atom_number + 1

    def tmpclean(self, cluster_by_alternative_id=False):
        """
        Makes a clean version of the PDB, rechaining in order and renumerating atoms.
        Renumbering residues is optional
        """
        pchainsIDs = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890"
        chainsIDs = ""   # identifiers still free
        chainsNIDs = ""  # identifiers already in use
        chainID = 0
        atom_count = 1

        for candidate in pchainsIDs:
            if not self.chain_exists(chain=candidate):
                chainsIDs += candidate
            else:
                chainsNIDs += candidate

        # Rechaining needs at least as many free identifiers as chains.
        chain_change = len(self) <= len(chainsIDs)

        for chain in self.chains:
            if (chain.chain not in chainsNIDs) and chain_change:
                self._chain_id.add(chain.chain)
                chain.chain = chainsIDs[chainID]
                chainID += 1
                self._chain_id.add(chain.chain)
                if cluster_by_alternative_id:
                    if self._COMPND is None:
                        self._COMPND = {}
                    # 'in' instead of the Python-2-only dict.has_key().
                    if chain.alternative_id not in self._COMPND:
                        self._COMPND[chain.alternative_id] = [chain.alternative_id]
                    self._COMPND[chain.alternative_id].append(chain.chain)
            else:
                # First chain seen with this identifier keeps it; later
                # chains carrying the same identifier are renamed.
                chainsNIDs = chainsNIDs.replace(chain.chain, '')

            chain.renumerate_atoms(init=atom_count)
            atom_count += (chain.atom_length)

    def fuse_chains(self, chains_ids):
        """
        Fuses several chains into the first one.
        It will not allow to fuse different structural chains.
        It does not alter the {PDB}, but provides a new one
        @rtype: {PDB}

        @raise AttributeError if:
            a) A given chain ID is not present
            b) Try to fuse different structural chains
        """
        if len(self._chain_id.intersection(set(chains_ids))) < len(chains_ids):
            raise AttributeError(
                "Some of the given chains to fues do not exist")

        error_counter = 0
        error_control = [False, False]
        new_PDB = PDB()
        for c in chains_ids:
            chain = self.get_chain_by_id(id=c)
            new_PDB.add_chain(chain=chain.duplicate())
            if isinstance(chain, ChainOfProtein) and not error_control[0]:
                error_counter += 1
                error_control[0] = True
            elif isinstance(chain, ChainOfNucleotide) and not error_control[1]:
                error_counter += 1
                error_control[1] = True
            if error_counter == 2:
                raise AttributeError(
                    "Fuse different kinds of structural chain is not possible\n"
                )

        init_chain_num = new_PDB.chains[0].last_residue.number
        for x in range(1, len(new_PDB.chains)):
            new_PDB.chains[x].renumerate_residues(init=init_chain_num + 1)
            # Continue from the chain just renumbered. The offset used to be
            # re-read from chain 0 before the fuse, so a third chain started
            # at the same number as the second one.
            init_chain_num = new_PDB.chains[x].last_residue.number
            new_PDB.chains[0].fuse(chain=new_PDB.chains[x])

        return_PDB = PDB()
        return_PDB.add_chain(chain=new_PDB.chains[0])
        return return_PDB

    # def calculate_dssp(self, out_dir = None, store = True):
    #     """
    #     Executes DSSP and assigns the prediction to each chain
    #     @param out_dir: directory to save the output
    #     @defaut out_dir: None
    #     @param store: Save the dssp output(?)
    #     """
    #     for chain in self.proteins:
    #         if out_dir is None:
    #             pdb_file = chain.globalID + ".pdb2dssp"
    #             dssp_file = chain.globalID + ".dssp"
    #         else:
    #             Path.mkdir(newdir = out_dir)
    #             pdb_file = os.path.join(os.path.abspath(out_dir), chain.globalID + ".pdb2dssp")
    #             dssp_file = os.path.join(os.path.abspath(out_dir), chain.globalID + ".dssp")
    #         pdb_fd = open(pdb_file, 'w')
    #         pdb_fd.write(chain.PDB_format())
    #         pdb_fd.close()
    #         dssp_calc = DSSPexec(pdb_file = pdb_file, dssp_file = dssp_file,
    #                              chain = chain, store = store)

    def rotate(self, matrix=None):
        """
        Rotates each {Chain} according to a given matrix

        @type matrix: numpy.matrix
        """
        if matrix is None:
            matrix = numpy.identity(3, float)
        for chain in self.all_models:
            chain.rotate(matrix=matrix)

    def translate(self, vector=None):
        """
        Translates each {Chain} according to a translational vector

        @type vector: numpy.array
        """
        if vector is None:
            vector = numpy.zeros(3, float)
        for chain in self.all_models:
            chain.translate(vector=vector)

    def reposition(self, matrix=None, vector=None):
        """
        Rotates and Translates each {Chain} according to a matrix and a translational vector

        @type matrix: numpy.matrix

        @type vector: numpy.array
        """
        if matrix is None:
            matrix = numpy.identity(3, float)
        if vector is None:
            vector = numpy.zeros(3, float)
        for chain in self.all_models:
            chain.reposition(matrix=matrix, vector=vector)

    # def calculate_protein_heteroatom_contacts(self, distance = 6):
    #     """
    #     Returns a {HeteroatomContacts} list with the contacts between a protein and its heteroatoms
    #     at a maximum given distance
    #     @type distance: Integer
    #     @rtype: list of {HeteroatomContacts}
    #     """
    #     data = []
    #     for protein in self.proteins:
    #         data.append(HeteroatomContacts(chain = protein, max_distance = distance))
    #     return data

    #
    # OVERRIDE PARENT'S FUNCTIONS
    #
    @staticmethod
    def read(input_file, format='PDB'):
        """
        Reads a file of data in a specific format and returns the object

        @type  input_file: String
        @param input_file: File to read

        @type  format: String
        @param format: Format of the file to read
        """
        if format == 'PDB':
            pdb = PDB(pdb_file=input_file)
        return pdb

    def write(self, output_file=None, format='PDB', force=False, clean=False):
        """
        Writes the object in a specific format

        @type  output_file: String
        @param output_file: File to write

        @type  format: String
        @param format: Format of the file to print
        """
        outfile = File(file_name=output_file, action='w', overwrite=force)
        if format == 'PDB':
            self._write_PDB_file(pdb_file=outfile, clean=clean)

    #
    # IO
    #
    def _read_PDB_file(self, header=False, onlyheader=False, biomolecule=False):
        """
        Process and load crystal data from a PDB formated file
        """
        from parse_pdb import read_PDB_file, read_PDB_header
        if header:
            read_PDB_header(self)
            self._pdb_file.close()
        if not onlyheader:
            # read_PDB_file(self, biomolecule=biomolecule)
            read_PDB_file(self)
            self._pdb_file.close()

    # def _represent_COMPND(self):
    #     if self._COMPND is None: return ''
    #     data = []
    #     mol_counter = 1
    #     for chain in self._COMPND:
    #         data.append("COMPND MOL_ID: %d;" %mol_counter)
    #         data.append("COMPND 2 CHAIN: " + ",".join(self._COMPND[chain]) + ";")
    #         if len(self._biomolecA) > 0:
    #             matrices = []
    #             for mat in self._biomolecA:
    #                 if mat[1] == chain: matrices.append(str(mat[0]))
    #             data.append("COMPND 3 MATRICES: " + ",".join(sorted(matrices)))
    #         mol_counter += 1
    #     return "\n".join(data) + "\n"

    def _write_PDB_file(self, pdb_file, clean=False):
        """
        Print a crystal into a PDB formated file
        """
        out_fd = pdb_file.descriptor
        try:
            # out_fd.write(self._represent_COMPND())
            out_fd.write(self.PDB_format(clean=clean) + "\n")
        finally:
            # Close the output even when formatting or writing fails.
            pdb_file.close()

    def PDB_format(self, clean=False, terminal=True):
        """
        Strings a {PDB} in PDB format
        @rtype: String
        """
        lines = []
        if clean:
            self.clean()
        for chain in self._chains:
            lines.append(chain.PDB_format(terminal=terminal))
        lines.append("END")
        return "\n".join(lines)

    def FASTA_format(self, gapped=True, protein=True, nucleotide=False):
        """
        Strings the selected chains in FASTA format
        @rtype: String (empty if no chain is selected)
        """
        lines = []
        for c in self.chains:
            if isinstance(c, ChainOfProtein) and protein:
                lines.append(">{0}\t{1}".format(c.globalID,
                                                c.aminoacids[0].identifier))
                if gapped:
                    lines.append("{0}".format(c.gapped_protein_sequence))
                else:
                    lines.append("{0}".format(c.protein_sequence))
            if isinstance(c, ChainOfNucleotide) and nucleotide:
                lines.append(">{0}\t{1}".format(c.globalID,
                                                c.nucleotides[0].identifier))
                if gapped:
                    lines.append("{0}".format(c.gapped_nucleotide_sequence()))
                else:
                    lines.append("{0}".format(c.nucleotide_sequence()))
        if len(lines) == 0:
            return ""
        else:
            return "\n".join(lines) + "\n"

    def IDX_format(self, protein=True, nucleotide=False):
        """
        Strings the index line of the selected chains
        @rtype: String (empty if no chain is selected)
        """
        lines = []
        for c in self.chains:
            if isinstance(c, ChainOfProtein) and protein:
                lines.append(">{0}\t{1}".format(c.globalID, c.protein_idx))
            if isinstance(c, ChainOfNucleotide) and nucleotide:
                lines.append(">{0}\t{1}".format(c.globalID,
                                                c.nucleotide_idx()))
        if len(lines) == 0:
            return ""
        else:
            return "\n".join(lines) + "\n"

    def FASTA_IDX(self, protein=True, nucleotide=False):
        """
        Returns the FASTA and index records of the selected chains, in
        matching order
        @rtype: Dictionary {'FASTA': List of String, 'IDX': List of String}
        """
        data = {'FASTA': [], 'IDX': []}
        for c in self.chains:
            if isinstance(c, ChainOfProtein) and protein:
                data['FASTA'].append(">{0}\n{1}".format(
                    c.globalID, c.gapped_protein_sequence))
                data['IDX'].append(">{0}\t{1}".format(c.globalID,
                                                      c.protein_idx))
            if isinstance(c, ChainOfNucleotide) and nucleotide:
                data['FASTA'].append(">{0}\n{1}".format(
                    c.globalID, c.gapped_nucleotide_sequence()))
                data['IDX'].append(">{0}\t{1}".format(c.globalID,
                                                      c.nucleotide_idx()))
        return data

    #
    # OVERRIDE DEFAULT METHODS
    #
    def __len__(self):
        return len(self._chains)
def get_FASTA_IDX_by_names_to_file(self, names, outfile):
    """
    Copies the selected sequences of PDBseq, and their index lines, to
    'outfile' and 'outfile.idx'.

    @type  names: Container of String
    @param names: Sequence identifiers to extract
    @type  outfile: String
    @param outfile: Name of the FASTA file to write
    """
    fastafile = Fasta(self.PDBseq)
    # retrieve() receives a copy so the caller's container is left untouched.
    selectedfasta = fastafile.retrieve(copy.deepcopy(names))

    output_fasta = File(outfile, 'w')
    try:
        for sequence in selectedfasta:
            output_fasta.write(sequence.format('FASTA') + "\n")
    finally:
        output_fasta.close()

    idxfile = self.PDBseq + '.idx'
    input_idx = File(idxfile, 'r')
    idx_lines = input_idx.descriptor
    try:
        output_idx = File(outfile + '.idx', 'w')
        try:
            for line in idx_lines:
                info = line.split()
                if not info:
                    # Blank line: there is no identifier to compare.
                    continue
                # Identifier without its leading '>'.
                pdbname = info[0][1:]
                if pdbname in names:
                    output_idx.write(line)
        finally:
            output_idx.close()
    finally:
        input_idx.close()
class CDhitList(StorableObject):
    """
    Clusters read from a cd-hit cluster file.
    """

    def __init__(self, cdhitfile):
        """
        @type  cdhitfile: String
        @param cdhitfile: Name of the cluster file to parse
        """
        self._clusters = []
        self._allseqids = {}
        self._file = File(file_name=cdhitfile)
        self._parse_file()

    @property
    def clusters(self):
        return self._clusters

    def get_cluster4sequence(self, sequence):
        """
        Returns the cluster registered for a sequence name, None if unknown
        """
        if sequence not in self._allseqids:
            return None
        position = self._allseqids[sequence]
        return self._clusters[position]

    def is_in_cluster(self, sequence):
        """
        Returns 'N' if the sequence has no cluster, 'M' if it is the master
        of its cluster and 'H' otherwise
        """
        cluster = self.get_cluster4sequence(sequence)
        if cluster is None:
            return 'N'
        if cluster.is_master(sequence):
            return 'M'
        return 'H'

    def add_cluster(self, cluster):
        self._clusters.append(cluster)

    def add_sequence2cluster(self, sequence, clusterid=None):
        """
        Adds a sequence to the last cluster, or to the first cluster whose
        identifier equals clusterid when one is given
        """
        if clusterid is not None:
            for position, cluster in enumerate(self._clusters):
                if cluster.identifier == clusterid:
                    cluster.add_sequence(sequence)
                    self._allseqids[sequence.name] = position
                    break
            return
        self.clusters[-1].add_sequence(sequence)
        self._allseqids[sequence.name] = len(self.clusters) - 1

    def dictionary_role_summary(self):
        """
        Returns {'master': [...], 'homolog': [...]} gathered over all clusters
        """
        masters = []
        homologs = []
        for cluster in self.clusters:
            masters.append(cluster.master.name)
            homologs.extend(cluster.sequences)
        return {'master': masters, 'homolog': homologs}

    def _parse_file(self):
        """
        Reads the file: a line starting with '>' opens a cluster, any other
        line adds a sequence to the last cluster opened
        """
        for record in self._file.descriptor:
            if not record.startswith('>'):
                fields = record.split()[1:]
                homolog = CDhitHomolog(name=fields[1],
                                       length=fields[0],
                                       homology=fields[-1])
                self.add_sequence2cluster(sequence=homolog)
            else:
                identifier = record.split()[-1].strip()
                self.add_cluster(CDhit(clusterid=identifier))
        self._file.close()

    def __repr__(self):
        return '\n'.join(['{0}'.format(cluster) for cluster in self.clusters])
def __init__(self, cdhitfile):
    """
    @type  cdhitfile: String
    @param cdhitfile: Name of the cluster file, parsed on construction
    """
    # Containers are created first: _parse_file() is called right below.
    self._clusters, self._allseqids = [], {}
    self._file = File(file_name=cdhitfile)
    self._parse_file()