def parse_pdb(self):
    """Parse the reference PDB and record each chain's one-letter sequence.

    The structure is read from ``self.pdb_file`` when that attribute is
    truthy, otherwise from ``self.pdb_filename``. For every chain of the
    first model, ``self.pdb_seq[chain.id]`` is set to a ``Seq`` holding the
    one-letter codes of the residues ``polypeptide.three_to_one`` can
    translate; other residues are skipped. ``HID`` is translated as ``HIS``.

    Returns:
        The first model of the parsed structure, or ``None`` when neither
        ``self.pdb_file`` nor ``self.pdb_filename`` is set.
    """
    # checking for file handle or file name to parse (the handle wins)
    source = self.pdb_file or self.pdb_filename
    if not source:
        return None
    pdb_struct = PDBParser(PERMISSIVE=True).get_structure('ref', source)[0]

    # extracting sequence
    # bio.pdb reads pdb in the following cascade: model->chain->residue->atom
    for chain in pdb_struct:
        letters = []
        for res in chain:
            resname = 'HIS' if res.resname == "HID" else res.resname
            try:
                letters.append(polypeptide.three_to_one(resname))
            except KeyError:
                # No one-letter code for this residue name: leave it out.
                # Only the lookup failure is swallowed, so unrelated errors
                # are no longer hidden.
                continue
        # Build the Seq once instead of re-creating it for every residue.
        self.pdb_seq[chain.id] = Seq(''.join(letters))
    return pdb_struct
def parse_pdb(self):
    """Parse the reference PDB quietly and record each chain's sequence.

    The structure is read from ``self.pdb_file`` when that attribute is
    truthy, otherwise from ``self.pdb_filename``; the parser is created with
    ``PERMISSIVE=True, QUIET=True``. For every chain of the first model,
    ``self.pdb_seq[chain.id]`` is set to a ``Seq`` holding the one-letter
    codes of the residues ``polypeptide.three_to_one`` can translate; other
    residues are skipped. ``HID`` is translated as ``HIS``.

    Returns:
        The first model of the parsed structure, or ``None`` when neither
        ``self.pdb_file`` nor ``self.pdb_filename`` is set.
    """
    # checking for file handle or file name to parse (the handle wins)
    source = self.pdb_file or self.pdb_filename
    if not source:
        return None
    parser = PDBParser(PERMISSIVE=True, QUIET=True)
    pdb_struct = parser.get_structure('ref', source)[0]

    # extracting sequence
    # bio.pdb reads pdb in the following cascade: model->chain->residue->atom
    for chain in pdb_struct:
        letters = []
        for res in chain:
            resname = 'HIS' if res.resname == "HID" else res.resname
            try:
                letters.append(polypeptide.three_to_one(resname))
            except KeyError:
                # No one-letter code for this residue name: leave it out.
                # Only the lookup failure is swallowed, so unrelated errors
                # are no longer hidden.
                continue
        # Build the Seq once instead of re-creating it for every residue.
        self.pdb_seq[chain.id] = Seq(''.join(letters))
    return pdb_struct
def get_adjacency_matrix(pdb_id, pdb_file):
    """Build a beta-carbon contact map for chain 'A' of a PDB file.

    Args:
        pdb_id: Identifier handed to the Biopython parser.
        pdb_file: File name of the PDB to parse.

    Returns:
        A square ``pandas.DataFrame`` whose rows and columns are labelled
        with the one-letter codes of the residues that have a ``CB`` atom
        (first model, chain 'A'). A cell is 1.0 when the two beta carbons
        are at most 8 apart (Angstrom, per the original comments), else 0;
        the diagonal is 0.
    """
    structure = PDBParser().get_structure(pdb_id, pdb_file)

    # Residues are selected purely on the presence of a beta carbon.
    cb_residues = [residue for residue in structure[0]['A'] if 'CB' in residue]
    size = len(cb_residues)

    # Everything starts at 0, so only contacts have to be written.
    contacts = np.zeros(shape=(size, size))
    for row, first in enumerate(cb_residues):
        for col, second in enumerate(cb_residues):
            if row == col:
                continue  # no self-contacts
            # Subtracting two atoms yields the distance between them.
            if first['CB'] - second['CB'] <= 8:
                contacts[row][col] = 1.0

    labels = [polypep.three_to_one(r.get_resname()) for r in cb_residues]
    return pd.DataFrame(index=labels, columns=labels, data=contacts)
def select_ref_atoms(self, fragment, ref_pdbio_struct, use_similar=False):
    """Pick the backbone atoms of the reference residue matching a fragment.

    Walks every residue of every chain in ``ref_pdbio_struct`` and looks for
    the one whose generic number equals the fragment residue's generic number
    label. Without ``use_similar`` the first such residue is used; with
    ``use_similar`` it is used only when some rule in
    ``self.similarity_rules`` lists both amino acids and the fragment's
    interaction type.

    Any exception raised while examining a residue makes that residue be
    skipped.

    Returns:
        ``[CA, N, O]`` atoms of the selected residue, or ``[]`` when no
        residue qualifies.
    """
    # NOTE(review): the ``else`` of the original is taken to pair with
    # ``if use_similar`` - confirm against the upstream formatting.
    for chain in ref_pdbio_struct:
        for res in chain:
            try:
                gn = self.get_generic_number(res)
                if gn != fragment.rotamer.residue.display_generic_number.label:
                    continue
                logger.info("Ref {}:{}\tFragment {}:{}".format(
                    polypeptide.three_to_one(res.resname),
                    self.get_generic_number(res),
                    fragment.rotamer.residue.amino_acid,
                    fragment.rotamer.residue.display_generic_number.label))
                if not use_similar:
                    return [res['CA'], res['N'], res['O']]
                for rule in self.similarity_rules:
                    if polypeptide.three_to_one(res.resname) not in rule[
                            self.similarity_dict["target_residue"]]:
                        continue
                    if fragment.rotamer.residue.amino_acid not in rule[
                            self.similarity_dict["target_residue"]]:
                        continue
                    if fragment.interaction_type.slug in rule[
                            self.similarity_dict["interaction_type"]]:
                        return [res['CA'], res['N'], res['O']]
            except Exception:
                # Best effort: a residue that cannot be evaluated is skipped.
                continue
    return []
def get_distance_matrix(pdb_id, pdb_file):
    """Build a beta-carbon distance matrix for chain 'A' of a PDB file.

    Args:
        pdb_id: Identifier handed to the Biopython parser.
        pdb_file: File name of the PDB to parse.

    Returns:
        A square ``pandas.DataFrame`` whose rows and columns are labelled
        with the one-letter codes of the residues that have a ``CB`` atom
        (first model, chain 'A'). Each off-diagonal cell holds the distance
        between the two beta carbons (Angstrom, per the original comments);
        the diagonal is 0. No cutoff is applied.
    """
    structure = PDBParser().get_structure(pdb_id, pdb_file)

    # Residues are selected purely on the presence of a beta carbon.
    cb_residues = [residue for residue in structure[0]['A'] if 'CB' in residue]
    size = len(cb_residues)

    # The diagonal keeps its initial 0; only pair distances are written.
    distances = np.zeros(shape=(size, size))
    for row, first in enumerate(cb_residues):
        for col, second in enumerate(cb_residues):
            if row != col:
                # Subtracting two atoms yields the distance between them.
                distances[row][col] = first['CB'] - second['CB']

    labels = [polypep.three_to_one(r.get_resname()) for r in cb_residues]
    return pd.DataFrame(index=labels, columns=labels, data=distances)
def parse_structure(self, pdb_struct):
    """
    Extract the sequence and build the residue dictionary of every chain.

    bio.pdb reads pdb in the following cascade: model->chain->residue->atom

    For each chain of ``pdb_struct``:
      * ``self.residues[chain.id]`` maps residue number -> ``MappedResidue``
        for residues named in ``self.residue_list`` and for ``HID``, which is
        stored as histidine;
      * ``self.pdb_seq[chain.id]`` becomes the one-letter sequence (a plain
        string) ordered by residue number;
      * every ``MappedResidue`` gets its 1-based ``pos_in_aln`` in that order.
    """
    for chain in pdb_struct:
        chain_residues = {}
        self.residues[chain.id] = chain_residues
        self.pdb_seq[chain.id] = Seq('')
        for res in chain:
            # in bio.pdb the residue's id is a tuple of
            # (hetatm flag, residue number, insertion code)
            if res.resname == "HID":
                # Fix: the translated code used to be computed and then
                # discarded, so HID residues never reached the mapping.
                one_letter = polypeptide.three_to_one('HIS')
            elif res.resname in self.residue_list:
                one_letter = polypeptide.three_to_one(res.resname)
            else:
                continue
            chain_residues[res.id[1]] = MappedResidue(res.id[1], one_letter)

        # Sort once and reuse the order for both the sequence and positions.
        ordered_numbers = sorted(chain_residues)
        self.pdb_seq[chain.id] = ''.join(
            [chain_residues[number].name for number in ordered_numbers])
        for pos, number in enumerate(ordered_numbers, start=1):
            chain_residues[number].pos_in_aln = pos
def aa_to_index(aa):
    """
    Map a three-character amino acid name to its BioPython index.

    :param aa: Three character amino acid name.
    :returns: Integer index as per BioPython; 20 for anything that
        ``Polypeptide.is_aa(..., standard=True)`` rejects.
    """
    if not Polypeptide.is_aa(aa, standard=True):
        return 20
    return Polypeptide.three_to_index(aa)
def _calc_residue_dist(self, residue_one, residue_two, dist_atoms='CA'):
    """Return the distance between the chosen atoms of two residues.

    The atom named by ``dist_atoms`` is used for a residue when it contains
    one; otherwise that residue falls back to its ``CA`` atom.

    Returns:
        The Euclidean norm of the coordinate difference, or ``np.nan`` when
        either residue fails ``Polypeptide.is_aa`` or a required atom is
        missing.
    """
    both_amino_acids = (Polypeptide.is_aa(residue_one)
                        and Polypeptide.is_aa(residue_two))
    if not both_amino_acids:
        return np.nan

    # Choose the atom name per residue, falling back to the alpha carbon.
    chosen = [dist_atoms if dist_atoms in residue else 'CA'
              for residue in (residue_one, residue_two)]
    try:
        delta = residue_one[chosen[0]].coord - residue_two[chosen[1]].coord
    except KeyError:
        # Not even the fallback atom is present.
        return np.nan
    return np.sqrt(np.sum(delta * delta))
def MutationsDict(file, positions=None):
    """Get dictionary with lists of mutations per position in protein.

    Positions without an amino-acid residue in the pdb file are ignored.

    Parameters:
        file (string): pdb file to get mutations from
        positions: list of tuples of the form (chain, first, last) for
                   positions to mutate for all other aminoacids. If None
                   (or empty), mutates all positions in all chains

    Returns:
        dict with keys :aa:chain:position, each containing lists with
        :aa:chain:position:mutated_aa for all mutations

    """
    alphabet = list(Bio.PDB.Polypeptide.aa1)
    # Generate model of original pdb file
    model = it.Pmolecule(file).model
    mutations = dict()

    def substitutions(prefix, code):
        # Every one-letter amino acid except the one already present.
        return [prefix + aa for aa in alphabet if aa != code]

    if not positions:
        for chain in model.get_chains():
            for residue in chain:
                if not pp.is_aa(residue):
                    continue
                code = pp.three_to_one(residue.get_resname())
                prefix = code + chain.id + str(residue.id[1])
                mutations[prefix] = substitutions(prefix, code)
        return mutations

    for chain_id, first, last in positions:
        # Get chain corresponding to chain_id given
        chain = next(candidate for candidate in model.get_chains()
                     if candidate.id == chain_id)
        for residue in chain:
            if not pp.is_aa(residue):
                continue
            code = pp.three_to_one(residue.get_resname())
            position = residue.id[1]
            prefix = code + chain_id + str(position)
            # Only save positions between first and last (inclusive)
            if position in range(first, last + 1):
                mutations[prefix] = substitutions(prefix, code)
    return mutations
def definePeptideChain(self):
    """Find the peptide chain and store its standard residues.

    When ``self.__table['chain_antigen'][0]`` is empty, the chain whose first
    polypeptide (as built by ``PPBuilder``) is shortest is chosen (the last
    one on ties) and written back to the table; otherwise the chain named in
    the table is used. A chain with more than 30 residues is rejected.

    On success ``self.__peptide`` and ``self.__regions_res['peptide']`` hold
    the chain's standard amino-acid residues.

    Returns:
        1 on success, 0 when no chain could be chosen or the chain is too
        long to be a peptide (an error line is sent to ``self.printerr``).
    """
    if not self.__table['chain_antigen'][0]:
        # Fix: the sentinel used to be the string 'INFINITY', which cannot be
        # compared with an int on Python 3.
        shortest = float('inf')
        chid = None
        for chain_id in self.__chains:
            length = len(
                PPBuilder().build_peptides(self.__struct[0][chain_id])[0])
            if length <= shortest:
                shortest = length
                chid = chain_id
        if chid is None:
            # Fix: with no chains at all, ``chid`` used to be unbound.
            self.printerr('definePeptideChain(): ' + self.__name
                          + '\t;NO CHAINS TO SEARCH FOR A PEPTIDE\n')
            return 0
        # NOTE(review): this writes the row labelled 'chain_antigen' while
        # the lookups read the column of that name - confirm which is meant.
        self.__table.loc['chain_antigen', :] = chid
    else:
        chid = self.__table['chain_antigen'][0]

    pp = list(self.__struct[0][chid])
    if len(pp) > 30:
        line = (self.__name + '\t;TOO MANY AMINO ACIDS (' + str(len(pp))
                + ') TO BE A PEPTIDE :(\n')
        self.printerr('definePeptideChain(): ' + line)
        return 0

    pep_res = [r for r in pp
               if Polypeptide.is_aa(r.get_resname(), standard=True)]
    self.__peptide = pep_res
    self.__regions_res.update({'peptide': pep_res})
    return 1
def modeller_get_chain_seqs(target_protein, target_chain, version):
    """Read one chain of a modeller PDB file as a list of one-letter codes.

    The file ``v<version>_pdb<target_protein>.ent`` is looked up in the
    ``<target_protein><target_chain>`` directory under ``PATHS.modeller``.
    Only amino-acid residues with a blank hetero flag are kept; ``UNK`` and
    ``ASX`` become ``'-'``, ``SEC`` becomes ``'U'``.

    Returns:
        ``(chain_lst, chain)`` - the list of one-letter codes and the
        Biopython chain - or ``(None, None)`` when the file is missing,
        cannot be parsed, or lacks the requested chain. Every failure now
        returns a 2-tuple, so callers that unpack the result no longer break
        on the parse-error and missing-chain paths.
    """
    target_path = path.join(PATHS.modeller, target_protein + target_chain)
    target_pdb_fname = 'v%s_pdb' % version + target_protein + '.ent'
    pdb_file_path = path.join(target_path, target_pdb_fname)
    if not path.isfile(pdb_file_path):
        LOGGER.warning('File %s not found' % pdb_file_path)
        return None, None

    parser = PDBParser(PERMISSIVE=1, QUIET=True)
    structure_id = path.basename(target_pdb_fname).split('.')[0]
    try:
        structure = parser.get_structure(structure_id, pdb_file_path)
    except Exception:
        # Was a bare ``except:`` that also trapped KeyboardInterrupt and
        # SystemExit; reported through the logger like the missing-file case.
        LOGGER.error(
            "ERROR: failed parser.get_structure(structure_id, pdb_fname) for "
            + target_pdb_fname)
        return None, None

    model = structure[0]
    try:
        chain = model[target_chain]
    except KeyError:
        return None, None

    chain_lst = []
    for res in chain.get_residues():
        if not (is_aa(res) and res.get_id()[0] == ' '):
            continue
        if res.resname in ('UNK', 'ASX'):
            chain_lst.append('-')
        elif res.resname == 'SEC':
            chain_lst.append('U')
        else:
            chain_lst.append(Polypeptide.three_to_one(res.resname))
    return chain_lst, chain
def write_FASTAs(PDB_ID, chains):
    """Write one FASTA file per polypeptide chain and return their IDs.

    Args:
        PDB_ID: Identifier used as the prefix of every record/file name.
        chains: Mapping of chain ID to a list of
            ``(resname, resseq, icode)`` tuples.

    A chain is written only when it is non-empty and its first residue name
    passes ``Polypeptide.is_aa``. Residue names without a one-letter code are
    written as ``X``. Files are named ``<PDB_ID>_<chain_ID>.fasta`` and are
    created in the current working directory.

    Returns:
        The list of ``<PDB_ID>_<chain_ID>`` identifiers that were written.
    """
    written_ids = []
    for chain_ID, residues in chains.items():
        if not residues or not Polypeptide.is_aa(residues[0][0]):
            continue
        record_id = '{}_{}'.format(PDB_ID, chain_ID)
        written_ids.append(record_id)

        letters = []
        for resname, _resseq, _icode in residues:
            try:
                letter = Polypeptide.three_to_one(resname)
            except KeyError:
                letter = 'X'
            letters.append(letter)

        with open('{}.fasta'.format(record_id), mode='w') as handle:
            handle.write('>{}\n'.format(record_id))
            handle.write('{}\n'.format(''.join(letters)))
    return written_ids
def residue_seq_to_one(seq):
    """
    Map a sequence of residues from three-letter to one-letter encoding.

    Each element's ``name`` is translated with ``Polypeptide.three_to_one``
    when it is one of ``Polypeptide.standard_aa_names``; any other name is
    reported as 'U'. Returns a list of one-letter strings.
    """
    letters = []
    for residue in seq:
        if residue.name in Polypeptide.standard_aa_names:
            letters.append(Polypeptide.three_to_one(residue.name))
        else:
            letters.append('U')
    return letters
def get_sequence(self, chain_id):
    """
    Return the one-letter amino acid sequence of a chain as a string.

    Input:
        self: Use Biopython.PDB structure which has been stored in an
              object variable
        chain_id : String (usually in ['A','B', 'C' ...]. The number of
                   chains depends on the specific protein and the resulting
                   structure)
    Return:
        The sequence of chain ``chain_id`` in the first model. Reading stops
        at the first residue that has no one-letter code, so everything from
        that residue on is dropped.
    """
    from Bio.PDB import Polypeptide

    chain = self.structure.child_list[0].child_dict[chain_id]
    letters = []
    for residue in chain.child_list:
        try:
            letter = Polypeptide.three_to_one(residue.resname)
        except KeyError:
            # probably the start of only HOH -> discard rest
            break
        else:
            letters.append(letter)
    return ''.join(letters)
def get_chain_to_valid_residues(structure, pdb_name=None):
    """Get tuples of chains and their valid residues.

    ``structure`` is either a ``Bio.PDB.Structure.Structure`` or a table
    exposing ``columns`` (treated as a DataFrame of atoms). Each returned
    item is ``(key, residues)`` where ``key`` is a tuple that starts with
    ``pdb_name`` (when given) followed by the model and chain identifiers.

    For a Biopython structure, a residue is valid when it is a standard
    amino acid and has a ``CA`` atom. For a table, the alpha-carbon rows
    whose ``resname`` is not ``UNK`` are used, grouped by model and chain.
    Chains without valid residues are omitted.
    """
    prefix = () if pdb_name is None else (pdb_name, )
    chain_res = []

    if type(structure) is Bio.PDB.Structure.Structure:
        for model in structure:
            for chain in model:
                valid = [
                    res for res in chain
                    if poly.is_aa(res.get_resname(), standard=True)
                    and 'CA' in res
                ]
                if valid:
                    key = prefix + (str(model.serial_num), chain.get_id())
                    chain_res.append((key, valid))
        return chain_res

    # Tabular input: the atom-name column depends on the file's origin.
    if 'atom_name' in structure.columns:
        is_calpha = structure['atom_name'] == 'CA'
    else:
        is_calpha = structure['maestro_atom_name'] == ' CA '
    calphas = structure[is_calpha]
    calphas = calphas[calphas['resname'] != 'UNK']
    for chain_key, chain_ca in calphas.groupby(['model', 'chain']):
        rows = [row for _, row in chain_ca.iterrows()]
        if rows:
            chain_res.append((prefix + chain_key, rows))
    return chain_res
def get_missing_sidechains(pdb_dataset, output_scwrl):
    """Get residues that are missing atoms.

    For each structure file of ``pdb_dataset`` a string is built with one
    letter per residue found in ``expected``: upper case when the residue's
    atom count differs from ``expected[res_name]``, lower case otherwise.
    Residues not in ``expected`` are skipped with a warning. The string is
    written to ``output_scwrl``.

    NOTE(review): as reconstructed here the file is rewritten for every
    structure, so it ends up describing the last one - confirm the intended
    indentation of the write.
    """
    for pdb_filename in db.get_structures_filenames(pdb_dataset):
        biopy_structure = db.parse_biopython_structure(pdb_filename)
        pdb_name = db.get_pdb_name(pdb_filename)
        logging.info("Processing {:}".format(pdb_name))

        incomplete = 0
        letters = []
        for model in biopy_structure:
            for chain in model:
                for i, residue in enumerate(chain):
                    res_name = residue.resname
                    if res_name not in expected:
                        logging.warning("Non-standard residue found: {:}. "
                                        "Skipping.".format(res_name))
                        continue
                    res_code = poly.three_to_one(res_name)
                    res_id = residue.id[1]
                    curr_count = len(
                        Bio.PDB.Selection.unfold_entities(residue, 'A'))
                    if curr_count == expected[res_name]:
                        # Complete residue: lower case.
                        letters.append(res_code.lower())
                        continue
                    logging.debug(
                        "Missing residue {:} at position {:} (with id {:})"
                        " which has {:} instead of the expected {:} atoms."
                        .format(res_name, i, res_id, curr_count,
                                expected[res_name]))
                    incomplete += 1
                    # Incomplete residue: upper case.
                    letters.append(res_code.upper())

        logging.debug("Missing {:} residue total".format(incomplete))
        with open(output_scwrl, 'w') as f:
            f.write("".join(letters))
def constructor(self, recalculate):
    """Write the chain's sequence as a FASTA file and return it opened.

    The chain object is fetched through the global object manager, its
    residues are translated to one-letter codes, and the resulting record is
    written to ``self.get_file_location()``.

    Returns:
        The FASTA file opened for reading; closing it is left to the caller.
    """
    chain_obj = global_stuff.the_obj_manager.get_variable(
        pdb_chain_wrapper(self.params), recalculate)

    # write the seq file at location + name
    letters = []
    for res in chain_obj:
        letters.append(Polypeptide.three_to_one(res.resname))
    record = Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(''.join(letters)))
    SeqIO.write(record, self.get_file_location(), 'fasta')
    return open(self.get_file_location(),'r')
def getClearPeptideSeq(self):
    """Return the peptide's one-letter sequence.

    Returns:
        The string of one-letter codes of ``self.__peptide``, or 0 (after
        reporting through ``self.printerr``) when the peptide is empty.
    """
    if self.__peptide:
        return ''.join(Polypeptide.three_to_one(residue.get_resname())
                       for residue in list(self.__peptide))
    self.printerr('getClearPeptideSeq(): PEPTIDE (' + self.__name +') IS EMPTY\n')
    return 0
def select_ref_atoms (self, fragment, ref_pdbio_struct, use_similar=False):
    """Pick the backbone atoms of the reference residue matching a fragment.

    Walks every residue of every chain in ``ref_pdbio_struct`` and looks for
    the one whose generic number equals the fragment residue's generic number
    label. Without ``use_similar`` the first such residue is used; with
    ``use_similar`` it is used only when some rule in
    ``self.similarity_rules`` lists both amino acids and the fragment's
    interaction type.

    Any exception raised while examining a residue makes that residue be
    skipped.

    Returns:
        ``[CA, N, O]`` atoms of the selected residue, or ``[]`` when no
        residue qualifies.
    """
    # NOTE(review): the ``else`` of the original is taken to pair with
    # ``if use_similar`` - confirm against the upstream formatting.
    for chain in ref_pdbio_struct:
        for res in chain:
            try:
                gn = self.get_generic_number(res)
                if gn != fragment.rotamer.residue.display_generic_number.label:
                    continue
                logger.info("Ref {}:{}\tFragment {}:{}".format(
                    polypeptide.three_to_one(res.resname),
                    self.get_generic_number(res),
                    fragment.rotamer.residue.amino_acid,
                    fragment.rotamer.residue.display_generic_number.label))
                if not use_similar:
                    return [res['CA'], res['N'], res['O']]
                for rule in self.similarity_rules:
                    if polypeptide.three_to_one(res.resname) not in rule[
                            self.similarity_dict["target_residue"]]:
                        continue
                    if fragment.rotamer.residue.amino_acid not in rule[
                            self.similarity_dict["target_residue"]]:
                        continue
                    if fragment.interaction_type.slug in rule[
                            self.similarity_dict["interaction_type"]]:
                        return [res['CA'], res['N'], res['O']]
            except Exception:
                # Best effort: a residue that cannot be evaluated is skipped.
                continue
    return []
def get_peptide_sequence(self, residues):
    """
    Returns a sequence string of a given list of Bio.PDB.Residue objects.

    Only residues whose ``resname`` is in ``self.residue_list`` contribute;
    ``HID`` is translated as ``HIS``.
    """
    letters = []
    for residue in residues:
        if residue.resname not in self.residue_list:
            continue
        canonical_name = residue.resname.replace('HID', 'HIS')
        letters.append(polypeptide.three_to_one(canonical_name))
    return "".join(letters)
def get_chain_sequence(self, chain):
    """
    Returns a sequence string of a given chain.

    Iterates ``self.residues[chain]`` and keeps the entries whose
    ``resname`` is in ``self.residue_list``; ``HID`` is translated as
    ``HIS``.
    """
    letters = []
    for residue in self.residues[chain]:
        if residue.resname not in self.residue_list:
            continue
        canonical_name = residue.resname.replace('HID', 'HIS')
        letters.append(polypeptide.three_to_one(canonical_name))
    return "".join(letters)
def get_sequence( self, chain_id ):
    '''
    Return the one-letter amino acid sequence of a chain as a string.

    Input:
        self: Use Biopython.PDB structure which has been stored in an
              object variable
        chain_id : String (usually in ['A','B', 'C' ...]. The number of
                   chains depends on the specific protein and the resulting
                   structure)
    Return:
        The sequence of chain ``chain_id`` in the first model, built from
        the residues that pass ``Polypeptide.is_aa``.
    '''
    chain = self.structure.child_list[0].child_dict[chain_id]
    return "".join(
        Polypeptide.three_to_one(residue.get_resname())
        for residue in chain
        if Polypeptide.is_aa(residue)
    )
def parse_structure(self):
    """Record residue numbers and one-letter codes of standard residues.

    For every standard amino acid in ``self.structure`` whose residue number
    has not been seen yet, the number is appended to ``self.residues`` and
    its one-letter code is stored in ``self.d_sequence[number]``.
    """
    for residue in self.structure.get_residues():
        # only consider standard 20 residues
        if not PDB.is_aa(residue, standard=True):
            continue
        number = residue.id[1]
        # dont doublecount mutated residues (ex. 1ORC)
        if number in self.residues:
            continue
        self.residues.append(number)
        self.d_sequence[number] = Polypeptide.three_to_one(
            Residue.Residue.get_resname(residue))
def calcDistMatrices(self, key1, key2):
    """Calculate and store the distance matrix for a pair of regions.

    ``key1`` and ``key2`` refer to keys of the ``self.__regions_res``
    dictionary. The matrix holds ``residuesMinDist`` for every residue pair,
    with rows labelled by the one-letter codes of region ``key1`` and columns
    by those of region ``key2``; it is stored in ``self.__d_matrices`` under
    ``(key1, key2)``.

    Returns:
        1 on success, 0 (after reporting through ``self.printerr``) when
        either region is empty.
    """
    row_residues = self.getRegion(key1)
    col_residues = self.getRegion(key2)
    if not (row_residues and col_residues):
        self.printerr('calcDistMatrices(): RESIDUE LIST IS EMPTY\n')
        return 0

    distances = [[residuesMinDist(first, second) for second in col_residues]
                 for first in row_residues]
    row_labels = [Polypeptide.three_to_one(res.get_resname())
                  for res in row_residues]
    col_labels = [Polypeptide.three_to_one(res.get_resname())
                  for res in col_residues]
    matrix = pd.DataFrame(distances, index = row_labels, columns = col_labels)
    self.__d_matrices.update({(key1, key2): matrix})
    return 1
def get_chain_sequences(df):
    """Return list of tuples of (id, sequence) for different chains of
    monomers in a given dataframe.

    Only rows whose ``name`` is ``CA`` and whose ``resname`` is a standard
    amino acid are used (after dropping duplicate rows). Each id is the
    ``(ensemble, subunit, structure, model, chain)`` group key converted to a
    tuple of strings; each sequence is the chain's one-letter codes in row
    order. The input frame is not modified.

    Returns ``[]`` when the frame has no ``CA`` rows.
    """
    # Keep only CA of standard residues
    ca_rows = df[df['name'] == 'CA'].drop_duplicates()
    if ca_rows.empty:
        # Applying a function to an empty column yields a non-boolean mask,
        # which pandas would treat as a column selection; stop here instead.
        return []
    is_standard = ca_rows['resname'].apply(
        lambda x: Poly.is_aa(x, standard=True))
    ca_rows = ca_rows[is_standard]
    # ``assign`` returns a new frame, so no column is written into a filtered
    # slice (which triggers SettingWithCopyWarning).
    ca_rows = ca_rows.assign(
        resname=ca_rows['resname'].apply(Poly.three_to_one))

    group_columns = ['ensemble', 'subunit', 'structure', 'model', 'chain']
    return [
        (tuple([str(x) for x in key]), ''.join(chain['resname']))
        for key, chain in ca_rows.groupby(group_columns)
    ]
def fastaToCNSThreeLetter(seqRecord,width=10):
    """Format a sequence record as lines of three-letter residue codes.

    Args:
        seqRecord: Object whose ``seq`` converts to the one-letter sequence
            with ``str()``.
        width: Number of residues per output line.

    Returns:
        The three-letter codes separated by single spaces, ``width`` per
        line, every line terminated by a newline. An empty sequence yields a
        single newline.
    """
    # ``str(seq)`` replaces ``Seq.tostring()``, which Biopython removed.
    codes = [PP.one_to_three(letter) for letter in str(seqRecord.seq)]
    # Fix: grouping used ``index / width``; on Python 3 that is float
    # division, which put every residue on a line of its own.
    rows = [' '.join(codes[start:start + width])
            for start in range(0, len(codes), width)]
    return '\n'.join(rows) + '\n'
def standard_residue_filter(df):
    """Filter out non-standard residues.

    Returns the rows of ``df`` whose ``resname`` passes
    ``Poly.is_aa(resname, standard=True)``. Each distinct residue name is
    classified once and the answer is mapped back onto the rows. This avoids
    the earlier detour through a de-duplicated copy with a column assigned
    onto a slice (SettingWithCopyWarning) and a MultiIndex lookup.
    """
    is_standard = {
        name: Poly.is_aa(name, standard=True)
        for name in df['resname'].unique()
    }
    keep = df['resname'].map(is_standard).to_numpy(dtype=bool)
    return df[keep]
def get_sequence(self, chain_id):
    '''
    Return the one-letter amino acid sequence of a chain as a string.

    Input:
        self: Use Biopython.PDB structure which has been stored in an
              object variable
        chain_id : String (usually in ['A','B', 'C' ...]. The number of
                   chains depends on the specific protein and the resulting
                   structure)
    Return:
        The one-letter codes of the residues returned by
        ``self.get_amino_residues(chain_id)``, joined into one string.
    '''
    letters = []
    for res in self.get_amino_residues(chain_id):
        letters.append(Polypeptide.three_to_one(res.get_resname()))
    return ''.join(letters)
def get_torsion_angle(self):
    """Return the phi/psi angles of the masked residues as float32.

    When the versioned pickle file exists in ``self._target_dir`` and
    ``self._is_modeller`` is false, its content is returned as is. Otherwise
    the angles are computed from ``self._bio_chain`` and filtered with
    ``self._aa_mask``.

    NOTE: freshly computed angles are not written back to the pickle file
    (the save call is disabled in the original).
    """
    cache_name = 'target_angels_v%s.pkl' % self.VERSION
    target_file = os.path.join(self._target_dir, cache_name)
    use_cache = os.path.isfile(target_file) and not self._is_modeller
    if use_cache:
        return pkl_load(target_file)

    phi_psi = Polypeptide.Polypeptide(self._bio_chain).get_phi_psi_list()
    all_angles = np.array(phi_psi)
    return np.array(all_angles[self._aa_mask], dtype=np.float32)
def GetList(protein, mincount, measure_cutoffs, thresh=9.0, loss=True):
    """Get list with SSP positions, AA in three letter code.

    If loss==False, use complement for gain predictions: every column of
    ``functional_data[protein]`` that is not among the network extremes.
    Each position's leading one-letter code is replaced by its three-letter
    code; the rest of the label is kept.
    """
    pos = GetNetworkExtremes(protein, mincount, measure_cutoffs, thresh=thresh)
    if not loss:
        pos = [label for label in functional_data[protein].columns
               if label not in pos]
    return [pp.one_to_three(label[0]) + label[1:] for label in pos]
def parse_structure(self, pdb_struct):
    """
    Extract the sequence and build the residue dictionary of every chain.

    bio.pdb reads pdb in the following cascade: model->chain->residue->atom

    For each chain of ``pdb_struct``:
      * ``self.residues[chain.id]`` maps residue number -> ``MappedResidue``
        for residues named in ``self.residue_list`` and for ``HID``, which is
        stored as histidine;
      * ``self.pdb_seq[chain.id]`` becomes the one-letter sequence (a plain
        string) ordered by residue number;
      * every ``MappedResidue`` gets its 1-based ``pos_in_aln`` in that order.
    """
    for chain in pdb_struct:
        chain_residues = {}
        self.residues[chain.id] = chain_residues
        self.pdb_seq[chain.id] = Seq('')
        for res in chain:
            # in bio.pdb the residue's id is a tuple of
            # (hetatm flag, residue number, insertion code)
            if res.resname == "HID":
                # Fix: the translated code used to be computed and then
                # discarded, so HID residues never reached the mapping.
                one_letter = polypeptide.three_to_one('HIS')
            elif res.resname in self.residue_list:
                one_letter = polypeptide.three_to_one(res.resname)
            else:
                continue
            chain_residues[res.id[1]] = MappedResidue(res.id[1], one_letter)

        # Sort once and reuse the order for both the sequence and positions.
        ordered_numbers = sorted(chain_residues)
        self.pdb_seq[chain.id] = ''.join(
            [chain_residues[number].name for number in ordered_numbers])
        for pos, number in enumerate(ordered_numbers, start=1):
            chain_residues[number].pos_in_aln = pos
def create_structure_rotamer(PDB_residue, residue_object, structure):
    """Build a ``Rotamer`` for a residue from its PDB coordinates.

    The residue is serialised to PDB text, which is stored (or found) as a
    ``PdbData`` row. ``missing_atoms`` is true when the residue has fewer
    atoms than ``atom_num_dict`` lists for its one-letter code.

    Returns:
        The ``Rotamer`` instance as constructed here; this function does not
        call ``save()`` on it.
    """
    buffer = StringIO()
    writer = PDBIO()
    writer.set_structure(PDB_residue)
    writer.save(buffer)
    pdbdata = PdbData.objects.get_or_create(pdb=buffer.getvalue())[0]

    one_letter = Polypeptide.three_to_one(PDB_residue.get_resname())
    expected_atoms = atom_num_dict[one_letter]
    present_atoms = len(PDB_residue.get_unpacked_list())
    return Rotamer(missing_atoms=expected_atoms > present_atoms,
                   pdbdata=pdbdata,
                   residue=residue_object,
                   structure=structure)
def get_all_chain_sequences_df(df):
    """Return list of tuples of (struct_name, chain_sequences) for sharded.

    Only rows whose ``name`` is ``CA`` and whose ``resname`` is a standard
    amino acid are used (after dropping duplicate rows). ``struct_name`` is
    the ``(ensemble, subunit, structure)`` group key and ``chain_sequences``
    is a list of ``((model, chain), sequence)`` tuples with one-letter
    sequences in row order. The input frame is not modified.

    Returns ``[]`` when the frame has no ``CA`` rows.
    """
    # Keep only CA of standard residues
    ca_rows = df[df['name'] == 'CA'].drop_duplicates()
    if ca_rows.empty:
        # Applying a function to an empty column yields a non-boolean mask,
        # which pandas would treat as a column selection; stop here instead.
        return []
    is_standard = ca_rows['resname'].apply(
        lambda x: Poly.is_aa(x, standard=True))
    ca_rows = ca_rows[is_standard]
    # ``assign`` returns a new frame, so no column is written into a filtered
    # slice (which triggers SettingWithCopyWarning).
    ca_rows = ca_rows.assign(
        resname=ca_rows['resname'].apply(Poly.three_to_one))

    all_chain_sequences = []
    for s, structure in ca_rows.groupby(['ensemble', 'subunit', 'structure']):
        chain_sequences = [
            (c, ''.join(chain['resname']))
            for c, chain in structure.groupby(['model', 'chain'])
        ]
        all_chain_sequences.append((s, chain_sequences))
    return all_chain_sequences
def constructor(self, params, recalculate, to_pickle = False, to_filelize = False, always_recalculate = False, old_obj = None):
    """Return five values stored for a (position, chain) pair.

    Looks up the entry keyed by ``(pos, chain_letter)`` in the dictionary
    obtained for ``objects.baW``, checks that its three-letter residue name
    agrees with the chain's one-letter sequence at that position, and returns
    the entry's items at indices 1, 3, 5, 7 and 9 as a list.

    Raises:
        AssertionError: when the residue name and the sequence disagree
            (only while assertions are enabled).
    """
    the_dict = self.get_var_or_file(objects.baW, params, recalculate, True, True, False)
    # NOTE(review): ``c`` is a bare name here, not the string 'c' - verify it
    # is defined at module level.
    chain_letter = self.get_param(params, c)
    pos = self.get_param(params, 'pos')
    entry = the_dict[(pos, chain_letter)]
    res_three = entry[0]

    chain_seq_in_one = self.get_var_or_file(objects.dW, params, recalculate, True, False)
    pos_to_aa = self.get_var_or_file(objects.eW, params, recalculate, True, False)
    res_one = chain_seq_in_one[pos_to_aa[pos]]
    assert Polypeptide.three_to_one(res_three) == res_one
    return [entry[index] for index in (1, 3, 5, 7, 9)]
def pdb2seq(pdbname):
    """Return the one-letter sequence spelled out by a PDB file's ATOM records.

    A new residue starts whenever the (chain ID, residue number, insertion
    code) of an ATOM line differs from that of the previous ATOM line.

    Args:
        pdbname: Path of the PDB file to read.

    Returns:
        The concatenated one-letter codes, in file order.
    """
    import Bio.PDB.Polypeptide as bio

    letters = []
    previous_key = None
    with open(pdbname, "r") as pdb:
        for line in pdb:
            if line[:4] != "ATOM":
                continue
            # Fixed-width fields (0-based slices): chain ID [21], residue
            # number [22:26], insertion code [26]. Fixes:
            #  * the number used to be read from [23:26], dropping the first
            #    digit of the 4-wide field;
            #  * the previous number started at 0, hiding a leading residue 0;
            #  * chain and insertion code were ignored, merging neighbours
            #    that share a residue number.
            residue_key = (line[21:22], line[22:26], line[26:27])
            if residue_key != previous_key:
                letters.append(bio.three_to_one(line[17:20]))
                previous_key = residue_key
    return "".join(letters)
def extract_seqs(structure, defmodel):
    '''
    Count the chains of one model and extract each chain's sequence.

    Only the model whose ``id`` equals ``defmodel`` is read. Standard amino
    acids are written with their one-letter code, every other residue as 'X'.

    Returns ``(nchains, seqs, chain_ids)``; when no model matches, the result
    is ``(0, [], [])`` (previously the two lists were unbound and the
    function raised).

    Called by: clean_and_sort()
    '''
    nchains = 0
    seqs = []
    chain_ids = []
    for model in structure:
        if model.id != defmodel:
            continue
        # Start afresh for the matching model, as before.
        seqs = []
        chain_ids = []
        for chain in model:
            nchains += 1
            letters = []
            for residue in chain:
                resname = residue.get_resname()
                if bpp_poly.is_aa(resname, standard=True):
                    letters.append(bpp_poly.three_to_one(resname))
                else:
                    letters.append('X')
            seqs.append("".join(letters))
            chain_ids.append(chain.id)
    return nchains, seqs, chain_ids
def get_bfactors( self, chain_id ):
    '''
    Return the standardised per-residue B-factors of a chain as integers.

    Input:
        self: Use Biopython.PDB structure which has been stored in an
              object variable
        chain_id : String (usually in ['A','B', 'C' ...]. The number of
                   chains depends on the specific protein and the resulting
                   structure)
    Return:
        A numpy integer array with one entry per amino-acid residue of the
        chain (first model). Each residue's B-factor is the mean over its
        atoms; the values are then standardised with ``np.nanmean`` and
        ``np.nanstd`` and converted with ``astype(int)``, which truncates
        toward zero.
    '''
    chain = self.structure.child_list[0].child_dict[chain_id]
    # remove residues that are not AAs
    residues = [res for res in chain.get_list() if Polypeptide.is_aa(res)]
    if not residues:
        # Nothing to standardise; also avoids a mean-of-empty warning.
        return np.zeros(0, dtype=int)

    # calculate bfactor average per residue
    per_residue = np.zeros(len(residues), dtype=np.float32)
    for index, residue in enumerate(residues):
        atoms = residue.get_list()
        per_residue[index] = sum(atom.bfactor for atom in atoms) / len(atoms)

    # normalize
    mean = np.nanmean(per_residue)
    spread = np.nanstd(per_residue)
    standardised = (per_residue - mean) / spread
    # Fix: ``np.int`` was removed in NumPy 1.24; the builtin is equivalent.
    return standardised.astype(int)
def clean_pdb(structure, pdb_name, clean_dir):
    '''
    Select and write a pdb with only aminoacids.

    The structure is saved through the ``SelectAA`` selector to
    ``clean_dir + pdb_name + '.clean.pdb'``, but only when it contains more
    than 30 standard amino-acid residues.

    Returns True when the file was written, False otherwise.

    Called by: clean_pdb_files()
               clean_and_sort()
    '''
    clean_name = clean_dir + pdb_name + '.clean.pdb'
    standard_count = sum(
        1 for res in structure.get_residues()
        if bpp_poly.is_aa(res.get_resname(), standard=True)
    )
    if standard_count <= 30:
        return False
    io.set_structure(structure)
    io.save(clean_name, SelectAA())
    return True
def get_sequence(self, chain_id):
    '''
    Return the one-letter amino acid sequence of a chain as a string.

    Input:
        self: Use Biopython.PDB structure which has been stored in an
              object variable
        chain_id : String (usually in ['A','B', 'C' ...]. The number of
                   chains depends on the specific protein and the resulting
                   structure)
    Return:
        The sequence of the first chain whose id equals ``chain_id``, built
        from every residue except water (``HOH``); ``None`` when there is no
        such chain.
    '''
    for chain in list(self.structure.get_chains()):
        if chain.id != chain_id:
            continue
        return "".join(
            PP.three_to_one(res.resname)
            for res in chain.get_unpacked_list()
            if res.resname != 'HOH'
        )
    return None
def calcNrgMatrices(self, gromacs_dir, xpm_dir, *keys):
    """Calculate and store energy matrices for pairs of regions.

    ``keys`` are keys of the ``self.__regions_res`` dictionary. The regions
    are written to a PDB in ``gromacs_dir``, ``./grom_script.sh`` is run
    through the shell with the directories, the structure name and the
    regions' one-letter sequences, and the resulting
    ``<xpm_dir>/total<name>.xpm`` matrix is cut into one block per pair of
    regions ``(keys[i], keys[j])`` with ``i <= j``. The blocks are stored in
    ``self.__e_matrices``.

    Returns:
        1 on success, 0 (after reporting through ``self.printerr``) when a
        region is empty or the script check fails.
    """
    res_list = [self.getRegion(key) for key in keys]
    if not all(res_list):
        self.printerr('calcNrgMatrices: RESIDUE LIST IS EMPTY\n')
        return 0

    # Create pdb containing only desirable regions
    self.pushToPDB(gromacs_dir, *keys)

    # Execute Gromacs script
    list_of_seqs = [
        ''.join(Polypeptide.three_to_one(res.get_resname()) for res in region)
        for region in res_list
    ]
    bashCommand = " ".join(
        ["./grom_script.sh", gromacs_dir, xpm_dir, self.__name] + list_of_seqs)
    exitcode = subprocess.call(bashCommand, shell=True)
    # NOTE(review): this reports failure when the exit code is 0, the usual
    # shell convention for success - confirm what grom_script.sh returns.
    # NOTE(review): the command is a space-joined string run with shell=True,
    # so arguments containing spaces or shell metacharacters are not safe.
    if not exitcode:
        self.printerr('calcNrgMatrices: GROMACS FAILURE\n')
        return 0

    # Extract DataFrames from xpm files
    bad_mat, annotation = extractXPM(
        ''.join([xpm_dir, '/', 'total', self.__name, '.xpm']))
    if len(annotation) > 2:
        # Flatten the per-region annotations into one label list.
        aminos = [label for part in annotation[1:] for label in part]
    else:
        aminos = list(annotation[-1])
    mat = pd.DataFrame(bad_mat, columns = aminos, index = aminos)

    # Cumulative offsets marking where each region starts in the matrix.
    ind_list = [0]
    for part in annotation[1:]:
        ind_list.append(len(part) + ind_list[-1])

    blocks = {}
    for i in range(len(res_list)):
        for j in range(i, len(res_list)):
            blocks[(keys[i], keys[j])] = mat.iloc[
                ind_list[i]:ind_list[i + 1], ind_list[j]:ind_list[j + 1]]
    self.__e_matrices.update(blocks)
    return 1
def create_fragment(self, fragment_file_name): tokens = fragment_file_name.strip().replace('.pdb', '').split('_') #Not the most efficient way, but gives the overview on what is going on generic_num = float("%s.%s" %(tokens[0], tokens[1])) res_name = tokens[2] protein_entry_name = tokens[3] pdb_code = tokens[4] if len(tokens) > 5: if len(tokens) == 7: feature = '_'.join([tokens[5], tokens[6]]) elif len(tokens) == 6: feature = tokens[5] #Checking the if the crystal is in the database try: s = Structure.objects.get(pdb_code__index=pdb_code) except Structure.DoesNotExist: self.logger.warning('Cannot find the structure {} in the database. Skipping the fragment {}'.format(pdb_code, fragment_file_name.strip().replace('.pdb', ''))) return #ResidueFragmentInteractionType try: i, created = ResidueFragmentInteractionType.objects.get_or_create(slug=feature, name=self.interactions[feature]) except Exception: self.logger.info("Failed to find or create feature {}...".format(feature)) #Rotamer and Fragment try: fragment_struct = PDBParser(PERMISSIVE=True).get_structure('frag', os.sep.join([self.fragments_dir, fragment_file_name]))[0] fragment_pdb_data = '' r = None for residue in fragment_struct.get_residues(): hetfield, resseq, icode=residue.get_id() if hetfield == ' ': #Amino acid try: r = Residue.objects.get(sequence_number=int(resseq), amino_acid=polypeptide.three_to_one(residue.resname),protein_conformation=s.protein_conformation) d, created = PdbData.objects.get_or_create(pdb=extract_pdb_data(residue)) rot, created = Rotamer.objects.get_or_create(residue=r, structure=s, pdbdata=d) #rot.save() except Exception as msg: self.logger.error('Failed to add rotamer {}:{}{}\n'.format(pdb_code, resseq, msg)) return else: fragment_pdb_data += extract_pdb_data(residue) try: fd, created = PdbData.objects.get_or_create(pdb=fragment_pdb_data) #Taking the first ligand from the list, since existing fragments do not contain the ligand info f, created = 
Fragment.objects.get_or_create(residue=r, ligand=s.ligands.all()[0], structure=s, pdbdata=fd) #f.save() except Exception as msg: self.logger.error('Failed to add fragment {}\n{}'.format(fragment_file_name, msg)) except Exception as msg: self.logger.error('Failed to add fragment {} to the db\n{}'.format(fragment_file_name, msg)) #StructureLigandInteraction try: lr, created = LigandRole.objects.get_or_create(name='unknown',slug='unknown') sli, created = StructureLigandInteraction.objects.get_or_create(structure=s, ligand=s.ligands.all()[0], ligand_role=lr) except Exception as msg: self.logger.error("Failed to add fragment {} to the db\n{}".format(fragment_file_name, msg)) try: rfi, created = ResidueFragmentInteraction.objects.get_or_create(structure_ligand_pair=sli, rotamer=rot, fragment=f, interaction_type=i) self.logger.info("Successfully added interacting fragment {}".format(fragment_file_name)) except Exception as msg: self.logger.error("Failed to add fragment {} to the db\n{}".format(fragment_file_name, msg))
def create_residues(self, args):
    """
    Create Residue records and their generic numbers from residue dump files.

    Each entry of ``args`` is a file name relative to ``self.dump_source_dir``.
    Every line of such a file is split on ',' into ten fields; the ones used
    here are the residue number, residue name, ``oli``, ``gpcrdb``, ``bw``,
    the protein entry name and the segment slug.

    For each line whose protein conformation exists and whose residue is not
    yet stored, a Residue is saved and its segment is looked up. If the line
    carries generic numbers, the default generic number, the sequence-based
    numbers and the structure-based numbers are fetched or created and attached
    to the residue.

    :param args: iterable of dump file names.
    :return: None.
    """
    schemes = {
        'gpcrdb': {'type': False},
        'gpcrdba': {
            'type': 'structure',
            'seq_based': 'bw',
        },
        'gpcrdbb': {
            'type': 'structure',
            'seq_based': 'woot',
        },
        'gpcrdbc': {
            'type': 'structure',
            'seq_based': 'pin',
        },
        'gpcrdbf': {
            'type': 'structure',
            'seq_based': 'wang',
        },
        'bw': {'type': 'sequence'},
        'woot': {'type': 'sequence'},
        'pin': {'type': 'sequence'},
        'wang': {'type': 'sequence'},
    }

    # Attach the database object and, when a mapping file exists, the
    # conversion table to every scheme.
    for scheme_name, scheme in schemes.items():
        scheme['obj'] = ResidueNumberingScheme.objects.get(slug=scheme_name)
        mapping_file = os.sep.join([self.generic_numbers_source_dir, 'mapping_' + scheme_name + '.txt'])
        if os.path.isfile(mapping_file):
            with open(mapping_file, "r", encoding='UTF-8') as scheme_table_file:
                scheme['table'] = {}
                for row in scheme_table_file:
                    split_row = shlex.split(row)
                    scheme['table'][split_row[0]] = split_row[1]

    # Proteins already found to be absent, so their lines are skipped cheaply
    missing_proteins = []
    self.logger.info('CREATING RESIDUES')
    for arg in args:
        dump_path = os.sep.join([self.dump_source_dir, arg])
        if not os.path.exists(dump_path):
            print("Failed to open file {!s}".format(dump_path))
            self.logger.error("Failed to open file {!s}".format(dump_path))
            continue

        # The context manager closes the dump file once all lines are processed
        with open(dump_path, 'r') as residue_data_fh:
            self.logger.info('Parsing residue data from {}'.format(arg))
            for line in residue_data_fh:
                # double strip due to some weird bug...
                (_row_id, res_num, res_name, oli, gpcrdb, bw, _bw2, _bs,
                 prot_name, sec_str_name) = [x.strip().strip('"') for x in line.split(',')]
                if prot_name in missing_proteins:
                    continue

                # Checking if the protein exists in the db
                try:
                    pconf = ProteinConformation.objects.get(protein__entry_name=prot_name,
                        state__slug=settings.DEFAULT_PROTEIN_STATE)
                except ProteinConformation.DoesNotExist:
                    missing_proteins.append(prot_name)
                    continue

                # Checking if given residue already exists in the db
                try:
                    Residue.objects.get(protein_conformation=pconf.id, sequence_number=res_num)
                except Residue.DoesNotExist:
                    pass
                else:
                    continue

                r = Residue()
                r.protein_conformation = pconf
                r.sequence_number = int(res_num)
                r.amino_acid = polypeptide.three_to_one(res_name.upper())

                # Saved before the generic numbers are attached; presumably the
                # alternative_generic_numbers.add() calls below need a stored row.
                try:
                    r.save()
                    self.logger.info('Created residue {:n}{!s} for protein {!s}'.format(r.sequence_number,
                        r.amino_acid, pconf.protein.entry_name))
                except Exception as msg:
                    print(msg)
                    self.logger.error('Failed to create residue {:n}{!s} for protein {!s}'.format(
                        r.sequence_number, r.amino_acid, pconf.protein.entry_name))
                    continue

                # residue segment
                dump_segment = sec_str_name
                try:
                    r.protein_segment = ProteinSegment.objects.get(slug=dump_segment)
                except Exception:
                    self.logger.error('Failed to fetch protein segment {}'.format(dump_segment))

                # generic number
                if (str(oli) != '0' and gpcrdb != 'None' and bw != 'None'):
                    # separate bulge number (1241 - > 124 + 1)
                    bulge_prime = ''
                    dump_oliveira = str(oli)
                    if len(dump_oliveira) == 4:
                        bulge_prime = dump_oliveira[3]
                        dump_oliveira = dump_oliveira[:3]
                    dump_gpcrdb = gpcrdb[:4]
                    dump_seq_based = bw

                    # default gpcrdb number
                    def_gpcrdb = False
                    default_scheme = schemes[settings.DEFAULT_NUMBERING_SCHEME]
                    if dump_oliveira in default_scheme['table']:
                        default_label = default_scheme['table'][dump_oliveira] + bulge_prime
                        try:
                            def_gpcrdb = ResidueGenericNumber.objects.get(label=default_label,
                                scheme=default_scheme['obj'])
                        except ResidueGenericNumber.DoesNotExist:
                            def_gpcrdb = ResidueGenericNumber()
                            def_gpcrdb.label = default_label
                            def_gpcrdb.scheme = default_scheme['obj']
                            def_gpcrdb.protein_segment = r.protein_segment
                            def_gpcrdb.save()
                            self.logger.info('Created generic number {:s} in numbering scheme {:s}'
                                .format(default_label, default_scheme['obj'].short_name))

                    # if default number was found/added successfully, process the alternative numbers
                    if def_gpcrdb:
                        # add default generic number to residue record
                        r.generic_number = def_gpcrdb
                        protein_scheme_slug = pconf.protein.residue_numbering_scheme.slug

                        # dict of sequence-based numbers, for use in structure-based numbers (5.46x461)
                        seq_based_labels = {}

                        # sequence-based schemes first (the sequence-based numbers are needed for the
                        # structure based schemes)
                        for scheme_name, scheme in schemes.items():
                            if scheme['type'] != 'sequence':
                                continue

                            # is this number in the scheme defined for this protein?
                            if scheme_name == schemes[protein_scheme_slug]['seq_based']:
                                seq_based_label = dump_seq_based
                            # if not convert the number to the correct scheme
                            else:
                                # NOTE(review): if no table entry matches, seq_based_label keeps
                                # its previous value (or is unbound on first use) - confirm intent.
                                for d, c in schemes[schemes[protein_scheme_slug]['seq_based']]['table'].items():
                                    if c == dump_seq_based:
                                        seq_based_label = scheme['table'][d]
                                        break

                            # fetch/insert the number
                            try:
                                seq_based = ResidueGenericNumber.objects.get(label=seq_based_label,
                                    scheme=scheme['obj'])
                            except ResidueGenericNumber.DoesNotExist:
                                seq_based = ResidueGenericNumber()
                                seq_based.label = seq_based_label
                                seq_based.scheme = scheme['obj']
                                seq_based.protein_segment = r.protein_segment
                                seq_based.save()
                            r.alternative_generic_numbers.add(seq_based)

                            # add added number to the dict for later use
                            seq_based_labels[scheme_name] = seq_based_label

                        # structure-based numbers
                        for scheme_name, scheme in schemes.items():
                            if scheme['type'] != 'structure':
                                continue

                            # is this number in the scheme defined for this protein?
                            if scheme_name == protein_scheme_slug:
                                struct_based_label = dump_gpcrdb + bulge_prime
                            # if not convert the number to the correct scheme
                            else:
                                # NOTE(review): same stale-value caveat as for seq_based_label above.
                                for d, c in schemes[protein_scheme_slug]['table'].items():
                                    if c == dump_gpcrdb:
                                        struct_based_label = scheme['table'][d] + bulge_prime
                                        break

                            # add the sequence-based label (5x461 -> 5.46x461)
                            split_struct_based_label = struct_based_label.split('x')
                            struct_based_label = (seq_based_labels[scheme['seq_based']] + 'x'
                                + split_struct_based_label[1])

                            # fetch/insert the number
                            try:
                                struct_based = ResidueGenericNumber.objects.get(
                                    label=struct_based_label, scheme=scheme['obj'])
                            except ResidueGenericNumber.DoesNotExist:
                                struct_based = ResidueGenericNumber()
                                struct_based.label = struct_based_label
                                struct_based.scheme = scheme['obj']
                                struct_based.protein_segment = r.protein_segment
                                struct_based.save()

                            # add to residue as a display number or alternative number?
                            if scheme_name == protein_scheme_slug:
                                r.display_generic_number = struct_based
                            else:
                                r.alternative_generic_numbers.add(struct_based)

                # Second save stores the segment and generic numbers set above
                try:
                    r.save()
                    self.logger.info('Added generic numbers for residue {}{!s} for protein {!s}'.format(res_num,
                        res_name, pconf.protein.entry_name))
                except Exception as msg:
                    print(msg)
                    self.logger.error(
                        'Failed to create generic numbers for residue {}{!s} for protein {!s}'.format(res_num,
                            res_name, pconf.protein.entry_name))
    self.logger.info('COMPLETED CREATING RESIDUES')
def get_chain_sequence(self, chain):
    """
    Build the one-letter sequence string for the residues stored under `chain`.

    Residues whose name is not in self.residue_list are left out; a residue
    named 'HID' is translated as 'HIS'.
    """
    letters = []
    for residue in self.residues[chain]:
        if residue.resname not in self.residue_list:
            continue
        three_letter_code = residue.resname.replace('HID', 'HIS')
        letters.append(polypeptide.three_to_one(three_letter_code))
    return "".join(letters)
def get_peptide_sequence(self, residues):
    """
    Build the one-letter sequence string for an iterable of residue objects.

    Only residues whose name appears in self.residue_list contribute; a residue
    named 'HID' is translated as 'HIS'.
    """
    letters = []
    for residue in residues:
        if residue.resname not in self.residue_list:
            continue
        three_letter_code = residue.resname.replace('HID', 'HIS')
        letters.append(polypeptide.three_to_one(three_letter_code))
    return "".join(letters)
def get_chain_sequence(self, chain):
    """
    Build the one-letter sequence string for the residues yielded by `chain`.

    Residues whose name is not in self.residue_list are left out; a residue
    named 'HID' is translated as 'HIS'.
    """
    letters = []
    for residue in chain:
        if residue.resname not in self.residue_list:
            continue
        three_letter_code = residue.resname.replace('HID', 'HIS')
        letters.append(polypeptide.three_to_one(three_letter_code))
    return "".join(letters)