def _calc_residue_dist(self, residue_one, residue_two, dist_atoms='CA'):
    """Return the distance between one atom of each of two residues.

    The atom named ``dist_atoms`` is used for a residue when that residue
    contains it; otherwise the residue's ``'CA'`` atom is used.

    Returns ``np.nan`` when either residue is not an amino acid (according
    to ``Polypeptide.is_aa``) or when the selected atom lookup raises
    ``KeyError``.
    """
    both_amino_acids = (Polypeptide.is_aa(residue_one)
                        and Polypeptide.is_aa(residue_two))
    if not both_amino_acids:
        return np.nan

    # Fall back to the C-alpha atom for a residue lacking the requested atom.
    name_one = dist_atoms if dist_atoms in residue_one else 'CA'
    name_two = dist_atoms if dist_atoms in residue_two else 'CA'

    try:
        delta = residue_one[name_one].coord - residue_two[name_two].coord
    except KeyError:
        return np.nan

    squared_length = np.sum(delta * delta)
    return np.sqrt(squared_length)
def MutationsDict(file, positions=None):
    """Build, per residue position, the list of all single-residue mutations.

    Residues that are not amino acids (per ``pp.is_aa``) are ignored.

    Parameters:
        file (string): pdb file to get mutations from
        positions: list of tuples of the form (chain, first, last) selecting
            the residue numbers ``first..last`` (inclusive) of a chain.
            If None (or empty), every position of every chain is used.

    Returns:
        dict keyed by ``<aa><chain><position>``; each value is the list of
        ``<aa><chain><position><mutated_aa>`` strings for every one-letter
        amino acid different from the original one.
    """
    # One-letter amino acid codes, in Biopython's order
    amino_acids = list(Bio.PDB.Polypeptide.aa1)
    # Model built from the original pdb file
    model = it.Pmolecule(file).model

    def substitutions(prefix, code):
        # Every mutation label for one position, skipping the identity.
        return [prefix + aa for aa in amino_acids if aa != code]

    mutations = {}

    if not positions:
        for chain in model.get_chains():
            for residue in chain:
                if not pp.is_aa(residue):
                    continue
                code = pp.three_to_one(residue.get_resname())
                prefix = code + chain.id + str(residue.id[1])
                mutations[prefix] = substitutions(prefix, code)
        return mutations

    for chain_id, first, last in positions:
        # First chain whose id matches; StopIteration propagates if none does
        chain = next(
            candidate for candidate in model.get_chains()
            if candidate.id == chain_id)
        for residue in chain:
            if not pp.is_aa(residue):
                continue
            code = pp.three_to_one(residue.get_resname())
            position = residue.id[1]
            prefix = code + chain_id + str(position)
            # Only keep positions between first and last (inclusive)
            if position in range(first, last + 1):
                mutations[prefix] = substitutions(prefix, code)
    return mutations
def get_chain_to_valid_residues(structure, pdb_name=None):
    """Collect ``(chain key, residues)`` tuples for every non-empty chain.

    For a ``Bio.PDB.Structure.Structure`` the key is
    ``(model serial number as str, chain id)`` and the residues are the
    standard amino acids of the chain that contain a ``'CA'`` atom.

    For anything else ``structure`` is treated as a dataframe: the C-alpha
    rows (``atom_name == 'CA'`` or, when that column is missing,
    ``maestro_atom_name == ' CA '``) whose ``resname`` is not ``'UNK'`` are
    grouped by ``['model', 'chain']``; the group key is the chain key and the
    residues are the group's rows.

    When ``pdb_name`` is given it is prepended to every chain key.
    """
    key_prefix = () if pdb_name is None else (pdb_name, )
    chain_res = []

    if type(structure) is Bio.PDB.Structure.Structure:
        for model in structure:
            for chain in model:
                valid = [
                    res for res in chain
                    if poly.is_aa(res.get_resname(), standard=True)
                    and 'CA' in res
                ]
                if valid:
                    key = key_prefix + (str(model.serial_num), chain.get_id())
                    chain_res.append((key, valid))
        return chain_res

    # Dataframe input: pick the atom-name column and its C-alpha label.
    if 'atom_name' in structure.columns:
        name_column, ca_label = 'atom_name', 'CA'
    else:
        name_column, ca_label = 'maestro_atom_name', ' CA '
    calphas = structure[structure[name_column] == ca_label]
    calphas = calphas[calphas['resname'] != 'UNK']

    for group_key, group_rows in calphas.groupby(['model', 'chain']):
        rows = [row for _, row in group_rows.iterrows()]
        if rows:
            chain_res.append((key_prefix + group_key, rows))
    return chain_res
def definePeptideChain(self):
    """Locate the peptide (antigen) chain and store its standard residues.

    If ``self.__table['chain_antigen'][0]`` is empty, the chain whose first
    built polypeptide is shortest is chosen (ties go to the later chain) and
    written back to the table. Otherwise the chain id from the table is used.

    Returns:
        1 on success, after filling ``self.__peptide`` and the ``'peptide'``
        entry of ``self.__regions_res`` with the chain's standard amino acid
        residues; 0 (after reporting through ``self.printerr``) when the
        chain holds more than 30 residues.
    """
    if not self.__table['chain_antigen'][0]:
        # A numeric infinity is required here: the previous string sentinel
        # cannot be ordered against an int on Python 3.
        shortest = float('inf')
        for i in self.__chains:
            buf = len(PPBuilder().build_peptides(self.__struct[0][i])[0])
            if buf <= shortest:
                shortest = buf
                chid = i
        # NOTE(review): the value is read from the 'chain_antigen' column
        # above but written with a row label here - confirm this is intended.
        self.__table.loc['chain_antigen', :] = chid
    else:
        chid = self.__table['chain_antigen'][0]

    chain_residues = list(self.__struct[0][chid])
    if len(chain_residues) > 30:
        line = (self.__name + '\t;TOO MANY AMINO ACIDS (' +
                str(len(chain_residues)) + ') TO BE A PEPTIDE :(\n')
        self.printerr('definePeptideChain(): ' + line)
        return 0

    pep_res = [
        r for r in chain_residues
        if Polypeptide.is_aa(r.get_resname(), standard=True)
    ]
    self.__peptide = pep_res
    self.__regions_res.update({'peptide': pep_res})
    return 1
def aa_to_index(aa):
    """
    Map a three-letter residue name to its integer index.

    :param aa: Three character amino acid name.
    :returns: ``Polypeptide.three_to_index(aa)`` for a standard amino acid,
        20 for anything else.
    """
    if not Polypeptide.is_aa(aa, standard=True):
        # Unknown / non-standard residues share one extra index.
        return 20
    return Polypeptide.three_to_index(aa)
def get_chain_sequences(df):
    """Return a list of (id, sequence) tuples, one per chain in a dataframe.

    Only rows with ``name == 'CA'`` and a standard amino acid ``resname`` are
    used. Rows are grouped by
    ``['ensemble', 'subunit', 'structure', 'model', 'chain']``; ``id`` is the
    group key with every element converted to ``str`` and ``sequence`` is the
    one-letter sequence of the group in row order.

    The input dataframe is not modified.
    """
    # Keep only CA of standard residues
    ca_rows = df[df['name'] == 'CA'].drop_duplicates()
    is_standard = ca_rows['resname'].apply(
        lambda x: Poly.is_aa(x, standard=True))
    # astype(bool) keeps the mask boolean even when there are no rows;
    # copy() gives an independent frame so the assignment below does not
    # write into a slice of the caller's data.
    ca_rows = ca_rows[is_standard.astype(bool)].copy()
    ca_rows['resname'] = ca_rows['resname'].apply(Poly.three_to_one)

    group_columns = ['ensemble', 'subunit', 'structure', 'model', 'chain']
    chain_sequences = []
    for key, chain in ca_rows.groupby(group_columns):
        seq = ''.join(chain['resname'])
        chain_sequences.append((tuple(str(x) for x in key), seq))
    return chain_sequences
def standard_residue_filter(df):
    """Return the rows of ``df`` that belong to standard residues.

    The standard check (``Poly.is_aa(resname, standard=True)``) is run once
    per distinct (structure, model, chain, residue, resname) combination and
    the result is then looked up for every row of ``df``.
    """
    key_columns = ['structure', 'model', 'chain', 'residue', 'resname']

    # One row per distinct residue, flagged with the outcome of the check.
    residues = df[key_columns].drop_duplicates()
    residues['to_keep'] = residues['resname'].apply(
        lambda name: Poly.is_aa(name, standard=True))
    keep_by_residue = residues.set_index(key_columns)['to_keep']

    # Expand the per-residue flags back to one flag per row of df.
    row_keys = df.set_index(key_columns).index
    row_mask = keep_by_residue.loc[row_keys].values
    return df[row_mask]
def get_all_chain_sequences_df(df):
    """Return a list of (struct_name, chain_sequences) tuples for sharded.

    Only rows with ``name == 'CA'`` and a standard amino acid ``resname`` are
    used. ``struct_name`` is the ``['ensemble', 'subunit', 'structure']``
    group key; ``chain_sequences`` is a list of ``(key, sequence)`` tuples,
    one per ``['model', 'chain']`` group inside that structure, where
    ``sequence`` is the one-letter sequence in row order.

    The input dataframe is not modified.
    """
    # Keep only CA of standard residues
    ca_rows = df[df['name'] == 'CA'].drop_duplicates()
    is_standard = ca_rows['resname'].apply(
        lambda x: Poly.is_aa(x, standard=True))
    # astype(bool) keeps the mask boolean even when there are no rows;
    # copy() gives an independent frame so the assignment below does not
    # write into a slice of the caller's data.
    ca_rows = ca_rows[is_standard.astype(bool)].copy()
    ca_rows['resname'] = ca_rows['resname'].apply(Poly.three_to_one)

    all_chain_sequences = []
    for s, structure in ca_rows.groupby(['ensemble', 'subunit', 'structure']):
        chain_sequences = [
            (c, ''.join(chain['resname']))
            for c, chain in structure.groupby(['model', 'chain'])
        ]
        all_chain_sequences.append((s, chain_sequences))
    return all_chain_sequences
def write_FASTAs(PDB_ID, chains):
    """Write one ``<PDB_ID>_<chain_ID>.fasta`` file per polypeptide chain.

    ``chains`` maps a chain id to a sequence of ``(resname, resseq, icode)``
    tuples. A chain is written only when it is non-empty and its first
    residue name passes ``Polypeptide.is_aa``. Residue names for which
    ``Polypeptide.three_to_one`` raises ``KeyError`` are written as ``'X'``.

    Returns the list of ``<PDB_ID>_<chain_ID>`` identifiers that were written.
    """
    polypeptide_IDs = []
    for chain_ID, residues in chains.items():
        if not residues or not Polypeptide.is_aa(residues[0][0]):
            continue

        polypeptide_ID = '{}_{}'.format(PDB_ID, chain_ID)
        polypeptide_IDs.append(polypeptide_ID)

        letters = []
        for resname, _resseq, _icode in residues:
            try:
                letter = Polypeptide.three_to_one(resname)
            except KeyError:
                letter = 'X'
            letters.append(letter)

        with open('{}.fasta'.format(polypeptide_ID), mode='w') as f:
            f.write('>{}\n'.format(polypeptide_ID))
            f.write('{}\n'.format(''.join(letters)))
    return polypeptide_IDs
def get_bfactors(self, chain_id):
    '''
    Return the normalised per-residue B-factors of one chain.

    Input:
        self: object whose ``structure`` attribute holds the Biopython
            structure; the first model (``child_list[0]``) is used
        chain_id : String key of the chain in the model's ``child_dict``
    Return:
        numpy integer array with one entry per amino acid residue of the
        chain (residues failing ``Polypeptide.is_aa`` are skipped).

        For each residue the B-factors of its atoms are averaged; a residue
        without atoms gets np.nan. The averages are converted to standard
        scores with np.nanmean / np.nanstd and finally cast to integers with
        ``astype(int)`` (which truncates towards zero; the result for NaN
        entries is not meaningful).
    '''
    chain = self.structure.child_list[0].child_dict[chain_id]
    # Keep amino acid residues only
    residues = [res for res in chain.get_list() if Polypeptide.is_aa(res)]
    if not residues:
        # Nothing to normalise; avoids nanmean/nanstd on an empty array.
        return np.zeros(0, dtype=int)

    # Average B-factor per residue, NaN where a residue has no atoms
    mean_bfactors = np.full(len(residues), np.nan, dtype=np.float32)
    for idx, residue in enumerate(residues):
        atoms = residue.get_list()
        if atoms:
            mean_bfactors[idx] = sum(atom.bfactor for atom in atoms) / len(atoms)

    # Standard scores, ignoring NaN entries
    mean = np.nanmean(mean_bfactors)
    std = np.nanstd(mean_bfactors)
    b_factors = (mean_bfactors - mean) / std

    # The builtin int replaces the removed np.int alias (same dtype).
    return b_factors.astype(int)
def get_sequence(self, chain_id):
    '''
    Return the one-letter amino acid sequence of a chain as a string.

    Input:
        self: object whose ``structure`` attribute holds the Biopython
            structure; the first model (``child_list[0]``) is used
        chain_id : String key of the chain in the model's ``child_dict``
    Return:
        Concatenation of ``Polypeptide.three_to_one`` of the residue name of
        every residue of the chain that passes ``Polypeptide.is_aa``.
    '''
    chain = self.structure.child_list[0].child_dict[chain_id]
    return "".join(
        Polypeptide.three_to_one(residue.get_resname())
        for residue in chain
        if Polypeptide.is_aa(residue)
    )
def clean_pdb(structure, pdb_name, clean_dir):
    '''
    Write an amino-acid-only copy of a structure when it is large enough.

    The structure is saved to ``clean_dir + pdb_name + '.clean.pdb'`` through
    ``io`` with a ``SelectAA()`` selector, but only if it contains more than
    30 standard amino acid residues.

    Returns True when the file was written, False otherwise.

    Called by: clean_pdb_files()
               clean_and_sort()
    '''
    clean_name = clean_dir + pdb_name + '.clean.pdb'
    standard_names = [
        res.resname for res in structure.get_residues()
        if bpp_poly.is_aa(res.get_resname(), standard=True)
    ]
    if len(standard_names) <= 30:
        return False

    io.set_structure(structure)
    io.save(clean_name, SelectAA())
    return True
def standard_residue_filter(df):
    """
    Drop every atom row that does not belong to a standard residue.

    Each distinct (structure, model, chain, residue, resname) combination is
    tested once with ``Poly.is_aa(resname, standard=True)``; the verdict is
    then applied to all rows of ``df`` sharing that combination.

    :param df: dataframe to filter against.
    :type df: atoms dataframe.

    :return: same dataframe, but with only the atoms corresponding to
        standard residues left.
    :rtype: atoms dataframe.
    """
    residue_key = ['structure', 'model', 'chain', 'residue', 'resname']

    def is_standard(resname):
        return Poly.is_aa(resname, standard=True)

    # Distinct residues with their keep/drop flag
    unique_residues = df[residue_key].drop_duplicates()
    unique_residues['to_keep'] = unique_residues['resname'].apply(is_standard)
    flag_lookup = unique_residues.set_index(residue_key)['to_keep']

    # Per-row flags, aligned with the row order of df
    per_row_flags = flag_lookup.loc[df.set_index(residue_key).index]
    return df[per_row_flags.values]
def extract_seqs(structure, defmodel):
    '''
    Count the chains of one model and extract each chain's sequence.

    Only the model whose ``id`` equals ``defmodel`` is read. Standard amino
    acids are written with their one-letter code, every other residue as
    ``'X'``.

    Returns ``(nchains, seqs, chain_ids)``: the number of chains, the list of
    chain sequences and the list of chain ids, in chain order. When no model
    matches ``defmodel`` the result is ``(0, [], [])``.

    Called by: clean_and_sort()
    '''
    nchains = 0
    # Defined up front so a structure without a matching model still
    # produces a well-formed (empty) result.
    seqs = []
    chain_ids = []
    for model in structure:
        if model.id != defmodel:
            continue
        seqs = []
        chain_ids = []
        for chain in model:
            nchains += 1
            seqlist = [
                bpp_poly.three_to_one(residue.get_resname())
                if bpp_poly.is_aa(residue.get_resname(), standard=True)
                else 'X'
                for residue in chain
            ]
            seqs.append("".join(seqlist))
            chain_ids.append(chain.id)
    return nchains, seqs, chain_ids
def get_contact_map(self, chain_id):
    '''
    Return the residue-residue distance matrix of a chain.

    Input:
        self: object whose ``structure`` attribute holds the Biopython
            structure; the first model (``child_list[0]``) is used
        chain_id : String key of the chain in the model's ``child_dict``
    Return:
        Square numpy integer array whose side is the length of
        ``self.get_sequence(chain_id)``. Entry ``[i][j]`` is
        ``self.get_residue_distance`` of the i-th and j-th amino acid
        residue of the chain, cast to an integer with ``astype(int)``
        (truncation towards zero).
    '''
    length = len(self.get_sequence(chain_id))
    contact_map = np.zeros((length, length), dtype=np.float32)

    chain = self.structure.child_list[0].child_dict[chain_id]
    aa = [residue for residue in chain if Polypeptide.is_aa(residue)]

    for i in range(length):
        for j in range(length):
            contact_map[i, j] = self.get_residue_distance(aa[i], aa[j])

    # The builtin int replaces the removed np.int alias (same dtype).
    return contact_map.astype(int)
def _aa_mask(self):
    """Return a list with ``Polypeptide.is_aa(r)`` for each residue.

    The residues are those of a ``Polypeptide.Polypeptide`` built from
    ``self._bio_chain``.
    """
    residues = Polypeptide.Polypeptide(self._bio_chain)
    return list(map(Polypeptide.is_aa, residues))
def get_structure_seqrecords(model):
    """Build one SeqRecord per chain of a model.

    Only residues that pass ``Polypeptide.is_aa(res, standard=True)`` are
    written; all others are skipped. Numbering gaps are handled as follows:

    - When a written residue's number is not the previously tracked number
      plus one and the residue has no insertion code, the gap is filled with
      ``'X'`` (one per missing number) and the matching residue numbers are
      recorded as ``float("Inf")``.
    - When such a residue carries an insertion code (e.g. "15A", "15B"), it
      is written without gap filling. Example: 9LPR

    Each record carries the residue numbers of its letters in
    ``letter_annotations['structure_resnums']``.

    Args:
        model: Biopython Model object of a Structure

    Returns:
        list: List of SeqRecords

    """
    structure_seq_records = []

    # Loop over each chain of the PDB
    for chain in model:
        last_number = 0
        pieces = []
        chain_resnums = []

        for res in chain.get_residues():
            res_id = res.id
            number = res_id[1]
            icode = res_id[2]

            # Non-standard residues (ie. selenomethionine) are skipped here
            # and show up as part of the gap filled on a later iteration
            if not Polypeptide.is_aa(res, standard=True):
                continue

            letter = Polypeptide.three_to_one(res.get_resname())

            if number != (last_number + 1):
                if icode != ' ':
                    # Insertion-code residue: write it as is, no gap filling
                    pieces.append(letter)
                    chain_resnums.append(number)
                    last_number = number + 1
                    continue
                gap = number - last_number - 1
                pieces.append('X' * gap)
                # Residue numbers for unresolved or nonstandard residues are Infinite
                chain_resnums.extend([float("Inf")] * gap)

            pieces.append(letter)
            chain_resnums.append(number)
            last_number = number

        chain_seq = ''.join(pieces)
        record = SeqRecord(Seq(chain_seq, IUPAC.protein), id=chain.get_id())
        record.letter_annotations['structure_resnums'] = chain_resnums
        structure_seq_records.append(record)

    return structure_seq_records
def build_matrix(
        path: str,
        filename: str,
        truncate_log: Union[tqdm.tqdm, None] = None) -> BuildMatrixDict:
    """Build the input matrix for one protein.

    The matrix has 10 rows and 4000 columns, initialised with 0. Every
    residue of every chain of the first model takes one column; for standard
    amino acids the column holds the residue code, the rounded mean x/y/z of
    its atoms and six properties read from ``amino_acid``. Filling stops at
    the first ``IndexError`` (reported through ``truncate_log`` when given).

    Args:
        path: path of the pdb file.
        filename: name of the file (without extension).
        truncate_log: tqdm logger

    Returns:
        Build matrix dictionary
    """
    PROTEIN_SEQ_MAX_LEN = 4000
    protein_matrix = [[0] * PROTEIN_SEQ_MAX_LEN for _ in range(10)]

    protein_structure = PDBParser().get_structure(filename, path)
    first_model = list(protein_structure.get_models())[0]
    protein_chains = list(first_model.get_chains())

    # Rows 4..9 of the matrix, in order
    property_keys = (
        "hydropathy",
        "hydropathy_index",
        "acidity_basicity",
        "mass",
        "isoelectric_point",
        "charge",
    )

    col = 0
    try:
        for chain in protein_chains:
            for residue in chain.get_residues():
                if Polypeptide.is_aa(residue.get_resname(), standard=True):
                    xs, ys, zs = [], [], []
                    for atom in residue.get_atoms():
                        vec = atom.get_vector()
                        xs.append(vec[0])
                        ys.append(vec[1])
                        zs.append(vec[2])

                    # calculate position of residue
                    center_x = round(mean(xs))
                    center_y = round(mean(ys))
                    center_z = round(mean(zs))

                    # one letter code
                    code = Polypeptide.three_to_one(residue.get_resname())
                    aa = amino_acid[code]

                    protein_matrix[0][col] = aa["code"]
                    protein_matrix[1][col] = center_x
                    protein_matrix[2][col] = center_y
                    protein_matrix[3][col] = center_z
                    for row, key in enumerate(property_keys, start=4):
                        protein_matrix[row][col] = aa[key]

                # Even if the current residue is not amino acid we increase the col.
                # 0 is save at this position if it is not an amino acid.
                col += 1
    except IndexError:
        if truncate_log is not None:
            truncate_log.set_description_str(
                f"Protein {filename} is truncated.")

    # Prepare dict so it can be load to vaex dataframe
    dic: BuildMatrixDict = {
        "seq": [[]],
        "x_pos": [[]],
        "y_pos": [[]],
        "z_pos": [[]],
        "hydropathy": [[]],
        "hydropathy_index": [[]],
        "acidity_basicity": [[]],
        "mass": [[]],
        "isoelectric_point": [[]],
        "charge": [[]],
    }
    for i, row_values in enumerate(protein_matrix):
        dic[col_name[i]] = pyarrow.array([list(row_values)])

    return dic
def get_structure_seqs(pdb_file, file_type):
    """Get the sequence of every chain in the first model of a file.

    Only residues that pass ``Polypeptide.is_aa(res, standard=True)`` are
    written; all others are skipped. Numbering gaps are handled as follows:

    - When a written residue's number is not the previously tracked number
      plus one and the residue has no insertion code, the gap is filled with
      one ``'X'`` per missing number.
    - When such a residue carries an insertion code (e.g. "15A", "15B"), it
      is written without gap filling. Example: 9LPR

    Args:
        pdb_file: Path to PDB file
        file_type: not used by this function

    Returns:
        dict: Dictionary of:
        {chain_id: sequence}

    """
    # TODO: Please check out capitalization of chain IDs in mmcif files. example: 5afi - chain "l" is present but
    # it seems like biopython capitalizes it to chain L

    # Get the first model
    model = StructureIO(pdb_file).first_model

    structure_seqs = {}

    # Loop over each chain of the PDB
    for chain in model:
        pieces = []
        last_number = 0

        for res in chain.get_residues():
            # Non-standard residues (ie. selenomethionine) are skipped here
            # and show up as part of the gap filled on a later iteration
            if not Polypeptide.is_aa(res, standard=True):
                continue

            residue_id = res.get_full_id()[3]
            number = residue_id[1]
            i_code = residue_id[2]
            aa = Polypeptide.three_to_one(res.get_resname())

            if number != (last_number + 1):
                if i_code != ' ':
                    # Insertion-code residue: write it as is, no gap filling
                    pieces.append(aa)
                    last_number = number + 1
                    continue
                pieces.append('X' * (number - last_number - 1))

            pieces.append(aa)
            last_number = number

        structure_seqs[chain.get_id()] = ''.join(pieces)

    return structure_seqs
def get_knots(pdb, cutoff, cluster_cutoff, genpdb, verbosity):
    '''
    Main routine, uses biopython and pandas to detect knots and cluster them
    through the implementation of the average linkage algorithm.

    Parameters:
        pdb: file name; processed only when it ends with one of the accepted
            pdb-related suffixes and does not start with 'CONTACTS-'
        cutoff: radius passed to the neighbor search and upper bound for a
            reported CA-CA distance
        cluster_cutoff: clustering stops once the smallest value found in the
            distance matrix exceeds this
        genpdb: when truthy (and contacts exist) the contact residues are
            saved to 'CONTACTS-' + pdb
        verbosity: forwarded to printv

    Returns:
        (clusterdict, nknots, maxklength) when more than one CA atom is
        involved in a contact; None in every other case (no contacts,
        'CONTACTS-' files, unsupported file names).

    Side effects: prints to stdout and appends to 'KnotScope.log'.

    Called by: main()
    '''
    if (pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(".ent.gz")
            or pdb.endswith(".pdb1") or pdb.endswith(".pdb1.gz")
            or pdb.endswith(".pdb.gz")) and not pdb.startswith('CONTACTS-'):
        pdb_name, structure, nchains = strtools.parse_pdb_structure(pdb)
        print(str('\n' + clrs['p'] + pdb + clrs['n']))
        with open('KnotScope.log', 'a') as log:
            log.write(str('\n[STRUCTURE],' + pdb + '\n'))
        # CA atoms of the standard amino acids of the first model
        mainchain = [
            atom for atom in bpp.Selection.unfold_entities(structure[0], 'A')
            if bpp_poly.is_aa(atom.get_parent(), standard=True) and (
                atom.id == 'CA')
        ]  # or atom.id == 'N' or atom.id == 'O')]
        contacts = []  # residues taking part in a contact (for the pdb dump)
        core = []  # distinct CA atoms taking part in a contact
        for atom in mainchain:
            distances = []
            # NOTE(review): the search structure is rebuilt from the same
            # atom list on every iteration - looks hoistable, confirm.
            ns = bpp.NeighborSearch(mainchain)
            center = atom.get_coord()
            # NOTE(review): abs(3) is just 3, so only neighbors whose residue
            # number is more than 3 HIGHER than this atom's are kept. If
            # abs() was meant for the difference this is a bug - confirm
            # before changing, since it also keeps each pair from being
            # reported twice.
            neighbors = [
                neighbor for neighbor in ns.search(center, cutoff)
                if (neighbor.get_parent().id[1] -
                    atom.get_parent().id[1]) > abs(3)
            ]
            if neighbors:
                for neighbor in neighbors:
                    # Subtracting two atoms yields their distance
                    d = neighbor - atom
                    distances.append(d)
                    # NOTE(review): neighbors come from a search with the
                    # same cutoff, so this presumably always holds - confirm.
                    if d <= cutoff:
                        printv(
                            clrs['y'] + 'Unlikely proximity' + clrs['n'] +
                            ' between residues ' + clrs['y'] +
                            str(atom.get_parent().id[1]) + clrs['n'] +
                            ' and ' + clrs['y'] +
                            str(neighbor.get_parent().id[1]) + clrs['n'] +
                            '!', verbosity)
                        printv(str(d), verbosity)
                        with open('KnotScope.log', 'a') as log:
                            log.write('[CLASH],' +
                                      str(atom.get_parent().id[1]) + ',' +
                                      str(neighbor.get_parent().id[1]) + ',' +
                                      str(d) + '\n')
                        contacts.append(neighbor.get_parent())
                        contacts.append(atom.get_parent())
                        if atom not in core:
                            core.append(atom)
                        if neighbor not in core:
                            core.append(neighbor)
        # Save contacts to pdb file if they exist
        if contacts and genpdb:
            io.set_structure(structure)
            io.save('CONTACTS-' + pdb, strtools.SelectResidues(contacts))
        # Start cluster analysis to separate knots
        pairwisedist = []
        # Measure pairwise distances of every CA involved in knots and record
        # in vertical list: [residue number a, residue number b, distance]
        if len(core) > 1:
            for a, b in it.combinations(core, 2):
                d = a - b
                pairwisedist.append(
                    [a.get_parent().id[1], b.get_parent().id[1], d])
            # Add values for diagonal
            for entry in range(len(core)):
                line = make_diagonal(core, entry)
                pairwisedist.append([line[0], line[1], line[2]])
            # Create pandas dataframe, make it a square and symmetric matrix
            df = pd.DataFrame(pairwisedist, index=None, columns=None)
            df = pd.crosstab(index=df[0],
                             columns=df[1],
                             values=df[2],
                             aggfunc='sum',
                             dropna=True).fillna(0)
            df = df + df.T
            # Start average linkage algorithm
            reslist = list(df.columns)
            clusters = []  # snapshot of the cluster labels after each step
            row_index = -1
            col_index = -1
            # array[n] is the cluster label of matrix row/column n; every
            # entry starts in its own cluster
            array = []
            for n in range(df.shape[0]):
                array.append(n)
            clusters.append(array.copy())
            for k in range(1, df.shape[0]):
                # Find the position of the smallest value in the matrix
                # (<= means the last such position wins on ties)
                min_val = sys.maxsize
                for i in range(0, df.shape[0]):
                    for j in range(0, df.shape[1]):
                        #print(str(df.iloc[i,j]))
                        if type(df.iloc[i, j]) != str:
                            if (df.iloc[i, j] <= min_val):
                                min_val = df.iloc[i, j]
                                row_index = i
                                col_index = j
                # Merge row_index into col_index: distances to the merged
                # entry become the average of the two former distances
                for i in range(0, df.shape[0]):
                    if (i != col_index and i != row_index):
                        temp = (df.iloc[col_index, i] +
                                df.iloc[row_index, i]) / 2
                        df.iloc[col_index, i] = temp
                        df.iloc[i, col_index] = temp
                # Retire row_index so it is never selected again
                for i in range(0, df.shape[0]):
                    df.iloc[row_index, i] = sys.maxsize
                    df.iloc[i, row_index] = sys.maxsize
                # Relabel: entries labelled with the larger index take the
                # smaller one
                minimum = min(row_index, col_index)
                maximum = max(row_index, col_index)
                for n in range(len(array)):
                    if (array[n] == maximum):
                        array[n] = minimum
                clusters.append(array.copy())
                # Stop iterations when minimum pairwise distance in the
                # matrix is greater than cluster_cutoff (checked after the
                # merge for this step has already been applied)
                if min_val > cluster_cutoff:
                    break
            # Get the clusters from last iteration and 'count' elements
            clustered_res = clusters[-1]
            counter = collections.Counter(clustered_res)
            # Combine residue and cluster information and print them user-friendly
            clusterdict = dict(zip(reslist, clustered_res))
            print(clrs['y'] + '\nLikely ' + str(len(set(clusters[-1]))) +
                  ' knot(s) found in structure under chosen criteria...' +
                  clrs['n'])
            n = 0
            k_lengths = []
            for cl in set(clusterdict.values()):
                n += 1
                cluster_residues = []
                # NOTE(review): the count is taken by position from the
                # Counter while cl comes from iterating a set; the two
                # orders are not guaranteed to match - confirm.
                print('\nKnot ' + clrs['y'] + str(n) + clrs['n'] +
                      ' (Cluster id: ' + str(cl) + ') involves ' + clrs['y'] +
                      str(list(counter.values())[n - 1]) + clrs['n'] +
                      ' residues:')
                for res in clusterdict:
                    if clusterdict[res] == cl:
                        cluster_residues.append(res)
                k_lengths.append(len(cluster_residues))
                print(clrs['y'] +
                      ', '.join([str(a) for a in cluster_residues]) +
                      clrs['n'])
                with open('KnotScope.log', 'a') as log:
                    log.write('[K' + str(n) + '-RES],' +
                              ','.join([str(a) for a in cluster_residues]) +
                              '\n')
                    log.write('[K' + str(n) + '-LEN],' +
                              str(len(cluster_residues)) + '\n')
            if clusterdict:
                nknots = len(set(list(clusterdict.values())))
            else:
                nknots = 0
            maxklength = max(k_lengths)
            with open('KnotScope.log', 'a') as log:
                log.write('[SUM],str,' + pdb + ',ca_clash,' + str(len(core)) +
                          ',nknots,' + str(nknots) + ',maxklength,' +
                          str(maxklength) + '\n')
            return clusterdict, nknots, maxklength
        else:
            print(clrs['g'] + 'No CA distances under ' + str(cutoff) +
                  ' angstrons found' + clrs['n'] + '!\n')
            with open('KnotScope.log', 'a') as log:
                log.write('[SUM],str,' + pdb +
                          ',ca_clash,0,nknots,0,maxklength,0\n')
        del pdb_name, structure, nchains, contacts
    elif pdb.startswith('CONTACTS-'):
        # Output of a previous run; deliberately ignored
        pass
    else:
        print(clrs['y'] + pdb + clrs['n'] +
              ' not a pdb-related structure format.' + clrs['r'] +
              ' SKIPPING!' + clrs['n'])
def aa_to_index(aa):
    """Return the index of a three-letter amino acid name.

    Standard amino acids map to ``Polypeptide.three_to_index(aa)``; every
    other name maps to 20.
    """
    is_standard = Polypeptide.is_aa(aa, standard=True)
    return Polypeptide.three_to_index(aa) if is_standard else 20
def accept_residue(self, residue):
    """Return 1 when the residue is a standard amino acid, 0 otherwise."""
    is_standard = bpp_poly.is_aa(residue.get_resname(), standard=True)
    return 1 if is_standard else 0