def AverageProtectionFactors(ProteinStructure):
    """Overwrite atom occupancies with the model-averaged CA occupancy.

    For every standard amino-acid residue, the occupancy of its CA atom is
    read from each model of ``ProteinStructure`` (looked up by chain id and
    residue id) and averaged.  The mean is then written to every atom of the
    residue.  The structure is modified in place; nothing is returned.

    NOTE(review): the variable names in the original suggest the occupancy
    column carries log protection factors -- confirm against the caller.
    """
    # Pass 1: one average per (chain, residue) entity.
    Averages = {}
    for Model in ProteinStructure:
        for Chain in Model:
            ChainID = Chain.get_id()
            for Residue in Chain:
                if not is_aa(Residue.get_resname(), standard=True):
                    continue
                ResidueID = Residue.get_id()
                Values = [
                    OtherModel[ChainID][ResidueID]["CA"].get_occupancy()
                    for OtherModel in ProteinStructure
                ]
                Averages[(Chain, Residue)] = sum(Values) / float(len(Values))

    # Pass 2: write the averages back onto every atom.
    for Model in ProteinStructure:
        for Chain in Model:
            for Residue in Chain:
                if not is_aa(Residue.get_resname(), standard=True):
                    continue
                Mean = Averages[(Chain, Residue)]
                for Atom in Residue:
                    Atom.set_occupancy(Mean)
    return
def __init__(self, fasta_align, m1, m2, si=0, sj=1):
    """Build residue-to-residue maps from a sequence alignment.

    Attributes:
     - fasta_align - Alignment object
     - m1, m2 - two models
     - si, sj - the sequences in the Alignment object that
       correspond to the structures

    Sets ``self.map12`` / ``self.map21`` (residue -> aligned residue or
    None) and ``self.duos`` (one ``(r1, r2)`` pair per alignment column,
    with None standing for a gap).
    """

    def next_amino_acid(residues, position):
        # Walk forward until an amino acid is found; an IndexError is raised
        # if the residue list is exhausted first.
        while True:
            candidate = residues[position]
            position += 1
            if is_aa(candidate):
                return candidate, position

    n_columns = fasta_align.get_alignment_length()
    # All residues of each model, in order
    residues_1 = Selection.unfold_entities(m1, 'R')
    residues_2 = Selection.unfold_entities(m2, 'R')
    # Read positions in the two residue lists
    pos_1 = 0
    pos_2 = 0
    forward = {}
    backward = {}
    pairs = []
    for col in range(0, n_columns):
        column = fasta_align[:, col]
        letter_1 = column[si]
        letter_2 = column[sj]

        r1 = None
        if letter_1 != "-":
            r1, pos_1 = next_amino_acid(residues_1, pos_1)
            self._test_equivalence(r1, letter_1)

        r2 = None
        if letter_2 != "-":
            r2, pos_2 = next_amino_acid(residues_2, pos_2)
            self._test_equivalence(r2, letter_2)

        if r1:
            # residue of model 1 -> its partner in model 2
            forward[r1] = r2
        if r2:
            # residue of model 2 -> its partner in model 1
            backward[r2] = r1
        pairs.append((r1, r2))
    self.map12 = forward
    self.map21 = backward
    self.duos = pairs
def get_contact_map(self, chain_id):
    '''
    Input:
        self: Use Biopython.PDB structure which has been stored in an object variable
        chain_id : String (usually in ['A','B', 'C' ...]. The number of chains
                   depends on the specific protein and the resulting structure)
    Return:
        Return a complete contact map (see description in exercise sheet)
        for a given chain in a Biopython.PDB structure as numpy array.
        The values in the matrix describe the c-alpha distance between all residues
        in a chain of a Biopython.PDB structure.
        Only integer values of the distance have to be given (see below).
    '''
    length = self.get_number_of_residues(chain_id)
    contact_map = np.empty((length, length), dtype=self.dtype)
    contact_map[:] = np.nan  # initialize as nan
    chain = self.structure[0][chain_id]
    # NOTE(review): i and j count every residue of the chain (including
    # non-amino-acid ones), so this assumes get_number_of_residues() covers
    # them as well -- confirm.
    for i, residue_1 in enumerate(chain):
        # Skip e.g. water molecules
        if not is_aa(residue_1):
            continue
        for j, residue_2 in enumerate(chain):
            # The map is symmetric: compute the diagonal and upper triangle
            # only and mirror each value.
            if j < i or not is_aa(residue_2):
                continue
            ca_dist = residue_1['CA'] - residue_2['CA']
            contact_map[i, j] = ca_dist
            contact_map[j, i] = ca_dist
    # `np.int` was only an alias of the builtin `int` and has been removed
    # from NumPy (1.24+); the builtin gives the same result.
    return contact_map.astype(int)  # return as int to make comparison more robust
def __init__(self, fasta_align, m1, m2, si=0, sj=1):
    """Initialize.

    Attributes:
     - fasta_align - Alignment object
     - m1, m2 - two models
     - si, sj - the sequences in the Alignment object that
       correspond to the structures

    Walks the alignment column by column and pairs up the amino-acid
    residues of the two models; results are stored in ``self.map12``,
    ``self.map21`` and ``self.duos``.
    """
    model_residues = (
        Selection.unfold_entities(m1, 'R'),
        Selection.unfold_entities(m2, 'R'),
    )
    rows = (si, sj)
    # Next unread index in each residue list
    cursors = [0, 0]
    map12 = {}
    map21 = {}
    duos = []
    for col_idx in range(fasta_align.get_alignment_length()):
        column = fasta_align[:, col_idx]
        matched = [None, None]
        for side in (0, 1):
            letter = column[rows[side]]
            if letter == "-":
                # Gap: no residue on this side of the column
                continue
            residues = model_residues[side]
            # Advance past non-amino-acid residues
            while True:
                residue = residues[cursors[side]]
                cursors[side] = cursors[side] + 1
                if is_aa(residue):
                    break
            self._test_equivalence(residue, letter)
            matched[side] = residue
        first, second = matched
        if first:
            map12[first] = second
        if second:
            map21[second] = first
        # Aligned pair (None marks a gap)
        duos.append((first, second))
    self.map12 = map12
    self.map21 = map21
    self.duos = duos
def computeOneFile(self, fileName):
    '''
      Computes distance for each pair of aminoacids for a given pdb file
      and writes them to ``<outPath>/<prefix>.distMat``.

      @param fileName: str. fname to pdb file
      @return: 0, both when the output already exists and after writing it.
               Any error is re-raised after the partial output file has been
               removed.
    '''
    prefixAndChainTypeId = (fileName.split("/")[-1]).split(".pdb")[0]
    outName = os.path.join(self.outPath, prefixAndChainTypeId + ".distMat")
    if os.path.isfile(outName):
        print("Already computed Distance Maps")
        return 0
    structure = self.parser.get_structure(prefixAndChainTypeId, fileName)
    structCenterMass = self.getStructCenterMass(structure)
    try:
        # `with` guarantees the handle is closed before the cleanup below
        # tries to remove the file.
        with open(outName, "w") as outFile:
            outFile.write(
                "chainId1 structResId1 chainId2 structResId2 distance angle_to_protCM\n"
            )
            # Build the (residue, chain label, residue label) triples once
            # instead of recomputing them for every pair.
            labelled = []
            for res in structure[0].get_residues():
                if not is_aa(res, standard=True):
                    continue
                structId, modelId, chainId, resId = res.get_full_id()
                # residue number followed by the insertion code
                resLabel = str(resId[1]) + resId[2]
                if chainId == " ":
                    chainId = "*"
                labelled.append((res, chainId, resLabel))

            for res1, chainId1, resId1 in labelled:
                for res2, chainId2, resId2 in labelled:
                    magnitude = self.getMagnitude(res1, res2, structCenterMass)
                    outFile.write(
                        chainId1 + " " + resId1 + " " + chainId2 + " " + resId2
                        + " " + " ".join([str(val) for val in magnitude]) + "\n")
    except (KeyboardInterrupt, Exception):
        # Never leave a truncated .distMat behind: it would be mistaken for
        # a finished computation on the next run.
        print("Exception happend computing %s" % outName)
        tryToRemove(outName)
        raise
    return 0
def __init__(self, model, radius=12.0, offset=0):
    """Compute the CA coordination number of every residue.

    A residue's exposure is defined as the number of CA atoms around
    that residues CA atom. The property map built here is keyed by
    ``(chain_id, res_id)`` and stores the residue exposure as value; the
    value is also stored in ``residue.xtra['EXP_CN']``.

    :param model: the model that contains the residues
    :type model: L{Model}

    :param radius: radius of the sphere (centred at the CA atom)
    :type radius: float

    :param offset: number of flanking residues that are ignored in
                   the calculation of the number of neighbors
    :type offset: int
    """
    assert offset >= 0
    peptides = CaPPBuilder().build_peptides(model)
    fs_map = {}
    fs_list = []
    fs_keys = []
    for pp1 in peptides:
        for i, r1 in enumerate(pp1):
            if not (is_aa(r1) and r1.has_id('CA')):
                continue
            ca1 = r1['CA']
            fs = 0
            for pp2 in peptides:
                same_peptide = pp1 is pp2
                for j, r2 in enumerate(pp2):
                    # Ignore the residue itself and its flanking residues
                    if same_peptide and abs(i - j) <= offset:
                        continue
                    if not (is_aa(r2) and r2.has_id('CA')):
                        continue
                    # Atom subtraction yields the inter-atomic distance
                    if (r2['CA'] - ca1) < radius:
                        fs += 1
            key = (r1.get_parent().get_id(), r1.get_id())
            # Fill the 3 data structures
            fs_map[key] = fs
            fs_list.append((r1, fs))
            fs_keys.append(key)
            # Add to xtra
            r1.xtra['EXP_CN'] = fs
    AbstractPropertyMap.__init__(self, fs_map, fs_keys, fs_list)
def __init__(self, model, radius=12.0, offset=0):
    """Initialize.

    A residue's exposure is defined as the number of CA atoms around
    that residues CA atom. The resulting property map uses
    ``(chain_id, res_id)`` as key and the residue exposure as
    corresponding value.

    :param model: the model that contains the residues
    :type model: L{Model}

    :param radius: radius of the sphere (centred at the CA atom)
    :type radius: float

    :param offset: number of flanking residues that are ignored in
                   the calculation of the number of neighbors
    :type offset: int
    """

    def usable(residue):
        # Only amino acids that actually carry a CA atom take part.
        return is_aa(residue) and residue.has_id('CA')

    assert offset >= 0
    builder = CaPPBuilder()
    polypeptides = builder.build_peptides(model)
    exposure_by_key = {}
    exposure_pairs = []
    ordered_keys = []
    for pp1 in polypeptides:
        for i in range(len(pp1)):
            centre = pp1[i]
            if not usable(centre):
                continue
            centre_ca = centre['CA']
            neighbours = 0
            for pp2 in polypeptides:
                for j in range(len(pp2)):
                    if pp1 is pp2 and abs(i - j) <= offset:
                        # flanking residue (or the residue itself)
                        continue
                    other = pp2[j]
                    if usable(other):
                        distance = other['CA'] - centre_ca
                        if distance < radius:
                            neighbours += 1
            res_id = centre.get_id()
            chain_id = centre.get_parent().get_id()
            # Fill the 3 data structures
            exposure_by_key[(chain_id, res_id)] = neighbours
            exposure_pairs.append((centre, neighbours))
            ordered_keys.append((chain_id, res_id))
            # Add to xtra
            centre.xtra['EXP_CN'] = neighbours
    AbstractPropertyMap.__init__(self, exposure_by_key, ordered_keys,
                                 exposure_pairs)
def __init__(self, align, m1, m2):
    """Produces a structural alignment of two models

    Input:
     - align - indexable alignment record: ``align[0]`` and ``align[1]``
       are the two gapped sequences, ``align[3]`` and ``align[4]`` the
       start and end of the aligned region (presumably a ``Bio.pairwise2``
       alignment tuple -- TODO confirm)
     - m1, m2 - two models

    Sets ``self.map12``, ``self.map21`` and ``self.residue_pairs``.

    NOTE(review): the columns are read from index 0 even though the length
    is ``align[4] - align[3]``; this is only consistent when the aligned
    region starts at 0 -- confirm with the caller.
    """
    n_columns = align[4] - align[3]
    seq_a = align[0]
    seq_b = align[1]
    # Residues of both models, in order
    residues_a = Selection.unfold_entities(m1, 'R')
    residues_b = Selection.unfold_entities(m2, 'R')
    idx_a = 0
    idx_b = 0
    map12 = {}
    map21 = {}
    residue_pairs = []
    for col in range(n_columns):
        res_a = None
        if seq_a[col] != "-":
            # skip ahead to the next amino acid of model 1
            while True:
                res_a = residues_a[idx_a]
                idx_a += 1
                if is_aa(res_a):
                    break
            self._test_equivalence(res_a, seq_a[col])

        res_b = None
        if seq_b[col] != "-":
            # skip ahead to the next amino acid of model 2
            while True:
                res_b = residues_b[idx_b]
                idx_b += 1
                if is_aa(res_b):
                    break
            self._test_equivalence(res_b, seq_b[col])

        if res_a:
            map12[res_a] = res_b
        if res_b:
            map21[res_b] = res_a
        residue_pairs.append((res_a, res_b))
    self.map12 = map12
    self.map21 = map21
    self.residue_pairs = residue_pairs
def get_sequence(self, chain_id):
    '''
    Input:
        self: Use Biopython.PDB structure which has been stored in an object variable
        chain_id : String (usually in ['A','B', 'C' ...]. The number of chains
                   depends on the specific protein and the resulting structure)
    Return:
        Return the amino acid sequence (single-letter alphabet!) of a given chain (chain_id)
        in a Biopython.PDB structure as a string. Residues that are not
        standard amino acids are left out.
    '''
    letters = []
    # Every chain of the structure whose id matches contributes residues.
    for chain in self.structure.get_chains():
        if chain.id != chain_id:
            continue
        for res in chain.get_residues():
            name = res.get_resname()
            if is_aa(name, standard=True):
                letters.append(three_to_one(name))
    return "".join(letters)
def from_structure(cls, original, filter_residues):
    """
    Loads structure as a protein, exposing protein-specific methods.

    The children of ``original`` are deep-copied into a new ``cls``
    instance.  When ``filter_residues`` is true, hetero residues and
    residues that are not amino acids are removed, followed by any chain
    left without residues.  ``header`` and ``xtra`` are deep-copied too.
    """
    P = cls(original.id)
    P.full_id = original.full_id

    for child in original.child_dict.values():
        P.add(deepcopy(child))

    if filter_residues:
        # Discriminate non-residues (is_aa function).  Collect first and
        # detach afterwards so no chain is mutated while being iterated.
        remove_list = []
        for model in P:
            for chain in model:
                for residue in chain:
                    if residue.get_id()[0] != ' ' or not is_aa(residue):
                        remove_list.append(residue)

        for residue in remove_list:
            residue.parent.detach_child(residue.id)

        # Remove empty chains.  Detach from the chain's own parent model
        # (not from whichever model the loop above visited last) and
        # iterate over a snapshot because detaching mutates the models.
        for chain in list(P.get_chains()):
            if not len(chain.child_list):
                chain.parent.detach_child(chain.id)

    P.header = deepcopy(original.header)
    P.xtra = deepcopy(original.xtra)

    return P
def get_bfactors(self, chain_id):
    '''
    Input:
        self: Use Biopython.PDB structure which has been stored in an object variable
        chain_id : String (usually in ['A','B', 'C' ...]. The number of chains
                   depends on the specific protein and the resulting structure)
    Return:
        Return the B-Factors for all residues in a chain of a Biopython.PDB structure.
        The B-Factors describe the mobility of an atom or a residue.
        In a Biopython.PDB structure B-Factors are given for each atom in a residue.
        The mean B-Factor of a residue is the average over the B-Factor of
        all its atoms.  The per-residue means are then normalized with
        Standard scores (zero mean, unit variance) and returned as a numpy
        array converted to integer.

        NOTE(review): the exercise text asks for np.nan on unresolved
        residues and nan-aware statistics; this implementation passes the
        means straight to zscore() -- confirm that is sufficient.
    '''
    chain = self.structure[0][chain_id]
    residue_means = []
    for res in chain:
        if not is_aa(res.get_resname()):
            continue
        atom_values = [atom.get_bfactor() for atom in res]
        residue_means.append(np.mean(atom_values))
    normalized = zscore(residue_means)
    return normalized.astype(np.int64)  # return rounded (integer) values
def get_contact_map(self, chain_id):
    '''
    Input:
        self: Use Biopython.PDB structure which has been stored in an object variable
        chain_id : String (usually in ['A','B', 'C' ...]. The number of chains
                   depends on the specific protein and the resulting structure)
    Return:
        Return a complete contact map (see description in exercise sheet)
        for a given chain in a Biopython.PDB structure as numpy array.
        The values in the matrix describe the c-alpha distance between all residues
        in a chain of a Biopython.PDB structure.
        Only integer values of the distance have to be given (see below).
    '''
    # Amino-acid residues of the chain, in order
    amino_acids = [
        res for res in self.structure[0][chain_id].get_residues()
        if is_aa(res)
    ]
    size = len(self.get_sequence(chain_id))
    contact_map = np.zeros((size, size), dtype=np.float32)
    for row in range(size):
        for col in range(size):
            contact_map[row][col] = self.get_ca_distance_list(
                amino_acids, row, col)
    return contact_map.astype(np.int64)  # return rounded (integer) values
def GetResidueDepPDB(pdb, pdbfile):
    """Append residue depth and CA depth of every amino acid to ``OUT``.

    The surface is computed from ``pdbfile``; if that fails a message is
    printed and nothing is written.  Residues whose depth cannot be computed
    are skipped.  One tab-separated line
    ``pdb, chain id, residue number, residue name, depth, CA depth`` is
    written per residue, under ``mutex_writefile``.
    """
    s = GetStructure(pdb)
    model = s[0]
    residuelist = Selection.unfold_entities(model, 'R')
    try:
        surface = get_surface(pdbfile, PDBTOXYZ, MSMS)
    except Exception:
        # Parenthesised single-argument form prints identically on
        # Python 2 and Python 3.
        print("cannot get surface for " + pdbfile)
        return
    lines = []
    for residue in residuelist:
        if not is_aa(residue):
            continue
        resid = residue.get_id()
        resname = residue.get_resname()
        chainid = residue.get_parent().get_id()
        try:
            # average depth over all atoms of the residue
            rd = residue_depth(residue, surface)
        except Exception:
            continue
        ca_rd = ca_depth(residue, surface)
        info = [pdb, chainid, resid[1], resname, str(rd), str(ca_rd)]
        lines.append("\t".join(map(str, info)) + "\n")
    content = "".join(lines)
    # Release the lock and close the file even if the write fails.
    mutex_writefile.acquire()
    try:
        outobj = open(OUT, "a")
        try:
            outobj.write(content)
        finally:
            outobj.close()
    finally:
        mutex_writefile.release()
def is_peptide(self):
    """Check if component comes from a polypeptide.

    Returns None when no structure is attached, the (falsy) ``is_polymer``
    value when the component is not a polymer, and otherwise whether every
    residue name of the structure is an amino acid.
    """
    if not self.structure:
        return None
    polymer = self.is_polymer
    if not polymer:
        return polymer
    return all(
        is_aa(res.get_resname()) for res in self.structure.get_residues()
    )
def getPDBSequence(pdb_name, pdb_path, chain):
    """Read the standard amino acids of one chain from a PDB file.

    :param pdb_name: identifier given to the parsed structure
    :param pdb_path: path of the PDB file
    :param chain: chain id (string) looked up in the first model
    :return: ``(residue_position, residue_name)`` -- residue numbers and
             one-letter codes, in chain order; non-standard residues are
             skipped
    :raises Exception: on any failure while parsing or reading the chain
    """
    logging.info("getPDBSequence pdb " + pdb_name + " cadena " + chain)
    from Bio.PDB.PDBParser import PDBParser
    from Bio.PDB.Polypeptide import three_to_one
    from Bio.PDB.Polypeptide import is_aa
    residue_position = []
    residue_name = list()
    try:
        parser = PDBParser(PERMISSIVE=1)
        structure = parser.get_structure(pdb_name, pdb_path)
        model = structure[0]
        # Keep the `chain` argument a string: it is concatenated into the
        # error messages below, which would fail on a Chain object and hide
        # the real error.
        chain_entity = model[chain]
        for residue in chain_entity:
            if is_aa(residue.get_resname(), standard=True):
                residue_name.append(three_to_one(residue.get_resname()))
                residue_position.append(residue.get_full_id()[3][1])
    except Exception as inst:
        print(inst)
        logging.error(
            "Error no controlado intentando leer la sequencia del pdb " +
            pdb_name + " cadena " + chain + " path " + pdb_path)
        raise Exception("PDB Invalido pdb " + pdb_name + " cadena " + chain +
                        " path " + pdb_path)
    return residue_position, residue_name
def _get_ca_list(chain):
    """Collect residue ids, one-letter sequence and CA atoms of a chain.

    Only residues accepted by ``is_aa`` are considered.  The three returned
    collections are parallel: entry *i* of each describes the same residue.

    :param chain: The structure chain object
    :return: ``(residues, sequence, ca_list)`` where ``residues`` is a list
             of ``(number, insertion_code, RESNAME)`` tuples, ``sequence``
             the one-letter string ('X' for unknown names) and ``ca_list``
             the CA atom of each residue, or None when it has none
    """
    ca_list = []  # [<ca_atom_object or None>, ...]
    residues = []  # [(<index>, <insertion_code>, <3_letter_residue_name>_upper>), ...]
    sequence = ""
    for residue in chain:
        if not is_aa(residue):
            continue
        res_id = residue.get_full_id()[3]
        resname = residue.get_resname()
        residues.append((res_id[1], res_id[2].strip(), resname.upper()))
        sequence += IUPACData.protein_letters_3to1_extended.get(
            resname.capitalize(), 'X')
        try:
            ca_list.append(residue['CA'])
        except KeyError:
            logging.warning("Failed to find CA in residue {}".format(
                residue.get_full_id()))
            # Keep ca_list aligned with residues/sequence: the residue was
            # already recorded there, so a missing CA is stored as None.
            ca_list.append(None)
    return residues, sequence, ca_list
def residues_map(self, selected_chain=None, standard_aa=True):
    """Map sequential indices to residue ids for each chain.

    Returns ``{chain_id: {index: residue.id}}`` where the index counts the
    amino-acid residues of the chain from 0.  With ``selected_chain`` set,
    only chains with that id are included; ``standard_aa`` is forwarded to
    ``is_aa`` as its ``standard`` flag.
    """
    mapping = {}
    for chain in self.struct.get_chains():
        if selected_chain and selected_chain != chain.id:
            continue
        per_chain = {}
        index = 0
        for residue in chain.get_residues():
            if is_aa(residue, standard=standard_aa):
                per_chain[index] = residue.id
                index += 1
        mapping[chain.id] = per_chain
    return mapping
def CalculateExchangeRates(ProteinModel, Temperature, pH, ReferenceData):
    """Store per-residue intrinsic exchange rates in the atom b-factors.

    For every chain, a one-letter sequence is built ("?" for anything that
    is not a standard amino acid) and passed to
    ``CalculateExchangeRatesForASingleChain``.  The rate returned for
    residue *i* is written as b-factor to all atoms of residue *i*.
    The model is modified in place.
    """
    for Chain in ProteinModel:
        # One sequence symbol per residue of the chain
        Sequence = []
        for Residue in Chain:
            ResidueName = Residue.get_resname()
            if is_aa(ResidueName, standard=True):
                Symbol = three_to_one(ResidueName)
            else:
                Symbol = "?"
            Sequence.append(Symbol)

        if not Sequence:
            # Nothing to estimate for an empty chain
            continue

        # Estimate intrinsic exchange rates
        Rates = CalculateExchangeRatesForASingleChain(
            Sequence, Temperature, pH, ReferenceData)

        # Assign intrinsic exchange rates as b-factors
        for Index, Residue in enumerate(Chain):
            Rate = Rates[Index]
            for Atom in Residue:
                Atom.set_bfactor(Rate)
    return
def __init__(self, model, pdb_file=None):
    """Compute residue depth and CA depth for every amino acid in model.

    ``pdb_file`` is accepted for backward compatibility only; passing it
    triggers a deprecation warning.  The results are stored in the property
    map keyed by ``(chain_id, res_id)`` and in ``residue.xtra`` under
    'EXP_RD' and 'EXP_RD_CA'.
    """
    if pdb_file is not None:
        # The argument is no longer used
        warnings.warn(
            "ResidueDepth no longer requires a pdb file. "
            "This argument will be removed in a future release "
            "of Biopython.",
            BiopythonDeprecationWarning)

    by_key = {}
    pairs = []
    keys = []

    all_residues = Selection.unfold_entities(model, 'R')
    # Molecular surface of the model (computed with MSMS)
    surface = get_surface(model)

    for residue in all_residues:
        if is_aa(residue):
            depths = (residue_depth(residue, surface),
                      ca_depth(residue, surface))
            key = (residue.get_parent().get_id(), residue.get_id())
            by_key[key] = depths
            pairs.append((residue, depths))
            keys.append(key)
            # Update xtra information
            residue.xtra['EXP_RD'] = depths[0]
            residue.xtra['EXP_RD_CA'] = depths[1]
    AbstractPropertyMap.__init__(self, by_key, keys, pairs)
def splitOnePDB(fname, outPath):
    """Split a PDB file into ligand/receptor chain files.

    Chains whose effective length (residues containing "CA" minus residues
    that are not standard amino acids) falls outside
    ``[MIN_SEQ_LEN, MAX_SEQ_LEN]`` are discarded.  For each remaining chain
    taken as ligand, the other chains form the receptor; pairs are written
    to ``<prefix>-<i>_l_u.pdb`` and ``<prefix>-<i>_r_u.pdb`` in ``outPath``.
    With exactly two chains the mirrored pair is not emitted a second time.

    Returns 0 when the file cannot be loaded, the model is unusable or fewer
    than two chains remain; returns None after writing.
    """

    def writeChains(structName, chains, outFile):
        # Wrap the chains in a fresh single-model structure and save it.
        writer = PDBIO()
        newStruct = Structure(structName)
        newStruct.add(Model(0))
        for oneChain in chains:
            oneChain.set_parent(newStruct[0])
            newStruct[0].add(oneChain)
        writer.set_structure(newStruct)
        writer.save(outFile)

    try:
        s = parser.get_structure(fname, fname)
    except Exception:
        print("Error loading pdb")
        return 0

    # Ids of the chains that are too short or too long
    banLenChains = []
    try:
        for chain in s[0]:
            nBad = sum(1 for res in chain.get_list()
                       if not is_aa(res, standard=True))
            nWithCA = sum(1 for res in chain if "CA" in res)
            chainLen = nWithCA - nBad
            if chainLen < MIN_SEQ_LEN or chainLen > MAX_SEQ_LEN:
                print(chainLen)
                banLenChains.append(chain.get_id())
    except KeyError:
        print("Not good model")
        return 0

    for badChainId in banLenChains:
        s[0].detach_child(badChainId)

    if len(s[0].get_list()) < 2:
        print(s)
        print(s[0].get_list())
        print("Not enough good chains")
        return 0

    ligandChainList = []
    receptorChainList = []
    for chain1 in s[0]:
        others = [chain2 for chain2 in s[0] if chain1 != chain2]
        # With a single partner, skip the pair if that partner was already
        # used as ligand (it would be the same pair, mirrored).
        if len(others) > 1 or not others[0] in ligandChainList:
            ligandChainList.append(chain1)
            receptorChainList.append(others)

    prefix = os.path.basename(fname).split(".")[0]
    pairs = zip(ligandChainList, receptorChainList)
    for i, (ligandChain, receptorChains) in enumerate(pairs):
        writeChains(prefix + "ligand", [ligandChain],
                    os.path.join(outPath, prefix + "-" + str(i) + "_l_u.pdb"))
        writeChains(prefix + "receptor", receptorChains,
                    os.path.join(outPath, prefix + "-" + str(i) + "_r_u.pdb"))
        print("ligand:", ligandChain, "receptor:", receptorChains)
def __init__(self, model, pdb_file=None):
    """Build the residue-depth property map for ``model``.

    ``pdb_file`` is deprecated and unused apart from triggering a warning
    when given.  For each amino-acid residue the residue depth and the CA
    depth are computed against the model surface and stored both in the
    map (key ``(chain_id, res_id)``) and in ``residue.xtra``.
    """
    # Issue warning if pdb_file is given
    if pdb_file is not None:
        message = ("ResidueDepth no longer requires a pdb file."
                   " This argument will be removed in a future release"
                   " of Biopython.")
        warnings.warn(message, BiopythonDeprecationWarning)

    depth_dict, depth_list, depth_keys = {}, [], []

    # make surface from the model using MSMS
    surface = get_surface(model)

    # calculate rdepth for each amino-acid residue
    for residue in Selection.unfold_entities(model, 'R'):
        if not is_aa(residue):
            continue
        rd = residue_depth(residue, surface)
        ca_rd = ca_depth(residue, surface)
        entry_key = (residue.get_parent().get_id(), residue.get_id())
        entry_value = (rd, ca_rd)
        depth_dict[entry_key] = entry_value
        depth_list.append((residue, entry_value))
        depth_keys.append(entry_key)
        # Update xtra information
        residue.xtra['EXP_RD'] = rd
        residue.xtra['EXP_RD_CA'] = ca_rd
    AbstractPropertyMap.__init__(self, depth_dict, depth_keys, depth_list)
def get_pdb_sequence(input_pdb_file, chain_id, mapping_output=False, with_gaps=False):
    """Gets the PDB sequence of one chain of the first model.

    :param input_pdb_file: path of the PDB file
    :param chain_id: id of the chain to read
    :param mapping_output: if true, return the ``{residue number: letter}``
                           dictionary instead of the sequence string
    :param with_gaps: if true, residue numbers missing between the first
                      and last mapped residue are filled with '-'
    :return: dictionary or string, see ``mapping_output``; empty when the
             chain has no usable residue
    """
    mapping = {}
    pdb_parser = PDBParser(PERMISSIVE=True, QUIET=True)
    structure = pdb_parser.get_structure(input_pdb_file, input_pdb_file)
    chain = structure[0][chain_id]
    for res in chain:
        # Keep amino acids that have a CA atom and no insertion code
        # (res.id[2] is the insertion code field).
        if "CA" in res.child_dict and is_aa(res) and res.id[2] == ' ':
            try:
                mapping[res.id[1]] = three_to_one(res.get_resname())
            except KeyError:
                # Ignore non standard residues such as HIC, MSE, etc.
                pass

    # Guard against an empty mapping: there is no first/last residue number
    # to span, so there are no gaps to add.
    if with_gaps and mapping:
        # Add missing gap residues by their residue number
        for number in range(min(mapping), max(mapping) + 1):
            mapping.setdefault(number, '-')

    if mapping_output:
        return mapping
    return ''.join([mapping[k] for k in sorted(mapping)])
def CalculateHydrogenBonds(ProteinModel, Filename, EnergyCutoff, PathToDSSP):
    """Count backbone hydrogen bonds per residue using DSSP.

    DSSP is run on ``ProteinModel``/``Filename`` with the executable at
    ``PathToDSSP``.  For each standard amino acid, the energies at positions
    7 and 11 of its DSSP entry are compared with ``EnergyCutoff``; each one
    below the cutoff counts as one hydrogen bond.  (In Biopython's DSSP
    tuple these positions are documented as the NH-->O_1 and NH-->O_2
    energies -- verify against the installed version.)

    Residues without a usable DSSP entry are reported on stderr and keep a
    count of 0.  The total is written to stdout.

    :return: dict mapping ``(Chain, Residue)`` to the number of bonds
    """
    # Run DSSP algorithm
    DSSPOutput = DSSP(ProteinModel, Filename, dssp=PathToDSSP)

    HydrogenBonds = {}
    TotalNumberOfHydrogenBonds = 0
    for Chain in ProteinModel:
        ChainID = Chain.get_id()
        for Residue in Chain:
            ResidueID = Residue.get_id()
            if not is_aa(Residue.get_resname(), standard=True):
                continue
            HydrogenBonds[Chain, Residue] = 0
            try:
                DSSPEntry = DSSPOutput[(ChainID, ResidueID)]
                Energies = (float(DSSPEntry[7]), float(DSSPEntry[11]))
            except Exception:
                # `except Exception` (not a bare except) so that
                # KeyboardInterrupt/SystemExit still propagate.
                sys.stderr.write("No DSSP entry generated for amino acid residue " + str(ResidueID[1]) + ". Ignoring the residue. \n")
                continue
            # Count only after both energies were read, so a residue is
            # never half-counted and then reported as ignored.
            for Energy in Energies:
                if Energy < EnergyCutoff:
                    HydrogenBonds[Chain, Residue] += 1
                    TotalNumberOfHydrogenBonds += 1
    sys.stdout.write(str(TotalNumberOfHydrogenBonds) + " backbone N-O hydrogen bonds.\n")
    return HydrogenBonds
def __init__(self, model, msms_exec=None):
    """Initialize the class.

    Computes, for every amino-acid residue of ``model``, its residue depth
    and CA depth relative to the surface produced by ``get_surface``.
    ``msms_exec`` is the MSMS executable handed to ``get_surface``
    ("msms" when omitted).
    """
    executable = "msms" if msms_exec is None else msms_exec

    # make surface from the model using MSMS
    surface = get_surface(model, MSMS=executable)

    depth_dict = {}
    depth_list = []
    depth_keys = []
    amino_acids = [
        res for res in Selection.unfold_entities(model, "R") if is_aa(res)
    ]
    for residue in amino_acids:
        rd = residue_depth(residue, surface)
        ca_rd = ca_depth(residue, surface)
        # Key: (chain id, residue id)
        key = (residue.get_parent().get_id(), residue.get_id())
        depth_dict[key] = (rd, ca_rd)
        depth_list.append((residue, (rd, ca_rd)))
        depth_keys.append(key)
        # Update xtra information
        residue.xtra["EXP_RD"] = rd
        residue.xtra["EXP_RD_CA"] = ca_rd
    AbstractPropertyMap.__init__(self, depth_dict, depth_keys, depth_list)
def create_sequence(self, pdb_code, pdb_path):
    """Store one Bioentry/Biosequence per chain of the first model.

    For each chain with at least one standard amino acid, the sequence is
    built with ``seq1`` and saved under the identifier
    ``<pdb_code>_<chain id>_<first residue number>_<last residue number>``
    unless a Bioentry with that identifier already exists in ``self.biodb``.
    """
    pdb = PDB.objects.get(code=pdb_code)
    parser = PDBParser(PERMISSIVE=1, QUIET=1)
    struct = parser.get_structure(pdb_code, pdb_path)
    for chain in struct[0].get_chains():
        # Residues with alternative locations are kept as parsed.
        residues = [
            res for res in chain.get_residues() if is_aa(res, standard=True)
        ]
        if not residues:
            continue
        seq = "".join([seq1(res.resname) for res in residues])
        first_number = str(residues[0].id[1])
        last_number = str(residues[-1].id[1])
        seqid = "_".join([pdb_code, chain.id, first_number, last_number])
        known = Bioentry.objects.filter(biodatabase=self.biodb,
                                        identifier=seqid).exists()
        if known:
            continue
        be = Bioentry(biodatabase=self.biodb, accession=seqid,
                      identifier=seqid, name=pdb.code)
        be.save()
        Biosequence(bioentry=be, seq=seq, length=len(seq)).save()
def Extract_coordinates_from_PDB(self, PDB_file, type):
    '''Return the alpha-carbon coordinates of the standard amino acids.

    The file is parsed with PDBParser; if that raises, MMCIFParser is
    tried instead.

    type == 'models': one list of CA coordinates per model
    type == 'chains': one list of CA coordinates per chain (all models)
    type == 'all':    a single flat list over all chains
    Any other value returns None.
    '''
    from Bio.PDB.PDBParser import PDBParser
    from Bio.PDB import MMCIFParser

    def ca_coordinates(residues):
        # CA coordinates of the standard amino acids among `residues`.
        return [
            residue['CA'].get_coord() for residue in residues
            if is_aa(residue.get_resname(), standard=True)
        ]

    Name = ntpath.basename(PDB_file).split('.')[0]
    try:
        # Use the parser imported above rather than a module-level `PDB`
        # name; `except Exception` keeps the mmCIF fallback without
        # swallowing KeyboardInterrupt/SystemExit.
        structure = PDBParser().get_structure('%s' % (Name), PDB_file)
    except Exception:
        structure = MMCIFParser().get_structure('%s' % (Name), PDB_file)

    # Iterate over residues so that all of them are extracted even if there
    # is more than one chain.
    if type == 'models':
        return [ca_coordinates(model.get_residues()) for model in structure]
    elif type == 'chains':
        return [
            ca_coordinates(chain) for model in structure for chain in model
        ]
    elif type == 'all':
        return ca_coordinates(structure.get_residues())
def get_sequence(self, chain_id):
    """Return the one-letter sequence of a chain of the first model.

    Every residue accepted by ``is_aa`` is converted with ``three_to_one``.

    NOTE(review): ``is_aa`` is called without ``standard=True``, so a
    non-standard residue may reach ``three_to_one`` -- confirm how that
    should be handled.
    """
    chain = self.structure[0][chain_id]
    letters = []
    for residue in chain:
        if is_aa(residue):
            letters.append(three_to_one(residue.get_resname()))
    return ''.join(letters)
def get_pdb_sequence(structure):
    """
    Retrieves the AA sequence from a PDB structure.

    Returns a list of ``(residue number, one-letter code)`` tuples, one per
    amino-acid residue; names missing from ``aa3to1`` are reported as 'X'.
    """
    seq = []
    for residue in structure.get_residues():
        if not is_aa(residue):
            continue
        letter = aa3to1.get(residue.resname, 'X')
        seq.append((residue.id[1], letter))
    return seq
def getSequenceStructure(s):
    """Return the one-letter sequence of all residues of ``s``.

    Standard amino acids are converted with ``three_to_one``; every other
    residue is written as "G".
    """
    letters = []
    for residue in s.get_residues():
        name = residue.get_resname()
        letters.append(
            three_to_one(name) if is_aa(name, standard=True) else "G")
    return "".join(letters)
def get_pdb_sequence_with_chains(structure):
    """
    Retrieves the AA sequence from a PDB structure.

    Returns a list with one tuple per amino-acid residue that has a CA
    atom: ``(residue number, one-letter code, chain id, hetero flag,
    insertion code)``.  Names missing from ``aa3to1`` are reported as 'X'.
    """
    seq = []
    for residue in structure.get_residues():
        if is_aa(residue) and residue.has_id('CA'):
            het_flag, number, icode = residue.id[0], residue.id[1], residue.id[2]
            seq.append((
                number,
                aa3to1.get(residue.resname, 'X'),
                residue.get_parent().get_id(),
                het_flag,
                icode,
            ))
    return seq
def getSequenceFromChain(self, modelID, chainID):
    """Return the sequence of one chain as a ``Seq``.

    The kind of chain is decided from the length of the first residue name:
    1 letter is treated as RNA, 2 letters as DNA, 3 letters as amino acids
    (or hetero residues when no standard amino acid is found).  Unknown
    residues become "X"; leading and trailing "X" are stripped.  An empty
    ``Seq`` is returned when nothing is left.
    """
    self.checkRead()
    seq = list()
    for model in self.structure:
        if model.id != modelID:
            continue
        for chain in model:
            if str(chain.id) != chainID:
                continue
            nameLength = len(chain.get_unpacked_list()[0].resname)
            if nameLength == 1:
                print("Your sequence is a nucleotide sequence (RNA)\n")
                for residue in chain:
                    # Keep only the standard RNA bases
                    if residue.get_resname() in ['A', 'C', 'G', 'U']:
                        seq.append(residue.get_resname())
                    else:
                        seq.append("X")
            elif nameLength == 2:
                print("Your sequence is a nucleotide sequence (DNA)\n")
                for residue in chain:
                    # Keep only the standard DNA bases (second character
                    # of the residue name)
                    if residue.get_resname()[1] in ['A', 'C', 'G', 'T']:
                        seq.append(residue.get_resname()[1])
                    else:
                        seq.append("X")
            elif nameLength == 3:
                counter = 0
                for residue in chain:
                    # Only the 20 standard amino acids are converted; some
                    # proteins use "UNK", "XXX" or other symbols for
                    # missing or unknown residues.
                    if is_aa(residue.get_resname(), standard=True):
                        seq.append(three_to_one(residue.get_resname()))
                        counter += 1
                    else:
                        seq.append("X")
                if counter != 0:  # aminoacids
                    print("Your sequence is an aminoacid sequence")
                else:  # HETAM
                    print("Your sequence is a HETAM sequence")
                    for residue in chain:
                        seq.append(residue.get_resname())
    # Strip unknown residues from both ends.  The `seq and` guard stops at
    # an empty list instead of raising IndexError (no matching chain, or a
    # chain made only of unknown residues).
    while seq and seq[-1] == "X":
        del seq[-1]
    while seq and seq[0] == "X":
        del seq[0]
    return Seq(str(''.join(seq)))
def ExtractPDBSeq(residues):
    """Return the one-letter sequence and the residues it was built from.

    A residue is kept when it is a standard amino acid and its upper-cased
    name is in ``ValidAA3Letters``.

    :return: ``(pdbseq, residueList)``
    """
    residueList = []
    for residue in residues:
        if not is_aa(residue, standard=True):
            continue
        if residue.get_resname().upper() in ValidAA3Letters:
            residueList.append(residue)
    letters = [three_to_one(residue.get_resname()) for residue in residueList]
    pdbseq = ''.join(letters)
    return pdbseq, residueList
def retrieveAtomicStructure(pdb_sequence):
    """Retrieves the atomic structure for a single chain of a PDB file, based on the measured structure.

    ``pdb_sequence`` must provide the keys 'pdb_id' and 'chain_id'.  The
    result maps the residue number of each amino acid of that chain (first
    model) to its residue name.
    """
    pdb_structure = retrieveStructureFromPDB(pdb_sequence['pdb_id'])
    chain = pdb_structure[0][pdb_sequence['chain_id']]
    names_by_number = {}
    for residue in chain.get_residues():
        if is_aa(residue):
            # A later residue with the same number overwrites an earlier one
            names_by_number[residue.get_id()[1]] = residue.get_resname()
    return names_by_number
def parse_structure(path):
    """
    Parses a PDB formatted structure using Biopython's PDB Parser.

    Disordered atoms are replaced by their selected alternative, water and
    hetero residues are removed, and any remaining residue that is not a
    standard amino acid raises ValueError.  Gaps are reported on stderr
    when the number of peptides differs from the number of chain ids.

    Returns ``(structure, n_chains, n_res)``; ``n_res`` is the residue
    count taken before water/hetero residues are removed.
    """
    print('[+] Reading structure file: {0}'.format(path))
    fname = os.path.basename(path)
    # File name without its last extension
    sname = '.'.join(fname.split('.')[:-1])

    try:
        s = P.get_structure(sname, path)
    except Exception as e:
        print("[!] Structure '{0}' could not be parsed".format(sname),
              file=sys.stderr)
        raise Exception(e)

    # Double occupancy check: keep only the selected alternative location
    for atom in list(s.get_atoms()):
        if not atom.is_disordered():
            continue
        residue = atom.parent
        chosen = atom.selected_child
        chosen.altloc = ' '
        chosen.disordered_flag = 0
        residue.detach_child(atom.id)
        residue.add(chosen)

    # Remove HETATMs and solvent
    res_list = list(s.get_residues())
    n_res = len(res_list)
    for res in res_list:
        flag = res.id[0][0]
        if flag == 'W' or flag == 'H':
            res.parent.detach_child(res.id)
        elif not is_aa(res, standard=True):
            raise ValueError(
                'Unsupported non-standard amino acid found: {0}'.format(
                    res.resname))

    # Detect gaps and compare with no. of chains
    peptides = PPBuilder().build_peptides(s)
    n_chains = len({c.id for c in s.get_chains()})

    if len(peptides) != n_chains:
        print('[!] Structure contains gaps:', file=sys.stderr)
        for i_pp, pp in enumerate(peptides):
            print(
                '\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > {2.parent.id} {2.resname}{2.id[1]}'
                .format(i_pp, pp[0], pp[-1]),
                file=sys.stderr)
        # Gaps are only reported; the calculation proceeds.

    return (s, n_chains, n_res)
def _extract_residue(self, line):
    """Parse residue number, residue name and hetero flag from a line.

    The number is the first token of columns 27-31 and the name the first
    token of columns 20-23 (0-based slices).  The flag is " " for amino
    acids, "W" for HOH/WAT and "H" for anything else.
    """
    resseq = int(line[27:32].split()[0])
    resname = line[20:24].split()[0]
    if is_aa(resname):
        return resseq, resname, " "
    if resname in ("HOH", "WAT"):
        return resseq, resname, "W"
    return resseq, resname, "H"
def annotate(m, ss_seq):
    """Apply secondary structure information to residues in model.

    Only the first chain of ``m`` is used.  Entry *i* of ``ss_seq`` is
    stored in ``xtra["SS_PSEA"]`` of its *i*-th amino-acid residue.

    Raises ValueError when the number of amino acids differs from
    ``len(ss_seq)``.
    """
    first_chain = m.get_list()[0]
    # Drop HOH etc.
    residues = [res for res in first_chain.get_list() if is_aa(res)]
    n_residues = len(residues)
    if n_residues != len(ss_seq):
        raise ValueError("Length mismatch %i %i" % (n_residues, len(ss_seq)))
    for index in range(n_residues):
        residues[index].xtra["SS_PSEA"] = ss_seq[index]
def getResidueStrings(structure):
    """Return one one-letter sequence string per chain of every model.

    Standard amino acids are converted with ``three_to_one``; the
    protonation/disulfide variants HIE/HID map to 'H' and CYX/CYM to 'C';
    anything else becomes 'X'.

    :return: list of strings, in model order and then chain order
    """
    seqs = []
    for model in structure:
        for ch in model.get_chains():
            letters = []
            # Iterate the residues of this chain only; iterating the whole
            # model here would give every chain the full model sequence.
            for residue in ch.get_residues():
                resname = residue.get_resname()
                if is_aa(resname, standard=True):
                    letters.append(three_to_one(resname))
                elif resname in {'HIE', 'HID'}:
                    letters.append('H')
                elif resname in {'CYX', 'CYM'}:
                    letters.append('C')
                else:
                    letters.append('X')
            seqs.append(''.join(letters))
    return seqs
def __init__(self, model, pdb_file):
    """Compute residue and CA depth for the amino acids of ``model``.

    The surface is obtained from ``pdb_file`` via ``get_surface``.  Results
    are stored in the property map under ``(chain_id, res_id)`` and in
    ``residue.xtra`` under "EXP_RD" and "EXP_RD_CA".
    """
    depth_dict = {}
    depth_list = []
    depth_keys = []
    residue_list = Selection.unfold_entities(model, "R")
    # make surface from PDB file
    surface = get_surface(pdb_file)
    for residue in residue_list:
        if is_aa(residue):
            # Depth of the whole residue and of its CA atom
            both_depths = (residue_depth(residue, surface),
                           ca_depth(residue, surface))
            res_id = residue.get_id()
            chain_id = residue.get_parent().get_id()
            depth_dict[(chain_id, res_id)] = both_depths
            depth_list.append((residue, both_depths))
            depth_keys.append((chain_id, res_id))
            # Update xtra information
            residue.xtra["EXP_RD"], residue.xtra["EXP_RD_CA"] = both_depths
    AbstractPropertyMap.__init__(self, depth_dict, depth_keys, depth_list)
def get_residue_depth(pdb_fh,msms_fh):
    """
    Extracts Residue depth from PDB structure

    If the MSMS vertex file for this structure does not exist yet next to
    the MSMS executable, it is generated by running ``pdb_to_xyzr`` and
    ``msms`` through the shell; the commands and their output are appended
    to ``<msms_fh>.log``.

    Only residues of chain "A" are reported, and the scan stops at the
    first amino acid residue that belongs to another chain.

    :param pdb_fh: path to PDB structure file
    :param msms_fh: path to MSMS libraries
    :returns data_depth: pandas table with residue depth per residue,
        indexed by residue number ("aasi"), with the columns
        "Residue depth" and "Residue (C-alpha) depth"
    """
    from Bio.PDB import Selection,PDBParser
    from Bio.PDB.Polypeptide import is_aa
    from Bio.PDB.ResidueDepth import _read_vertex_array,residue_depth,ca_depth

    surface_fh="%s/%s.msms.vert" % (dirname(msms_fh),basename(pdb_fh))
    if not exists(surface_fh):
        pdb_to_xyzr_fh="%s/pdb_to_xyzr" % dirname(msms_fh)
        xyzr_fh="%s/%s.xyzr" % (dirname(msms_fh),basename(pdb_fh))
        pdb_to_xyzr_com="%s %s > %s" % (pdb_to_xyzr_fh,pdb_fh,xyzr_fh)
        msms_com="%s -probe_radius 1.5 -if %s -of %s > %s.log" % (msms_fh,xyzr_fh,splitext(surface_fh)[0],splitext(surface_fh)[0])
        log_fh="%s.log" % msms_fh
        # NOTE(review): the paths are interpolated unquoted into a shell
        # command line; callers must not pass untrusted or space-containing
        # paths.
        with open(log_fh,'a') as log_f:
            log_f.write("%s;\n%s\n" % (pdb_to_xyzr_com,msms_com))
            # Flush so the command header lands in the log before the child
            # process writes to the same file descriptor.
            log_f.flush()
            subprocess.call("%s;%s" % (pdb_to_xyzr_com,msms_com) , shell=True,stdout=log_f, stderr=subprocess.STDOUT)

    surface =_read_vertex_array(surface_fh)

    pdb_parser=PDBParser()
    pdb_data=pdb_parser.get_structure("pdb_name",pdb_fh)
    model = pdb_data[0]
    residue_list = Selection.unfold_entities(model, 'R')

    depth_dict = {}
    for residue in residue_list:
        if not is_aa(residue):
            continue
        rd = residue_depth(residue, surface)
        ca_rd = ca_depth(residue, surface)
        # Get the key
        res_id = residue.get_id()
        chain_id = residue.get_parent().get_id()
        if chain_id=="A":
            depth_dict[(chain_id, res_id)] = (rd, ca_rd)
            # Update xtra information
            residue.xtra['EXP_RD'] = rd
            residue.xtra['EXP_RD_CA'] = ca_rd
        else:
            break

    # Rows are keyed by (chain_id, res_id); after reset_index the chain id
    # is in "level_0" and the residue id tuple in "level_1".
    depth_df=pd.DataFrame(depth_dict).T.reset_index()
    depth_df=depth_df.drop("level_0",axis=1)

    # Copy the residue number (second element of the residue id) into
    # "aasi", but only when it differs from the previous row's number; rows
    # repeating the previous number keep NaN and are dropped below.
    aasi_prev=0
    for i in range(len(depth_df)):
        if depth_df.loc[i,"level_1"][1]!=aasi_prev:
            depth_df.loc[i,"aasi"]=depth_df.loc[i,"level_1"][1]
            aasi_prev=depth_df.loc[i,"level_1"][1]

    depth_df=depth_df.drop("level_1",axis=1)
    depth_df=depth_df.loc[~pd.isnull(depth_df.loc[:,"aasi"]),:]
    depth_df=depth_df.set_index("aasi",drop=True)
    depth_df.columns=["Residue depth","Residue (C-alpha) depth"]
    return depth_df
def parse_structure(path):
    """
    Read a PDB or mmCIF file with Biopython and clean it for the calculation.

    The parser is chosen from the file extension ('pdb'/'ent' or 'cif').
    The structure is then reduced in place: only the first model is kept,
    disordered atoms are collapsed to their selected location, residues
    flagged as water ('W') or hetero ('H') are removed, and atoms whose
    element is 'H' are removed. Gaps are reported on stderr only.

    Returns the tuple ``(structure, n_chains, n_res)``; ``n_res`` is counted
    after HETATM/solvent removal.

    Raises IOError for an unsupported extension, Exception when parsing
    fails, and ValueError when a non-standard amino acid remains.
    """
    print('[+] Reading structure file: {0}'.format(path))
    fname = os.path.basename(path)
    # Everything before the last dot names the structure; the rest is the
    # extension used to pick a parser.
    sname, _, s_ext = fname.rpartition('.')

    if s_ext in ('pdb', 'ent'):
        sparser = PDBParser(QUIET=1)
    elif s_ext == 'cif':
        sparser = MMCIFParser()
    else:
        raise IOError('[!] Structure format \'{0}\' is not supported. Use \'.pdb\' or \'.cif\'.'.format(s_ext))

    try:
        s = sparser.get_structure(sname, path)
    except Exception as e:
        print('[!] Structure \'{0}\' could not be parsed'.format(sname), file=sys.stderr)
        raise Exception(e)

    # Keep first model only
    if len(s) > 1:
        print('[!] Structure contains more than one model. Only the first one will be kept')
        model_one = s[0].id
        surplus_ids = [m.id for m in s.child_list if m.id != model_one]
        for model_id in surplus_ids:
            s.detach_child(model_id)

    # Double occupancy check
    for atom in list(s.get_atoms()):
        if not atom.is_disordered():
            continue
        owner = atom.parent
        kept = atom.selected_child
        kept.altloc = ' '
        kept.disordered_flag = 0
        owner.detach_child(atom.id)
        owner.add(kept)

    # Remove HETATMs and solvent
    for res in list(s.get_residues()):
        het_flag = res.id[0][0]
        if het_flag == 'W' or het_flag == 'H':
            res.parent.detach_child(res.id)
        elif not is_aa(res, standard=True):
            raise ValueError('Unsupported non-standard amino acid found: {0}'.format(res.resname))
    n_res = len(list(s.get_residues()))

    # Remove Hydrogens
    for atom in list(s.get_atoms()):
        if atom.element == 'H':
            atom.parent.detach_child(atom.name)

    # Detect gaps and compare with no. of chains
    peptides = PPBuilder().build_peptides(s)
    n_chains = len({c.id for c in s.get_chains()})

    if len(peptides) != n_chains:
        # Reported as a warning; the calculation is allowed to continue.
        print('[!] Structure contains gaps:', file=sys.stderr)
        for i_pp, pp in enumerate(peptides):
            print('\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > {2.parent.id} {2.resname}{2.id[1]}'.format(i_pp, pp[0], pp[-1]), file=sys.stderr)

    return (s, n_chains, n_res)
def validate_structure(s, selection=None, clean=True):
    """
    Clean a parsed structure in place and warn about chain gaps.

    Only the first model is kept. When ``selection`` is given, chains that
    are not selected are removed. Disordered atoms are collapsed to their
    selected location. With ``clean`` enabled, residues flagged as water
    ('W') or hetero ('H') and atoms whose element is 'H' are removed.

    :param s: structure object to validate (modified in place).
    :param selection: optional iterable of strings, each holding one or
        more comma-separated chain ids to keep.
    :param clean: remove HETATMs, solvent and hydrogens when true.
    :returns: the same structure object ``s``.
    :raises ValueError: if a selected chain is not in the structure, or if
        ``clean`` is set and a non-standard amino acid is found.
    """
    # setup logging
    logger = logging.getLogger('Prodigy')

    # Keep first model only
    if len(s) > 1:
        logger.warning('[!] Structure contains more than one model. Only the first one will be kept')
        model_one = s[0].id
        for m in s.child_list[:]:
            if m.id != model_one:
                s.detach_child(m.id)

    # process selected chains
    chains = list(s.get_chains())
    chain_ids = set([c.id for c in chains])

    if selection:
        sel_chains = []
        # Match selected chain with structure
        for sel in selection:
            for c in sel.split(','):
                sel_chains.append(c)
                if c not in chain_ids:
                    raise ValueError('Selected chain not present in provided structure: {0}'.format(c))

        # Remove unselected chains
        for c in chains:
            if c.id not in sel_chains:
                c.parent.detach_child(c.id)

    # Double occupancy check
    for atom in list(s.get_atoms()):
        if atom.is_disordered():
            residue = atom.parent
            sel_at = atom.selected_child
            sel_at.altloc = ' '
            sel_at.disordered_flag = 0
            residue.detach_child(atom.id)
            residue.add(sel_at)

    if clean:
        # Remove HETATMs and solvent
        for res in list(s.get_residues()):
            het_flag = res.id[0][0]
            if het_flag == 'W' or het_flag == 'H':
                res.parent.detach_child(res.id)
            elif not is_aa(res, standard=True):
                raise ValueError('Unsupported non-standard amino acid found: {0}'.format(res.resname))

        # Remove Hydrogens
        for atom in list(s.get_atoms()):
            if atom.element == 'H':
                atom.parent.detach_child(atom.name)

    # Detect gaps and compare with no. of chains. The chain ids are
    # collected again here because unselected chains may have been removed
    # above; comparing against the pre-selection set would report gaps that
    # do not exist (or hide real ones).
    pep_builder = PPBuilder()
    peptides = pep_builder.build_peptides(s)
    n_peptides = len(peptides)
    remaining_chain_ids = set(c.id for c in s.get_chains())

    if n_peptides != len(remaining_chain_ids):
        message = '[!] Structure contains gaps:\n'
        for i_pp, pp in enumerate(peptides):
            message += '\t{1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > ' \
                       '{2.parent.id} {2.resname}{2.id[1]}\n'.format(i_pp, pp[0], pp[-1])
        # Gaps are only logged; they do not abort validation.
        logger.warning(message)

    return s
def __init__(self, model, radius, offset=0, hse_up_key='HSE_U', hse_down_key='HSE_D', angle_key=None, check_chain_breaks=False, check_knots=False, receptor=None, signprot=None):
    """
    Calculate half-sphere exposure for the residues of a model and, in the
    same pass, collect atom clashes, optional knot candidates and optional
    chain breaks.

    Results are stored on the instance as C{clash_pairs}, C{chain_breaks}
    and (only when C{check_knots} is set) C{remodel_resis}; per-residue
    HSE values are written to each residue's C{xtra} dictionary.

    @param model: model
    @type model: L{Model}

    @param radius: HSE radius
    @type radius: float

    @param offset: number of flanking residues that are ignored in the calculation of the number of neighbors
    @type offset: int

    @param hse_up_key: key used to store HSEup in the entity.xtra attribute
    @type hse_up_key: string

    @param hse_down_key: key used to store HSEdown in the entity.xtra attribute
    @type hse_down_key: string

    @param angle_key: key used to store the angle between CA-CB and CA-pCB in the entity.xtra attribute
    @type angle_key: string

    @param check_chain_breaks: when True, residue numbers of amino acids that
        are in the model but in none of the built peptides are appended to
        C{self.chain_breaks}
    @type check_chain_breaks: bool

    @param check_knots: when true, knot candidates from
        C{PossibleKnots(receptor, signprot)} are compared with each residue's
        upper half sphere and matching ranges are stored in C{self.remodel_resis}
    @type check_knots: bool

    @param receptor: passed through to C{PossibleKnots}; only used when C{check_knots} is set

    @param signprot: passed through to C{PossibleKnots}; only used when C{check_knots} is set
    """
    # NOTE(review): the nesting below was reconstructed from the statement
    # order; confirm the extent of each try/except body against the
    # authoritative copy of this file.
    assert(offset>=0)
    # For PyMOL visualization
    self.ca_cb_list=[]
    ppb=CaPPBuilder()
    # The peptides are built from the object that was passed in, before
    # `model` is possibly replaced by its child 0 below.
    ppl=ppb.build_peptides(model)
    hse_map={}
    hse_list=[]
    hse_keys=[]
    ### GP
    # presumably handles a structure being passed instead of model 0 - TODO confirm
    if model.get_id()!=0:
        model = model[0]
    residues_in_pdb,residues_with_proper_CA=[],[]
    if check_chain_breaks==True:
        # Collect the sequence number of every amino acid residue in the model.
        # for m in model:
        for chain in model:
            for res in chain:
                # try:
                if is_aa(res):
                    residues_in_pdb.append(res.get_id()[1])
                # except:
                #     if is_aa(chain):
                #         residues_in_pdb.append(chain.get_id()[1])
                #     print('chain', chain, res)
                #     break
    self.clash_pairs = []
    self.chain_breaks = []
    if check_knots:
        possible_knots = PossibleKnots(receptor, signprot)
        knot_resis = possible_knots.get_resnums()
        # NOTE(review): remodel_resis is assumed to be created only when
        # check_knots is set - confirm against the original indentation.
        self.remodel_resis = {}
    for pp1 in ppl:
        for i in range(0, len(pp1)):
            # Every residue that is part of a built peptide counts as present
            # for the chain-break check at the end.
            residues_with_proper_CA.append(pp1[i].get_id()[1])
            if i==0:
                r1=None
            else:
                r1=pp1[i-1]
            r2=pp1[i]
            if i==len(pp1)-1:
                r3=None
            else:
                r3=pp1[i+1]
            # This method is provided by the subclasses to calculate HSE
            result=self._get_cb(r1, r2, r3)
            if result is None:
                # Missing atoms, or i==0, or i==len(pp1)-1
                continue
            pcb, angle=result
            hse_u=0
            hse_d=0
            ca2=r2['CA'].get_vector()
            residue_up=[] ### GP
            residue_down=[] ### GP
            for pp2 in ppl:
                for j in range(0, len(pp2)):
                    # The try/except is used as a jump: when r2's number is
                    # consecutive with both r1 and r3, `raise Exception`
                    # enters the except branch and the offset-based skip is
                    # applied. The same branch runs when r1 or r3 is None
                    # (attribute access fails). When the numbering is not
                    # consecutive on either side, the offset skip is bypassed.
                    try:
                        if r2.get_id()[1]-1!=r1.get_id()[1] or r2.get_id()[1]+1!=r3.get_id()[1]:
                            pass
                        else:
                            raise Exception
                    except:
                        if pp1 is pp2 and abs(i-j)<=offset:
                            # neighboring residues in the chain are ignored
                            continue
                    ro=pp2[j]
                    if not is_aa(ro) or not ro.has_id('CA'):
                        continue
                    cao=ro['CA'].get_vector()
                    d=(cao-ca2)
                    if d.norm()<radius:
                        if d.angle(pcb)<(math.pi/2):
                            hse_u+=1
                            ### GP
                            # Collects the residue objects that were found in the upper half sphere
                            residue_up.append(ro)
                            ### end of GP code
                        else:
                            hse_d+=1
                            ### GP
                            # Collects the residue objects that were found in the lower half sphere
                            residue_down.append(ro)
                            ### end of GP code
            res_id=r2.get_id()
            chain_id=r2.get_parent().get_id()
            # Fill the 3 data structures
            # NOTE(review): hse_map, hse_list and hse_keys are filled but not
            # read again anywhere in this method.
            hse_map[(chain_id, res_id)]=(hse_u, hse_d, angle)
            hse_list.append((r2, (residue_up, residue_down, hse_u, hse_d, angle))) ### GP residue_up and residue_down added to hse_list
            hse_keys.append((chain_id, res_id))
            # Add to xtra
            r2.xtra[hse_up_key]=hse_u
            r2.xtra[hse_down_key]=hse_d
            if angle_key:
                r2.xtra[angle_key]=angle

            ### GP checking for knots
            # Each knot entry is indexed as knot[0] = (chain id, residue number)
            # and knot[1] = (chain id, sequence of residue numbers).
            if check_knots:
                for knot in knot_resis:
                    if knot[0][1]==pp1[i].get_id()[1] and knot[0][0]==pp1[i].get_parent().get_id():
                        print(pp1[i].get_parent().get_id(),pp1[i])
                        for r in residue_up:
                            if r.get_parent().get_id()==knot[1][0] and r.get_id()[1] in knot[1][1]:
                                print('close: ', r.get_parent().get_id(),r)
                                # Record the first and last number of the
                                # range once per chain id.
                                resi_range = [knot[1][1][0], knot[1][1][-1]]
                                if knot[1][0] not in self.remodel_resis:
                                    self.remodel_resis[knot[1][0]] = [resi_range]
                                else:
                                    if resi_range not in self.remodel_resis[knot[1][0]]:
                                        self.remodel_resis[knot[1][0]].append(resi_range)

            ### GP checking for atom clashes
            # include_prev / include_next become True when the residue before
            # or after this one in the peptide is NOT numbered consecutively,
            # i.e. it should be clash-checked like any other residue.
            include_prev, include_next = False, False
            # NOTE(review): for i==0, pp1[i-1] is pp1[-1]; if pp1 behaves like
            # a list this is the last residue, not an error - confirm intent.
            try:
                if pp1[i].get_id()[1]-1!=pp1[i-1].get_id()[1]:
                    include_prev = True
            except:
                include_prev = False
            try:
                if pp1[i].get_id()[1]+1!=pp1[i+1].get_id()[1]:
                    include_next = True
            except:
                include_next = False
            for atom in pp1[i]:
                ref_vector = atom.get_vector()
                for other_res in residue_up:
                    # Same jump idiom as above: consecutive neighbours are
                    # skipped via `continue`; every other residue (or any
                    # failure while indexing pp1) falls into the except
                    # branch, which performs the distance test.
                    # `len(pp1)>=i+1` holds for every i produced by the loop.
                    try:
                        if other_res==pp1[i-1] and include_prev==False:
                            continue
                        elif len(pp1)>=i+1 and other_res==pp1[i+1] and include_next==False:
                            continue
                        else:
                            raise Exception
                    except:
                        for other_atom in other_res:
                            other_vector = other_atom.get_vector()
                            d = other_vector-ref_vector
                            # Atom pairs closer than 2 are recorded as a clash.
                            if d.norm()<2:
                                # The CA b-factor of each residue is stored
                                # next to its residue number. Appending '0'
                                # to a one-decimal string yields the same
                                # float value after conversion.
                                if len(str(pp1[i]['CA'].get_bfactor()).split('.')[1])==1:
                                    clash_res1 = float(str(pp1[i]['CA'].get_bfactor())+'0')
                                else:
                                    clash_res1 = pp1[i]['CA'].get_bfactor()
                                if len(str(other_res['CA'].get_bfactor()).split('.')[1])==1:
                                    clash_res2 = float(str(other_res['CA'].get_bfactor())+'0')
                                else:
                                    clash_res2 = other_res['CA'].get_bfactor()
                                self.clash_pairs.append([(clash_res1, pp1[i].get_id()[1]), (clash_res2, other_res.get_id()[1])])
    if check_chain_breaks==True:
        # Residues seen in the model but absent from every built peptide.
        for r in residues_in_pdb:
            if r not in residues_with_proper_CA:
                self.chain_breaks.append(r)
def __init__(self, model, radius, offset, hse_up_key, hse_down_key, angle_key=None):
    """
    Count, for every residue, the CA atoms found in the two half spheres
    around it and register the result as a property map.

    @param model: model
    @type model: L{Model}

    @param radius: HSE radius
    @type radius: float

    @param offset: number of flanking residues that are ignored in the calculation of the number of neighbors
    @type offset: int

    @param hse_up_key: key used to store HSEup in the entity.xtra attribute
    @type hse_up_key: string

    @param hse_down_key: key used to store HSEdown in the entity.xtra attribute
    @type hse_down_key: string

    @param angle_key: key used to store the angle between CA-CB and CA-pCB in the entity.xtra attribute
    @type angle_key: string
    """
    assert offset >= 0
    # For PyMOL visualization
    self.ca_cb_list = []

    peptides = CaPPBuilder().build_peptides(model)
    exposure_by_key = {}
    exposure_entries = []
    ordered_keys = []

    for peptide in peptides:
        last_index = len(peptide) - 1
        for idx in range(len(peptide)):
            centre = peptide[idx]
            before = peptide[idx - 1] if idx != 0 else None
            after = peptide[idx + 1] if idx != last_index else None

            # The subclass decides how the (pseudo) CB direction is obtained;
            # None means atoms were missing or the residue is at a chain end.
            cb_info = self._get_cb(before, centre, after)
            if cb_info is None:
                continue
            pcb, angle = cb_info

            up_count = 0
            down_count = 0
            centre_ca = centre['CA'].get_vector()

            for other_peptide in peptides:
                for jdx in range(len(other_peptide)):
                    # Residues within `offset` positions in the same peptide
                    # are not counted as neighbours.
                    if other_peptide is peptide and abs(idx - jdx) <= offset:
                        continue
                    neighbour = other_peptide[jdx]
                    if not (is_aa(neighbour) and neighbour.has_id('CA')):
                        continue
                    delta = neighbour['CA'].get_vector() - centre_ca
                    if delta.norm() < radius:
                        if delta.angle(pcb) < (pi / 2):
                            up_count += 1
                        else:
                            down_count += 1

            key = (centre.get_parent().get_id(), centre.get_id())
            # Fill the 3 data structures
            exposure_by_key[key] = (up_count, down_count, angle)
            exposure_entries.append((centre, (up_count, down_count, angle)))
            ordered_keys.append(key)

            # Add to xtra
            centre.xtra[hse_up_key] = up_count
            centre.xtra[hse_down_key] = down_count
            if angle_key:
                centre.xtra[angle_key] = angle

    AbstractPropertyMap.__init__(self, exposure_by_key, ordered_keys, exposure_entries)