def calc_sasa(self, parsed_pdb):
    """Store FreeSASA areas for the iFG atoms and their residue(s) on ``self``.

    Sets ``self.sasa`` (iFG atoms) and ``self.residue_sasa`` (whole
    residue(s)) as strings formatted with two decimals, then fills
    ``self.sasa_3A_probe``, ``self.sasa_4A_probe`` and ``self.sasa_5A_probe``
    through ``self.calc_large_probe_sasa``.

    When ``self.resnum`` holds more than one entry, only the first two
    residues contribute to the atom selection, while every residue number
    is included in the residue selection.

    parsed_pdb: object providing ``.fs_struct`` (freesasa.Structure),
        ``.fs_result`` (freesasa.Result) and the results
        ``.fs_result_cb_3A``, ``.fs_result_cb_4A``, ``.fs_result_cb_5A``.
    """
    fs_struct = parsed_pdb.fs_struct
    assert isinstance(fs_struct, freesasa.Structure), (
        'parsed_pdb object must have attribute freesasa structure obj')
    fs_result = parsed_pdb.fs_result
    assert isinstance(fs_result, freesasa.Result), (
        'parsed_pdb object must have attribute freesasa result obj')

    def atom_clause(idx):
        # "resi <n> and name <a1>+<a2>+..." for the idx-th iFG residue.
        resi_part = 'resi ' + str(self.resnum[idx])
        atom_names = '+'.join(self.atom_names[self.resname[idx]])
        return resi_part + ' and name ' + atom_names

    chain_prefix = ', chain ' + self.chid + ' and '
    if len(self.resnum) == 1:
        atom_expr = atom_clause(0)
    else:
        atom_expr = '((' + atom_clause(0) + ') or (' + atom_clause(1) + '))'
    residue_expr = 'resi ' + '+'.join(str(rn) for rn in self.resnum)

    queries = ('ifg_atoms' + chain_prefix + atom_expr,
               'ifg_residue' + chain_prefix + residue_expr)
    areas = freesasa.selectArea(queries, fs_struct, fs_result)

    self.sasa = '{0:.2f}'.format(areas['ifg_atoms'])
    self.residue_sasa = '{0:.2f}'.format(areas['ifg_residue'])

    # Same atoms evaluated against the pre-computed larger-probe results.
    self.sasa_3A_probe = self.calc_large_probe_sasa(
        parsed_pdb, parsed_pdb.fs_result_cb_3A)
    self.sasa_4A_probe = self.calc_large_probe_sasa(
        parsed_pdb, parsed_pdb.fs_result_cb_4A)
    self.sasa_5A_probe = self.calc_large_probe_sasa(
        parsed_pdb, parsed_pdb.fs_result_cb_5A)
def get_contact_residue_sasa(self, cutoff=8.5):
    """Compute the buried surface area (BSA) of every contact residue.

    For each residue returned by ``self.sql.get_contact_residues`` the BSA
    is the residue area in its isolated chain minus its area in the complex.
    Results are written to two dictionaries on the instance:

    * ``self.bsa_data``: ``{residue: [bsa]}``
    * ``self.bsa_data_xyz``: ``{(chain_index, x, y, z): [bsa]}`` where
      ``x, y, z`` is the mean of the residue's coordinates.

    Args:
        cutoff: distance cutoff forwarded to ``get_contact_residues``.
    """
    self.bsa_data = {}
    self.bsa_data_xyz = {}

    res = self.sql.get_contact_residues(cutoff=cutoff)
    keys = list(res.keys())
    # Contact residues of the first two chains, concatenated.
    res = res[keys[0]] + res[keys[1]]

    for r in res:
        # r[0] is used as the chain ID and r[1] as the residue number.

        # area of the residue inside the complex
        select_str = ('res, (resi %d) and (chain %s)' % (r[1], r[0]), )
        asa_complex = freesasa.selectArea(
            select_str, self.complex, self.result_complex)['res']

        # area of the same residue in its isolated chain
        select_str = ('res, resi %d' % r[1], )
        asa_unbound = freesasa.selectArea(
            select_str, self.chains[r[0]], self.result_chains[r[0]])['res']

        bsa = asa_unbound - asa_complex

        # xyz key: (chain index, x, y, z); only chains 'A' and 'B' are mapped
        chain = {'A': 0, 'B': 1}[r[0]]
        xyz = np.mean(self.sql.get('x,y,z', resSeq=r[1], chainID=r[0]), 0)
        xyzkey = tuple([chain] + xyz.tolist())

        self.bsa_data[r] = [bsa]
        # The key was computed but never stored before, which left
        # bsa_data_xyz permanently empty.
        self.bsa_data_xyz[xyzkey] = [bsa]
def get_contact_residue_sasa(self, cutoff=5.5):
    """Compute the BSA feature for the interface residues.

    The buried surface area of a residue is its area in the isolated chain
    minus its area in the complex. The values are stored per residue in
    ``self.feature_data['bsa']`` and per (chain index, x, y, z) key in
    ``self.feature_data_xyz['bsa']``.

    Args:
        cutoff: distance cutoff forwarded to ``get_contact_residues``.

    Raises:
        ValueError: No interface residues found.
    """
    self.bsa_data = {}
    self.bsa_data_xyz = {}

    contacts = self.sql.get_contact_residues(
        cutoff=cutoff, chain1=self.chain1, chain2=self.chain2)
    interface = contacts[self.chain1] + contacts[self.chain2]

    # Guard against an empty or suspiciously small interface.
    total_res = len(interface)
    if total_res == 0:
        raise ValueError(
            f"No interface residue found with the cutoff {cutoff}Å."
            f" Failed to calculate the feature BSA")
    if total_res < 5:  # empirical threshold
        warnings.warn(
            f"Only {total_res} interface residues found with cutoff"
            f" {cutoff}Å. Be careful with using the feature BSA")

    chain_index = {self.chain1: 0, self.chain2: 1}

    for residue in interface:
        chain_id, resi = residue[0], residue[1]

        # area inside the complex
        in_complex = freesasa.selectArea(
            ('res, (resi %d) and (chain %s)' % (resi, chain_id), ),
            self.complex, self.result_complex)['res']

        # area in the isolated chain
        isolated = freesasa.selectArea(
            ('res, resi %d' % resi, ),
            self.chains[chain_id], self.result_chains[chain_id])['res']

        bsa = isolated - in_complex

        # xyz key: (chain index, x, y, z) built from the residue center
        _, center = self.get_residue_center(self.sql, res=residue)
        xyzkey = tuple([chain_index[chain_id]] + center[0])

        self.bsa_data[residue] = [bsa]
        self.bsa_data_xyz[xyzkey] = [bsa]

    # publish both views of the feature
    self.feature_data['bsa'] = self.bsa_data
    self.feature_data_xyz['bsa'] = self.bsa_data_xyz
def calc_sasa(self, parsed_pdb):
    """Store the FreeSASA area of this object's residue on ``self``.

    Sets ``self.residue_sasa`` (string formatted with two decimals) from the
    selection ``chain <self.chid> and resi <self.resnum>``, then fills
    ``self.sasa_3A_probe``, ``self.sasa_4A_probe`` and ``self.sasa_5A_probe``
    through ``self.calc_large_probe_sasa``.

    parsed_pdb: object providing ``.fs_struct`` (freesasa.Structure),
        ``.fs_result`` (freesasa.Result) and the results
        ``.fs_result_cb_3A``, ``.fs_result_cb_4A``, ``.fs_result_cb_5A``.
    """
    fs_struct = parsed_pdb.fs_struct
    assert isinstance(fs_struct, freesasa.Structure), (
        'parsed_pdb object must have attribute freesasa structure obj')
    fs_result = parsed_pdb.fs_result
    assert isinstance(fs_result, freesasa.Result), (
        'parsed_pdb object must have attribute freesasa result obj')

    residue_query = ('vdm_residue, chain ' + self.chid
                     + ' and resi ' + str(self.resnum))
    areas = freesasa.selectArea((residue_query, ), fs_struct, fs_result)
    self.residue_sasa = '{0:.2f}'.format(areas['vdm_residue'])

    # Larger-probe values, in the order 3 A, 4 A, 5 A.
    for probe in ('3A', '4A', '5A'):
        probe_result = getattr(parsed_pdb, 'fs_result_cb_' + probe)
        setattr(self, 'sasa_' + probe + '_probe',
                self.calc_large_probe_sasa(parsed_pdb, probe_result))
def get_accessible_surface_area(self, atom):
    """Return accessibility features for ``atom`` as a length-4 float array.

    Layout of the returned array:

    * ``[0]`` FreeSASA area of the selected atom, NaN if it cannot be obtained
    * ``[1]`` value at index 3 of the DSSP entry of the atom's residue,
      NaN if the residue is missing or DSSP reports ``"NA"``
    * ``[2]`` 1.0 if ``[1] < 0.2`` else 0.0
    * ``[3]`` 1.0 if ``[1] >= 0.2`` else 0.0

    Both flags are 0.0 when ``[1]`` is NaN.
    """
    lookup_errors = (KeyError, AssertionError, AttributeError, TypeError)

    try:
        sasa, sasa_struct = self._get_sasa()
        # NOTE(review): atom.get_id()[0] is the first character when get_id()
        # returns the atom name as a string ("CA" -> "C") - confirm intended.
        selection = "sele, chain {} and resi {} and name {}".format(
            self.chain, atom.get_parent().get_id()[1], atom.get_id()[0])
        with silence_stdout(), silence_stderr():
            selections = freesasa.selectArea([selection], sasa_struct, sasa)
        atom_area = selections["sele"]
    except lookup_errors:
        # np.nan rather than np.NaN: the capitalised alias was removed in
        # NumPy 2.0.
        atom_area = np.nan

    dssp = self._get_dssp()
    try:
        residue_area = dssp[atom.get_full_id()[2:4]][3]
    except lookup_errors:
        try:
            # Retry with blank hetero flag and insertion code (drops HETATMs).
            residue_area = dssp[(atom.get_full_id()[2],
                                 (' ', atom.get_full_id()[3][1], ' '))][3]
        except lookup_errors:
            residue_area = np.nan

    # "NA" can come back from either lookup; previously only the fallback
    # path normalised it, so the primary path failed on the float assignment.
    if isinstance(residue_area, str) and residue_area == "NA":
        residue_area = np.nan

    asa = np.zeros(4)
    asa[0] = atom_area
    asa[1] = residue_area
    asa[2] = float(residue_area < 0.2)
    asa[3] = float(residue_area >= 0.2)
    return asa
def cal_sasa(prot, resilist):
    """Annotate each residue record in ``resilist`` with its FreeSASA area.

    ``prot`` is passed to ``freesasa.Structure``. Every entry of ``resilist``
    must provide ``'resi_seq'`` and ``'chain'``; the area of that residue is
    written to the entry's ``'SASA'`` key. The same (mutated) list is
    returned.
    """
    structure = freesasa.Structure(prot)
    result = freesasa.calc(structure)

    for entry in resilist:
        residue_query = ('we, resi ' + str(entry['resi_seq'])
                         + ' and chain ' + entry['chain'])
        areas = freesasa.selectArea(
            ('alanine, resn ala', residue_query), structure, result)
        entry['SASA'] = areas['we']

    return resilist
def calcSASA(Latm, selection):
    """Compute the solvent-accessible surface (SAS) of the selected residues.

    A FreeSASA structure is built from the atoms in ``Latm`` (each providing
    ``ty``, ``resname``, ``resN``, ``chain`` and ``traj``), and the area of
    ``selection`` is returned. The selection name is taken as the first
    whitespace-separated token of ``selection`` with its last character
    removed.
    """
    freesasa.setVerbosity(1)

    structure = freesasa.Structure()
    for atom in Latm:
        coords = atom.traj
        structure.addAtom(atom.ty, atom.resname, atom.resN, atom.chain,
                          coords[0], coords[1], coords[2])

    result = freesasa.calc(structure)
    areas = freesasa.selectArea(
        (selection, 'all, resn ala'), structure, result)

    first_token = selection.split()[0]
    selection_name = first_token[:-1]
    return areas[selection_name]
def _exposed_residue_ids(chain, chain_label, structure, result, maximum_area):
    """Return residue numbers of ``chain`` whose relative area exceeds 0.05."""
    exposed = []
    for residue in chain.get_residues():
        try:
            res_id = residue["CA"].get_full_id()[3][1]
            select_word = (str(res_id) + ", " + "chain " + chain_label
                           + " and resi " + str(res_id))
            selections = freesasa.selectArea((select_word, ), structure,
                                             result)
            max_area = maximum_area[
                chain[residue.get_full_id()[3][1]].get_resname()]
            for key in selections:
                # Ratio is rounded to three decimals before thresholding.
                if float('%.3f' % (selections[key] / max_area)) > 0.05:
                    exposed.append(res_id)
        except Exception:
            # Deliberately best-effort: residues without a CA atom, with a
            # residue name missing from the table, or rejected by FreeSASA
            # are skipped.
            continue
    return exposed


def surface_list(file1):
    """Return the surface residue numbers of the two module-level chains.

    A residue counts as surface-exposed when its FreeSASA area divided by
    the tabulated maximum area for its residue type is greater than 0.05.
    Residues of the global ``chain_A`` are selected in FreeSASA chain ``H``
    and residues of the global ``chain_B`` in chain ``L``.

    Args:
        file1: structure file passed to ``freesasa.Structure``.

    Returns:
        Tuple ``(surface_list_a1, surface_list_b1)`` of residue-number lists.
    """
    maximum_area = {
        'ALA': 120.56,
        'CYS': 143.79,
        'ASP': 157.04,
        'GLU': 188.42,
        'PHE': 227.46,
        'GLY': 89.41,
        'HIS': 200.14,
        # NOTE(review): ILE is far below LEU/VAL in this table - verify.
        'ILE': 96.42,
        'LYS': 213.74,
        'LEU': 206.32,
        'MET': 216.63,
        'ASN': 149.85,
        'PRO': 155.07,
        'GLN': 186.83,
        'ARG': 229.51,
        'SER': 128.27,
        'THR': 138.58,
        'VAL': 169.82,
        'TRP': 269.35,
        'TYR': 241.54,
    }
    global chain_A
    global chain_B

    structure = freesasa.Structure(file1)
    result = freesasa.calc(structure)

    surface_list_a1 = _exposed_residue_ids(chain_A, "H", structure, result,
                                           maximum_area)
    surface_list_b1 = _exposed_residue_ids(chain_B, "L", structure, result,
                                           maximum_area)
    return surface_list_a1, surface_list_b1
def get_accessible_surface_area(self, atom_or_residue):
    """Return accessibility features for an atom or a residue.

    The FreeSASA result is cached on ``self._sasa`` and the DSSP result on
    ``self._dssp`` the first time they are needed.

    Returns
    -------
    For an Atom, a length-6 float array::

        [atom_area, fraction <= 0.2, fraction > 0.2,
         residue_area, residue_area < 0.2, residue_area >= 0.2]

    where ``fraction`` is ``atom_area`` divided by the ``surface_areas``
    entry for the atom's element (1.0 when the element is not listed).

    For a Residue, a length-3 float array::

        [residue_area, residue_area < 0.2, residue_area >= 0.2]

    ``residue_area`` is the value at index 3 of the residue's DSSP entry, or
    NaN when the residue is missing or DSSP reports ``"NA"``.

    Raises
    ------
    RuntimeError
        If the input is neither a ``PDB.Atom.Atom`` nor a
        ``PDB.Residue.Residue``.
    """
    if isinstance(atom_or_residue, PDB.Atom.Atom):
        is_atom = True
        atom = atom_or_residue
        residue = atom_or_residue.get_parent()
    elif isinstance(atom_or_residue, PDB.Residue.Residue):
        is_atom = False
        residue = atom_or_residue
    else:
        # Was spelled "RuntimeErorr", which raised NameError instead.
        raise RuntimeError("Input must be Atom or Residue")

    if not hasattr(self, "_sasa"):
        self._sasa = run_freesasa_biopython(self.pdb_path)
    sasa, sasa_struct = self._sasa

    if is_atom:
        total_area = surface_areas.get(atom.element.title(), 1.0)
        # Selection failures propagate to the caller. The previous handler
        # re-raised immediately, so its NaN fallback was unreachable.
        # NOTE(review): atom.get_id()[0] is the first character when get_id()
        # returns the atom name as a string ("CA" -> "C") - confirm intended.
        selection = "sele, chain {} and resi {} and name {}".format(
            self.chain, atom.get_parent().get_id()[1], atom.get_id()[0])
        with silence_stdout(), silence_stderr():
            selections = freesasa.selectArea([selection], sasa_struct, sasa)
        atom_area = selections["sele"]
        fraction = atom_area / total_area

    if not hasattr(self, "_dssp"):
        self._dssp = run_dssp(self.structure, self.pdb_path,
                              work_dir=self.work_dir, job=self.job)

    lookup_errors = (KeyError, AssertionError, AttributeError, TypeError)
    try:
        residue_area = self._dssp[residue.get_full_id()[2:]][3]
    except lookup_errors:
        try:
            # Retry with blank hetero flag and insertion code (drops HETATMs).
            residue_area = self._dssp[(residue.get_full_id()[2],
                                       (' ', residue.get_full_id()[3][1],
                                        ' '))][3]
        except lookup_errors:
            # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
            residue_area = np.nan

    # "NA" can come back from either lookup, not only the fallback one.
    if isinstance(residue_area, str) and residue_area == "NA":
        residue_area = np.nan

    if is_atom:
        asa = np.zeros(6)
        asa[0] = atom_area
        asa[1] = float(fraction <= 0.2)  # buried
        asa[2] = float(fraction > 0.2)  # exposed
        asa[3] = residue_area
        asa[4] = float(residue_area < 0.2)
        asa[5] = float(residue_area >= 0.2)
    else:
        asa = np.zeros(3)
        asa[0] = residue_area
        asa[1] = float(residue_area < 0.2)
        asa[2] = float(residue_area >= 0.2)

    return asa
def openfile():
    """Ask for a PDB file, then print and store per-window sequence features.

    Steps, in order:

    1. Open a Tk file dialog and parse the chosen file with ``PDBParser``.
    2. Build the peptide sequence with ``PPBuilder`` and print basic
       ``ProteinAnalysis`` statistics for it.
    3. Compute flexibility and a window-10 scale based on the ``kd`` table,
       then slide a 10-residue window over the sequence, printing each
       window and collecting the first secondary-structure fraction in
       ``sec``.
    4. For every sequence position, run FreeSASA and collect the selected
       areas in ``acc``; ``access`` keeps every other value and ``logacc``
       their base-10 logarithms.

    Results are published through module-level globals; nothing is returned.
    ``Tk``, ``PDBParser``, ``PPBuilder``, ``freesasa`` and ``math`` are taken
    from module scope.
    """
    global prob, probab, te
    global my_seq
    global anti
    global structure, structure_id, filename
    global antigenicity, hydro, flex, sec
    global m, a, c, b, length, j, k
    global hydroph, flexi, access
    anti = []
    sec = []
    probab = []
    from tkinter import filedialog
    root = Tk()
    # NOTE(review): both filetypes entries are identical.
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    filename = root.filename
    print(filename)
    # The structure id is fixed regardless of which file was picked.
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    # my_seq ends up holding the sequence of the LAST peptide only.
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    sequence = list(my_seq)
    m = ''.join(sequence)
    print(m)
    length = len(m)  # type: int
    print("Sequence consist of", length, "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    # Per-residue scale passed to protein_scale below.
    # NOTE(review): values look like a Kyte-Doolittle hydropathy table -
    # confirm, since the prints label the result "Hydrophilicity".
    kd = {
        'A': 1.8,
        'R': -4.5,
        'N': -3.5,
        'D': -3.5,
        'C': 2.5,
        'Q': -3.5,
        'E': -3.5,
        'G': -0.4,
        'H': -3.2,
        'I': 4.5,
        'L': 3.8,
        'K': -3.9,
        'M': 1.9,
        'F': 2.8,
        'P': -1.6,
        'S': -0.8,
        'T': -0.7,
        'W': -0.9,
        'Y': -1.3,
        'V': 4.2
    }
    # c/flex/flexi and b/hydro/hydroph are three copies of the same two
    # computations, kept under different global names.
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))
    hydro = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flex = list(analysed_seq.flexibility())
    hydroph = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flexi = list(analysed_seq.flexibility())
    # Sliding window m[j + 1:k + 1]: starts as m[0:10] and advances by one
    # residue per iteration, for (length - 10) iterations.
    i = 1
    j = -1  # type: int
    k = 9
    while i <= (length - 10):
        print("Sequence is = ", m[j + 1:k + 1])
        print("Flexibility value = ", c[j + 1])
        print("Hydrophilicity value = ", b[j + 1])
        ana_seq = ''.join(m[j + 1:k + 1])
        analyze_seq = ProteinAnalysis(ana_seq)
        # For Secondary structure Analysis
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        a = list(analyze_seq.secondary_structure_fraction())
        # Keep only the first of the returned fractions.
        a = a[0]
        sec.append(a)
        i += 1
        j += 1
        k += 1
    f = length
    r = 1
    y = 10
    global acc, logacc
    acc = []
    for i in range(0, f):
        # Selection "accessibility, resi r-y", shifted by one per iteration.
        str1 = "accessibility, resi "
        str2 = str(r) + "-" + str(y)
        saving = str1 + str2
        print(saving)
        r = r + 1
        y = y + 1
        # NOTE(review): reads the hard-coded "1e6j.pdb", not the file chosen
        # in the dialog, and rebuilds/recalculates it on every iteration.
        # This also rebinds the global `structure` set by PDBParser above.
        structure = freesasa.Structure("1e6j.pdb")
        resulta = freesasa.calc(structure)
        area_classes = freesasa.classifyResults(resulta, structure)
        print("Total : %.2f A2" % resulta.totalArea())
        for key in area_classes:
            print(key, ": %.2f A2" % area_classes[key])
        # Second calculation, with explicit parameters; this is the result
        # used for the selection below.
        resulta = freesasa.calc(
            structure,
            freesasa.Parameters({
                'algorithm': freesasa.LeeRichards,
                'n-slices': 10
            }))
        selections = freesasa.selectArea(('alanine, resn ala', saving),
                                         structure, resulta)
        for key in selections:
            print(key, ": %.2f A2" % selections[key])
            a = selections[key]
            acc.append(a)
    # Every other entry of acc, starting with the first.
    # NOTE(review): which of the two selections that keeps depends on the
    # iteration order of `selections` - verify it is the intended one.
    l = acc[0::2]
    access = l
    print(acc)
    print(l)
    # math.log raises ValueError for values that are not positive.
    logacc = [math.log(y, 10) for y in l]
    print(logacc)
def parse_pdb_coordinates(pdb_path: str,
                          start_position: int,
                          end_position: int,
                          position_correction: int,
                          chain: str,
                          sasa: bool = False) -> DataFrame:
    """Collect CA coordinates (and optionally SASA / B-factor) per position.

    Positions ``start_position + position_correction`` up to, but not
    including, ``end_position + position_correction`` are looked up in model
    0 of the structure. Positions without a CA atom are reported on stdout
    and get NaN coordinates.

    Args:
        pdb_path: path of the PDB file.
        start_position: first position (before correction).
        end_position: position after the last one (before correction).
        position_correction: offset added to both bounds.
        chain: chain identifier to read residues from.
        sasa: when True, also compute per-residue SASA with freesasa.

    Returns:
        DataFrame with columns ``x``, ``y``, ``z``, ``Position``,
        ``x_cent``, ``y_cent``, ``z_cent`` (squared offsets from the
        centroid) and ``Distance`` (their sum). With ``sasa=True`` the
        columns ``SASA`` and ``log B-factor`` are merged in on ``Position``.
    """
    structure = PDBParser().get_structure('pdb', pdb_path)

    coordinates = []
    commands = []
    bfactors = []
    positions_worked = []  # positions present in pdb

    for i in np.arange(start_position + position_correction,
                       end_position + position_correction):
        # A missing model, chain, residue or CA atom all surface as KeyError.
        # Catching only that (instead of a bare except) lets
        # KeyboardInterrupt and genuine bugs through.
        try:
            ca_atom = structure[0][chain][int(i)]["CA"]
        except KeyError:
            print("residue {} not found".format(str(i)))
            coordinates.append([np.nan, np.nan, np.nan, i])
            continue

        # All four lists are extended together, after the lookup succeeded,
        # so they cannot get out of step with each other.
        coordinates.append(list(ca_atom.get_vector()) + [i])
        commands.append("s{}, chain {} and resi {}".format(
            str(i), chain, str(i)))
        bfactors.append(np.log10(ca_atom.get_bfactor()))
        positions_worked.append(i)

    df_coordinates = DataFrame(columns=['x', 'y', 'z', 'Position'],
                               data=coordinates)

    # Squared offsets from the centroid and their sum.
    x, y, z = centroid(df_coordinates)
    df_coordinates['x_cent'] = (df_coordinates['x'] - x).abs()**2
    df_coordinates['y_cent'] = (df_coordinates['y'] - y).abs()**2
    df_coordinates['z_cent'] = (df_coordinates['z'] - z).abs()**2
    df_coordinates['Distance'] = (df_coordinates['x_cent']
                                  + df_coordinates['y_cent']
                                  + df_coordinates['z_cent'])

    if sasa:
        structure_sasa = freesasa.Structure(pdb_path)
        result = freesasa.calc(structure_sasa)
        sasa_area = freesasa.selectArea(commands, structure_sasa, result)

        # One selection per found residue, in the order of `commands`.
        df_sasa: DataFrame = DataFrame(columns=['SASA'],
                                       data=list(sasa_area.values()))
        df_sasa['log B-factor'] = bfactors
        df_sasa['Position'] = positions_worked

        # Outer merge keeps the NaN rows of positions that were not found.
        df_coordinates = df_coordinates.merge(df_sasa,
                                              how='outer',
                                              on='Position')

    return df_coordinates
def _load_packaged_classifier(config_resource):
    """Build a freesasa.Classifier from a config shipped in ``hydroid``."""
    # The classifier is constructed from a file name, so the packaged bytes
    # are written to a temporary file first. The file is removed even when
    # reading the resource or building the classifier fails.
    temp = tempfile.NamedTemporaryFile(delete=False)
    try:
        temp.write(pkgutil.get_data('hydroid', config_resource))
        temp.close()
        return freesasa.Classifier(temp.name)
    finally:
        temp.close()
        os.remove(temp.name)


def _hsasa_selection(resid, ch, names):
    """Return the FreeSASA selection command for hydrogens of one residue."""
    # Negative residue numbers are prefixed with a backslash.
    resi = '%s%d' % ('\\' if resid < 0 else '', resid)
    if len(ch) > 0:
        return '%d,(chain %s) and (resi %s) and (name %s)' % (
            resid, ch, resi, names)
    return '%d,(resi %s) and (name %s)' % (resid, resi, names)


def get_DNA_H_SASA(pdb_file,csvfileout,chain=None,resids=[],seq=None,probe_radius=1.4,slicen=100,vdw_set=None,Hcontrib=[1.0]*7,n_threads=1,verbose=False):
    """
    Wrapper around the FreeSASA library: calculates the solvent accessible
    surface area of the atoms in pdb_file, extracts the SASA of the
    deoxyribose hydrogen atoms and sums it up for every nucleotide with
    coefficients Hcontrib.

    chain - name of the DNA chain of interest in pdb_file; if the chain has
        no name leave blank ('') or pass None.
    resids - a list of resids to calculate H-SASA values.
    seq - sequence of the DNA strand, string or biopython Seq object.
    Hcontrib - coefficients for individual SASA of deoxyribose hydrogens for
        summing them up into the H-SASA profile,
        order [H1' H2' H2'' H3' H4' H5' H5'']

    Note: chain, resids, seq, Hcontrib can also be lists of two or more
    instances, to make the calculation for several chains, spans of resids
    or combinations of Hcontrib at once. In this case the number of elements
    in chain, resids, seq and Hcontrib must be the same, and the algorithm
    iterates through all lists simultaneously (i.e. no combinations are
    tried). Chains should be of the same length.

    probe_radius - size of probe to roll.
    slicen - number of slices per atom, controls precision of the
        calculation.
    vdw_set - selects the set of VdW radii:
        None - FreeSASA default
        charmm36-rmin - rmin from the charmm36 forcefield
        amber10-rmin - rmin from the AMBER10 forcefield
    n_threads - number of threads passed to FreeSASA.
    verbose - when False, FreeSASA warnings are silenced.

    Return
    --------
    Nothing is returned; the H-SASA profiles along the sequence are written
    to the CSV file csvfileout.
    """
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3

    # The default chain=None used to crash in list(None); treat it like the
    # documented "no chain name" value.
    if chain is None:
        chain = ''
    chain = [chain] if isinstance(chain, string_types) else list(chain)
    if len(chain) > 1:
        assert len(chain) == len(resids)
        assert len(chain) == len(seq)
        assert len(chain) == len(Hcontrib)
    else:
        resids = [resids]
        seq = [seq]
        Hcontrib = [Hcontrib]

    if not verbose:
        freesasa.setVerbosity(freesasa.nowarnings)

    # Order must match Hcontrib. The third entry was a duplicate H2', so
    # H2'' was never selected and H2' was used in its place.
    hatoms = ["H1'", "H2'", "H2''", "H3'", "H4'", "H5'", "H5''"]

    packaged_configs = {
        'charmm36-rmin': 'pkgdata/charmm36_rmin.config',
        'amber10-rmin': 'pkgdata/amber10_rmin.config',
    }
    structure_options = {'hydrogen': True, 'hetatm': True}
    if vdw_set in packaged_configs:
        classifier = _load_packaged_classifier(packaged_configs[vdw_set])
        structure = freesasa.Structure(pdb_file, classifier,
                                       options=structure_options)
    else:
        structure = freesasa.Structure(pdb_file, options=structure_options)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Launching FreeSASA calculation...")
    result = freesasa.calc(structure, freesasa.Parameters({
        'algorithm': freesasa.LeeRichards,
        'n-slices': slicen,
        'probe-radius': probe_radius,
        'n-threads': n_threads}))
    print("Calculation done")

    print("Extracting SASA values ...")
    res = dict()
    for i, (ch, rids, Hcont) in enumerate(zip(chain, resids, Hcontrib)):
        res[i] = pd.Series(dtype=float)

        if (np.array(Hcont) == 1.0).all():
            # Simplified procedure: all coefficients are 1, so every
            # hydrogen of a residue can be selected in a single command.
            names = '+'.join(hatoms)
            sels = [_hsasa_selection(resid, ch, names) for resid in rids]
            selections = freesasa.selectArea(sels, structure, result)
            res[i] = res[i].add(pd.Series(selections) * 1.0, fill_value=0)
        else:
            # Regular procedure: one pass per hydrogen, weighted by its
            # coefficient; hydrogens with a zero coefficient are skipped.
            for hat, hcont in zip(hatoms, Hcont):
                if hcont != 0:
                    sels = [_hsasa_selection(resid, ch, hat)
                            for resid in rids]
                    selections = freesasa.selectArea(sels, structure, result)
                    res[i] = res[i].add(pd.Series(selections) * float(hcont),
                                        fill_value=0)

    # Selection names are the resids as strings; restore integer order.
    for i in range(len(chain)):
        res[i].index = res[i].index.map(int)
        res[i] = res[i].sort_index()

    if len(chain) == 1:
        df = pd.DataFrame({
            'resid': res[0].index,
            'Site': ['%d%s' % (n, l)
                     for n, l in zip(range(1, 1 + len(seq[0])), seq[0])],
            'H-SASA': res[0].values})
    else:
        df = pd.DataFrame()
        for i in range(len(chain)):
            ndf = pd.DataFrame({
                'resid_%d' % i: res[i].index,
                'Site_%d' % i: ['%d%s' % (n, l)
                                for n, l in zip(range(1, 1 + len(seq[i])),
                                                seq[i])],
                'H-SASA_%d' % i: res[i].values})
            df = pd.concat([df, ndf], axis=1)

    print("Outputting H-SASA profile to %s" % csvfileout)
    df.to_csv(csvfileout)