def get_structure(self):
    """Build freesasa structures and SASA results for the complex and each chain.

    When ``self.pdb_data`` is a ``str`` it is handed directly to
    ``freesasa.Structure``; otherwise the atoms are read from ``self.sql``
    and added one by one.  Results are stored on ``self.complex`` /
    ``self.result_complex`` and, per chain label in ``self.chains_label``,
    on ``self.chains`` / ``self.result_chains``.
    """
    columns = 'name,resName,resSeq,chainID,x,y,z'

    def _add_atoms(target, rows):
        # Only the first character of the atom name is passed on,
        # right-aligned in a two-character field.
        for name, res_name, res_number, chain_id, x, y, z in rows:
            short_name = '{:>2}'.format(name[0])
            target.addAtom(short_name, res_name, res_number, chain_id, x, y, z)

    # we can have a str or a list of bytes as input
    if isinstance(self.pdb_data, str):
        self.complex = freesasa.Structure(self.pdb_data)
    else:
        self.complex = freesasa.Structure()
        _add_atoms(self.complex, self.sql.get(columns))
    self.result_complex = freesasa.calc(self.complex)

    # One structure and one result per chain, always rebuilt from the database.
    self.chains = {}
    self.result_chains = {}
    for label in self.chains_label:
        chain_structure = freesasa.Structure()
        self.chains[label] = chain_structure
        _add_atoms(chain_structure, self.sql.get(columns, chainID=label))
        self.result_chains[label] = freesasa.calc(chain_structure)
def getFreeSASAStructureFromModel(structure, classifier=None):
    """Return a ``freesasa.Structure`` built from *structure*.

    The model is written to a scratch PDB file in the current working
    directory via ``structure.save`` and parsed back by freesasa.

    Args:
        structure: object exposing ``save(path)`` that writes a PDB file.
        classifier: optional freesasa classifier forwarded to
            ``freesasa.Structure``; omitted from the call when ``None``.

    Returns:
        The parsed ``freesasa.Structure``.
    """
    # NOTE(review): the scratch name is fixed, so concurrent calls from the
    # same working directory would clash -- confirm callers are sequential.
    outFile = "gsfm.temp.pdb"
    try:
        structure.save(outFile)
        if classifier is not None:
            return freesasa.Structure(outFile, classifier=classifier)
        return freesasa.Structure(outFile)
    finally:
        # Remove the scratch file even when saving or parsing raised, so a
        # failed call does not leave the file behind.
        if os.access(outFile, os.R_OK):
            os.remove(outFile)
def cb_sasas(design_pdb, fg_vdm_txt):
    """Return backbone+CB SASA (3 A probe) for the residues listed in a vdM file.

    Args:
        design_pdb: path of the design PDB file, parsed with ProDy
            (altloc 'A', model 1).
        fg_vdm_txt: path of a whitespace-separated text file whose first
            column is the residue number and second column the amino acid.
            Blank lines are ignored.

    Returns:
        dict mapping ``int(resnum)`` to ``[aa, sasa]`` where ``sasa`` is the
        summed atom area rounded to two decimals.
    """
    sasa_dict = {}  # key is resnum, value is list [aa, cbsasa]

    # get the vdm AAs and resnums from txtfile
    vdms = []
    with open(fg_vdm_txt) as inF:
        for line in inF:
            fields = line.split()
            if not fields:
                # tolerate blank / trailing empty lines
                continue
            vdms.append((fields[0], fields[1]))

    # parse design and do freesasa calc
    prody_parsed = pr.parsePDB(design_pdb, altloc='A', model=1)
    fs_result = freesasa_cb(prody_parsed, probe_radius=3)  # less atoms bc this is Cb cutoff
    if not vdms:
        return sasa_dict

    # Indices of all backbone/CB heavy atoms.  The position of an atom in
    # this array is used as the atom index into fs_result below.
    # The selection does not depend on the residue, so compute it once.
    prody_pdb_bb_cb_atom_ind = prody_parsed.select(
        'protein and (backbone or name CB) and not element H D').getIndices()

    # get sasa
    for resnum, aa in vdms:
        # shouldn't have to worry about neg resnums in designed proteins
        sele = prody_parsed.select(
            'protein and (backbone or name CB) and resnum ' + str(resnum)
            + ' and not element H D')
        bb_cb_atom_ind = sele.getIndices()
        positions = np.where(
            np.in1d(prody_pdb_bb_cb_atom_ind, bb_cb_atom_ind))[0]
        sasa_3A_probe = '{0:.2f}'.format(
            sum(fs_result.atomArea(i) for i in positions))
        sasa_dict[int(resnum)] = [aa, float(sasa_3A_probe)]
    return sasa_dict
def get_area(this_run, basename):
    """Convert the run's initial geometry to PDB and store its total SASA.

    The xyz file at ``this_run.init_geopath`` is converted with Open Babel
    into ``<pdb_path><basename>.pdb`` (``pdb_path`` comes from
    ``setup_paths()``), the area is computed with freesasa using a
    ``DerivedClassifierT`` classifier, and the total is written to
    ``this_run.area``.
    """
    pdb_out = setup_paths()["pdb_path"] + basename + '.pdb'
    print('getting area')

    # xyz -> pdb conversion
    converter = openbabel.OBConversion()
    converter.SetInFormat("xyz")
    converter.SetOutFormat("pdb")
    molecule = openbabel.OBMol()
    converter.ReadFile(molecule, this_run.init_geopath)
    converter.WriteFile(molecule, pdb_out)

    # surface-area measurement; HETATM records and hydrogens are included
    classifier = DerivedClassifierT()
    parse_options = {
        'halt-at-unknown': False,
        'hetatm': True,
        'hydrogen': True,
        'join-models': False,
        'skip-unknown': False,
    }
    sasa_structure = freesasa.Structure(pdb_out,
                                        classifier=classifier,
                                        options=parse_options)
    sasa_structure.setRadiiWithClassifier(classifier)
    this_run.area = freesasa.calc(sasa_structure).totalArea()
def featurize(structure: Structure) -> list[Any]:
    """Compute a small 3D feature vector for *structure*.

    The SASA part is computed from the file named by ``pdbpath``, which is
    not a parameter of this function (presumably a module-level name --
    confirm).  The returned list is
    ``[[total area], [area per class], CA-CA distance, [residue count],
    dihedral angle]``.
    """
    sasa_structure = freesasa.Structure(pdbpath)
    sasa_result = freesasa.calc(sasa_structure)
    class_areas = freesasa.classifyResults(sasa_result, sasa_structure)

    Total_area = [sasa_result.totalArea()]
    # one entry per area class, in the order the classes are reported
    Polar_Apolar = list(class_areas.values())

    # all residues of the structure
    residues = list(structure.get_residues())
    seq_length = [len(residues)]

    # distance between the CA atoms of the second and second-to-last residues
    protein_length = residues[1]["CA"] - residues[-2]["CA"]

    # dihedral spanned by four CA atoms near the two termini
    ca_vectors = [residues[index]["CA"].get_vector() for index in (1, 2, -3, -2)]
    angle = calc_dihedral(*ca_vectors)

    # assemble the feature vector
    return [Total_area, Polar_Apolar, protein_length, seq_length, angle]
def sa_calc(polymer_pdb, radius):
    """Compute and report the per-atom normalized SASA of a polymer.

    The mol file *polymer_pdb* is read with RDKit, hydrogens are added with
    coordinates, and the result is written to ``out_dir + NAME + '_new.pdb'``
    (``out_dir`` and ``NAME`` are not parameters; they must exist in the
    enclosing scope).  freesasa then computes the total area with probe
    radius *radius*.  The normalized value (total / atom count of the
    hydrogen-free mol) is appended to 'Average surface area.txt' and printed.
    """
    # Read the mol file and put the hydrogens back (with coordinates).
    heavy_mol = Chem.MolFromMolFile(polymer_pdb)
    full_mol = Chem.AddHs(heavy_mol, addCoords = True)

    # freesasa needs a PDB file, so write one with RDKit.
    new_pdb_path = out_dir+NAME+'_new.pdb'
    Chem.MolToPDBFile(full_mol, new_pdb_path)

    # freesasa drops hydrogens and HETATM records unless told otherwise.
    option_with_Hs = {
        'hetatm' : True,
        'hydrogen' : True,
        'join-models' : False,
        'skip-unknown' : False,
        'halt-at-unknown' : False
    }

    # Probe radius used for the calculation.
    probe_parameters = freesasa.Parameters()
    probe_parameters.setProbeRadius(radius)

    parsed = freesasa.Structure(new_pdb_path, options = option_with_Hs)
    total_area = freesasa.calc(parsed, probe_parameters).totalArea()

    # round to 4 decimals
    rounded_total = round(total_area, 4)
    print (f'Total SASA is {rounded_total} Å^2 when probe radius is {radius} Å.')

    # Normalize by the atom count of the mol as read (before adding hydrogens).
    normalized_sa = round(rounded_total / heavy_mol.GetNumAtoms(), 4)
    area_text = str(normalized_sa)
    probe_text = str(radius)

    # Append the result to the running text report.
    with open (out_dir + 'Average surface area.txt', 'a+') as report:
        report.write(f'The normalized surface area of {NAME} is ' + area_text +
                     ' Å^2 with the probe size of ' + probe_text + 'Å.\n' )
    print ('Nomalized solvent accessible surface area is '+ area_text +
           ' Å^2 with the probe size of ' + probe_text + 'Å.\n')
def get_area_classes(file):
    """Return ``[first class area, second class area, total area]`` for a PDB file.

    The two class areas are taken by position from the dictionary returned
    by ``freesasa.classifyResults``, i.e. they follow that dictionary's
    iteration order.
    """
    parsed = freesasa.Structure(file)
    computed = freesasa.calc(parsed)
    per_class = list(freesasa.classifyResults(computed, parsed).values())
    first_area = per_class[0]
    second_area = per_class[1]
    return [first_area, second_area, computed.totalArea()]
def calculate_SAS(temp_dict, pdb_path, seq_len):
    """Add polar/apolar SASA and the per-residue SASA fraction to *temp_dict*.

    *temp_dict* is updated in place with the keys "Polar", "Apolar" and
    "SASA Fraction" (``(polar + apolar) / seq_len``).  Nothing is returned.
    """
    parsed = freesasa.Structure(str(pdb_path))
    per_class = freesasa.classifyResults(freesasa.calc(parsed), parsed)

    polar_area = per_class['Polar']
    apolar_area = per_class['Apolar']
    fraction = (polar_area + apolar_area) / seq_len

    temp_dict["Polar"] = polar_area
    temp_dict["Apolar"] = apolar_area
    temp_dict["SASA Fraction"] = fraction
def calcSASA(Latm, selection):
    """Compute the solvent-accessible surface (SAS) of the selected residues.

    A freesasa structure is built from the atoms in *Latm* (each providing
    ``ty``, ``resname``, ``resN``, ``chain`` and ``traj``).  *selection* is a
    freesasa selection command of the form ``"name, expression"``.

    Returns the SAS for the given selection.
    """
    freesasa.setVerbosity(1)

    built = freesasa.Structure()
    for atom in Latm:
        x, y, z = atom.traj[0], atom.traj[1], atom.traj[2]
        built.addAtom(atom.ty, atom.resname, atom.resN, atom.chain, x, y, z)

    computed = freesasa.calc(built)
    areas = freesasa.selectArea((selection, 'all, resn ala'), built, computed)

    # The selection name is the first token of the command minus its
    # trailing character (the comma).
    first_token = selection.split()[0]
    return areas[first_token[:-1]]
def cal_sasa(prot, resilist):
    """Annotate every entry of *resilist* with its residue SASA.

    Each entry must provide ``'resi_seq'`` and ``'chain'``; the computed area
    is stored under ``'SASA'``.  The same (mutated) *resilist* is returned.
    """
    parsed = freesasa.Structure(prot)
    computed = freesasa.calc(parsed)

    for position in range(len(resilist)):
        entry = resilist[position]
        residue_query = 'we, resi ' + str(entry['resi_seq']) + ' and chain ' + entry['chain']
        areas = freesasa.selectArea(('alanine, resn ala', residue_query),
                                    parsed, computed)
        entry['SASA'] = areas['we']

    return resilist
def run_freesasa_biopython(pdb_path):
    """Run freesasa on *pdb_path* and return ``(result, structure)``.

    freesasa is imported on first use when the module-level ``freesasa``
    name is still ``None``; a ``RuntimeError`` is raised if that import
    fails.  stdout and stderr are silenced while freesasa runs.
    """
    global freesasa

    if freesasa is None:
        try:
            import freesasa
        except ImportError:
            raise RuntimeError("Cannot use this method. Please save the pdb file and rerun with docker")

    with silence_stdout():
        with silence_stderr():
            # Automatically removes hydrogens
            parsed = freesasa.Structure(pdb_path)
            computed = freesasa.calc(parsed)

    return computed, parsed
def _compute_asa(df):
    """Compute solvent-accessible surface area for provided structure.

    Args:
        df: DataFrame with one row per atom and the columns ``name``,
            ``resname``, ``residue``, ``chain``, ``element``, ``x``, ``y``
            and ``z``.

    Returns:
        Total area reported by freesasa over the atoms that were added
        (rows with residue name ``UNK`` or element ``H`` are skipped).
    """
    structure = freesasa.Structure(
        classifier=freesasa.Classifier.getStandardClassifier('naccess'),
        options={
            'hydrogen': True,
            'skip-unknown': True
        })
    for _, atom in df.iterrows():
        if atom['resname'] == 'UNK' or atom['element'] == 'H':
            continue
        structure.addAtom(atom['name'], atom['resname'], atom['residue'],
                          atom['chain'], atom['x'], atom['y'], atom['z'])
    return freesasa.calc(structure).totalArea()
def get_attributes(self):
    """Load the PDB file and derive all per-structure attributes.

    Reads the file lines into ``self.data``, stores the freesasa result in
    ``self.solvent_access`` and then runs the clean-up, CA-attribute,
    distance and range steps in that order.  Any exception raised by a step
    propagates unchanged to the caller.
    """
    # raw file content, one list entry per line
    with open(self.file_path, "r") as pdb_handle:
        self.data = pdb_handle.readlines()

    # solvent accessibility for the same file
    parsed = fs.Structure(self.file_path)
    self.solvent_access = fs.calc(parsed)

    self._clean_data()
    self._ca_attributes()
    self._distance_to_others()
    self._find_in_range()
def _get_sasa(self):
    """Return ``(sasa_result, sasa_structure)``, computing them on first use.

    ``self.structure`` is written to a temporary PDB file, parsed and
    evaluated with freesasa while stdout/stderr are silenced.  The result is
    cached on ``self.sasa`` / ``self.sasa_struct``.  Returns ``(None, None)``
    when freesasa is not available.
    """
    if freesasa is None:
        # Parenthesised single argument prints identically on Python 2 and 3.
        print("SASA not installed! SASA will be 0")
        return None, None
    if self.sasa is None:
        pdbfd, tmp_pdb_path = tempfile.mkstemp()
        try:
            with os.fdopen(pdbfd, 'w') as tmp:
                writePDBStream(tmp, self.structure)
            with silence_stdout(), silence_stderr():
                sasa_struct = freesasa.Structure(tmp_pdb_path)
                sasa = freesasa.calc(sasa_struct)
        finally:
            # Always delete the temporary file, even if writing, parsing or
            # the calculation raised.
            os.remove(tmp_pdb_path)
        self.sasa_struct = sasa_struct
        self.sasa = sasa
    return self.sasa, self.sasa_struct
def sasa_from_file(file: Union[str, pathlib.Path]) -> Sasa:
    """Return the per-residue areas of a PDB file.

    The file is parsed into a ``freesasa.Structure``, ``freesasa.calc()`` is
    run on it and ``Result.residueAreas()`` is returned wrapped in an
    ``ObjDict``.

    Raises:
        TypeError: *file* is neither a ``str`` nor a ``pathlib.Path``.
        FileNotFoundError: the path does not exist.
    """
    if isinstance(file, str):
        file = pathlib.Path(file)
    if not isinstance(file, pathlib.Path):
        raise TypeError(
            "Invalid argument type. File should be 'str' or pathlib.Path")

    if not file.exists():
        raise FileNotFoundError(
            f"File {file.absolute().as_posix()} does not exist.")

    posix_path = file.absolute().as_posix()
    computed = freesasa.calc(freesasa.Structure(posix_path))
    return ObjDict(computed.residueAreas())
def _get_scores(self, df, pdb_id, pdb_chain):
    """Return one freesasa score per row of *df*, or ``None`` without SIFTS data.

    *df* is left-merged with the SIFTS alignment on ``residue`` /
    ``uniprot position``.  For every row with a non-NaN ``pdb position`` the
    attribute named by ``self.metric`` is read from the freesasa residue
    areas of *pdb_chain*; rows without a match stay NaN.

    Raises:
        LookupError: the PDB file is missing and downloading is disabled.
    """
    alignment = get_sifts_alignment_for_chain(pdb_id, pdb_chain,
                                              self.sifts_directory,
                                              self.download_sifts)
    if alignment is None:
        return None

    merged = pd.merge(df, alignment,
                      left_on='residue',
                      right_on='uniprot position',
                      how='left')

    pdb_file_path = os.path.join(self.pdb_directory, pdb_id + '.pdb')
    if not os.path.isfile(pdb_file_path):
        # PDB file not already downloaded.
        if not self.download_pdb_file:
            raise LookupError(
                "PDB file {} is not in the pdb_directory {}".format(
                    pdb_id, self.pdb_directory))
        download_pdb_file(pdb_id, self.pdb_directory)

    parsed = freesasa.Structure(pdb_file_path)
    computed = freesasa.calc(parsed, self.freesasa_parameters)
    areas_by_residue = computed.residueAreas()[pdb_chain]

    scores = np.full(len(merged), np.nan)
    for row, pdb_position in enumerate(merged['pdb position']):
        if np.isnan(pdb_position):
            continue
        try:
            scores[row] = getattr(areas_by_residue[str(int(pdb_position))],
                                  self.metric)
        except KeyError:
            # residue not present in the freesasa output: leave NaN
            continue
    return scores
def cb_sasas(design_pdb):
    """Return backbone+CB SASA (3 A probe) for every residue of a design.

    Args:
        design_pdb: path of the design PDB file, parsed with ProDy
            (altloc 'A', model 1).

    Returns:
        dict mapping residue number to ``[resname, sasa]`` where ``sasa`` is
        the summed atom area rounded to two decimals.  Residue numbers
        between the smallest and largest one that match no backbone/CB atom
        (numbering gaps) are omitted.
    """
    sasa_dict = {}  # key is resnum, value is list [aa, cbsasa]

    # parse design and do freesasa calc
    prody_parsed = pr.parsePDB(design_pdb, altloc='A', model=1)
    fs_result = freesasa_cb(prody_parsed, probe_radius=3)  # less atoms bc this is Cb cutoff

    # Indices of all backbone/CB heavy atoms.  The position of an atom in
    # this array is used as the atom index into fs_result below.
    # The selection does not depend on the residue, so compute it once.
    prody_pdb_bb_cb_atom_ind = prody_parsed.select(
        'protein and (backbone or name CB) and not element H D').getIndices()

    # get sasa for each resnum
    resnums = prody_parsed.getResnums()
    chain_start = min(resnums)
    chain_end = max(resnums)
    for resnum in range(chain_start, chain_end + 1):
        # get Cb atoms for resnum
        sele = prody_parsed.select(
            'protein and (backbone or name CB) and resnum ' + str(resnum)
            + ' and not element H D')
        if sele is None:
            # select() matched nothing: gap in the residue numbering
            continue
        resname = sele.getResnames()[0]
        bb_cb_atom_ind = sele.getIndices()
        positions = np.where(
            np.in1d(prody_pdb_bb_cb_atom_ind, bb_cb_atom_ind))[0]
        sasa_3A_probe = '{0:.2f}'.format(
            sum(fs_result.atomArea(i) for i in positions))
        sasa_dict[int(resnum)] = [resname, float(sasa_3A_probe)]
    return sasa_dict
def surface_list(file1):
    """Return the surface-exposed residue ids of the two global chains.

    A residue counts as exposed when its freesasa area divided by the
    tabulated maximum area of its residue type, rounded to three decimals,
    exceeds 0.05.  ``chain_A`` is queried as chain H and ``chain_B`` as
    chain L of *file1*.  Residues for which any step fails (for example a
    missing CA atom) are skipped silently.

    Returns:
        ``(ids_for_chain_A, ids_for_chain_B)``.
    """
    maximum_area = {
        'ALA': 120.56, 'CYS': 143.79, 'ASP': 157.04, 'GLU': 188.42,
        'PHE': 227.46, 'GLY': 89.41, 'HIS': 200.14, 'ILE': 96.42,
        'LYS': 213.74, 'LEU': 206.32, 'MET': 216.63, 'ASN': 149.85,
        'PRO': 155.07, 'GLN': 186.83, 'ARG': 229.51, 'SER': 128.27,
        'THR': 138.58, 'VAL': 169.82, 'TRP': 269.35, 'TYR': 241.54
    }
    global chain_A
    global chain_B

    structure = freesasa.Structure(file1)
    result = freesasa.calc(structure)

    def _exposed_ids(chain, selection_prefix):
        # Collect the ids of the residues of `chain` above the 5 % threshold.
        exposed = []
        for residue in chain.get_residues():
            try:
                res_id = residue["CA"].get_full_id()[3][1]
                command = str(res_id) + ", " + selection_prefix + str(res_id)
                areas = freesasa.selectArea((command, ), structure, result)
                for key in areas:
                    resname = chain[residue.get_full_id()[3][1]].get_resname()
                    relative = float('%.3f' % (areas[key] / maximum_area[resname]))
                    if relative > 0.05:
                        exposed.append(res_id)
            except Exception:
                # best effort: ignore residues that cannot be evaluated
                pass
        return exposed

    surface_list_a1 = _exposed_ids(chain_A, "chain H and resi ")
    surface_list_b1 = _exposed_ids(chain_B, "chain L and resi ")
    return surface_list_a1, surface_list_b1
def sa_conformers(file_1, func_1, file_2, func_2, units, radius):
    """Build an AB polymer, embed conformers and report their average SASA.

    Args:
        file_1, file_2: ``.mol`` files of the two building blocks.
        func_1, func_2: functional-group lists passed to ``stk.StructUnit2``,
            e.g. ['diol'] and ['dibromine']/['difluorene'] or ['bromine'] and
            ['bromine']/['iodine'].
        units: value passed as ``n`` to ``stk.Linear``.
        radius: freesasa probe radius.

    Side effects:
        Writes the polymer (.mol / .json) into the working directory, the
        conformer .mol / .pdb / _new.pdb files and a CSV table into a
        sub-directory of the hard-coded ``file_dir``, and appends one line to
        'Average surface area of conformers.txt'.  Nothing is returned.
    """
    # turn off cache
    stk.OPTIONS['cache'] = False
    # number of conformers
    N = 10

    name_1 = file_1.replace('.mol', '')
    unit_1 = stk.StructUnit2(file_1, func_1)
    name_2 = file_2.replace('.mol', '')
    unit_2 = stk.StructUnit2(file_2, func_2)

    # make polymer
    NAME = name_1+'_'+name_2+'_AB_poly'
    print(f'Creating polymer: {NAME}')
    polymer = stk.Polymer([unit_1, unit_2],
                          stk.Linear('AB', [0, 0], n=units, ends='h'))
    # write unoptimized structure
    polymer.write(NAME+'.mol')
    mol_polymer = rdkit.MolFromMolFile(NAME + '.mol')
    print(f'Optimizing polymer {NAME} and saving {N} conformers')

    # run the stk.UFF optimizer on the polymer
    embedder = stk.UFF(use_cache=False)
    embedder.optimize(polymer, conformer=-1)
    # write optimized polymer to json
    polymer.dump(NAME+'_opt.json')
    polymer.write(NAME+'_opt.mol')

    # make N conformers of the polymer molecule
    etkdg = rdkit.ETKDGv2()
    etkdg.randomSeed = 1000
    etkdg.verbose = True
    etkdg.maxIterations = 200000
    cids = rdkit.EmbedMultipleConfs(
        mol=polymer.mol,
        numConfs=N,
        params=etkdg
    )
    print(f'Made {len(cids)} conformers...')
    print(f'Warning! I have not implemented an optimization of the ETKDG cleaned polymers!')

    # iterate over conformers and save structure
    file_dir = '/home/fanyuzhao/Monomers/OH+F/dimer/conformers/'
    new_dir = file_dir+NAME+'_'+str(units)+'_'+str(radius)+'/'
    for cid in cids:
        # build directories
        os.makedirs(new_dir, exist_ok=True)
        # write optimized polymer to mol
        polymer.write(new_dir+NAME+'_'+str(cid)+'_opt.mol', conformer=cid)
        # write optimized polymer to pdb
        polymer.write(new_dir+NAME+'_'+str(cid)+'_opt.pdb', conformer=cid)
    print(f'Done! {N} ETKDG conformers of polymer written to {NAME}_{N}_opt.mol/pdb')

    # pdb file from stk can not be read in freesasa
    # save the new pdb file in rdkit from mol files
    for item in os.listdir(new_dir):
        if item.endswith('.mol'):
            file_pdb = item.replace('.mol', '')
            a = rdkit.MolFromMolFile(os.path.join(new_dir, item))
            # hydrogens are removed when converting the file in rdkit
            b = rdkit.AddHs(a, addCoords = True)
            rdkit.MolToPDBFile(b, new_dir + file_pdb + '_new.pdb')

    # calculate solvent accessible surface area for the requested probe radius
    # hydrogens are removed in the default option
    # hetatm are ignored in the default option
    options_with_Hs = {
        'hetatm' : True,
        'hydrogen' : True,
        'join-models' : False,
        'skip-unknown' : False,
        'halt-at-unknown' : False
    }
    sa_list = []
    pdb_list = []
    # loop all new pdb files
    for pdb in os.listdir(new_dir):
        if pdb.endswith("_new.pdb"):
            # use freesasa to calculate SASA
            para = freesasa.Parameters()
            freesasa.Parameters.setProbeRadius(para, radius)
            free_struct = freesasa.Structure(os.path.join(new_dir, pdb),
                                             options = options_with_Hs)
            free_calc = freesasa.calc(free_struct, para)
            total = free_calc.totalArea()
            # keep 4 decimals
            decimal = round(total, 4)
            sa_list.append(decimal)
            name_pdb = pdb.replace('.pdb', '')
            pdb_list.append(name_pdb)

    # average SASA over the conformers, then normalize by the atom count of
    # the unoptimized polymer mol file
    sa_average = round(sum(sa_list) / len(sa_list), 4)
    atom_number = mol_polymer.GetNumAtoms()
    normalized_sa = round(sa_average / atom_number, 4)
    with open (file_dir + 'Average surface area of conformers.txt', 'a+') as Asa:
        Asa.write(f'The normalized surface area of {NAME}_{units} is ' + str(normalized_sa) +
                  ' Å^2 with the probe size of ' + str(radius) +
                  f'Å and chain length of {units}.\n')
    print ('The avarage surface area of the conformers is ' + str(sa_average) +
           ' Å^2 with the probe size of ' + str(radius) + 'Å.')

    # save data to a csv table
    # save pdb file and surface area to a directory
    dic = dict(zip(pdb_list, sa_list))
    download_dict = new_dir + 'Solvent accessible surface area of ' + NAME +'.csv'
    # The context manager closes (and flushes) the table even if a write fails.
    with open(download_dict, 'w') as table:
        table.write("Polymer_name, SASA\n")
        for Polymer_name, SASA in dic.items():
            table.write(Polymer_name + "," + str(SASA) + "\n")
    print ('Nomalized solvent accessible surface area is '+ str(normalized_sa) +
           ' Å^2 with the probe size of ' + str(radius) + 'Å.')
def __init__(self, comb, pdb_acc_code, chain, **kwargs):
    """Parse a PDB entry and precompute contacts, SASA and DSSP data.

    The PDB file is looked up by accession code in ``comb.input_dir_pdb``;
    failing that ``kwargs['path_to_pdb']`` is used; failing that the entry
    is fetched into ``raw/`` and passed through Reduce into ``reduce/``.

    :param comb: object providing ``input_dir_pdb``, ``input_dir_dssp``,
        ``path_to_reduce`` and ``reduce`` (these are the attributes read
        here); it is also passed on to ``find_possible_ifgs``
    :param pdb_acc_code: str, PDB accession code
    :param chain: chain identifier stored as ``self.pdb_chain``
    :param kwargs: optional ``path_to_pdb`` and ``path_to_dssp``
    """
    # search for acc code in input_dir_pdb from comb object.
    assert isinstance(pdb_acc_code, str), 'PDB accession code needs to be a string'
    pdb_file = [
        file.name for file in os.scandir(comb.input_dir_pdb)
        if pdb_acc_code in file.name
    ]
    try:
        if pdb_file:
            # from here on pdb_file is a file name (str) rather than a list
            pdb_file = pdb_file[0]
            self.prody_pdb = pr.parsePDB(comb.input_dir_pdb + pdb_file,
                                         altloc='A', model=1)
        elif 'path_to_pdb' in kwargs:
            self.prody_pdb = pr.parsePDB(kwargs.get('path_to_pdb'),
                                         altloc='A', model=1)
        else:
            # NEED TO UPDATE: note if going to fetch pdb, it should be sent
            # through Reduce first...
            # Create each directory independently so that an already
            # existing 'raw' does not prevent 'reduce' from being created.
            for subdir in ('raw', 'reduce'):
                os.makedirs(comb.input_dir_pdb + subdir, exist_ok=True)
            pr.fetchPDB(pdb_acc_code, compressed=False,
                        folder=comb.input_dir_pdb + 'raw')
            # NOTE(review): the command is built by string concatenation and
            # run through the shell; confirm pdb_acc_code and the comb paths
            # are trusted.
            os.system(comb.path_to_reduce + comb.reduce + ' -FLIP -Quiet -DB ' +
                      comb.path_to_reduce + 'reduce_wwPDB_het_dict.txt ' +
                      comb.input_dir_pdb + 'raw/' + pdb_acc_code.lower() +
                      '.pdb > ' + comb.input_dir_pdb + 'reduce/' +
                      pdb_acc_code.lower() + 'H.pdb')
            self.prody_pdb = pr.parsePDB(comb.input_dir_pdb + 'reduce/' +
                                         pdb_acc_code.lower() + 'H.pdb',
                                         altloc='A', model=1)
    except NameError:
        raise NameError(
            'ParsePDB instance needs a pdb file path or a valid pdb accession code.'
        )

    self.pdb_acc_code = pdb_acc_code.lower()
    self.pdb_chain = chain

    # Continue only when the 'icode _' selection covers every atom and the
    # requested chain contains protein atoms.
    if len(self.prody_pdb) == len(self.prody_pdb.select('icode _')) \
            and self.prody_pdb.select('protein and chain ' + self.pdb_chain) is not None:
        self.contacts = pr.Contacts(self.prody_pdb)
        self.set_bonds()

        # freesasa structure from the same file that ProDy parsed
        if pdb_file:
            self.fs_struct = freesasa.Structure(comb.input_dir_pdb + pdb_file)
        elif 'path_to_pdb' in kwargs:
            self.fs_struct = freesasa.Structure(kwargs.get('path_to_pdb'))
        else:
            path = comb.input_dir_pdb + 'reduce/'
            self.fs_struct = freesasa.Structure(path + next(
                file.name for file in os.scandir(path)
                if self.pdb_acc_code in file.name))
        self.fs_result = freesasa.calc(self.fs_struct)
        self.fs_result_cb_3A = self.freesasa_cb(probe_radius=3)
        self.fs_result_cb_4A = self.freesasa_cb(probe_radius=4)
        self.fs_result_cb_5A = self.freesasa_cb(probe_radius=5)
        self.prody_pdb_bb_cb_atom_ind = self.prody_pdb.select(
            'protein and (backbone or name CB) '
            'and not element H D').getIndices()

        # DSSP: existing file in input_dir_dssp, explicit path, or run DSSP
        dssp_file = [
            file.name for file in os.scandir(comb.input_dir_dssp)
            if pdb_acc_code in file.name
        ]
        if dssp_file:
            dssp_file = dssp_file[0]
            self.dssp = pr.parseDSSP(comb.input_dir_dssp + dssp_file,
                                     self.prody_pdb)
        elif 'path_to_dssp' in kwargs:
            self.dssp = pr.parseDSSP(kwargs.get('path_to_dssp'),
                                     self.prody_pdb)
        else:
            if pdb_file:
                pr.execDSSP(comb.input_dir_pdb + pdb_file,
                            outputdir=comb.input_dir_dssp)
            elif 'path_to_pdb' in kwargs:
                pr.execDSSP(kwargs.get('path_to_pdb'),
                            outputdir=comb.input_dir_dssp)
            else:
                path = comb.input_dir_pdb + 'reduce/' + next(
                    file.name
                    for file in os.scandir(comb.input_dir_pdb + 'reduce')
                    if pdb_acc_code in file.name)
                pr.execDSSP(path, outputdir=comb.input_dir_dssp)
            self.dssp = pr.parseDSSP(
                comb.input_dir_dssp +
                next(file.name for file in os.scandir(comb.input_dir_dssp)
                     if pdb_acc_code in file.name), self.prody_pdb)
        self.possible_ifgs = self.find_possible_ifgs(comb)
    else:
        self.possible_ifgs = None

    # valence and hydrogen bond data for vandermers and iFGs of ParsedPDB
    # protein instance
    # iFG specific:
    self._ifg_pdb_info = []
    self._ifg_atom_density = []
    self._ifg_contact_water = []
    self._ifg_contact_ligand = []
    self._ifg_contact_metal = []
    # vdM specific:
    self._vdm_pdb_info = []
    self._vdm_sasa_info = []
    self._ifg_contact_vdm = []
    self._ifg_hbond_vdm = []
    self._ifg_hbond_water = []
    self._ifg_hbond_ligand = []
    self._ifg_ca_hbond_vdm = []
def openfile():
    """Ask for a PDB file, then print and store sequence-derived properties.

    Opens a Tk file dialog, parses the chosen file with ``PDBParser``,
    extracts the peptide sequence and computes flexibility, a
    hydrophilicity scale, a per-window secondary-structure value and
    freesasa accessibility for sliding ten-residue windows.  Results are
    printed and published through module-level globals (see the ``global``
    statements); nothing is returned.

    NOTE(review): the accessibility part always reads the hard-coded file
    "1e6j.pdb", not the file chosen in the dialog -- confirm this is
    intended.
    """
    global prob, probab, te
    global my_seq
    global anti
    global structure, structure_id, filename
    global antigenicity, hydro, flex, sec
    global m, a, c, b, length, j, k
    global hydroph, flexi, access
    anti = []
    sec = []
    probab = []
    from tkinter import filedialog
    root = Tk()
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    filename = root.filename
    print(filename)
    # The structure id is fixed regardless of which file was selected.
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    # my_seq ends up holding the sequence of the last peptide that was built.
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    sequence = list(my_seq)
    m = ''.join(sequence)
    print(m)
    length = len(m)  # type: int
    print("Sequence consist of", length, "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    # Per-residue scale handed to protein_scale below.
    kd = {
        'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
        'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
        'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
        'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2
    }
    # The same two profiles are stored under three pairs of names.
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))
    hydro = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flex = list(analysed_seq.flexibility())
    hydroph = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flexi = list(analysed_seq.flexibility())
    # Slide a ten-residue window (m[j + 1:k + 1]) over the sequence; the
    # loop runs length - 10 times.
    i = 1
    j = -1  # type: int
    k = 9
    while i <= (length - 10):
        print("Sequence is = ", m[j + 1:k + 1])
        print("Flexibility value = ", c[j + 1])
        print("Hydrophilicity value = ", b[j + 1])
        ana_seq = ''.join(m[j + 1:k + 1])
        analyze_seq = ProteinAnalysis(ana_seq)
        # For Secondary structure Analysis
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        # Keep only the first of the returned fractions for this window.
        a = list(analyze_seq.secondary_structure_fraction())
        a = a[0]
        sec.append(a)
        i += 1
        j += 1
        k += 1
    f = length
    r = 1
    y = 10
    global acc, logacc
    acc = []
    # One freesasa selection per window "resi r-y", starting at 1-10.
    for i in range(0, f):
        str1 = "accessibility, resi "
        str2 = str(r) + "-" + str(y)
        saving = str1 + str2
        print(saving)
        r = r + 1
        y = y + 1
        # The global `structure` is rebound here to a freesasa structure;
        # the file is parsed and evaluated again on every iteration.
        structure = freesasa.Structure("1e6j.pdb")
        resulta = freesasa.calc(structure)
        area_classes = freesasa.classifyResults(resulta, structure)
        print("Total : %.2f A2" % resulta.totalArea())
        for key in area_classes:
            print(key, ": %.2f A2" % area_classes[key])
        # Second calculation with explicit Lee-Richards parameters; this
        # result is the one used for the selections.
        resulta = freesasa.calc(
            structure,
            freesasa.Parameters({
                'algorithm': freesasa.LeeRichards,
                'n-slices': 10
            }))
        selections = freesasa.selectArea(('alanine, resn ala', saving),
                                         structure, resulta)
        # Every selection value is appended, so acc grows by one entry per
        # selection name in each iteration.
        for key in selections:
            print(key, ": %.2f A2" % selections[key])
            a = selections[key]
            acc.append(a)
    # Every second entry of acc, starting with the first.
    # NOTE(review): which of the two selections this keeps depends on the
    # iteration order of the dict returned by selectArea -- confirm.
    l = acc[0::2]
    access = l
    print(acc)
    print(l)
    logacc = [math.log(y, 10) for y in l]
    print(logacc)
def handle(self, *args, **options):
    """Compute per-residue angle, ASA and HSE measures for every structure.

    For each ``Structure`` record the preferred chain is reduced to the
    residues with generic numbers 1x-7x, helix axes are fitted with PCA,
    two angle measures, the freesasa area per residue and the HSExposureCA
    value are written into B-factors of saved PDB files, and all measures
    are pickled to ``pymol_output/<pdb_code>_measures.pickle``.  Any
    exception for a structure is printed and the code is added to the
    failed list, which is printed at the end.
    """
    failed = []
    # get preferred chain for PDB-code
    references = Structure.objects.all().prefetch_related(
        'pdb_code', 'pdb_data')
    for reference in references:
        # first entry of the comma-separated preferred chain string
        preferred_chain = reference.preferred_chain.split(',')[0]
        pdb_code = reference.pdb_code.index
        try:
            print(pdb_code)
            # codes containing "refined" are skipped and counted as failed
            if "refined" in pdb_code:
                failed.append(pdb_code)
                continue
            #structure = self.load_pdb(pdb_code)
            structure = self.load_pdb_var(pdb_code, reference.pdb_data.pdb)
            # grab residues with the generic numbering for this structure
            db_reslist = list(
                Residue.objects.exclude(
                    generic_number__isnull=True).filter(
                        protein_conformation__protein=reference.
                        protein_conformation.protein).prefetch_related(
                            'generic_number'))
            #######################################################################
            ############################# filter pdb #############################
            # db_tmlist[n] collects the sequence numbers whose label starts
            # with "<n+1>x"; db_set holds the matching (' ', seqnum, ' ') ids.
            db_tmlist = [[] for i in range(TMNUM)]
            db_set = set()
            for r in db_reslist:
                if r.generic_number.label[:2] in [
                        "1x", "2x", "3x", "4x", "5x", "6x", "7x"
                ]:  # and r.generic_number in pchain
                    db_tmlist[int(r.generic_number.label[0]) - 1].append(
                        r.sequence_number)
                    db_set.add((' ', r.sequence_number, ' '))

            def recurse(entity, slist):
                # Detach every child whose id is not in slist[0]; for kept
                # children continue one level down with the rest of slist.
                for subenty in entity.get_list():
                    if not subenty.id in slist[0]:
                        entity.detach_child(subenty.id)
                    elif slist[1:]:
                        recurse(subenty, slist[1:])

            # First keep only model 0 / preferred chain, copy that for the
            # HSE step, then additionally restrict to the db_set residues.
            # preferred_chain is a str, so the chain-level test above is a
            # substring test.
            recurse(structure, [[0], preferred_chain])
            hse_struct = deepcopy(structure)
            recurse(structure, [[0], preferred_chain, db_set])
            pchain = structure[0][preferred_chain]
            #######################################################################
            ############### Calculate the axes through the helices ################
            #######################################################################
            N = 3
            # CA coordinates per helix
            hres_list = [
                np.asarray([pchain[r]["CA"].get_coord() for r in sl],
                           dtype=float) for sl in db_tmlist
            ]
            # CB coordinates per helix; residues without CB get a row of
            # None, which the float conversion turns into NaN
            h_cb_list = [
                np.asarray([
                    pchain[r]["CB"].get_coord()
                    if "CB" in pchain[r] else np.array([None, None, None])
                    for r in sl
                ],
                           dtype=float) for sl in db_tmlist
            ]
            # fast and fancy way to take the average of N consecutive elements
            hres_three = np.asarray([
                sum([h[i:-(len(h) % N) or None:N] for i in range(N)]) / N
                for h in hres_list
            ])
            helices_mn = np.asarray(
                [np.mean(h, axis=0) for h in hres_three])
            self.save_pseudo(hres_three, pdb_code + "helper")
            #######################################################################
            ################################# PCA #################################
            #######################################################################
            def pca_line(pca, h, r=0):
                # Fit pca on h and return two points at -20/+20 along the
                # first component in original coordinates; their order is
                # chosen from the sign of the first transformed coordinate
                # and the flag r.
                if ((not r) if pca.fit_transform(h)[0][0] < 0 else r):
                    return pca.inverse_transform(
                        np.asarray([[-20, 0, 0], [20, 0, 0]]))
                else:
                    return pca.inverse_transform(
                        np.asarray([[20, 0, 0], [-20, 0, 0]]))

            # NOTE(review): 7 is hard-coded here while TMNUM is used
            # elsewhere -- presumably equal; confirm.
            helix_pcas = [PCA() for i in range(7)]
            pos_list = np.asarray([
                pca_line(helix_pcas[i], h, i % 2)
                for i, h in enumerate(hres_three)
            ])
            #self.write_cgo_arrow_pml(pdb_code, "pca",pos_list)
            pos_list = np.mean(pos_list, axis=0)
            #self.write_cgo_arrow_pml(pdb_code, "pca_mean",[pos_list])
            # PCA over the averaged points of all helices together; this
            # fitted object is used as `centerpca` below.
            pca = PCA()
            pos_list = pca_line(pca, np.vstack(hres_three))
            #self.write_cgo_arrow_pml(pdb_code, "pca_all",[pos_list])
            pos_list = np.asarray([
                pca_line(PCA(), h[:len(h) // 2:(-(i % 2) or 1)])
                for i, h in enumerate(hres_three)
            ])
            pos_list = pos_list - (np.mean(pos_list, axis=1) -
                                   helices_mn).reshape(-1, 1, 3)
            #self.write_cgo_arrow_pml(pdb_code, "pca_extra",pos_list)
            #self.write_cgo_arrow_pml(pdb_code, "pca_extra_mean",[np.mean(pos_list,axis=0)])
            pca_extra = PCA()
            pos_list = pca_line(pca_extra, np.vstack(pos_list))
            #self.write_cgo_arrow_pml(pdb_code, "pca_extra_pca",[pos_list])
            #######################################################################
            ################################ Angles ###############################
            #######################################################################
            def calc_angle(b, c):
                # Row-wise angle in degrees between ba = -b (with its first
                # component zeroed) and bc = c - b.  bc is computed before
                # the zeroing, so it uses the full -b.
                ba = -b
                bc = c + ba
                ba[:, 0] = 0
                return np.degrees(
                    np.arccos(
                        inner1d(ba, bc) / (np.linalg.norm(ba, axis=1) *
                                           np.linalg.norm(bc, axis=1))))

            def ca_cb_calc(i, pca):
                # Angle for helix i from transformed CA and CB positions,
                # restricted to residues with a finite CB coordinate.
                fin = np.isfinite(h_cb_list[i][:, 0])
                return calc_angle(pca.transform(hres_list[i][fin]),
                                  pca.transform(h_cb_list[i][fin]))

            def axes_calc(i, pca_list, pca):
                # `a` is a three-point moving average of the CA positions
                # (end points padded); `b` keeps the first helix-PCA
                # coordinate of h and takes the others from `a`.
                p = pca_list[i]
                h = hres_list[i]
                a = (np.roll(np.vstack((h, h[0])), 1, axis=0)[:-1] + h +
                     np.roll(np.vstack((h, h[-1])), -1, axis=0)[:-1]) / 3
                b = p.transform(h)
                b[:, 1:] = p.transform(a)[:, 1:]
                b = p.inverse_transform(b)
                return calc_angle(pca.transform(b), pca.transform(h))

            def set_bfactor(structure, angles):
                # Write one value per residue of the preferred chain into
                # the B-factor of all its atoms (zip stops at the shorter
                # sequence).
                for r, an in zip(structure[0][preferred_chain].get_list(),
                                 angles):
                    for a in r:
                        a.set_bfactor(an)

            centerpca = pca
            ########################### Axis to CA to CB ##########################
            # Residues without CB keep the placeholder value -1.
            tv = np.isfinite(np.concatenate(h_cb_list)[:, 0])
            angle = np.full_like(tv, -1, dtype=float)
            angle[tv] = np.concatenate(
                [ca_cb_calc(i, centerpca) for i in range(TMNUM)])
            set_bfactor(structure, angle)
            self.save_pdb(structure, pdb_code + 'angle_colored_ca_cb.pdb')
            ######################### Axis to Axis to CA ##########################
            angle2 = np.concatenate([
                axes_calc(i, helix_pcas, centerpca) for i in range(TMNUM)
            ])
            set_bfactor(structure, angle2)
            self.save_pdb(structure, pdb_code + 'angle_colored_axes.pdb')
            ### ASA
            # NOTE(review): reads the file back from "pymol_output/";
            # presumably that is where save_pdb writes -- confirm.
            pdbstruct = freesasa.Structure("pymol_output/" + pdb_code +
                                           'angle_colored_axes.pdb')
            res = freesasa.calc(pdbstruct)
            # Sum atom areas over runs of consecutive atoms that share the
            # same residue number.
            asa_list = []
            oldnum = -1
            for i in range(res.nAtoms()):
                resnum = pdbstruct.residueNumber(i)
                if resnum == oldnum:
                    asa_list[-1] += res.atomArea(i)
                else:
                    asa_list.append(res.atomArea(i))
                    oldnum = resnum
            set_bfactor(structure, asa_list)
            self.save_pdb(structure, pdb_code + 'asa_colored.pdb')
            reslist = [r.id[1] for r in pchain.get_list()]
            ### HSE
            # HSE is computed on the copy taken before the residue filter,
            # which is filtered to db_set only afterwards.
            model = hse_struct[0]
            exp_ca = pdb.HSExposure.HSExposureCA(model)
            [[a.set_bfactor(x[1][1]) for a in x[0]] for x in exp_ca]
            recurse(hse_struct, [[0], preferred_chain, db_set])
            r = [x[0] for x in exp_ca]
            #x = model["A"].get_list()
            x = pchain.get_list()
            # Residues of pchain that are not among the HSE entries get -1.
            # Note that `r` is reused as the loop variable here.
            for r in (set(x) - set(r)):
                for a in r:
                    a.set_bfactor(-1)
            exp_ca = [
                a["CA"].get_bfactor()
                for a in hse_struct[0][preferred_chain].get_list()
            ]
            self.save_pdb(hse_struct, pdb_code + 'hsea_colored.pdb')
            # `handle` below is a local file object, not this method.
            with open('pymol_output/' + pdb_code + '_measures.pickle',
                      'wb') as handle:
                pickle.dump((np.array(reslist), np.array(asa_list),
                             np.array(exp_ca), angle, angle2), handle)
        except Exception as e:
            # Best effort per structure: report, remember the code, go on.
            print("ERROR!!", pdb_code, e)
            failed.append(pdb_code)
            continue
    print(len(failed), "of", len(references), "failed:", failed)
def parse_pdb_coordinates(pdb_path: str, start_position: int,
                          end_position: int, position_correction: int,
                          chain: str, sasa: bool = False) -> DataFrame:
    """
    Parse the coordinates of the CA atoms of one chain of a PDB file.

    Positions are scanned over
    ``[start_position + position_correction, end_position + position_correction)``
    (the end is exclusive, as with ``np.arange``). A position whose residue
    or CA atom is missing from the structure is reported on stdout and kept
    in the output as a row of NaN coordinates.

    Parameters
    ----------
    pdb_path : str
        Path of the PDB file.
    start_position, end_position : int
        First (inclusive) and last (exclusive) position, before correction.
    position_correction : int
        Offset added to both positions to obtain PDB residue numbers.
    chain : str
        Chain identifier to read.
    sasa : bool, optional
        If True, also compute per-residue SASA with freesasa and merge it,
        together with the log10 of the CA B-factor, into the result.

    Returns
    -------
    DataFrame
        Columns ``x, y, z, Position, x_cent, y_cent, z_cent, Distance`` and,
        when ``sasa`` is True, ``SASA`` and ``log B-factor``.
    """
    # Get structure from PDB
    structure = PDBParser().get_structure('pdb', pdb_path)

    coordinates = []
    commands = []
    bfactors = []
    positions_worked = []  # positions present in pdb

    # Iterate over each position and get the coordinates of its CA atom
    for i in np.arange(start_position + position_correction,
                       end_position + position_correction):
        # A missing chain, residue or CA atom surfaces as KeyError; anything
        # else is a genuine error and must not be reported as "not found".
        try:
            ca_atom = structure[0][chain][int(i)]["CA"]
        except KeyError:
            print("residue {} not found".format(str(i)))
            coordinates.append([np.nan, np.nan, np.nan, i])
            continue

        coordinates.append(list(ca_atom.get_vector()) + [i])

        # freesasa selection command ("name, selection") for this residue
        commands.append("s{}, chain {} and resi {}".format(
            str(i), chain, str(i)))
        bfactors.append(np.log10(ca_atom.get_bfactor()))
        positions_worked.append(i)

    # Convert to df
    df_coordinates = DataFrame(columns=['x', 'y', 'z', 'Position'],
                               data=coordinates)

    # Squared distance of every CA atom to the centroid
    x, y, z = centroid(df_coordinates)
    df_coordinates['x_cent'] = (df_coordinates['x'] - x).abs()**2
    df_coordinates['y_cent'] = (df_coordinates['y'] - y).abs()**2
    df_coordinates['z_cent'] = (df_coordinates['z'] - z).abs()**2
    df_coordinates['Distance'] = (df_coordinates['x_cent'] +
                                  df_coordinates['y_cent'] +
                                  df_coordinates['z_cent'])

    # Add sasa values
    if sasa:
        # Get structure for SASA
        structure_sasa = freesasa.Structure(pdb_path)
        result = freesasa.calc(structure_sasa)

        # One area per selection command, i.e. per position found above
        sasa_area = freesasa.selectArea(commands, structure_sasa, result)
        df_sasa: DataFrame = DataFrame(columns=['SASA'],
                                       data=list(sasa_area.values()))
        df_sasa['log B-factor'] = bfactors
        df_sasa['Position'] = positions_worked

        # Outer merge keeps the NaN rows of the missing positions
        df_coordinates = df_coordinates.merge(df_sasa, how='outer',
                                              on='Position')

    return df_coordinates
def _load_packaged_classifier(config_name):
    """Build a FreeSASA classifier from a config file shipped in hydroid's pkgdata."""
    # Open config from package in a tricky way, independent of package
    # installation mode: FreeSASA needs a real file, so dump it to a temp file.
    conffile = pkgutil.get_data('hydroid', 'pkgdata/' + config_name)
    temp2 = tempfile.NamedTemporaryFile(delete=False)
    try:
        temp2.write(conffile)
        temp2.close()
        return freesasa.Classifier(temp2.name)
    finally:
        # Remove the temp file even if writing or parsing failed.
        temp2.close()
        os.remove(temp2.name)


def _hsasa_selection(resid, ch, atom_names):
    """Return the FreeSASA selection command (named after resid) for atom_names of one residue."""
    # Negative residue numbers have to be escaped in the selection syntax.
    resi = '%s%d' % ('\\' if resid < 0 else '', resid)
    if len(ch) > 0:
        return '%d,(chain %s) and (resi %s) and (name %s)' % (
            resid, ch, resi, atom_names)
    return '%d,(resi %s) and (name %s)' % (resid, resi, atom_names)


def get_DNA_H_SASA(pdb_file,csvfileout,chain=None,resids=[],seq=None,probe_radius=1.4,slicen=100,vdw_set=None,Hcontrib=[1.0]*7,n_threads=1,verbose=False):
    """
    Wrapper around the FREESASA library that calculates the solvent
    accessible surface area (SASA) of the atoms in pdb_file, then extracts
    the SASA of the deoxyribose hydrogen atoms and sums it up for every
    nucleotide with the coefficients Hcontrib.

    chain - name of the DNA chain of interest in pdb_file,
        if the chain has no name leave blank ('') or pass None.
    resids - a list of resids to calculate H-SASA values.
    seq - sequence of the DNA strand, string or biopython Seq object.
    Hcontrib - coefficients for individual SASA of deoxyribose hydrogens for
        summing them up into the H-SASA profile,
        order [H1' H2' H2'' H3' H4' H5' H5'']
    Note: chain, resids, seq, Hcontrib - can also be a list of two or more
        instances, to make the calculation for several chains, spans of
        resids or combinations of Hcontrib at once. In this case the number
        of elements in chain, resids, seq and Hcontrib should be the same,
        and the algorithm will iterate through all lists simultaneously
        (i.e. no combination will be tried). Chains should be of the same
        length.
    probe_radius - size of probe to roll.
    slicen - number of slices per atom, controls precision of the calculation.
    vdw_set - selects the set of VdW radii:
        None - default of FREESASA is used
        charmm36-rmin - rmin from the charmm36 forcefield
        amber10-rmin - rmin from the AMBER10 forcefield
    n_threads - number of threads passed to FREESASA.
    verbose - if False, FREESASA warnings are silenced.

    Return
    --------
    None. Writes the CSV file csvfileout with columns of H-SASA profiles
    along the sequence.

    The resids and Hcontrib default lists are never modified in place.
    """
    # Works on both Python 2 (basestring) and Python 3 (str).
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # A nameless chain may be given as None as well as ''.
    if chain is None:
        chain = ''
    chain = [chain] if isinstance(chain, string_types) else list(chain)
    if len(chain) > 1:
        assert len(chain) == len(resids)
        assert len(chain) == len(seq)
        assert len(chain) == len(Hcontrib)
    else:
        resids = [resids]
        seq = [seq]
        Hcontrib = [Hcontrib]

    if not verbose:
        freesasa.setVerbosity(freesasa.nowarnings)

    # Same order as Hcontrib: H1' H2' H2'' H3' H4' H5' H5''
    hatoms = ["H1'", "H2'", "H2''", "H3'", "H4'", "H5'", "H5''"]

    options = {'hydrogen': True, 'hetatm': True}
    if vdw_set == 'charmm36-rmin':
        classifier = _load_packaged_classifier('charmm36_rmin.config')
        structure = freesasa.Structure(pdb_file, classifier, options=options)
    elif vdw_set == 'amber10-rmin':
        classifier = _load_packaged_classifier('amber10_rmin.config')
        structure = freesasa.Structure(pdb_file, classifier, options=options)
    else:
        structure = freesasa.Structure(pdb_file, options=options)

    print("Launching FreeSASA calculation...")
    result = freesasa.calc(structure, freesasa.Parameters({
        'algorithm': freesasa.LeeRichards,
        'n-slices': slicen,
        'probe-radius': probe_radius,
        'n-threads': n_threads}))
    print("Calculation done")

    print("Extracting SASA values ...")
    res = dict()
    for i, (ch, rids, Hcont) in enumerate(zip(chain, resids, Hcontrib)):
        res[i] = pd.Series(dtype=float)

        if (np.array(Hcont) == 1.0).all():
            # Simplified procedure: all weights are 1, so one selection per
            # residue covering all seven hydrogens is enough.
            sels = [_hsasa_selection(resid, ch, '+'.join(hatoms))
                    for resid in rids]
            selections = freesasa.selectArea(sels, structure, result)
            res[i] = res[i].add(pd.Series(selections) * 1.0, fill_value=0)
        else:
            # Regular procedure: one pass per hydrogen, weighted by its
            # coefficient; hydrogens with a zero coefficient are skipped.
            for hat, hcont in zip(hatoms, Hcont):
                if hcont != 0:
                    sels = [_hsasa_selection(resid, ch, hat)
                            for resid in rids]
                    selections = freesasa.selectArea(sels, structure, result)
                    res[i] = res[i].add(
                        pd.Series(selections) * float(hcont), fill_value=0)

    # Selection names are the resids as strings; restore the numeric order.
    for i in range(len(chain)):
        res[i].index = res[i].index.map(int)
        res[i] = res[i].sort_index()

    if len(chain) == 1:
        df = pd.DataFrame({
            'resid': res[0].index,
            'Site': ['%d%s' % (n, l)
                     for n, l in zip(range(1, 1 + len(seq[0])), seq[0])],
            'H-SASA': res[0].values})
    else:
        df = pd.DataFrame()
        for i in range(len(chain)):
            ndf = pd.DataFrame({
                'resid_%d' % i: res[i].index,
                'Site_%d' % i: ['%d%s' % (n, l)
                                for n, l in zip(range(1, 1 + len(seq[i])),
                                                seq[i])],
                'H-SASA_%d' % i: res[i].values})
            df = pd.concat([df, ndf], axis=1)

    print("Outputting H-SASA profile to %s" % csvfileout)
    df.to_csv(csvfileout)
def calculate_surface_points(receptor, ligand, num_points, rec_translation,
                             surface_density, seed=STARTING_POINTS_SEED,
                             has_membrane=False, num_sphere_points=100):
    """Generate sampling points around the surface of the receptor.

    Surface atoms of the receptor are (optionally) clustered, a sphere of
    ``num_sphere_points`` points with radius a quarter of the ligand's
    maximum diameter is placed on every cluster centre, points lying within
    that radius of a neighbouring centre are discarded, and the remaining
    points are clustered down to ``num_points``. When ``num_points`` is 0 it
    is derived from the receptor's total SASA and ``surface_density``.

    Returns the points shifted by ``rec_translation`` together with the
    maximum diameters of receptor and ligand.
    """
    if num_points < 0:
        raise SetupError(
            "Invalid number of points to generate over the surface")

    receptor_max_diameter = np.max(
        distance.pdist(receptor.representative(has_membrane)))
    ligand_max_diameter = np.max(distance.pdist(ligand.representative()))
    surface_distance = ligand_max_diameter / 4.0

    # Surface atoms of the representative receptor structure
    pdb_file_name = Path(
        receptor.structure_file_names[receptor.representative_id])
    molecule = parsePDB(pdb_file_name).select('protein or nucleic')
    if has_membrane:
        # Keep a membrane-free copy next to the original file
        pdb_no_membrane = str(
            pdb_file_name.absolute().parent /
            f"{pdb_file_name.stem}_no_membrane{pdb_file_name.suffix}")
        writePDB(pdb_no_membrane, molecule)
    coords = molecule.select(
        'protein and surface or nucleic and name P').getCoords()

    # Derive the number of points from the SASA when none was requested
    if num_points == 0:
        sasa_input = pdb_no_membrane if has_membrane else str(pdb_file_name)
        total_sasa = freesasa.calc(freesasa.Structure(sasa_input)).totalArea()
        num_points = ceil(total_sasa / surface_density)

    # Surface clusters
    if len(coords) > num_points:
        # The seed must be set here to get reproducible results
        np.random.seed(seed)
        surface_centroids, _ = kmeans2(data=coords, k=num_points,
                                       minit='points', iter=100)
    else:
        surface_centroids = coords

    # One sphere of candidate points around every surface centroid
    sampling = [
        np.array(points_on_sphere(num_sphere_points)) * surface_distance + sc
        for sc in surface_centroids
    ]

    # Drop candidates that fall too close to a neighbouring centroid
    centroids_kd_tree = KDTree(surface_centroids)
    for idx, center in enumerate(surface_centroids):
        for neighbor in centroids_kd_tree.query_ball_point(center, r=20.):
            if neighbor == idx:
                continue
            other = surface_centroids[neighbor]
            sampling[idx] = [
                point for point in sampling[idx]
                if not np.linalg.norm(point - other) <= surface_distance
            ]

    s = [point for points in sampling for point in points]

    # Final cluster of points
    if len(s) > num_points:
        # The seed must be set here to get reproducible results
        np.random.seed(seed)
        s = kmeans2(data=s, k=num_points, minit='points', iter=100)[0]

    # In-place shift of every point into the receptor's frame
    for p in s:
        p += rec_translation

    return s, receptor_max_diameter, ligand_max_diameter
def calculate_sasa(pdbfile, chain, multichain=True, relative_type='sidechain'):
    """
    Compute per-residue SASA values of one chain with freesasa.

    :param pdbfile: String of PDB file name.
    :param chain: Chain identifier whose residues are reported.
    :param multichain: Boolean. True to separate the chains first, which gives
        the SASA of the chain as a single unattached monomer. False to
        calculate the SASA for the structure 'as-is'.
    :param relative_type: 'sidechain' to base the relative value on the
        side-chain area; any other value uses the total area.
    :return: Pandas Dataframe with residue type, residue number, chain and the
        absolute and relative ('Ratio', in percent) SASA values as columns.
    """
    import freesasa as fs

    # Theoretical maximum accessibilities per residue type.
    # Miller: Miller et al. 1987 https://doi.org/10.1016/0022-2836(87)90038-6
    # Wilke: Tien et al. 2013 https://doi.org/10.1371/journal.pone.0080635
    # Sander: Sander & Rost 1994 https://doi.org/10.1002/prot.340200303
    dict_max_acc = {
        "Miller": {
            "ALA": 113.0, "ARG": 241.0, "ASN": 158.0, "ASP": 151.0,
            "CYS": 140.0, "GLN": 189.0, "GLU": 183.0, "GLY": 85.0,
            "HIS": 194.0, "ILE": 182.0, "LEU": 180.0, "LYS": 211.0,
            "MET": 204.0, "PHE": 218.0, "PRO": 143.0, "SER": 122.0,
            "THR": 146.0, "TRP": 259.0, "TYR": 229.0, "VAL": 160.0,
        },
        "Wilke": {
            "ALA": 129.0, "ARG": 274.0, "ASN": 195.0, "ASP": 193.0,
            "CYS": 167.0, "GLN": 225.0, "GLU": 223.0, "GLY": 104.0,
            "HIS": 224.0, "ILE": 197.0, "LEU": 201.0, "LYS": 236.0,
            "MET": 224.0, "PHE": 240.0, "PRO": 159.0, "SER": 155.0,
            "THR": 172.0, "TRP": 285.0, "TYR": 263.0, "VAL": 174.0,
            "MSE": 224.0, "SEC": 167.0,
        },
        "Sander": {
            "ALA": 106.0, "ARG": 248.0, "ASN": 157.0, "ASP": 163.0,
            "CYS": 135.0, "GLN": 198.0, "GLU": 194.0, "GLY": 84.0,
            "HIS": 184.0, "ILE": 169.0, "LEU": 164.0, "LYS": 205.0,
            "MET": 188.0, "PHE": 197.0, "PRO": 136.0, "SER": 130.0,
            "THR": 142.0, "TRP": 227.0, "TYR": 222.0, "VAL": 142.0,
        },
    }
    max_asa = dict_max_acc["Wilke"]

    def relative_area(area):
        # Percentage of the theoretical maximum for one residue
        kind = area.residueType
        if kind == 'GLY':
            # Glycine is rated by its main-chain area
            return 100 * area.mainChain / max_asa[kind]
        if kind not in max_asa:
            # Unknown residue type: average of the side-chain ratio taken
            # against every known theoretical maximum
            return np.average(
                [100 * area.sideChain / reference
                 for reference in max_asa.values()])
        if relative_type == 'sidechain':
            return 100 * area.sideChain / max_asa[kind]
        return 100 * area.total / max_asa[kind]

    if multichain:
        # One structure per chain, so neighbouring chains do not bury surface
        structures = fs.structureArray(pdbfile,
                                       options={"separate-chains": True})
        chains = [single.chainLabel(1) for single in structures]
        chain_pos = chains.index(chain)
        structure = structures[chain_pos]
        print("using {} separating chains {}".format(chain_pos, chains))
    else:
        # SASA of the unseparated structure
        structure = fs.Structure(pdbfile)

    print("Number of atoms of {}: {}".format(pdbfile, structure.nAtoms()))
    result = fs.calc(
        structure,
        fs.Parameters({'algorithm': fs.ShrakeRupley, 'n-points': 10000}))

    collected = {name: [] for name in (
        'Residue', 'Residue_num', 'Total', 'Apolar', 'Backbone', 'Sidechain',
        'Ratio')}
    for area in result.residueAreas()[chain].values():
        collected['Residue'].append(area.residueType)
        collected['Residue_num'].append(area.residueNumber)
        collected['Total'].append(area.total)
        collected['Apolar'].append(area.apolar)
        collected['Backbone'].append(area.mainChain)
        collected['Sidechain'].append(area.sideChain)
        collected['Ratio'].append(relative_area(area))

    df_sasa = pd.DataFrame({
        'Residue': collected['Residue'],
        'Residue_num': collected['Residue_num'],
        'Chain': chain,
        'Total': collected['Total'],
        'Apolar': collected['Apolar'],
        'Backbone': collected['Backbone'],
        'Sidechain': collected['Sidechain'],
        'Ratio': collected['Ratio'],
    })

    # Summary of the whole calculation on stdout
    area_class = fs.classifyResults(result, structure)
    print("Total : %.2f A2" % result.totalArea())
    for key, value in area_class.items():
        print(key, ": %.2f A2" % value)

    return df_sasa
import freesasa

# Per-atom SASA of 3lau.pdb, computed with the Lee-Richards algorithm.
structure = freesasa.Structure("3lau.pdb")
result = freesasa.calc(
    structure,
    freesasa.Parameters({
        'algorithm': freesasa.LeeRichards,
        'n-slices': 100
    }))
print(result.nAtoms())

# Atom indices are zero-based, so valid indices are 0 .. nAtoms() - 1.
# The context manager closes SASA.txt even if a lookup fails.
with open('SASA.txt', 'w+') as savedData:
    for i in range(result.nAtoms()):
        details = '(' + structure.atomName(i) + ',' + str(
            result.atomArea(i)) + ' )'
        print(details)
        savedData.write(details + '\n')

# Area summed per atom class, followed by the total.
area_classes = freesasa.classifyResults(result, structure)
print(area_classes)
print("Total : %.2f A2" % result.totalArea())
for key in area_classes:
    print(key, ": %.2f A2" % area_classes[key])
char_at_neutral = []
char_at_base = []

parser = argparse.ArgumentParser()
parser.add_argument("--infile", type=str, default="data/test.zip")
parser.add_argument("--model", type=str, default="model.pkl")
args = parser.parse_args()

#protein_parser = PDBParser()
with temppathlib.TemporaryDirectory() as tmpdir:
    # unzip the file with all the test PDBs
    with zipfile.ZipFile(args.infile, "r") as zip_:
        zip_.extractall(tmpdir.path)

    # polar, apolar and total SASA of every extracted PDB file
    for test_pdb in tmpdir.path.glob("*.pdb"):
        struct = freesasa.Structure(str(test_pdb))
        result = freesasa.calc(struct)
        areas_classes = freesasa.classifyResults(result, struct)
        # Look the classes up by name: the order of the returned dict is not
        # a documented guarantee, so positional access can swap the two.
        list_areas = [
            areas_classes["Polar"], areas_classes["Apolar"],
            result.totalArea()
        ]
        polar_area.append(list_areas[0])
        apolar_area.append(list_areas[1])
        total_area.append(list_areas[2])
print('done')

# NOTE(review): the archive is extracted a second time here, but nothing
# visible in this part of the script uses the extracted files — confirm.
with temppathlib.TemporaryDirectory() as tmpdir:
    # unzip the file with all the test PDBs
    with zipfile.ZipFile(args.infile, "r") as zip_:
        zip_.extractall(tmpdir.path)
def handle(self, *args, **options):
    """
    Compute per-residue angle measures for a set of structures.

    Selects the Structure objects whose protein family slug starts with
    "001" (refined structures excluded) and, for each of them, restricts the
    preferred chain to the residues carrying a generic number of the form
    ``<1-7>x<n>``, fits PCA axes through the seven residue groups and stores
    two angle measures in the atoms' B-factors. Depending on the names
    ``print_pdb``, ``SASA``, ``HSE`` and ``extra_pca`` (defined outside this
    method) it also writes coloured PDB files, computes per-residue SASA with
    freesasa, computes half-sphere exposure, and pickles all measures.

    Any exception raised while processing one structure is printed, the PDB
    code is recorded, and processing continues with the next structure. A
    summary of the failures is printed at the end.
    """

    def recurse(entity, slist):
        """
        Filter a pdb structure in a recursive way.

        entity: the pdb entity, a structure should be given on the top level
        slist: the list of filter criteria, one entry per level; children
               whose id is not in the entry of their level are detached.
        """
        for subenty in entity.get_list():
            if not subenty.id in slist[0]:
                entity.detach_child(subenty.id)
            elif slist[1:]:
                recurse(subenty, slist[1:])

    def cal_pseudo_CB(r):
        """
        Calculate a pseudo CB position for glycine
        (from the Bio.PDB FAQ).
        """
        a = r['CA'].get_vector()
        n = r['N'].get_vector() - a
        c = r['C'].get_vector() - a
        rot = pdb.rotaxis(-np.pi * 120.0 / 180.0, c)
        b = n.left_multiply(rot) + a
        return b.get_array()

    def pca_line(pca, h, r=0):
        """
        Fit the pca to h and return the first pc transformed back to
        the original coordinate system, as a pair of end points. The order
        of the end points depends on r and on the sign of the first
        transformed coordinate.
        """
        if ((not r) if pca.fit_transform(h)[0][0] < 0 else r):
            return pca.inverse_transform(
                np.asarray([[-20, 0, 0], [20, 0, 0]]))
        else:
            return pca.inverse_transform(
                np.asarray([[20, 0, 0], [-20, 0, 0]]))

    def calc_angle(b, c):
        """
        Calculate the angle between c, b and the orthogonal projection of b
        to the x axis. Returns one angle in degrees per row.
        """
        ba = -b
        bc = c + ba
        ba[:, 0] = 0
        return np.degrees(
            np.arccos(
                inner1d(ba, bc) /
                (np.linalg.norm(ba, axis=1) * np.linalg.norm(bc, axis=1))))

    def ca_cb_calc(ca, cb, pca):
        """
        Calculate the angles between ca, cb and center axis
        """
        return calc_angle(pca.transform(ca), pca.transform(cb))

    def axes_calc(h, p, pca):
        """
        Calculate the orthogonal projection of the CA to the helix axis
        which is moved to the mean of three consecutive amino acids
        """
        # Mean of each point with its two neighbours; the end points are
        # padded with themselves.
        a = (np.roll(np.vstack((h, h[0])), 1, axis=0)[:-1] + h + np.roll(
            np.vstack((h, h[-1])), -1, axis=0)[:-1]) / 3
        b = p.transform(h)
        b[:, 1:] = p.transform(a)[:, 1:]
        b = p.inverse_transform(b)
        return calc_angle(pca.transform(b), pca.transform(h))

    def set_bfactor(chain, angles):
        """
        Simple helper to set the bfactor of all atoms of each residue to the
        matching value of a list
        """
        for r, an in zip(chain.get_list(), angles):
            for a in r:
                a.set_bfactor(an)

    def qgen(x):
        """
        Helper function to slice the residues of one protein out of the
        list of the residues of all proteins. Note that it shortens the
        shared list qset as a side effect.
        """
        start = False
        for i in range(len(qset) - 1, 0, -1):
            if not start and qset[i].protein_conformation.protein == x:
                start = i
            if start and qset[i].protein_conformation.protein != x:
                if start != len(qset) - 1:
                    del qset[start + 1:]
                    return qset[i + 1:]
                return qset[i + 1:]
        del qset[start + 1:]
        return qset

    # PDB codes of the structures whose processing raised an exception
    failed = []

    # get preferred chain for PDB-code
    references = Structure.objects.filter(
        protein_conformation__protein__family__slug__startswith="001"
    ).exclude(refined=True).prefetch_related(
        'pdb_code', 'pdb_data',
        'protein_conformation').order_by('protein_conformation__protein')
    references = list(references)

    pids = [ref.protein_conformation.protein.id for ref in references]

    # All residues of those proteins with a generic number like "3x50",
    # in descending order so that qgen can slice them from the end.
    qset = Residue.objects.filter(
        protein_conformation__protein__id__in=pids)
    qset = qset.filter(
        generic_number__label__regex=r'^[1-7]x[0-9]+').order_by(
            '-protein_conformation__protein', '-generic_number__label')
    qset = list(
        qset.prefetch_related('generic_number', 'protein_conformation'))

    # PDB code -> residues of the matching protein
    res_dict = {
        ref.pdb_code.index: qgen(ref.protein_conformation.protein)
        for ref in references
    }

    #######################################################################
    ######################### Start of main loop ##########################
    #######################################################################

    for reference in references:

        preferred_chain = reference.preferred_chain.split(',')[0]
        pdb_code = reference.pdb_code.index
        state_id = reference.protein_conformation.state.id

        try:

            print(pdb_code)
            structure = self.load_pdb_var(pdb_code, reference.pdb_data.pdb)
            pchain = structure[0][preferred_chain]

            #######################################################################
            ###################### prepare and evaluate query #####################

            db_reslist = res_dict[pdb_code]

            #######################################################################
            ######################### filter data from db #########################

            def reslist_gen(x):
                # Pops residues off the end of db_reslist for as long as
                # their generic number label starts with x.
                try:
                    while db_reslist[-1].generic_number.label[0] == x:
                        yield db_reslist.pop()
                except IndexError:
                    pass

            # when gdict is not needed the helper can be removed
            #db_tmlist = [[(' ',r.sequence_number,' ') for r in reslist_gen(x) if r.sequence_number in pchain and r.sequence_number < 1000] for x in ["1","2","3","4","5","6","7"]]
            db_helper = [[
                (r.generic_number.label, r.sequence_number)
                for r in reslist_gen(x)
                if r.sequence_number in pchain and r.sequence_number < 1000
            ] for x in ["1", "2", "3", "4", "5", "6", "7"]]
            # sequence number -> generic number label
            gdict = {r[1]: r[0] for hlist in db_helper for r in hlist}
            # residue ids in the (hetero flag, number, insertion code) form
            db_tmlist = [[(' ', r[1], ' ') for r in sl] for sl in db_helper]
            db_set = set(db_tmlist[0] + db_tmlist[1] + db_tmlist[2] +
                         db_tmlist[3] + db_tmlist[4] + db_tmlist[5] +
                         db_tmlist[6])

            #######################################################################
            ############################# filter pdb ##############################

            # Keep model 0, the preferred chain and the residues in db_set.
            # preferred_chain is a string, so the chain level is tested with
            # string membership.
            recurse(structure, [[0], preferred_chain, db_set])

            #######################################################################
            ############### Calculate the axes through the helices ################
            #######################################################################
            N = 3

            hres_list = [
                np.asarray([pchain[r]["CA"].get_coord() for r in sl],
                           dtype=float) for sl in db_tmlist
            ]
            # Residues without a CB atom get a computed pseudo CB instead
            h_cb_list = [
                np.asarray([
                    pchain[r]["CB"].get_coord()
                    if "CB" in pchain[r] else cal_pseudo_CB(pchain[r])
                    for r in sl
                ],
                           dtype=float) for sl in db_tmlist
            ]

            # fast and fancy way to take the average of N consecutive elements
            hres_three = np.asarray([
                sum([h[i:-(len(h) % N) or None:N] for i in range(N)]) / N
                for h in hres_list
            ])

            #######################################################################
            ################################# PCA #################################
            #######################################################################

            helix_pcas = [PCA() for i in range(7)]
            # Called for its side effect of fitting each PCA; the returned
            # lines are discarded.
            [
                pca_line(helix_pcas[i], h, i % 2)
                for i, h in enumerate(hres_three)
            ]

            # extracellular part
            if extra_pca:
                helices_mn = np.asarray(
                    [np.mean(h, axis=0) for h in hres_three])
                pos_list = np.asarray([
                    pca_line(PCA(), h[:len(h) // 2:(-(i % 2) or 1)])
                    for i, h in enumerate(hres_three)
                ])
                pos_list = pos_list - (np.mean(pos_list, axis=1) -
                                       helices_mn).reshape(-1, 1, 3)

                pca = PCA()
                pca_line(pca, np.vstack(pos_list))
            else:
                pca = PCA()
                pca_line(pca, np.vstack(hres_three))

            #######################################################################
            ################################ Angles ###############################
            #######################################################################

            ########################### Axis to CA to CB ##########################

            angle = np.concatenate([
                ca_cb_calc(ca, cb, pca)
                for ca, cb in zip(hres_list, h_cb_list)
            ])

            set_bfactor(pchain, angle)

            if print_pdb:
                self.save_pdb(structure,
                              pdb_code + 'angle_colored_ca_cb.pdb')

            ######################### Axis to Axis to CA ##########################

            angle2 = np.concatenate([
                axes_calc(h, p, pca)
                for h, p in zip(hres_list, helix_pcas)
            ])

            set_bfactor(pchain, angle2)
            if print_pdb:
                self.save_pdb(structure, pdb_code + 'angle_colored_axes.pdb')

            ################################ SASA #################################
            if SASA:
                # NOTE(review): this reads a file from pymol_output/ that is
                # presumably the one save_pdb wrote just above, which only
                # happens when print_pdb is set — confirm.
                pdbstruct = freesasa.Structure("pymol_output/" + pdb_code +
                                               'angle_colored_axes.pdb')
                res = freesasa.calc(pdbstruct)

                # Sum the atom areas of consecutive atoms sharing a residue
                # number into one value per residue.
                asa_list = []
                oldnum = -1
                for i in range(res.nAtoms()):
                    resnum = pdbstruct.residueNumber(i)
                    if resnum == oldnum:
                        asa_list[-1] += res.atomArea(i)
                    else:
                        asa_list.append(res.atomArea(i))
                        oldnum = resnum

                set_bfactor(pchain, asa_list)
                if print_pdb:
                    self.save_pdb(structure, pdb_code + 'asa_colored.pdb')

            ################################# HSE #################################
            if HSE:
                hse = pdb.HSExposure.HSExposureCB(structure[0])
                # Store the second component of each exposure entry in the
                # B-factor of every atom of the residue.
                [[a.set_bfactor(x[1][1]) for a in x[0]] for x in hse]

                if print_pdb:
                    self.save_pdb(structure, pdb_code + 'hsea_colored.pdb')

            ############################### pickle ################################
            if HSE and SASA:
                reslist = []
                grslist = []
                hse = []
                for r in pchain:
                    reslist.append(r.id[1])
                    grslist.append(gdict[r.id[1]])
                    # B-factors hold the value written by the HSE step above
                    hse.append(r["CA"].get_bfactor())

                with open('pymol_output/' + pdb_code + '_measures.pickle',
                          'wb') as handle:
                    pickle.dump(
                        (np.array(reslist), grslist, np.array(asa_list),
                         np.array(hse), angle, angle2, state_id), handle)

            #Angle.objects.bulk_create([Angle(residue=gdict[res.id[1]], angle=res["CA"].get_bfactor(), structure=reference) for res in pchain])

        except Exception as e:
            # Best effort: report the failure and go on with the next
            # structure.
            print("ERROR!!", pdb_code, e)
            failed.append(pdb_code)
            continue

    print(len(failed), "of", len(references), "failed:", failed)
def _get_item_src(self, decoy):
    """
    Build the residue feature matrix and the GDT_TS label of one decoy.

    decoy: str, path to the decoy (PDB format)

    Returns a tuple ``(pc, gdt_ts)``: ``pc`` is an array of shape
    ``(self.npoints, self.num_channel())`` whose first rows hold the
    centred residues and whose remaining rows are filled with ``-inf``;
    ``gdt_ts`` is read from the ``list.dat`` file next to the decoy
    (0.0 when the decoy is not listed).

    Raises ValueError if the decoy contains no ATOM record.
    """
    atom_to_num = {
        "C": 1,
        "N": 2,
        "O": 3,
        "S": 4
    }
    residues = []
    atom_positions = self.create_atom_positions()
    residue = self.build_residue()
    structure = fs.Structure(decoy)
    solvent_access = fs.calc(structure)
    with open(decoy, "r") as f:
        # Skip the header up to the first ATOM record. readline() returns ''
        # only at end of file, so test the raw line to avoid looping forever
        # on a file without any ATOM record.
        raw_line = f.readline()
        while raw_line and not raw_line.startswith("ATOM"):
            raw_line = f.readline()
        if not raw_line:
            raise ValueError(
                "no ATOM record found in {}".format(decoy))
        line = raw_line.rstrip()
        cur_resi = int(line[22:26])
        # PDB file standard format
        # COLUMNS        DATA TYPE       FIELD
        # -------------------------------------------
        #  1 -  6        Record name     "ATOM  "
        #  7 - 11        Integer         Atom serial #
        # 13 - 16        Atom            Atom name
        # 17             Character       Alternate location
        # 18 - 20        Residue name    resName
        # 22             Character       chainID
        # 23 - 26        Integer         resSeq
        # 27             AChar           Code for insertion of residues
        # 31 - 38        Real(8.3)       x
        # 39 - 46        Real(8.3)       y
        # 47 - 54        Real(8.3)       z
        # 55 - 60        Real(6.2)       occupancy
        # 61 - 66        Real(6.2)       tempFactor
        # 77 - 78        LString(2)      element
        # 79 - 80        LString(2)      Charge on the atom
        # The loop ends at end of file, at a line that is empty after
        # stripping, or at the first TER record.
        while line:
            if line.startswith("TER"):
                break
            if not line.startswith("ATOM"):
                line = f.readline().rstrip()
                continue
            # ignore hydrogens (last character of the stripped line is taken
            # as the element)
            atom_type = line[-1]
            if atom_type == "H":
                line = f.readline().rstrip()
                continue
            resi_num = int(line[22:26])
            if resi_num > cur_resi:
                # A new residue starts: store the finished one.
                residues.append(residue)
                if len(residues) == 400:
                    break
                residue = self.build_residue()
                cur_resi = resi_num
            residue = self._put_atom_src(
                line.rstrip(), residue, solvent_access,
                atom_positions, atom_to_num)
            line = f.readline().rstrip()
        # NOTE(review): the residue being filled when the loop ends is never
        # appended, so the last residue of the decoy is dropped — confirm
        # whether this is intended.

    # normalize residues
    pc = np.ones((self.npoints, self.num_channel())) * float("-inf")
    residues = np.array(residues)
    logging.debug("decoy shape: {}".format(residues.shape))
    # Centre every atom's x/y/z on the mean of columns 1-3
    x_mean = np.mean(residues[:, 1])
    y_mean = np.mean(residues[:, 2])
    z_mean = np.mean(residues[:, 3])
    for i in range(self.num_channel() // self.ATTRIBUTES_EACH_ATOM):
        residues[:, self.ATTRIBUTES_EACH_ATOM*i+1] -= x_mean
        residues[:, self.ATTRIBUTES_EACH_ATOM*i+2] -= y_mean
        residues[:, self.ATTRIBUTES_EACH_ATOM*i+3] -= z_mean
    pc[0:residues.shape[0], :] = residues

    # Look the decoy up in the list.dat of its directory
    target_path = os.path.dirname(decoy)
    decoy_name = os.path.basename(decoy)
    gdt_ts = 0.0
    with open(os.path.join(target_path, "list.dat"), "r") as lst:
        for info in lst:
            if info.startswith(decoy_name):
                gdt_ts = float(
                    info.split()[CASPDataset.list_dat["gdt_ts"]])
                break
    return pc, gdt_ts