def featurize(structure: Structure) -> list[Any]:
    """
    Build the 3D ML feature vector for `structure`.

    The surface-area features are computed with freesasa from the file named
    by `pdbpath`, which is read from the enclosing scope (it is not a
    parameter of this function). The geometric features come from the "CA"
    atoms of the residues of `structure`.

    Returns:
        ``[total_area, polar_apolar, protein_length, seq_length, angle]``:

        * ``total_area`` -- one-element list holding ``result.totalArea()``.
        * ``polar_apolar`` -- per-class areas, in the order the classes are
          reported by ``freesasa.classifyResults``.
        * ``protein_length`` -- ``residues[1]["CA"] - residues[-2]["CA"]``.
        * ``seq_length`` -- one-element list holding the residue count.
        * ``angle`` -- ``calc_dihedral`` over the CA vectors of residues
          1, 2, -3 and -2.
    """
    # Solvent-accessible surface area of the structure file.
    sasa_structure = freesasa.Structure(pdbpath)
    sasa_result = freesasa.calc(sasa_structure)
    class_areas = freesasa.classifyResults(sasa_result, sasa_structure)

    total_area = [sasa_result.totalArea()]
    polar_apolar = [class_areas[class_name] for class_name in class_areas]

    # Residue-level features.
    residues = list(structure.get_residues())
    seq_length = [len(residues)]

    # The first and last residues are skipped on purpose: the end points are
    # taken one residue in from each terminus.
    second_ca = residues[1]["CA"]
    penultimate_ca = residues[-2]["CA"]
    protein_length = second_ca - penultimate_ca

    angle = calc_dihedral(
        second_ca.get_vector(),
        residues[2]["CA"].get_vector(),
        residues[-3]["CA"].get_vector(),
        penultimate_ca.get_vector(),
    )

    return [total_area, polar_apolar, protein_length, seq_length, angle]
def get_area_classes(file):
    """
    Compute the polar, apolar and total solvent-accessible area of a structure.

    Args:
        file: Path of the structure file passed to ``freesasa.Structure``.

    Returns:
        A three-element list ``[polar, apolar, total]``. A class that
        freesasa did not report for this structure contributes ``0.0``.
    """
    struct = freesasa.Structure(file)
    result = freesasa.calc(struct)
    area_classes = freesasa.classifyResults(result, struct)

    # Look the classes up by name instead of by position: positional indexing
    # of ``values()`` depends on the dict's insertion order, which can swap
    # the polar and apolar entries (or raise IndexError when only one class
    # is present).
    return [
        area_classes.get('Polar', 0.0),
        area_classes.get('Apolar', 0.0),
        result.totalArea(),
    ]
def calculate_SAS(temp_dict, pdb_path, seq_len):
    """
    Store polar/apolar surface areas and their per-length fraction in `temp_dict`.

    Args:
        temp_dict: Mapping updated in place with the keys ``"Polar"``,
            ``"Apolar"`` and ``"SASA Fraction"``.
        pdb_path: Location of the structure file; converted with ``str()``
            before being handed to freesasa.
        seq_len: Divisor for the summed polar and apolar area.

    Returns:
        None. The result is written into `temp_dict`.
    """
    sasa_structure = freesasa.Structure(str(pdb_path))
    sasa_result = freesasa.calc(sasa_structure)
    class_areas = freesasa.classifyResults(sasa_result, sasa_structure)

    polar_area, apolar_area = class_areas['Polar'], class_areas['Apolar']

    # The dict literal is fully evaluated before ``update`` runs, so a failing
    # division leaves `temp_dict` untouched.
    temp_dict.update({
        "Polar": polar_area,
        "Apolar": apolar_area,
        "SASA Fraction": (polar_area + apolar_area) / seq_len,
    })
# Command-line interface: --infile is a zip archive of PDB files, --model a
# model file path (not used in the statements visible here).
parser = argparse.ArgumentParser()
parser.add_argument("--infile", type=str, default="data/test.zip")
parser.add_argument("--model", type=str, default="model.pkl")
args = parser.parse_args()
#protein_parser = PDBParser()
# First pass over the archive: collect polar, apolar and total surface area
# for every extracted PDB file.
with temppathlib.TemporaryDirectory() as tmpdir:
    # unzip the file with all the test PDBs
    with zipfile.ZipFile(args.infile, "r") as zip_:
        zip_.extractall(tmpdir.path)
    for test_pdb in tmpdir.path.glob("*.pdb"):
        struct = freesasa.Structure(str(test_pdb))
        result = freesasa.calc(struct)
        areas_classes = freesasa.classifyResults(result, struct)
        # Positions 0 and 1 of the class dict are treated as polar and apolar
        # below; NOTE(review): this relies on the dict's ordering -- confirm.
        list_areas = [(list(areas_classes.values())[0]), (list(areas_classes.values())[1]), result.totalArea()]
        # polar_area / apolar_area / total_area are lists defined outside the
        # statements visible here.
        polar_area.append(list_areas[0])
        apolar_area.append(list_areas[1])
        total_area.append(list_areas[2])
    print('done')
# Second pass: the archive is extracted again into a fresh temporary directory.
with temppathlib.TemporaryDirectory() as tmpdir:
    # unzip the file with all the test PDBs
    with zipfile.ZipFile(args.infile, "r") as zip_:
        zip_.extractall(tmpdir.path)
    # NOTE(review): the body of this loop is not visible in this excerpt.
    for test_pdb in tmpdir.path.glob("*.pdb"):
def openfile():
    """
    Ask the user for a PDB file, then print and store sequence-window features.

    Steps, in order:
      1. Open a Tk file dialog and parse the chosen file with ``PDBParser``.
      2. Build peptides with ``PPBuilder``; the sequence of the last peptide
         visited becomes ``m``.
      3. Print whole-sequence statistics from ``ProteinAnalysis``.
      4. Slide a 10-residue window over ``m`` and append the first
         secondary-structure fraction of each window to ``sec``.
      5. Run freesasa on the hard-coded file ``"1e6j.pdb"`` for the residue
         ranges ``1-10``, ``2-11``, ... and collect the selected areas.

    Nothing is returned; results are published through the module-level
    names listed in the ``global`` statements.
    """
    global prob, probab, te
    global my_seq
    global anti
    global structure, structure_id, filename
    global antigenicity, hydro, flex, sec
    global m, a, c, b, length, j, k
    global hydroph, flexi, access
    anti = []
    sec = []
    probab = []
    from tkinter import filedialog
    root = Tk()
    root.filename = filedialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdb files", "*.pdb"), ("pdb files", "*.pdb")))
    filename = root.filename
    print(filename)
    # The structure id is fixed; only the file path comes from the dialog.
    structure_id = "1e6j"
    structure = PDBParser().get_structure(structure_id, root.filename)
    ppb = PPBuilder()
    # my_seq is overwritten on every iteration, so only the last peptide's
    # sequence survives the loop.
    for pp in ppb.build_peptides(structure):
        my_seq = pp.get_sequence()  # type: Seq
        print(my_seq)
    for model in structure:
        for chain in model:
            print(chain)
    sequence = list(my_seq)
    m = ''.join(sequence)
    print(m)
    length = len(m)  # type: int
    print("Sequence consist of", length, "Amino Acids")
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    analysed_seq = ProteinAnalysis(m)
    print("Molecular weight = ", analysed_seq.molecular_weight())
    print("Amino Acid Count = ", analysed_seq.count_amino_acids())
    print("Secondary structure fraction =",
          analysed_seq.secondary_structure_fraction())
    # Per-residue scale passed to protein_scale below (presumably the
    # Kyte-Doolittle hydropathy scale -- confirm).
    kd = {
        'A': 1.8,
        'R': -4.5,
        'N': -3.5,
        'D': -3.5,
        'C': 2.5,
        'Q': -3.5,
        'E': -3.5,
        'G': -0.4,
        'H': -3.2,
        'I': 4.5,
        'L': 3.8,
        'K': -3.9,
        'M': 1.9,
        'F': 2.8,
        'P': -1.6,
        'S': -0.8,
        'T': -0.7,
        'W': -0.9,
        'Y': -1.3,
        'V': 4.2
    }
    # c / flex / flexi hold the same flexibility profile, and b / hydro /
    # hydroph the same protein_scale(kd, 10, 1.0) profile, under different
    # global names.
    c = list(analysed_seq.flexibility())
    b = list(analysed_seq.protein_scale(kd, 10, 1.0))
    hydro = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flex = list(analysed_seq.flexibility())
    hydroph = list(analysed_seq.protein_scale(kd, 10, 1.0))
    flexi = list(analysed_seq.flexibility())
    # Window bookkeeping: the current window is m[j + 1:k + 1], i.e. 10
    # residues starting at index j + 1; i counts windows starting from 1.
    i = 1
    j = -1  # type: int
    k = 9
    while i <= (length - 10):
        print("Sequence is = ", m[j + 1:k + 1])
        print("Flexibility value = ", c[j + 1])
        print("Hydrophilicity value = ", b[j + 1])
        ana_seq = ''.join(m[j + 1:k + 1])
        analyze_seq = ProteinAnalysis(ana_seq)
        # For Secondary structure Analysis
        print("Secondary structure fraction =",
              analyze_seq.secondary_structure_fraction())
        a = list(analyze_seq.secondary_structure_fraction())
        # Only the first of the returned fractions is kept for this window.
        a = a[0]
        sec.append(a)
        i += 1
        j += 1
        k += 1
    f = length
    # r-y is the 1-based residue range of the freesasa selection; both ends
    # advance by one per iteration.
    r = 1
    y = 10
    global acc, logacc
    acc = []
    # NOTE(review): the nesting of the freesasa calls inside this loop was
    # reconstructed from context (acc is later sampled every second element);
    # confirm against the original layout.
    for i in range(0, f):
        str1 = "accessibility, resi "
        str2 = str(r) + "-" + str(y)
        saving = str1 + str2
        print(saving)
        r = r + 1
        y = y + 1
        # The structure is re-read from a hard-coded path on every iteration,
        # not from the file chosen in the dialog; this also rebinds the
        # global `structure`.
        structure = freesasa.Structure("1e6j.pdb")
        resulta = freesasa.calc(structure)
        area_classes = freesasa.classifyResults(resulta, structure)
        print("Total : %.2f A2" % resulta.totalArea())
        for key in area_classes:
            print(key, ": %.2f A2" % area_classes[key])
        # Recompute with explicit Lee-Richards parameters; this result is the
        # one used for the selections below.
        resulta = freesasa.calc(
            structure,
            freesasa.Parameters({
                'algorithm': freesasa.LeeRichards,
                'n-slices': 10
            }))
        selections = freesasa.selectArea(('alanine, resn ala', saving),
                                         structure, resulta)
        for key in selections:
            print(key, ": %.2f A2" % selections[key])
            a = selections[key]
            acc.append(a)
    # Two selections are appended per iteration; keep every second value,
    # starting with the first.
    l = acc[0::2]
    access = l
    print(acc)
    print(l)
    # Base-10 logarithm of each kept area; math.log raises ValueError for 0.
    logacc = [math.log(y, 10) for y in l]
    print(logacc)
def ClassifySASA(self, SASA, SASAStruct):
    """
    Break a computed SASA result down into freesasa's area classes.

    Args:
        SASA: Result object passed as the first argument of
            ``freesasa.classifyResults``.
        SASAStruct: Structure object passed as the second argument.

    Returns:
        Whatever ``freesasa.classifyResults`` returns for the pair.
    """
    return freesasa.classifyResults(SASA, SASAStruct)
def calculate_sasa(pdbfile, chain, multichain=True, relative_type='sidechain'):
    """
    Compute per-residue solvent-accessible surface area (SASA) with freesasa.

    :param pdbfile: String of PDB file name.
    :param chain: String or List of chain identifiers.
        NOTE(review): the value is used as a single key
        (``chains.index(chain)`` and ``res[chain]``), so a list looks
        unsupported -- confirm.
    :param multichain: Boolean. True to separate chains, which allows the SASA
        of a single unattached monomer to be calculated. False to calculate
        SASA for the structure 'as-is'.
    :param relative_type: ``'sidechain'`` divides the side-chain area by the
        theoretical maximum; any other value divides the total area instead.
        Glycine always uses its main-chain area, and residue types missing
        from the table use the average side-chain ratio over all table entries.
    :return: Pandas Dataframe with the columns Residue, Residue_num, Chain,
        Total, Apolar, Backbone, Sidechain and Ratio (Ratio is in percent).
    """
    import freesasa as fs

    # Theoretical maximum accessibilities per residue type.
    # Miller max acc: Miller et al. 1987 https://doi.org/10.1016/0022-2836(87)90038-6
    # Wilke: Tien et al. 2013 https://doi.org/10.1371/journal.pone.0080635
    # Sander: Sander & Rost 1994 https://doi.org/10.1002/prot.340200303
    max_acc_tables = {
        "Miller": {
            "ALA": 113.0, "ARG": 241.0, "ASN": 158.0, "ASP": 151.0,
            "CYS": 140.0, "GLN": 189.0, "GLU": 183.0, "GLY": 85.0,
            "HIS": 194.0, "ILE": 182.0, "LEU": 180.0, "LYS": 211.0,
            "MET": 204.0, "PHE": 218.0, "PRO": 143.0, "SER": 122.0,
            "THR": 146.0, "TRP": 259.0, "TYR": 229.0, "VAL": 160.0,
        },
        "Wilke": {
            "ALA": 129.0, "ARG": 274.0, "ASN": 195.0, "ASP": 193.0,
            "CYS": 167.0, "GLN": 225.0, "GLU": 223.0, "GLY": 104.0,
            "HIS": 224.0, "ILE": 197.0, "LEU": 201.0, "LYS": 236.0,
            "MET": 224.0, "PHE": 240.0, "PRO": 159.0, "SER": 155.0,
            "THR": 172.0, "TRP": 285.0, "TYR": 263.0, "VAL": 174.0,
            "MSE": 224.0, "SEC": 167.0,
        },
        "Sander": {
            "ALA": 106.0, "ARG": 248.0, "ASN": 157.0, "ASP": 163.0,
            "CYS": 135.0, "GLN": 198.0, "GLU": 194.0, "GLY": 84.0,
            "HIS": 184.0, "ILE": 169.0, "LEU": 164.0, "LYS": 205.0,
            "MET": 188.0, "PHE": 197.0, "PRO": 136.0, "SER": 130.0,
            "THR": 142.0, "TRP": 227.0, "TYR": 222.0, "VAL": 142.0,
        },
    }
    max_asa = max_acc_tables["Wilke"]

    if multichain:
        # Split the file into one structure per chain and pick the requested
        # one, so the chain is measured as an isolated monomer.
        structures = fs.structureArray(pdbfile, options={"separate-chains": True})
        chains = [single.chainLabel(1) for single in structures]
        chain_pos = chains.index(chain)
        structure = structures[chain_pos]
        print("using {} separating chains {}".format(chain_pos, chains))
    else:
        # Measure the structure exactly as stored in the file.
        structure = fs.Structure(pdbfile)

    print("Number of atoms of {}: {}".format(pdbfile, structure.nAtoms()))
    result = fs.calc(structure, fs.Parameters({'algorithm': fs.ShrakeRupley, 'n-points': 10000}))

    def relative_area(area):
        # Percentage of the theoretical maximum for one residue.
        kind = area.residueType
        if kind == 'GLY':
            # Glycine has no side chain, so its main-chain area is used.
            return 100 * area.mainChain / max_asa[kind]
        if kind not in max_asa:
            # Unknown residue type: average the side-chain ratio over every
            # known theoretical maximum.
            return np.average([100 * area.sideChain / limit for limit in max_asa.values()])
        if relative_type == 'sidechain':
            return 100 * area.sideChain / max_asa[kind]
        return 100 * area.total / max_asa[kind]

    residue_areas = list(result.residueAreas()[chain].values())

    df_sasa = pd.DataFrame({
        'Residue': [area.residueType for area in residue_areas],
        'Residue_num': [area.residueNumber for area in residue_areas],
        'Chain': chain,
        'Total': [area.total for area in residue_areas],
        'Apolar': [area.apolar for area in residue_areas],
        'Backbone': [area.mainChain for area in residue_areas],
        'Sidechain': [area.sideChain for area in residue_areas],
        'Ratio': [relative_area(area) for area in residue_areas],
    })

    # Print a summary of the whole calculation, broken down by area class.
    class_totals = fs.classifyResults(result, structure)
    print("Total : %.2f A2" % result.totalArea())
    for class_name in class_totals:
        print(class_name, ": %.2f A2" % class_totals[class_name])

    return df_sasa
def main(args):
    """
    Collect the protein atoms around each halide atom and append them to TSV files.

    Attributes read from `args`: ``output_dir``, ``output_file_name``,
    ``halide``, ``input_type`` (``'pdb_id'`` or ``'structure'``), ``input``,
    ``C`` (neighbour filter mode, 1-4) and ``angstrem_radius``.

    For every selected entry of ``filtered_{halide}.txt`` the structure is
    loaded, its resolution and total SASA are determined, and for each HETATM
    row whose atom name equals ``args.halide`` the ATOM rows within
    ``args.angstrem_radius`` are gathered. The rows are written to
    ``{output_file_name}_HIGH|MODERATE|LOW.tsv`` depending on the resolution.
    Finally ``pdb_one_halide.txt`` next to this script is deleted.
    """
    # Best-effort creation of the output directory; any failure (including
    # "already exists") is ignored.
    try:
        os.makedirs(args.output_dir)
    except:
        pass
    with open(f'../data/filtered_pdb_ID/filtered_{args.halide}.txt', 'r') as f:
        text=f.readlines()
    base_list=[]
    base_list+=text
    # Strip trailing whitespace and drop empty lines.
    base_list=[line.rstrip() for line in base_list]
    base_list=[i for i in base_list if i !='']
    print (base_list)
    # Only every third entry, starting at index 2, is used as an identifier.
    for i in range(2,len(base_list),3):
        if args.input_type == 'pdb_id':
            struct = PandasPdb().fetch_pdb(f'{base_list[i]}')
            model_name = args.input
            # NOTE(review): the bare identifier is handed to freesasa as a
            # file path here -- confirm this is intended.
            structure= freesasa.Structure(f'{base_list[i]}')
        elif args.input_type == 'structure':
            struct = PandasPdb()
            struct = struct.read_pdb(f'{args.input}/pdb{base_list[i].lower()}.ent')
            print(f'{args.input}/pdb{base_list[i].lower()}.ent')
            # The model name is the last run of word characters in the header.
            model_name = re.search('[\d\w]+$', struct.header).group()
            structure= freesasa.Structure(f'{args.input}/pdb{base_list[i].lower()}.ent')
        # Resolution from the "REMARK 2 RESOLUTION." record; 100 is the
        # fallback when it cannot be parsed, which sorts the entry into LOW.
        try:
            resolution = float(re.search("REMARK\s+2\s+RESOLUTION\.\s+([\d\(.)]+)\s\w+", struct.pdb_text).group(1))
        except:
            resolution = 100
        print (resolution)
        result= freesasa.calc(structure)
        classArea =freesasa.classifyResults(result,structure)
        print(result.totalArea())
        halide_atoms = struct.df['HETATM'][struct.df['HETATM']['atom_name'] == args.halide]
        modern_df=struct.df['ATOM']
        # make the subset
        dict_of_subsets = {}
        # NOTE: this loop variable reuses the name `i` of the enclosing loop;
        # here `i` is one HETATM row.
        for i in halide_atoms.values:
            global Coordinate
            Coordinate=[] # list of coordinates: [0] = halide coordinates, [1] = the nearest atom's coordinates
            dist = struct.distance(xyz=tuple(i[11:14]), records=('ATOM'))
            modern_df['dist']=dist # add distance to subset
            # args.C selects which neighbours inside the radius are kept.
            if args.C == 1:
                modern_subset =modern_df[modern_df.dist < args.angstrem_radius] # halide neighbors
            elif args.C == 2:
                # drop atoms whose element symbol is C or H
                modern_df1=modern_df[modern_df.dist < args.angstrem_radius]
                modern_subset=modern_df1.loc[~modern_df1['element_symbol'].isin(['C','H'])]
            elif args.C == 3:
                # drop atoms whose element symbol is C
                modern_df1 =modern_df[modern_df.dist < args.angstrem_radius]
                modern_subset=modern_df1.loc[~modern_df1['element_symbol'].isin(['C'])]
            elif args.C == 4:
                # drop atoms by name: 'C' followed by one letter of 'BDEGHZ'
                # or by two distinct characters of 'BDEGHZ123', plus 'O' and 'C'
                modern_df1 =modern_df[modern_df.dist < args.angstrem_radius]
                atoms=list(map("".join,itertools.permutations('BDEGHZ',1)))
                atoms1=list(map("".join,itertools.permutations('BDEGHZ123',2)))
                atoms2=atoms+atoms1
                atom2=['C'+ atom for atom in atoms2] +['O','C']
                modern_subset=modern_df1.loc[~modern_df1['atom_name'].isin(atom2)]
            xyz=i[11:14] # halide coordinate
            Coordinate+=[xyz] # add coordinates
            # Skip this halide when no nearest atom can be determined (for
            # example when the subset is empty).
            try:
                nearest=modern_subset.loc[modern_subset['dist']==min(modern_subset['dist'])].values[0][[3,5,11,12,13,20,21]] #define the nearest atom
                Coordinate+=[nearest[2:5]] # add the nearest atom's coordinates
            except:
                continue
            for n in modern_subset.values:
                xyz2= n[11:14]
                Coordinate+=[xyz2] # add coordinates
            modern_subset1=modern_subset.copy(deep=True)
            modern_subset1['angles']=ang(Coordinate) # add angles to subset
            #modern_subset1=modern_subset1.loc[modern_subset1['angles'] != 0] # delete rows with angles=0
            #{nearest[0]}:{nearest[1]}:{"%.3f"% nearest[6]}:{np.nan}
            # Key: model name, total SASA, resolution, column 3 of the halide
            # row and element 5 of `nearest`; value: one
            # "col3:col5:col21:col22" string per neighbour row.
            dict_of_subsets[f'{model_name}:{"%.3f"% result.totalArea()}:{resolution}:{i[3]}:{nearest[5]}'] =\
                [(f'{j[3]}:{j[5]}:{"%.3f"% j[21]}:{"%.3f"% j[22]}') for j in modern_subset1.values]
        # Append one line per halide: the key, a tab, then the comma-separated
        # neighbour strings.
        def write_output(sfx):
            with open(f'{args.output_dir}/{args.output_file_name}_{sfx}.tsv', 'a') as w:
                for k,v in dict_of_subsets.items():
                    w.write(f'{k}\t')
                    for i in range(len(v)):
                        # no trailing comma after the last element
                        if i == len(v)-1:
                            w.write(f'{v[i]}')
                        else:
                            w.write(f'{v[i]},')
                    w.write('\n')
        # Route the output by resolution class.
        if resolution <= 1.5:
            write_output('HIGH')
        elif resolution > 1.5 and resolution < 2.5:
            write_output('MODERATE')
        else:
            write_output('LOW')
    # Remove the helper file located next to this script; os.remove raises if
    # the file does not exist.
    path=os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pdb_one_halide.txt')
    os.remove(path)