def get_surface_residues(filename, my_acc_array, my_threshold=0.2):
    """Return the solvent-exposed residues of a PDB file according to DSSP.

    A residue counts as exposed when its relative accessible surface area
    (element 3 of the DSSP tuple) is greater than or equal to
    ``my_threshold``.

    Parameters
    ----------
    filename : str
        Path of the PDB file; it is parsed and also handed to DSSP.
    my_acc_array : str
        Forwarded to ``DSSP(..., acc_array=...)``.
    my_threshold : float, optional
        Minimum relative ASA for a residue to be reported. Default is 0.2.

    Returns
    -------
    set of str
        One entry per exposed residue, built as residue number followed by
        chain id (e.g. ``"42A"``).
    """
    parser = PDBParser(PERMISSIVE=1)
    structure = parser.get_structure("code.pdb", filename)
    model = structure[0]
    dssp = DSSP(model, filename, dssp='mkdssp', acc_array=my_acc_array)
    sys.stderr.write("\nHandled %i residues\n" % len(dssp))

    residue_number = set()
    # Walk the keys directly instead of translating the DSSP index back to a
    # position in the key list: the DSSP index is not a reliable position
    # once the numbering has gaps.
    for key in dssp.keys():
        rel_asa = dssp[key][3]
        if isinstance(rel_asa, str):
            # Sometimes the relative ASA is the string "NA".
            continue
        if rel_asa >= my_threshold:
            chain_id, res_id = key
            residue_number.add(str(res_id[1]) + chain_id)
    return residue_number
def _run(self, pdbcode, chains):
    """Run DSSP on ``pdb<pdbcode>.ent`` and return the rows of selected chains.

    Parameters
    ----------
    pdbcode : str
        PDB code used to build the file name under ``PDB_PATH``.
    chains : container of str
        Chain ids whose DSSP rows are returned.

    Returns
    -------
    tuple
        ``(rows, chain)`` where ``rows`` is the list of DSSP tuples whose
        chain id is in ``chains`` and ``chain`` is the chain id of the first
        DSSP key (regardless of ``chains``).

    Raises
    ------
    IndexError
        If DSSP produced no residues.
    """
    pdb_file = os.path.join(PDB_PATH, 'pdb' + pdbcode + '.ent')
    structure = PDBParser().get_structure(pdbcode, pdb_file)
    model = structure[0]
    dssp = DSSP(model, pdb_file, dssp=dssp_route)

    # Materialise the keys once: the result of keys() is then safe to index
    # whether it is a list or a dict view.
    keys = list(dssp.keys())
    chain = keys[0][0]
    rows = [dssp[key] for key in keys if key[0] in chains]
    return rows, chain
def add_dssp(pdb_file, dssp_exc='/Users/thorn/dssp-2.2.1/mkdssp'):
    """Build a dictionary with secondary structure information.

    Parameters
    ----------
    pdb_file : str
        Path of the PDB file; also used as the structure id.
    dssp_exc : str, optional
        Path to the DSSP executable. Defaults to the location that was
        previously hard-coded in this function.

    Returns
    -------
    dict
        Maps residue number (``key[1][1]`` of each DSSP key) to the DSSP
        secondary-structure code. Chain ids are not part of the key, so
        residues of different chains that share a number overwrite each
        other and the last one wins.
    """
    from Bio.PDB.DSSP import DSSP

    parser = PDBParser()
    structure = parser.get_structure(pdb_file, pdb_file)
    model = structure[0]
    dssp = DSSP(model, pdb_file, dssp_exc)
    return {a_key[1][1]: dssp[a_key][2] for a_key in dssp.keys()}
def calc_dssp(model, chain_sites: dict, pdb_name: str) -> None:
    """Copy DSSP results onto the residue objects listed in ``chain_sites``.

    For every DSSP residue whose chain id and residue number are present in
    ``chain_sites``, the object stored at index 2 of
    ``chain_sites[chain_id][resnumb]`` receives ``sec_struct``, ``sasa_r``,
    ``phi`` and ``psi``. The module-level ``session`` is committed afterwards.

    If DSSP fails, "dssp failed!" is printed and no residue is updated; the
    commit still happens.

    Parameters
    ----------
    model
        Structure model handed to ``DSSP``.
    chain_sites : dict
        ``{chain_id: {residue_number: entry}}``; only ``entry[2]`` is used
        here. NOTE(review): presumably a database-mapped residue row, since
        the session is committed afterwards - confirm against the caller.
    pdb_name : str
        Path of the PDB file handed to ``DSSP``.
    """
    # DSSP
    # ============ ===
    # Tuple Index  Value
    # ============ ===
    # 0            DSSP index
    # 1            Amino acid
    # 2            Secondary structure
    # 3            Relative ASA
    # 4            Phi
    # 5            Psi
    # 6            NH-->O_1_relidx
    # 7            NH-->O_1_energy
    # 8            O-->NH_1_relidx
    # 9            O-->NH_1_energy
    # 10           NH-->O_2_relidx
    # 11           NH-->O_2_energy
    # 12           O-->NH_2_relidx
    # 13           O-->NH_2_energy
    # ============ ===
    try:
        dssp = DSSP(model, pdb_name, dssp="mkdssp")
    except Exception:
        # Best effort: a DSSP failure must not abort the caller, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        dssp = {}
        print("dssp failed!")

    for residue in dssp.keys():
        sec_struct, sasa_r, phi, psi = dssp[residue][2:6]
        chain_id, res_id = residue
        _, resnumb, _ = res_id
        if chain_id in chain_sites and resnumb in chain_sites[chain_id]:
            new_res = chain_sites[chain_id][resnumb][2]
            new_res.sec_struct = sec_struct
            new_res.sasa_r = sasa_r
            new_res.phi = phi
            new_res.psi = psi
    session.commit()
def calc_features(PATH, PDB_id, OUTPATH):
    """Compute per-atom structural features for one PDB file and pickle them.

    Each atom whose ``(chain id, residue id)`` is known to DSSP gets the tuple
    ``(secondary-structure label, phi / 360, psi / 360, rsa_label)``, stored
    under the atom's coordinate tuple. ``rsa_label`` is 1 when the residue's
    NACCESS ``"all_atoms_abs"`` value exceeds the module-level ``Threshold``.

    Parameters
    ----------
    PATH : str
        Directory containing ``<PDB_id>.pdb``.
    PDB_id : str
        Structure identifier, also the output file stem.
    OUTPATH : str
        Directory where ``<PDB_id>.dat`` (a pickled dict) is written.
    """
    # Loading the files
    parser = PDBParser(PERMISSIVE=1)
    filename = os.path.join(PATH, PDB_id + ".pdb")
    structure = parser.get_structure(PDB_id, filename)
    model = structure[0]

    # DSSP Analysis for SS, PHI, PSI
    dssp = DSSP(model, filename)
    # Build the membership set once instead of re-listing the keys per atom.
    dssp_keys = set(dssp.keys())

    # NACCESS Analysis for SASA
    rsa, asa = run_naccess(model, filename)
    rsa = process_rsa_data(rsa)

    # Feature mapping to each atomic coordinate
    dssp_present, dssp_not_present = 0, 0
    feature = dict()  # The feature dictionary
    for current_model in structure:
        for chain in current_model:
            for residue in chain:
                for atom in residue:
                    full_id = atom.get_full_id()
                    print(full_id)
                    ID = (full_id[2], full_id[3])
                    if ID in dssp_keys:
                        if rsa[ID]["all_atoms_abs"] > Threshold:
                            rsa_label = 1
                        else:
                            rsa_label = 0
                        feat = (SS_Labels[dssp[ID][2]], dssp[ID][4] / 360,
                                dssp[ID][5] / 360, rsa_label)
                        feature[tuple(atom.get_coord())] = feat
                        print(ID, atom.get_coord(), feat)
                        dssp_present += 1
                    else:
                        print("==> ID not present : ", full_id)
                        dssp_not_present += 1

    # Printing the Stats
    print(
        "==> STATS : PDBID : %s , DSSP PRESENT : %s , DSSP NOT PRESENT : %s"
        % (PDB_id, dssp_present, dssp_not_present))

    # Saving the feature to each PDB file
    with open(os.path.join(OUTPATH, PDB_id + ".dat"), "wb+") as f:
        pickle.dump(feature, f)
    print("==> Dump completed")
def getSecondaryStructure(pdbID):
    """Fetch a PDB entry and return its DSSP secondary-structure string.

    The entry is fetched through the module-level ``pdb`` helper, parsed from
    the helper's temporary path, and run through DSSP.

    Parameters
    ----------
    pdbID : str
        Identifier of the entry to fetch; also used as the structure id.

    Returns
    -------
    str
        One DSSP secondary-structure code per DSSP residue, in key order.
    """
    pdb.fetchInPDBFormat(pdbID)
    p = PDBParser()
    structure = p.get_structure(pdbID, pdb.getTemporaryPDBPath())
    model = structure[0]
    dssp = DSSP(model, pdb.getTemporaryPDBPath())
    # Element 2 of each DSSP tuple is the secondary-structure code.
    return "".join(dssp[key][2] for key in dssp.keys())
def accessible_surface_area(PDB_file):
    """Compute the solvent-accessible surface area value of each residue.

    Residues whose DSSP amino-acid code is 'X' are left out. The remaining
    residues are numbered consecutively from 0 in DSSP key order.

    Parameters
    ----------
    PDB_file : str
        Path of the PDB file; the text before the first "." is used as the
        structure id.

    Returns
    -------
    dict
        ``{running index: element 3 of the DSSP tuple}``.
    """
    pdb_reader = PDBParser()
    label = PDB_file.split(".")[0]
    first_model = pdb_reader.get_structure(label, PDB_file)[0]
    dssp_table = DSSP(first_model, PDB_file, dssp='mkdssp')

    kept_values = (
        dssp_table[res_key][3]
        for res_key in list(dssp_table.keys())
        if dssp_table[res_key][1] != 'X'
    )
    return dict(enumerate(kept_values))
def calculate_asa(model, filename, AROM_LIST, chain_list):
    """Return the surface-exposed residues as judged by relative solvent
    accessibility.

    Only standard protein residues are currently supported. Non-protein and
    user-specified custom residues cannot be classified as surface exposed
    using this criterion.

    Parameters
    ----------
    model : :class:`Bio.PDB.Model.Model`
        Model which contains the chains and residues of the protein structure.
    filename : str
        Name of the PDB file to be analyzed.
    AROM_LIST : list of str
        Standard residue names included in the analysis.
    chain_list : list of str
        Chains included in the analysis.

    Returns
    -------
    list of str
        One ``"<letter><residue number>(<chain>)"`` entry per exposed residue.
        If DSSP cannot be run a ``RuntimeWarning`` is issued and whatever was
        collected so far is returned.

    Notes
    -----
    The relative accessible surface area (RSA) of each residue is calculated
    using the Bio.PDB.DSSP module. A residue with an RSA value of 0.05 or
    higher is classified as surface exposed. Residues for which DSSP reports
    a non-numeric RSA are skipped.

    References
    ----------
    Tien, M. Z.; Meyer, A. G.; Sydykova, D. K.; Spielman, S. J.; Wilke, C. O.
    PLoS ONE 2013, 8 (11). Reference for the relative solvent accessibility
    cutoff of 0.05, and for MaxASA values.
    """
    cutoff = .05
    surface_exposed_res = []
    letters = {
        res_name_to_char[res_name]
        for res_name in AROM_LIST
        if res_name_to_char.get(res_name)
    }
    try:
        dssp = DSSP(model, filename, acc_array="Wilke")
        for key in dssp.keys():
            chain_id = key[0]
            if chain_id not in chain_list:
                continue
            row = dssp[key]
            letter, rsa = row[1], row[3]
            if letter not in letters:
                continue
            if isinstance(rsa, str):
                # A non-numeric RSA would raise on comparison and abort the
                # whole scan; skip just this residue instead.
                continue
            if rsa >= cutoff:
                surface_exposed_res.append(
                    letter + str(key[1][1]) + "(" + str(chain_id) + ")")
    except Exception:
        warnings.warn(
            "Unable to calculate solvent accessibility. Check that DSSP is installed.",
            RuntimeWarning,
            stacklevel=2)
    return surface_exposed_res
def __init__(self, pdbfile, fastafile):
    """Load a FASTA sequence and a PDB structure and align them.

    The FASTA header (without its first character) is the structure id; its
    last character selects the chain read from the PDB file.

    Attributes set: ``seq``, ``residues``, ``pdbseq``, ``pdbss3seq``,
    ``re_index``, ``alignment``, ``ss3seq``, ``dist_matrix`` and
    ``angle_matrix``.

    Parameters
    ----------
    pdbfile : str
        Path of the PDB file.
    fastafile : str
        Path of the FASTA file; line 1 is the header, line 2 the sequence.
    """
    names = {'HIS': 'H', 'ASP': 'D', 'ARG': 'R', 'PHE': 'F', 'ALA': 'A',
             'CYS': 'C', 'GLY': 'G', 'GLN': 'Q', 'GLU': 'E', 'LYS': 'K',
             'LEU': 'L', 'MET': 'M', 'ASN': 'N', 'SER': 'S', 'TYR': 'Y',
             'THR': 'T', 'ILE': 'I', 'TRP': 'W', 'PRO': 'P', 'VAL': 'V'}

    # Load fasta residue sequence
    with open(fastafile) as f:
        ff = [line.rstrip("\n") for line in f]
    p_id = ff[0][1:]
    self.seq = ff[1]

    # Load pdb information
    p = PDBParser(PERMISSIVE=1)
    st = p.get_structure(p_id, pdbfile)
    model = st[0]
    tag = p_id[-1]
    chain = model[tag]
    self.residues = list(chain.get_residues())

    # Sequence info from pdb; residues with unknown names are left out.
    # (`in` replaces dict.has_key, which does not exist on Python 3.)
    self.pdbseq = "".join(names[res.get_resname()]
                          for res in self.residues
                          if res.get_resname() in names)

    # 3-state sse info from pdb
    # NOTE(review): this walks every DSSP key of the model while pdbseq
    # covers only chain `tag` - confirm inputs are single-chain.
    dssp = DSSP(model, pdbfile)
    to3_dict = {'-': 'C', 'G': 'H', 'H': 'H', 'I': 'H', 'E': 'E', 'B': 'E',
                'T': 'C', 'S': 'C', 'L': 'C'}
    self.pdbss3seq = "".join(to3_dict[dssp[k][2]] for k in dssp.keys())

    # Align the pdb sequence (always missing some residues) to fasta sequence
    alignment = AlignNW(self.seq, self.pdbseq)
    self.re_index = [i - 1 for i in alignment['j']]  # minus 1 for indexing

    # Generate sequence alignment between pdb sequence and fasta sequence
    aligned_pdb = "".join(self.pdbseq[i] if i > -1 else "-"
                          for i in self.re_index)
    self.alignment = "\n".join([self.seq, aligned_pdb])

    # Generate full length 3-state SSE sequence according to re-index
    self.ss3seq = "".join(self.pdbss3seq[i] if i > -1 else "C"
                          for i in self.re_index)

    # Generate full length distance matrix
    self.dist_matrix = self.generate_dist_matrix()

    # Generate full length angle matrix
    self.angle_matrix = self.generate_angle_matrix()
def getDSSP(
        struct,
        fname,
        dsspPath=os.path.expanduser(
            "~/Tesis/rriPredMethod/dependencies/bioinformaticTools/dssp/mkdssp")):
    """Group the residue ids of the first model by chain and DSSP symbol.

    :param struct: structure whose first model (``struct[0]``) is analysed
    :param fname: str. Path of the pdb file handed to DSSP
    :param dsspPath: str. Path of the DSSP executable
    :return: dict ``{chainId: {symbol: [resId, ...]}}`` with one entry per
             chain of the first model and per symbol in ``DSSP_SYMBOLS``
    """
    dsspResult = DSSP(struct[0], fname, dssp=dsspPath)

    groupedByChain = {}
    for oneChain in struct[0].child_list:
        groupedByChain[oneChain.get_id()] = {
            symbol: [] for symbol in DSSP_SYMBOLS
        }

    for chainId, resId in dsspResult.keys():
        # Element 2 of the DSSP tuple is the secondary-structure symbol.
        symbolOfResidue = dsspResult[(chainId, resId)][2]
        groupedByChain[chainId][symbolOfResidue].append(resId)
    return groupedByChain
def __applyDssp(self):
    """Run DSSP on this entry's ``.ent`` file and tag every matching atom.

    For each DSSP residue, every atom in ``self.atoms`` whose ``'rid'`` and
    ``'chain'`` values equal the residue number and chain id receives the
    residue's secondary-structure code through ``setDsspInfo``.
    """
    import Bio.PDB as bio
    print('PSU: applying dssp')
    from Bio.PDB.DSSP import DSSP

    entPath = self.pdbDataPath + 'pdb' + self.pdbCode + '.ent'
    pdbReader = bio.PDBParser()
    firstModel = pdbReader.get_structure(self.pdbCode, entPath)[0]
    dsspTable = DSSP(firstModel, entPath)

    for resKey in list(dsspTable.keys()):
        chainId = resKey[0]
        resNumber = resKey[1][1]
        secStruct = dsspTable[resKey][2]
        for candidate in self.atoms:
            if candidate.values['rid'] != resNumber:
                continue
            if candidate.values['chain'] != chainId:
                continue
            candidate.setDsspInfo(secStruct)
    print('PSU: applied dssp successfully')
def Make_dssp(pdb_path='./current_pdb.txt'):
    """Map every DSSP residue of a PDB file to its secondary-structure code.

    Parameters
    ----------
    pdb_path : str, optional
        PDB file to analyse. Defaults to ``'./current_pdb.txt'``.

    Returns
    -------
    dict
        Keys are ``<chain id><residue number><three-letter residue name>``
        and values are DSSP secondary-structure codes. The extra key ``'X'``
        maps to NaN.

    Raises
    ------
    KeyError
        If DSSP reports an amino-acid code that is not in the one-letter to
        three-letter table.
    """
    ref = {
        'A': 'ALA', 'R': 'ARG', 'N': 'ASN', 'D': 'ASP', 'B': 'ASX',
        'C': 'CYS', 'E': 'GLU', 'Q': 'GLN', 'Z': 'GLX', 'G': 'GLY',
        'H': 'HIS', 'I': 'ILE', 'L': 'LEU', 'K': 'LYS', 'M': 'MET',
        'F': 'PHE', 'P': 'PRO', 'S': 'SER', 'T': 'THR', 'W': 'TRP',
        'Y': 'TYR', 'V': 'VAL', 'X': '---',
    }
    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
    dssp_dict = {'X': np.nan}

    p = PDBParser()
    structure = p.get_structure('model', pdb_path)
    mod = structure[0]
    dssp = DSSP(mod, pdb_path)
    for a_key in dssp.keys():
        row = dssp[a_key]
        label = str(a_key[0]) + str(a_key[1][1]) + str(ref[row[1]])
        dssp_dict[label] = row[2]
    return dssp_dict
print() os.remove(filename) continue valid_aa = [ 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y' ] current_chain = "" chain = "" phi_psis = [] dihedrals = [] chains = [] chain_names = [] # we assume that the chains and residues are in order, i.e. A1,A2,A3,...,B1,B2,B3,... for key in dssp.keys(): chain_id = key[0] if not current_chain or chain_id != current_chain: current_chain = chain_id chain_names.append(chain_id) if chain: assert len(chain) == len(phi_psis), \ "the length of chain '%s' does not equal the number of dihedrals: %s" % (chain, len(phi_psis)) chains.append(chain) dihedrals.append(phi_psis) chain = "" phi_psis = [] residue = dssp[key]
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP

# Read in and parse the PDB file to obtain DSSP secondary structure
# assignments; follows the basic outline of the biopython.org tutorial.
parse = PDBParser()
struc = parse.get_structure('6hrc', "6hrc.pdb")
model = struc[0]
dssp = DSSP(model, '6hrc.pdb')

sec_struc = ''
a_helix = 0
b_sheet = 0
other = 0
none = 0

# Each residue falls in exactly one category: helix (H/G/I), sheet (E/B),
# unassigned ("-") or other. Using elif keeps helix and sheet residues from
# also being counted as "other".
for key in dssp.keys():
    code = dssp[key][2]
    sec_struc += code
    if code in ("H", "G", "I"):
        a_helix += 1
    elif code in ("E", "B"):
        b_sheet += 1
    elif code == "-":
        none += 1
    else:
        other += 1
def calc_features(PATH, pdb_ligand_ID, OUTPATH):
    """Compute per-atom features for one chain of a PDB file and pickle them.

    Only the chain named by the third ``_``-separated field of
    ``pdb_ligand_ID`` is processed. Each atom whose ``(chain id, residue id)``
    is known to DSSP gets
    ``(secondary-structure label, phi / 360, psi / 360, rsa_label) + PSSM row``
    stored under its coordinate tuple. The PSSM row is looked up with the
    residue number minus the running ``gap`` of missing residue numbers.

    Parameters
    ----------
    PATH : str
        Directory containing ``<pdb id>.pdb`` (first four characters of
        ``pdb_ligand_ID``, lower-cased).
    pdb_ligand_ID : str
        Identifier of the form ``<pdb id>_<...>_<chain>``.
    OUTPATH : str
        Directory where ``<pdb_ligand_ID>.dat`` (a pickled dict) is written.
    """
    # Loading the files
    parser = PDBParser(PERMISSIVE=1)
    PDB_id = pdb_ligand_ID[:4].lower()
    filename = os.path.join(PATH, PDB_id + ".pdb")
    structure = parser.get_structure(PDB_id, filename)
    model = structure[0]

    # DSSP Analysis for SS, PHI, PSI
    dssp = DSSP(model, filename)
    # Build the membership set once instead of re-listing the keys per atom.
    dssp_keys = set(dssp.keys())

    # NACCESS Analysis for SASA
    rsa, asa = run_naccess(model, filename)
    rsa = process_rsa_data(rsa)

    # Feature mapping to each atomic coordinate
    dssp_present, dssp_not_present = 0, 0
    feature = dict()  # The feature dictionary
    for current_model in structure:
        for chain in current_model:
            if chain.get_full_id()[2] != pdb_ligand_ID.split('_')[2]:
                continue
            pssm_ID = chain.get_full_id()[0][:4].upper() + '_' + \
                chain.get_full_id()[2]
            pssm = parse_PSSM(pssm_ID)
            gap = 0
            idx_prev = 0
            for residue in chain:
                idx = residue.get_full_id()[3][1]
                if idx < 1:
                    print(idx)
                    a = 0
                elif idx - idx_prev >= 1:
                    print(idx)
                    a = 1
                    # Residue numbers skipped since the previous residue.
                    gap += idx - idx_prev - 1
                for atom in residue:
                    full_id = atom.get_full_id()
                    ID = (full_id[2], full_id[3])
                    if ID in dssp_keys:
                        if rsa[ID]["all_atoms_abs"] > RSA_Threshold:
                            rsa_label = 1
                        else:
                            rsa_label = 0
                        print(gap, full_id[3][1], a)
                        feat = (SS_Labels[dssp[ID][2]], dssp[ID][4] / 360,
                                dssp[ID][5] / 360, rsa_label) + tuple(
                                    pssm[str(full_id[3][1] - gap)])
                        feature[tuple(atom.get_coord())] = feat
                        print(pdb_ligand_ID[:4], ID, atom.get_coord(), feat)
                        dssp_present += 1
                    else:
                        print(">>> ID not present : ", full_id)
                        dssp_not_present += 1
                idx_prev = idx

    # Printing the Stats
    print(
        "===> STATS : PDBID : %s , DSSP PRESENT : %s , DSSP NOT PRESENT : %s"
        % (PDB_id, dssp_present, dssp_not_present))

    # Saving the feature to each PDB file
    with open(os.path.join(OUTPATH, pdb_ligand_ID + ".dat"), "wb+") as f:
        pickle.dump(feature, f)
    print("====> Dump completed")
args = parser.parse_args() # args = parser.parse_args(['-p', '-c', 'A', '--pymol', 'ctc445/CTC-445.pdb']) # args = parser.parse_args(['-p', '-c', 'A', '--pymol', 'ctc445/CTC-640.pdb']) for pdb in args.pdbs: init_time = time() input_basename, _ = path.splitext(path.basename(pdb)) # parse input pdb and prepare for prediction input_structure = PDB.PDBParser().get_structure('input_structure', pdb) try: dssp = DSSP(input_structure[0], pdb) except Exception as e: raise RuntimeError('DSSP failed', e) ss = [ dssp[k][2] for k in filter( lambda k: k[0] == args.chain if args.chain else True, dssp.keys()) ] if args.chain: input_structure = next( filter(lambda c: c.id == args.chain, input_structure.get_chains())) # ca_atoms = list(filter(lambda a: a.name == 'CA', input_structure.get_atoms())) n_atoms = list(filter(lambda a: a.name == 'N', input_structure.get_atoms())) c_atoms = list(filter(lambda a: a.name == 'C', input_structure.get_atoms())) ca_atoms = list( filter(lambda a: a.name == 'CA', input_structure.get_atoms())) cb_xyz = []
from Bio.PDB import *
from Bio.PDB.DSSP import DSSP

print("START")
p = PDBParser()
structure = p.get_structure("X", "3g8n.pdb")
model = structure[0]
dssp = DSSP(model, "3g8n.pdb", dssp='mkdssp')

# List the keys once; every lookup below indexes or iterates this list
# instead of subscripting dssp.keys() directly.
keys = list(dssp.keys())
print(keys[2][1][1])  # RESIDUE NUMBER

# Secondary-structure string of chain A only.
result = "".join(dssp[a_key][2] for a_key in keys if a_key[0] == 'A')
print("STRING", result)
print(len(result))
def computeOneFile(self, pdbFname, struct):
    '''
    Computes DSSP for a given pdb file and writes one secondary-structure
    record per residue. Residues without a DSSP assignment, and those DSSP
    labels "-", are written as "Z".

    :param pdbFname: str. fname to pdb file
    :param struct: Bio.PDB.Structure
    :return: 0 if results were already computed for this file, None otherwise
    '''
    allResidues = set()
    for chain in struct[0]:
        allResidues = allResidues.union(set(chain.get_residues()))

    prefixExtended = self.getExtendedPrefix(pdbFname)
    prefix, chainType = self.splitExtendedPrefix(prefixExtended)[:2]
    if self.checkAlreayComputed(prefixExtended):
        print("Dssp already computed for %s" % prefixExtended)
        return 0
    print("launching Dssp over %s" % prefixExtended)
    try:
        featuresDict = {}
        try:
            dssp_out = DSSP(struct[0], pdbFname, dssp=self.dsspBinPath)
        except Exception as e:
            # str(e) instead of e.message: Python 3 exceptions have no
            # .message attribute, and reading it would hide the real error.
            if "DSSP failed to produce an output" in str(e):
                dssp_out = {}
            else:
                print(e)
                raise

        for chainId, resId in dssp_out.keys():
            secStruct = dssp_out[(chainId, resId)][2]
            if secStruct == "-":
                secStruct = "Z"
            try:
                featuresDict[struct[0][chainId][resId]] = secStruct
            except KeyError:
                continue

        dataDict = {}
        for aa in allResidues:
            chainId, resIdStr, resName = self.fromRes2ChainResIdAndName(aa)
            if resName is None:
                continue
            values = [featuresDict[aa]] if aa in featuresDict else ["Z"]
            record = " ".join([chainId, resIdStr, resName] + values)
            dataDict.setdefault(chainId, []).append(record)

        categoricalLevels = {
            ("H", "B", "E", "G", "I", "T", "S", "Z"): ("2ndStruct", )
        }
        self.writeResultsFromDataDictSingleChain(
            dataDict,
            outName=self.getFNames(prefixExtended)[0],
            categoricalLevels=categoricalLevels)
    except (Exception, KeyboardInterrupt):
        # Remove partial output before propagating the failure.
        self.tryToRemoveAllFnames(prefixExtended)
        raise
def get_struc_feat(seq_len, model_file):
    '''
    Build structural features (secondary structure, relative ASA and
    distance maps) for a model file.

    Residue numbers are used as 1-based positions; residues whose number
    falls outside ``1..seq_len`` are ignored for every feature.

    Args:
        seq_len (int): sequence length.
        model_file (string): the path of model file.

    Returns:
        dict: ``'SS3'`` (seq_len x 3 one-hot), ``'RSA'`` (seq_len x 1) and,
        for each of ``'CaCa'``, ``'CbCb'`` and ``'NO'``, a
        (seq_len, seq_len, 1) distance map scaled by 0.1. Pairs with a
        missing atom hold -1 before scaling.
    '''
    feature = {}
    structure = pdb_parser.get_structure("tmp_stru", model_file)
    model = structure.get_list()[0]
    # Coordinates are taken from the first chain of the first model only.
    residues = model.get_list()[0].get_list()

    # SS and RSA
    dssp = DSSP(model, model_file, dssp='dssp')
    SS3s, RSAs = np.zeros((seq_len, 3)), np.zeros((seq_len, 1))
    for _key in dssp.keys():
        res_index = _key[1][1]
        if 1 <= res_index <= seq_len:
            SS3s[res_index - 1, SS3_TYPES[dssp[_key][2]]] = 1
            RSAs[res_index - 1] = [dssp[_key][3]]
    feature['SS3'] = SS3s
    feature['RSA'] = RSAs

    atom_types = ['CA', 'CB', 'N', 'O']
    # generate empty coordinates
    coordinates = [dict.fromkeys(atom_types) for _ in range(seq_len)]

    # extract coordinates from pdb
    for res in residues:
        res_index = res.id[1]
        if not 1 <= res_index <= seq_len:
            # Same bound as the DSSP loop above: an out-of-range number would
            # either raise IndexError or wrap around to the end of the list.
            continue
        slot = coordinates[res_index - 1]
        for atom in res:
            if atom.name in atom_types:
                slot[atom.name] = atom.coord
        # copy CA coordinate to CB if CB is None (GLY)
        if slot['CB'] is None:
            slot['CB'] = slot['CA']

    # distance map
    for atom_pair in ['CaCa', 'CbCb', 'NO']:
        half = len(atom_pair) // 2
        atom1, atom2 = atom_pair[:half].upper(), atom_pair[half:].upper()
        X = [list(c[atom1]) if c[atom1] is not None else [0, 0, 0]
             for c in coordinates]
        X_valid = [0 if c[atom1] is None else 1 for c in coordinates]
        Y = [list(c[atom2]) if c[atom2] is not None else [0, 0, 0]
             for c in coordinates]
        Y_valid = [0 if c[atom2] is None else 1 for c in coordinates]

        dist = scipy.spatial.distance_matrix(X, Y).astype(np.float16)
        XY_valid = np.outer(X_valid, Y_valid)
        np.putmask(dist, XY_valid == 0, -1)
        if atom1 == atom2:
            np.fill_diagonal(dist, 0)  # set the self distance to 0
        feature[atom_pair] = dist.reshape(
            (dist.shape[0], dist.shape[1], -1)) * 0.1

    return feature
vectors = [] phi = [] psi = [] exp_phi = np.array([60, -80]) exp_psi = np.array([-120, 0]) b_turns = [] for atom in structure[0]['A'].get_atoms(): vectors.append(atom.get_vector()) for i in range(len(vectors) - 2): if i % 2 == 0: phi.append(calc_angle(vectors[i], vectors[i + 1], vectors[i + 2])) else: psi.append(calc_angle(vectors[i], vectors[i + 1], vectors[i + 2])) df = pd.DataFrame(list(zip(phi, psi)), columns=['phi', 'psi']) for df_slice in df.rolling(window=2): if np.allclose(df_slice['phi'].values, exp_phi, atol=error) and np.allclose( df_slice['psi'].values, exp_psi, atol=error): b_turns.append(df_slice) print('hello') dssp = DSSP(structure[0], '1g60.pdb', dssp='mkdssp') phi = list(dssp.keys())[4] print(dssp[phi])
#Printing atomic types ######################################### f_write.write("\n[") #print("[") for i in range(len(all_atom_types)): #print("[") f_write.write("[") for j in range(len(all_atom_types[i])): f_write.write(list_atoms_types[all_atom_types[i][j]] + " ") #print(list_atoms_types[all_atom_types[i][j]]) #print("]") f_write.write("]") f_write.write("]") #print("]") ######################################### #Printing secondary structure ######################################### model = structure[0] dssp = DSSP(model, file_data_name, dssp='mkdssp') secondary_struct = "" for i in range(len(list(dssp.keys()))): if dssp.keys()[i][0] != num_chain: continue a_key = list(dssp.keys())[i] secondary_struct += dssp[a_key][2] f_write.write("\n" + secondary_struct) num_of_structures += 1 f_write.write("\n\n") os.remove(new_name)
f_write.write("]") f_write.write("]") print("]") f_write.write("\n[") print("[") for i in range(len(all_atom_types)): print("[") f_write.write("[") for j in range(len(all_atom_types[i])): f_write.write(list_atoms_types[all_atom_types[i][j]] + " ") print(list_atoms_types[all_atom_types[i][j]]) print("]") f_write.write("]") f_write.write("]") print("]\n") #break # dssp: model = structure[0] dssp = DSSP(model, file_data_name, dssp='mkdssp') num_res_in_chain = 5 # TODO: change! secondary_struct = "" for i in range(num_res_in_chain): a_key = list(dssp.keys())[i] result = dssp[a_key][2] # returns DSSP secondary structure secondary_struct += result f_write.write("secondary_struct" + secondary_struct) print("secondary_struct" + secondary_struct) # delete current file os.remove(file_name) num_of_structures += 1 break
def main(pdb_file, dssp_exe, out_file):
    """Build a per-residue feature matrix for a PDB file and pickle it.

    Each row holds the 8-state one-hot secondary structure, the relative
    ASA, and the sine/cosine of phi, psi, theta and tau (17 columns). Angles
    that cannot be computed at the chain ends are set to 2*pi before taking
    sine and cosine.

    Parameters
    ----------
    pdb_file : str
        Path of the PDB file.
    dssp_exe : str
        Path of the DSSP executable.
    out_file : str
        Path of the pickle file to write (float32 array, protocol 2).

    Raises
    ------
    AssertionError
        If the number of collected backbone atoms differs from the residue
        count of the last chain visited.
    """
    # Read PDB structure
    p = PDBParser()
    structure = p.get_structure('id', pdb_file)
    model = structure[0]

    # Run DSSP
    dssp = DSSP(model, pdb_file, dssp=dssp_exe)
    # DSSP tuple layout:
    # (dssp index, amino acid, secondary structure, relative ASA, phi, psi,
    #  NH_O_1_relidx, NH_O_1_energy, O_NH_1_relidx, O_NH_1_energy,
    #  NH_O_2_relidx, NH_O_2_energy, O_NH_2_relidx, O_NH_2_energy)
    sec_struc = []
    rel_asa = []
    for a_key in dssp.keys():
        aux = dssp[a_key]
        sec_struc.append(aux[2])
        rel_asa.append(0 if aux[3] == 'NA' else aux[3])
    rel_asa = np.expand_dims(rel_asa, axis=1)

    # Get coordinates for N, CA and C atoms
    coor_N = []
    coor_CA = []
    coor_C = []
    for chain in model.get_list():
        total_len = len(chain.get_list())
        for residue in chain.get_list():
            coor_N.append(residue['N'].get_coord())
            coor_CA.append(residue['CA'].get_coord())
            coor_C.append(residue['C'].get_coord())
    assert len(coor_N) == len(coor_CA) == len(coor_C) == total_len

    # Get dihedral angles and inter-residue angles
    not_found = 2 * np.pi
    last = total_len - 1
    angles = []
    for j in range(total_len):
        # Initialize angles if not found
        phi_angle = psi_angle = theta_angle = tau_angle = not_found
        vec_N2, vec_CA2, vec_C2 = coor_N[j], coor_CA[j], coor_C[j]

        # Phi needs the previous residue
        if j != 0:
            phi_angle = calculate_dihedral(coor_C[j - 1], vec_N2, vec_CA2,
                                           vec_C2)
        # Psi needs the next residue
        if j != last:
            psi_angle = calculate_dihedral(vec_N2, vec_CA2, vec_C2,
                                           coor_N[j + 1])
        # Theta needs both neighbours
        if j != 0 and j != last:
            theta_angle = calculate_angle(coor_CA[j - 1], vec_CA2,
                                          coor_CA[j + 1])
        # Tau needs two previous residues and the next one
        if j > 1 and j != last:
            tau_angle = calculate_dihedral(coor_CA[j - 2], coor_CA[j - 1],
                                           vec_CA2, coor_CA[j + 1])

        angles.append([phi_angle, psi_angle, theta_angle, tau_angle])

    # Calculate sine and cosine of each angle
    angles = np.array(angles)
    angles_sin = np.sin(angles)
    angles_cos = np.cos(angles)
    angles_sin_cos = np.stack([
        angles_sin[:, 0], angles_cos[:, 0],
        angles_sin[:, 1], angles_cos[:, 1],
        angles_sin[:, 2], angles_cos[:, 2],
        angles_sin[:, 3], angles_cos[:, 3],
    ]).T
    # Flush numerical noise around zero.
    angles_sin_cos[np.where(np.abs(angles_sin_cos) < 1e-10)] = 0

    # One-hot encoding of secondary structure (8-state)
    alphabet = 'HBEGITS-'
    # H Alpha helix (4-12)
    # B Isolated beta-bridge residue
    # E Strand
    # G 3-10 helix
    # I Pi helix
    # T Turn
    # S Bend
    # - None
    ohdict = dict((c, i) for i, c in enumerate(alphabet))
    ss_onehot = np.zeros((total_len, len(ohdict)))
    for i in range(total_len):
        ss_onehot[i, ohdict[sec_struc[i]]] = 1

    # Create feature matrix for the protein
    features = np.hstack([ss_onehot, rel_asa, angles_sin_cos])

    # Save the feature matrix
    with open(out_file, 'wb') as f:
        pickle.dump(features.astype('float32'), f, protocol=2)