def get_dssp_info(PDB_file, model, dir):
    """Run DSSP on one model of a structure via Biopython.

    Args:
        PDB_file: Name of the PDB file, relative to ``dir``.
        model: Model object passed to ``PDB.DSSP`` as its first argument.
        dir: Directory containing ``PDB_file``; joined to it with ``'/'``.

    Returns:
        The object built by ``PDB.DSSP`` using the ``mkdssp`` executable.
    """
    # TODO: you can run DSSP through biopython. The output contains a lot of
    #       useful information.
    # Tip: make sure your secondary structure indexing matches the sequence
    #      order in the PDB file!
    pdb_path = dir + '/' + PDB_file
    return PDB.DSSP(model, pdb_path, dssp='mkdssp')
def get_sasa(self, dssp_loc="/usr/bin/mkdssp", skip=None):
    """Collect per-residue DSSP values over the trajectory and tabulate them.

    For every trajectory frame the ``protein`` selection is written to
    ``tmp.pdb`` in the current working directory, DSSP is run on that file and
    element 2 of each DSSP record is stored under the residue number taken
    from element 0 of the record. The mean and sample standard deviation per
    residue are written, tab separated, to
    ``self.out_path + self.out_file + "_sasa.dat"``.

    NOTE(review): using record[0] as a residue object and record[2] as the
    accessibility matches an older Biopython record layout -- confirm against
    the installed version.

    Args:
        dssp_loc: Path to the DSSP executable; also stored on
            ``self.dssp_loc``.
        skip: When truthy, assigned to ``self.universe.trajectory.skip`` on
            each iteration.
    """
    self.dssp_loc = dssp_loc
    sasa_by_resnum = {}
    # Residue numbers in first-seen order. The output rows are paired with
    # protein.resnames() by position, so the iteration order must not depend
    # on dict ordering.
    resnum_order = []
    protein = self.universe.selectAtoms("protein")

    for ts in self.universe.trajectory:
        if skip:
            self.universe.trajectory.skip = skip
        sys.stdout.flush()
        sys.stdout.write('\rsasa [step {0}] '.format(
            self.universe.trajectory.frame))

        writer = MDAnalysis.Writer("tmp.pdb")
        try:
            writer.write(protein)
        finally:
            # Release the file handle even if writing the frame fails.
            writer.close()

        parser = bp.PDBParser()
        structure = parser.get_structure('tmp', 'tmp.pdb')
        dssp = bp.DSSP(structure[0], 'tmp.pdb', self.dssp_loc)
        for key in dssp.keys():
            record = dssp[key]
            resnum = record[0].id[1]
            if resnum not in sasa_by_resnum:
                sasa_by_resnum[resnum] = []
                resnum_order.append(resnum)
            sasa_by_resnum[resnum].append(record[2])

    # Loop invariant: fetch the residue names once instead of once per row.
    resnames = protein.resnames()
    with open(self.out_path + self.out_file + "_sasa.dat", 'w') as fp:
        for count, resnum in enumerate(resnum_order):
            values = sasa_by_resnum[resnum]
            fp.write("{0}\t{1}\t{2}\t{3}\n".format(
                resnames[count], resnum, np.mean(values),
                np.std(values, ddof=1)))

    sys.stdout.write('\rSASA table created ')
    # Terminate the progress line (the original used a bare ``print``).
    sys.stdout.write('\n')
def run_dssp(self):
    """Download the PDB entry, run DSSP on it and store per-chain results.

    The entry ``self.pdb_code`` is retrieved into the current directory as
    ``pdb<code>.ent``. Every chain of the first model is registered in
    ``self.chains`` and gets a fresh entry in ``self.dssp_dict`` and a zeroed
    counter table in ``self.dssp_info``. DSSP records are kept only for
    residues whose sequence number belongs to a residue of ``self.protein``
    that has a generic number and lies outside the N-term, C-term, ECL1, ECL2,
    ICL3 and ECL3 segments. The downloaded file is removed afterwards, also
    when parsing or DSSP fails.

    NOTE(review): DSSP is only run when ``self.dssp_dict`` holds more than one
    entry -- confirm that single-chain structures are meant to be skipped.
    """
    pdb_list = PDB.PDBList()
    pdb_list.retrieve_pdb_file(self.pdb_code, pdir='./', file_format="pdb")
    parser = PDB.PDBParser()
    f = 'pdb{}.ent'.format(self.pdb_code.lower())
    try:
        wt_residues = list(
            Residue.objects.filter(
                protein_conformation__protein=self.protein).exclude(
                    protein_segment__slug__in=['N-term', 'C-term']))
        # A set gives O(1) membership tests in the DSSP loop below.
        gn_residues = {
            res.sequence_number
            for res in wt_residues
            if res.generic_number and res.protein_segment.slug not in
            ['ECL1', 'ECL2', 'ICL3', 'ECL3']
        }

        structure = parser.get_structure(self.pdb_code, f)
        for chain in structure[0]:
            ch = chain.get_id()
            self.chains.append(ch)
            self.dssp_dict[ch] = OrderedDict()
            self.dssp_info[ch] = OrderedDict([('H', 0), ('B', 0), ('E', 0),
                                              ('G', 0), ('I', 0), ('T', 0),
                                              ('S', 0), ('-', 0)])

        if len(self.dssp_dict) > 1:
            dssp = PDB.DSSP(structure[0], f, dssp='/env/bin/dssp')
            for key in dssp.keys():
                chain_id = key[0]
                seq_number = key[1][1]
                if int(seq_number) in gn_residues:
                    record = dssp[key]
                    self.dssp_dict[chain_id][seq_number] = record
                    # record[2] selects which counter of the chain to bump.
                    self.dssp_info[chain_id][record[2]] += 1
    finally:
        # Always clean up the download; guard so a failed download does not
        # mask the original error with a missing-file error.
        if os.path.exists(f):
            os.remove(f)
def runDssp(self):
    """Run the DSSP executable for this model and cache the result.

    Nothing happens when ``self.dssp`` is already populated. Otherwise DSSP is
    run on the model file reported by the runner's ``pdh`` helper, using the
    executable named by the ``dssp_executable`` internal config entry.

    Raises:
        pcssErrors.DsspException: If the DSSP result is ``None`` or has no
            keys.
    """
    if self.dssp is not None:
        return

    bio_model = self.bioModel
    model_file = self.pcssRunner.pdh.getFullModelFile(self)
    dssp_executable = self.pcssRunner.internalConfig["dssp_executable"]
    result = PDB.DSSP(bio_model, model_file, dssp_executable)

    # Biopython hides the exact reason DSSP failed, so this is reported as a
    # feature exception rather than a global exception.
    if result is None or len(result.keys()) < 1:
        raise pcssErrors.DsspException(
            "Did not load DSSP for model %s. This likely indicates a problem with the "
            "Biopython DSSP module;\ntry running DSSP from the command line to isolate "
            "the issue" % self.getId())

    self.dssp = result
def secondary_structure(pdb_file, pdb_code):
    """Return 1-based DSSP row positions grouped by secondary-structure code.

    DSSP is run on the first model of ``pdb_file``. The result is converted to
    a NumPy array and column 2 of that array is compared against the DSSP
    codes ``H``, ``E``, ``I``, ``T`` and ``S``.

    Args:
        pdb_file: Path to the PDB file.
        pdb_code: Identifier given to the parsed structure.

    Returns:
        dict mapping ``'helix'``, ``'strand'``, ``'pi_helix'``, ``'turn'`` and
        ``'bend'`` to 1-D integer arrays of matching row positions, counted
        from 1 in the order of the DSSP output.
    """
    parser = biop.PDBParser()
    structure = parser.get_structure(pdb_code, pdb_file)
    model = structure[0]
    dssp = biop.DSSP(model, pdb_file)

    # Convert the DSSP result once; the original rebuilt it for every code.
    ss_column = np.array(dssp)[:, 2]

    def positions(code):
        # np.where yields 0-based row indices; shift to 1-based positions.
        return np.where(ss_column == code)[0] + 1

    return {
        'helix': positions('H'),
        'strand': positions('E'),
        'pi_helix': positions('I'),
        'turn': positions('T'),
        'bend': positions('S'),
    }
def get_first_residue_id_dssp(pdbname, pdbpath, pdb_id):
    """Get the id of the first residue of a chain in a PDB file (PDB numbering).

    Uses DSSP data, where PDB ids are the keys of the DSSP data dictionary.

    Args:
        pdbname: Name given to the parsed structure, e.g. ``'pdb1ztm'``.
        pdbpath: Path to the PDB file, e.g. ``'/home/pdb/1ztm.pdb'``.
        pdb_id: PDB id with chain, e.g. ``'1ztm_A'``; the chain id is the part
            after the last underscore.

    Returns:
        Integer id of the first residue DSSP reports for the chain, e.g. 44.

    Raises:
        IndexError: If DSSP reports no residue for the requested chain.
    """
    parser = PDB.PDBParser()
    structure = parser.get_structure(pdbname, pdbpath)
    model = structure[0]
    dssp = PDB.DSSP(model, pdbpath)
    chain_id = pdb_id.split('_')[-1]  # fetch chain id from pdb_id

    # Stop at the first key of the desired chain instead of collecting all.
    for res_key in dssp.keys():
        if res_key[0] == chain_id:
            return res_key[1][1]

    # Same exception type the original raised (indexing an empty list), but
    # with a message that names the cause.
    raise IndexError(
        "no DSSP residues found for chain {!r} in {}".format(chain_id, pdbpath))
def assign_secondary_structure(pdbfile, modelno=0):
    """
    Assign secondary structure with DSSP through Biopython.
    Requires DSSP to be installed.

    --PARAMETERS--
    pdbfile: path to the structure (in PDB format) to analyse.
    modelno: index of the model in the PDB file to analyse. Default is the
        first model.

    --RETURNS--
    A dictionary keyed by protein chain ID. Each value is a list, kept in DSSP
    key order, of tuples (resid, dssp_record[1], dssp_record[2]), where resid
    is the string produced by biopython_resid_to_str. In Biopython's DSSP
    record layout, index 1 is the amino acid and index 2 is the
    secondary-structure code (e.g. E = beta strand, H = alpha helix).
    """
    # Parse quietly and pick the requested model
    parsed = PDB.PDBParser(QUIET=True).get_structure("structure", pdbfile)
    chosen_model = parsed[modelno]

    # Run DSSP
    dssp_result = PDB.DSSP(chosen_model, pdbfile)

    # Group the assignments per chain; lists preserve the residue order
    by_chain = {}
    for key in dssp_result.keys():
        chain_id, residue_id = key
        entries = by_chain.setdefault(chain_id, [])
        resid = biopython_resid_to_str(residue_id)
        record = dssp_result[(chain_id, residue_id)]
        entries.append((resid, record[1], record[2]))
    return by_chain
def rawChainParser(filepath, chainID, pssm):
    """Build the per-residue feature table for one chain of a PDB file.

    For every residue of ``chainID`` in the first model, the row combines the
    one-letter amino acid, coordinates, residue depth, DSSP fields 2-5
    (phi/psi divided by 360), half-sphere exposure (0 when not calculated),
    normalised PSSM counts and the pharmacophore entries for the amino acid.
    Amino acid and secondary structure are then one-hot encoded, the
    coordinates are centred on the middle of their bounding box and missing
    values are filled with 0.0.

    Args:
        filepath: Path to the PDB file.
        chainID: Chain identifier to extract.
        pssm: Sequence indexed by ``seqID - 1`` whose items map a letter to a
            count.

    Returns:
        pandas.DataFrame with one row per residue and exactly 72 columns.

    Raises:
        AssertionError: If the resulting table does not have 72 columns.
    """
    parser = PDB.PDBParser()
    structure = parser.get_structure(chainID, filepath)
    model = structure[0]
    d = PDB.DSSP(model, filepath)
    rd = ResidueDepth(model)
    pharmDic = getPharmacophoreDict()
    hs = PDB.HSExposure.HSExposureCA(model)
    pssm_letters = 'ABCDEFGHIKLMNPQRSTUVWYZ'

    # Collect plain dicts and build the frame once: DataFrame.append copied
    # the whole frame per row and no longer exists in pandas 2.x.
    rows = []
    for residue in model[chainID]:
        seqID = getSeqIndex(residue)
        row = {}
        x, y, z = getResCoords(residue)
        resName = residue.get_resname()
        row["AA"] = PDB.Polypeptide.three_to_one(resName)
        row["x"] = x[0]
        row["y"] = y[0]
        row["z"] = z[0]

        tupKey = (chainID, (' ', seqID, ' '))
        row["res_depth"] = rd[tupKey][0]
        row["ca_depth"] = rd[tupKey][1]

        dssp = d[(chainID, seqID)]
        row["ss"] = dssp[2]
        row["asa"] = dssp[3]
        row["phi"] = dssp[4] / 360.0
        row["psi"] = dssp[5] / 360.0

        if tupKey in hs:
            row["hseu"] = hs[tupKey][0]
            row["hsed"] = hs[tupKey][1]
        else:
            # NO HSE CALCULATED
            row["hseu"] = 0
            row["hsed"] = 0

        row["seqId"] = seqID - 1
        # row["bFactor"] = centralAtom.get_bfactor()
        # must be done using zhang lab tool resQ instead

        # get pssm row
        pssmRow = pssm[seqID - 1]
        row["aligns"] = sum(pssmRow.values())  # total alignments
        if row["aligns"] != 0:
            for key in pssmRow.keys():
                if key not in pssm_letters:
                    print(key)
                row["pssm_" + key] = pssmRow[key] / row["aligns"]

        if residue.is_disordered():
            # The original called getSeqIndex() without its argument here,
            # which raised TypeError instead of printing the notice.
            print(f"disorded atom in res {seqID}")

        row.update(pharmDic[row["AA"]])
        rows.append(row)

    df = pd.DataFrame(rows)

    # check for missed columns in pssm
    for a in pssm_letters:
        if ("pssm_" + a) not in df:
            df["pssm_" + a] = 0.0

    aaEncoder = AAonehot()
    aaTransformed = aaEncoder.transform(df["AA"])
    aaCols = ["AA_" + x for x in aaEncoder.classes_]
    aaDf = pd.DataFrame(aaTransformed, columns=aaCols)

    ssEncoder = ssOneHot()
    ssTransformed = ssEncoder.transform(df["ss"])
    ssCols = ["SS_" + x for x in ssEncoder.classes_]
    ssDf = pd.DataFrame(ssTransformed, columns=ssCols)

    # center coordinates on the midpoint of their range; only the coordinate
    # columns are reduced, and the max/min builtins are no longer shadowed
    for axis in ("x", "y", "z"):
        df[axis] = df[axis] - (df[axis].max() + df[axis].min()) / 2

    df = pd.concat([df, aaDf, ssDf], axis=1).drop(["AA", "ss"], axis=1)
    df = df.fillna(0.0)

    # error check; raised explicitly so it is not stripped under ``python -O``
    if df.shape[1] != 72:
        print(df)
        raise AssertionError(
            f"Incorrect pssmdf shape = {df.shape[1]} for file: {filepath}")
    # pd.set_option('display.max_columns', 500)
    # print(df.describe())
    return df
def add_dssp(self,selected_chains=None):
    ''' Adds DSSP features.

    DSSP ignores hetatoms but treats oligomers as single units
    This means that interface residues will have lower than expected SASA
    Therefore, each chain is split into individual chains and DSSP is
    calculated over each chain separately and over the whole oligomer
    DSSP takes files directly so need to create a temporary PDB file
    for each chain

    selected_chains: chain ids to keep. Chains not listed are skipped, so the
        default (no chains) ends in the placeholder rows built at the bottom.

    The result is stored in self.dssp, merged into self.descriptors on
    self.res_header, and HEADERS['dssp'] is appended to self.header.
    If DSSP fails with OSError, DescriptorException or PDBException, or
    yields no rows, every residue gets the placeholder row ('?', -999, -999).
    '''
    if selected_chains is None:
        # None replaces the original mutable default list()
        selected_chains = list()
    if self.debug:
        print(self.debug_head+"Adding DSSP")
        print(self.debug_head+"Current header: {}".format(self.header))
    if self.id == "Holder":
        # list(...) keeps the concatenation below valid when keys() is a view
        holder_keys = list(HOLDERS.keys())
        h = ['?','?','?']+[HOLDERS[x] for x in holder_keys]
        c = self.res_header+holder_keys
        self.dssp = pd.DataFrame([h],columns=c)[self.res_header+HEADERS['dssp']]
        if self.debug:
            print(self.debug_head+"added holder: {}".format(h))
    else:
        genempty = False
        dssp = list()
        try:
            #Calculate DSSP for Oligomer
            #DSSP is pain in that it only takes files. Therefore, if this was
            # created using a model or url, create a temporary file
            if self.debug:
                print(self.debug_head+"Oligomer DSSP")
            if self.filename == "url" or self.filename == "model":
                ofile = "_".join([uuid.uuid4().hex,self.id])
                io = PDB.PDBIO()
                io.set_structure(self.struct)
                io.save(ofile)
                if self.debug:
                    print(self.debug_head+"wrote file for dssp")
                try:
                    oligomer = dict(PDB.DSSP(self.struct[0],ofile,self.dssp_path))
                except Exception as e:
                    # DSSP failures generate unnamed exceptions
                    raise DescriptorException("dssp calculation",e)
                finally:
                    # Remove the temporary file even when DSSP failed
                    os.remove(ofile)
            else:
                oligomer = dict(PDB.DSSP(self.struct[0],self.filename,self.dssp_path))
            if self.debug:
                print(self.debug_head+"finished oligomer DSSP")

            class chain_select(PDB.Select):
                #Needed for extracting each chain.
                # `chainid` is read from the enclosing function when
                # accept_chain is called, i.e. the chain currently being saved.
                def accept_chain(self,c):
                    return c.get_id() == chainid

            #Calculate DSSP for isolated chains
            isolated = dict()
            badchains = list()
            if len(self.struct[0].get_list()) == 1:
                if self.debug:
                    print(self.debug_head+"Single chain, no need to run DSSP on isolated chains")
                isolated[self.struct[0].get_list()[0].get_id()] = oligomer
            else:
                if self.debug:
                    print(self.debug_head+"Running DSSP on isolated chains")
                io = PDB.PDBIO()
                io.set_structure(self.struct)
                for chain in self.struct[0]:
                    # Make sure there are residues in this chain otherwise
                    # unnamed Exception
                    if not any(x.get_id()[0]==' ' and len(x)>2
                               for x in chain.get_residues()):
                        badchains.append(chain.get_id())
                        if self.debug:
                            print(self.debug_head+"skipping no-res chain {}".format(chain.get_id()))
                        continue
                    chainid = chain.get_id()
                    if chainid not in selected_chains:
                        if self.debug:
                            print(self.debug_head+"skipping unwanted chain {}".format(chainid))
                        continue
                    # Generate random filename
                    cfile = "_".join([uuid.uuid4().hex,self.id,chainid])
                    # Write isolated chain to random filename
                    # Then calculate DSSP from it
                    io.save(cfile, chain_select())
                    try:
                        tmp = self.parser.get_structure(chainid,cfile)
                        if self.debug:
                            print(self.debug_head+"write file for chain {}, calculating DSSP".format(chainid))
                        try:
                            isolated[chainid] = dict(PDB.DSSP(tmp[0],cfile,self.dssp_path))
                        except Exception:
                            # DSSP failures generate unnamed exceptions
                            print("Warning, DSSP failed for {} isolated chain {}, setting equal to oligomer".format(self.id,chainid))
                            isolated[chainid] = oligomer
                    finally:
                        # Remove the temporary file even if parsing raised
                        os.remove(cfile)

            #Generate DSSP DataFrame
            if self.debug:
                print(self.debug_head+"generating DSSP dataframe")
                if len(badchains)>0:
                    print(self.debug_head+"skipping badchains {}".format(badchains))
            for res in oligomer:
                if res[1][0]!=" ":
                    continue
                c = res[0]
                if c not in selected_chains or c in badchains:
                    continue
                r = res[1][1]
                i = res[1][2]
                ss = oligomer[res][2]
                try:
                    sasa = round(float(oligomer[res][3]),3)
                    isosasa = round(float(isolated[c][res][3]),3)
                except ValueError:
                    #TODO: Does this ever happen?
                    # Keep the raw field 3 for both values; the original
                    # stored the whole DSSP tuple in `sasa`.
                    sasa = oligomer[res][3]
                    isosasa = isolated[c][res][3]
                dssp.append([c,r,i,ss,sasa,isosasa])
        except (OSError,DescriptorException,PDB.PDBExceptions.PDBException) as e:
            # Generate a holder set on the fly
            print("Warning, DSSP calculation failed for {}: {}".format(self.id,e))
            genempty = True
            if self.debug:
                print(self.debug_head+"DSSP failed with exception {}".format(e))
        if len(dssp)==0:
            # Generate a holder set on the fly if it's still empty
            genempty = True
            if self.debug:
                print(self.debug_head+"DSSP set is empty, generating holder set")
        if genempty:
            dssp = list()
            for c in self.struct[0]:
                for r in c:
                    res = r.get_id()
                    if res[0]!=' ':
                        continue
                    dssp.append([c.get_id(),res[1],res[2],'?',-999,-999])
        self.dssp = pd.DataFrame(dssp,columns=self.res_header+HEADERS['dssp'])
        if self.debug:
            print("Finished dssp datatable has {} rows".format(len(self.dssp.index)))
    if self.descriptors is None:
        self.descriptors = self.dssp
    else:
        self.descriptors = self.descriptors.merge(self.dssp, on=self.res_header, how='outer')
    self.header += HEADERS['dssp']
    if self.debug:
        print(self.debug_head+"Finished DSSP, current header: {}".format(self.header))
# NOTE(review): HSEA, s, ca_list and aa_list are defined before this excerpt.

# Keep direct references to the property containers of the HSEA object.
HSEA_dict = HSEA.property_dict
HSEA_keys = HSEA.property_keys
HSEA_list = HSEA.property_list

# Half-sphere exposure (CB variant) for the same object `s`.
HSEB = PDB.HSExposureCB(s)
HSEB_dict = HSEB.property_dict
HSEB_keys = HSEB.property_keys
HSEB_list = HSEB.property_list

# Residue depth for `s`.
depth = PDB.ResidueDepth(s)
dep_dict = depth.property_dict
dep_keys = depth.property_keys
dep_list = depth.property_list

# DSSP for `s`, reading the hard-coded file "3skpFH.pdb".
dssp = PDB.DSSP(s, "3skpFH.pdb")
dssp_dict = dssp.property_dict

# For every atom in ca_list, store the neighbor-search hits within radius 8
# of its coordinates, keyed by (id of the residue's parent, residue id).
# presumably ca_list holds C-alpha atoms and the radius is in Angstrom -- confirm
nb_dict = {}
nb = PDB.NeighborSearch(ca_list)
for a in ca_list:
    t = nb.search(a.get_coord(), 8)
    aa = a.get_parent()
    aa_id = (aa.get_parent().get_id(), aa.get_id())
    nb_dict[aa_id] = t

# Start the output table with the id of every entry of aa_list.
dic = {}
dic["res_id"] = []
for a in aa_list:
    dic["res_id"].append(a.get_id())
def get_dssp_df(model, pdb_file, outfile=None, outdir=None, outext='_dssp.df', force_rerun=False):
    """Run DSSP on a model and return the results as a DataFrame cached on disk.

    The output path comes from ``ssbio.utils.outfile_maker``. When
    ``ssbio.utils.force_rerun`` reports that the results must be computed,
    DSSP is run, the records are tabulated, rows whose ``aa`` is not in
    ``aa1`` are dropped, absolute accessibility is derived and the table is
    written as CSV. Otherwise the existing CSV is loaded.

    Args:
        model: Model object passed to ``PDB.DSSP``.
        pdb_file: Path to the PDB file the model was loaded from.
        outfile: Output name forwarded to ``outfile_maker`` as ``outname``.
        outdir: Output directory forwarded to ``outfile_maker``.
        outext: Output extension forwarded to ``outfile_maker``.
        force_rerun: Flag forwarded to ``ssbio.utils.force_rerun``.

    Returns:
        pandas.DataFrame of DSSP results. It is empty, and nothing is written,
        when DSSP raises ``KeyError`` or returns no records.
    """
    # Create the output file name
    outfile = ssbio.utils.outfile_maker(inname=pdb_file, outname=outfile, outdir=outdir, outext=outext)

    if not ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        log.debug('{}: already ran DSSP and force_rerun={}, loading results'.format(outfile, force_rerun))
        return pd.read_csv(outfile, index_col=0)

    try:
        # TODO: errors with non-standard residues, ie. MSE in 4Q6U or 1nfr
        # TODO: write command line executor for DSSP and parser for raw DSSP results
        dssp = PDB.DSSP(model, pdb_file)
    except KeyError:
        return pd.DataFrame()

    if len(dssp.property_list) == 0:
        return pd.DataFrame()

    # Reorganize the results into a csv file: one record per DSSP key made of
    # chain id, residue number, insertion code and the DSSP property tuple.
    records = [
        [key[0], key[1][1], key[1][2]] + list(dssp.property_dict[key])
        for key in dssp.property_keys
    ]

    cols = ['chain', 'resnum', 'icode',
            'dssp_index', 'aa', 'ss', 'exposure_rsa', 'phi', 'psi',
            'NH_O_1_relidx', 'NH_O_1_energy', 'O_NH_1_relidx', 'O_NH_1_energy',
            'NH_O_2_relidx', 'NH_O_2_energy', 'O_NH_2_relidx', 'O_NH_2_energy']

    df = pd.DataFrame.from_records(records, columns=cols)

    # Adding additional columns.
    # .copy() makes the filtered frame independent, so the assignments below
    # do not write into a slice of the unfiltered frame.
    df = df[df['aa'].isin(list(aa1))].copy()
    df['aa_three'] = df['aa'].apply(one_to_three)
    df['max_acc'] = df['aa_three'].map(residue_max_acc['Sander'].get)
    df[['exposure_rsa', 'max_acc']] = df[['exposure_rsa', 'max_acc']].astype(float)
    df['exposure_asa'] = df['exposure_rsa'] * df['max_acc']

    df.to_csv(outfile)
    return df
def get_sasa(topology, trajectory, dssp_loc=master_dssp_location, skip=None):
    """Run DSSP on every trajectory frame and collect per-residue time series.

    This function currently only works with one or two chains: all residues
    are put in segment 'A', and if the residue numbering decreases somewhere,
    everything from the first such position on is relabelled as segment 'B'.
    Each frame is written to ``tmp.pdb`` in the current working directory
    before DSSP is run on it.

    Args:
        topology: Topology passed to ``MDAnalysis.Universe``.
        trajectory: Trajectory passed to ``MDAnalysis.Universe``.
        dssp_loc: Location of the DSSP executable.
        skip: When truthy, assigned to ``universe.trajectory.skip`` on each
            iteration.

    Returns:
        dict mapping chain id -> residue index -> dict with the lists
        ``'dssp'``, ``'phi'``, ``'psi'``, ``'rel_sasa'``, ``'abs_sasa'`` and
        ``'time'`` (one item per frame) plus ``'chain'`` and ``'resname'``.
    """
    DSSP = {'A': {}}
    universe = MDAnalysis.Universe(topology, trajectory)

    # set the chain name here. this will only work for MDAnalysis 0.16
    chain_name = universe.add_Segment(segid='A')
    universe.residues[...].segments = chain_name

    protein = universe.select_atoms("protein")

    # this attempt to identify chain breaks will only work if the resids
    # ... in the chains are not numbered consecutively
    diff_res = []
    resnums = protein.resnums
    # Start at 1: index 0 has no predecessor (the original compared it with
    # the last element and then discarded the result).
    for i in range(1, len(resnums)):
        if resnums[i] - resnums[i - 1] < 0:
            diff_res.append(i)

    if diff_res:
        chain_sep = diff_res.pop(0)
        chain_end = len(protein.resnums)
        bchain = protein[chain_sep:chain_end]
        bchain.set_segids('B')
        DSSP['B'] = {}

    for ts in universe.trajectory:
        if skip:
            universe.trajectory.skip = skip
        sys.stdout.flush()
        sys.stdout.write('\rsasa [step {0}] '.format(
            universe.trajectory.frame))

        writer = MDAnalysis.Writer("tmp.pdb")
        try:
            writer.write(protein)
        finally:
            # Release the file handle even if writing the frame fails.
            writer.close()

        parser = bp.PDBParser()
        structure = parser.get_structure('tmp', 'tmp.pdb')
        dssp = bp.DSSP(structure[0], 'tmp.pdb', dssp_loc)

        for key in dssp.keys():
            record = dssp[key]
            resname = residue_codes_reverse[record[1]]
            residx = key[1][1]
            chain = key[0]
            secondary_structure = record[2]
            rel_sasa = record[3]
            # Scale the relative value by the residue's maximum accessibility.
            abs_sasa = record[3] * dssp.residue_max_acc[resname]
            phi = record[4]
            psi = record[5]

            chain_data = DSSP[chain]
            if residx in chain_data and chain_data[residx]['resname'] == resname:
                entry = chain_data[residx]
                entry['dssp'].append(secondary_structure)
                entry['rel_sasa'].append(rel_sasa)
                entry['abs_sasa'].append(abs_sasa)
                entry['phi'].append(phi)
                entry['psi'].append(psi)
                entry['time'].append(ts.time)
            else:
                # First sighting, or same index with a different residue
                # name: start a fresh entry.
                chain_data[residx] = {
                    'dssp': [secondary_structure],
                    'phi': [phi],
                    'time': [ts.time],
                    'psi': [psi],
                    'rel_sasa': [rel_sasa],
                    'chain': chain,
                    'abs_sasa': [abs_sasa],
                    'resname': resname
                }
    return DSSP
# NOTE(review): PDB_HOME, pdb_id and the initial `structure` are defined
# before this excerpt.

# Save the first model of the current structure as <PDB_HOME>/<pdb_id>.pdb
pdb_file = "{}/{}.pdb".format(PDB_HOME, pdb_id.lower())
pdbIO = PDB.PDBIO()
pdbIO.set_structure(structure[0])
pdbIO.save(pdb_file)

#------------------------------------------------------------------------------
# Get Surface residue
# https://biopython.org/wiki/The_Biopython_Structural_Bioinformatics_FAQ
# https://biopython.org/DIST/docs/api/Bio.PDB.DSSP%27-module.html
# Download and Install DSSP is required
# ftp://ftp.cmbi.ru.nl//pub/molbio/software/dssp-2/
#------------------------------------------------------------------------------
# Read PDB
#pdb_file = '/Users/jjeong/local/project_dev/ppi/codes/utils/1A2Y.pdb'

# load structure (re-read from the file that was just written)
pdbParser = PDB.PDBParser()
structure = pdbParser.get_structure(pdb_id, pdb_file)
model = structure[0]

# Run the `mkdssp` executable on the PDB file, using the 'Sander' table of
# maximum accessibilities.
#dssp = PDB.DSSP(model=model, in_file="/Users/jjeong/local/project_dev/ppi/outputs/pdb/6dm0.dssp", file_type='DSSP')
dssp = PDB.DSSP(model=model, in_file=pdb_file, dssp='mkdssp', acc_array='Sander', file_type='PDB')

#-- To see Max ACC
maxAcc = dssp.residue_max_acc
# NOTE(review): `hsspProfile` is not defined anywhere in this excerpt, and
# `maxAcc` is not used here -- confirm which one was meant to be printed.
print(hsspProfile)
import glob

from Bio import PDB

# Run DSSP on every file under all_pdbs/ and write, per file, two FASTA-like
# records to casp11.sec: field 1 of each DSSP record concatenated (sequence),
# then field 2 concatenated (secondary structure).
pdb_files = glob.iglob('all_pdbs/*')
p = PDB.PDBParser()  # one parser is reused for all files

c = 0
# `with` closes the output file even if parsing or DSSP fails part-way.
with open('casp11.sec', 'w') as out_file:
    for c, pdb in enumerate(pdb_files, 1):
        print(c)  # progress: number of files started so far

        # The structure id is the path without its last four characters
        # (presumably a ".pdb" suffix -- confirm for other extensions).
        structure = p.get_structure(pdb[:-4], pdb)
        model = structure[0]
        dssp = PDB.DSSP(model, pdb)

        # Fetch each record once, then join instead of repeated `+=`.
        records = [dssp[key] for key in dssp.keys()]
        seq = ''.join(record[1] for record in records)
        ss = ''.join(record[2] for record in records)

        out_file.write('>{}\n'.format(pdb))
        out_file.write('{}\n'.format(seq))
        out_file.write('>{}\n'.format(pdb))
        out_file.write('{}\n'.format(ss))