def _get_protein_features(self, pdb_code, file_path, chain_selection):
    """
    Run DSSP on a PDB file and return its per-residue output as a DataFrame.

    :param pdb_code: (str) four letter PDB accession; the file is looked up as
        ``self.pdb_dir + pdb_code + '.pdb'``. Only used when ``file_path`` is
        falsy.
    :param file_path: (str) file path to PDB file. Takes precedence over
        ``pdb_code`` when both are given.
    :param chain_selection: ``'all'`` to keep every chain, otherwise a
        list-like of chain identifiers to keep.
    :return df (pd.DataFrame): one row per standard residue with the DSSP
        fields, plus ``aa_three``, ``max_acc``, ``exposure_asa`` (absolute
        accessibility as reported by DSSP), ``exposure_rsa``
        (``exposure_asa / max_acc``) and ``index`` (``chain:AA3:resnum``).
    :raises ValueError: if neither ``pdb_code`` nor ``file_path`` is given.
    """
    # Run DSSP on relevant PDB file. file_path wins when both are supplied, so
    # DSSP is only executed once.
    if file_path:
        d = dssp_dict_from_pdb_file(file_path)
    elif pdb_code:
        d = dssp_dict_from_pdb_file(self.pdb_dir + pdb_code + '.pdb')
    else:
        raise ValueError("Either pdb_code or file_path must be provided")

    # Parse DSSP output to DataFrame. Keys look like
    # (chain, (het, resnum, icode)); the het flag is not needed.
    rows = [[k[0], k[1][1], k[1][2]] + list(d[0][k]) for k in d[1]]

    hbond_cols = [
        'NH_O_1_relidx', 'NH_O_1_energy', 'O_NH_1_relidx', 'O_NH_1_energy',
        'NH_O_2_relidx', 'NH_O_2_energy', 'O_NH_2_relidx', 'O_NH_2_energy'
    ]
    # Field order of the tuples produced by dssp_dict_from_pdb_file. The
    # accessibility it reports is the absolute DSSP ACC value, not a relative
    # one, hence 'exposure_asa' here.
    raw_cols = [
        'chain', 'resnum', 'icode', 'aa', 'ss', 'exposure_asa', 'phi', 'psi',
        'dssp_index'
    ] + hbond_cols
    df = pd.DataFrame.from_records(rows, columns=raw_cols)

    # Rename cysteines to 'C': DSSP marks disulfide-bridged cysteines with
    # lowercase letters. regex=True is required explicitly because newer
    # pandas treats the pattern as a literal string by default.
    df['aa'] = df['aa'].str.replace('[a-z]', 'C', regex=True)

    # Keep standard amino acids only and drop alt_loc residues
    keep = df['aa'].isin(list(aa1)) & (df['icode'] == ' ')
    # Subset dataframe to those in chain_selection
    if chain_selection != 'all':
        keep &= df['chain'].isin(chain_selection)
    # Copy so the column assignments below do not write into a slice.
    df = df.loc[keep].copy()

    # Add additional Columns
    df['aa_three'] = df['aa'].apply(one_to_three)
    df['max_acc'] = df['aa_three'].map(residue_max_acc['Sander'].get)
    df[['exposure_asa', 'max_acc']] = df[['exposure_asa',
                                          'max_acc']].astype(float)
    # Relative accessibility = absolute accessibility / residue maximum.
    df['exposure_rsa'] = df['exposure_asa'] / df['max_acc']
    df['index'] = df['chain'] + ':' + df['aa_three'] + ':' + df[
        'resnum'].apply(str)

    # Return the columns in the order callers have always received them.
    ordered = [
        'chain', 'resnum', 'icode', 'aa', 'ss', 'exposure_rsa', 'phi', 'psi',
        'dssp_index'
    ] + hbond_cols + ['aa_three', 'max_acc', 'exposure_asa', 'index']
    return df[ordered]
def calc_ss(pdbfile) -> [str]:
    '''
    Return the DSSP secondary-structure code of each C-alpha atom of the
    structure, in the order the atoms appear in the parsed structure.

    Code Structure
    H    Alpha helix (4-12)
    B    Isolated beta-bridge residue
    E    Strand
    G    3-10 helix
    I    Pi helix
    T    Turn
    S    Bend
    -    None
    '''
    # dssp_dict_from_pdb_file simply Popens 'mkdssp' and then deals with its
    # output, src:
    # http://biopython.org/DIST/docs/api/Bio.PDB.DSSP%27-pysrc.html#dssp_dict_from_pdb_file
    #
    # Keys of the mapping look like (chainid, res_id), e.g.
    # ('A', (' ', 12, ' ')). Original author's note: this supports .cif also.
    per_residue, _ = dssp_dict_from_pdb_file(pdbfile)

    # Walk the structure itself so the residue order is consistent with the
    # C-alpha file (i.e. 'modes_CA.pdb'); needed to plot fluctuations against
    # the secondary structure correctly.
    structure = set_parser(pdbfile).get_structure(pdbfile[:4], pdbfile)
    ca_ids = [
        atom.get_full_id()
        for atom in structure.get_atoms()
        if is_ca(atom)
    ]
    # full_id[2] is the chain id and full_id[3] the residue id, which together
    # form the DSSP key. C-alphas unknown to DSSP are skipped.
    lookup_keys = [(fid[2], fid[3]) for fid in ca_ids]
    return [per_residue[key][1] for key in lookup_keys if key in per_residue]
def generate_2structures(pdbs_to_process,output_path,pdb_id,logger):
    """
    Download PDB entries, run DSSP on each and record one secondary-structure
    string per chain.

    :param pdbs_to_process: iterable of PDB identifiers to download from RCSB.
    :param output_path: directory (str) the ``.pdb`` files are written to.
    :param pdb_id: identifier forwarded to ``get_primary_map``.
    :param logger: currently unused; download errors are printed.
    :return: whatever ``generate_secondary_fasta`` returns.

    Side effects: writes ``<output_path>/<id>.pdb`` for every successful
    download and stores ``"<id>_<chain>" -> secondary structure string`` in
    ``secondary_map`` (not defined in this function; presumably a module-level
    dict - confirm).
    """
    print(pdbs_to_process)
    pdb_files = []
    for pdbs_id in pdbs_to_process:
        url = "https://files.rcsb.org/download/" + pdbs_id + ".pdb"
        destination = output_path + "/" + pdbs_id + ".pdb"
        try:
            urllib.request.urlretrieve(url, destination)
        except Exception as e:
            # Best effort: a failed download only skips that entry.
            print(str(e))
        else:
            pdb_files.append(destination)

    for pdb_file in pdb_files:
        # The first value of the tuple is a dictionary (key tuple -> structure
        # data). The second value is the list of keys, each of the form
        # ("CHAIN", ('', RESIDUE NUMBER, '')).
        dssp_dict, dssp_keys = dssp_dict_from_pdb_file(pdb_file)

        # A chain is spread over many keys: chain A is every key starting with
        # 'A', each with a different residue number. Group them per chain.
        chain_map = {}
        for key in dssp_keys:
            chain_map.setdefault(key[0], []).append(key)

        # Take the file name from the last path component. The previous
        # split('/')[2] only worked when output_path had exactly two
        # components.
        base_name = pdb_file.rsplit('/', 1)[-1].split('.')[0]
        for chain, keys in chain_map.items():
            seq = "".join(dssp_dict[chain_part][1] for chain_part in keys)
            secondary_map[base_name + "_" + chain] = seq

    return generate_secondary_fasta(get_primary_map(pdb_id,output_path),output_path)
def get_dssp_df(pdb_file, pdb_name, dir, dssp_exec='dssp'):
    """
    Run DSSP on a PDB file, tabulate the per-residue output and save it as CSV.

    :param pdb_file: path to the PDB file.
    :param pdb_name: name used for the output file.
    :param dir: output location prefix; the CSV is written to
        ``dir + pdb_name + '_sasa.csv'``.
    :param dssp_exec: DSSP executable to run (passed to Biopython as ``DSSP``).
    :return: pd.DataFrame with one row per standard residue. ``exposure_asa``
        is the absolute accessibility reported by DSSP and ``exposure_rsa`` is
        ``exposure_asa / max_acc``.
    """
    d = dssp_dict_from_pdb_file(pdb_file, DSSP=dssp_exec)

    # Keys look like (chain, (het, resnum, icode)); the het flag is not needed.
    rows = [[k[0], k[1][1], k[1][2]] + list(d[0][k]) for k in d[1]]

    hbond_cols = ['NH_O_1_relidx', 'NH_O_1_energy', 'O_NH_1_relidx',
                  'O_NH_1_energy', 'NH_O_2_relidx', 'NH_O_2_energy',
                  'O_NH_2_relidx', 'O_NH_2_energy']
    # Field order of the tuples produced by dssp_dict_from_pdb_file: the
    # residue letter comes first and dssp_index sits after psi. (The order
    # with dssp_index first belongs to Bio.PDB.DSSP.DSSP objects.) The
    # accessibility in this tuple is the absolute DSSP ACC value.
    raw_cols = ['chain', 'resnum', 'icode', 'aa', 'ss', 'exposure_asa', 'phi',
                'psi', 'dssp_index'] + hbond_cols
    df = pd.DataFrame.from_records(rows, columns=raw_cols)

    # DSSP marks disulfide-bridged cysteines with lowercase letters; map them
    # back to 'C' so they survive the standard-residue filter below.
    df['aa'] = df['aa'].str.replace('[a-z]', 'C', regex=True)

    # Adding additional columns. Copy so the assignments do not write into a
    # slice of the unfiltered frame.
    df = df[df['aa'].isin(list(aa1))].copy()
    df['aa_three'] = df['aa'].apply(one_to_three)
    df['max_acc'] = df['aa_three'].map(residue_max_acc['Sander'].get)
    df[['exposure_asa', 'max_acc']] = df[['exposure_asa', 'max_acc']].astype(float)
    df['exposure_rsa'] = df['exposure_asa'] / df['max_acc']

    # Keep the column order the CSV has always had.
    df = df[['chain', 'resnum', 'icode', 'dssp_index', 'aa', 'ss',
             'exposure_rsa', 'phi', 'psi'] + hbond_cols
            + ['aa_three', 'max_acc', 'exposure_asa']]

    df.to_csv(dir + pdb_name + '_sasa.csv')
    return df
def pdb2cd(name):
    """
    Return the fractional secondary-structure composition of ``name + '.pdb'``.

    Every residue reported by DSSP is put into one of three classes:
    'H' -> alpha helix, 'E' -> beta strand, anything else -> coil.

    Previously a residue whose code was none of 'H', 'E', '-' silently
    inherited the class of the residue processed before it.

    The structure is no longer parsed to count residues: that count was always
    replaced by the number of DSSP keys whenever the two differed.

    :param name: path of the PDB file without the ``.pdb`` extension.
    :return: ``[alpha, beta, coil]`` fractions. They sum to 1 when DSSP
        reported at least one residue.
    """
    f = name + ".pdb"
    dssp_dict, dssp_keys = dssp_dict_from_pdb_file(f)

    # 0 = alpha helix, 1 = beta strand, 2 = coil / everything else.
    class_of = {'H': 0, 'E': 1}
    ss = np.array([class_of.get(dssp_dict[key][1], 2) for key in dssp_keys],
                  dtype=int)

    # Returns the fractional composition of alpha helix, beta sheet or random coil.
    total = len(ss)
    alpha = (ss == 0).sum() / total
    beta = (ss == 1).sum() / total
    coil = (ss == 2).sum() / total
    return [alpha, beta, coil]
def get_nf1(pdb, res, chain, nf1_window):
    """
    Encode the DSSP secondary structure in a window around a residue.

    :param pdb: PDB identifier; the file is read from
        ``<directory of this module>/PDB_Data/<pdb>.pdb``.
    :param res: residue number at the centre of the window.
    :param chain: chain identifier to look residues up in. If DSSP reports no
        residue for this chain, the chain of the first DSSP entry is used
        instead (this fallback was previously applied unconditionally, which
        made the argument meaningless).
    :param nf1_window: number of residues on either side of ``res``.
    :return: list of ``2 * nf1_window + 1`` integer codes:
        1 = helix (H, G, I), 2 = turn/bend (T, S), 3 = bridge (B),
        4 = strand (E), 5 = any other DSSP code, 6 = residue not in DSSP output.
    """
    # os.path.join keeps the path relative when dirname(__file__) is empty,
    # rather than producing '//PDB_Data/...' at the filesystem root.
    filename_pdb = os.path.join(os.path.dirname(__file__), 'PDB_Data',
                                pdb + '.pdb')
    dssp = dssp_dict_from_pdb_file(filename_pdb)[0]

    # DSSP keys are (chain_id, (het, resnum, icode)).
    if not any(key[0] == chain for key in dssp):
        for key in dssp:
            chain = key[0]
            break

    codes = {'H': 1, 'G': 1, 'I': 1, 'T': 2, 'S': 2, 'B': 3, 'E': 4}
    start = res - nf1_window
    end = res + nf1_window
    nf1 = []
    # NOTE(review): the window covers residues start-1 .. end-1, i.e. it is
    # shifted one residue towards the N-terminus relative to `res`. Kept as in
    # the original; confirm whether `res` is meant to be 1-based here.
    for j in range(start - 1, end):
        try:
            structure = dssp[chain, (' ', j, ' ')][1]
        except KeyError:
            # Residue missing from the DSSP output (gap, terminus, hetero).
            nf1.append(6)
        else:
            nf1.append(codes.get(structure, 5))

    print("NF1_" + str(nf1_window) + ": " + str(nf1))
    return nf1
def get_dssp_dict_for_pdb_file(pdb_filename):
    """Return the DSSP feature mapping for a PDB file, or {} when DSSP fails.

    Any exception raised while running or parsing DSSP is logged at INFO level
    and swallowed, so callers always receive a dictionary.
    """
    try:
        return dssp_dict_from_pdb_file(pdb_filename)[0]
    except Exception:
        logging.info("No DSSP features found for {:}".format(pdb_filename))
        return {}
def compute_dssp(fname):
    '''
    Run DSSP on ``fname`` and return the secondary-structure code of every
    residue, in the iteration order of the DSSP dictionary.

    source: https://biopython.org/docs/1.75/api/Bio.PDB.DSSP.html
    '''
    assert os.path.isfile(fname), 'no such file'
    per_residue = dssp_dict_from_pdb_file(fname)[0]
    # Element 1 of each value tuple is the secondary-structure letter.
    return [features[1] for features in per_residue.values()]
def create_dssp_csv(pdb_chain_file, dssp_csv_file):
    """create a dssp csv file

    Each row is the DSSP key tuple followed by the DSSP value tuple for one
    residue, in the order of the key list returned by DSSP.

    Parameters
    ----------
    pdb_chain_file : str
        The file location of the pdb chain
    dssp_csv_file : str
        The file location of the output dssp_csv
    """
    values, keys = dssp_dict_from_pdb_file(pdb_chain_file)
    # Look each key up explicitly instead of zipping the key list with
    # values.values(): the zip only lines up when the dict's iteration order
    # and length happen to match the key list.
    data = [key + values[key] for key in keys]
    pd.DataFrame(data).to_csv(dssp_csv_file, index=False)
def get_secondary_structure_residues(chain, pdb_code='1KX5'):
    """
    Return the residue numbers of ``chain`` that DSSP assigns to a secondary
    structure element, i.e. whose code is not turn ('T'), bend ('S') or
    none ('-').

    The PDB entry is fetched with ``PDBList.retrieve_pdb_file`` (an existing
    local copy is reused because ``overwrite=False``).
    """
    fetcher = PDBList()
    local_path = fetcher.retrieve_pdb_file(pdb_code=pdb_code,
                                           file_format='pdb',
                                           overwrite=False)
    per_residue = dssp_dict_from_pdb_file(local_path)[0]
    # Keys are (chain_id, (het, resnum, icode)); value[1] is the DSSP code.
    return [
        key[1][1]
        for key, features in per_residue.items()
        if features[1] not in 'TS-' and key[0] == chain
    ]
def add_dssp_df(G: nx.Graph, dssp_config: Optional[DSSPConfig]) -> nx.Graph:
    """
    Construct DSSP dataframe and add as graph level variable to protein graph

    :param G: Input protein graph
    :type G: nx.Graph
    :param dssp_config: DSSPConfig object. Specifies which executable to run.
        Located in graphein.protein.config. If ``None``, Biopython's default
        DSSP executable is used.
    :type dssp_config: DSSPConfig, optional
    :return: Protein graph with DSSP dataframe added
    :rtype: nx.Graph
    """
    config = G.graph["config"]
    pdb_id = G.graph["pdb_id"]

    # TODO - Check for DSSP installation
    # Check for existence of pdb file. If not, download it.
    # os.path.join accepts both str and Path for pdb_dir. The previous code
    # tested a name without the '.pdb' extension and mixed '/' with '+', so
    # one of its two branches raised TypeError whatever type pdb_dir had.
    local_pdb = os.path.join(config.pdb_dir, pdb_id + ".pdb")
    if os.path.isfile(local_pdb):
        pdb_file = local_pdb
    else:
        pdb_file = download_pdb(config, pdb_id)

    # Run DSSP
    if dssp_config is None:
        dssp_dict = dssp_dict_from_pdb_file(pdb_file)
    else:
        # Extract DSSP executable
        executable = dssp_config.executable
        if config.verbose:
            print(f"Using DSSP executable '{executable}'")
        dssp_dict = dssp_dict_from_pdb_file(pdb_file, DSSP=executable)

    dssp_dict = parse_dssp_df(dssp_dict)
    dssp_dict = process_dssp_df(dssp_dict)

    if config.verbose:
        print(dssp_dict)

    # Assign DSSP Dict
    G.graph["dssp_df"] = dssp_dict

    return G
def __init__(self, pdb_file, AA_kind='common'):
    """
    Read a PDB file, run DSSP on it and build per-chain sequence and
    secondary-structure strings.

    :param pdb_file: path of the PDB file; passed to ``read_pdb`` and to DSSP.
    :param AA_kind: ``'common'`` for the 20 standard residues plus UNK; any
        other value additionally maps selenocysteine (``SEC``) to ``'B'``.

    Attributes set:
        AA_dict       three-letter -> one-letter residue codes.
        ss_dict_8_3   8-class -> 3-class secondary-structure mapping.
        protein_dict  result of ``read_pdb(pdb_file)``.
        pdb_read      True when ``protein_dict`` is a dict.
        dssp_read     True when DSSP ran without raising.
        Seq_dict, SS_dict_8, SS_dict_3
                      per-chain strings; they stay empty dicts when either
                      the PDB or the DSSP step failed. A chain maps to None
                      when it contains a residue name missing from AA_dict.

    In the per-chain strings, numbering gaps are padded with 'x' and residues
    without a DSSP record get the code 'M'.
    """
    self.AA_dict = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
        'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
        'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
        'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
        'UNK': 'X',
    }
    if AA_kind != 'common':
        self.AA_dict['SEC'] = 'B'

    # 8-classes to 3-classes and 3 to 3
    self.ss_dict_8_3 = {
        'H': 'H', 'G': 'H', 'I': 'H',
        'E': 'E', 'B': 'E',
        'S': 'C', 'T': 'C', '-': 'C', 'C': 'C',
        'X': 'X', 'x': 'x', 'M': 'M',
    }

    self.pdb_file = pdb_file
    self.protein_dict = read_pdb(pdb_file)

    try:
        dssp_dict = dssp_dict_from_pdb_file(pdb_file)[0]
        self.dssp_read = True
    except Exception:
        # Best effort: a missing executable or an unparsable file only
        # disables the DSSP-derived attributes. KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        dssp_dict = {}
        self.dssp_read = False
    self.pdb_read = isinstance(self.protein_dict, dict)

    # Always create the result containers so later attribute access does not
    # fail when loading was unsuccessful.
    self.Seq_dict = {}
    self.SS_dict_8 = {}
    self.SS_dict_3 = {}
    if not (self.pdb_read and self.dssp_read):
        return

    for chain, residues in self.protein_dict.items():
        # index_split presumably yields (numeric value, insertion code,
        # original key) - confirm against its definition.
        index_info = sorted((index_split(i) for i in residues),
                            key=lambda x: x[0])
        if not index_info:
            # Chain without residues: nothing to encode.
            self.Seq_dict[chain] = ''
            self.SS_dict_8[chain] = ''
            self.SS_dict_3[chain] = ''
            continue

        seq_parts = []
        ss8_parts = []
        ss3_parts = []
        indv_pre = index_info[0][0] - 1
        resi_dict_problem = False
        for index_value, index_foot, index in index_info:
            record = residues[index]
            # Pad gaps in the residue numbering.
            gap = 'x' * (index_value - indv_pre - 1)
            seq_parts.append(gap)
            ss8_parts.append(gap)
            ss3_parts.append(gap)

            if record['resi'] not in self.AA_dict:
                resi_dict_problem = True
                break
            resi_abbre = self.AA_dict[record['resi']]

            dssp_key = (chain, (' ', index_value, index_foot))
            if dssp_key in dssp_dict:
                dssp_aa = dssp_dict[dssp_key][0]
                resi_ss = dssp_dict[dssp_key][1]
                record['SeconStru'] = resi_ss
                if resi_abbre != dssp_aa:
                    print('Residue Error! %s and %s do not match!'
                          % (record['resi'], dssp_aa))
                else:
                    record['AminoAci'] = dssp_aa
            else:
                # Residue present in the PDB but absent from DSSP output.
                resi_ss = 'M'

            seq_parts.append(resi_abbre)
            ss8_parts.append(resi_ss)
            ss3_parts.append(self.ss_dict_8_3[resi_ss])
            indv_pre = index_value

        if resi_dict_problem:
            self.Seq_dict[chain] = None
            self.SS_dict_8[chain] = None
            self.SS_dict_3[chain] = None
        else:
            self.Seq_dict[chain] = ''.join(seq_parts)
            self.SS_dict_8[chain] = ''.join(ss8_parts)
            self.SS_dict_3[chain] = ''.join(ss3_parts)
res = ds.iloc[:, 2] chain = ds.iloc[:, 3] # Structures # H,G,I: 1 # T: 2 (T, S) # S: 3 # B: 4 # E: 5 # - 6 # Exception: 7 ssf_list = [] p = PDBParser() last_file = '../../../../pdb/' + str(pdb[0]) + '.pdb' last_dssp = dssp_dict_from_pdb_file(last_file) for i in range(len(pdb)): try: pdb_id = str(pdb[i]) print(pdb_id) try: file = '../../../../pdb/' + pdb_id.lower() + '.pdb' if file == last_file: dssp = last_dssp else: last_file = file dssp = dssp_dict_from_pdb_file(file) last_dssp = dssp except: file = '../../../../pdb/' + pdb_id.upper() + '.pdb' if file == last_file:
# Script: run DSSP on the PDB file given as the first command-line argument
# and set up the containers for the per-residue data.
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
from Bio.PDB.DSSP import dssp_dict_from_pdb_file
import json
import sys

# Alternative approach through the DSSP object (kept for reference):
# p = PDBParser()
# structure = p.get_structure("3S7I", "./3s7i.pdb")
# model = structure[0]
# dssp = DSSP(model, "./3s7i.pdb", acc_array="Miller")
# print(dssp['A', (' ', 173, ' ')])

# Path of the PDB file to analyse.
pdbFile = sys.argv[1]
# Uses the mkdssp executable located in the current working directory.
dssp_tup = dssp_dict_from_pdb_file(pdbFile, DSSP="./mkdssp")
# First element of the returned tuple: the per-residue mapping.
dssp = dssp_tup[0]
# (dssp index, amino acid, secondary structure, relative ASA, phi, psi,
# NH_O_1_relidx, NH_O_1_energy, O_NH_1_relidx, O_NH_1_energy,
# NH_O_2_relidx, NH_O_2_energy, O_NH_2_relidx, O_NH_2_energy)
# ^ layout when using the DSSP object; the tuples returned by
#   dssp_dict_from_pdb_file are laid out differently.

# construct sequence from dssp output
sequence = ""
dssp_array = []
# accessible surface area
asa = []
# secondary structure
ss = []
# residue ids
res_id = []
def dssp_sse_extract_from_pdb(filename):
    """
    Construct a list of SSEs from an un-annotated PDB file.

    DSSP is run on ``filename`` and consecutive residues of the same kind
    (helix: H, G, I; sheet: B, E) within one chain are grouped into a run. A
    run is reported as ``SSE(type, first_resnum, last_resnum)`` when it has at
    least ``REQUIRED_SSE_RES_NUM`` residues following its first one.

    Differences from the previous implementation:
      * a run that is still open when the residue list ends is now reported;
      * the end of a run is the number of its last residue instead of
        "next residue number - 1", which was wrong across numbering gaps;
      * a run never continues across a chain boundary.

    @author: Travis Peters

    :param filename: path of the PDB file.
    :return: list of SSE objects in the order they occur.
    """
    # Assumes dssp executable is located in root of project directory
    if sys.platform[0:2] == "win":
        DSSP_EXEC = "dssp"
    else:
        DSSP_EXEC = "./dssp"

    # NOTE: The DSSP codes for secondary structure used here are:
    # - H Alpha helix (4-12)
    # - G 3-10 helix
    # - I pi helix
    # - B Isolated beta-bridge residue
    # - E Strand
    # - T Turn
    # - S Bend
    # - - None
    HELIX = ['H', 'G', 'I']
    SHEET = ['B', 'E']

    # DSSP call returns a dictionary that maps (chainid, resid) to
    # (amino acid type, secondary structure code, and accessibility).
    dssp = dssp_dict_from_pdb_file(filename, DSSP_EXEC)

    sses = []
    sse_start = None
    sse_type = None
    # Number of residues in the current run after its first one.
    res_count = 0
    prev_chain = None
    prev_resnum = None

    for key in dssp[1]:
        chain_id = key[0]
        # Extract the residue number
        resnum = key[1][1]
        # Extract SSE code for a residue
        sse_code = dssp[0][key][1]

        if sse_code in HELIX:
            current = 'HELIX'
        elif sse_code in SHEET:
            current = 'SHEET'
        else:
            current = None

        if (current is not None and current == sse_type
                and chain_id == prev_chain):
            # Same kind of SSE continues within the same chain.
            res_count += 1
        else:
            # The previous run (if any) ended on the previous residue.
            if sse_type is not None and res_count >= REQUIRED_SSE_RES_NUM:
                sses.append(SSE(sse_type, sse_start, prev_resnum))
            # Start recording a new run (or none when current is None).
            res_count = 0
            sse_start = resnum
            sse_type = current

        prev_chain = chain_id
        prev_resnum = resnum

    # Flush a run that extends to the very last residue.
    if sse_type is not None and res_count >= REQUIRED_SSE_RES_NUM:
        sses.append(SSE(sse_type, sse_start, prev_resnum))

    return sses
def get_dssp_df_on_file(pdb_file, outfile=None, outdir=None, outext='_dssp.df', force_rerun=False):
    """Run DSSP directly on a structure file with the Biopython method Bio.PDB.DSSP.dssp_dict_from_pdb_file

    Avoids errors like: PDBException: Structure/DSSP mismatch at <Residue MSE het= resseq=19 icode= >
    by not matching information to the structure file (DSSP fills in the ID "X" for unknown residues)

    The tuples returned by dssp_dict_from_pdb_file are laid out differently
    from those of a Bio.PDB.DSSP.DSSP object (residue letter first,
    dssp_index after psi, absolute instead of relative accessibility). This
    function relabels and normalises them so the resulting columns mean what
    their names say: ``exposure_asa`` is the absolute accessibility and
    ``exposure_rsa`` is ``exposure_asa / max_acc``.

    Args:
        pdb_file: Path to PDB file
        outfile: Name of output file
        outdir: Path to output directory
        outext: Extension of output file
        force_rerun: If DSSP should be rerun if the outfile exists

    Returns:
        Pandas DataFrame: DSSP results, summarized. Empty if DSSP could not
        be run.

    """
    # TODO: function unfinished (per original author)

    # Create the output file name
    outfile = ssbio.utils.outfile_maker(inname=pdb_file, outname=outfile, outdir=outdir, outext=outext)

    if not ssbio.utils.force_rerun(flag=force_rerun, outfile=outfile):
        log.debug('{}: already ran DSSP and force_rerun={}, loading results'.format(outfile, force_rerun))
        return pd.read_csv(outfile, index_col=0)

    try:
        d = dssp_dict_from_pdb_file(pdb_file)
    except Exception:
        # The previous clause, `except Exception('...')`, named an exception
        # *instance*, which makes Python raise TypeError while handling the
        # failure instead of reaching this handler.
        log.error('{}: unable to run DSSP'.format(pdb_file))
        return pd.DataFrame()

    # Keys look like (chain, (het, resnum, icode)); the het flag is not needed.
    rows = [[k[0], k[1][1], k[1][2]] + list(d[0][k]) for k in d[1]]

    hbond_cols = ['NH_O_1_relidx', 'NH_O_1_energy', 'O_NH_1_relidx',
                  'O_NH_1_energy', 'NH_O_2_relidx', 'NH_O_2_energy',
                  'O_NH_2_relidx', 'O_NH_2_energy']
    # Field order of the tuples produced by dssp_dict_from_pdb_file.
    raw_cols = ['chain', 'resnum', 'icode', 'aa', 'ss', 'exposure_asa', 'phi',
                'psi', 'dssp_index'] + hbond_cols
    df = pd.DataFrame.from_records(rows, columns=raw_cols)

    # DSSP marks disulfide-bridged cysteines with lowercase letters; map them
    # back to 'C' so they survive the standard-residue filter below.
    df['aa'] = df['aa'].str.replace('[a-z]', 'C', regex=True)

    # Adding additional columns. Copy so the assignments do not write into a
    # slice of the unfiltered frame.
    df = df[df['aa'].isin(list(aa1))].copy()
    df['aa_three'] = df['aa'].apply(one_to_three)
    df['max_acc'] = df['aa_three'].map(residue_max_acc['Sander'].get)
    df[['exposure_asa', 'max_acc']] = df[['exposure_asa', 'max_acc']].astype(float)
    df['exposure_rsa'] = df['exposure_asa'] / df['max_acc']

    # Keep the column order the output file has always had.
    df = df[['chain', 'resnum', 'icode', 'dssp_index', 'aa', 'ss',
             'exposure_rsa', 'phi', 'psi'] + hbond_cols
            + ['aa_three', 'max_acc', 'exposure_asa']]

    df.to_csv(outfile)
    return df
def get_dssp_dict(self):
    """Run DSSP on ``self.pdb_fname`` and return the per-residue mapping.

    Only the first element of the tuple returned by
    ``dssp_dict_from_pdb_file`` is returned; the key list is discarded.
    """
    return dssp_dict_from_pdb_file(self.pdb_fname)[0]