def _any_equiv_in(residue_set, equiv_pos, model_ids):
    """Return True if any (model, chain, pos) built from *equiv_pos* is in *residue_set*."""
    return any((model_id, chain, pos) in residue_set
               for chain, pos in equiv_pos
               for model_id in model_ids)


def summarize_residues(mutations, pdb_info, radius, rASA, dssp, tmp_dir, quiet=True):
    """Tabulate buried/interface residue and mutation counts per structure.

    For every structure in ``pdb_info`` that can be read and that has
    mutations, one row is produced per tumor type.

    Parameters
    ----------
    mutations : dict
        Maps a structure id to a list of 4-item records that are unpacked
        as (tumor type, residue, mutation count, chain).
    pdb_info : dict
        Maps a structure id to a dict holding a ``'path'`` entry plus
        entries whose values are collections of chain letters. The dict
        is not modified (a copy is taken before ``'path'`` is removed).
    radius
        Forwarded to ``pstruct.get_interface_residues``.
    rASA
        Cutoff forwarded to ``pstruct.get_buried_residues``.
    dssp
        Forwarded to ``pstruct.get_buried_residues`` as the DSSP path.
    tmp_dir
        Forwarded to ``pstruct.get_buried_residues``.
    quiet : bool
        Forwarded to ``utils.read_structure``.

    Returns
    -------
    list of list
        The first row is a 13-column header. Each following row holds the
        first 10 columns only; the three p-value columns named in the
        header are not filled in here.
    """
    # iterate over each structure
    logger.info('Running of PDB structures . . .')
    output = [[
        'structure', 'tumor type', '# buried residues',
        '# protein interface residues', '# nucleic acid interface residues',
        'total residues', '# buried mutations',
        '# protien interface mutations', '# nucleic acid interface mutations',
        'total # mutations', 'burial p-value', 'protein interface p-value',
        'nucleic acid interface p-value'
    ]]

    # NOTE(review): the original hard-codes model ids 0..3 when testing
    # equivalent positions; kept as is -- confirm this covers all models.
    model_ids = range(4)

    for structure_id in pdb_info:
        # copy so that removing 'path' does not mutate the caller's dict
        # (otherwise a second call with the same pdb_info raises KeyError)
        struct_info = pdb_info[structure_id].copy()
        pdb_path = struct_info.pop('path')

        # read in structure
        structure = utils.read_structure(pdb_path, structure_id, quiet=quiet)
        if structure is None:
            continue

        # make a list of all chain letters in structure
        struct_chains = []
        for chains in struct_info.values():
            struct_chains.extend(chains)

        # skip structure if no mutations
        structure_mutations = mutations.get(structure_id, [])
        if not structure_mutations:
            continue

        # separate out mutation info
        ttypes, mres, mcount, mchains = zip(*structure_mutations)

        # stratify mutations by tumor type: ttype -> list of indices
        unique_ttypes = set(ttypes)
        ttype_ixs = {
            t: [i for i, tt in enumerate(ttypes) if tt == t]
            for t in unique_ttypes
        }
        unique_ttypes = list(unique_ttypes)

        # obtain relevant info from structure; only the per-tumor-type
        # mutation counts are used below
        tmp_info = pstruct.get_structure_info(structure, mchains, mres, mcount,
                                              struct_chains, ttype_ixs)
        (mut_res_centers_of_geometry, mut_res_mutation_counts,
         all_res_centers_of_geometry, models) = tmp_info

        annotated_chains = set(struct_chains)

        # find buried residues (restricted to annotated chains)
        buried_res = pstruct.get_buried_residues(structure, rASA, tmp_dir, dssp)
        tmp_buried = [res_id for res_id in buried_res
                      if res_id[2] in annotated_chains]
        total_res = len(tmp_buried)
        buried_res_info = {(info[1], info[2], info[3])
                           for info in tmp_buried if info[-1] == 1}
        num_buried_res = len(buried_res_info)

        # find interface residues for proteins and nucleic acids
        interface_res = pstruct.get_interface_residues(structure, radius)
        interface_prot_info = {
            (res_id[1], res_id[2], res_id[3][1])
            for res_id in interface_res
            if (res_id[2] in annotated_chains)
            and interface_res[res_id][0] == 1
        }
        interface_na_info = {
            (res_id[1], res_id[2], res_id[3][1])
            for res_id in interface_res
            if (res_id[2] in annotated_chains)
            and sum(interface_res[res_id][1:]) >= 1
        }
        num_interface_prot_res = len(interface_prot_info)
        num_interface_na_res = len(interface_na_info)

        # iterate through each tumour type
        for tumour in unique_ttypes:
            t_mut_res_mutation_counts = mut_res_mutation_counts[tumour]

            # count total mutations in structure while avoiding double
            # counting due to same id and chain being on multiple models
            obs_models = []
            obs_chains = []
            total_mutations = 0
            total_buried_muts = 0
            total_interface_prot_muts = 0
            total_interface_na_muts = 0
            banned_chains = set()

            for k in t_mut_res_mutation_counts:
                mutations_to_add = t_mut_res_mutation_counts[k]
                cur_model = k[1]
                cur_chain = k[2]
                cur_pos = k[3][1]

                # same chain already seen on a different model -> count 0
                if any(cur_model != seen_model and cur_chain == seen_chain
                       for seen_model, seen_chain in zip(obs_models, obs_chains)):
                    mutations_to_add = 0
                # position already counted through an equivalent chain
                if (cur_chain, cur_pos) in banned_chains:
                    mutations_to_add = 0

                # add all equivalent chains to banned list
                equiv_chains = pstruct.find_eq_letters(struct_info, cur_chain)
                if equiv_chains is not None:
                    equiv_pos = {(e, cur_pos) for e in equiv_chains}
                    banned_chains |= equiv_pos - {(cur_chain, cur_pos)}
                else:
                    # without this fallback equiv_pos was unbound (or stale
                    # from the previous residue) when no equivalents exist
                    equiv_pos = {(cur_chain, cur_pos)}

                # add to total mutation count
                total_mutations += mutations_to_add

                # add buried residue mutation counts
                if _any_equiv_in(buried_res_info, equiv_pos, model_ids):
                    total_buried_muts += mutations_to_add

                # add interface residue mutation counts
                if _any_equiv_in(interface_prot_info, equiv_pos, model_ids):
                    total_interface_prot_muts += mutations_to_add
                if _any_equiv_in(interface_na_info, equiv_pos, model_ids):
                    total_interface_na_muts += mutations_to_add

                # mark chains/models
                obs_models.append(cur_model)
                obs_chains.append(cur_chain)

            output.append([
                structure_id,
                tumour,
                num_buried_res,
                num_interface_prot_res,
                num_interface_na_res,
                total_res,
                total_buried_muts,
                total_interface_prot_muts,
                total_interface_na_muts,
                total_mutations,
            ])
    return output
def main(opts):
    """Build residue graphs per tumor type and write their components.

    Reads the PDB info file and the multiple-testing file, keeps residues
    whose q-value is at or below ``opts['q_value']``, and for every
    structure annotated in the tumor type's ``mupit_mutations_<ttype>``
    file feeds the significant residues to ``update_graph``. A second,
    combined graph (labelled ``'REF'`` in the output) is updated for all
    tumor types except those in ``banned_ttypes``. Rows returned by
    ``retrieve_components`` are written tab-delimited to ``opts['output']``.

    Parameters
    ----------
    opts : dict
        Uses the keys ``'pdb_info'``, ``'multiple_testing'``, ``'q_value'``,
        ``'annotation_dir'``, ``'radius'`` and ``'output'``.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # read in multiple testing file
    mtc = read_delim(opts['multiple_testing'])
    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    qval_ix = header.index('q-value')
    gene_ix = header.index('HUGO Symbol')
    tx_ix = header.index('Sequence Ontology Transcript')
    res_ix = header.index('CRAVAT Res')

    # tumor types excluded from the combined cross-tumor-type graph
    # (loop invariant, so defined once here)
    banned_ttypes = ['COAD', 'READ', 'PANCAN12', 'CHOL',
                     'SARC', 'TGCT', 'THYM', 'UVM']

    # iterate through each tumor type
    output = []
    gene2graph_all = {}  # graphs for combined tumor types
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))

        # initialize the graph to empty
        gene2graph = {}  # graph for an individual tumor type

        # get the significant residues for the tumor type
        mtc_ttype = [m for m in mtc
                     if (m[ttype_ix] == ttype)
                     and (float(m[qval_ix]) <= opts['q_value'])]
        significant_res = {(m[gene_ix], m[tx_ix], int(m[res_ix]))
                           for m in mtc_ttype}

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        annotation, col_pos = read_mupit_file(annotation_file, significant_res)
        pdb_ix = col_pos['pdb']
        anot_gene_ix = col_pos['gene']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        chain_col = col_pos['chain']
        pdb_res_col = col_pos['pdb_res']

        # sort by structure (required before groupby)
        annotation.sort(key=lambda x: x[pdb_ix])
        for pdb_id, grp in it.groupby(annotation, lambda x: x[pdb_ix]):
            # structures missing from the PDB info file cannot be loaded
            if pdb_id not in pdb_info:
                logger.warning(
                    'Skipping {0}: not found in PDB info'.format(pdb_id))
                continue

            # copy so that popping 'path' leaves pdb_info intact for the
            # next tumor type that references the same structure
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for chains in struct_info.values():
                struct_chains.extend(chains)

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            cog = pstruct.calc_center_of_geometry(struct, struct_chains)

            # get significant residues keyed by (chain, pdb residue number)
            signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[chain_col], int(s[pdb_res_col]))
                except (ValueError, TypeError):
                    # residue number is not an integer; skip the row
                    print('int error')
                    continue
                signif_struct_info[tmp_pos] = (s[anot_gene_ix],
                                               s[anot_tx_ix],
                                               s[anot_res_ix])

            # update the graph to reflect info from the current structure
            gene2graph = update_graph(gene2graph, cog, signif_struct_info,
                                      struct, opts['radius'])

            # update graph for the combined cross-tumor type regions
            if ttype not in banned_ttypes:
                gene2graph_all = update_graph(gene2graph_all, cog,
                                              signif_struct_info,
                                              struct, opts['radius'])

        # format the results into the output list
        output += retrieve_components(gene2graph, ttype)
        logger.info('Finished {0}'.format(ttype))

    # update output to contain cross-tumor type reference regions
    output += retrieve_components(gene2graph_all, 'REF')

    # write output
    # NOTE(review): 'wb' kept from the original; writing str to a binary
    # handle only works on Python 2.
    with open(opts['output'], 'wb') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')
    logger.info('Finished Successfully!!!')
def main(opts):
    """Build residue graphs per tumor type and write their components.

    Residues from the multiple-testing file with a q-value at or below
    ``opts['q_value']`` are treated as significant. For each tumor type the
    matching ``mupit_mutations_<ttype>`` annotation file is read, grouped
    by structure, and the significant residues of each structure are passed
    to ``update_graph`` twice: once for the tumor-type graph and once for
    the combined graph that is reported under the label ``'REF'``.
    Rows from ``retrieve_components`` are written tab-delimited to
    ``opts['output']``.

    Parameters
    ----------
    opts : dict
        Uses the keys ``'pdb_info'``, ``'multiple_testing'``, ``'q_value'``,
        ``'annotation_dir'``, ``'radius'`` and ``'output'``.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # read in multiple testing file
    mtc = read_delim(opts['multiple_testing'])
    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    qval_ix = header.index('q-value')
    gene_ix = header.index('HUGO Symbol')
    tx_ix = header.index('Sequence Ontology Transcript')
    res_ix = header.index('CRAVAT Res')

    # iterate through each tumor type
    output = []
    gene2graph_all = {}  # graphs for combined tumor types
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))

        # graph for an individual tumor type starts out empty
        gene2graph = {}

        # get the significant residues for the tumor type
        q_cutoff = opts['q_value']
        mtc_ttype = [m for m in mtc
                     if m[ttype_ix] == ttype and float(m[qval_ix]) <= q_cutoff]
        significant_res = {(m[gene_ix], m[tx_ix], int(m[res_ix]))
                           for m in mtc_ttype}

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        annotation, col_pos = read_mupit_file(annotation_file, significant_res)
        pdb_ix = col_pos['pdb']
        anot_gene_ix = col_pos['gene']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        chain_col = col_pos['chain']
        pdb_res_col = col_pos['pdb_res']

        # sort by structure (required before groupby)
        annotation.sort(key=lambda x: x[pdb_ix])
        for pdb_id, grp in it.groupby(annotation, lambda x: x[pdb_ix]):
            # fringe case: structure absent from the PDB info file
            if pdb_id not in pdb_info:
                print('skipping ' + pdb_id)
                continue

            # get path info; copy so pdb_info keeps its 'path' entry
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for chains in struct_info.values():
                struct_chains.extend(chains)

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            cog = pstruct.calc_center_of_geometry(struct, struct_chains)

            # get significant residues keyed by (chain, pdb residue number)
            signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[chain_col], int(s[pdb_res_col]))
                except (ValueError, TypeError):
                    # residue number is not an integer; skip the row
                    print('int error')
                    continue
                signif_struct_info[tmp_pos] = (s[anot_gene_ix],
                                               s[anot_tx_ix],
                                               s[anot_res_ix])

            # update the graph to reflect info from the current structure
            gene2graph = update_graph(gene2graph, cog, signif_struct_info,
                                      struct, opts['radius'])

            # update graph for the combined cross-tumor type regions
            gene2graph_all = update_graph(gene2graph_all, cog,
                                          signif_struct_info,
                                          struct, opts['radius'])

        # format the results into the output list
        output += retrieve_components(gene2graph, ttype)
        logger.info('Finished {0}'.format(ttype))

    # update output to contain cross-tumor type reference regions
    output += retrieve_components(gene2graph_all, 'REF')

    # write output
    # NOTE(review): 'wb' kept from the original; writing str to a binary
    # handle only works on Python 2.
    with open(opts['output'], 'wb') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')
    logger.info('Finished Successfully!!!')
def main(opts):
    """Build a residue graph per structure and tumor type and write components.

    Residue rows are read with ``read_residue_info`` and per-tumor-type
    p-value thresholds with ``read_thresholds``. Tumor types without a
    threshold are skipped. For each remaining tumor type the rows at or
    below the threshold define the significant (structure, chain, residue)
    triples. The ``mupit_mutations_<ttype>`` annotations are grouped by
    structure, split into significant and non-significant positions, and
    passed to ``update_graph`` starting from an empty graph for every
    structure. Rows from ``retrieve_components`` are written tab-delimited
    to ``opts['output']``.

    Parameters
    ----------
    opts : dict
        Uses the keys ``'pdb_info'``, ``'input'``, ``'significance'``,
        ``'annotation_dir'``, ``'radius'`` and ``'output'``.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # use external module to separate out the residues in the hotspot.py
    # output onto separate lines
    mtc = read_residue_info(opts['input'])
    pval_thresholds = read_thresholds(opts['significance'])

    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    struct_ix = header.index('Structure')
    model_ix = header.index('Model')  # unused below; lookup kept from the original
    chain_ix = header.index('Chain')
    res_ix = header.index('Mutation Residues')
    pval_ix = header.index('Hotspot P-value')

    # iterate through each tumor type
    output = []
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))

        # if there is no pval threshold, nothing is significant
        if ttype not in pval_thresholds:
            continue

        # get the significant residues for the tumor type
        threshold = pval_thresholds[ttype]
        mtc_ttype = [m for m in mtc
                     if m[ttype_ix] == ttype and float(m[pval_ix]) <= threshold]
        significant_res = [(m[struct_ix], m[chain_ix], int(m[res_ix]))
                           for m in mtc_ttype]

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        all_annotation, col_pos = read_mupit_file(annotation_file,
                                                  significant_res)
        pdb_ix = col_pos['pdb']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        chain_col = col_pos['chain']
        pdb_res_col = col_pos['pdb_res']

        # set built after read_mupit_file (in case it alters the list) so
        # the per-row membership test below is O(1) instead of a list scan
        significant_lookup = set(significant_res)

        # sort by structure (required before groupby)
        all_annotation.sort(key=lambda x: x[pdb_ix])
        for pdb_id, grp in it.groupby(all_annotation, lambda x: x[pdb_ix]):
            # initialize the graph to empty for every structure
            struct2graph = {}

            # structures missing from the PDB info file cannot be loaded
            if pdb_id not in pdb_info:
                logger.warning(
                    'Skipping {0}: not found in PDB info'.format(pdb_id))
                continue

            # copy so pdb_info keeps its 'path' entry for later tumor types
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for chains in struct_info.values():
                struct_chains.extend(chains)

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            all_cogs = pstruct.calc_center_of_geometry(struct, struct_chains)

            # split annotated positions into significant / non-significant
            signif_struct_info = {}
            non_signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[chain_col], int(s[pdb_res_col]))
                except (ValueError, TypeError):
                    # residue number is not an integer; skip the row
                    continue
                info = (s[pdb_ix], s[anot_tx_ix], int(s[anot_res_ix]))
                if (s[pdb_ix], tmp_pos[0], tmp_pos[1]) in significant_lookup:
                    signif_struct_info[tmp_pos] = info
                else:
                    non_signif_struct_info[tmp_pos] = info

            # update the graph to reflect info from the current structure
            struct2graph, signif_res_neighbours = update_graph(
                struct2graph, all_cogs, signif_struct_info,
                non_signif_struct_info, struct, opts['radius'])

            # format the results into the output list
            output += retrieve_components(struct2graph, ttype, all_cogs,
                                          opts['radius'],
                                          signif_res_neighbours)

        logger.info('Finished {0}'.format(ttype))

    # write output
    # NOTE(review): 'wb' kept from the original; writing str to a binary
    # handle only works on Python 2.
    with open(opts['output'], 'wb') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')
    logger.info('Finished Successfully!!!')
def main(opts):
    """Compute hotspot p-values for mutated residues of every PDB structure.

    Currently, performs analysis for the given genes. It attempts to use
    any available PDB structures. It then loops through each protein chain
    and tumor type.

    For each readable structure with mutations, and each tumor type
    selected by ``opts['tumor_type']`` (a specific type or ``'EVERY'``),
    the observed mutation density is compared against a simulated null
    distribution from ``sim.generate_null_dist``. One tab-delimited row per
    structure and tumor type is written to ``opts['output']``.

    Parameters
    ----------
    opts : dict
        Uses the keys ``'annotation'``, ``'mutations'``, ``'log_level'``,
        ``'radius'``, ``'tumor_type'``, ``'num_simulations'``, ``'seed'``,
        ``'stop_criterion'``, ``'output'`` and ``'error_pdb'``.
    """
    # read in data
    logger.info('Reading in annotations . . .')
    pdb_info = utils.read_pdb_info(opts['annotation'])
    logger.info('Finished reading in annotations.')
    logger.info('Reading in mutations . . .')
    mutations = utils.read_mutations(opts['mutations'])
    logger.info('Finished reading in mutations.')

    # iterate over each structure
    logger.info('Running of PDB structures . . .')
    output = []
    # NOTE(review): nothing in this function ever appends to this list, so
    # the error_pdb file below is never written -- confirm intended source.
    error_pdb_structs = []
    quiet = opts['log_level'] != "DEBUG"  # flag indicating pdb warnings

    for structure_id in pdb_info:
        print(structure_id)

        # get pdb info; 'path' is removed so only chain entries remain
        struct_info = pdb_info[structure_id]
        pdb_path = struct_info.pop('path')

        # read in structure
        structure = utils.read_structure(pdb_path, structure_id, quiet=quiet)
        if structure is None:
            continue

        # make a list of all chain letters in structure
        struct_chains = []
        for chains in struct_info.values():
            struct_chains.extend(chains)

        # skip structure if no mutations
        structure_mutations = mutations.get(structure_id, [])
        if not structure_mutations:
            continue

        # separate out mutation info
        ttypes, mres, mcount, mchains = zip(*structure_mutations)

        # stratify mutations by tumor type: ttype -> list of indices
        unique_ttypes = set(ttypes)
        ttype_ixs = {
            t: [i for i, tt in enumerate(ttypes) if tt == t]
            for t in unique_ttypes
        }
        unique_ttypes = list(unique_ttypes)

        # obtain relevant info from structure
        tmp_info = get_structure_info(structure, mchains, mres, mcount,
                                      struct_chains, ttype_ixs)
        (mut_res_centers_of_geometry, mut_res_mutation_counts,
         all_res_centers_of_geometry, models) = tmp_info
        if not all_res_centers_of_geometry:
            logger.error('No available center of geometries for {0}'.format(
                structure_id))
            continue

        # get neighbours for all residues
        neighbors = find_neighbors(all_res_centers_of_geometry, opts['radius'])

        # iterate through each tumour type
        for tumour in unique_ttypes:
            # skip tumor types if not one specified
            if opts['tumor_type'] != tumour and opts['tumor_type'] != 'EVERY':
                continue

            # draw information for the specific tumour type
            t_mut_res_mutation_counts = mut_res_mutation_counts[tumour]

            mut_density = src.mutations.mutation_density(
                t_mut_res_mutation_counts, neighbors)
            mut_vals = mut_density.values()
            max_obs_dens = max(mut_vals) if mut_vals else 0

            # count total mutations in structure while avoiding double
            # counting due to same id and chain being on multiple models
            obs_models = []
            obs_chains = []
            total_mutations = 0
            for k in t_mut_res_mutation_counts:
                mutations_to_add = t_mut_res_mutation_counts[k]
                # same chain already seen on a different model -> count 0
                if any(k[1] != seen_model and k[2] == seen_chain
                       for seen_model, seen_chain in zip(obs_models, obs_chains)):
                    mutations_to_add = 0
                total_mutations += mutations_to_add
                obs_models.append(k[1])
                obs_chains.append(k[2])

            # generate empirical null distribution
            sim_null_dist = sim.generate_null_dist(
                structure_id, models, struct_info,
                all_res_centers_of_geometry, total_mutations,
                opts['num_simulations'], opts['seed'], neighbors,
                opts['stop_criterion'], max_obs_dens)

            # get a list of lists format for compute p values function
            mut_list = [[res_id, mut_density[res_id]]
                        for res_id in mut_density]
            if not t_mut_res_mutation_counts:
                # NOTE(review): leftover debugging output, kept unchanged
                print("here")

            # compute p-values for observed densities
            obs_pvals, _ = sim.compute_pvals(mut_list, sim_null_dist)

            output.append([
                structure_id,
                tumour,
                ','.join([str(o[0][1]) for o in mut_list]),
                ','.join([str(o[0][2]) for o in mut_list]),
                ','.join([str(o[0][3][1]) for o in mut_list]),
                ','.join([str(t_mut_res_mutation_counts[o[0]])
                          for o in mut_list]),
                ','.join([str(o[1]) for o in mut_list]),
                ','.join(map(str, obs_pvals)),
            ])

    # write output to file
    output = [[
        'Structure',
        'Tumor Type',
        'Model',
        'Chain',
        'Mutation Residues',
        'Residue Mutation Count',
        'Mutation Density',
        'Hotspot P-value',
    ]] + output
    with open(opts['output'], 'w') as handle:
        csv.writer(handle, delimiter='\t',
                   lineterminator='\n').writerows(output)

    # if user specified to log failed reading of pdbs
    if opts['error_pdb'] and error_pdb_structs:
        with open(opts['error_pdb'], 'w') as handle:
            for bad_pdb in error_pdb_structs:
                handle.write(bad_pdb + '\n')

    print("NUM_MODEL_DIFF: " + str(sim.NUM_MODEL_DIFF))
    print("NUM_CHAIN_DIFF: " + str(sim.NUM_CHAIN_DIFF))
    print("STRUCT_MODEL_DIFF: " + str(sim.STRUCT_MODEL_DIFF))
    print("STRUCT_CHAIN_DIFF: " + str(sim.STRUCT_CHAIN_DIFF))
    logger.info('Finished successfully!')
def main(opts):
    """Compute hotspot p-values for mutated residues of every PDB structure.

    Currently, performs analysis for the given genes. It attempts to use
    any available PDB structures. It then loops through each protein chain
    and tumor type.

    Every structure listed in the annotation file that can be read and has
    mutations is processed. For each tumor type matching
    ``opts['tumor_type']`` (or all of them when it is ``'EVERY'``) the
    observed mutation densities are scored against a null distribution
    from ``sim.generate_null_dist``, and one row is appended to the
    tab-delimited file at ``opts['output']``.

    Parameters
    ----------
    opts : dict
        Uses the keys ``'annotation'``, ``'mutations'``, ``'log_level'``,
        ``'radius'``, ``'tumor_type'``, ``'num_simulations'``, ``'seed'``,
        ``'stop_criterion'``, ``'output'`` and ``'error_pdb'``.
    """
    # read in data
    logger.info('Reading in annotations . . .')
    pdb_info = utils.read_pdb_info(opts['annotation'])
    logger.info('Finished reading in annotations.')
    logger.info('Reading in mutations . . .')
    mutations = utils.read_mutations(opts['mutations'])
    logger.info('Finished reading in mutations.')

    # iterate over each structure
    logger.info('Running of PDB structures . . .')
    output = []
    # NOTE(review): this list is never filled in this function, so the
    # error_pdb file below is never written -- confirm intended source.
    error_pdb_structs = []
    quiet = opts['log_level'] != "DEBUG"  # flag indicating pdb warnings
    wanted_ttype = opts['tumor_type']

    for structure_id in pdb_info:
        print(structure_id)

        # get pdb info; 'path' is removed so only chain entries remain
        struct_info = pdb_info[structure_id]
        pdb_path = struct_info.pop('path')

        # read in structure
        structure = utils.read_structure(pdb_path, structure_id, quiet=quiet)
        if structure is None:
            continue

        # make a list of all chain letters in structure
        struct_chains = []
        for chain_letters in struct_info.values():
            struct_chains.extend(chain_letters)

        # get mutation info, skipping structures without mutations
        structure_mutations = mutations.get(structure_id, [])
        if not structure_mutations:
            continue

        # separate out mutation info
        ttypes, mres, mcount, mchains = zip(*structure_mutations)

        # stratify mutations by tumor type: ttype -> list of indices
        unique_ttypes = set(ttypes)
        ttype_ixs = {
            t: [ix for ix, tt in enumerate(ttypes) if tt == t]
            for t in unique_ttypes
        }
        unique_ttypes = list(unique_ttypes)

        # obtain relevant info from structure
        tmp_info = get_structure_info(structure, mchains, mres, mcount,
                                      struct_chains, ttype_ixs)
        (mut_res_centers_of_geometry, mut_res_mutation_counts,
         all_res_centers_of_geometry, models) = tmp_info
        if not all_res_centers_of_geometry:
            logger.error('No available center of geometries for {0}'.format(
                structure_id))
            continue

        # get neighbours for all residues
        neighbors = find_neighbors(all_res_centers_of_geometry, opts['radius'])

        # iterate through each tumour type
        for tumour in unique_ttypes:
            # skip tumor types if not one specified
            if wanted_ttype != tumour and wanted_ttype != 'EVERY':
                continue

            # draw information for the specific tumour type
            t_mut_res_mutation_counts = mut_res_mutation_counts[tumour]

            mut_density = src.mutations.mutation_density(
                t_mut_res_mutation_counts, neighbors)
            mut_vals = mut_density.values()
            max_obs_dens = max(mut_vals) if mut_vals else 0

            # count total mutations in structure while avoiding double
            # counting due to same id and chain being on multiple models
            obs_models = []
            obs_chains = []
            total_mutations = 0
            for res_key in t_mut_res_mutation_counts:
                mutations_to_add = t_mut_res_mutation_counts[res_key]
                model_id, chain_id = res_key[1], res_key[2]
                # same chain already seen on a different model -> count 0
                if any(model_id != seen_model and chain_id == seen_chain
                       for seen_model, seen_chain in zip(obs_models, obs_chains)):
                    mutations_to_add = 0
                total_mutations += mutations_to_add
                obs_models.append(model_id)
                obs_chains.append(chain_id)

            # generate empirical null distribution
            sim_null_dist = sim.generate_null_dist(
                structure_id, models, struct_info,
                all_res_centers_of_geometry, total_mutations,
                opts['num_simulations'], opts['seed'], neighbors,
                opts['stop_criterion'], max_obs_dens)

            # get a list of lists format for compute p values function
            mut_list = [[res_id, mut_density[res_id]]
                        for res_id in mut_density]
            if not t_mut_res_mutation_counts:
                # NOTE(review): leftover debugging output, kept unchanged
                print("here")

            # compute p-values for observed densities
            obs_pvals, _ = sim.compute_pvals(mut_list, sim_null_dist)

            output.append([
                structure_id,
                tumour,
                ','.join([str(o[0][1]) for o in mut_list]),
                ','.join([str(o[0][2]) for o in mut_list]),
                ','.join([str(o[0][3][1]) for o in mut_list]),
                ','.join([str(t_mut_res_mutation_counts[o[0]])
                          for o in mut_list]),
                ','.join([str(o[1]) for o in mut_list]),
                ','.join(map(str, obs_pvals)),
            ])

    # write output to file
    output = [[
        'Structure',
        'Tumor Type',
        'Model',
        'Chain',
        'Mutation Residues',
        'Residue Mutation Count',
        'Mutation Density',
        'Hotspot P-value',
    ]] + output
    with open(opts['output'], 'w') as handle:
        csv.writer(handle, delimiter='\t',
                   lineterminator='\n').writerows(output)

    # if user specified to log failed reading of pdbs
    if opts['error_pdb'] and error_pdb_structs:
        with open(opts['error_pdb'], 'w') as handle:
            for bad_pdb in error_pdb_structs:
                handle.write(bad_pdb + '\n')

    print("NUM_MODEL_DIFF: " + str(sim.NUM_MODEL_DIFF))
    print("NUM_CHAIN_DIFF: " + str(sim.NUM_CHAIN_DIFF))
    print("STRUCT_MODEL_DIFF: " + str(sim.STRUCT_MODEL_DIFF))
    print("STRUCT_CHAIN_DIFF: " + str(sim.STRUCT_CHAIN_DIFF))
    logger.info('Finished successfully!')
def get_buried_residues(structure, cutoff, tmp_dir, dssp_path):
    """Finds buried residues by using relative solvent accessible surface area.

    All models of ``structure`` are flattened into a single model (chains
    of later models receive unused chain ids) because of limitations of
    DSSP. The flattened structure is saved to a temporary PDB file in
    ``tmp_dir``, DSSP is run on it, and results are mapped back to the
    original (model, chain) ids.

    Note that ``structure`` is modified in place: chain ids and model ids
    are reassigned during flattening.

    Parameters
    ----------
    structure
        Structure object; iterated over models and chains, and its ``id``
        is used to name the temporary file.
    cutoff
        A residue is flagged as buried when ``result[3]`` of its DSSP
        record is ``<= cutoff``.
    tmp_dir : str
        Directory for the temporary ``<structure id>.pdb`` file.
    dssp_path
        Passed to ``Bio.PDB.DSSP`` as ``dssp``.

    Returns
    -------
    list of list
        One entry per amino acid residue:
        ``[structure_id, model, chain, residue number, result[3],
        normalized CA b-factor or None, buried flag (1/0)]``.
        An empty list is returned if the available chain ids run out
        while flattening.
    """
    # get structure id
    structure_id = structure.id
    all_letters = set(string.ascii_uppercase) | set(string.ascii_lowercase)

    # flatten models into a single model due to limitations of DSSP
    id_map = {}  # (new model, new chain) -> (original model, original chain)
    for k, model in enumerate(structure):
        if k == 0:
            # the first model keeps its chain ids (blank becomes 'A')
            used_letters = set()
            for chain in model:
                if chain.get_id() == ' ':
                    chain.id = 'A'
                id_map[(model.id, chain.id)] = (model.id, chain.id)
                used_letters.add(chain.id)
            new_model = model.id
        else:
            for chain in model:
                left_over = all_letters - used_letters
                if not left_over:
                    # if run out of chain letters just return nothing
                    return []
                new_letter = left_over.pop()
                used_letters.add(new_letter)
                old_letter = chain.id
                chain.id = new_letter
                id_map[(new_model, new_letter)] = (model.id, old_letter)

                # add digits/punctuation once the letters are used up
                if not (all_letters - used_letters):
                    all_letters.update(
                        set(string.digits) | set(string.punctuation))
            model.id = new_model

    # save new structure to tmp dir
    io = Bio.PDB.PDBIO()
    io.set_structure(structure)
    tmp_path = os.path.join(tmp_dir, structure_id + '.pdb')

    # try/finally so the temporary file is removed even if saving, DSSP or
    # result formatting raises
    try:
        io.save(tmp_path)

        # read in tmp structure
        tmp_structure = utils.read_structure(tmp_path, structure_id,
                                             quiet=True)

        # find the solvent accessibility for residues
        dssp_results = Bio.PDB.DSSP(tmp_structure[0], tmp_path,
                                    dssp=dssp_path)

        # get CA b-factors for each amino acid residue that has a CA atom
        bfacs = [
            r['CA'].get_bfactor() for r in tmp_structure.get_residues()
            if Bio.PDB.is_aa(r) and 'CA' in r.child_dict
        ]
        mean_bfac = np.mean(bfacs)
        std_bfac = np.std(bfacs)

        # format output
        # NOTE(review): result[0] is used as the residue and result[3] as
        # the value compared to cutoff -- presumably relative accessibility;
        # confirm the tuple layout for the installed Biopython version.
        output = []
        for result in dssp_results:
            residue = result[0]

            # skip if not an amino acid
            if not Bio.PDB.is_aa(residue):
                continue

            # map the flattened (model, chain) back to the original ids
            full_id = residue.get_full_id()
            try:
                orig_model_chain = list(id_map[full_id[1:3]])
            except KeyError:
                print('{0} {1}'.format(full_id, id_map))
                raise

            # fix missing letter for homology models
            if orig_model_chain[1] == ' ':
                orig_model_chain[1] = 'A'

            # z-score of the CA b-factor, when a CA atom is present
            if 'CA' in residue.child_dict:
                norm_bfactor = (residue['CA'].get_bfactor()
                                - mean_bfac) / std_bfac
            else:
                norm_bfactor = None

            # record whether it was buried
            is_buried = 1 if result[3] <= cutoff else 0
            line = [structure_id] + orig_model_chain + [
                residue.id[1], result[3], norm_bfactor, is_buried
            ]
            output.append(line)
    finally:
        # delete tmp file
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return output