def main(ccds_path: Path) -> None:
    """Build per-chromosome gene interval trees from a CCDS table and pickle them.

    Reads the tab-separated file at ``ccds_path`` and collects, for every
    ``gene_id``, the set of ``(chromosome, interval)`` pairs found in its
    ``cds_locations`` column (rows for different isoforms of one gene are
    merged).  For each gene and chromosome the intervals are reduced to one
    span running from the smallest start to the largest end.  That span is
    inserted into the chromosome's ``IntervalTree`` with the gene ID as its
    value, and its length in kilobases is written to ``gene_length``.

    The trees, the gene lengths and the raw interval sets are saved together
    as ``trees.pickle`` in a data directory created for ``'build_tree'``.

    :param ccds_path: path of the CCDS text file; it must provide the columns
        ``#chromosome``, ``gene_id`` and ``cds_locations``.
    :raises ValueError: if an interval other than ``'-'`` does not start with
        two ``-``-separated integers.
    """
    interval_separator = re.compile(r'\s*,\s*')

    print('Reading CCDS data from', ccds_path)
    # Maps gene IDs to sets of (chromosome, interval string) tuples, e.g.
    # ('chr1', '129415220-129415360'). A set is used to consolidate isoforms
    # that are each listed separately.
    intervals_by_gene = defaultdict(set)
    # The csv module asks for files to be opened with newline='' so that it
    # can do its own line-ending handling.
    with open(ccds_path, newline='') as f:
        r = csv.DictReader(f, delimiter='\t')
        # First line starts with "#", so the field name for chromosome is
        # actually "#chromosome"
        for line in r:
            chrom_name = 'chr{}'.format(line['#chromosome'])
            intervals = interval_separator.split(
                line['cds_locations'].strip('[]'))
            gene_id = line['gene_id']
            for interval in intervals:
                intervals_by_gene[gene_id].add((chrom_name, interval))

    trees = defaultdict(IntervalTree)
    gene_length = pd.Series(0.0, index=sorted(intervals_by_gene))
    print('Read data for', len(intervals_by_gene), 'genes')

    for gene, interval_data in intervals_by_gene.items():
        # chromosome -> [smallest start, largest end] over the gene's intervals
        chrom_dict = {}
        for chrom, interval_string in interval_data:
            # Skip invalid interval lists
            if interval_string == '-':
                continue
            bounds = chrom_dict.setdefault(chrom, [float('inf'), 0])
            interval_pieces = interval_string.split('-')
            bounds[0] = min(bounds[0], int(interval_pieces[0]))
            bounds[1] = max(bounds[1], int(interval_pieces[1]))

        for chrom, (start, end) in chrom_dict.items():
            trees[chrom][start:end] = gene
            # Kilobases, so divide by 1000
            # NOTE(review): a gene seen on several chromosomes keeps only the
            # length of whichever chromosome is visited last -- confirm that
            # this is acceptable.
            gene_length.loc[gene] = (end - start) / 1000

    data_path = create_data_path('build_tree')
    pickle_file = data_path / 'trees.pickle'
    print('Saving interval trees to', pickle_file)
    data_to_save = {
        'trees': trees,
        'gene_length': gene_length,
        'intervals_by_gene': intervals_by_gene,
    }
    with open(pickle_file, 'wb') as f:
        pickle.dump(data_to_save, f, protocol=pickle.HIGHEST_PROTOCOL)
def queue_jobs(srr_list_file: Path, pool: str, subprocesses: int):
    """Split a list of SRR IDs into sublists and submit one job array for each.

    The IDs are read one per line from ``srr_list_file`` and handed to
    ``grouper`` in groups of ``SLURM_ARRAY_MAX``.  For every group this
    function writes the IDs to a sublist file in the data directory, renders
    ``script_template`` into a script file in the SLURM directory, and runs
    the command built from ``SBATCH_COMMAND_TEMPLATE`` with ``check_call``.

    :param srr_list_file: text file with one SRR ID per line.
    :param pool: passed to the script template as ``pool``.
    :param subprocesses: passed to the script template as ``subprocesses``.
    """
    sublist_dir = create_data_path(SCRIPT_LABEL)
    script_dir = create_slurm_path(SCRIPT_LABEL)

    with open(srr_list_file) as source:
        all_ids = [raw_line.strip() for raw_line in source]

    # Zero-padding width for the running number of each sublist
    width = digits(ceil(len(all_ids) / SLURM_ARRAY_MAX))

    for index, group in enumerate(grouper(all_ids, SLURM_ARRAY_MAX)):
        tag = f'{index:0{width}}'

        sublist_file = sublist_dir / SRR_LIST_FILENAME_TEMPLATE.format(
            index, width)
        print(f'{tag} Saving SRR sublist to {sublist_file}')
        # Drop falsy entries (presumably the fill values grouper pads the last
        # group with; blank input lines are removed by this as well)
        kept_ids = [entry for entry in group if entry]
        with open(sublist_file, 'w') as sink:
            sink.writelines(f'{entry}\n' for entry in kept_ids)

        last_task = len(kept_ids) - 1
        index_spec = f'0-{last_task}'

        job_script = script_dir / SCRIPT_FILENAME_TEMPLATE.format(index, width)
        print(f'{tag} Saving script to {job_script}')
        with open(job_script, 'w') as sink:
            rendered = script_template.format(
                srr_list_file=sublist_file.absolute(),
                pool=pool,
                subprocesses=subprocesses,
                stdout_path=job_script.with_suffix('.out'),
            )
            print(rendered, file=sink)

        command = []
        for part in SBATCH_COMMAND_TEMPLATE:
            command.append(
                part.format(
                    array_index_spec=index_spec,
                    script_filename=job_script,
                ))
        print(f'{tag} Running', ' '.join(command))
        check_call(command)
#!/usr/bin/env python3 import json from pathlib import Path import pickle import attr from data_path_utils import create_data_path import pandas as pd from parse_tcga_clinical_xml import Patient, find_clinical_xml_files, parse_clinical_xml data_path = create_data_path('tcga_xml_to_json') input_path = Path('~/data/tcga-clinical-all').expanduser() print(f'Parsing XML files in {input_path}') patients = [parse_clinical_xml(path) for path in find_clinical_xml_files(input_path)] print(f'Read information for {len(patients)} patients') converted_data = [ attr.asdict( patient, filter=attr.filters.exclude(attr.fields(Patient).path), ) for patient in patients ] pickle_file = data_path / 'tcga_clinical_data.pickle' print('Saving pickled data to', pickle_file) with open(pickle_file, 'wb') as f: pickle.dump(patients, f) output_file = data_path / 'tcga_clinical_data.json'
for n1, n2, weight in input_network.edges.data('weight'): new_n1 = edge_name_func([n1]) new_n2 = edge_name_func([n2]) new_node = edge_name_func(sorted([n1, n2])) if weight is None: new_weight = 1 else: new_weight = sqrt(weight) network.add_edge(new_n1, new_node, weight=new_weight) network.add_edge(new_node, new_n2, weight=new_weight) return network if __name__ == '__main__': p = ArgumentParser() p.add_argument('hippie_input_path', type=Path) args = p.parse_args() data_path = create_data_path('build_hippie_network') print('Reading HIPPIE network from', args.hippie_input_path) network = build_hippie_network(args.hippie_input_path) print('Network contains {} nodes, {} edges'.format(len(network.nodes()), len(network.edges()))) network_output_path = data_path / 'network.pickle' print('Saving network to', network_output_path) with network_output_path.open('wb') as f: pickle.dump(network, f, protocol=pickle.HIGHEST_PROTOCOL)
from data_path_utils import ( create_data_path, create_output_path, find_newest_data_path, ) import pandas as pd from gene_mappings import read_ensembl_entrez_mapping from utils import sorted_union p = ArgumentParser() p.add_argument('gdc_manifest', type=Path) args = p.parse_args() data_path = create_data_path('consolidate_mrna_expression') input_path = find_newest_data_path('query_cases_by_file') / 'raw_responses' gdc_manifest = pd.read_table(args.gdc_manifest) files_in_manifest = set(gdc_manifest.id) def get_submitter_ids(data: dict): for key, value in data.items(): if key == 'submitter_id': yield value if isinstance(value, dict): yield from get_submitter_ids(value) if isinstance(value, list): for sub_data in value:
element_wise_min, weighted_correlation, ) selected_cancer = 'brca' p = ArgumentParser() p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA) if __name__ == '__main__': args = p.parse_args() else: args = p.parse_args([]) label = f'treatment_features_alpha_{args.alpha:.2f}' data_path = create_data_path(label) output_path = create_output_path(label) network_path = find_newest_data_path('build_hippie_network') with (network_path / 'network.pickle').open('rb') as f: network = pickle.load(f) nodes = sorted(network.nodes()) node_set = set(nodes) w_prime = normalize(network) def get_prop_vec(name, genes): s = pd.Series(0.0, index=nodes) gene_set = set(genes) genes_in_network = gene_set & node_set
from gene_mappings import read_hugo_entrez_mapping
from utils import DEFAULT_ALPHA, sorted_intersection
from propagation import propagate, normalize

# Default for the -s/--subprocesses option
DEFAULT_SUBPROCESSES = 2

p = ArgumentParser()
p.add_argument('-s', '--subprocesses', type=int, default=DEFAULT_SUBPROCESSES)
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)

# When the module is imported rather than run, parse an empty argument list
# so that every option takes its default value
if __name__ == '__main__':
    args = p.parse_args()
else:
    args = p.parse_args([])

# Alpha, formatted with two decimals, is part of the data directory label
data_path = create_data_path(f'propagate_mutations_alpha_{args.alpha:.2f}')

# Network pickled in the newest 'build_hippie_network' data directory
with (find_newest_data_path('build_hippie_network') / 'network.pickle').open('rb') as f:
    network = pickle.load(f)
print('Loaded network')

w_prime = normalize(network)

node_set = set(network.nodes())
# Sorted copy of the node labels, giving a fixed node order
nodes = sorted(node_set)
node_count = len(nodes)

# Mutation data is stored under the key 'muts'; the store is opened read-only
with pd.HDFStore(find_newest_data_path('parse_tcga_mutations') / 'mutations.hdf5', 'r') as store:
    mutations = store['muts']
print('Read mutations')

expr = pd.read_pickle(find_newest_data_path('parse_cosmic_diffexpr') / 'brca_expr.pickle')
get_patient_barcode) selected_diffexpr = cosmic.loc[ cosmic.adj_sample_name.apply(is_selected), :] expr_samples = sorted(set(selected_diffexpr.adj_sample_name)) expr_genes = sorted(set(selected_diffexpr.GENE_NAME)) sample_mapping = {sample: i for (i, sample) in enumerate(expr_samples)} gene_mapping = {gene: i for (i, gene) in enumerate(expr_genes)} sample_vec = [ sample_mapping[sample] for sample in selected_diffexpr.adj_sample_name ] gene_vec = [gene_mapping[gene] for gene in selected_diffexpr.GENE_NAME] mat = scipy.sparse.coo_matrix( (selected_diffexpr.Z_SCORE, (sample_vec, gene_vec))) return pd.DataFrame(mat.todense(), index=expr_samples, columns=expr_genes) if __name__ == '__main__': p = ArgumentParser() p.add_argument('cosmic_filename', type=Path) p.add_argument('patient_id_file', type=Path) args = p.parse_args() diffexpr = parse_cosmic_diffexpr(args.cosmic_filename, args.patient_id_file) data_path = create_data_path('parse_cosmic_diffexpr') diffexpr.to_pickle(data_path / 'diffexpr.pickle')
header_pieces = next(f).strip().split('\t') sample_id = get_patient_barcode(header_pieces[1]) # Next line is "Composite Element REF...", ignore that too next(f) for line in f: if line: gene, expr_str = line.strip().split('\t') try: expr_value = float(expr_str) except ValueError: expr_value = nan expr_values.append(expr_value) pairs.append((sample_id, gene)) return pairs_and_values_to_dataframe(pairs, expr_values) if __name__ == '__main__': p = ArgumentParser() p.add_argument('tcga_mut_path', type=Path, nargs='+') args = p.parse_args() data_path = create_data_path('parse_tcga_mutations') print('Reading TCGA mutation data from', args.tcga_mut_path) muts = mafs_to_matrix(args.tcga_mut_path, get_patient_barcode) print('Mutation data shape:', muts.shape) mut_output_path = data_path / 'mutations.hdf5' print('Saving mutations to', mut_output_path) with pd.HDFStore(mut_output_path) as store: store['muts'] = muts
def main():
    """Evaluate how well propagated edge scores recover LBS-mutation edges.

    Uses the module-level ``args`` (for ``args.alpha``) and
    ``hist_bin_count``.  Inputs are taken from the newest data directories of
    earlier pipeline steps: the LBS mutation CSV, the propagated
    mutation/edge matrix, the pre-computed real and shuffled edge selections,
    and the univariate survival results.

    Steps, each saving its numbers under ``data_path`` and its figures under
    ``output_path``:

    * per-patient NDCG of the real edge selection and of every shuffled one;
    * per-patient ROC AUC and average precision, real vs. shuffled, plus
      precision-recall curves for a selection of patients;
    * per-patient Spearman correlation P-values, real vs. shuffled;
    * one ``RocData`` computed over the flattened patient-by-edge matrix;
    * comparison of survival P-values of edges with and without LBS
      mutations;
    * for several values of ``k``, Mann-Whitney tests of the survival
      ``r_square`` values of the top-ranked edges against random selections
      of the remaining edges.
    """
    script_label = 'prop_edge_lbs_overlap'
    data_path = create_data_path(script_label)
    output_path = create_output_path(script_label)

    hem = read_hugo_entrez_mapping()

    lbs_mut_path = find_newest_data_path('intersect_muts_lbs')
    lbs_muts = pd.read_csv(lbs_mut_path / 'brca_lbs_muts.csv')

    prop_edge_path = find_newest_data_path(
        f'propagate_mutations_edges_alpha_{args.alpha:.2f}')
    with pd.HDFStore(prop_edge_path / 'data_propagated.hdf5') as store:
        mut_edge_prop = store['mutations']

    patients_with_lbs_muts = set(lbs_muts.patient)
    print('Patients with LBS mutations:', len(patients_with_lbs_muts))

    # Only the 'Skipping gene' messages of this loop are visible to the user;
    # the mapping itself is not read again below.
    lbs_muts_by_patient = defaultdict(set)
    for i, row in lbs_muts.iterrows():
        if row.gene not in hem:
            print('Skipping gene', row.gene)
            continue
        lbs_muts_by_patient[row.patient].add(hem[row.gene])

    # Columns whose label contains '_' are treated as edges
    all_edge_set = {i for i in mut_edge_prop.columns if '_' in i}
    all_edges = sorted(all_edge_set)
    edge_prop = mut_edge_prop.loc[:, all_edges]

    shuffle_count = 100

    # All per-patient loops below run over this sorted list so that progress
    # output and processing order are the same on every run.
    sorted_patients = sorted(patients_with_lbs_muts)
    patient_count = len(sorted_patients)

    ndcg = pd.Series(0.0, index=sorted_patients)
    shuffled_ndcg = pd.DataFrame(0.0,
                                 index=sorted_patients,
                                 columns=range(shuffle_count))
    # Stored with the NDCG data below; never filled in, so it stays all zero
    lbs_edges_by_patient = pd.Series(0, index=sorted_patients)

    print('Loading shuffled data')
    prop_lbs_shuffle_path = find_newest_data_path('prop_edge_lbs_shuffle')
    with open(prop_lbs_shuffle_path / 'shuffled_muts_edges_by_patient.pickle',
              'rb') as f:
        d = pickle.load(f)
    shuffled_by_patient = d['shuffled_by_patient']
    selected_edges_by_patient = d['selected_edges_by_patient']
    shuffled_edges_by_patient = d['shuffled_edges_by_patient']

    ## NDCG analysis
    # For each patient, rank edges by propagated mutation scores, assign label of 1 if
    # either node connected to that edge has a LBS mutation
    for i, patient in enumerate(sorted_patients, 1):
        print(f'Computing NDCG for patient {i}/{patient_count}')
        edge_scores = mut_edge_prop.loc[patient, all_edges].copy().sort_values(
            ascending=False)
        selected_edges = selected_edges_by_patient[patient]
        shuffled_edge_list = shuffled_edges_by_patient[patient]

        relevance = np.array([e in selected_edges
                              for e in edge_scores.index]).astype(float)
        # The last element is the value over the complete ranking
        ndcg.loc[patient] = normalized_discounted_cumulative_gain(
            relevance)[-1]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_relevance = np.array(
                [e in shuffled_edges
                 for e in edge_scores.index]).astype(float)
            shuffled_ndcg.loc[patient,
                              j] = normalized_discounted_cumulative_gain(
                                  shuffled_relevance)[-1]

    with pd.HDFStore(data_path / 'ndcg_data.hdf5') as store:
        store['ndcg'] = ndcg
        store['shuffled_ndcg'] = shuffled_ndcg
        store['lbs_edges_by_patient'] = lbs_edges_by_patient

    shuffled_ndcg_flat = shuffled_ndcg.unstack()

    with new_plot():
        ndcg.plot.hist(bins=hist_bin_count)
        plt.title('NDCG histogram')
        plt.xlabel(
            'Patient NDCG score: selection of LBS edges by propagated edge score'
        )

        figure_path = output_path / 'ndcg_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        shuffled_ndcg_flat.plot.hist(bins=hist_bin_count)
        plt.title('NDCG histogram')
        plt.xlabel(
            'Patient NDCG score: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_ndcg_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    ndcg_ks = scipy.stats.ks_2samp(ndcg, shuffled_ndcg_flat)
    ndcg_ks_pvalue_str = to_matplotlib_sci_notation(ndcg_ks[1])

    with new_plot():
        ndcg.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real NDCG',
            density=True,
        )
        shuffled_ndcg_flat.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Shuffled NDCG, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Patient NDCG score: selection of LBS edges by propagated edge score'
        )
        plt.legend()
        plt.figtext(
            0.89,
            0.7,
            f'Kolmogorov-Smirnov $P = {ndcg_ks_pvalue_str}$',
            horizontalalignment='right',
        )

        figure_path = output_path / 'ndcg_both_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')
    ## /NDCG analysis

    ## PR and ROC AUC analysis
    roc_auc = pd.Series(0.0, index=sorted_patients)
    average_pr_scores = pd.Series(0.0, index=sorted_patients)
    shuffled_roc_auc = pd.DataFrame(0.0,
                                    index=sorted_patients,
                                    columns=range(shuffle_count))
    shuffled_average_pr_scores = pd.DataFrame(0.0,
                                              index=sorted_patients,
                                              columns=range(shuffle_count))

    # Maps patient IDs to performance objects
    roc_data_objects = {}
    pr_data_objects = {}

    for i, patient in enumerate(sorted_patients, 1):
        print(
            f'Computing classifier performance for patient {i}/{patient_count}'
        )
        selected_edges: Set[str] = selected_edges_by_patient[patient]
        edge_scores = mut_edge_prop.loc[patient, all_edges].copy()
        labels = np.array([e in selected_edges
                           for e in edge_scores.index]).astype(float)

        rd = RocData.calculate(labels, edge_scores)
        roc_data_objects[patient] = rd
        roc_auc.loc[patient] = rd.auc

        pr = PrData.calculate(labels, edge_scores)
        pr_data_objects[patient] = pr
        average_pr_scores.loc[patient] = average_precision_score(
            labels, edge_scores)

        shuffled_edge_list: List[Set[str]] = shuffled_edges_by_patient[patient]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_labels = np.array(
                [e in shuffled_edges
                 for e in edge_scores.index]).astype(float)

            shuffled_rd = RocData.calculate(shuffled_labels, edge_scores)
            shuffled_roc_auc.loc[patient, j] = shuffled_rd.auc
            shuffled_average_pr_scores.loc[patient,
                                           j] = average_precision_score(
                                               shuffled_labels,
                                               edge_scores,
                                           )

    with pd.HDFStore(data_path / 'classifier_data.hdf5') as store:
        store['roc_auc'] = roc_auc
        store['average_pr'] = average_pr_scores
        store['shuffled_roc_auc'] = shuffled_roc_auc
        store['shuffled_average_pr'] = shuffled_average_pr_scores

    with new_plot():
        roc_auc.plot.hist(bins=hist_bin_count)
        plt.title('ROC AUC histogram')
        plt.xlabel(
            'Patient ROC AUC: selection of LBS edges by propagated edge score')

        figure_path = output_path / 'roc_auc_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    shuffled_roc_auc_flat = shuffled_roc_auc.unstack()

    with new_plot():
        shuffled_roc_auc_flat.plot.hist(bins=hist_bin_count)
        plt.title('ROC AUC histogram')
        plt.xlabel(
            'Patient ROC AUC: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_roc_auc_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    roc_auc_ks = scipy.stats.ks_2samp(roc_auc, shuffled_roc_auc_flat)
    roc_auc_ks_pvalue_str = to_matplotlib_sci_notation(roc_auc_ks[1])

    with new_plot():
        roc_auc.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real ROC AUC',
            density=True,
        )
        shuffled_roc_auc_flat.plot.hist(
            bins=50,
            alpha=0.8,
            label='Shuffled ROC AUC, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Patient ROC AUC: selection of LBS edges by propagated edge score')
        plt.legend()
        plt.figtext(
            0.14,
            0.7,
            f'Kolmogorov-Smirnov $P = {roc_auc_ks_pvalue_str}$',
            horizontalalignment='left',
        )

        figure_path = output_path / 'roc_auc_both_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        average_pr_scores.plot.hist(bins=hist_bin_count)
        plt.title('Average precision histogram')
        plt.xlabel(
            'Average precision: selection of LBS edges by propagated edge score'
        )

        figure_path = output_path / 'avg_prec_hist.pdf'
        print('Saving AP histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    # Unlike NDCG and ROC AUC, the shuffled AP histogram shows one median per
    # patient rather than every shuffled value
    shuffled_average_pr_median = shuffled_average_pr_scores.median(axis=1)

    with new_plot():
        shuffled_average_pr_median.plot.hist(bins=hist_bin_count)
        plt.title('Average precision histogram')
        plt.xlabel(
            'Average precision: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_avg_prec_hist.pdf'
        print('Saving AP histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    top_n = 4
    rest_uniform = 6

    sorted_pr_scores = average_pr_scores.dropna().sort_values()
    usable_patient_count = sorted_pr_scores.shape[0]

    # The top_n best patients, plus rest_uniform patients spread uniformly
    # over the rest of the ranking
    patient_indexes = list(
        np.linspace(
            0,
            usable_patient_count - 1 - top_n,
            num=rest_uniform,
        ).astype(int))
    patient_indexes.extend(
        range(usable_patient_count - top_n, usable_patient_count))
    # Reversed so that the best-scoring patient comes first
    selected_patients = sorted_pr_scores.index[list(reversed(patient_indexes))]

    with new_plot():
        plt.figure(figsize=(10, 10))
        for patient in selected_patients:
            prd = pr_data_objects[patient]
            plt.plot(prd.rec, prd.prec, label=patient)

        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])

        plt.xlabel('Recall')
        plt.ylabel('Precision')
        # gca() returns the axes the curves were drawn on; a bare plt.axes()
        # call may add a new, empty axes instead
        plt.gca().set_aspect('equal', 'datalim')

        plt.legend()
        plt.title(
            f'Precision-recall: top {top_n} patients, uniform spacing of bottom {rest_uniform}'
        )

        figure_path = output_path / 'pr_selected.pdf'
        print('Saving selected PR curves to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')
    ## /PR and ROC AUC analysis

    ## Spearman correlation P-value analysis
    spearman_pvalues = pd.Series(0.0, index=sorted_patients)
    shuffled_spearman_pvalues = pd.DataFrame(0.0,
                                             index=sorted_patients,
                                             columns=range(shuffle_count))

    for i, patient in enumerate(sorted_patients, 1):
        print(
            f'Computing Spearman correlation P-value for patient {i}/{patient_count}'
        )
        selected_edges: Set[str] = selected_edges_by_patient[patient]
        edge_scores = mut_edge_prop.loc[patient, all_edges].copy()
        labels = np.array([e in selected_edges
                           for e in edge_scores.index]).astype(float)

        spearman_result = scipy.stats.spearmanr(edge_scores, labels)
        spearman_pvalue = spearman_result[1]
        spearman_pvalues.loc[patient] = spearman_pvalue

        shuffled_edge_list: List[Set[str]] = shuffled_edges_by_patient[patient]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_labels = np.array(
                [e in shuffled_edges
                 for e in edge_scores.index]).astype(float)

            shuffled_spearman_result = scipy.stats.spearmanr(
                edge_scores, shuffled_labels)
            shuffled_spearman_pvalue = shuffled_spearman_result[1]
            shuffled_spearman_pvalues.loc[patient,
                                          j] = shuffled_spearman_pvalue

    # NOTE(review): the P-values computed above are replaced here by the
    # contents of a hard-coded directory from an earlier run, and those are
    # what get saved and plotted below. This looks like leftover caching;
    # confirm whether the loop or this reload should go.
    sp_dir = Path('data/prop_edge_lbs_overlap_20180606-105746')
    with pd.HDFStore(sp_dir / 'spearman_pvalues.hdf5') as store:
        spearman_pvalues = store['spearman_pvalues']
        shuffled_spearman_pvalues = store['shuffled_spearman_pvalues']

    with pd.HDFStore(data_path / 'spearman_pvalues.hdf5') as store:
        store['spearman_pvalues'] = spearman_pvalues
        store['shuffled_spearman_pvalues'] = shuffled_spearman_pvalues

    # -log10 of the P-values, without the entries that are NaN or infinite
    nl10_spearman_pvalues_all = -np.log10(spearman_pvalues)
    nl10_spearman_pvalues = nl10_spearman_pvalues_all.loc[
        ~(nl10_spearman_pvalues_all.isnull())
        & ~(np.isinf(nl10_spearman_pvalues_all))]

    with new_plot():
        nl10_spearman_pvalues.plot.hist(bins=50)
        plt.title('Spearman $P$-value histogram')
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): LBS edges vs. prop. edge score'
        )

        figure_path = output_path / 'spearman_pvalue_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    shuffled_spearman_pvalues_flat = shuffled_spearman_pvalues.unstack()
    nl10_shuffled_spearman_pvalues_flat_all = -np.log10(
        shuffled_spearman_pvalues_flat)
    nl10_shuffled_spearman_pvalues_flat = nl10_shuffled_spearman_pvalues_flat_all.loc[
        ~(nl10_shuffled_spearman_pvalues_flat_all.isnull())
        & ~(np.isinf(nl10_shuffled_spearman_pvalues_flat_all))]

    with new_plot():
        nl10_shuffled_spearman_pvalues_flat.plot.hist(bins=50)
        plt.title('Spearman $P$-value histogram')
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): shuffled LBS edges vs. prop. edge score'
        )

        figure_path = output_path / 'shuffled_spearman_pvalue_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    # The KS test runs on the untransformed P-values
    spearman_ks = scipy.stats.ks_2samp(spearman_pvalues,
                                       shuffled_spearman_pvalues_flat)
    spearman_ks_pvalue_str = to_matplotlib_sci_notation(spearman_ks[1])

    with new_plot():
        nl10_spearman_pvalues.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real Spearman $P$-values',
            density=True,
        )
        nl10_shuffled_spearman_pvalues_flat.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Shuffled Spearman $P$-values, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): LBS edges vs. prop. edge score'
        )
        plt.legend()
        plt.figtext(
            0.89,
            0.7,
            f'Kolmogorov-Smirnov $P = {spearman_ks_pvalue_str}$',
            horizontalalignment='right',
        )

        figure_path = output_path / 'spearman_pvalues_both_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')
    ## /Spearman correlation P-value analysis

    ## Overall ROC AUC
    print('Creating binary LBS edge matrix')
    lbs_edge_matrix = pd.DataFrame(0,
                                   index=edge_prop.index,
                                   columns=edge_prop.columns)
    for patient, edges in selected_edges_by_patient.items():
        lbs_edge_matrix.loc[patient, list(edges)] = 1

    lbs_matrix_path = data_path / 'lbs_edge_matrix.hdf5'
    print('Saving LBS edge matrix to', lbs_matrix_path)
    with pd.HDFStore(lbs_matrix_path) as store:
        store['lbs_edge_matrix'] = lbs_edge_matrix

    # Flatten scores and labels to one entry per matrix cell, with the labels
    # reordered to follow the descending score order
    sorted_flattened_edge_scores = edge_prop.unstack().sort_values(
        ascending=False)
    flattened_lbs_edges = lbs_edge_matrix.unstack()
    ordered_flattened_lbs_edges = flattened_lbs_edges.loc[
        sorted_flattened_edge_scores.index]

    flattened_rd = RocData.calculate(ordered_flattened_lbs_edges,
                                     sorted_flattened_edge_scores)
    flattened_rd_path = data_path / 'flattened_rd.pickle'
    print('Saving flattened vector RocData to', flattened_rd_path)
    with open(flattened_rd_path, 'wb') as f:
        pickle.dump(flattened_rd, f)
    ## /Overall ROC AUC

    ## Survival analysis
    edge_prop_survival_dir = find_newest_data_path('edge_prop_survival')
    survival_data = pd.read_csv(edge_prop_survival_dir /
                                'univariate_surv_results.csv',
                                index_col=0)

    # Indexed by gene/edge, across all patients
    surv_edge_sel = [('_' in i) for i in survival_data.index]
    edge_survival_data = survival_data.loc[surv_edge_sel, :]

    lbs_mut_edge_matrix = pd.DataFrame(
        0.0,
        index=sorted(selected_edges_by_patient),
        columns=all_edges,
    )
    for patient, edges in selected_edges_by_patient.items():
        lbs_mut_edge_matrix.loc[patient, list(edges)] = 1

    # Binary vector: is this edge incident on a LBS mut in at least one patient?
    edges_with_lbs_muts = lbs_mut_edge_matrix.sum(axis=0).astype(bool)

    surv_pvalues_with_lbs = edge_survival_data.loc[edges_with_lbs_muts,
                                                   'pvalue']
    surv_pvalues_with_lbs.name = 'With LBS'
    surv_pvalues_without_lbs = edge_survival_data.loc[~edges_with_lbs_muts,
                                                      'pvalue']
    surv_pvalues_without_lbs.name = 'Without LBS'

    # Computed but not reported anywhere below
    ks_res = scipy.stats.ks_2samp(surv_pvalues_with_lbs,
                                  surv_pvalues_without_lbs)

    with new_plot():
        plot_cdf(surv_pvalues_with_lbs)
        plot_cdf(surv_pvalues_without_lbs)

        plt.legend()
        plt.ylabel('CDF')
        plt.xlabel('Univariate Cox Regression $P$-value')

        figure_path = output_path / 'surv_pvalue_cdfs.pdf'
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        plt.figure()
        surv_pvalues_with_lbs.plot.hist(bins=50, ax=plt.gca(), alpha=0.5)
        surv_pvalues_without_lbs.plot.hist(bins=50, ax=plt.gca(), alpha=0.5)

        # A bare string as first argument is taken as the list of labels;
        # the legend position has to be passed as loc=
        plt.legend(loc='upper left')
        plt.xlabel('Univariate Cox Regression $P$-value')

        figure_path = output_path / 'surv_pvalue_hist.pdf'
        plt.savefig(figure_path, bbox_inches='tight')
    ## /Survival analysis

    ## Permuted survival analysis
    # Despite the name, these are the 'r_square' values of the survival results
    pvalues = edge_survival_data.loc[:, 'r_square']

    # Values of k: four fractions of the number of rows of edge_prop, plus 15
    # log-spaced values between 10 and 1000
    ks_manual = (np.array([0.1, 0.2, 0.25, 0.3]) *
                 edge_prop.shape[0]).astype(int)
    ks_auto = np.logspace(1, 3, num=15).astype(int)
    ks = sorted(chain(ks_manual, ks_auto))

    edge_count = 1000

    # LaTeX frame that includes the figure saved for one value of k
    template = dedent('''
    \\begin{{frame}}[plain]
    \\begin{{center}}
    \\includegraphics[width=0.7\\textwidth]{{survival_rsquare_hist_k_{k}}}
    \\end{{center}}
    \\end{{frame}}
    ''')

    with open(data_path / 'figure_include.tex', 'w') as f:
        for k in ks:
            print(template.format(k=k), file=f)

    for k in ks:
        print('Computing edge ranking results for k =', k)
        edge_ranking = get_rank_k_edge_values(edge_prop, k)
        sorted_edge_scores = edge_ranking.sort_values(ascending=False)

        top_edges = sorted_edge_scores.iloc[:edge_count]
        top_edge_pvalues = pvalues.loc[top_edges.index]

        bottom_edges = sorted_edge_scores.iloc[edge_count:]

        permutation_count = 1000
        permutation_pvalues = pd.Series(0.0, index=range(permutation_count))

        for i in range(permutation_count):
            # 100 edges drawn (with replacement) from outside the top group
            edge_selection = np.random.choice(bottom_edges.index, size=100)
            selected_pvalues = pvalues.loc[edge_selection]

            comparison_result = scipy.stats.mannwhitneyu(
                top_edge_pvalues,
                selected_pvalues,
                alternative='greater',
            )
            permutation_pvalues.iloc[i] = comparison_result.pvalue

        nl10_permutation_pvalues = -np.log10(permutation_pvalues)

        with new_plot():
            plt.figure(figsize=(5, 5))
            nl10_permutation_pvalues.plot.hist(bins=50)
            title = (f'Survival $R^2$: top {edge_count} edges ($k = {k}$) vs. '
                     f'{permutation_count} random selections')
            plt.title(title)
            plt.xlabel('$- \\log_{10}$($P$-value) from Mann-Whitney $U$ test')

            # Reference lines at P = 0.05 (red) and P = 0.001 (black)
            nl10_0_05 = -np.log10(0.05)
            plt.axvline(x=nl10_0_05, color='#FF0000FF')
            nl10_0_001 = -np.log10(0.001)
            plt.axvline(x=nl10_0_001, color='#000000FF')

            figure_path = output_path / f'survival_rsquare_hist_k_{k}.pdf'
            print('Saving survival R^2 histogram to', figure_path)
            plt.savefig(figure_path, bbox_inches='tight')
#!/usr/bin/env python3
# Collects the targets listed for ESR1/ESR2 in the TRRUST raw data file and
# writes them, sorted and one per line, to er_targets.txt in a data
# directory created for 'parse_er_targets'.
import csv
from pathlib import Path

from data_path_utils import create_data_path

data_path = create_data_path('parse_er_targets')

trrust_path = Path('~/data/trrust_rawdata.txt').expanduser()

# Transcription factor names whose targets are collected
er_names = {'ESR1', 'ESR2'}

er_targets = set()

# The csv module asks for files to be opened with newline='' so that it can
# do its own line-ending handling.
with trrust_path.open(newline='') as f:
    r = csv.reader(f, delimiter='\t')
    for line in r:
        # Blank lines are returned as empty rows; skip anything that lacks
        # the transcription factor or the target column instead of failing
        # with an IndexError.
        if len(line) < 2:
            continue
        tf_name, target_name = line[0], line[1]
        if tf_name in er_names:
            er_targets.add(target_name)

target_path = data_path / 'er_targets.txt'
print('Saving {} ER targets to {}'.format(len(er_targets), target_path))
with target_path.open('w') as f:
    for target in sorted(er_targets):
        print(target, file=f)
import json from pathlib import Path import pickle from data_path_utils import ( DATA_PATH, create_data_path, find_newest_data_path, ) import numpy as np import pandas as pd from propagation import normalize, propagate from utils import weighted_correlation data_path = create_data_path('drug_targets') with Path('~/data/drugs_targets.json').expanduser().open() as f: raw_data = json.load(f) drug_target_data_all = pd.DataFrame(raw_data).T # Select those with protein targets drug_target_data = drug_target_data_all.loc[drug_target_data_all.gene_symbols.notnull(), :] dtd_path = data_path / 'drug_targets.pickle' print('Saving drug target data matrix to', dtd_path) drug_target_data.to_pickle(dtd_path) synonyms = defaultdict(list) for row_name, synonym_csv in drug_target_data.synonyms.iteritems(): for synonym in synonym_csv.split(','):
gene_metadata = pd.read_table(lbs_dir / 'mutLBSgene_basic.txt')
lbs_muts = pd.read_table(lbs_dir / 'mutLBSgene_tcga_cosmic_overlapped_mutations.txt')

lbs_genes = sorted(set(lbs_muts.gene))
missing_from_tcga = set(lbs_genes) - set(genes)
print('Genes in LBS data but not TCGA mutation data:', len(missing_from_tcga))

# (gene, amino acid substitution) pairs present in the LBS table
lbs_mut_set = set(zip(lbs_muts.gene, lbs_muts.nsSNV))

# set of tuples of (patient, gene, AA sub)
brca_muts_in_lbs = set()

for i, row in muts.iterrows():
    # A float here means the field is null, so there is nothing to look up
    if isinstance(row.HGVSp_Short, float):
        continue
    aa_sub = strip_prefix(row.HGVSp_Short, 'p.')
    patient = get_patient_barcode(row.Tumor_Sample_Barcode)
    gene = row.Hugo_Symbol
    if (gene, aa_sub) in lbs_mut_set:
        item = (patient, gene, aa_sub)
        brca_muts_in_lbs.add(item)
        print('Found LBS mut:', item)

data_path = create_data_path('intersect_muts_lbs')

# Passing the column names to the constructor also works when no mutation
# matched (assigning three names to an empty frame afterwards raises), and
# sorting gives the CSV the same row order on every run.
# Assumes patient barcodes are strings, so that the tuples can be ordered.
lbs_mut_df = pd.DataFrame(
    sorted(brca_muts_in_lbs),
    columns=['patient', 'gene', 'aa_sub'],
)
tcga_mut_path = data_path / 'brca_lbs_muts.csv'
print('Saving BRCA mutations in LBS DB to', tcga_mut_path)
lbs_mut_df.to_csv(tcga_mut_path, index=None)
#!/usr/bin/env python3 from data_path_utils import ( DATA_PATH, create_data_path, find_newest_data_path, ) import pandas as pd from scipy.stats import pearsonr from utils import consolidate_data_frames, sorted_intersection data_path = create_data_path('tcga_lincs_expr_features') drugs = [ 'arimidex', 'taxol', ] tcga_expr_path = find_newest_data_path( 'parse_cosmic_diffexpr') / 'brca_expr.pickle' print('Reading expression data from', tcga_expr_path) tcga_expr = pd.read_pickle(tcga_expr_path) lincs_expr = pd.read_csv( find_newest_data_path('gct_drug_subset') / 'subset.csv', header=None, index_col=0, ) lincs_expr.columns = drugs lincs_genes = set(lincs_expr.index)
scale_continuous_df_cols, sorted_intersection, ) p = ArgumentParser() p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA) p.add_argument('--plot-pca-components', action='store_true') if '__file__' in globals(): args = p.parse_args() else: args = p.parse_args([]) entrez_hugo_mapping = read_entrez_hugo_mapping() output_label = f'compute_drug_features_labels_alpha_{args.alpha:.2f}' data_path = create_data_path(output_label) drug_response_dir = find_newest_data_path('drug_response_labels') tx_info_raw = pd.read_pickle(drug_response_dir / 'tx_info.pickle') network_path = find_newest_data_path('build_hippie_network') / 'network.pickle' print('Loading network from', network_path) with network_path.open('rb') as f: network = pickle.load(f) self_edge_count = 0 # HACK: remove self edges for node in network.nodes: if network.has_edge(node, node): network.remove_edge(node, node) self_edge_count += 1
#!/usr/bin/env python3 from data_path_utils import ( create_data_path, find_newest_data_path, ) import pandas as pd from gene_mappings import read_entrez_hugo_mapping, read_hugo_entrez_mapping data_path = create_data_path('dump_muts_for_nbs') hugo_entrez_mapping = read_hugo_entrez_mapping() entrez_hugo_mapping = read_entrez_hugo_mapping() mut_path = find_newest_data_path('parse_tcga_mutations') muts_all = pd.read_pickle(mut_path / 'mutations.pickle') gene_sel = pd.Series( [ (gene in hugo_entrez_mapping and hugo_entrez_mapping[gene]) for gene in muts_all.columns ], index=muts_all.columns, ).astype(bool) muts = muts_all.loc[:, gene_sel] muts.columns = [hugo_entrez_mapping[gene] for gene in muts.columns] muts = muts.groupby(axis=1, level=-1).any().astype(int) gene_symbols = [entrez_hugo_mapping[gene] for gene in muts.columns] with open(data_path / 'gene_symbols.txt', 'w') as f: print('Gene', file=f)
def main():
    """Shuffle each patient's LBS mutations and collect the incident edges.

    Reads 'brca_lbs_muts.csv' from the newest 'intersect_muts_lbs' data path
    and the 'mutations' table from the propagated-edge HDF5 store selected by
    the module-level ``args.alpha``. Columns of that table whose label
    contains '_' are treated as edges ('<gene1>_<gene2>'); all other columns
    are treated as genes.

    For every patient with LBS mutations, draws 100 random gene sets of the
    same size as the patient's mapped LBS gene set, taken from genes the
    patient does not have in that set. It then records which edges touch a
    real LBS gene and which touch each shuffled gene set.

    Outputs: a histogram of LBS-incident edge counts per patient
    ('lbs_edge_count.pdf' in the output path) and a pickle of the shuffled
    genes, selected edges and shuffled edges per patient
    ('shuffled_muts_edges_by_patient.pickle' in the data path).
    """
    script_label = 'prop_edge_lbs_shuffle'
    data_path = create_data_path(script_label)
    output_path = create_output_path(script_label)

    hem = read_hugo_entrez_mapping()

    lbs_mut_path = find_newest_data_path('intersect_muts_lbs')
    lbs_muts = pd.read_csv(lbs_mut_path / 'brca_lbs_muts.csv')

    prop_edge_path = find_newest_data_path(f'propagate_mutations_edges_alpha_{args.alpha:.2f}')
    with pd.HDFStore(prop_edge_path / 'data_propagated.hdf5') as store:
        mut_edge_prop = store['mutations']

    patients_with_lbs_muts = set(lbs_muts.patient)
    print('Patients with LBS mutations:', len(patients_with_lbs_muts))

    # Mapped gene IDs per patient; genes missing from the mapping are skipped.
    lbs_muts_by_patient = defaultdict(set)
    for patient, gene in zip(lbs_muts.patient, lbs_muts.gene):
        if gene not in hem:
            print('Skipping gene', gene)
            continue
        lbs_muts_by_patient[patient].add(hem[gene])

    all_edge_set = {col for col in mut_edge_prop.columns if '_' in col}
    all_edges = sorted(all_edge_set)
    all_gene_set = set(mut_edge_prop.columns) - all_edge_set

    shuffle_count = 100

    # Iterate patients in sorted order so a seeded run is reproducible.
    sorted_patients = sorted(patients_with_lbs_muts)
    patient_count = len(sorted_patients)

    shuffled_by_patient = {}
    for i, patient in enumerate(sorted_patients, 1):
        print(f'Shuffling LBS mutations for patient {patient} ({i}/{patient_count})')
        muts = lbs_muts_by_patient[patient]
        mut_count = len(muts)
        # random.sample() requires a sequence: passing a set raises TypeError
        # on Python 3.11+. The pool is the same for every shuffle of this
        # patient, so it is built (and ordered) once.
        other_genes = sorted(all_gene_set - muts)
        shuffled_by_patient[patient] = [
            sample(other_genes, mut_count) for _ in range(shuffle_count)
        ]

    # Assign label of 1 for an edge if either node has a LBS mutation
    selected_edges_by_patient: Dict[str, Set[str]] = {}
    shuffled_edges_by_patient: Dict[str, List[Set[str]]] = {}

    # TODO: parallelize this; it's too slow
    for i, patient in enumerate(sorted_patients, 1):
        print(f'Computing selected/shuffled edges for patient {i}/{patient_count}')
        lbs_genes = lbs_muts_by_patient[patient]
        # Set copies give O(1) membership tests in the inner loop; the lists
        # in shuffled_by_patient are what gets saved and are left untouched.
        shuffled_gene_sets = [set(genes) for genes in shuffled_by_patient[patient]]

        selected_edges: Set[str] = set()
        shuffled_edges: List[Set[str]] = [set() for _ in range(shuffle_count)]

        edge_scores = mut_edge_prop.loc[patient, all_edges].copy().sort_values(ascending=False)
        for g1_g2 in edge_scores.index:
            g1, g2 = g1_g2.split('_')
            if g1 in lbs_genes or g2 in lbs_genes:
                selected_edges.add(g1_g2)
            # An edge belongs to shuffle j when either endpoint was drawn in
            # shuffle j, independently of the real LBS genes.
            for edges, shuffled_genes in zip(shuffled_edges, shuffled_gene_sets):
                if g1 in shuffled_genes or g2 in shuffled_genes:
                    edges.add(g1_g2)

        selected_edges_by_patient[patient] = selected_edges
        shuffled_edges_by_patient[patient] = shuffled_edges

    selected_edge_count = pd.Series(
        {patient: len(edges) for patient, edges in selected_edges_by_patient.items()}
    ).sort_index()

    with new_plot():
        selected_edge_count.plot.hist(bins=25)
        plt.xlabel('Number of LBS-incident edges')
        plt.ylabel('Patients')

        figure_path = output_path / 'lbs_edge_count.pdf'
        print('Saving LBS edge count histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    shuffled_data_path = data_path / 'shuffled_muts_edges_by_patient.pickle'
    print('Saving shuffled muts by patient to', shuffled_data_path)
    with open(shuffled_data_path, 'wb') as f:
        pickle.dump(
            {
                'shuffled_by_patient': shuffled_by_patient,
                'selected_edges_by_patient': selected_edges_by_patient,
                'shuffled_edges_by_patient': shuffled_edges_by_patient,
            },
            f,
        )
# NOTE(review): the next two statements are the tail of a function whose header
# lies outside this chunk (presumably get_genes, called below -- confirm); they
# are indented here on the assumption that they sit directly in its body.
    # Keep the part of each tf_name before the first '::', de-duplicated.
    tfs = set(tfn.split('::')[0] for tfn in hits.tf_name)
    return tfs

genes = get_genes()

# Query mygene for every gene, requesting the Ensembl gene, Entrez gene and
# symbol fields, and matching the query terms against those same three scopes.
mg = mygene.MyGeneInfo()
q = mg.querymany(
    genes,
    species='human',
    scopes=['ensemblgene', 'entrezgene', 'symbol'],
    fields=['ensembl.gene', 'entrezgene', 'symbol'],
)

data_path = create_data_path('query_mygene')

# Save the unprocessed query results as JSON.
raw_results_path = data_path / 'raw_results.json'
print('Saving raw query results to', raw_results_path)
with open(raw_results_path, 'w') as f:
    json.dump(q, f)

# Map each query term to its Entrez gene ID (as a string); results without an
# 'entrezgene' field are left out. A later result for the same query term
# overwrites an earlier one.
mapping = {}
for result in q:
    if 'entrezgene' in result:
        mapping[result['query']] = str(result['entrezgene'])

mapping_path = data_path / 'mapping.json'
print('Saving mapping to', mapping_path)
with open(mapping_path, 'w') as f:
    json.dump(mapping, f)
PrData, RocData, plot_pr, plot_roc, sorted_intersection, )
# NOTE(review): the line above closes an import statement that begins outside
# this chunk; it is reproduced unchanged.

# Command-line options: a float --alpha (default DEFAULT_ALPHA).
p = ArgumentParser()
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)

# Parse the real command line only when __file__ is defined; otherwise parse an
# empty argument list so --alpha takes its default (presumably to support
# running this code in an interactive session -- confirm).
if '__file__' in globals():
    args = p.parse_args()
else:
    args = p.parse_args([])

script_label = 'ki67_analysis'
data_path = create_data_path(script_label)
output_path = create_output_path(script_label)

# Expression pickle found under the newest 'parse_cosmic_diffexpr' data path.
expr_path = find_newest_data_path('parse_cosmic_diffexpr')
expr = pd.read_pickle(expr_path / 'brca_expr.pickle')

# Column of `expr` selected inside ki67_analysis.
gene = 'MKI67'


# NOTE(review): this function continues past the end of the visible chunk; only
# its first statements are shown, so its return value is not documented here.
def ki67_analysis(drug: str):
    # Feature/label directory for the alpha given on the command line.
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    # Restrict to samples present in both the labels and the expression data,
    # then pull the expression of `gene` for those samples.
    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, gene]
#!/usr/bin/env python3
from pathlib import Path

from data_path_utils import create_data_path
import pandas as pd

# Tab-separated expression table, indexed by its 'gene_id' column.
cell_line_expr_path = Path('~/data/brca-cell-lines/pmid26771497/breast_rnaseq_qn.txt').expanduser()
expr_data_raw = pd.read_table(cell_line_expr_path, index_col='gene_id')

# Index is Entrez ID, first column is HUGO symbol, second is ensembl ID.
# Drop those two annotation columns, upper-case the remaining column labels,
# and transpose so the former columns become the rows.
expr_data = (
    expr_data_raw
    .iloc[:, 2:]
    .rename(columns=str.upper)
    .T
)

data_path = create_data_path('parse_pmid26771497_expr')
expr_path = data_path / 'expr.pickle'
print('Saving expression data to', expr_path)
expr_data.to_pickle(expr_path)