def pmid_26033813_analysis(drug: str):
    """
    Fit the tree classifier for one drug and write its ROC / precision-recall output.

    The tree returned by ``build_tree()`` is fitted on the samples that appear
    in both the drug's label vector and the module-level ``expr`` matrix, and
    predictions are then made for those same samples.

    :param drug: Drug identifier; selects ``labels_{drug}.pickle`` and names
        the output files
    :return: None; ROC data is pickled under ``data_path`` and both plots are
        written under ``output_path``
    """
    decision_tree = build_tree()

    label_dir = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    all_labels = pd.read_pickle(label_dir / f'labels_{drug}.pickle')

    # Restrict everything to samples that have both a label and expression data.
    samples = sorted_intersection(all_labels.index, expr.index)
    labels = all_labels.loc[samples]
    sample_expr = expr.loc[samples, :]

    fit_tree(sample_expr, labels, decision_tree)

    # One prediction per sample, in the same order as ``samples``.
    per_sample_predictions = [
        predict_sample(name, sample_expr, decision_tree)
        for name in samples
    ]
    predictions = pd.Series(per_sample_predictions, index=samples)

    roc = RocData.calculate(labels, predictions)
    roc.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(
        roc,
        f'PMID26033813 ROC: {drug.title()}',
        output_path / f'{drug}_roc.pdf',
    )

    precision_recall = PrData.calculate(labels, predictions)
    plot_pr(
        precision_recall,
        f'PMID26033813 Precision-Recall: {drug.title()}',
        output_path / f'{drug}_pr.pdf',
    )
def ki67_analysis(drug: str):
    """
    Score one drug's labels against a single expression column and plot the result.

    The column of the module-level ``expr`` matrix named by the module-level
    ``gene`` is used directly as the prediction score.

    :param drug: Drug identifier; selects ``labels_{drug}.pickle`` and names
        the output files
    :return: None; ROC data is pickled under ``data_path`` and both plots are
        written under ``output_path``
    """
    label_dir = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    all_labels = pd.read_pickle(label_dir / f'labels_{drug}.pickle')

    # Only samples present in both the labels and the expression matrix are scored.
    samples = sorted_intersection(all_labels.index, expr.index)
    scores = expr.loc[samples, gene]
    labels = all_labels.loc[samples]

    roc = RocData.calculate(labels, scores)
    roc.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(
        roc,
        f'Ki67 ROC: {drug.title()}',
        output_path / f'{drug}_roc.pdf',
    )

    precision_recall = PrData.calculate(labels, scores)
    plot_pr(
        precision_recall,
        f'Ki67 Precision-Recall: {drug.title()}',
        output_path / f'{drug}_pr.pdf',
    )
def _save_gene_list(genes, path):
    """Write one gene identifier per line to ``path``, overwriting any existing file."""
    with path.open('w') as f:
        for gene in genes:
            print(gene, file=f)


def propagate_data(data: pd.DataFrame, label: str) -> pd.DataFrame:
    """
    Run network propagation on every row of ``data``.

    Three gene lists are written under ``data_path`` as a side effect: genes
    shared by ``data`` and the network, genes only in ``data``, and genes only
    in the network.

    :param data: Matrix of (samples, genes); network propagation will be run
        on each row
    :param label: mutations or diffexpr or something; used as a prefix for the
        saved gene lists and in progress messages
    :return: DataFrame with the same index as ``data`` and one column per entry
        of the module-level ``nodes``, holding the propagated values
    """
    sample_count = len(data.index)
    data_gene_set = set(data.columns)

    common_genes = sorted_intersection(data.columns, node_set)
    common_genes_path = data_path / f'{label}_common_genes.txt'
    print(f'{label}: saving {len(common_genes)} common genes to {common_genes_path}')
    _save_gene_list(common_genes, common_genes_path)

    only_mut_genes = sorted(data_gene_set - node_set)
    only_mut_genes_path = data_path / f'{label}_only_mut_genes.txt'
    print(f'{label}: saving {len(only_mut_genes)} data-only genes to {only_mut_genes_path}')
    _save_gene_list(only_mut_genes, only_mut_genes_path)

    only_network_genes = sorted(node_set - data_gene_set)
    only_network_genes_path = data_path / f'{label}_only_network_genes.txt'
    print(f'{label}: saving {len(only_network_genes)} network-only genes to {only_network_genes_path}')
    _save_gene_list(only_network_genes, only_network_genes_path)

    # Embed the data in network-node space: nodes absent from ``data`` stay 0.0.
    data_network = pd.DataFrame(0.0, columns=nodes, index=data.index)
    data_propagated = pd.DataFrame(0.0, columns=nodes, index=data.index)
    data_network.loc[:, common_genes] = data.loc[:, common_genes]

    # Each work item carries everything propagate_mutations unpacks from its
    # single tuple argument.
    param_generator = (
        (i, sample, label, sample_count, data_network.loc[sample, :])
        for i, sample in enumerate(data_network.index)
    )

    with Pool(args.subprocesses) as pool:
        # Results arrive in arbitrary order, so each one is written back by its
        # sample key rather than by position.
        for sample, propagated in pool.imap_unordered(
                propagate_mutations,
                param_generator,
        ):
            data_propagated.loc[sample, :] = np.array(propagated).reshape((node_count,))

    return data_propagated
def pmid_26892682_analysis(drug: str):
    """
    Score one drug's labels with a fixed linear gene signature and plot the result.

    For every sample present in both the drug's labels and the module-level
    ``expr`` matrix, the expression of ``selected_genes`` is combined with the
    module-level ``coefs`` as a dot product and passed through ``expit`` to
    obtain a score in (0, 1).

    :param drug: Drug identifier; selects ``labels_{drug}.pickle`` and names
        the output files
    :return: None; ROC data is pickled under ``data_path`` and both plots are
        written under ``output_path``
    """
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, selected_genes]
    selected_labels = labels_all.loc[selected_samples]

    # ``as_matrix()`` was deprecated in pandas 0.23 and removed in 1.0;
    # ``.values`` is the replacement named by the deprecation warning and
    # works on both old and new pandas.
    ln_p_over_1_minus_p = selected_expr.values @ coefs.values
    probs = expit(ln_p_over_1_minus_p)

    rd = RocData.calculate(selected_labels, probs)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'PMID26892682 ROC: {drug.title()}', output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, probs)
    plot_pr(pr, f'PMID26892682 Precision-Recall: {drug.title()}', output_path / f'{drug}_pr.pdf')
output_path = create_output_path(script_label)

# Signature genes and their per-gene t-values, kept as strings and parsed below.
genes = [
    'TWIST1',
    'KRT81',
    'PTRF',
    'EEF1A2',
    'PTPRK',
    'EGFR',
    'CXCL14',
    'ERBB3'
]
t_value_strs = [
    '-2.879',
    '-2.453',
    '-2.024',
    '-1.895',
    '-1.793',
    '-1.701',
    '2.229',
    '2.26'
]
# The listed values are sign-flipped before use as coefficients.
# NOTE(review): the reason for the inversion is not visible here -- confirm
# against the source publication.
t_values_inverted = np.array([float(v) for v in t_value_strs])
t_values = -t_values_inverted
coefs_all = pd.Series(t_values, index=genes)

expr_path = find_newest_data_path('parse_cosmic_diffexpr')
expr = pd.read_pickle(expr_path / 'brca_expr.pickle')

# Only genes that actually appear in the expression matrix can contribute.
selected_genes = sorted_intersection(coefs_all.index, expr.columns)
coefs = coefs_all.loc[selected_genes]


def pmid_26892682_analysis(drug: str):
    """
    Compute signature scores for the samples labelled for ``drug``.

    NOTE(review): in this view the function ends right after ``probs`` is
    computed; nothing is returned or saved here.

    :param drug: Drug identifier; selects ``labels_{drug}.pickle``
    """
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, selected_genes]
    selected_labels = labels_all.loc[selected_samples]

    # ``as_matrix()`` was deprecated in pandas 0.23 and removed in 1.0;
    # ``.values`` is the replacement named by the deprecation warning and
    # works on both old and new pandas.
    ln_p_over_1_minus_p = selected_expr.values @ coefs.values
    probs = expit(ln_p_over_1_minus_p)
# Permutation ROC evaluation: for each drug and each (order, clusters) entry of
# `reordered_labels`, load the drug's label vector, restrict labels and cluster
# assignments to their shared samples, compute RocData from them, save it as
# `roc_data_{drug}_permutation_{i}.pickle`, and record its AUC in `aucs`.
# The collected RocData objects are then drawn as fpr/tpr curves.
# NOTE(review): this line is whitespace-collapsed and stops in the middle of the
# `plt.plot(` call, so the loop nesting of the `with new_plot():` section cannot
# be confirmed from here; the code is left untouched.
# NOTE(review): `aucs` is created once, before the drug loop, and filled with
# the integer 0 -- confirm that sharing it across drugs and assigning
# `rd.auc` into it is intended.
# NOTE(review): `labels_all` is re-read for every permutation although the path
# depends only on `drug`.
drugs = ['ai_all', 'arimidex'] feature_label_path = find_newest_data_path( f'compute_drug_features_labels_alpha_{args.alpha:.2f}') aucs = pd.Series(0, index=range(len(reordered_labels))) for drug in drugs: roc_data = [] for i, (order, clusters) in enumerate(reordered_labels): labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle') selected_samples = sorted_intersection(labels_all.index, clusters.index) selected_labels = labels_all.loc[selected_samples] selected_clusters = clusters.loc[selected_samples] rd = RocData.calculate(selected_labels, selected_clusters) rd.save(data_path / f'roc_data_{drug}_permutation_{i}.pickle') roc_data.append(rd) aucs.loc[i] = rd.auc with new_plot(): plt.figure(figsize=CROSSVAL_FIGSIZE) for i, rd in enumerate(roc_data): plt.plot( rd.fpr, rd.tpr,
# NOTE(review): `pca_feature_matrix` is not assigned anywhere in this view;
# presumably this append is the tail of a loop that starts above -- confirm.
dfs_to_consolidate.append((pca_feature_matrix, 'drug_mut_full'))

for drug in sorted(expr_full_pca):
    pca_feature_matrix = expr_full_pca[drug]
    dfs_to_consolidate.append((pca_feature_matrix, 'drug_expr_full'))

# Merge every collected (frame, tag) pair into one matrix; missing cells become 0.
full_matrix_unscaled = consolidate_data_frames(dfs_to_consolidate).fillna(0)

full_matrix_unscaled_path = data_path / 'feature_matrix_unscaled.pickle'
print('Saving full matrix (unscaled) to', full_matrix_unscaled_path)
full_matrix_unscaled.to_pickle(full_matrix_unscaled_path)

data_desc_filepath = data_path / 'data_desc_unscaled.csv'
print('Saving unscaled data description to', data_desc_filepath)
full_matrix_unscaled.describe().T.to_csv(data_desc_filepath)

# Not used within this view; kept because later code may rely on it.
common_samples = sorted_intersection(full_matrix_unscaled.index, tx_info_raw.index)

scaler, full_matrix = scale_continuous_df_cols(full_matrix_unscaled)

full_matrix_csv_path = data_path / 'feature_matrix.csv'
print('Saving feature matrix to', full_matrix_csv_path)
full_matrix.to_csv(full_matrix_csv_path)

full_matrix_pickle_path = replace_extension(full_matrix_csv_path, 'pickle')
print('Saving feature matrix to', full_matrix_pickle_path)
# Fixed: this used to call to_csv(), which wrote CSV text into the file at the
# '.pickle' path, so reading it back with read_pickle() would fail.
full_matrix.to_pickle(full_matrix_pickle_path)

data_desc_filepath = data_path / 'data_desc_normalized.csv'
print('Saving normalized data description to', data_desc_filepath)
full_matrix.describe().T.to_csv(data_desc_filepath)
# Fixed ordering of the network nodes; used as the column order downstream.
nodes = sorted(node_set)
node_count = len(nodes)

mutations_hdf5_path = find_newest_data_path('parse_tcga_mutations') / 'mutations.hdf5'
with pd.HDFStore(mutations_hdf5_path, 'r') as store:
    mutations = store['muts']
print('Read mutations')

brca_expr_path = find_newest_data_path('parse_cosmic_diffexpr') / 'brca_expr.pickle'
expr = pd.read_pickle(brca_expr_path)
print('Read log-fold expression with Hugo symbols')

# Binarize: 1.0 where the absolute expression value exceeds the cutoff, else 0.0.
cutoff = 2
print('Binarizing log-fold expression with cutoff {}'.format(cutoff))
diffexpr_hugo = (expr.abs() > cutoff).astype(float)

# Keep only the columns whose identifier has an entry in the mapping.
hugo_entrez_mapping = read_hugo_entrez_mapping()
diffexpr_hugo_in_mapping = sorted_intersection(
    diffexpr_hugo.columns,
    hugo_entrez_mapping,
)
print(f'{len(diffexpr_hugo_in_mapping)} of {diffexpr_hugo.shape[1]} gene IDs in expr data are in mapping')
diffexpr_overlap = diffexpr_hugo.loc[:, diffexpr_hugo_in_mapping]

# Translate column identifiers through the mapping; several may collapse onto
# the same target ID.
new_diffexpr_cols = [
    hugo_entrez_mapping[hugo_symbol]
    for hugo_symbol in diffexpr_overlap.columns
]
duplicate_col_count = len(new_diffexpr_cols) - len(set(new_diffexpr_cols))
print('Duplicate columns:', duplicate_col_count)

# Remember the position of the first occurrence of each mapped ID only.
used_entrez_ids = set()
non_dup_col_indices = []
for i, entrez_id in enumerate(new_diffexpr_cols):
    if entrez_id in used_entrez_ids:
        continue
    non_dup_col_indices.append(i)
    used_entrez_ids.add(entrez_id)

diffexpr_overlap.columns = new_diffexpr_cols
diffexpr_overlap_non_dups = diffexpr_overlap.iloc[:, non_dup_col_indices]

# Final matrix: de-duplicated columns in sorted order.
diffexpr = diffexpr_overlap_non_dups.loc[
    :,
    sorted(diffexpr_overlap_non_dups.columns),
]
def propagate_mutations(param_tuple):
    """
    Propagate a single sample's vector over the network.

    Takes one tuple argument so that it can be used with a one-argument
    mapping call.

    :param param_tuple: ``(i, sample, label, sample_count, vec)``: position of
        the sample, its key, a label for progress messages, the total number
        of samples, and the values to propagate
    :return: ``(sample, propagated)``, so the caller can match the result to
        its sample
    """
    position, sample_key, progress_label, total_samples, values = param_tuple

    # Report progress on every 100th sample (including the first).
    if position % 100 == 0:
        print('{}: done with {} samples ({:.2f}%)'.format(
            progress_label,
            position,
            (position * 100) / total_samples,
        ))

    # propagate() is handed a (node_count, 1) column matrix.
    column = np.matrix(values).reshape((node_count, 1))
    result = propagate(w_prime, column, alpha=args.alpha, verbose=False)
    return sample_key, result


data = mutations
label = 'mutations'

sample_count = len(data.index)
data_gene_set = set(data.columns)

# Genes present both in the data and in the network.
common_genes = sorted_intersection(data.columns, node_set)
common_genes_path = data_path / '{}_common_genes.txt'.format(label)
print('{}: saving {} common genes to {}'.format(
    label,
    len(common_genes),
    common_genes_path,
))
with common_genes_path.open('w') as f:
    for gene in common_genes:
        print(gene, file=f)

# Genes present in the data but absent from the network.
only_mut_genes = sorted(data_gene_set - node_set)
only_mut_genes_path = data_path / '{}_only_mut_genes.txt'.format(label)
print('{}: saving {} data-only genes to {}'.format(
    label,
    len(only_mut_genes),
    only_mut_genes_path,
))
with only_mut_genes_path.open('w') as f:
    for gene in only_mut_genes:
        print(gene, file=f)

# Network nodes that have no column in the data.
only_network_genes = sorted(node_set - data_gene_set)
only_network_genes_path = data_path / '{}_only_network_genes.txt'.format(label)
# One row per entry of muts.index, one column per gene set: 1 if any gene of
# that set is flagged in `muts` for the row, else 0.
gene_set_count = len(entrez_gene_sets)
patient_gene_set_muts = pd.DataFrame(
    0,
    index=muts.index,
    columns=range(gene_set_count),
)
for i, gene_set in enumerate(entrez_gene_sets):
    any_gene_mutated = muts.loc[:, gene_set].any(axis=1)
    patient_gene_set_muts.loc[:, i] = any_gene_mutated.astype(int)

# Number of gene sets hit, per row.
pathway_mut_counts = patient_gene_set_muts.sum(axis=1)

gene_set_mut_matrix_path = data_path / 'gene_set_mut_matrix.pickle'
print('Saving gene set mutation matrix to', gene_set_mut_matrix_path)
patient_gene_set_muts.to_pickle(gene_set_mut_matrix_path)

pathway_mut_count_path = data_path / 'pathway_mut_counts.pickle'
print('Saving pathway mutation counts to', pathway_mut_count_path)
pathway_mut_counts.to_pickle(pathway_mut_count_path)

# Use the per-row count directly as the score for each drug's labels.
drugs = ['ai_all', 'arimidex']
feature_label_path = find_newest_data_path(
    f'compute_drug_features_labels_alpha_{args.alpha:.2f}')

for drug in drugs:
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    # Only samples that have both a label and a count are evaluated.
    selected_samples = sorted_intersection(
        labels_all.index,
        pathway_mut_counts.index,
    )
    selected_labels = labels_all.loc[selected_samples]
    selected_counts = pathway_mut_counts.loc[selected_samples]

    rd = RocData.calculate(selected_labels, selected_counts)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(
        rd,
        f'WExT Pathway Mutation Count ROC: {drug.title()}',
        output_path / f'{drug}_roc.pdf',
    )
# The CSV has no header row; its first column becomes the index and the
# remaining columns are named after `drugs`.
lincs_subset_path = find_newest_data_path('gct_drug_subset') / 'subset.csv'
lincs_expr = pd.read_csv(lincs_subset_path, header=None, index_col=0)
lincs_expr.columns = drugs

lincs_genes = set(lincs_expr.index)
tcga_genes = set(tcga_expr.columns)

lincs_benchmark_gene_data = pd.read_excel(DATA_PATH / 'Landmark_Genes_n978.xlsx')
lincs_benchmark_genes = set(lincs_benchmark_gene_data['Gene Symbol'])

# Genes present in all three sources.
common_genes = sorted_intersection(
    lincs_genes,
    tcga_genes,
    lincs_benchmark_genes,
)
tcga_only_genes = tcga_genes - lincs_genes
# NOTE(review): this difference starts from the benchmark gene list, not from
# `lincs_genes` -- confirm that is what the message below should report.
lincs_only_genes = lincs_benchmark_genes - tcga_genes

print('Intersection of TCGA and LINCS gene symbols: {} genes'.format(
    len(common_genes)))
print('Gene symbols only in TCGA expression data: {}'.format(
    len(tcga_only_genes)))
print('Gene symbols only in LINCS expression data: {}'.format(
    len(lincs_only_genes)))

# The index may contain repeated gene symbols; collapse each group of rows to
# its column-wise mean so every common gene ends up with exactly one row.
lincs_expr_common_with_dups = lincs_expr.loc[common_genes, :].fillna(0)
lincs_expr_common = pd.DataFrame(0.0, index=common_genes, columns=drugs)
rows_by_gene = lincs_expr_common_with_dups.groupby(
    lincs_expr_common_with_dups.index)
for i, rows in rows_by_gene:
    lincs_expr_common.loc[i, :] = rows.mean()