Exemple #1
0
def pmid_26033813_analysis(drug: str):
    """
    Fit the PMID26033813 tree for one drug and plot how well it predicts.

    Labels for ``drug`` are read from the newest
    ``compute_drug_features_labels_alpha_*`` data directory. Labels and the
    module-level ``expr`` matrix are restricted to the samples they share,
    the tree is fit on that subset, and every one of those samples is then
    scored with the fitted tree. ROC data is pickled under ``data_path``;
    ROC and precision-recall plots are written under ``output_path``.

    :param drug: Drug identifier, used in the label file name, the plot
        titles and the output file names
    """
    model_tree = build_tree()

    label_dir = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    all_labels = pd.read_pickle(label_dir / f'labels_{drug}.pickle')

    # Only samples with both a label and an expression row can be used
    shared_samples = sorted_intersection(all_labels.index, expr.index)
    labels = all_labels.loc[shared_samples]
    expression = expr.loc[shared_samples, :]

    fit_tree(expression, labels, model_tree)

    # Predictions are made for the same samples the tree was fit on
    scores = []
    for sample_name in shared_samples:
        scores.append(predict_sample(sample_name, expression, model_tree))
    predictions = pd.Series(scores, index=shared_samples)

    roc = RocData.calculate(labels, predictions)
    roc.save(data_path / f'roc_data_{drug}.pickle')
    roc_title = f'PMID26033813 ROC: {drug.title()}'
    plot_roc(roc, roc_title, output_path / f'{drug}_roc.pdf')

    precision_recall = PrData.calculate(labels, predictions)
    pr_title = f'PMID26033813 Precision-Recall: {drug.title()}'
    plot_pr(precision_recall, pr_title, output_path / f'{drug}_pr.pdf')
Exemple #2
0
def ki67_analysis(drug: str):
    """
    Evaluate expression of the module-level ``gene`` as a predictor for a drug.

    The expression values of ``gene`` are used directly as scores against the
    labels for ``drug``, restricted to samples present in both the labels and
    the ``expr`` matrix. ROC data is pickled under ``data_path``; ROC and
    precision-recall plots are written under ``output_path``.

    :param drug: Drug identifier, used in the label file name, the plot
        titles and the output file names
    """
    label_dir = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    all_labels = pd.read_pickle(label_dir / f'labels_{drug}.pickle')

    # Only samples with both a label and an expression value can be used
    shared_samples = sorted_intersection(all_labels.index, expr.index)
    gene_expr = expr.loc[shared_samples, gene]
    labels = all_labels.loc[shared_samples]

    roc = RocData.calculate(labels, gene_expr)
    roc.save(data_path / f'roc_data_{drug}.pickle')
    roc_title = f'Ki67 ROC: {drug.title()}'
    plot_roc(roc, roc_title, output_path / f'{drug}_roc.pdf')

    precision_recall = PrData.calculate(labels, gene_expr)
    pr_title = f'Ki67 Precision-Recall: {drug.title()}'
    plot_pr(precision_recall, pr_title, output_path / f'{drug}_pr.pdf')
Exemple #3
0
def _save_gene_list(genes, path, label: str, description: str):
    """
    Write ``genes`` to ``path``, one per line, and announce the save on stdout.

    :param genes: Sized iterable of gene identifiers
    :param path: Destination file (an object with ``.open``, e.g. a ``Path``)
    :param label: Data set label, used as the message prefix
    :param description: Short description of the list, e.g. ``'common'``
    """
    print(f'{label}: saving {len(genes)} {description} genes to {path}')
    with path.open('w') as f:
        for gene in genes:
            print(gene, file=f)


def propagate_data(data: pd.DataFrame, label: str):
    """
    Run network propagation on every row of ``data``.

    Three gene lists are written under ``data_path`` as a side effect: genes
    present in both the data and the network, genes only in the data, and
    genes only in the network. The data is then laid out on the network's
    node ordering (genes missing from the data stay at 0.0) and each row is
    passed to ``propagate_mutations`` through a ``Pool`` of
    ``args.subprocesses`` workers.

    :param data: Matrix of (samples, genes); network propagation will be run on each row
    :param label: Name of the data set (e.g. mutations or diffexpr); used in
        progress messages and as the prefix of the gene list file names
    :return: DataFrame with the same index as ``data`` and one column per
        entry of ``nodes``, holding the propagated vector of each sample
    """
    sample_count = len(data.index)
    data_gene_set = set(data.columns)

    common_genes = sorted_intersection(data.columns, node_set)
    _save_gene_list(
        common_genes,
        data_path / f'{label}_common_genes.txt',
        label,
        'common',
    )

    # File name keeps its historical "only_mut" spelling for any data type
    only_data_genes = sorted(data_gene_set - node_set)
    _save_gene_list(
        only_data_genes,
        data_path / f'{label}_only_mut_genes.txt',
        label,
        'data-only',
    )

    only_network_genes = sorted(node_set - data_gene_set)
    _save_gene_list(
        only_network_genes,
        data_path / f'{label}_only_network_genes.txt',
        label,
        'network-only',
    )

    # Both frames use the network's node ordering; network genes that are
    # absent from the data keep their initial 0.0
    data_network = pd.DataFrame(0.0, columns=nodes, index=data.index)
    data_propagated = pd.DataFrame(0.0, columns=nodes, index=data.index)
    data_network.loc[:, common_genes] = data.loc[:, common_genes]

    # Lazily built work items, in the tuple layout propagate_mutations unpacks
    param_generator = (
        (i, sample, label, sample_count, data_network.loc[sample, :])
        for i, sample in enumerate(data_network.index)
    )

    with Pool(args.subprocesses) as pool:
        # Results may arrive in any order; each one carries its sample name,
        # so it is stored by label rather than by position
        for sample, propagated in pool.imap_unordered(
                propagate_mutations,
                param_generator,
        ):
            data_propagated.loc[sample, :] = np.array(propagated).reshape((node_count,))

    return data_propagated
Exemple #4
0
def pmid_26892682_analysis(drug: str):
    """
    Score samples with the PMID26892682 linear model and plot its performance.

    For the samples that have both a label for ``drug`` and a row in the
    module-level ``expr`` matrix, the expression of ``selected_genes`` is
    multiplied by the module-level ``coefs`` vector and the result is passed
    through ``expit`` to obtain per-sample probabilities. ROC data is pickled
    under ``data_path``; ROC and precision-recall plots are written under
    ``output_path``.

    :param drug: Drug identifier, used in the label file name, the plot
        titles and the output file names
    """
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, selected_genes]
    selected_labels = labels_all.loc[selected_samples]

    # `.values` instead of `.as_matrix()`: the latter was deprecated in
    # pandas 0.23 and removed in pandas 1.0; `.values` works on both
    ln_p_over_1_minus_p = selected_expr.values @ coefs.values
    probs = expit(ln_p_over_1_minus_p)

    rd = RocData.calculate(selected_labels, probs)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'PMID26892682 ROC: {drug.title()}',
             output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, probs)
    plot_pr(pr, f'PMID26892682 Precision-Recall: {drug.title()}',
            output_path / f'{drug}_pr.pdf')
Exemple #5
0
output_path = create_output_path(script_label)

# Gene symbols and their t-values, paired position by position
genes = [
    'TWIST1',
    'KRT81',
    'PTRF',
    'EEF1A2',
    'PTPRK',
    'EGFR',
    'CXCL14',
    'ERBB3',
]
t_value_strs = [
    '-2.879',
    '-2.453',
    '-2.024',
    '-1.895',
    '-1.793',
    '-1.701',
    '2.229',
    '2.26',
]
# The listed values carry the opposite sign to the one used as coefficient
t_values_inverted = np.array(list(map(float, t_value_strs)))
t_values = np.negative(t_values_inverted)
coefs_all = pd.Series(data=t_values, index=genes)

expr_path = find_newest_data_path('parse_cosmic_diffexpr')
expr = pd.read_pickle(expr_path / 'brca_expr.pickle')

# Keep only the coefficients whose gene has a column in the expression data
selected_genes = sorted_intersection(coefs_all.index, expr.columns)
coefs = coefs_all.loc[selected_genes]


def pmid_26892682_analysis(drug: str):
    """
    Compute per-sample probabilities for ``drug`` from the linear model.

    For the samples that have both a label for ``drug`` and a row in the
    module-level ``expr`` matrix, the expression of ``selected_genes`` is
    multiplied by the ``coefs`` vector and passed through ``expit``.

    :param drug: Drug identifier, used in the label file name
    """
    # NOTE(review): only the beginning of this function is visible in this
    # excerpt; what is done with `selected_labels` and `probs` is not shown.
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, selected_genes]
    selected_labels = labels_all.loc[selected_samples]

    # `.values` instead of `.as_matrix()`: the latter was deprecated in
    # pandas 0.23 and removed in pandas 1.0; `.values` works on both
    ln_p_over_1_minus_p = selected_expr.values @ coefs.values
    probs = expit(ln_p_over_1_minus_p)
Exemple #6
0
# Drugs whose labels are compared against every entry of reordered_labels
drugs = ['ai_all', 'arimidex']

feature_label_path = find_newest_data_path(
    f'compute_drug_features_labels_alpha_{args.alpha:.2f}')

# One AUC slot per entry of reordered_labels.
# NOTE(review): the Series is created with integer 0 but filled with rd.auc
# below, and the same Series is overwritten for each drug -- confirm both
# are intended.
aucs = pd.Series(0, index=range(len(reordered_labels)))

for drug in drugs:
    # ROC results for this drug, one per permutation, reused for plotting
    roc_data = []

    for i, (order, clusters) in enumerate(reordered_labels):
        # NOTE(review): this read depends only on `drug`, not on `i`; it
        # could be hoisted out of the inner loop.
        labels_all = pd.read_pickle(feature_label_path /
                                    f'labels_{drug}.pickle')

        # Restrict both labels and cluster values to the samples they share
        selected_samples = sorted_intersection(labels_all.index,
                                               clusters.index)
        selected_labels = labels_all.loc[selected_samples]
        selected_clusters = clusters.loc[selected_samples]

        # The cluster values themselves are used as the prediction scores
        rd = RocData.calculate(selected_labels, selected_clusters)
        rd.save(data_path / f'roc_data_{drug}_permutation_{i}.pickle')
        roc_data.append(rd)

        aucs.loc[i] = rd.auc

    with new_plot():
        plt.figure(figsize=CROSSVAL_FIGSIZE)
        for i, rd in enumerate(roc_data):
            plt.plot(
                rd.fpr,
                rd.tpr,
Exemple #7
0
    dfs_to_consolidate.append((pca_feature_matrix, 'drug_mut_full'))
# Queue the per-drug expression PCA matrices, in sorted drug order
for drug in sorted(expr_full_pca):
    pca_feature_matrix = expr_full_pca[drug]
    dfs_to_consolidate.append((pca_feature_matrix, 'drug_expr_full'))

# Missing entries after consolidation are treated as 0
full_matrix_unscaled = consolidate_data_frames(dfs_to_consolidate).fillna(0)

full_matrix_unscaled_path = data_path / 'feature_matrix_unscaled.pickle'
print('Saving full matrix (unscaled) to', full_matrix_unscaled_path)
full_matrix_unscaled.to_pickle(full_matrix_unscaled_path)

data_desc_filepath = data_path / 'data_desc_unscaled.csv'
print('Saving unscaled data description to', data_desc_filepath)
full_matrix_unscaled.describe().T.to_csv(data_desc_filepath)

common_samples = sorted_intersection(full_matrix_unscaled.index,
                                     tx_info_raw.index)

scaler, full_matrix = scale_continuous_df_cols(full_matrix_unscaled)

full_matrix_csv_path = data_path / 'feature_matrix.csv'
print('Saving feature matrix to', full_matrix_csv_path)
full_matrix.to_csv(full_matrix_csv_path)

# Fixed: this used to call to_csv(), which wrote CSV text into the .pickle
# file and made it unreadable with pd.read_pickle
full_matrix_pickle_path = replace_extension(full_matrix_csv_path, 'pickle')
print('Saving feature matrix to', full_matrix_pickle_path)
full_matrix.to_pickle(full_matrix_pickle_path)

# Same variable as above, now pointing at the description of the scaled data
data_desc_filepath = data_path / 'data_desc_normalized.csv'
print('Saving normalized data description to', data_desc_filepath)
full_matrix.describe().T.to_csv(data_desc_filepath)
Exemple #8
0
# Fixed, sorted ordering of the network's nodes
nodes = sorted(node_set)
node_count = len(nodes)

mutations_hdf5_path = find_newest_data_path('parse_tcga_mutations') / 'mutations.hdf5'
with pd.HDFStore(mutations_hdf5_path, 'r') as store:
    mutations = store['muts']
print('Read mutations')

expr_pickle_path = find_newest_data_path('parse_cosmic_diffexpr') / 'brca_expr.pickle'
expr = pd.read_pickle(expr_pickle_path)
print('Read log-fold expression with Hugo symbols')
cutoff = 2
print(f'Binarizing log-fold expression with cutoff {cutoff}')
# 1.0 where the absolute expression value exceeds the cutoff, 0.0 elsewhere
diffexpr_hugo = (expr.abs() > cutoff).astype(float)

hugo_entrez_mapping = read_hugo_entrez_mapping()

# Columns that have an entry in the mapping
diffexpr_hugo_in_mapping = sorted_intersection(
    diffexpr_hugo.columns,
    hugo_entrez_mapping,
)
print(f'{len(diffexpr_hugo_in_mapping)} of {diffexpr_hugo.shape[1]} gene IDs in expr data are in mapping')
diffexpr_overlap = diffexpr_hugo.loc[:, diffexpr_hugo_in_mapping]
new_diffexpr_cols = [
    hugo_entrez_mapping[col]
    for col in diffexpr_overlap.columns
]
duplicate_col_count = len(new_diffexpr_cols) - len(set(new_diffexpr_cols))
print('Duplicate columns:', duplicate_col_count)

# Several columns can map to the same ID; remember the position of the first
# occurrence of each mapped ID so that later duplicates can be dropped
used_entrez_ids = set()
non_dup_col_indices = []
for i, entrez_id in enumerate(new_diffexpr_cols):
    if entrez_id in used_entrez_ids:
        continue
    used_entrez_ids.add(entrez_id)
    non_dup_col_indices.append(i)

# Relabel with the mapped IDs, drop duplicates by position, then sort columns
diffexpr_overlap.columns = new_diffexpr_cols
diffexpr_overlap_non_dups = diffexpr_overlap.iloc[:, non_dup_col_indices]
diffexpr = diffexpr_overlap_non_dups.loc[
    :,
    sorted(diffexpr_overlap_non_dups.columns),
]
Exemple #9
0
def propagate_mutations(param_tuple):
    """
    Propagate one sample's vector over the network.

    Takes a single tuple argument so that it can be used directly with a
    pool's map-style methods. A progress line is printed for every 100th
    sample index.

    :param param_tuple: ``(i, sample, label, sample_count, vec)`` where ``i``
        is the sample's position, ``sample`` its name, ``label`` the data set
        name used in progress output, ``sample_count`` the total number of
        samples and ``vec`` the values to propagate (``node_count`` entries)
    :return: ``(sample, propagated)`` with the result of ``propagate``
    """
    index, sample, label, sample_count, vec = param_tuple
    if index % 100 == 0:
        percent_done = (index * 100) / sample_count
        print('{}: done with {} samples ({:.2f}%)'.format(label, index, percent_done))
    # propagate() is handed the values as a (node_count, 1) column matrix
    column_vector = np.matrix(vec).reshape((node_count, 1))
    return sample, propagate(w_prime, column_vector, alpha=args.alpha, verbose=False)

# Data set to propagate and the label used in messages and file names
data = mutations
label = 'mutations'

sample_count = len(data.index)
data_gene_set = set(data.columns)

# Genes present both in the data and in the network
common_genes = sorted_intersection(data.columns, node_set)
common_genes_path = data_path / (label + '_common_genes.txt')
print(label + ':', 'saving', len(common_genes), 'common genes to', common_genes_path)
with common_genes_path.open('w') as f:
    for gene in common_genes:
        f.write('{}\n'.format(gene))

# Genes present in the data but missing from the network
only_mut_genes = sorted(data_gene_set - node_set)
only_mut_genes_path = data_path / (label + '_only_mut_genes.txt')
print(label + ':', 'saving', len(only_mut_genes), 'data-only genes to', only_mut_genes_path)
with only_mut_genes_path.open('w') as f:
    for gene in only_mut_genes:
        f.write('{}\n'.format(gene))

# Genes present in the network but missing from the data
only_network_genes = sorted(node_set - data_gene_set)
only_network_genes_path = data_path / (label + '_only_network_genes.txt')
Exemple #10
0
# (patients x gene sets) indicator matrix: 1 when the patient has a mutation
# in at least one gene of the set, 0 otherwise
gene_set_ids = range(len(entrez_gene_sets))
patient_gene_set_muts = pd.DataFrame(0, index=muts.index, columns=gene_set_ids)

for i, gene_set in enumerate(entrez_gene_sets):
    mutated_in_set = muts.loc[:, gene_set].any(axis=1)
    patient_gene_set_muts.loc[:, i] = mutated_in_set.astype(int)

# Per patient: number of gene sets containing at least one mutated gene
pathway_mut_counts = patient_gene_set_muts.sum(axis=1)

gene_set_mut_matrix_path = data_path / 'gene_set_mut_matrix.pickle'
print('Saving gene set mutation matrix to', gene_set_mut_matrix_path)
patient_gene_set_muts.to_pickle(gene_set_mut_matrix_path)

pathway_mut_count_path = data_path / 'pathway_mut_counts.pickle'
print('Saving pathway mutation counts to', pathway_mut_count_path)
pathway_mut_counts.to_pickle(pathway_mut_count_path)

drugs = ['ai_all', 'arimidex']

feature_label_path = find_newest_data_path(
    f'compute_drug_features_labels_alpha_{args.alpha:.2f}'
)

# The per-patient count is used directly as the prediction score for each drug
for drug in drugs:
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(
        labels_all.index,
        pathway_mut_counts.index,
    )
    selected_labels = labels_all.loc[selected_samples]
    selected_counts = pathway_mut_counts.loc[selected_samples]

    rd = RocData.calculate(selected_labels, selected_counts)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(
        rd,
        f'WExT Pathway Mutation Count ROC: {drug.title()}',
        output_path / f'{drug}_roc.pdf',
    )
Exemple #11
0
# The subset CSV has no header row; its first column becomes the index
# (used below as gene symbols) and the remaining columns are named after
# `drugs`.
lincs_expr = pd.read_csv(
    find_newest_data_path('gct_drug_subset') / 'subset.csv',
    header=None,
    index_col=0,
)
lincs_expr.columns = drugs

lincs_genes = set(lincs_expr.index)
tcga_genes = set(tcga_expr.columns)

lincs_benchmark_gene_data = pd.read_excel(DATA_PATH /
                                          'Landmark_Genes_n978.xlsx')
lincs_benchmark_genes = set(lincs_benchmark_gene_data.loc[:, 'Gene Symbol'])

# Genes present in all three sources: LINCS data, TCGA data, landmark list
common_genes = sorted_intersection(lincs_genes, tcga_genes,
                                   lincs_benchmark_genes)
tcga_only_genes = tcga_genes - lincs_genes
# NOTE(review): this is computed from the landmark gene list rather than from
# lincs_genes, unlike tcga_only_genes above -- confirm that is intended.
lincs_only_genes = lincs_benchmark_genes - tcga_genes

print('Intersection of TCGA and LINCS gene symbols: {} genes'.format(
    len(common_genes)))
print('Gene symbols only in TCGA expression data: {}'.format(
    len(tcga_only_genes)))
print('Gene symbols only in LINCS expression data: {}'.format(
    len(lincs_only_genes)))

# A gene symbol can label several rows of lincs_expr; selecting by label keeps
# all of them. Missing values are replaced by 0 before averaging.
lincs_expr_common_with_dups = lincs_expr.loc[common_genes, :].fillna(0)
lincs_expr_common = pd.DataFrame(0.0, index=common_genes, columns=drugs)
# Collapse rows sharing a gene symbol into their column-wise mean
for i, rows in lincs_expr_common_with_dups.groupby(
        lincs_expr_common_with_dups.index):
    lincs_expr_common.loc[i, :] = rows.mean()