Python RocData Examples

Programming Language: Python

Namespace/Package Name: utils

Class/Type: RocData

Examples at hotexamples.com: 8

Python RocData - 8 examples found. These are the top rated real world Python examples of utils.RocData extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

calculate(8)

Frequently Used Methods

calculate (8)

Example #1

Show file

def pmid_26033813_analysis(drug: str):
    tree = build_tree()

    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_labels = labels_all.loc[selected_samples]
    selected_expr = expr.loc[selected_samples, :]

    fit_tree(selected_expr, selected_labels, tree)

    predictions = pd.Series(
        [
            predict_sample(sample_name, selected_expr, tree)
            for sample_name in selected_samples
        ],
        index=selected_samples,
    )

    rd = RocData.calculate(selected_labels, predictions)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'PMID26033813 ROC: {drug.title()}',
             output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, predictions)
    plot_pr(pr, f'PMID26033813 Precision-Recall: {drug.title()}',
            output_path / f'{drug}_pr.pdf')

Example #2

Show file

def load_l1o_data_compute_roc(path: Path, drug: str) -> Dict[str, RocData]:
    with pd.HDFStore(path / f'l1o_preds_{drug}.hdf5', 'r') as store:
        labels = store['labels']
        preds = store['l1o_preds']

    return {
        clf: RocData.calculate(labels, preds.loc[:, clf])
        for clf in preds.columns
    }

Example #3

Show file

def ki67_analysis(drug: str):
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, gene]
    selected_labels = labels_all.loc[selected_samples]

    rd = RocData.calculate(selected_labels, selected_expr)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'Ki67 ROC: {drug.title()}', output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, selected_expr)
    plot_pr(pr, f'Ki67 Precision-Recall: {drug.title()}',
            output_path / f'{drug}_pr.pdf')

Example #4

Show file

def pmid_26892682_analysis(drug: str):
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, selected_genes]
    selected_labels = labels_all.loc[selected_samples]

    ln_p_over_1_minus_p = selected_expr.as_matrix() @ coefs.as_matrix()
    probs = expit(ln_p_over_1_minus_p)

    rd = RocData.calculate(selected_labels, probs)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'PMID26892682 ROC: {drug.title()}',
             output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, probs)
    plot_pr(pr, f'PMID26892682 Precision-Recall: {drug.title()}',
            output_path / f'{drug}_pr.pdf')

Example #5

Show file

aucs = pd.Series(0, index=range(len(reordered_labels)))

for drug in drugs:
    roc_data = []

    for i, (order, clusters) in enumerate(reordered_labels):
        labels_all = pd.read_pickle(feature_label_path /
                                    f'labels_{drug}.pickle')

        selected_samples = sorted_intersection(labels_all.index,
                                               clusters.index)
        selected_labels = labels_all.loc[selected_samples]
        selected_clusters = clusters.loc[selected_samples]

        rd = RocData.calculate(selected_labels, selected_clusters)
        rd.save(data_path / f'roc_data_{drug}_permutation_{i}.pickle')
        roc_data.append(rd)

        aucs.loc[i] = rd.auc

    with new_plot():
        plt.figure(figsize=CROSSVAL_FIGSIZE)
        for i, rd in enumerate(roc_data):
            plt.plot(
                rd.fpr,
                rd.tpr,
                lw=1,
                label=
                f'Permutation {i} (area = {rd.auc:.{SIGNIFICANT_DIGITS}f})')

Example #6

Show file

def main():
    script_label = 'prop_edge_lbs_overlap'
    data_path = create_data_path(script_label)
    output_path = create_output_path(script_label)

    hem = read_hugo_entrez_mapping()

    lbs_mut_path = find_newest_data_path('intersect_muts_lbs')
    lbs_muts = pd.read_csv(lbs_mut_path / 'brca_lbs_muts.csv')

    prop_edge_path = find_newest_data_path(
        f'propagate_mutations_edges_alpha_{args.alpha:.2f}')
    with pd.HDFStore(prop_edge_path / 'data_propagated.hdf5') as store:
        mut_edge_prop = store['mutations']

    patients_with_lbs_muts = set(lbs_muts.patient)
    print('Patients with LBS mutations:', len(patients_with_lbs_muts))

    lbs_muts_by_patient = defaultdict(set)
    for i, row in lbs_muts.iterrows():
        if row.gene not in hem:
            print('Skipping gene', row.gene)
            continue
        lbs_muts_by_patient[row.patient].add(hem[row.gene])

    all_edge_set = {i for i in mut_edge_prop.columns if '_' in i}
    all_edges = sorted(all_edge_set)

    edge_prop = mut_edge_prop.loc[:, all_edges]

    shuffle_count = 100
    sorted_patients = sorted(patients_with_lbs_muts)
    patient_count = len(sorted_patients)
    ndcg = pd.Series(0.0, index=sorted_patients)
    shuffled_ndcg = pd.DataFrame(0.0,
                                 index=sorted_patients,
                                 columns=range(shuffle_count))
    lbs_edges_by_patient = pd.Series(0, index=sorted_patients)

    print('Loading shuffled data')
    prop_lbs_shuffle_path = find_newest_data_path('prop_edge_lbs_shuffle')

    with open(prop_lbs_shuffle_path / 'shuffled_muts_edges_by_patient.pickle',
              'rb') as f:
        d = pickle.load(f)
        shuffled_by_patient = d['shuffled_by_patient']
        selected_edges_by_patient = d['selected_edges_by_patient']
        shuffled_edges_by_patient = d['shuffled_edges_by_patient']

    ## NDCG analysis

    # For each patient, rank edges by propagated mutation scores, assign label of 1 if
    # either node connected to that edge has a LBS mutation

    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(f'Computing NDCG for patient {i}/{patient_count}')

        edge_scores = mut_edge_prop.loc[patient, all_edges].copy().sort_values(
            ascending=False)
        selected_edges = selected_edges_by_patient[patient]
        shuffled_edge_list = shuffled_edges_by_patient[patient]

        relevance = np.array([e in selected_edges
                              for e in edge_scores.index]).astype(float)
        ndcg.loc[patient] = normalized_discounted_cumulative_gain(
            relevance)[-1]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_relevance = np.array(
                [e in shuffled_edges for e in edge_scores.index]).astype(float)
            shuffled_ndcg.loc[patient,
                              j] = normalized_discounted_cumulative_gain(
                                  shuffled_relevance)[-1]

    with pd.HDFStore(data_path / 'ndcg_data.hdf5') as store:
        store['ndcg'] = ndcg
        store['shuffled_ndcg'] = shuffled_ndcg
        store['lbs_edges_by_patient'] = lbs_edges_by_patient

    shuffled_ndcg_flat = shuffled_ndcg.unstack()
    #shuffled_ndcg_median = shuffled_ndcg.median(axis=1)

    with new_plot():
        ndcg.plot.hist(bins=hist_bin_count)
        plt.title('NDCG histogram')
        plt.xlabel(
            'Patient NDCG score: selection of LBS edges by propagated edge score'
        )

        figure_path = output_path / 'ndcg_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        shuffled_ndcg_flat.plot.hist(bins=hist_bin_count)
        plt.title('NDCG histogram')
        plt.xlabel(
            'Patient NDCG score: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_ndcg_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    ndcg_ks = scipy.stats.ks_2samp(ndcg, shuffled_ndcg_flat)
    ndcg_ks_pvalue_str = to_matplotlib_sci_notation(ndcg_ks[1])

    with new_plot():
        ndcg.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real NDCG',
            density=True,
        )
        shuffled_ndcg_flat.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Shuffled NDCG, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Patient NDCG score: selection of LBS edges by propagated edge score'
        )
        plt.legend()
        plt.figtext(
            0.89,
            0.7,
            f'Kolmogorov-Smirnov $P = {ndcg_ks_pvalue_str}$',
            horizontalalignment='right',
        )

        figure_path = output_path / 'ndcg_both_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    ## /NDCG analysis

    ## PR and ROC AUC analysis

    roc_auc = pd.Series(0.0, index=sorted_patients)
    average_pr_scores = pd.Series(0.0, index=sorted_patients)
    shuffled_roc_auc = pd.DataFrame(0.0,
                                    index=sorted_patients,
                                    columns=range(shuffle_count))
    shuffled_average_pr_scores = pd.DataFrame(0.0,
                                              index=sorted_patients,
                                              columns=range(shuffle_count))
    # Maps patient IDs to performance objects
    roc_data_objects = {}
    pr_data_objects = {}

    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(
            f'Computing classifier performance for patient {i}/{patient_count}'
        )
        selected_edges: Set[str] = selected_edges_by_patient[patient]
        edge_scores = mut_edge_prop.loc[patient, all_edges].copy()
        labels = np.array([e in selected_edges
                           for e in edge_scores.index]).astype(float)

        rd = RocData.calculate(labels, edge_scores)
        roc_data_objects[patient] = rd
        roc_auc.loc[patient] = rd.auc

        pr = PrData.calculate(labels, edge_scores)
        pr_data_objects[patient] = pr
        average_pr_scores.loc[patient] = average_precision_score(
            labels, edge_scores)

        shuffled_edge_list: List[Set[str]] = shuffled_edges_by_patient[patient]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_labels = np.array(
                [e in shuffled_edges for e in edge_scores.index]).astype(float)

            shuffled_rd = RocData.calculate(shuffled_labels, edge_scores)
            shuffled_roc_auc.loc[patient, j] = shuffled_rd.auc

            shuffled_average_pr_scores.loc[patient,
                                           j] = average_precision_score(
                                               shuffled_labels,
                                               edge_scores,
                                           )

    with pd.HDFStore(data_path / 'classifier_data.hdf5') as store:
        store['roc_auc'] = roc_auc
        store['average_pr'] = average_pr_scores
        store['shuffled_roc_auc'] = shuffled_roc_auc
        store['shuffled_average_pr'] = shuffled_average_pr_scores

    with new_plot():
        roc_auc.plot.hist(bins=hist_bin_count)
        plt.title('ROC AUC histogram')
        plt.xlabel(
            'Patient ROC AUC: selection of LBS edges by propagated edge score')

        figure_path = output_path / 'roc_auc_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    #shuffled_roc_auc_median = shuffled_roc_auc.median(axis=1)
    shuffled_roc_auc_flat = shuffled_roc_auc.unstack()

    with new_plot():
        shuffled_roc_auc_flat.plot.hist(bins=hist_bin_count)
        plt.title('ROC AUC histogram')
        plt.xlabel(
            'Patient ROC AUC: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_roc_auc_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    roc_auc_ks = scipy.stats.ks_2samp(roc_auc, shuffled_roc_auc_flat)
    roc_auc_ks_pvalue_str = to_matplotlib_sci_notation(roc_auc_ks[1])

    with new_plot():
        roc_auc.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real ROC AUC',
            density=True,
        )
        shuffled_roc_auc_flat.plot.hist(
            bins=50,
            alpha=0.8,
            label='Shuffled ROC AUC, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Patient ROC AUC: selection of LBS edges by propagated edge score')
        plt.legend()
        plt.figtext(
            0.14,
            0.7,
            f'Kolmogorov-Smirnov $P = {roc_auc_ks_pvalue_str}$',
            horizontalalignment='left',
        )

        figure_path = output_path / 'roc_auc_both_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        average_pr_scores.plot.hist(bins=hist_bin_count)
        plt.title('Average precision histogram')
        plt.xlabel(
            'Average precision: selection of LBS edges by propagated edge score'
        )

        figure_path = output_path / 'avg_prec_hist.pdf'
        print('Saving AP histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    shuffled_average_pr_median = shuffled_average_pr_scores.median(axis=1)
    with new_plot():
        shuffled_average_pr_median.plot.hist(bins=hist_bin_count)
        plt.title('Average precision histogram')
        plt.xlabel(
            'Average precision: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_avg_prec_hist.pdf'
        print('Saving AP histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    top_n = 4
    rest_uniform = 6
    sorted_pr_scores = average_pr_scores.dropna().sort_values()
    usable_patient_count = sorted_pr_scores.shape[0]
    # Top 5, and 5 uniformly distributed from the rest
    patient_indexes = list(
        np.linspace(
            0,
            usable_patient_count - 1 - top_n,
            num=rest_uniform,
        ).astype(int))
    patient_indexes.extend(
        range(usable_patient_count - top_n, usable_patient_count))
    selected_patients = sorted_pr_scores.index[list(reversed(patient_indexes))]

    with new_plot():
        plt.figure(figsize=(10, 10))
        for patient in selected_patients:
            prd = pr_data_objects[patient]
            plt.plot(prd.rec, prd.prec, label=patient)

        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.axes().set_aspect('equal', 'datalim')
        plt.legend()

        plt.title(
            f'Precision-recall: top {top_n} patients, uniform spacing of bottom {rest_uniform}'
        )

        figure_path = output_path / 'pr_selected.pdf'
        print('Saving selected PR curves to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    ## /PR and ROC AUC analysis

    ## Spearman correlation P-value analysis
    spearman_pvalues = pd.Series(0.0, index=sorted_patients)
    shuffled_spearman_pvalues = pd.DataFrame(0.0,
                                             index=sorted_patients,
                                             columns=range(shuffle_count))

    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(
            f'Computing Spearman correlation P-value for patient {i}/{patient_count}'
        )
        selected_edges: Set[str] = selected_edges_by_patient[patient]
        edge_scores = mut_edge_prop.loc[patient, all_edges].copy()
        labels = np.array([e in selected_edges
                           for e in edge_scores.index]).astype(float)

        spearman_result = scipy.stats.spearmanr(edge_scores, labels)
        spearman_pvalue = spearman_result[1]

        spearman_pvalues.loc[patient] = spearman_pvalue

        shuffled_edge_list: List[Set[str]] = shuffled_edges_by_patient[patient]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_labels = np.array(
                [e in shuffled_edges for e in edge_scores.index]).astype(float)

            shuffled_spearman_result = scipy.stats.spearmanr(
                edge_scores, shuffled_labels)
            shuffled_spearman_pvalue = shuffled_spearman_result[1]

            shuffled_spearman_pvalues.loc[patient,
                                          j] = shuffled_spearman_pvalue

    sp_dir = Path('data/prop_edge_lbs_overlap_20180606-105746')
    with pd.HDFStore(sp_dir / 'spearman_pvalues.hdf5') as store:
        spearman_pvalues = store['spearman_pvalues']
        shuffled_spearman_pvalues = store['shuffled_spearman_pvalues']

    with pd.HDFStore(data_path / 'spearman_pvalues.hdf5') as store:
        store['spearman_pvalues'] = spearman_pvalues
        store['shuffled_spearman_pvalues'] = shuffled_spearman_pvalues

    nl10_spearman_pvalues_all = -np.log10(spearman_pvalues)
    nl10_spearman_pvalues = nl10_spearman_pvalues_all.loc[
        ~(nl10_spearman_pvalues_all.isnull())
        & ~(np.isinf(nl10_spearman_pvalues_all))]

    with new_plot():
        nl10_spearman_pvalues.plot.hist(bins=50)
        plt.title('Spearman $P$-value histogram')
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): LBS edges vs. prop. edge score'
        )

        figure_path = output_path / 'spearman_pvalue_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    shuffled_spearman_pvalues_flat = shuffled_spearman_pvalues.unstack()
    nl10_shuffled_spearman_pvalues_flat_all = -np.log10(
        shuffled_spearman_pvalues_flat)
    nl10_shuffled_spearman_pvalues_flat = nl10_shuffled_spearman_pvalues_flat_all.loc[
        ~(nl10_shuffled_spearman_pvalues_flat_all.isnull())
        & ~(np.isinf(nl10_shuffled_spearman_pvalues_flat_all))]

    with new_plot():
        nl10_shuffled_spearman_pvalues_flat.plot.hist(bins=50)
        plt.title('Spearman $P$-value histogram')
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): shuffled LBS edges vs. prop. edge score'
        )

        figure_path = output_path / 'shuffled_spearman_pvalue_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    spearman_ks = scipy.stats.ks_2samp(spearman_pvalues,
                                       shuffled_spearman_pvalues_flat)
    spearman_ks_pvalue_str = to_matplotlib_sci_notation(spearman_ks[1])

    with new_plot():
        nl10_spearman_pvalues.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real Spearman $P$-values',
            density=True,
        )
        nl10_shuffled_spearman_pvalues_flat.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Shuffled Spearman $P$-values, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): LBS edges vs. prop. edge score'
        )
        plt.legend()
        plt.figtext(
            0.89,
            0.7,
            f'Kolmogorov-Smirnov $P = {spearman_ks_pvalue_str}$',
            horizontalalignment='right',
        )

        figure_path = output_path / 'spearman_pvalues_both_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    ## /Spearman correlation P-value analysis

    ## Overall ROC AUC
    print('Creating binary LBS edge matrix')
    lbs_edge_matrix = pd.DataFrame(0,
                                   index=edge_prop.index,
                                   columns=edge_prop.columns)
    for patient, edges in selected_edges_by_patient.items():
        lbs_edge_matrix.loc[patient, list(edges)] = 1

    lbs_matrix_path = data_path / 'lbs_edge_matrix.hdf5'
    print('Saving LBS edge matrix to', lbs_matrix_path)
    with pd.HDFStore(lbs_matrix_path) as store:
        store['lbs_edge_matrix'] = lbs_edge_matrix

    sorted_flattened_edge_scores = edge_prop.unstack().sort_values(
        ascending=False)
    flattened_lbs_edges = lbs_edge_matrix.unstack()
    ordered_flattened_lbs_edges = flattened_lbs_edges.loc[
        sorted_flattened_edge_scores.index]

    flattened_rd = RocData.calculate(ordered_flattened_lbs_edges,
                                     sorted_flattened_edge_scores)
    flattened_rd_path = data_path / 'flattened_rd.pickle'
    print('Saving flattened vector RocData to', flattened_rd_path)
    with open(flattened_rd_path, 'wb') as f:
        pickle.dump(flattened_rd, f)
    ## /Overall ROC AUC

    ## Survival analysis

    edge_prop_survival_dir = find_newest_data_path('edge_prop_survival')
    survival_data = pd.read_csv(edge_prop_survival_dir /
                                'univariate_surv_results.csv',
                                index_col=0)
    # Indexed by gene/edge, across all patients
    surv_edge_sel = [('_' in i) for i in survival_data.index]
    edge_survival_data = survival_data.loc[surv_edge_sel, :]

    lbs_mut_edge_matrix = pd.DataFrame(
        0.0,
        index=sorted(selected_edges_by_patient),
        columns=all_edges,
    )
    for patient, edges in selected_edges_by_patient.items():
        lbs_mut_edge_matrix.loc[patient, list(edges)] = 1

    # Binary vector: is this edge incident on a LBS mut in at least one patient?
    edges_with_lbs_muts = lbs_mut_edge_matrix.sum(axis=0).astype(bool)

    surv_pvalues_with_lbs = edge_survival_data.loc[edges_with_lbs_muts,
                                                   'pvalue']
    surv_pvalues_with_lbs.name = 'With LBS'
    surv_pvalues_without_lbs = edge_survival_data.loc[~edges_with_lbs_muts,
                                                      'pvalue']
    surv_pvalues_without_lbs.name = 'Without LBS'

    ks_res = scipy.stats.ks_2samp(surv_pvalues_with_lbs,
                                  surv_pvalues_without_lbs)

    with new_plot():
        plot_cdf(surv_pvalues_with_lbs)
        plot_cdf(surv_pvalues_without_lbs)

        plt.legend()
        plt.ylabel('CDF')
        plt.xlabel('Univariate Cox Regression $P$-value')

        figure_path = output_path / 'surv_pvalue_cdfs.pdf'
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        fig = plt.figure()

        surv_pvalues_with_lbs.plot.hist(bins=50, ax=plt.gca(), alpha=0.5)
        surv_pvalues_without_lbs.plot.hist(bins=50, ax=plt.gca(), alpha=0.5)

        plt.legend('topleft')
        plt.xlabel('Univariate Cox Regression $P$-value')

        figure_path = output_path / 'surv_pvalue_hist.pdf'
        plt.savefig(figure_path, bbox_inches='tight')

    ## /Survival analysis

    ## Permuted survival analysis

    pvalues = edge_survival_data.loc[:, 'r_square']

    ks_manual = (np.array([0.1, 0.2, 0.25, 0.3]) *
                 edge_prop.shape[0]).astype(int)
    ks_auto = np.logspace(1, 3, num=15).astype(int)
    ks = sorted(chain(ks_manual, ks_auto))

    edge_count = 1000

    template = dedent('''
    \\begin{{frame}}[plain]
     \\begin{{center}}
      \\includegraphics[width=0.7\\textwidth]{{survival_rsquare_hist_k_{k}}}
     \\end{{center}}
    \\end{{frame}}
    ''')

    with open(data_path / 'figure_include.tex', 'w') as f:
        for k in ks:
            print(template.format(k=k), file=f)

    for k in ks:
        print('Computing edge ranking results for k =', k)
        edge_ranking = get_rank_k_edge_values(edge_prop, k)
        sorted_edge_scores = edge_ranking.sort_values(ascending=False)
        top_edges = sorted_edge_scores.iloc[:edge_count]
        top_edge_pvalues = pvalues.loc[top_edges.index]
        bottom_edges = sorted_edge_scores.iloc[edge_count:]
        permutation_count = 1000
        permutation_pvalues = pd.Series(0.0, index=range(permutation_count))
        for i in range(permutation_count):
            edge_selection = np.random.choice(bottom_edges.index, size=100)
            selected_pvalues = pvalues.loc[edge_selection]
            comparison_result = scipy.stats.mannwhitneyu(
                top_edge_pvalues,
                selected_pvalues,
                alternative='greater',
            )
            permutation_pvalues.iloc[i] = comparison_result.pvalue

        nl10_permutation_pvalues = -np.log10(permutation_pvalues)

        with new_plot():
            plt.figure(figsize=(5, 5))
            nl10_permutation_pvalues.plot.hist(bins=50)
            title = (f'Survival $R^2$: top {edge_count} edges ($k = {k}$) vs. '
                     f'{permutation_count} random selections')
            plt.title(title)
            plt.xlabel('$- \\log_{10}$($P$-value) from Mann-Whitney $U$ test')

            nl10_0_05 = -np.log10(0.05)
            plt.axvline(x=nl10_0_05, color='#FF0000FF')

            nl10_0_001 = -np.log10(0.001)
            plt.axvline(x=nl10_0_001, color='#000000FF')

            figure_path = output_path / f'survival_rsquare_hist_k_{k}.pdf'
            print('Saving survival R^2 histogram to', figure_path)
            plt.savefig(figure_path, bbox_inches='tight')

Example #7

Show file

            coef_path = data_path / f'fold_{i}_{drug}_{clf_desc}_coefs.csv'
            save_coefs(clf_obj, coef_path, clf_data.trn_matrix.columns)

            rf_feature_path = output_path / f'{drug}_rf_features_fold_{i}.pdf'
            plot_rf_feature_importance(clf_obj, rf_feature_path,
                                       clf_data.trn_matrix.columns)

            trn_preds = clf_obj.predict_proba(clf_data.trn_matrix)
            trn_preds_series = pd.Series(
                trn_preds[:, 1],
                index=clf_data.trn_matrix.index,
                name=f'switch_{clf_desc}_pred_{drug}',
            )

            trn_rd = RocData.calculate(clf_data.trn_labels, trn_preds_series)
            trn_rd.save(data_path /
                        f'trn_roc_data_{drug}_{clf_desc}_fold_{i}.pickle')
            trn_roc_data[clf_desc].append(trn_rd)
            trn_pr_data[clf_desc].append(
                PrData.calculate(clf_data.trn_labels, trn_preds_series))

            preds = clf_obj.predict_proba(clf_data.val_matrix)
            preds_series = pd.Series(
                preds[:, 1],
                index=clf_data.val_matrix.index,
                name=f'switch_{clf_desc}_pred_{drug}',
            )

            rd = RocData.calculate(clf_data.val_labels, preds_series)
            rd.save(data_path / f'roc_data_{drug}_{clf_desc}_fold_{i}.pickle')

Example #8

Show file

patient_gene_set_muts = pd.DataFrame(0, index=muts.index, columns=range(len(entrez_gene_sets)))

for i, gene_set in enumerate(entrez_gene_sets):
    patient_gene_set_muts.loc[:, i] = muts.loc[:, gene_set].any(axis=1).astype(int)

pathway_mut_counts = patient_gene_set_muts.sum(axis=1)

gene_set_mut_matrix_path = data_path / 'gene_set_mut_matrix.pickle'
print('Saving gene set mutation matrix to', gene_set_mut_matrix_path)
patient_gene_set_muts.to_pickle(gene_set_mut_matrix_path)

pathway_mut_count_path = data_path / 'pathway_mut_counts.pickle'
print('Saving pathway mutation counts to', pathway_mut_count_path)
pathway_mut_counts.to_pickle(pathway_mut_count_path)

drugs = ['ai_all', 'arimidex']

feature_label_path = find_newest_data_path(f'compute_drug_features_labels_alpha_{args.alpha:.2f}')

for drug in drugs:
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, pathway_mut_counts.index)
    selected_labels = labels_all.loc[selected_samples]
    selected_counts = pathway_mut_counts.loc[selected_samples]

    rd = RocData.calculate(selected_labels, selected_counts)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'WExT Pathway Mutation Count ROC: {drug.title()}', output_path / f'{drug}_roc.pdf')