Example #1
import csv
import pickle
import re
from collections import defaultdict
from pathlib import Path

# create_data_path is the author's data_path_utils helper used throughout
# these scripts; IntervalTree is assumed to come from the third-party
# 'intervaltree' package, whose tree[start:end] = value syntax matches the
# usage below.
from data_path_utils import create_data_path
from intervaltree import IntervalTree
import pandas as pd


def main(ccds_path: Path):
    gene_set = set()
    interval_separator = re.compile(r'\s*,\s*')

    print('Reading CCDS data from', ccds_path)
    with open(ccds_path) as f:
        # Maps gene IDs to sets of intervals in string format, e.g. '129415220-129415360'.
        # Used to consolidate isoforms that are each listed separately.
        intervals_by_gene = defaultdict(set)

        r = csv.DictReader(f, delimiter='\t')
        # First line starts with "#", so the field name for chromosome is actually "#chromosome"
        for line in r:
            chrom_name = 'chr{}'.format(line['#chromosome'])
            intervals = interval_separator.split(
                line['cds_locations'].strip('[]'))
            gene_id = line['gene_id']
            gene_set.add(gene_id)
            for interval in intervals:
                intervals_by_gene[gene_id].add((chrom_name, interval))

    trees = defaultdict(IntervalTree)
    gene_length = pd.Series(0.0, index=sorted(intervals_by_gene))
    print('Read data for', len(intervals_by_gene), 'genes')

    for gene, interval_data in intervals_by_gene.items():
        chrom_dict = {}
        for chrom, interval_string in interval_data:
            # Skip invalid interval lists
            if interval_string == '-':
                continue
            if chrom not in chrom_dict:
                chrom_dict[chrom] = [float('inf'), 0]

            interval_pieces = interval_string.split('-')
            start = int(interval_pieces[0])
            end = int(interval_pieces[1])

            chrom_dict[chrom][0] = min(chrom_dict[chrom][0], start)
            chrom_dict[chrom][1] = max(chrom_dict[chrom][1], end)

        # If a gene has intervals on more than one chromosome, gene_length
        # keeps whichever chromosome is processed last
        for chrom in chrom_dict:
            trees[chrom][chrom_dict[chrom][0]:chrom_dict[chrom][1]] = gene
            # Kilobases, so divide by 1000
            gene_length.loc[gene] = (chrom_dict[chrom][1] -
                                     chrom_dict[chrom][0]) / 1000

    data_path = create_data_path('build_tree')
    pickle_file = data_path / 'trees.pickle'
    print('Saving interval trees to', pickle_file)

    data_to_save = {
        'trees': trees,
        'gene_length': gene_length,
        'intervals_by_gene': intervals_by_gene,
    }

    with open(pickle_file, 'wb') as f:
        pickle.dump(data_to_save, f, protocol=pickle.HIGHEST_PROTOCOL)
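

# A minimal usage sketch (not part of the original script): querying the
# pickled interval trees, assuming the 'intervaltree' package semantics noted
# above. The chromosome and position are made up for illustration.
def example_tree_query(pickle_file: Path):
    with open(pickle_file, 'rb') as f:
        saved = pickle.load(f)
    # Lookups return a set of Interval(begin, end, data) objects
    hits = saved['trees']['chr8'][129415300]
    return {interval.data for interval in hits}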
Example #2
def queue_jobs(srr_list_file: Path, pool: str, subprocesses: int):
    data_path = create_data_path(SCRIPT_LABEL)
    slurm_path = create_slurm_path(SCRIPT_LABEL)

    with open(srr_list_file) as f:
        srr_ids = [line.strip() for line in f]

    srr_sublist_count = ceil(len(srr_ids) / SLURM_ARRAY_MAX)
    srr_filename_digits = digits(srr_sublist_count)

    for i, srr_sublist_raw in enumerate(grouper(srr_ids, SLURM_ARRAY_MAX)):
        srr_sublist_path = data_path / SRR_LIST_FILENAME_TEMPLATE.format(
            i, srr_filename_digits)
        print(
            f'{i:0{srr_filename_digits}} Saving SRR sublist to {srr_sublist_path}'
        )

        srr_sublist = list(filter(None, srr_sublist_raw))

        with open(srr_sublist_path, 'w') as f:
            for srr_id in srr_sublist:
                print(srr_id, file=f)

        array_index_spec = f'0-{len(srr_sublist) - 1}'

        script_file = slurm_path / SCRIPT_FILENAME_TEMPLATE.format(
            i, srr_filename_digits)
        print(f'{i:0{srr_filename_digits}} Saving script to {script_file}')
        with open(script_file, 'w') as f:
            script_content = script_template.format(
                srr_list_file=srr_sublist_path.absolute(),
                pool=pool,
                subprocesses=subprocesses,
                stdout_path=script_file.with_suffix('.out'))
            print(script_content, file=f)

        slurm_command = [
            piece.format(
                array_index_spec=array_index_spec,
                script_filename=script_file,
            ) for piece in SBATCH_COMMAND_TEMPLATE
        ]
        print(f'{i:0{srr_filename_digits}} Running', ' '.join(slurm_command))
        check_call(slurm_command)
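

# Plausible sketches (their definitions are not in this excerpt) of the two
# helpers used above: 'grouper' looks like the classic itertools recipe, and
# 'digits' presumably computes the zero-padding width for sublist indexes.
from itertools import zip_longest


def grouper(iterable, n, fillvalue=None):
    # Collect data into fixed-length chunks, padding the last one
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)


def digits(n: int) -> int:
    # Width needed to print indexes 0 .. n-1
    return max(len(str(n - 1)), 1)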
Example #3
#!/usr/bin/env python3
import json
from pathlib import Path
import pickle

import attr
from data_path_utils import create_data_path
import pandas as pd

from parse_tcga_clinical_xml import Patient, find_clinical_xml_files, parse_clinical_xml

data_path = create_data_path('tcga_xml_to_json')

input_path = Path('~/data/tcga-clinical-all').expanduser()
print(f'Parsing XML files in {input_path}')
patients = [parse_clinical_xml(path) for path in find_clinical_xml_files(input_path)]
print(f'Read information for {len(patients)} patients')
converted_data = [
    attr.asdict(
        patient,
        filter=attr.filters.exclude(attr.fields(Patient).path),
    )
    for patient in patients
]

pickle_file = data_path / 'tcga_clinical_data.pickle'
print('Saving pickled data to', pickle_file)
with open(pickle_file, 'wb') as f:
    pickle.dump(patients, f)

output_file = data_path / 'tcga_clinical_data.json'
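# Presumed continuation (the excerpt is truncated here): write the
# attrs-converted records as JSON, matching the otherwise-unused 'json'
# import and 'output_file' above.
print('Saving JSON data to', output_file)
with open(output_file, 'w') as f:
    json.dump(converted_data, f)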
Example #4
    for n1, n2, weight in input_network.edges.data('weight'):
        new_n1 = edge_name_func([n1])
        new_n2 = edge_name_func([n2])
        new_node = edge_name_func(sorted([n1, n2]))
        if weight is None:
            new_weight = 1
        else:
            new_weight = sqrt(weight)

        network.add_edge(new_n1, new_node, weight=new_weight)
        network.add_edge(new_node, new_n2, weight=new_weight)

    return network


if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument('hippie_input_path', type=Path)
    args = p.parse_args()

    data_path = create_data_path('build_hippie_network')
    print('Reading HIPPIE network from', args.hippie_input_path)
    network = build_hippie_network(args.hippie_input_path)
    print('Network contains {} nodes, {} edges'.format(len(network.nodes()),
                                                       len(network.edges())))
    network_output_path = data_path / 'network.pickle'
    print('Saving network to', network_output_path)
    with network_output_path.open('wb') as f:
        pickle.dump(network, f, protocol=pickle.HIGHEST_PROTOCOL)
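

# A toy illustration (not from the original file) of the edge-splitting loop
# above: each weighted edge (n1, n2, w) becomes n1 -- 'n1_n2' -- n2 with
# weight sqrt(w). It assumes edge_name_func joins names with '_', consistent
# with the "'_' in column" edge tests in the other scripts here.
def _edge_split_demo():
    from math import sqrt
    import networkx as nx

    g = nx.Graph()
    g.add_edge('1', '2', weight=4.0)
    out = nx.Graph()
    for n1, n2, weight in g.edges.data('weight'):
        mid = '_'.join(sorted([n1, n2]))
        w = 1 if weight is None else sqrt(weight)
        out.add_edge(n1, mid, weight=w)
        out.add_edge(mid, n2, weight=w)
    # out now holds ('1', '1_2') and ('1_2', '2'), each with weight 2.0
    return out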
Example #5
from argparse import ArgumentParser
from pathlib import Path

from data_path_utils import (
    create_data_path,
    create_output_path,
    find_newest_data_path,
)
import pandas as pd

from gene_mappings import read_ensembl_entrez_mapping
from utils import sorted_union

p = ArgumentParser()
p.add_argument('gdc_manifest', type=Path)
args = p.parse_args()

data_path = create_data_path('consolidate_mrna_expression')

input_path = find_newest_data_path('query_cases_by_file') / 'raw_responses'

gdc_manifest = pd.read_table(args.gdc_manifest)
files_in_manifest = set(gdc_manifest.id)


def get_submitter_ids(data: dict):
    for key, value in data.items():
        if key == 'submitter_id':
            yield value
        if isinstance(value, dict):
            yield from get_submitter_ids(value)
        if isinstance(value, list):
            for sub_data in value:
                yield from get_submitter_ids(sub_data)
Example #6
    element_wise_min,
    weighted_correlation,
)

selected_cancer = 'brca'

p = ArgumentParser()
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)

if __name__ == '__main__':
    args = p.parse_args()
else:
    args = p.parse_args([])

label = f'treatment_features_alpha_{args.alpha:.2f}'
data_path = create_data_path(label)
output_path = create_output_path(label)

network_path = find_newest_data_path('build_hippie_network')
with (network_path / 'network.pickle').open('rb') as f:
    network = pickle.load(f)
nodes = sorted(network.nodes())
node_set = set(nodes)

w_prime = normalize(network)


def get_prop_vec(name, genes):
    s = pd.Series(0.0, index=nodes)
    gene_set = set(genes)
    genes_in_network = gene_set & node_set
Example #7
from argparse import ArgumentParser
import pickle

from data_path_utils import create_data_path, find_newest_data_path
import pandas as pd

from gene_mappings import read_hugo_entrez_mapping
from utils import DEFAULT_ALPHA, sorted_intersection
from propagation import propagate, normalize

DEFAULT_SUBPROCESSES = 2

p = ArgumentParser()
p.add_argument('-s', '--subprocesses', type=int, default=DEFAULT_SUBPROCESSES)
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)

if __name__ == '__main__':
    args = p.parse_args()
else:
    args = p.parse_args([])

data_path = create_data_path(f'propagate_mutations_alpha_{args.alpha:.2f}')

with (find_newest_data_path('build_hippie_network') / 'network.pickle').open('rb') as f:
    network = pickle.load(f)
print('Loaded network')

w_prime = normalize(network)
node_set = set(network.nodes())
nodes = sorted(node_set)
node_count = len(nodes)

with pd.HDFStore(find_newest_data_path('parse_tcga_mutations') / 'mutations.hdf5', 'r') as store:
    mutations = store['muts']
print('Read mutations')

expr = pd.read_pickle(find_newest_data_path('parse_cosmic_diffexpr') / 'brca_expr.pickle')
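

# A minimal sketch of what 'normalize' might compute; the 'propagation'
# module is not shown in these excerpts. This assumes the usual symmetric
# degree normalization used in network propagation, W' = D^(-1/2) W D^(-1/2),
# and a network with no isolated nodes.
def normalize_sketch(graph):
    import networkx as nx
    import numpy as np

    ordered_nodes = sorted(graph.nodes())
    w = nx.to_numpy_array(graph, nodelist=ordered_nodes, weight='weight')
    inv_sqrt_deg = 1 / np.sqrt(w.sum(axis=1))
    # Scale rows and columns by the inverse square root of the degree
    return w * inv_sqrt_deg * inv_sqrt_deg[:, np.newaxis]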
Example #8
        get_patient_barcode)
    selected_diffexpr = cosmic.loc[
        cosmic.adj_sample_name.apply(is_selected), :]

    expr_samples = sorted(set(selected_diffexpr.adj_sample_name))
    expr_genes = sorted(set(selected_diffexpr.GENE_NAME))

    sample_mapping = {sample: i for (i, sample) in enumerate(expr_samples)}
    gene_mapping = {gene: i for (i, gene) in enumerate(expr_genes)}

    sample_vec = [
        sample_mapping[sample] for sample in selected_diffexpr.adj_sample_name
    ]
    gene_vec = [gene_mapping[gene] for gene in selected_diffexpr.GENE_NAME]

    mat = scipy.sparse.coo_matrix(
        (selected_diffexpr.Z_SCORE, (sample_vec, gene_vec)))
    return pd.DataFrame(mat.todense(), index=expr_samples, columns=expr_genes)


if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument('cosmic_filename', type=Path)
    p.add_argument('patient_id_file', type=Path)
    args = p.parse_args()
    diffexpr = parse_cosmic_diffexpr(args.cosmic_filename,
                                     args.patient_id_file)

    data_path = create_data_path('parse_cosmic_diffexpr')
    diffexpr.to_pickle(data_path / 'diffexpr.pickle')
Example #9
            header_pieces = next(f).strip().split('\t')
            sample_id = get_patient_barcode(header_pieces[1])
            # Next line is "Composite Element REF...", ignore that too
            next(f)
            for line in f:
                if line:
                    gene, expr_str = line.strip().split('\t')
                    try:
                        expr_value = float(expr_str)
                    except ValueError:
                        expr_value = nan
                    expr_values.append(expr_value)
                    pairs.append((sample_id, gene))

    return pairs_and_values_to_dataframe(pairs, expr_values)
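

# A plausible sketch of pairs_and_values_to_dataframe (not defined in this
# excerpt), following the same sparse long-to-wide pattern as the COSMIC
# parser in the previous example.
def pairs_and_values_to_dataframe_sketch(pairs, values):
    import pandas as pd
    import scipy.sparse

    rows = sorted({r for r, _ in pairs})
    cols = sorted({c for _, c in pairs})
    row_idx = {r: i for i, r in enumerate(rows)}
    col_idx = {c: i for i, c in enumerate(cols)}
    mat = scipy.sparse.coo_matrix(
        (values,
         ([row_idx[r] for r, _ in pairs], [col_idx[c] for _, c in pairs])),
        shape=(len(rows), len(cols)),
    )
    return pd.DataFrame(mat.todense(), index=rows, columns=cols)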


if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument('tcga_mut_path', type=Path, nargs='+')
    args = p.parse_args()

    data_path = create_data_path('parse_tcga_mutations')
    print('Reading TCGA mutation data from', args.tcga_mut_path)
    muts = mafs_to_matrix(args.tcga_mut_path, get_patient_barcode)
    print('Mutation data shape:', muts.shape)
    mut_output_path = data_path / 'mutations.hdf5'
    print('Saving mutations to', mut_output_path)
    with pd.HDFStore(mut_output_path) as store:
        store['muts'] = muts
Example #10
def main():
    script_label = 'prop_edge_lbs_overlap'
    data_path = create_data_path(script_label)
    output_path = create_output_path(script_label)

    hem = read_hugo_entrez_mapping()

    lbs_mut_path = find_newest_data_path('intersect_muts_lbs')
    lbs_muts = pd.read_csv(lbs_mut_path / 'brca_lbs_muts.csv')

    prop_edge_path = find_newest_data_path(
        f'propagate_mutations_edges_alpha_{args.alpha:.2f}')
    with pd.HDFStore(prop_edge_path / 'data_propagated.hdf5') as store:
        mut_edge_prop = store['mutations']

    patients_with_lbs_muts = set(lbs_muts.patient)
    print('Patients with LBS mutations:', len(patients_with_lbs_muts))

    lbs_muts_by_patient = defaultdict(set)
    for i, row in lbs_muts.iterrows():
        if row.gene not in hem:
            print('Skipping gene', row.gene)
            continue
        lbs_muts_by_patient[row.patient].add(hem[row.gene])

    all_edge_set = {i for i in mut_edge_prop.columns if '_' in i}
    all_edges = sorted(all_edge_set)

    edge_prop = mut_edge_prop.loc[:, all_edges]

    shuffle_count = 100
    sorted_patients = sorted(patients_with_lbs_muts)
    patient_count = len(sorted_patients)
    ndcg = pd.Series(0.0, index=sorted_patients)
    shuffled_ndcg = pd.DataFrame(0.0,
                                 index=sorted_patients,
                                 columns=range(shuffle_count))
    lbs_edges_by_patient = pd.Series(0, index=sorted_patients)

    print('Loading shuffled data')
    prop_lbs_shuffle_path = find_newest_data_path('prop_edge_lbs_shuffle')

    with open(prop_lbs_shuffle_path / 'shuffled_muts_edges_by_patient.pickle',
              'rb') as f:
        d = pickle.load(f)
        shuffled_by_patient = d['shuffled_by_patient']
        selected_edges_by_patient = d['selected_edges_by_patient']
        shuffled_edges_by_patient = d['shuffled_edges_by_patient']

    ## NDCG analysis

    # For each patient, rank edges by propagated mutation score and assign a
    # label of 1 to an edge if either of its endpoints has an LBS mutation

    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(f'Computing NDCG for patient {i}/{patient_count}')

        edge_scores = mut_edge_prop.loc[patient, all_edges].copy().sort_values(
            ascending=False)
        selected_edges = selected_edges_by_patient[patient]
        shuffled_edge_list = shuffled_edges_by_patient[patient]

        relevance = np.array([e in selected_edges
                              for e in edge_scores.index]).astype(float)
        ndcg.loc[patient] = normalized_discounted_cumulative_gain(
            relevance)[-1]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_relevance = np.array(
                [e in shuffled_edges for e in edge_scores.index]).astype(float)
            shuffled_ndcg.loc[patient,
                              j] = normalized_discounted_cumulative_gain(
                                  shuffled_relevance)[-1]

    with pd.HDFStore(data_path / 'ndcg_data.hdf5') as store:
        store['ndcg'] = ndcg
        store['shuffled_ndcg'] = shuffled_ndcg
        store['lbs_edges_by_patient'] = lbs_edges_by_patient

    shuffled_ndcg_flat = shuffled_ndcg.unstack()
    #shuffled_ndcg_median = shuffled_ndcg.median(axis=1)

    with new_plot():
        ndcg.plot.hist(bins=hist_bin_count)
        plt.title('NDCG histogram')
        plt.xlabel(
            'Patient NDCG score: selection of LBS edges by propagated edge score'
        )

        figure_path = output_path / 'ndcg_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        shuffled_ndcg_flat.plot.hist(bins=hist_bin_count)
        plt.title('NDCG histogram')
        plt.xlabel(
            'Patient NDCG score: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_ndcg_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    ndcg_ks = scipy.stats.ks_2samp(ndcg, shuffled_ndcg_flat)
    ndcg_ks_pvalue_str = to_matplotlib_sci_notation(ndcg_ks[1])

    with new_plot():
        ndcg.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real NDCG',
            density=True,
        )
        shuffled_ndcg_flat.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Shuffled NDCG, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Patient NDCG score: selection of LBS edges by propagated edge score'
        )
        plt.legend()
        plt.figtext(
            0.89,
            0.7,
            f'Kolmogorov-Smirnov $P = {ndcg_ks_pvalue_str}$',
            horizontalalignment='right',
        )

        figure_path = output_path / 'ndcg_both_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    ## /NDCG analysis

    ## PR and ROC AUC analysis

    roc_auc = pd.Series(0.0, index=sorted_patients)
    average_pr_scores = pd.Series(0.0, index=sorted_patients)
    shuffled_roc_auc = pd.DataFrame(0.0,
                                    index=sorted_patients,
                                    columns=range(shuffle_count))
    shuffled_average_pr_scores = pd.DataFrame(0.0,
                                              index=sorted_patients,
                                              columns=range(shuffle_count))
    # Maps patient IDs to performance objects
    roc_data_objects = {}
    pr_data_objects = {}

    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(
            f'Computing classifier performance for patient {i}/{patient_count}'
        )
        selected_edges: Set[str] = selected_edges_by_patient[patient]
        edge_scores = mut_edge_prop.loc[patient, all_edges].copy()
        labels = np.array([e in selected_edges
                           for e in edge_scores.index]).astype(float)

        rd = RocData.calculate(labels, edge_scores)
        roc_data_objects[patient] = rd
        roc_auc.loc[patient] = rd.auc

        pr = PrData.calculate(labels, edge_scores)
        pr_data_objects[patient] = pr
        average_pr_scores.loc[patient] = average_precision_score(
            labels, edge_scores)

        shuffled_edge_list: List[Set[str]] = shuffled_edges_by_patient[patient]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_labels = np.array(
                [e in shuffled_edges for e in edge_scores.index]).astype(float)

            shuffled_rd = RocData.calculate(shuffled_labels, edge_scores)
            shuffled_roc_auc.loc[patient, j] = shuffled_rd.auc

            shuffled_average_pr_scores.loc[patient,
                                           j] = average_precision_score(
                                               shuffled_labels,
                                               edge_scores,
                                           )

    with pd.HDFStore(data_path / 'classifier_data.hdf5') as store:
        store['roc_auc'] = roc_auc
        store['average_pr'] = average_pr_scores
        store['shuffled_roc_auc'] = shuffled_roc_auc
        store['shuffled_average_pr'] = shuffled_average_pr_scores

    with new_plot():
        roc_auc.plot.hist(bins=hist_bin_count)
        plt.title('ROC AUC histogram')
        plt.xlabel(
            'Patient ROC AUC: selection of LBS edges by propagated edge score')

        figure_path = output_path / 'roc_auc_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    #shuffled_roc_auc_median = shuffled_roc_auc.median(axis=1)
    shuffled_roc_auc_flat = shuffled_roc_auc.unstack()

    with new_plot():
        shuffled_roc_auc_flat.plot.hist(bins=hist_bin_count)
        plt.title('ROC AUC histogram')
        plt.xlabel(
            'Patient ROC AUC: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_roc_auc_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    roc_auc_ks = scipy.stats.ks_2samp(roc_auc, shuffled_roc_auc_flat)
    roc_auc_ks_pvalue_str = to_matplotlib_sci_notation(roc_auc_ks[1])

    with new_plot():
        roc_auc.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real ROC AUC',
            density=True,
        )
        shuffled_roc_auc_flat.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Shuffled ROC AUC, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Patient ROC AUC: selection of LBS edges by propagated edge score')
        plt.legend()
        plt.figtext(
            0.14,
            0.7,
            f'Kolmogorov-Smirnov $P = {roc_auc_ks_pvalue_str}$',
            horizontalalignment='left',
        )

        figure_path = output_path / 'roc_auc_both_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        average_pr_scores.plot.hist(bins=hist_bin_count)
        plt.title('Average precision histogram')
        plt.xlabel(
            'Average precision: selection of LBS edges by propagated edge score'
        )

        figure_path = output_path / 'avg_prec_hist.pdf'
        print('Saving AP histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    shuffled_average_pr_median = shuffled_average_pr_scores.median(axis=1)
    with new_plot():
        shuffled_average_pr_median.plot.hist(bins=hist_bin_count)
        plt.title('Average precision histogram')
        plt.xlabel(
            'Average precision: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_avg_prec_hist.pdf'
        print('Saving AP histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    top_n = 4
    rest_uniform = 6
    sorted_pr_scores = average_pr_scores.dropna().sort_values()
    usable_patient_count = sorted_pr_scores.shape[0]
    # Top 4 patients, plus 6 spaced uniformly through the rest
    patient_indexes = list(
        np.linspace(
            0,
            usable_patient_count - 1 - top_n,
            num=rest_uniform,
        ).astype(int))
    patient_indexes.extend(
        range(usable_patient_count - top_n, usable_patient_count))
    selected_patients = sorted_pr_scores.index[list(reversed(patient_indexes))]

    with new_plot():
        plt.figure(figsize=(10, 10))
        for patient in selected_patients:
            prd = pr_data_objects[patient]
            plt.plot(prd.rec, prd.prec, label=patient)

        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.gca().set_aspect('equal', 'datalim')
        plt.legend()

        plt.title(
            f'Precision-recall: top {top_n} patients, uniform spacing of bottom {rest_uniform}'
        )

        figure_path = output_path / 'pr_selected.pdf'
        print('Saving selected PR curves to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    ## /PR and ROC AUC analysis

    ## Spearman correlation P-value analysis
    spearman_pvalues = pd.Series(0.0, index=sorted_patients)
    shuffled_spearman_pvalues = pd.DataFrame(0.0,
                                             index=sorted_patients,
                                             columns=range(shuffle_count))

    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(
            f'Computing Spearman correlation P-value for patient {i}/{patient_count}'
        )
        selected_edges: Set[str] = selected_edges_by_patient[patient]
        edge_scores = mut_edge_prop.loc[patient, all_edges].copy()
        labels = np.array([e in selected_edges
                           for e in edge_scores.index]).astype(float)

        spearman_result = scipy.stats.spearmanr(edge_scores, labels)
        spearman_pvalue = spearman_result[1]

        spearman_pvalues.loc[patient] = spearman_pvalue

        shuffled_edge_list: List[Set[str]] = shuffled_edges_by_patient[patient]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_labels = np.array(
                [e in shuffled_edges for e in edge_scores.index]).astype(float)

            shuffled_spearman_result = scipy.stats.spearmanr(
                edge_scores, shuffled_labels)
            shuffled_spearman_pvalue = shuffled_spearman_result[1]

            shuffled_spearman_pvalues.loc[patient,
                                          j] = shuffled_spearman_pvalue

    # NOTE: this reloads Spearman results from a hardcoded earlier run,
    # replacing the values just computed above
    sp_dir = Path('data/prop_edge_lbs_overlap_20180606-105746')
    with pd.HDFStore(sp_dir / 'spearman_pvalues.hdf5') as store:
        spearman_pvalues = store['spearman_pvalues']
        shuffled_spearman_pvalues = store['shuffled_spearman_pvalues']

    with pd.HDFStore(data_path / 'spearman_pvalues.hdf5') as store:
        store['spearman_pvalues'] = spearman_pvalues
        store['shuffled_spearman_pvalues'] = shuffled_spearman_pvalues

    nl10_spearman_pvalues_all = -np.log10(spearman_pvalues)
    nl10_spearman_pvalues = nl10_spearman_pvalues_all.loc[
        ~(nl10_spearman_pvalues_all.isnull())
        & ~(np.isinf(nl10_spearman_pvalues_all))]

    with new_plot():
        nl10_spearman_pvalues.plot.hist(bins=50)
        plt.title('Spearman $P$-value histogram')
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): LBS edges vs. prop. edge score'
        )

        figure_path = output_path / 'spearman_pvalue_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    shuffled_spearman_pvalues_flat = shuffled_spearman_pvalues.unstack()
    nl10_shuffled_spearman_pvalues_flat_all = -np.log10(
        shuffled_spearman_pvalues_flat)
    nl10_shuffled_spearman_pvalues_flat = nl10_shuffled_spearman_pvalues_flat_all.loc[
        ~(nl10_shuffled_spearman_pvalues_flat_all.isnull())
        & ~(np.isinf(nl10_shuffled_spearman_pvalues_flat_all))]

    with new_plot():
        nl10_shuffled_spearman_pvalues_flat.plot.hist(bins=50)
        plt.title('Spearman $P$-value histogram')
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): shuffled LBS edges vs. prop. edge score'
        )

        figure_path = output_path / 'shuffled_spearman_pvalue_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    spearman_ks = scipy.stats.ks_2samp(spearman_pvalues,
                                       shuffled_spearman_pvalues_flat)
    spearman_ks_pvalue_str = to_matplotlib_sci_notation(spearman_ks[1])

    with new_plot():
        nl10_spearman_pvalues.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real Spearman $P$-values',
            density=True,
        )
        nl10_shuffled_spearman_pvalues_flat.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Shuffled Spearman $P$-values, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): LBS edges vs. prop. edge score'
        )
        plt.legend()
        plt.figtext(
            0.89,
            0.7,
            f'Kolmogorov-Smirnov $P = {spearman_ks_pvalue_str}$',
            horizontalalignment='right',
        )

        figure_path = output_path / 'spearman_pvalues_both_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    ## /Spearman correlation P-value analysis

    ## Overall ROC AUC
    print('Creating binary LBS edge matrix')
    lbs_edge_matrix = pd.DataFrame(0,
                                   index=edge_prop.index,
                                   columns=edge_prop.columns)
    for patient, edges in selected_edges_by_patient.items():
        lbs_edge_matrix.loc[patient, list(edges)] = 1

    lbs_matrix_path = data_path / 'lbs_edge_matrix.hdf5'
    print('Saving LBS edge matrix to', lbs_matrix_path)
    with pd.HDFStore(lbs_matrix_path) as store:
        store['lbs_edge_matrix'] = lbs_edge_matrix

    sorted_flattened_edge_scores = edge_prop.unstack().sort_values(
        ascending=False)
    flattened_lbs_edges = lbs_edge_matrix.unstack()
    ordered_flattened_lbs_edges = flattened_lbs_edges.loc[
        sorted_flattened_edge_scores.index]

    flattened_rd = RocData.calculate(ordered_flattened_lbs_edges,
                                     sorted_flattened_edge_scores)
    flattened_rd_path = data_path / 'flattened_rd.pickle'
    print('Saving flattened vector RocData to', flattened_rd_path)
    with open(flattened_rd_path, 'wb') as f:
        pickle.dump(flattened_rd, f)
    ## /Overall ROC AUC

    ## Survival analysis

    edge_prop_survival_dir = find_newest_data_path('edge_prop_survival')
    survival_data = pd.read_csv(edge_prop_survival_dir /
                                'univariate_surv_results.csv',
                                index_col=0)
    # Indexed by gene/edge, across all patients
    surv_edge_sel = [('_' in i) for i in survival_data.index]
    edge_survival_data = survival_data.loc[surv_edge_sel, :]

    lbs_mut_edge_matrix = pd.DataFrame(
        0.0,
        index=sorted(selected_edges_by_patient),
        columns=all_edges,
    )
    for patient, edges in selected_edges_by_patient.items():
        lbs_mut_edge_matrix.loc[patient, list(edges)] = 1

    # Binary vector: is this edge incident on an LBS mutation in at least one patient?
    edges_with_lbs_muts = lbs_mut_edge_matrix.sum(axis=0).astype(bool)

    surv_pvalues_with_lbs = edge_survival_data.loc[edges_with_lbs_muts,
                                                   'pvalue']
    surv_pvalues_with_lbs.name = 'With LBS'
    surv_pvalues_without_lbs = edge_survival_data.loc[~edges_with_lbs_muts,
                                                      'pvalue']
    surv_pvalues_without_lbs.name = 'Without LBS'

    ks_res = scipy.stats.ks_2samp(surv_pvalues_with_lbs,
                                  surv_pvalues_without_lbs)

    with new_plot():
        plot_cdf(surv_pvalues_with_lbs)
        plot_cdf(surv_pvalues_without_lbs)

        plt.legend()
        plt.ylabel('CDF')
        plt.xlabel('Univariate Cox Regression $P$-value')

        figure_path = output_path / 'surv_pvalue_cdfs.pdf'
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        fig = plt.figure()

        surv_pvalues_with_lbs.plot.hist(bins=50, ax=plt.gca(), alpha=0.5)
        surv_pvalues_without_lbs.plot.hist(bins=50, ax=plt.gca(), alpha=0.5)

        plt.legend(loc='upper left')
        plt.xlabel('Univariate Cox Regression $P$-value')

        figure_path = output_path / 'surv_pvalue_hist.pdf'
        plt.savefig(figure_path, bbox_inches='tight')

    ## /Survival analysis

    ## Permuted survival analysis

    # Note: despite the P-value-style variable names below, these are R^2
    # values from the univariate survival fits
    pvalues = edge_survival_data.loc[:, 'r_square']

    ks_manual = (np.array([0.1, 0.2, 0.25, 0.3]) *
                 edge_prop.shape[0]).astype(int)
    ks_auto = np.logspace(1, 3, num=15).astype(int)
    ks = sorted(chain(ks_manual, ks_auto))

    edge_count = 1000

    template = dedent('''
    \\begin{{frame}}[plain]
     \\begin{{center}}
      \\includegraphics[width=0.7\\textwidth]{{survival_rsquare_hist_k_{k}}}
     \\end{{center}}
    \\end{{frame}}
    ''')

    with open(data_path / 'figure_include.tex', 'w') as f:
        for k in ks:
            print(template.format(k=k), file=f)

    for k in ks:
        print('Computing edge ranking results for k =', k)
        edge_ranking = get_rank_k_edge_values(edge_prop, k)
        sorted_edge_scores = edge_ranking.sort_values(ascending=False)
        top_edges = sorted_edge_scores.iloc[:edge_count]
        top_edge_pvalues = pvalues.loc[top_edges.index]
        bottom_edges = sorted_edge_scores.iloc[edge_count:]
        permutation_count = 1000
        permutation_pvalues = pd.Series(0.0, index=range(permutation_count))
        for i in range(permutation_count):
            edge_selection = np.random.choice(bottom_edges.index, size=100)
            selected_pvalues = pvalues.loc[edge_selection]
            comparison_result = scipy.stats.mannwhitneyu(
                top_edge_pvalues,
                selected_pvalues,
                alternative='greater',
            )
            permutation_pvalues.iloc[i] = comparison_result.pvalue

        nl10_permutation_pvalues = -np.log10(permutation_pvalues)

        with new_plot():
            plt.figure(figsize=(5, 5))
            nl10_permutation_pvalues.plot.hist(bins=50)
            title = (f'Survival $R^2$: top {edge_count} edges ($k = {k}$) vs. '
                     f'{permutation_count} random selections')
            plt.title(title)
            plt.xlabel('$- \\log_{10}$($P$-value) from Mann-Whitney $U$ test')

            nl10_0_05 = -np.log10(0.05)
            plt.axvline(x=nl10_0_05, color='#FF0000FF')

            nl10_0_001 = -np.log10(0.001)
            plt.axvline(x=nl10_0_001, color='#000000FF')

            figure_path = output_path / f'survival_rsquare_hist_k_{k}.pdf'
            print('Saving survival R^2 histogram to', figure_path)
            plt.savefig(figure_path, bbox_inches='tight')
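

# A hypothetical sketch of normalized_discounted_cumulative_gain as used
# above (the real helper is not shown in this excerpt): it must return the
# running NDCG at each rank, so that [-1] is the score over the full ranking.
def ndcg_sketch(relevance):
    import numpy as np

    discounts = 1 / np.log2(np.arange(2, relevance.size + 2))
    dcg = np.cumsum(relevance * discounts)
    ideal_dcg = np.cumsum(np.sort(relevance)[::-1] * discounts)
    # Zero wherever the ideal DCG is still zero (no relevant items yet)
    return np.divide(dcg, ideal_dcg,
                     out=np.zeros_like(dcg), where=ideal_dcg > 0)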
Example #11
#!/usr/bin/env python3
import csv
from pathlib import Path

from data_path_utils import create_data_path

data_path = create_data_path('parse_er_targets')

trrust_path = Path('~/data/trrust_rawdata.txt').expanduser()

er_names = {'ESR1', 'ESR2'}
er_targets = set()

with trrust_path.open() as f:
    r = csv.reader(f, delimiter='\t')
    for line in r:
        tf_name = line[0]
        target_name = line[1]
        if tf_name in er_names:
            er_targets.add(target_name)

target_path = data_path / 'er_targets.txt'
print('Saving {} ER targets to {}'.format(len(er_targets), target_path))
with target_path.open('w') as f:
    for target in sorted(er_targets):
        print(target, file=f)
Example #12
from collections import defaultdict
import json
from pathlib import Path
import pickle

from data_path_utils import (
    DATA_PATH,
    create_data_path,
    find_newest_data_path,
)
import numpy as np
import pandas as pd

from propagation import normalize, propagate
from utils import weighted_correlation

data_path = create_data_path('drug_targets')

with Path('~/data/drugs_targets.json').expanduser().open() as f:
    raw_data = json.load(f)

drug_target_data_all = pd.DataFrame(raw_data).T
# Select those with protein targets
drug_target_data = drug_target_data_all.loc[drug_target_data_all.gene_symbols.notnull(), :]

dtd_path = data_path / 'drug_targets.pickle'
print('Saving drug target data matrix to', dtd_path)
drug_target_data.to_pickle(dtd_path)

synonyms = defaultdict(list)
for row_name, synonym_csv in drug_target_data.synonyms.items():
    for synonym in synonym_csv.split(','):
Example #13
gene_metadata = pd.read_table(lbs_dir / 'mutLBSgene_basic.txt')
lbs_muts = pd.read_table(lbs_dir /
                         'mutLBSgene_tcga_cosmic_overlapped_mutations.txt')
lbs_genes = sorted(set(lbs_muts.gene))

missing_from_tcga = set(lbs_genes) - set(genes)
print('Genes in LBS data but not TCGA mutation data:', len(missing_from_tcga))

lbs_mut_set = set(zip(lbs_muts.gene, lbs_muts.nsSNV))
# set of tuples of (patient, gene, AA sub)
brca_muts_in_lbs = set()
for i, row in muts.iterrows():
    if isinstance(row.HGVSp_Short, float):
        # null
        continue
    aa_sub = strip_prefix(row.HGVSp_Short, 'p.')
    patient = get_patient_barcode(row.Tumor_Sample_Barcode)
    gene = row.Hugo_Symbol
    if (gene, aa_sub) in lbs_mut_set:
        item = (patient, gene, aa_sub)
        brca_muts_in_lbs.add(item)
        print('Found LBS mut:', item)

data_path = create_data_path('intersect_muts_lbs')

lbs_mut_df = pd.DataFrame(list(brca_muts_in_lbs))
lbs_mut_df.columns = ['patient', 'gene', 'aa_sub']
tcga_mut_path = data_path / 'brca_lbs_muts.csv'
print('Saving BRCA mutations in LBS DB to', tcga_mut_path)
lbs_mut_df.to_csv(tcga_mut_path, index=False)
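

# Plausible sketch of the strip_prefix helper used above (not shown in this
# excerpt); equivalent to str.removeprefix on Python 3.9+.
def strip_prefix_sketch(s: str, prefix: str) -> str:
    return s[len(prefix):] if s.startswith(prefix) else s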
Example #14
#!/usr/bin/env python3
from data_path_utils import (
    DATA_PATH,
    create_data_path,
    find_newest_data_path,
)
import pandas as pd
from scipy.stats import pearsonr

from utils import consolidate_data_frames, sorted_intersection

data_path = create_data_path('tcga_lincs_expr_features')

drugs = [
    'arimidex',
    'taxol',
]

tcga_expr_path = find_newest_data_path(
    'parse_cosmic_diffexpr') / 'brca_expr.pickle'
print('Reading expression data from', tcga_expr_path)
tcga_expr = pd.read_pickle(tcga_expr_path)

lincs_expr = pd.read_csv(
    find_newest_data_path('gct_drug_subset') / 'subset.csv',
    header=None,
    index_col=0,
)
lincs_expr.columns = drugs

lincs_genes = set(lincs_expr.index)
Example #15
    scale_continuous_df_cols,
    sorted_intersection,
)

p = ArgumentParser()
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)
p.add_argument('--plot-pca-components', action='store_true')
if '__file__' in globals():
    args = p.parse_args()
else:
    args = p.parse_args([])

entrez_hugo_mapping = read_entrez_hugo_mapping()

output_label = f'compute_drug_features_labels_alpha_{args.alpha:.2f}'
data_path = create_data_path(output_label)

drug_response_dir = find_newest_data_path('drug_response_labels')
tx_info_raw = pd.read_pickle(drug_response_dir / 'tx_info.pickle')

network_path = find_newest_data_path('build_hippie_network') / 'network.pickle'
print('Loading network from', network_path)
with network_path.open('rb') as f:
    network = pickle.load(f)

self_edge_count = 0
# HACK: remove self edges
for node in network.nodes:
    if network.has_edge(node, node):
        network.remove_edge(node, node)
        self_edge_count += 1
Example #16
#!/usr/bin/env python3
from data_path_utils import (
    create_data_path,
    find_newest_data_path,
)
import pandas as pd

from gene_mappings import read_entrez_hugo_mapping, read_hugo_entrez_mapping

data_path = create_data_path('dump_muts_for_nbs')

hugo_entrez_mapping = read_hugo_entrez_mapping()
entrez_hugo_mapping = read_entrez_hugo_mapping()

mut_path = find_newest_data_path('parse_tcga_mutations')

muts_all = pd.read_pickle(mut_path / 'mutations.pickle')
# Keep only genes that have a (non-empty) Entrez mapping
gene_sel = pd.Series(
    [
        bool(gene in hugo_entrez_mapping and hugo_entrez_mapping[gene])
        for gene in muts_all.columns
    ],
    index=muts_all.columns,
)
muts = muts_all.loc[:, gene_sel]
muts.columns = [hugo_entrez_mapping[gene] for gene in muts.columns]
# Collapse duplicate Entrez IDs (several HUGO symbols can map to one)
muts = muts.groupby(axis=1, level=-1).any().astype(int)
gene_symbols = [entrez_hugo_mapping[gene] for gene in muts.columns]

with open(data_path / 'gene_symbols.txt', 'w') as f:
    print('Gene', file=f)
Exemple #17
0
def main():
    script_label = 'prop_edge_lbs_shuffle'
    data_path = create_data_path(script_label)
    output_path = create_output_path(script_label)

    hem = read_hugo_entrez_mapping()

    lbs_mut_path = find_newest_data_path('intersect_muts_lbs')
    lbs_muts = pd.read_csv(lbs_mut_path / 'brca_lbs_muts.csv')

    prop_edge_path = find_newest_data_path(f'propagate_mutations_edges_alpha_{args.alpha:.2f}')
    with pd.HDFStore(prop_edge_path / 'data_propagated.hdf5') as store:
        mut_edge_prop = store['mutations']

    patients_with_lbs_muts = set(lbs_muts.patient)
    print('Patients with LBS mutations:', len(patients_with_lbs_muts))

    lbs_muts_by_patient = defaultdict(set)
    for i, row in lbs_muts.iterrows():
        if row.gene not in hem:
            print('Skipping gene', row.gene)
            continue
        lbs_muts_by_patient[row.patient].add(hem[row.gene])

    all_edge_set = {i for i in mut_edge_prop.columns if '_' in i}
    all_edges = sorted(all_edge_set)
    all_gene_set = set(mut_edge_prop.columns) - all_edge_set

    shuffle_count = 100
    sorted_patients = sorted(patients_with_lbs_muts)
    patient_count = len(sorted_patients)
    lbs_edges_by_patient = pd.Series(0, index=sorted_patients)

    # Assign a label of 1 to an edge if either of its nodes has an LBS mutation
    selected_edges_by_patient: Dict[str, Set[str]] = {}
    shuffled_edges_by_patient: Dict[str, List[Set[str]]] = {}

    shuffled_by_patient = {}
    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(f'Shuffling LBS mutations for patient {patient} ({i}/{patient_count})')
        muts = lbs_muts_by_patient[patient]
        mut_count = len(muts)
        # Sample from a sorted list (random.sample on a set is deprecated)
        # and store sets so the membership tests below are fast
        other_genes = sorted(all_gene_set - muts)
        shuffled_gene_sets = [
            set(sample(other_genes, mut_count)) for _ in range(shuffle_count)
        ]
        shuffled_by_patient[patient] = shuffled_gene_sets

    # TODO: parallelize this; it's too slow
    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(f'Computing selected/shuffled edges for patient {i}/{patient_count}')
        lbs_genes = lbs_muts_by_patient[patient]
        selected_edges: Set[str] = set()
        shuffled_edges: List[Set[str]] = [set() for _ in range(shuffle_count)]
        edge_scores = mut_edge_prop.loc[patient, all_edges].copy().sort_values(ascending=False)
        for g1_g2 in edge_scores.index:
            g1, g2 = g1_g2.split('_')
            if g1 in lbs_genes or g2 in lbs_genes:
                selected_edges.add(g1_g2)
            # TODO: clean up iteration
            for j, shuffled_genes in enumerate(shuffled_by_patient[patient]):
                if g1 in shuffled_genes or g2 in shuffled_genes:
                    shuffled_edges[j].add(g1_g2)
        lbs_edges_by_patient.loc[patient] = len(selected_edges)
        selected_edges_by_patient[patient] = selected_edges
        shuffled_edges_by_patient[patient] = shuffled_edges

    selected_edge_count = pd.Series(
        {patient: len(edges) for patient, edges in selected_edges_by_patient.items()}
    ).sort_index()

    with new_plot():
        selected_edge_count.plot.hist(bins=25)
        plt.xlabel('Number of LBS-incident edges')
        plt.ylabel('Patients')

        figure_path = output_path / 'lbs_edge_count.pdf'
        print('Saving LBS edge count histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    shuffled_data_path = data_path / 'shuffled_muts_edges_by_patient.pickle'
    print('Saving shuffled muts by patient to', shuffled_data_path)
    with open(shuffled_data_path, 'wb') as f:
        pickle.dump(
            {
                'shuffled_by_patient': shuffled_by_patient,
                'selected_edges_by_patient': selected_edges_by_patient,
                'shuffled_edges_by_patient': shuffled_edges_by_patient,
            },
            f,
        )
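

# A minimal sketch (not part of the original script) of one way to address
# the "parallelize this" TODO above: compute each patient's selected and
# shuffled edge sets in worker processes. Assumes a fork-based start method
# so the large inputs are inherited by the workers rather than re-pickled.
def _edges_for_patient(work_item):
    patient, lbs_genes, shuffled_gene_sets, edge_names = work_item
    selected = set()
    shuffled = [set() for _ in shuffled_gene_sets]
    for g1_g2 in edge_names:
        g1, g2 = g1_g2.split('_')
        if g1 in lbs_genes or g2 in lbs_genes:
            selected.add(g1_g2)
        for j, genes in enumerate(shuffled_gene_sets):
            if g1 in genes or g2 in genes:
                shuffled[j].add(g1_g2)
    return patient, selected, shuffled

# Usage sketch:
#   from multiprocessing import Pool
#   work = [(p, lbs_muts_by_patient[p], shuffled_by_patient[p], all_edges)
#           for p in sorted_patients]
#   with Pool() as pool:
#       for patient, selected, shuffled in pool.map(_edges_for_patient, work):
#           selected_edges_by_patient[patient] = selected
#           shuffled_edges_by_patient[patient] = shuffled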
Example #18
    tfs = set(tfn.split('::')[0] for tfn in hits.tf_name)
    return tfs


genes = get_genes()

mg = mygene.MyGeneInfo()

q = mg.querymany(
    genes,
    species='human',
    scopes=['ensemblgene', 'entrezgene', 'symbol'],
    fields=['ensembl.gene', 'entrezgene', 'symbol'],
)

data_path = create_data_path('query_mygene')

raw_results_path = data_path / 'raw_results.json'
print('Saving raw query results to', raw_results_path)
with open(raw_results_path, 'w') as f:
    json.dump(q, f)

mapping = {}
for result in q:
    if 'entrezgene' in result:
        mapping[result['query']] = str(result['entrezgene'])

mapping_path = data_path / 'mapping.json'
print('Saving mapping to', mapping_path)
with open(mapping_path, 'w') as f:
    json.dump(mapping, f)
Example #19
    PrData,
    RocData,
    plot_pr,
    plot_roc,
    sorted_intersection,
)

p = ArgumentParser()
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)
if '__file__' in globals():
    args = p.parse_args()
else:
    args = p.parse_args([])

script_label = 'ki67_analysis'
data_path = create_data_path(script_label)
output_path = create_output_path(script_label)

expr_path = find_newest_data_path('parse_cosmic_diffexpr')
expr = pd.read_pickle(expr_path / 'brca_expr.pickle')

gene = 'MKI67'


def ki67_analysis(drug: str):
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, gene]
Example #20
#!/usr/bin/env python3
from pathlib import Path

from data_path_utils import create_data_path
import pandas as pd

cell_line_expr_path = Path('~/data/brca-cell-lines/pmid26771497/breast_rnaseq_qn.txt').expanduser()

expr_data_raw = pd.read_table(cell_line_expr_path, index_col='gene_id')
# Index is Entrez ID, first column is HUGO symbol, second is ensembl ID
expr_data = expr_data_raw.iloc[:, 2:]
expr_data.columns = [col.upper() for col in expr_data.columns]
expr_data = expr_data.T

data_path = create_data_path('parse_pmid26771497_expr')
expr_path = data_path / 'expr.pickle'
print('Saving expression data to', expr_path)
expr_data.to_pickle(expr_path)