Example #1
def pmid_26033813_analysis(drug: str):
    tree = build_tree()

    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_labels = labels_all.loc[selected_samples]
    selected_expr = expr.loc[selected_samples, :]

    fit_tree(selected_expr, selected_labels, tree)

    predictions = pd.Series(
        [
            predict_sample(sample_name, selected_expr, tree)
            for sample_name in selected_samples
        ],
        index=selected_samples,
    )

    rd = RocData.calculate(selected_labels, predictions)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'PMID26033813 ROC: {drug.title()}',
             output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, predictions)
    plot_pr(pr, f'PMID26033813 Precision-Recall: {drug.title()}',
            output_path / f'{drug}_pr.pdf')
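
# For reference, a minimal sketch of the sorted_intersection helper used
# above (an assumption about utils.sorted_intersection: the sorted common
# elements of any number of index-like collections; the real helper may
# differ):
def sorted_intersection_sketch(*collections):
    sets = [set(c) for c in collections]
    return sorted(set.intersection(*sets))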
Example #2
def map_reads_to_genes(sam_path: Path) -> Tuple[pd.Series, pd.Series]:
    tree_path = find_newest_data_path('build_tree')
    with open(tree_path / 'trees.pickle', 'rb') as f:
        tree_data = pickle.load(f)

        trees = tree_data['trees']
        gene_length = tree_data['gene_length']
        intervals_by_gene = tree_data['intervals_by_gene']

    read_counts = pd.Series(0, index=sorted(intervals_by_gene))
    reads_mapped_to_genes = 0
    reads_aligned = 0
    reads_total = 0

    print('Reading', sam_path)
    with open(sam_path) as f:
        # Process each alignment line of the SAM file
        for line in f:
            # Skip header lines (those beginning with '@')
            if line.startswith('@'):
                continue

            reads_total += 1

            col = line.split('\t')
            flags = int(col[1])
            if flags & 0x4:
                # Skip unmapped reads (SAM flag 0x4)
                continue

            reads_aligned += 1

            # SAM columns: RNAME (chromosome), POS (1-based start), SEQ
            chrom = col[2]
            start = int(col[3])
            # Approximate the aligned span with the raw sequence length
            # (this ignores indels and clipping in the CIGAR string)
            read_length = len(col[9])
            end = start + read_length

            # Find all genes whose intervals overlap this read's alignment
            gene_ids = trees[chrom][start:end]

            # Reads shouldn't overlap multiple genes, but be safe and count
            # each read at most once toward the mapped-to-genes total
            if gene_ids:
                reads_mapped_to_genes += 1

            for gene_id in gene_ids:
                read_counts.loc[gene_id.data] += 1

    # RPKM = reads per kilobase of transcript per million reads, i.e.
    # counts * 10^9 / (total reads * gene length in bp)
    rpkm = (read_counts * 1_000_000_000) / (reads_total * gene_length)

    gene_count = (read_counts > 0).sum()

    summary_data = pd.Series({
        'read_count': reads_total,
        'reads_aligned': reads_aligned,
        'mapped_to_genes': reads_mapped_to_genes,
        'genes_with_reads': gene_count,
    })

    return rpkm, summary_data
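
# For context, a minimal sketch (assuming the intervaltree package) of how
# the 'trees' mapping queried above could have been produced by the
# build_tree step; the actual pipeline may differ:
from intervaltree import IntervalTree

def build_gene_trees(gene_intervals):
    """gene_intervals: iterable of (chrom, start, end, gene_id) tuples."""
    trees = {}
    for chrom, start, end, gene_id in gene_intervals:
        # Interval assignment stores gene_id as the .data attribute
        # retrieved by trees[chrom][start:end] above
        trees.setdefault(chrom, IntervalTree())[start:end] = gene_id
    return trees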
Example #3
def read_hugo_entrez_mapping() -> Dict[str, str]:
    print('Reading Hugo to Entrez mapping')
    hugo_entrez_mapping = {}
    entrez_ids = set()
    with open(HUGO_ENTREZ_MAPPING_PATH) as f:
        r = csv.DictReader(f, delimiter='\t')
        for row in r:
            entrez_id = row['Entrez Gene ID(supplied by NCBI)']
            entrez_ids.add(entrez_id)
            hugo_entrez_mapping[row['Approved Symbol']] = entrez_id
            for synonym in row['Synonyms'].split():
                hugo_entrez_mapping[synonym] = entrez_id

    # List of 2-tuples:
    #  [0] key in hugo_entrez_mapping
    #  [1] new key which will map to the same value
    manual_mapping_addition = [
        ('ADGRE5', 'CD97'),
    ]
    for key_existing, key_new in manual_mapping_addition:
        hugo_entrez_mapping[key_new] = hugo_entrez_mapping[key_existing]

    mygene_path = find_newest_data_path('query_mygene')
    with open(mygene_path / 'mapping.json') as f:
        hugo_entrez_mapping.update(json.load(f))

    print(
        'Read Hugo to Entrez mapping: {} gene names to {} Entrez IDs'.format(
            len(hugo_entrez_mapping),
            len(entrez_ids),
        )
    )

    return hugo_entrez_mapping
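
# Example usage: the manual alias added above means both symbols resolve to
# the same Entrez ID.
#   mapping = read_hugo_entrez_mapping()
#   assert mapping['CD97'] == mapping['ADGRE5']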
Example #4
def get_cluster_assignments() -> pd.Series:
    nbs_matlab_path = find_newest_data_path('nbs_matlab')
    matlab_file = nbs_matlab_path / 'nbs_cluster.mat'
    print('Reading', matlab_file)
    mat = scipy.io.loadmat(str(matlab_file))
    labels = mat['NBS_cc_label'].flatten()

    # baseSMData is a MATLAB struct array; unwrap it, then pull the patient
    # ID strings out of its fourth field
    mut_data = mat['baseSMData'][0][0]
    patient_ids = [p[0] for p in mut_data[3].flatten()]

    assert len(patient_ids) == len(labels)

    return pd.Series(labels, index=patient_ids)
Example #5
def ki67_analysis(drug: str):
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, gene]
    selected_labels = labels_all.loc[selected_samples]

    rd = RocData.calculate(selected_labels, selected_expr)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'Ki67 ROC: {drug.title()}', output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, selected_expr)
    plot_pr(pr, f'Ki67 Precision-Recall: {drug.title()}',
            output_path / f'{drug}_pr.pdf')
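
# For context, a minimal sketch of the RocData helper these analysis
# scripts share (an assumption: a thin wrapper over sklearn.metrics that
# pickles itself; the real class may carry more state):
import pickle
from dataclasses import dataclass

import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve

@dataclass
class RocDataSketch:
    fpr: np.ndarray
    tpr: np.ndarray
    auc: float

    @classmethod
    def calculate(cls, labels, scores):
        fpr, tpr, _ = roc_curve(labels, scores)
        return cls(fpr=fpr, tpr=tpr, auc=roc_auc_score(labels, scores))

    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self, f)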
Example #6
def pmid_26892682_analysis(drug: str):
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, selected_genes]
    selected_labels = labels_all.loc[selected_samples]

    ln_p_over_1_minus_p = selected_expr.values @ coefs.values
    probs = expit(ln_p_over_1_minus_p)

    rd = RocData.calculate(selected_labels, probs)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'PMID26892682 ROC: {drug.title()}',
             output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, probs)
    plot_pr(pr, f'PMID26892682 Precision-Recall: {drug.title()}',
            output_path / f'{drug}_pr.pdf')
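
# The model above is a fixed-coefficient logistic regression: the published
# coefficients give ln(p / (1 - p)) = x . beta, so p = expit(x . beta)
# = 1 / (1 + exp(-x . beta)). Quick sanity check of the inverse logit:
from scipy.special import expit
assert expit(0.0) == 0.5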
Example #7
#!/usr/bin/env python3
from data_path_utils import (
    create_data_path,
    find_newest_data_path,
)
import pandas as pd

from gene_mappings import read_entrez_hugo_mapping, read_hugo_entrez_mapping

data_path = create_data_path('dump_muts_for_nbs')

hugo_entrez_mapping = read_hugo_entrez_mapping()
entrez_hugo_mapping = read_entrez_hugo_mapping()

mut_path = find_newest_data_path('parse_tcga_mutations')

muts_all = pd.read_pickle(mut_path / 'mutations.pickle')
gene_sel = pd.Series(
    [
        # True only if the gene maps to a non-empty Entrez ID
        bool(gene in hugo_entrez_mapping and hugo_entrez_mapping[gene])
        for gene in muts_all.columns
    ],
    index=muts_all.columns,
)
muts = muts_all.loc[:, gene_sel]
muts.columns = [hugo_entrez_mapping[gene] for gene in muts.columns]
# Collapse duplicate Entrez ID columns: a patient counts as mutated in a
# gene if any of its synonymous columns is mutated
muts = muts.groupby(axis=1, level=-1).any().astype(int)
gene_symbols = [entrez_hugo_mapping[gene] for gene in muts.columns]

with open(data_path / 'gene_symbols.txt', 'w') as f:
    print('Gene', file=f)
Example #8
    sorted_intersection,
)

p = ArgumentParser()
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)

# Parse real CLI arguments when run as a script; fall back to defaults when
# this file is imported or run interactively (no __file__ in globals)
if '__file__' in globals():
    args = p.parse_args()
else:
    args = p.parse_args([])

script_label = 'snf_cluster_results'
data_path = create_data_path(script_label)
output_path = create_output_path(script_label)

snf_path = find_newest_data_path('run_snf')
snf_data = pd.read_csv(snf_path / 'clustering.csv', index_col=0)
snf_data.columns = ['cluster']
cluster_assignments = snf_data.iloc[:, 0]

clusters = sorted(set(cluster_assignments))


def get_cluster_permutation(permutation: List[int]) -> pd.Series:
    mapping = dict(zip(clusters, permutation))
    relabeled = pd.Series(
        [mapping[value] for value in cluster_assignments],
        index=cluster_assignments.index,
    )
    return relabeled
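
# Example usage (hypothetical clusters {1, 2, 3}): relabel 1 -> 3, 2 -> 1,
# 3 -> 2, e.g. to align cluster IDs with another clustering before
# computing agreement:
#   relabeled = get_cluster_permutation([3, 1, 2])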
Example #9
def main():
    script_label = 'prop_edge_lbs_shuffle'
    data_path = create_data_path(script_label)
    output_path = create_output_path(script_label)

    hem = read_hugo_entrez_mapping()

    lbs_mut_path = find_newest_data_path('intersect_muts_lbs')
    lbs_muts = pd.read_csv(lbs_mut_path / 'brca_lbs_muts.csv')

    prop_edge_path = find_newest_data_path(f'propagate_mutations_edges_alpha_{args.alpha:.2f}')
    with pd.HDFStore(prop_edge_path / 'data_propagated.hdf5') as store:
        mut_edge_prop = store['mutations']

    patients_with_lbs_muts = set(lbs_muts.patient)
    print('Patients with LBS mutations:', len(patients_with_lbs_muts))

    lbs_muts_by_patient = defaultdict(set)
    for i, row in lbs_muts.iterrows():
        if row.gene not in hem:
            print('Skipping gene', row.gene)
            continue
        lbs_muts_by_patient[row.patient].add(hem[row.gene])

    all_edge_set = {i for i in mut_edge_prop.columns if '_' in i}
    all_edges = sorted(all_edge_set)
    all_gene_set = set(mut_edge_prop.columns) - all_edge_set

    shuffle_count = 100
    sorted_patients = sorted(patients_with_lbs_muts)
    patient_count = len(sorted_patients)
    lbs_edges_by_patient = pd.Series(0, index=sorted_patients)

    # For each patient, an edge is "selected" (label 1) if either of its
    # endpoint nodes carries a LBS mutation
    selected_edges_by_patient: Dict[str, Set[str]] = {}
    shuffled_edges_by_patient: Dict[str, List[Set[str]]] = {}

    shuffled_by_patient = {}
    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(f'Shuffling LBS mutations for patient {patient} ({i}/{patient_count})')
        muts = lbs_muts_by_patient[patient]
        mut_count = len(muts)
        # random.sample needs a sequence, so materialize the candidate genes
        # once per patient rather than rebuilding them for every shuffle
        other_genes = list(all_gene_set - muts)
        shuffled_by_patient[patient] = [
            set(sample(other_genes, mut_count))
            for _ in range(shuffle_count)
        ]

    # TODO: parallelize this; it's too slow
    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(f'Computing selected/shuffled edges for patient {i}/{patient_count}')
        lbs_genes = lbs_muts_by_patient[patient]
        selected_edges: Set[str] = set()
        shuffled_edges: List[Set[str]] = [set() for _ in range(shuffle_count)]
        edge_scores = mut_edge_prop.loc[patient, all_edges].copy().sort_values(ascending=False)
        for g1_g2 in edge_scores.index:
            g1, g2 = g1_g2.split('_')
            if g1 in lbs_genes or g2 in lbs_genes:
                selected_edges.add(g1_g2)
            # TODO: clean up iteration
            for j, shuffled_genes in enumerate(shuffled_by_patient[patient]):
                if g1 in shuffled_genes or g2 in shuffled_genes:
                    shuffled_edges[j].add(g1_g2)
        lbs_edges_by_patient.loc[patient] = len(selected_edges)
        selected_edges_by_patient[patient] = selected_edges
        shuffled_edges_by_patient[patient] = shuffled_edges

    selected_edge_count = pd.Series(
        {patient: len(edges) for patient, edges in selected_edges_by_patient.items()}
    ).sort_index()

    with new_plot():
        selected_edge_count.plot.hist(bins=25)
        plt.xlabel('Number of LBS-incident edges')
        plt.ylabel('Patients')

        figure_path = output_path / 'lbs_edge_count.pdf'
        print('Saving LBS edge count histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    shuffled_data_path = data_path / 'shuffled_muts_edges_by_patient.pickle'
    print('Saving shuffled muts by patient to', shuffled_data_path)
    with open(shuffled_data_path, 'wb') as f:
        pickle.dump(
            {
                'shuffled_by_patient': shuffled_by_patient,
                'selected_edges_by_patient': selected_edges_by_patient,
                'shuffled_edges_by_patient': shuffled_edges_by_patient,
            },
            f,
        )
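
# For reference: edge columns of mut_edge_prop are assumed to be named by
# joining the two endpoint gene IDs with '_' (cf. the edge_name_func used
# when building the edge-propagation input), which is what makes
# "'_' in column" a valid edge test above. A sketch of that convention:
def join_string_keys_sketch(g1: str, g2: str) -> str:
    return f'{g1}_{g2}'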
Example #10
p = ArgumentParser()
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)

if '__file__' in globals():
    args = p.parse_args()
else:
    args = p.parse_args([])

output_path = create_output_path('consolidated_roc_plot')

fold_count = 5

# [0] are labels, [1] are data paths
single_curve_input_paths = [
    (r'Turnbull $\it{et}$ $\it{al.}$',
     find_newest_data_path('pmid_26033813_analysis')),
    (r'Reijm $\it{et}$ $\it{al.}$',
     find_newest_data_path('pmid_26892682_analysis')),
    (r'WExT Mutation Set Count', find_newest_data_path('wext_mut_sets')),
    (r'Network Based Stratification',
     find_newest_data_path('nbs_cluster_results')),
    (r'Similarity Network Fusion',
     find_newest_data_path('snf_cluster_results')),
]

crossval_input_paths = [
    (
        'Full, {clf}',
        find_newest_data_path(
            f'tcga_train_response_stratify_alpha_{args.alpha:.2f}'),
    ),
Example #11
)

p = ArgumentParser()
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)
p.add_argument('--plot-pca-components', action='store_true')
if '__file__' in globals():
    args = p.parse_args()
else:
    args = p.parse_args([])

entrez_hugo_mapping = read_entrez_hugo_mapping()

output_label = f'compute_drug_features_labels_alpha_{args.alpha:.2f}'
data_path = create_data_path(output_label)

drug_response_dir = find_newest_data_path('drug_response_labels')
tx_info_raw = pd.read_pickle(drug_response_dir / 'tx_info.pickle')

network_path = find_newest_data_path('build_hippie_network') / 'network.pickle'
print('Loading network from', network_path)
with network_path.open('rb') as f:
    network = pickle.load(f)

self_edge_count = 0
# HACK: remove self edges
for node in network.nodes:
    if network.has_edge(node, node):
        network.remove_edge(node, node)
        self_edge_count += 1
print(f'Removed {self_edge_count} self-edges from network')
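
# Equivalent using networkx's built-in helper (assuming networkx is
# imported as nx; same effect as the loop above, minus the count):
#   network.remove_edges_from(list(nx.selfloop_edges(network)))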
Example #12
selected_cancer = 'brca'

p = ArgumentParser()
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)

if __name__ == '__main__':
    args = p.parse_args()
else:
    args = p.parse_args([])

label = f'treatment_features_alpha_{args.alpha:.2f}'
data_path = create_data_path(label)
output_path = create_output_path(label)

network_path = find_newest_data_path('build_hippie_network')
with (network_path / 'network.pickle').open('rb') as f:
    network = pickle.load(f)
nodes = sorted(network.nodes())
node_set = set(nodes)

w_prime = normalize(network)


def get_prop_vec(name, genes):
    s = pd.Series(0.0, index=nodes)
    gene_set = set(genes)
    genes_in_network = gene_set & node_set
    genes_not_in_network = gene_set - node_set
    print('Drug {}: {} genes in network, {} not'.format(
        name,
Example #13
    create_data_path,
    create_output_path,
    find_newest_data_path,
)
import pandas as pd

from gene_mappings import read_ensembl_entrez_mapping
from utils import sorted_union

p = ArgumentParser()
p.add_argument('gdc_manifest', type=Path)
args = p.parse_args()

data_path = create_data_path('consolidate_mrna_expression')

input_path = find_newest_data_path('query_cases_by_file') / 'raw_responses'

gdc_manifest = pd.read_table(args.gdc_manifest)
files_in_manifest = set(gdc_manifest.id)


def get_submitter_ids(data: dict):
    for key, value in data.items():
        if key == 'submitter_id':
            yield value
        if isinstance(value, dict):
            yield from get_submitter_ids(value)
        if isinstance(value, list):
            for sub_data in value:
                yield from get_submitter_ids(sub_data)
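
# Example usage on a nested GDC-style response (hypothetical structure):
#   data = {'case': {'submitter_id': 'TCGA-XX-0001',
#                    'samples': [{'submitter_id': 'TCGA-XX-0001-01'}]}}
#   sorted(get_submitter_ids(data))
#   # -> ['TCGA-XX-0001', 'TCGA-XX-0001-01']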
Example #14
script_label = 'wext_mut_sets'
data_path = create_data_path(script_label)
output_path = create_output_path(script_label)

p = ArgumentParser()
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)
if '__file__' in globals():
    args = p.parse_args()
else:
    args = p.parse_args([])

hugo_entrez_mapping = read_hugo_entrez_mapping()

# The wext_results data directory was created manually, not by a pipeline
# script
input_path = find_newest_data_path('wext_results')

gene_set_data = pd.read_table(input_path / 'tcga-exclusive-sets-sampled-sets.tsv')

cols = list(gene_set_data.columns)
cols[:2] = ['gene_set', 'pvalue']
gene_set_data.columns = cols

cutoff = 0.002
selected_gene_set_strs = gene_set_data.loc[gene_set_data.pvalue < cutoff, 'gene_set']
selected_gene_sets = [set(gene_set.split(',')) for gene_set in selected_gene_set_strs]

entrez_gene_sets = [
    {hugo_entrez_mapping[gene] for gene in gene_set if gene in hugo_entrez_mapping}
    for gene_set in selected_gene_sets
]
Example #15
from propagation import propagate, normalize

DEFAULT_SUBPROCESSES = 2

p = ArgumentParser()
p.add_argument('-s', '--subprocesses', type=int, default=DEFAULT_SUBPROCESSES)
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)

if __name__ == '__main__':
    args = p.parse_args()
else:
    args = p.parse_args([])

data_path = create_data_path(f'propagate_mutations_alpha_{args.alpha:.2f}')

with (find_newest_data_path('build_hippie_network') / 'network.pickle').open('rb') as f:
    network = pickle.load(f)
print('Loaded network')

w_prime = normalize(network)
node_set = set(network.nodes())
nodes = sorted(node_set)
node_count = len(nodes)

with pd.HDFStore(find_newest_data_path('parse_tcga_mutations') / 'mutations.hdf5', 'r') as store:
    mutations = store['muts']
print('Read mutations')

expr = pd.read_pickle(find_newest_data_path('parse_cosmic_diffexpr') / 'brca_expr.pickle')
print('Read log-fold expression with Hugo symbols')
cutoff = 2
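
# For context, a sketch of the normalize()/propagate() pair from the
# propagation module (an assumption: degree-normalized network propagation
# in the style of Vanunu et al. 2010; the real module may differ):
import networkx as nx
import numpy as np

def normalize_sketch(network: nx.Graph) -> np.ndarray:
    nodes = sorted(network.nodes())
    w = nx.to_numpy_array(network, nodelist=nodes)
    d = np.sqrt(w.sum(axis=1))
    d[d == 0] = 1  # guard isolated nodes against division by zero
    return w / np.outer(d, d)  # W' = D^{-1/2} W D^{-1/2}

def propagate_sketch(w_prime: np.ndarray, y: np.ndarray,
                     alpha: float, iterations: int = 100) -> np.ndarray:
    # Iterate F <- alpha * W' F + (1 - alpha) * Y until (approximate)
    # convergence
    f = y.copy()
    for _ in range(iterations):
        f = alpha * (w_prime @ f) + (1 - alpha) * y
    return f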
Example #16
    plot_roc,
    sorted_intersection,
)

p = ArgumentParser()
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)
if '__file__' in globals():
    args = p.parse_args()
else:
    args = p.parse_args([])

script_label = 'ki67_analysis'
data_path = create_data_path(script_label)
output_path = create_output_path(script_label)

expr_path = find_newest_data_path('parse_cosmic_diffexpr')
expr = pd.read_pickle(expr_path / 'brca_expr.pickle')

gene = 'MKI67'


def ki67_analysis(drug: str):
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, gene]
    selected_labels = labels_all.loc[selected_samples]

    rd = RocData.calculate(selected_labels, selected_expr)
Example #17
from propagation import propagate, normalize

DEFAULT_SUBPROCESSES = 2

p = ArgumentParser()
p.add_argument('-s', '--subprocesses', type=int, default=DEFAULT_SUBPROCESSES)
p.add_argument('--alpha', type=float, default=DEFAULT_ALPHA)

if __name__ == '__main__':
    args = p.parse_args()
else:
    args = p.parse_args([])

data_path = create_data_path(f'propagate_mutations_edges_alpha_{args.alpha:.2f}')

with (find_newest_data_path('build_hippie_network') / 'network.pickle').open('rb') as f:
    orig_network = pickle.load(f)
print('Loaded network')

self_edge_count = 0
# HACK: remove self edges
for node in orig_network.nodes:
    if orig_network.has_edge(node, node):
        orig_network.remove_edge(node, node)
        self_edge_count += 1
print(f'Removed {self_edge_count} self-edges from original network')

network = insert_dummy_edge_nodes(orig_network, edge_name_func=join_string_keys)

w_prime = normalize(network)
node_set = set(network.nodes())
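
# A minimal sketch of what insert_dummy_edge_nodes is assumed to do: split
# every edge (u, v) with a dummy node named for the edge, so propagation
# assigns a score to each edge as well as each gene (the real helper may
# differ, e.g. in how it carries edge weights):
import networkx as nx

def insert_dummy_edge_nodes_sketch(g: nx.Graph, edge_name_func) -> nx.Graph:
    new_g = nx.Graph()
    for u, v in g.edges():
        edge_node = edge_name_func(u, v)
        new_g.add_edge(u, edge_node)
        new_g.add_edge(edge_node, v)
    return new_g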
Example #18
        'HIF1A',
        'CYP17A1',
        'HSD17B1',
        'ARSC',
        'NFKB1',
        'HSD17B3',
        'BLM',
        'NR3C1',
        'HSD11B2',
    ],
    'femara': [
        'ARSC',
        'CYP11B1',
        'CYP11B2',
        'CYP19A1',
        'CYP26A1',
    ],
}

er_target_path = find_newest_data_path('parse_er_targets') / 'er_targets.txt'
with er_target_path.open() as f:
    print('Assigning ER targets from', er_target_path)
    targets_raw['er_targets'] = set(line.strip() for line in f)

targets = {
    drug: [entrez_id_mapping[target] for target in target_list]
    for drug, target_list in targets_raw.items()
}

drugs = sorted(targets)
Example #19
def main():
    script_label = 'prop_edge_lbs_overlap'
    data_path = create_data_path(script_label)
    output_path = create_output_path(script_label)

    hem = read_hugo_entrez_mapping()

    lbs_mut_path = find_newest_data_path('intersect_muts_lbs')
    lbs_muts = pd.read_csv(lbs_mut_path / 'brca_lbs_muts.csv')

    prop_edge_path = find_newest_data_path(
        f'propagate_mutations_edges_alpha_{args.alpha:.2f}')
    with pd.HDFStore(prop_edge_path / 'data_propagated.hdf5') as store:
        mut_edge_prop = store['mutations']

    patients_with_lbs_muts = set(lbs_muts.patient)
    print('Patients with LBS mutations:', len(patients_with_lbs_muts))

    lbs_muts_by_patient = defaultdict(set)
    for i, row in lbs_muts.iterrows():
        if row.gene not in hem:
            print('Skipping gene', row.gene)
            continue
        lbs_muts_by_patient[row.patient].add(hem[row.gene])

    all_edge_set = {i for i in mut_edge_prop.columns if '_' in i}
    all_edges = sorted(all_edge_set)

    edge_prop = mut_edge_prop.loc[:, all_edges]

    shuffle_count = 100
    sorted_patients = sorted(patients_with_lbs_muts)
    patient_count = len(sorted_patients)
    ndcg = pd.Series(0.0, index=sorted_patients)
    shuffled_ndcg = pd.DataFrame(0.0,
                                 index=sorted_patients,
                                 columns=range(shuffle_count))
    lbs_edges_by_patient = pd.Series(0, index=sorted_patients)

    print('Loading shuffled data')
    prop_lbs_shuffle_path = find_newest_data_path('prop_edge_lbs_shuffle')

    with open(prop_lbs_shuffle_path / 'shuffled_muts_edges_by_patient.pickle',
              'rb') as f:
        d = pickle.load(f)
        shuffled_by_patient = d['shuffled_by_patient']
        selected_edges_by_patient = d['selected_edges_by_patient']
        shuffled_edges_by_patient = d['shuffled_edges_by_patient']

    ## NDCG analysis

    # For each patient, rank edges by propagated mutation score and assign a
    # relevance label of 1 to edges with a LBS mutation in either endpoint

    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(f'Computing NDCG for patient {i}/{patient_count}')

        edge_scores = mut_edge_prop.loc[patient, all_edges].copy().sort_values(
            ascending=False)
        selected_edges = selected_edges_by_patient[patient]
        shuffled_edge_list = shuffled_edges_by_patient[patient]

        relevance = np.array([e in selected_edges
                              for e in edge_scores.index]).astype(float)
        ndcg.loc[patient] = normalized_discounted_cumulative_gain(
            relevance)[-1]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_relevance = np.array(
                [e in shuffled_edges for e in edge_scores.index]).astype(float)
            shuffled_ndcg.loc[patient,
                              j] = normalized_discounted_cumulative_gain(
                                  shuffled_relevance)[-1]

    with pd.HDFStore(data_path / 'ndcg_data.hdf5') as store:
        store['ndcg'] = ndcg
        store['shuffled_ndcg'] = shuffled_ndcg
        store['lbs_edges_by_patient'] = lbs_edges_by_patient

    shuffled_ndcg_flat = shuffled_ndcg.unstack()
    #shuffled_ndcg_median = shuffled_ndcg.median(axis=1)

    with new_plot():
        ndcg.plot.hist(bins=hist_bin_count)
        plt.title('NDCG histogram')
        plt.xlabel(
            'Patient NDCG score: selection of LBS edges by propagated edge score'
        )

        figure_path = output_path / 'ndcg_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        shuffled_ndcg_flat.plot.hist(bins=hist_bin_count)
        plt.title('NDCG histogram')
        plt.xlabel(
            'Patient NDCG score: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_ndcg_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    ndcg_ks = scipy.stats.ks_2samp(ndcg, shuffled_ndcg_flat)
    ndcg_ks_pvalue_str = to_matplotlib_sci_notation(ndcg_ks[1])

    with new_plot():
        ndcg.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real NDCG',
            density=True,
        )
        shuffled_ndcg_flat.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Shuffled NDCG, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Patient NDCG score: selection of LBS edges by propagated edge score'
        )
        plt.legend()
        plt.figtext(
            0.89,
            0.7,
            f'Kolmogorov-Smirnov $P = {ndcg_ks_pvalue_str}$',
            horizontalalignment='right',
        )

        figure_path = output_path / 'ndcg_both_hist.pdf'
        print('Saving NDCG histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    ## /NDCG analysis

    ## PR and ROC AUC analysis

    roc_auc = pd.Series(0.0, index=sorted_patients)
    average_pr_scores = pd.Series(0.0, index=sorted_patients)
    shuffled_roc_auc = pd.DataFrame(0.0,
                                    index=sorted_patients,
                                    columns=range(shuffle_count))
    shuffled_average_pr_scores = pd.DataFrame(0.0,
                                              index=sorted_patients,
                                              columns=range(shuffle_count))
    # Maps patient IDs to performance objects
    roc_data_objects = {}
    pr_data_objects = {}

    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(
            f'Computing classifier performance for patient {i}/{patient_count}'
        )
        selected_edges: Set[str] = selected_edges_by_patient[patient]
        edge_scores = mut_edge_prop.loc[patient, all_edges].copy()
        labels = np.array([e in selected_edges
                           for e in edge_scores.index]).astype(float)

        rd = RocData.calculate(labels, edge_scores)
        roc_data_objects[patient] = rd
        roc_auc.loc[patient] = rd.auc

        pr = PrData.calculate(labels, edge_scores)
        pr_data_objects[patient] = pr
        average_pr_scores.loc[patient] = average_precision_score(
            labels, edge_scores)

        shuffled_edge_list: List[Set[str]] = shuffled_edges_by_patient[patient]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_labels = np.array(
                [e in shuffled_edges for e in edge_scores.index]).astype(float)

            shuffled_rd = RocData.calculate(shuffled_labels, edge_scores)
            shuffled_roc_auc.loc[patient, j] = shuffled_rd.auc

            shuffled_average_pr_scores.loc[patient,
                                           j] = average_precision_score(
                                               shuffled_labels,
                                               edge_scores,
                                           )

    with pd.HDFStore(data_path / 'classifier_data.hdf5') as store:
        store['roc_auc'] = roc_auc
        store['average_pr'] = average_pr_scores
        store['shuffled_roc_auc'] = shuffled_roc_auc
        store['shuffled_average_pr'] = shuffled_average_pr_scores

    with new_plot():
        roc_auc.plot.hist(bins=hist_bin_count)
        plt.title('ROC AUC histogram')
        plt.xlabel(
            'Patient ROC AUC: selection of LBS edges by propagated edge score')

        figure_path = output_path / 'roc_auc_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    #shuffled_roc_auc_median = shuffled_roc_auc.median(axis=1)
    shuffled_roc_auc_flat = shuffled_roc_auc.unstack()

    with new_plot():
        shuffled_roc_auc_flat.plot.hist(bins=hist_bin_count)
        plt.title('ROC AUC histogram')
        plt.xlabel(
            'Patient ROC AUC: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_roc_auc_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    roc_auc_ks = scipy.stats.ks_2samp(roc_auc, shuffled_roc_auc_flat)
    roc_auc_ks_pvalue_str = to_matplotlib_sci_notation(roc_auc_ks[1])

    with new_plot():
        roc_auc.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real ROC AUC',
            density=True,
        )
        shuffled_roc_auc_flat.plot.hist(
            bins=50,
            alpha=0.8,
            label='Shuffled ROC AUC, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Patient ROC AUC: selection of LBS edges by propagated edge score')
        plt.legend()
        plt.figtext(
            0.14,
            0.7,
            f'Kolmogorov-Smirnov $P = {roc_auc_ks_pvalue_str}$',
            horizontalalignment='left',
        )

        figure_path = output_path / 'roc_auc_both_hist.pdf'
        print('Saving ROC AUC histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        average_pr_scores.plot.hist(bins=hist_bin_count)
        plt.title('Average precision histogram')
        plt.xlabel(
            'Average precision: selection of LBS edges by propagated edge score'
        )

        figure_path = output_path / 'avg_prec_hist.pdf'
        print('Saving AP histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    shuffled_average_pr_median = shuffled_average_pr_scores.median(axis=1)
    with new_plot():
        shuffled_average_pr_median.plot.hist(bins=hist_bin_count)
        plt.title('Average precision histogram')
        plt.xlabel(
            'Average precision: selection of shuffled LBS edges by propagated edge score'
        )

        figure_path = output_path / 'shuffled_avg_prec_hist.pdf'
        print('Saving AP histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    top_n = 4
    rest_uniform = 6
    sorted_pr_scores = average_pr_scores.dropna().sort_values()
    usable_patient_count = sorted_pr_scores.shape[0]
    # Top 4 patients, plus 6 spaced uniformly through the rest
    patient_indexes = list(
        np.linspace(
            0,
            usable_patient_count - 1 - top_n,
            num=rest_uniform,
        ).astype(int))
    patient_indexes.extend(
        range(usable_patient_count - top_n, usable_patient_count))
    selected_patients = sorted_pr_scores.index[list(reversed(patient_indexes))]

    with new_plot():
        plt.figure(figsize=(10, 10))
        for patient in selected_patients:
            prd = pr_data_objects[patient]
            plt.plot(prd.rec, prd.prec, label=patient)

        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.gca().set_aspect('equal', 'datalim')
        plt.legend()

        plt.title(
            f'Precision-recall: top {top_n} patients, uniform spacing of bottom {rest_uniform}'
        )

        figure_path = output_path / 'pr_selected.pdf'
        print('Saving selected PR curves to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    ## /PR and ROC AUC analysis

    ## Spearman correlation P-value analysis
    spearman_pvalues = pd.Series(0.0, index=sorted_patients)
    shuffled_spearman_pvalues = pd.DataFrame(0.0,
                                             index=sorted_patients,
                                             columns=range(shuffle_count))

    for i, patient in enumerate(patients_with_lbs_muts, 1):
        print(
            f'Computing Spearman correlation P-value for patient {i}/{patient_count}'
        )
        selected_edges: Set[str] = selected_edges_by_patient[patient]
        edge_scores = mut_edge_prop.loc[patient, all_edges].copy()
        labels = np.array([e in selected_edges
                           for e in edge_scores.index]).astype(float)

        spearman_result = scipy.stats.spearmanr(edge_scores, labels)
        spearman_pvalue = spearman_result[1]

        spearman_pvalues.loc[patient] = spearman_pvalue

        shuffled_edge_list: List[Set[str]] = shuffled_edges_by_patient[patient]

        for j, shuffled_edges in enumerate(shuffled_edge_list):
            shuffled_labels = np.array(
                [e in shuffled_edges for e in edge_scores.index]).astype(float)

            shuffled_spearman_result = scipy.stats.spearmanr(
                edge_scores, shuffled_labels)
            shuffled_spearman_pvalue = shuffled_spearman_result[1]

            shuffled_spearman_pvalues.loc[patient,
                                          j] = shuffled_spearman_pvalue

    # NOTE: reload precomputed Spearman P-values from a specific earlier
    # run, overwriting the values computed in the loop above
    sp_dir = Path('data/prop_edge_lbs_overlap_20180606-105746')
    with pd.HDFStore(sp_dir / 'spearman_pvalues.hdf5') as store:
        spearman_pvalues = store['spearman_pvalues']
        shuffled_spearman_pvalues = store['shuffled_spearman_pvalues']

    with pd.HDFStore(data_path / 'spearman_pvalues.hdf5') as store:
        store['spearman_pvalues'] = spearman_pvalues
        store['shuffled_spearman_pvalues'] = shuffled_spearman_pvalues

    nl10_spearman_pvalues_all = -np.log10(spearman_pvalues)
    nl10_spearman_pvalues = nl10_spearman_pvalues_all.loc[
        ~(nl10_spearman_pvalues_all.isnull())
        & ~(np.isinf(nl10_spearman_pvalues_all))]

    with new_plot():
        nl10_spearman_pvalues.plot.hist(bins=50)
        plt.title('Spearman $P$-value histogram')
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): LBS edges vs. prop. edge score'
        )

        figure_path = output_path / 'spearman_pvalue_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    shuffled_spearman_pvalues_flat = shuffled_spearman_pvalues.unstack()
    nl10_shuffled_spearman_pvalues_flat_all = -np.log10(
        shuffled_spearman_pvalues_flat)
    nl10_shuffled_spearman_pvalues_flat = nl10_shuffled_spearman_pvalues_flat_all.loc[
        ~(nl10_shuffled_spearman_pvalues_flat_all.isnull())
        & ~(np.isinf(nl10_shuffled_spearman_pvalues_flat_all))]

    with new_plot():
        nl10_shuffled_spearman_pvalues_flat.plot.hist(bins=50)
        plt.title('Spearman $P$-value histogram')
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): shuffled LBS edges vs. prop. edge score'
        )

        figure_path = output_path / 'shuffled_spearman_pvalue_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    spearman_ks = scipy.stats.ks_2samp(spearman_pvalues,
                                       shuffled_spearman_pvalues_flat)
    spearman_ks_pvalue_str = to_matplotlib_sci_notation(spearman_ks[1])

    with new_plot():
        nl10_spearman_pvalues.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Real Spearman $P$-values',
            density=True,
        )
        nl10_shuffled_spearman_pvalues_flat.plot.hist(
            bins=hist_bin_count,
            alpha=0.8,
            label='Shuffled Spearman $P$-values, across 100 permutations',
            density=True,
        )
        plt.xlabel(
            'Spearman $P$-values ($-\\log_{10}$): LBS edges vs. prop. edge score'
        )
        plt.legend()
        plt.figtext(
            0.89,
            0.7,
            f'Kolmogorov-Smirnov $P = {spearman_ks_pvalue_str}$',
            horizontalalignment='right',
        )

        figure_path = output_path / 'spearman_pvalues_both_hist.pdf'
        print('Saving Spearman P-value histogram to', figure_path)
        plt.savefig(figure_path, bbox_inches='tight')

    ## /Spearman correlation P-value analysis

    ## Overall ROC AUC
    print('Creating binary LBS edge matrix')
    lbs_edge_matrix = pd.DataFrame(0,
                                   index=edge_prop.index,
                                   columns=edge_prop.columns)
    for patient, edges in selected_edges_by_patient.items():
        lbs_edge_matrix.loc[patient, list(edges)] = 1

    lbs_matrix_path = data_path / 'lbs_edge_matrix.hdf5'
    print('Saving LBS edge matrix to', lbs_matrix_path)
    with pd.HDFStore(lbs_matrix_path) as store:
        store['lbs_edge_matrix'] = lbs_edge_matrix

    sorted_flattened_edge_scores = edge_prop.unstack().sort_values(
        ascending=False)
    flattened_lbs_edges = lbs_edge_matrix.unstack()
    ordered_flattened_lbs_edges = flattened_lbs_edges.loc[
        sorted_flattened_edge_scores.index]

    flattened_rd = RocData.calculate(ordered_flattened_lbs_edges,
                                     sorted_flattened_edge_scores)
    flattened_rd_path = data_path / 'flattened_rd.pickle'
    print('Saving flattened vector RocData to', flattened_rd_path)
    with open(flattened_rd_path, 'wb') as f:
        pickle.dump(flattened_rd, f)
    ## /Overall ROC AUC

    ## Survival analysis

    edge_prop_survival_dir = find_newest_data_path('edge_prop_survival')
    survival_data = pd.read_csv(edge_prop_survival_dir /
                                'univariate_surv_results.csv',
                                index_col=0)
    # Indexed by gene/edge, across all patients
    surv_edge_sel = [('_' in i) for i in survival_data.index]
    edge_survival_data = survival_data.loc[surv_edge_sel, :]

    lbs_mut_edge_matrix = pd.DataFrame(
        0.0,
        index=sorted(selected_edges_by_patient),
        columns=all_edges,
    )
    for patient, edges in selected_edges_by_patient.items():
        lbs_mut_edge_matrix.loc[patient, list(edges)] = 1

    # Binary vector: is this edge incident on a LBS mut in at least one patient?
    edges_with_lbs_muts = lbs_mut_edge_matrix.sum(axis=0).astype(bool)

    surv_pvalues_with_lbs = edge_survival_data.loc[edges_with_lbs_muts,
                                                   'pvalue']
    surv_pvalues_with_lbs.name = 'With LBS'
    surv_pvalues_without_lbs = edge_survival_data.loc[~edges_with_lbs_muts,
                                                      'pvalue']
    surv_pvalues_without_lbs.name = 'Without LBS'

    ks_res = scipy.stats.ks_2samp(surv_pvalues_with_lbs,
                                  surv_pvalues_without_lbs)

    with new_plot():
        plot_cdf(surv_pvalues_with_lbs)
        plot_cdf(surv_pvalues_without_lbs)

        plt.legend()
        plt.ylabel('CDF')
        plt.xlabel('Univariate Cox Regression $P$-value')

        figure_path = output_path / 'surv_pvalue_cdfs.pdf'
        plt.savefig(figure_path, bbox_inches='tight')

    with new_plot():
        plt.figure()

        surv_pvalues_with_lbs.plot.hist(bins=50, ax=plt.gca(), alpha=0.5)
        surv_pvalues_without_lbs.plot.hist(bins=50, ax=plt.gca(), alpha=0.5)

        plt.legend(loc='upper left')
        plt.xlabel('Univariate Cox Regression $P$-value')

        figure_path = output_path / 'surv_pvalue_hist.pdf'
        plt.savefig(figure_path, bbox_inches='tight')

    ## /Survival analysis

    ## Permuted survival analysis

    # R^2 values from the univariate survival fits (note: the 'r_square'
    # column, not P-values)
    rsquares = edge_survival_data.loc[:, 'r_square']

    # k values for the rank-k edge scoring: fixed fractions of the patient
    # count plus a log-spaced sweep from 10 to 1000
    ks_manual = (np.array([0.1, 0.2, 0.25, 0.3]) *
                 edge_prop.shape[0]).astype(int)
    ks_auto = np.logspace(1, 3, num=15).astype(int)
    ks = sorted(chain(ks_manual, ks_auto))

    edge_count = 1000

    template = dedent('''
    \\begin{{frame}}[plain]
     \\begin{{center}}
      \\includegraphics[width=0.7\\textwidth]{{survival_rsquare_hist_k_{k}}}
     \\end{{center}}
    \\end{{frame}}
    ''')

    with open(data_path / 'figure_include.tex', 'w') as f:
        for k in ks:
            print(template.format(k=k), file=f)

    for k in ks:
        print('Computing edge ranking results for k =', k)
        edge_ranking = get_rank_k_edge_values(edge_prop, k)
        sorted_edge_scores = edge_ranking.sort_values(ascending=False)
        top_edges = sorted_edge_scores.iloc[:edge_count]
        top_edge_rsquares = rsquares.loc[top_edges.index]
        bottom_edges = sorted_edge_scores.iloc[edge_count:]
        permutation_count = 1000
        permutation_pvalues = pd.Series(0.0, index=range(permutation_count))
        for i in range(permutation_count):
            edge_selection = np.random.choice(bottom_edges.index, size=100)
            selected_rsquares = rsquares.loc[edge_selection]
            comparison_result = scipy.stats.mannwhitneyu(
                top_edge_rsquares,
                selected_rsquares,
                alternative='greater',
            )
            permutation_pvalues.iloc[i] = comparison_result.pvalue

        nl10_permutation_pvalues = -np.log10(permutation_pvalues)

        with new_plot():
            plt.figure(figsize=(5, 5))
            nl10_permutation_pvalues.plot.hist(bins=50)
            title = (f'Survival $R^2$: top {edge_count} edges ($k = {k}$) vs. '
                     f'{permutation_count} random selections')
            plt.title(title)
            plt.xlabel('$- \\log_{10}$($P$-value) from Mann-Whitney $U$ test')

            nl10_0_05 = -np.log10(0.05)
            plt.axvline(x=nl10_0_05, color='#FF0000FF')

            nl10_0_001 = -np.log10(0.001)
            plt.axvline(x=nl10_0_001, color='#000000FF')

            figure_path = output_path / f'survival_rsquare_hist_k_{k}.pdf'
            print('Saving survival R^2 histogram to', figure_path)
            plt.savefig(figure_path, bbox_inches='tight')
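
# For reference, a sketch of normalized_discounted_cumulative_gain as used
# in main() above (an assumption: it returns the cumulative NDCG vector,
# whose last element is the NDCG of the full ranking; the project's
# implementation may differ):
import numpy as np

def ndcg_sketch(relevance: np.ndarray) -> np.ndarray:
    # DCG discounts positions 1..n by 1 / log2(position + 1)
    discounts = 1.0 / np.log2(np.arange(2, relevance.size + 2))
    dcg = np.cumsum(relevance * discounts)
    # Ideal DCG: the same relevances in the best possible order
    ideal = np.cumsum(np.sort(relevance)[::-1] * discounts)
    return dcg / ideal  # NaN where the ideal prefix has no relevant items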
Example #20
drug_target_data_all = pd.DataFrame(raw_data).T
# Select those with protein targets
drug_target_data = drug_target_data_all.loc[drug_target_data_all.gene_symbols.notnull(), :]

dtd_path = data_path / 'drug_targets.pickle'
print('Saving drug target data matrix to', dtd_path)
drug_target_data.to_pickle(dtd_path)

synonyms = defaultdict(list)
for row_name, synonym_csv in drug_target_data.synonyms.items():
    for synonym in synonym_csv.split(','):
        synonyms[synonym].append(row_name)

synonym_counts = pd.Series({k: len(v) for k, v in synonyms.items()})

with (find_newest_data_path('build_hippie_network') / 'network.pickle').open('rb') as f:
    network = pickle.load(f)
print('Loaded network')

nodes = sorted(network.nodes())
node_set = set(nodes)

w_prime = normalize(network)

all_targets = set(chain.from_iterable(drug_target_data.gene_symbols))

both = node_set & all_targets

print('Nodes in network:', len(node_set))
print('Genes targeted by at least one drug:', len(all_targets))
print('Overlap between network and targets:', len(both))
Example #21
def get_genes() -> Iterable[str]:
    hit_data_dir = find_newest_data_path('tf_mirna_hits_both')
    hits = pd.read_pickle(hit_data_dir /
                          'hits_max_annotated_in_transmir.pickle')
    tfs = set(tfn.split('::')[0] for tfn in hits.tf_name)
    return tfs
Example #22
    create_data_path,
    find_newest_data_path,
)
import pandas as pd
from scipy.stats import pearsonr

from utils import consolidate_data_frames, sorted_intersection

data_path = create_data_path('tcga_lincs_expr_features')

drugs = [
    'arimidex',
    'taxol',
]

tcga_expr_path = find_newest_data_path(
    'parse_cosmic_diffexpr') / 'brca_expr.pickle'
print('Reading expression data from', tcga_expr_path)
tcga_expr = pd.read_pickle(tcga_expr_path)

lincs_expr = pd.read_csv(
    find_newest_data_path('gct_drug_subset') / 'subset.csv',
    header=None,
    index_col=0,
)
lincs_expr.columns = drugs

lincs_genes = set(lincs_expr.index)
tcga_genes = set(tcga_expr.columns)

lincs_benchmark_gene_data = pd.read_excel(DATA_PATH /
                                          'Landmark_Genes_n978.xlsx')