Example no. 1
def main():
    usage = "usage: %prog <options> <| delimited results files>, <| delimited method names>"
    parser = OptionParser()
    parser.add_option("-o", "--out_file", help="Output file")
    (options, args) = parser.parse_args()

    # Parse the input
    metrics_f = args[0]
    metric_name = args[1]
    label_graph_f = args[2]
    out_f = options.out_file

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    print('Reading label graph from {}.'.format(label_graph_f))
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
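    # label_data['label_graph'] is an adjacency mapping from each ontology term ID
    # to the term IDs it points to, e.g. (hypothetical IDs):
    #     {"CL:0000548": ["CL:0000255", "CL:0000081"], ...}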
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Report any label names that lack a succinct display name
    print('\n'.join(
        set(label_to_name.values()) - set(LABEL_NAME_TO_SUCCINCT.keys())))

    # Topologically sort the labels and assign them numbers
    topo_sort_labels = topological_sort(label_graph)
    label_to_topo_index = {
        label: index
        for index, label in enumerate(topo_sort_labels)
    }

    # Create text legend for graph
    #legend = ''
    #for label, topo_index in label_to_topo_index.items():
    #    legend += '{} {}'.format(topo_index, og.id_to_term[label].name)
    #with open(join(out_dir, 'graph_node_labels.txt'), 'w') as f:
    #    f.write(legend)

    # Load the metrics
    metrics_df = pd.read_csv(metrics_f, sep='\t', index_col=0)

    # Create the output directory
    #_run_cmd("mkdir -p %s" % out_dir)

    label_to_metric = {
        label: metrics_df.loc[label][metric_name]
        for label in metrics_df.index if label in label_to_name
    }

    # Draw the chosen metric atop the collapsed ontology
    draw_collapsed_ontology(label_graph, label_to_name, label_to_metric,
                            metric_name, out_f)
Example no. 2
def ontology_subgraph_spanning_terms(span_terms, og):
    """
    Builds the ontology subgraph spanning a set of terms.
    """
    # Get the most general terms (with the inverted relations "inv_is_a" and
    # "inv_part_of", most_specific_terms returns the most general terms)
    most_general_terms = ontology_graph.most_specific_terms(
        span_terms, og, sup_relations=["inv_is_a", "inv_part_of"])
    q = deque(most_general_terms)
    subgraph_source_to_targets = defaultdict(lambda: set())
    relations = ["inv_is_a", "inv_part_of"]
    #visited_ids = set(most_general_terms)
    while len(q) > 0:
        source_t_id = q.popleft()
        for rel in relations:
            if rel in og.id_to_term[source_t_id].relationships:
                for target_t_id in og.id_to_term[source_t_id].relationships[
                        rel]:
                    target_descendants = set(
                        og.recursive_relationship(target_t_id, relations))
                    # Keep this edge only if some descendant of the target
                    # appears among the spanning terms
                    if len(target_descendants.intersection(span_terms)) > 0:
                        subgraph_source_to_targets[source_t_id].add(
                            target_t_id)
                        q.append(target_t_id)
        #visited_ids.add(source_t_id)
    return DirectedAcyclicGraph(subgraph_source_to_targets)
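
# A minimal usage sketch for the function above (hypothetical term IDs; assumes
# the same `og` object, returned by the_ontology.the_ontology(), that the other
# scripts here use):
#
#   og = the_ontology.the_ontology()
#   span_terms = {'CL:0000084', 'CL:0000236', 'CL:0000542'}
#   sub_dag = ontology_subgraph_spanning_terms(span_terms, og)
#   print(sub_dag.get_all_nodes())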
Example no. 3
def _retrieve_label_graph():
    labels_f = pr.resource_filename(
        resource_package, join("resources", "training_set", "labels.json"))
    with open(labels_f, 'r') as f:
        labels_data = json.load(f)
        source_to_targets = labels_data['label_graph']
        exp_to_labels = labels_data['labels']
    label_graph = DirectedAcyclicGraph(source_to_targets)
    return label_graph
def main():
    usage = "usage: %prog <options> <| delimited results files>, <| delimited method names>"
    parser = OptionParser()
    parser.add_option("-o", "--out_file", help="File in which to write output")
    (options, args) = parser.parse_args()

    out_f = options.out_file

    pr_curve_f = args[0]
    label_graph_f = args[1]
    labeling_f = args[2]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load PR-curves
    with open(pr_curve_f, 'r') as f:
        label_to_pr_curve = json.load(f)

    # Compute labels on which we will compute metrics
    include_labels = set(label_to_pr_curve.keys()) - BLACKLIST_TERMS

    # Precision recall curves overlaid on label-graph
    draw_collapsed_ontology_w_pr_curves(exp_to_labels, label_graph,
                                        label_to_name, label_to_pr_curve,
                                        out_f)
def main():
    usage = "usage: %prog <options> <| delimited results files>, <| delimited method names>"
    parser = OptionParser()
    parser.add_option(
        "-o",
        "--out_dir",
        help=
        "Directory in which to write the output. If it doesn't exist, create the directory."
    )
    parser.add_option("-c",
                      "--conservative_mode",
                      action="store_true",
                      help="Compute conservative metrics")
    (options, args) = parser.parse_args()

    conservative_mode = options.conservative_mode
    out_dir = options.out_dir
    method_name = args[0]
    results_f = args[1]
    label_graph_f = args[2]
    labeling_f = args[3]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load the results
    results_df = pd.read_csv(results_f, sep='\t', index_col=0)

    # Create the output directory
    _run_cmd("mkdir -p %s" % out_dir)

    # Compute labels on which we will compute metrics
    include_labels = set(results_df.columns) - BLACKLIST_TERMS

    # Create the assignment matrix where rows are samples, columns
    # are labels, and element (i,j) = True if sample i is annotated
    # with label j
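    # For example (hypothetical samples and labels):
    #
    #                 CL:0000084  CL:0000236
    #   sample_1            True       False
    #   sample_2            True        True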
    assignment_df = cm._compute_assignment_matrix(results_df, exp_to_labels)
    assignment_df = assignment_df.loc[results_df.index][results_df.columns]

    precisions, recalls, threshs = cm.compute_joint_metrics(
        results_df,
        assignment_df,
        include_labels,
        label_graph=label_graph,
        label_to_name=label_to_name,
        og=og,
        conservative=conservative_mode)

    with open(join(out_dir, 'joint_pr_curve.json'), 'w') as f:
        json.dump(
            {
                'precisions': precisions,
                'recalls': recalls,
                'thresholds': threshs
            },
            f,
            indent=4)
def main():
    usage = "usage: %prog <options> <| delimited results files>, <| delimited method names>"
    parser = OptionParser()
    parser.add_option(
        "-o",
        "--out_dir",
        help=
        "Directory in which to write the output. If it doesn't exist, create the directory."
    )
    parser.add_option(
        "-f",
        "--config_file",
        action="store_true",
        help="Load plotting config from file rather than command line arguments"
    )
    parser.add_option("-c",
                      "--conservative_mode",
                      action="store_true",
                      help="Compute conservative metrics")
    (options, args) = parser.parse_args()

    conservative_mode = options.conservative_mode
    out_dir = options.out_dir

    # Parse the input
    if options.config_file:
        config_f = args[0]
        with open(config_f, 'r') as f:
            config = json.load(f)
            label_graph_f = config['label_graph_file']
            labeling_f = config['labeling_file']
            results_fs = config['results_files']
            method_names = config['method_names']
    else:
        method_names = args[0].split(',')
        results_fs = args[1].split(',')
        label_graph_f = args[2]
        labeling_f = args[3]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load the results
    all_results = []
    for results_f in results_fs:
        all_results.append(pd.read_csv(results_f, sep='\t', index_col=0))
    assert _comparable_results(all_results)

    # Create the output directory
    _run_cmd("mkdir -p %s" % out_dir)

    # Compute labels on which we will compute metrics
    include_labels = set(all_results[0].columns) - BLACKLIST_TERMS

    # Create the assignment matrix where rows are samples, columns
    # are labels, and element (i,j) = True if sample i is annotated
    # with label j
    assignment_df = cm._compute_assignment_matrix(all_results[0],
                                                  exp_to_labels)

    metrics_dfs = []
    label_to_pr_curves = []
    for results_df in all_results:
        results_df = results_df.loc[assignment_df.index][assignment_df.columns]
        metrics_df = cm.compute_label_centric_metrics_binary(
            results_df, assignment_df, include_labels)
        metrics_dfs.append(metrics_df)

    # F1-score barplots overlaid on label-graph
    #draw_collapsed_ontology_w_figures(
    #    exp_to_labels,
    #    label_graph,
    #    label_to_name,
    #    label_to_pr_curves,
    #    [
    #        {
    #            label: metric_df.loc[label]['Avg. Precision']
    #            for label in metric_df.index
    #        }
    #        for metric_df in metrics_dfs
    #    ],
    #    method_names,
    #    out_dir
    #)

    # F1-score box-plots
    gf.draw_boxplot(method_names, metrics_dfs, 'F1-Score',
                    join(out_dir, "f1_scores_boxplot"))
Example no. 7
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_dir", help="Output directory")
    (options, args) = parser.parse_args()

    binary_results_f = args[0]
    results_f = args[1]
    label_graph_f = args[2]
    decision_boundary_f = args[3]
    precision_thresh = float(args[4])
    out_dir = options.out_dir

    binary_results_df = pd.read_csv(binary_results_f, sep='\t', index_col=0)
    results_df = pd.read_csv(results_f, sep='\t', index_col=0)
    decision_df = pd.read_csv(decision_boundary_f, sep='\t', index_col=0)

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the label graph
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    label_to_f1 = {
        label: decision_df.loc[label]['F1-score']
        for label in decision_df.index
    }
    label_to_prec = {
        label: decision_df.loc[label]['precision']
        for label in decision_df.index
    }
    label_to_thresh = {
        label: decision_df.loc[label]['empirical_threshold']
        for label in decision_df.index
    }

    # Map each label to its ancestors
    label_to_ancestors = {
        label: label_graph.ancestor_nodes(label)
        for label in label_graph.get_all_nodes()
    }

    # Filter out labels whose empirical precision falls below the threshold
    hard_labels = set([
        label for label, prec in label_to_prec.items()
        if prec < precision_thresh
    ])

    # Map each experiment to its predicted terms
    print('Mapping each sample to its predicted labels...')
    consider_labels = set(binary_results_df.columns) - hard_labels
    exp_to_pred_labels = {
        exp: [
            label for label in consider_labels
            if binary_results_df.loc[exp][label] == 1
        ]
        for exp in binary_results_df.index
    }

    print('Computing the most-specific predicted labels...')
    exp_to_ms_pred_labels = {
        exp:
        label_graph.most_specific_nodes(set(pred_labels) - QUALIFIER_TERMS)
        for exp, pred_labels in exp_to_pred_labels.items()
    }

    # For each sample, select the most-specific predicted label with the
    # highest prediction score
    exp_to_select_pred_label = {
        exp:
        max([(label, results_df.loc[exp][label]) for label in ms_pred_labels],
            key=lambda x: x[1])[0]
        for exp, ms_pred_labels in exp_to_ms_pred_labels.items()
        if len(ms_pred_labels) > 0
    }
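    # (Hypothetical illustration: if a sample's most-specific predicted labels
    # have scores {'CL:0000084': 0.91, 'CL:0000236': 0.40}, then 'CL:0000084'
    # is selected.)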

    exp_to_update_pred = {}
    for exp, select_label in exp_to_select_pred_label.items():
        print('{}: {}'.format(exp, og.id_to_term[select_label].name))
        all_labels = label_to_ancestors[select_label]
        exp_to_update_pred[exp] = all_labels

    # Add qualifier cell types
    for exp in exp_to_update_pred:
        for qual_label in QUALIFIER_TERMS:
            if qual_label in exp_to_pred_labels[exp]:
                all_labels = label_to_ancestors[qual_label]
                exp_to_update_pred[exp].update(all_labels)

    # Create dataframe with filtered results
    da = []
    for exp in binary_results_df.index:
        row = []
        # Samples with no most-specific prediction fall back to an all-zero row
        pred_labels = exp_to_update_pred.get(exp, set())
        for label in binary_results_df.columns:
            if label in pred_labels:
                row.append(1)
            else:
                row.append(0)
        da.append(row)

    df = pd.DataFrame(data=da,
                      columns=binary_results_df.columns,
                      index=binary_results_df.index)
    df.to_csv(join(
        out_dir, 'filtered_binary_classification_results.prec_{}.tsv'.format(
            str(precision_thresh))),
              sep='\t')
def main():
    usage = "usage: %prog <options> <| delimited results files>, <| delimited method names>"
    parser = OptionParser()
    parser.add_option(
        "-o",
        "--out_dir",
        help=
        "Directory in which to write the output. If it doesn't exist, create the directory."
    )
    parser.add_option(
        "-f",
        "--config_file",
        action="store_true",
        help="Load plotting config from file rather than command line arguments"
    )
    parser.add_option("-c",
                      "--conservative_mode",
                      action="store_true",
                      help="Compute conservative metrics")
    (options, args) = parser.parse_args()

    conservative_mode = options.conservative_mode
    out_dir = options.out_dir

    # Parse the input
    if options.config_file:
        config_f = args[0]
        with open(config_f, 'r') as f:
            config = json.load(f)
            label_graph_f = config['label_graph_file']
            labeling_f = config['labeling_file']
            results_fs = config['results_files']
            method_names = config['method_names']
    else:
        method_names = args[0].split(',')
        results_fs = args[1].split(',')
        label_graph_f = args[2]
        labeling_f = args[3]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load the results
    all_results = []
    for results_f in results_fs:
        all_results.append(pd.read_csv(results_f, sep='\t', index_col=0))
    assert _comparable_results(all_results)

    # Create the output directory
    _run_cmd("mkdir -p %s" % out_dir)

    # Compute labels on which we will compute metrics
    include_labels = set(all_results[0].columns) - BLACKLIST_TERMS

    # Create the assignment matrix where rows are samples, columns
    # are labels, and element (i,j) = True if sample i is annotated
    # with label j
    assignment_df = cm._compute_assignment_matrix(all_results[0],
                                                  exp_to_labels)

    metrics_dfs = []
    label_to_pr_curves = []
    for results_df in all_results:
        results_df = results_df.loc[assignment_df.index][assignment_df.columns]
        if conservative_mode:
            metrics_df, label_to_pr_curve = cm.compute_label_centric_metrics(
                results_df,
                assignment_df,
                include_labels,
                label_graph=label_graph,
                label_to_name=label_to_name,
                og=og,
                conservative=True)
        else:
            metrics_df, label_to_pr_curve = cm.compute_label_centric_metrics(
                results_df, assignment_df, include_labels, conservative=False)
        metrics_dfs.append(metrics_df)
        label_to_pr_curves.append(label_to_pr_curve)

    # Write precision-recall curves to file
    with open(join(out_dir, 'pr_curves.json'), 'w') as f:
        json.dump(
            {
                method_name: {
                    label: {
                        'precisions': pr[0],
                        'recalls': pr[1],
                        'thresholds': pr[2]
                    }
                    for label, pr in label_to_pr_curve.items()
                }
                for method_name, label_to_pr_curve in zip(
                    method_names, label_to_pr_curves)
            },
            f,
            indent=4)

    # Precision recall curves overlaid on label-graph
    draw_collapsed_ontology_w_pr_curves(
        exp_to_labels, label_graph, label_to_name, label_to_pr_curves, [{
            label: metric_df.loc[label]['Avg. Precision']
            for label in metric_df.index
        } for metric_df in metrics_dfs], method_names, out_dir)

    # Average precision box-plots
    draw_boxplot(method_names, metrics_dfs, 'Avg. Precision',
                 join(out_dir, "avg_prec_boxplot"))

    # Achievable recall at 0.9 precision box-plots
    draw_boxplot(method_names, metrics_dfs,
                 'Achievable Recall at 0.9 Precision',
                 join(out_dir, "achievable_recall_boxplot"))

    # Average precision comparison heatmap
    draw_comparison_heatmap(
        method_names, {
            method: metrics_df
            for method, metrics_df in zip(method_names, metrics_dfs)
        }, 'Avg. Precision', metrics_dfs[0].index,
        join(out_dir, 'win_diff_avg_prec_heatmap'))

    # Achievable recall at 0.9 precision comparison heatmap
    draw_comparison_heatmap(
        method_names, {
            method: metrics_df
            for method, metrics_df in zip(method_names, metrics_dfs)
        }, 'Achievable Recall at 0.9 Precision', metrics_dfs[0].index,
        join(out_dir, 'win_diff_achievable_recall_heatmap'))
def main():
    usage = "usage: %prog <options> <| delimited results files>, <| delimited method names>"
    parser = OptionParser()
    parser.add_option(
        "-o", 
        "--out_dir", 
        help="Directory in which to write the output. If it doesn't exist, create the directory."
    )
    parser.add_option(
        "-f",
        "--config_file",
        help="JSON file with all inputs required to run this analysis"
    )
    parser.add_option(
        "-t",
        "--thresholds",
        help="Either a JSON file mapping each label to a decision threshold or number denoting the threshold to use for all cell types" 
    )
    parser.add_option(
        "-v",
        "--threshold_val",
        help="A number denoting the threshold to use for all cell types"
    )
    parser.add_option(
        "-c", 
        "--conservative_mode", 
        action="store_true", 
        help="Compute conservative metrics"
    )
    (options, args) = parser.parse_args()

    conservative_mode = options.conservative_mode
    if options.threshold_val:
        label_to_thresh = defaultdict(lambda: float(options.threshold_val))
    elif options.thresholds:
        label_to_thresh_df = pd.read_csv(options.thresholds, sep='\t', index_col=0)
        label_to_thresh = {
            label: label_to_thresh_df.loc[label]['threshold']
            for label in label_to_thresh_df.index
        }
    out_dir = options.out_dir
    

    # Parse the input
    if options.config_file:
        config_f = args[0]
        with open(config_f, 'r') as f:
            config = json.load(f)
            label_graph_f = config['label_graph_file']
            labeling_f = config['labeling_file']
            results_fs = config['results_files']
            method_name = config['method_name'] 
    else:
        method_name = args[0]
        results_f = args[1]
        label_graph_f = args[2]     
        labeling_f = args[3]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load the results
    bin_results_df = pd.read_csv(results_f, sep='\t', index_col=0)

    # Create the output directory
    _run_cmd("mkdir -p %s" % out_dir)

    # Compute labels on which we will compute metrics
    include_labels = set(bin_results_df.columns) - BLACKLIST_TERMS

    # Create the assignment matrix where rows are samples, columns
    # are labels, and element (i,j) = True if sample i is annotated
    # with label j
    assignment_df = cm._compute_assignment_matrix(
        bin_results_df,
        exp_to_labels
    )
    #bin_results_da = {}
    #for label in results_df.columns:
    #    if options.thresholds and label not in label_to_thresh:
    #        continue
    #    confs = results_df[label] 
    #    bins = [
    #        (x > label_to_thresh[label])
    #        for x in confs
    #    ]
    #    bin_results_da[label] = bins
    #bin_results_df = pd.DataFrame(
    #    data=bin_results_da,
    #    index=results_df.index
    #)
    assignment_df = assignment_df.loc[bin_results_df.index][bin_results_df.columns]

    metrics_df = cm.compute_label_centric_metrics_binary(
        bin_results_df,
        assignment_df,
        include_labels,
        label_graph=label_graph,
        label_to_name=label_to_name,
        og=og,
        conservative=conservative_mode
    )

    metrics_df.to_csv(join(out_dir, 'binary_cell_type_metrics.tsv'), sep='\t')

    label_to_f1 = {
        label: metrics_df.loc[label]['F1-Score']
        for label in metrics_df.index
    }
    print(label_to_f1)

    # F1-score drawn atop ontology
    draw_collapsed_ontology(
        label_graph,
        label_to_name,
        label_to_f1,
        'F1-Score',
        out_dir
    )
Example no. 10
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_file", help="Output file")
    (options, args) = parser.parse_args()

    single_cell_exp_list_f = args[0]
    bulk_exp_list_f = args[1]
    single_cell_label_graph_f = args[2]
    bulk_label_graph_f = args[3]

    out_f = options.out_file
    og = the_ontology()

    with open(single_cell_exp_list_f, 'r') as f:
        sc_experiments_data = json.load(f)
    with open(bulk_exp_list_f, 'r') as f:
        bulk_experiments_data = json.load(f)

    single_cell_exps = set(sc_experiments_data['experiments'])
    single_cell_exp_set_name = sc_experiments_data['list_name']
    bulk_exps = set(bulk_experiments_data['experiments'])
    bulk_exp_set_name = bulk_experiments_data['list_name']
    assert single_cell_exp_set_name == "untampered_single_cell_primary_cells_with_data"
    assert bulk_exp_set_name == "untampered_bulk_primary_cells_with_data"

    with open(single_cell_label_graph_f, 'r') as f:
        labels_data = json.load(f)
        sc_label_graph = labels_data['label_graph']
        sc_exp_to_labels = labels_data['labels']
    sc_exp_to_labels = {
        k: set(v) - set(BLACKLIST)
        for k, v in sc_exp_to_labels.items()
    }

    with open(bulk_label_graph_f, 'r') as f:
        labels_data = json.load(f)
        bulk_label_graph = labels_data['label_graph']
        bulk_exp_to_labels = labels_data['labels']
    bulk_exp_to_labels = {
        k: set(v) - set(BLACKLIST)
        for k, v in bulk_exp_to_labels.items()
    }

    # The idea here is that we only keep single-cell samples whose
    # full set of most-specific labels is a subset of at least one
    # bulk sample's label set. Here we collect all of the unique
    # bulk label-sets.
    #
    # For example, if a sample is labeled {embryonic cell, neural cell}
    # but the bulk data only contains samples labelled {embryonic cell}
    # and {neural cell}, we discard that cell.
    bulk_label_sets = set()
    for labels in bulk_exp_to_labels.values():
        bulk_label_sets.add(frozenset(labels))

    label_sets_not_in_bulk = set()
    removed_exps = set()
    include_exps = set()
    g = DirectedAcyclicGraph(sc_label_graph)
    for exp, labels in sc_exp_to_labels.items():
        ms_labels = set(g.most_specific_nodes(labels))
        ms_labels -= set(IGNORE)

        # Go through the bulk label-sets and check if the current
        # sample's set of most-specific labels is a subset of any
        # of them. If so, keep it. If not, we discard it.
        found = False
        for label_set in bulk_label_sets:
            if set(ms_labels) <= label_set:
                include_exps.add(exp)
                found = True
                break
        if not found:
            label_sets_not_in_bulk.add(frozenset(ms_labels))
            removed_exps.add(exp)

    print("{} single-cell experiments were removed".format(len(removed_exps)))
    print("Labels that were removed:")
    print(
        json.dumps([[og.id_to_term[x].name for x in label_set]
                    for label_set in label_sets_not_in_bulk],
                   indent=True))

    with open(out_f, 'w') as f:
        f.write(
            json.dumps(
                {
                    "list_name":
                    "untampered_single_cell_primary_cells_with_data_cell_types_in_bulk",
                    "description":
                    "These are all experiments that are in the experiment list '%s' and also share the same set of most-specific labels with at least one experiment in %s"
                    % (single_cell_exp_set_name, bulk_exp_set_name),
                    "experiments":
                    list(include_exps)
                },
                indent=4))
Example no. 11
def load_sparse_dataset():
    labels_f = join(data_dir, 'labels.json')
    studys_f = join(data_dir, 'experiment_to_study.json')
    tags_f = join(data_dir, 'experiment_to_tags.json')
    expr_matrix_f = join(data_dir, '{}.h5'.format(features))

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load labels and labels-graph
    with open(labels_f, 'r') as f:
        labels_data = json.load(f)
        source_to_targets = labels_data['label_graph']
        exp_to_labels = labels_data['labels']
    label_graph = DirectedAcyclicGraph(source_to_targets)

    # Map each ontology term label to its human-readable
    # name
    label_to_name = {
        label: og.id_to_term[label].name
        for label in source_to_targets.keys()
    }

    # Map each experiment to its most-specific labels
    exp_to_ms_labels = {
        exp: label_graph.most_specific_nodes(labels)
        for exp, labels in exp_to_labels.items()
    }

    # Load study metadata
    with open(studys_f, 'r') as f:
        exp_to_study = json.load(f)
    study_to_exps = defaultdict(lambda: set())
    for exp, study in exp_to_study.items():
        study_to_exps[study].add(exp)
    study_to_exps = dict(study_to_exps)

    # Load technical tags
    with open(tags_f, 'r') as f:
        exp_to_tags = json.load(f)

    # Load the data matrix
    print('Loading expression data from {}...'.format(expr_matrix_f))
    with h5py.File(expr_matrix_f, 'r') as f:
        the_exps = [
            str(x)
            for x in f['experiment'][:]
        ]
        gene_ids = [
            str(x)
            for x in f['gene_id'][:]
        ]
        data_matrix = f['expression'][:]
    print('Loaded matrix of shape {}.'.format(data_matrix.shape))
    print('done.')

    # Map each experiment to its index
    exp_to_index = {
        exp: ind
        for ind, exp in enumerate(the_exps)
    }
    return (
        og,
        label_graph,
        label_to_name,
        the_exps,
        exp_to_index,
        exp_to_labels,
        exp_to_tags,
        exp_to_study,
        study_to_exps,
        exp_to_ms_labels,
        data_matrix,
        gene_ids
    )
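
# A minimal usage sketch for the loader above (assumes `data_dir` and `features`
# are defined at module level, as the file paths above imply):
#
#   (og, label_graph, label_to_name, the_exps, exp_to_index, exp_to_labels,
#    exp_to_tags, exp_to_study, study_to_exps, exp_to_ms_labels, data_matrix,
#    gene_ids) = load_sparse_dataset()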
Example no. 12
def main():
    usage = "usage: %prog <options> <environment dir> <experiment list name> <cross-validation config name>"
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o",
                      "--out_dir",
                      help="Directory in which to write the output")
    (options, args) = parser.parse_args()

    results_f = args[0]
    label_graph_f = args[1]
    prefix = args[2]
    out_dir = options.out_dir

    # Load the results
    confidence_df = pd.read_csv(results_f, sep='\t', index_col=0)

    # Map each label to its name
    og = the_ontology.the_ontology()
    label_to_name = {
        label: og.id_to_term[label].name
        for label in confidence_df.columns
    }

    _run_cmd("mkdir -p %s" % out_dir)

    # Load the label-graph
    with open(label_graph_f, 'r') as f:
        labels_data = json.load(f)
    label_graph = DirectedAcyclicGraph(labels_data['label_graph'])

    # Compute the set of labels over which we will compute metrics.
    # This is simply the set of labels for which we have predictions
    # for every sample, minus the blacklisted terms.
    include_labels = set(confidence_df.columns) - BLACKLIST_TERMS

    total_n_incons = 0
    total_n_very_incons = 0
    total_edges = 0
    incons_to_count = defaultdict(lambda: 0)
    very_incons_to_count = defaultdict(lambda: 0)
    exp_child_parent_incons = []
    for exp in confidence_df.index:
        exp_n_incons = 0
        for parent_label in confidence_df.columns:
            parent_conf = confidence_df.loc[exp][parent_label]
            if parent_label in BLACKLIST_TERMS:
                continue
            for child_label in label_graph.source_to_targets[parent_label]:
                if child_label in BLACKLIST_TERMS:
                    continue
                if child_label not in confidence_df.columns:
                    continue
                child_conf = confidence_df.loc[exp][child_label]
                # Don't consider parent-child edges where the prediction scores
                # for both nodes are less than 0.01
                if child_conf < 0.01 and parent_conf < 0.01:
                    continue
                # We count the edge as inconsistent if the child's score is
                # greater than its parent's AND the difference is non-negligible
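                # (Hypothetical illustration: child_conf = 0.30 and parent_conf = 0.05
                # would count as inconsistent, and also as "very" inconsistent if the
                # 0.25 gap exceeds VERY_INCONS_THRESH.)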
                if abs(child_conf -
                       parent_conf) > EPSILON and child_conf > parent_conf:
                    exp_child_parent_incons.append(
                        (exp, child_label, parent_label,
                         (child_conf - parent_conf)))
                    incons_to_count[(parent_label, child_label)] += 1
                    total_n_incons += 1
                    exp_n_incons += 1
                    if child_conf - parent_conf > VERY_INCONS_THRESH:
                        total_n_very_incons += 1
                        very_incons_to_count[(parent_label, child_label)] += 1
                total_edges += 1
    total_fraction_inconsistent = total_n_incons / float(total_edges)
    total_fraction_very_inconsistent = total_n_very_incons / float(total_edges)

    print("Inconsistent edges:")
    for incons, count in sorted([(k, v) for k, v in incons_to_count.items()],
                                key=lambda x: x[1]):
        parent = incons[0]
        child = incons[1]
        print("%s -> %s : %d" %
              (label_to_name[parent], label_to_name[child], count))
    print("Very inconsistent edges:")
    for incons, count in sorted([(k, v)
                                 for k, v in very_incons_to_count.items()],
                                key=lambda x: x[1]):
        parent = incons[0]
        child = incons[1]
        print("%s -> %s : %d" %
              (label_to_name[parent], label_to_name[child], count))

    summary_df = pd.DataFrame(
        data=[[total_n_incons, total_edges, total_fraction_inconsistent],
              [
                  total_n_very_incons, total_edges,
                  total_fraction_very_inconsistent
              ],
              [
                  total_n_very_incons,
                  len(confidence_df.index),
                  (float(total_n_very_incons) / len(confidence_df.index))
              ]],
        columns=["No. enconsistent", "Total edges", "Fraction of total edges"],
        index=[
            "Total edges inconsistent",
            "Total edges inconsistent >%f" % VERY_INCONS_THRESH,
            "Avg. very inconsistent per sample"
        ])
    summary_df.to_csv(join(out_dir,
                           '{}.inconsistent_edges_stats.tsv'.format(prefix)),
                      sep='\t')

    exp_child_parent_incons = sorted(exp_child_parent_incons,
                                     key=lambda x: x[3])
    inconss = []
    n_less_eq = []
    for i, incons_tuple in enumerate(exp_child_parent_incons):
        incons = incons_tuple[3]
        inconss.append(incons)
        n_less_eq.append(float(i) / len(exp_child_parent_incons))

    fig, axarr = plt.subplots(1, 1, figsize=(3.0, 3.0), squeeze=False)
    axarr[0][0].plot(inconss, n_less_eq, color=vl.NICE_COLORS[1], lw=4)
    axarr[0][0].set_xlabel('Child prob. - Parent prob.')
    axarr[0][0].set_ylabel('Cumulative probability')
    axarr[0][0].set_xlim((0.0, 1.0))
    axarr[0][0].set_ylim((0.0, 1.0))
    out_f = join(out_dir, "{}.CDF_inconsistences".format(prefix))
    fig.savefig("%s.eps" % out_f,
                format='eps',
                bbox_inches='tight',
                dpi=100,
                transparent=True)
    fig.savefig("%s.pdf" % out_f,
                format='pdf',
                bbox_inches='tight',
                dpi=100,
                transparent=True)