Example #1
def map_samples_to_terms(restrict_to_samples, metasra_f):
    og = the_ontology.the_ontology()
    query_metasra_mapped_terms_sql = "SELECT sample_accession, \
        term_id FROM mapped_ontology_terms;"

    print("Querying database for sample to terms mappings...")
    sample_to_mapped_terms = defaultdict(lambda: set())
    with sqlite3.connect(metasra_f) as metasra_conn:
        metasra_c = metasra_conn.cursor()
        results = metasra_c.execute(query_metasra_mapped_terms_sql)
        for r in results:
            sample = r[0]
            term_id = r[1]
            if sample in restrict_to_samples:
                sample_to_mapped_terms[sample].add(term_id)

    # Restrict to most specific term
    mod_sample_to_mapped_terms = {}
    for sample, terms in sample_to_mapped_terms.items():
        ms_mapped_terms = ontology_graph.most_specific_terms(
            terms, og, sup_relations=["is_a", "part_of"])
        mod_sample_to_mapped_terms[sample] = set(ms_mapped_terms)
    sample_to_mapped_terms = mod_sample_to_mapped_terms
    print("done.")
    return sample_to_mapped_terms
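A minimal usage sketch for Example #1 (the database path and sample accessions below are hypothetical; restrict_to_samples simply limits which rows of mapped_ontology_terms are kept):

# Hypothetical inputs, for illustration only
restrict_to = {'SRS000001', 'SRS000002'}
sample_to_terms = map_samples_to_terms(restrict_to, 'metasra.sqlite')
for sample, terms in sorted(sample_to_terms.items()):
    print(sample, sorted(terms))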
Example #2
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_file", help="Output file")
    (options, args) = parser.parse_args()

    pr_curves_f = args[0]
    out_f = options.out_file

    og = the_ontology.the_ontology()

    with open(pr_curves_f, 'r') as f:
        method_to_label_to_pr_curves = json.load(f)

    assert len(method_to_label_to_pr_curves) == 1
    method = sorted(method_to_label_to_pr_curves.keys())[0]
    label_to_pr_curves = method_to_label_to_pr_curves[method]

    da = []
    for label, pr in label_to_pr_curves.items():
        precs = pr['precisions']
        recs = pr['recalls']
        threshs = pr['thresholds']
        f1s = map(_compute_f1, zip(precs, recs))
        max_f1_thresh = max(zip(f1s, threshs), key=lambda x: x[0])
        da.append((label, og.id_to_term[label].name, max_f1_thresh[1],
                   max_f1_thresh[0]))

    df = pd.DataFrame(data=da,
                      columns=['label', 'label_name', 'threshold', 'F1-score'])
    df.to_csv(out_f, sep='\t', index=False)
    print(df)
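Example #2 calls a helper _compute_f1 that is not shown here. A minimal sketch consistent with how it is used above (it receives a single (precision, recall) pair from zip):

def _compute_f1(prec_rec):
    # Harmonic mean of precision and recall; 0 when both are 0
    prec, rec = prec_rec
    if prec + rec == 0:
        return 0.0
    return 2 * prec * rec / (prec + rec)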
Example #3
def _label_experiments(
        experiment_accs,
        exp_to_info,
        which_terms='mapped_terms'
    ):
    og = the_og.the_ontology()
    exp_to_terms = defaultdict(lambda: set())
    for exp in experiment_accs:
        mapped_terms = set(
            exp_to_info[exp][which_terms]
        )
        # compute all cell-type terms
        all_terms = set()
        for term in mapped_terms:
            all_terms.update(
                og.recursive_relationship(
                    term,
                    recurs_relationships=['is_a', 'part_of']
                )
            )
        all_terms = [
            x
            for x in all_terms
            if x.split(':')[0] == 'CL'
        ]
        exp_to_terms[exp] = all_terms
    return exp_to_terms
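The exp_to_info argument in Example #3 is assumed to map each experiment accession to a record containing the keys read above; a hypothetical entry, for illustration only:

exp_to_info = {
    'SRX000001': {
        'mapped_terms': ['CL:0000084'],              # T cell
        'supplemental_mapped_terms': ['CL:0000084']  # used when which_terms='supplemental_mapped_terms'
    }
}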
Example #4
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o",
                      "--out_dir",
                      help="Directory in which to write output")
    (options, args) = parser.parse_args()

    result_f = args[0]
    out_dir = options.out_dir

    og = the_ontology.the_ontology()

    raw_df = pd.read_csv(result_f, sep='\t', index_col=0)
    raw_df = raw_df.drop([
        'first.labels', 'tuning.scores.first', 'tuning.scores.second',
        'labels', 'pruned.labels'
    ],
                         axis=1)

    # Get all terms represented in this output
    all_terms = set()
    for label in raw_df.columns:
        if label not in SINGLER_OUTPUT_TO_TERMS:
            print('Skipping column "{}"'.format(label))
            continue
        all_terms.update(SINGLER_OUTPUT_TO_TERMS[label])

    # Map each term to its ancestors
    term_to_ancestors = {
        term: og.recursive_superterms(term)
        for term in all_terms
    }
    for term, ancestors in term_to_ancestors.items():
        all_terms.update(ancestors)
    all_terms = sorted(all_terms)

    # Compute binary-classification matrix
    da = []
    for cell in raw_df.index:
        # Only consider columns that map to ontology terms; unmapped columns
        # were skipped above and would raise a KeyError below
        preds = [(pred, label)
                 for pred, label in zip(raw_df.loc[cell], raw_df.columns)
                 if label in SINGLER_OUTPUT_TO_TERMS]
        max_label = max(preds, key=lambda x: x[0])[1]
        pred_terms = SINGLER_OUTPUT_TO_TERMS[max_label]
        all_pred_terms = set()
        for term in pred_terms:
            all_pred_terms.update(term_to_ancestors[term])

        row = []
        for term in all_terms:
            if term in all_pred_terms:
                row.append(1)
            else:
                row.append(0)
        da.append(row)

    bin_df = pd.DataFrame(data=da, columns=all_terms, index=raw_df.index)
    bin_df.to_csv(join(out_dir, 'binary_classification_results.tsv'), sep='\t')
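Example #4 assumes a module-level SINGLER_OUTPUT_TO_TERMS dictionary mapping each SingleR output column to a list of Cell Ontology term IDs. A hypothetical sketch of such a mapping (illustrative labels and IDs only; the real dictionary is defined elsewhere):

SINGLER_OUTPUT_TO_TERMS = {
    'T_cells': ['CL:0000084'],    # T cell
    'B_cell': ['CL:0000236'],     # B cell
    'Monocyte': ['CL:0000576']    # monocyte
}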
Example #5
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o",
                      "--out_dir",
                      help="Directory in which to write output")
    (options, args) = parser.parse_args()

    result_f = args[0]
    out_dir = options.out_dir

    og = the_ontology.the_ontology()

    scmatch_output_to_all_terms = defaultdict(lambda: set())
    all_terms = set()
    for scmatch_out, terms in SCMATCH_OUTPUT_TO_TERMS.items():
        for term in terms:
            scmatch_output_to_all_terms[scmatch_out].update(
                og.recursive_superterms(term))
            all_terms.update(og.recursive_superterms(term))
    scmatch_output_to_all_terms = dict(scmatch_output_to_all_terms)
    all_terms = sorted(all_terms)

    results_df = pd.read_csv(result_f, index_col=0)
    print(results_df)
    conf_da = []
    bin_da = []
    nonmapped_samples = set()
    for cell in results_df.index:
        scmatch_out = results_df.loc[cell]['top sample'].split(',')[0]
        score = results_df.loc[cell]['top correlation score']
        try:
            terms = scmatch_output_to_all_terms[scmatch_out]
        except KeyError:
            nonmapped_samples.add(scmatch_out)
            terms = []
        term_scores = []
        term_assigns = []
        for term in all_terms:
            if term in terms:
                term_scores.append(score)
                term_assigns.append(1)
            else:
                term_scores.append(float('-inf'))
                term_assigns.append(0)
        conf_da.append(term_scores)
        bin_da.append(term_assigns)
    print('Could not map the following samples to ontology terms:')
    print('\n'.join(nonmapped_samples))
    conf_df = pd.DataFrame(data=conf_da,
                           columns=all_terms,
                           index=results_df.index)
    bin_df = pd.DataFrame(data=bin_da,
                          columns=all_terms,
                          index=results_df.index)
    conf_df.to_csv(join(out_dir, 'classification_results.tsv'), sep='\t')
    bin_df.to_csv(join(out_dir, 'binary_classification_results.tsv'), sep='\t')
Example #6
def main():
    usage = "usage: %prog <options> <| delimited results files>, <| delimited method names>"
    parser = OptionParser()
    parser.add_option("-o", "--out_file", help="Output file")
    (options, args) = parser.parse_args()

    # Parse the input
    metrics_f = args[0]
    metric_name = args[1]
    label_graph_f = args[2]
    out_f = options.out_file

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    print('Reading label graph from {}.'.format(label_graph_f))
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    print('\n'.join(
        set(label_to_name.values()) - set(LABEL_NAME_TO_SUCCINCT.keys())))

    # Topologically sort the labels and assign them numbers
    topo_sort_labels = topological_sort(label_graph)
    label_to_topo_index = {
        label: index
        for index, label in enumerate(topo_sort_labels)
    }

    # Create text legend for graph
    #legend = ''
    #for label, topo_index in label_to_topo_index.items():
    #    legend += '{} {}'.format(topo_index, og.id_to_term[label].name)
    #with open(join(out_dir, 'graph_node_labels.txt'), 'w') as f:
    #    f.write(legend)

    # Load the metrics
    metrics_df = pd.read_csv(metrics_f, sep='\t', index_col=0)

    # Create the output directory
    #_run_cmd("mkdir -p %s" % out_dir)

    label_to_metric = {
        label: metrics_df.loc[label][metric_name]
        for label in metrics_df.index if label in label_to_name
    }

    # F1-score drawn atop ontology
    draw_collapsed_ontology(label_graph, label_to_name, label_to_metric,
                            metric_name, out_f)
Example #7
def main():
    usage = ""
    parser = OptionParser()
    parser.add_option("-s",
                      "--use_supplemental",
                      action="store_true",
                      help="Use supplemental labels")
    parser.add_option("-o", "--out_file", help="Output file")
    (options, args) = parser.parse_args()

    annot_f = args[0]
    exp_set_f = args[1]
    out_f = options.out_file

    # Load metadata
    with open(annot_f, 'r') as f:
        exp_to_info = json.load(f)
    with open(exp_set_f, 'r') as f:
        the_exps = json.load(f)['experiments']

    # Label the experiments
    if options.use_supplemental:
        exp_to_terms = _label_experiments(
            the_exps, exp_to_info, which_terms='supplemental_mapped_terms')
    else:
        exp_to_terms = _label_experiments(the_exps, exp_to_info)

    # Generate the labelling-graph induced by this
    # dataset
    og = the_og.the_ontology()
    all_terms = set()
    for terms in exp_to_terms.values():
        all_terms.update(terms)
    label_graph = ontology_utils.ontology_subgraph_spanning_terms(
        all_terms, og)
    label_graph = graph.transitive_reduction_on_dag(label_graph)

    # Write output
    exp_set_name = basename(exp_set_f).split('.')[0]
    with open(out_f, 'w') as f:
        f.write(
            json.dumps(
                {
                    'labels_config': {
                        'experiment_set': exp_set_name
                    },
                    'label_graph': {
                        source: list(targets)
                        for source, targets in
                        label_graph.source_to_targets.items()
                    },
                    'labels': exp_to_terms
                },
                indent=4,
                separators=(',', ': ')))
Example #8
def main():
    usage = "" # TODO 
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_file", help="File to write labels data")
    (options, args) = parser.parse_args()

    dataset_f = args[0]
    out_f = options.out_file

    # Load the ontology
    og = the_ontology.the_ontology()
    
    # Load the cell_ids and 10x datasets from which they
    # originate
    with h5py.File(dataset_f, 'r') as f:
        cell_ids = [
            str(x)[2:-1]
            for x in f['experiment'][:]
        ] 
        datasets = [
            str(x)[2:-1]
            for x in f['dataset'][:]
        ]

    # Label each cell
    cell_id_to_labels = {}
    all_labels = set()
    for dataset, cell_id in zip(datasets, cell_ids):
        ms_label = DATA_SET_TO_TARGET_TERM[dataset]
        labels = sorted(og.recursive_superterms(ms_label))
        cell_id_to_labels[cell_id] = labels
        all_labels.update(labels)

    # Generate label-graph
    label_graph = ontology_utils.ontology_subgraph_spanning_terms(
        all_labels,
        og
    )
    label_graph = graph.transitive_reduction_on_dag(label_graph)

    # Write output
    with open(out_f, 'w') as f:
        f.write(json.dumps(
            {
                'labels_config': {},
                'label_graph': {
                    source: list(targets)
                    for source, targets in label_graph.source_to_targets.items()
                },
                'labels': cell_id_to_labels
            },
            indent=4,
            separators=(',', ': ')
        ))
Example #9
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_file", help="File to write labels data")
    (options, args) = parser.parse_args()

    raw_10x_f = args[0]
    out_f = options.out_file

    # Load the ontology
    og = the_ontology.the_ontology()

    # Use HDFStore as a context manager so the file is closed after reading
    with pd.HDFStore(raw_10x_f, mode='r') as h:
        df = h['DF_ALL']
    cells = df.index.get_level_values('Cell ID')
    cell_types = df.index.get_level_values('CELL_TYPE')

    print(set(cell_types))

    # Label each cell
    cell_id_to_labels = {}
    all_labels = set()
    for cell_id, cell_type in zip(cells, cell_types):
        ms_label = CELL_TYPE_TO_TERM[cell_type]
        labels = sorted(og.recursive_superterms(ms_label))
        cell_id_to_labels[cell_id] = labels
        all_labels.update(labels)

    # Generate label-graph
    label_graph = ontology_utils.ontology_subgraph_spanning_terms(
        all_labels, og)
    label_graph = graph.transitive_reduction_on_dag(label_graph)

    # Write output
    print("Writing output to {}...".format(out_f))
    with open(out_f, 'w') as f:
        f.write(
            json.dumps(
                {
                    'labels_config': {},
                    'label_graph': {
                        source: list(targets)
                        for source, targets in
                        label_graph.source_to_targets.items()
                    },
                    'labels': cell_id_to_labels
                },
                indent=4,
                separators=(',', ': ')))
Example #10
def main():
    usage = "usage: %prog <options> <| delimited results files>, <| delimited method names>"
    parser = OptionParser()
    parser.add_option("-o", "--out_file", help="File in which to write output")
    (options, args) = parser.parse_args()

    out_f = options.out_file

    pr_curve_f = args[0]
    label_graph_f = args[1]
    labeling_f = args[2]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load PR-curves
    with open(pr_curve_f, 'r') as f:
        label_to_pr_curve = json.load(f)

    # Compute labels on which we will compute metrics
    include_labels = set(label_to_pr_curve.keys()) - BLACKLIST_TERMS

    # Precision recall curves overlaid on label-graph
    draw_collapsed_ontology_w_pr_curves(exp_to_labels, label_graph,
                                        label_to_name, label_to_pr_curve,
                                        out_f)
Example #11
def main():
    usage = "usage: %prog <experiment metadata file> <untampered exp list file for experiments that have data> <train-test set partition file>" 
    parser = OptionParser(usage=usage)
    parser.add_option("-o", "--out_file", help="Test set output experiment list file")
    (options, args) = parser.parse_args()

    train_exps_list_f = args[0]
    test_exps_list_f = args[1]
    out_f = options.out_file

    og = the_ontology.the_ontology()

    with open(train_exps_list_f, 'r') as f:
        train_include_experiments_data = json.load(f)
    with open(test_exps_list_f, 'r') as f:
        test_include_experiments_data = json.load(f)

    include_experiments = set(train_include_experiments_data['experiments'])
    include_experiments.update(
        set(test_include_experiments_data['experiments'])
    )
    train_parent_exp_list_name = train_include_experiments_data['list_name']
    test_parent_exp_list_name = test_include_experiments_data['list_name']

    assert train_parent_exp_list_name == "training_set_experiments"
    assert test_parent_exp_list_name == "test_set_experiments"

    with open(out_f, 'w') as f:
        f.write(json.dumps(
            {
                "list_name": "untampered_bulk_primary_cells_with_data",
                "description": "These union of experiments in the experiment list '%s' and '%s'" % (
                    train_parent_exp_list_name,
                    test_parent_exp_list_name
                ),
                "experiments": list(include_experiments)
            },
            indent=4
        ))
Example #12
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_file", help="Output file")
    (options, args) = parser.parse_args()

    pr_curves_f = args[0]
    out_f = options.out_file

    og = the_ontology.the_ontology()

    with open(pr_curves_f, 'r') as f:
        label_to_pr_curves = json.load(f)

    da = []
    for label, pr in label_to_pr_curves.items():
        if label in REMOVE_TERMS:
            continue
        precs = pr[0]
        recs = pr[1]
        threshs = pr[2]
        f1s = map(_compute_f1, zip(precs, recs))
        max_f1_thresh = max(zip(f1s, precs, threshs), key=lambda x: x[0])
        thresh = min([max_f1_thresh[2], 0.5])
        #thresh = max_f1_thresh[2]

        #da.append((label, og.id_to_term[label].name, max_f1_thresh[1], max_f1_thresh[0]))
        da.append((label, og.id_to_term[label].name, thresh, max_f1_thresh[2],
                   max_f1_thresh[1], max_f1_thresh[0]))

    df = pd.DataFrame(data=da,
                      columns=[
                          'label', 'label_name', 'threshold',
                          'empirical_threshold', 'precision', 'F1-score'
                      ])
    df.to_csv(out_f, sep='\t', index=False)
    print(df)
Example #13
def main():
    usage = "usage: %prog <options> <environment dir> <experiment list name> <cross-validation config name>"
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o",
                      "--out_dir",
                      help="Directory in which to write the output")
    (options, args) = parser.parse_args()

    results_f = args[0]
    label_graph_f = args[1]
    prefix = args[2]
    out_dir = options.out_dir

    # Load the results
    confidence_df = pd.read_csv(results_f, sep='\t', index_col=0)

    # Map each label to its name
    og = the_ontology.the_ontology()
    label_to_name = {
        label: og.id_to_term[label].name
        for label in confidence_df.columns
    }

    _run_cmd("mkdir -p %s" % out_dir)

    # Load the label-graph
    with open(label_graph_f, 'r') as f:
        labels_data = json.load(f)
    label_graph = DirectedAcyclicGraph(labels_data['label_graph'])

    # Compute the labels for which we will compute metrics over.
    # This label set is simply the set of labels for which we have
    # predictions for every sample
    include_labels = set(confidence_df.columns) - BLACKLIST_TERMS

    total_n_incons = 0
    total_n_very_incons = 0
    total_edges = 0
    incons_to_count = defaultdict(lambda: 0)
    very_incons_to_count = defaultdict(lambda: 0)
    exp_child_parent_incons = []
    for exp in confidence_df.index:
        exp_n_incons = 0
        for parent_label in confidence_df.columns:
            parent_conf = confidence_df.loc[exp][parent_label]
            if parent_label in BLACKLIST_TERMS:
                continue
            for child_label in label_graph.source_to_targets[parent_label]:
                if child_label in BLACKLIST_TERMS:
                    continue
                if child_label not in confidence_df.columns:
                    continue
                child_conf = confidence_df.loc[exp][child_label]
                # Don't consider parent-child edges where the prediction
                # scores for both nodes are less than 0.01
                if child_conf < 0.01 and parent_conf < 0.01:
                    continue
                # We count the edge as inconsistent if the child's score is
                # greater than its parent's AND the difference is non-negligible
                if abs(child_conf -
                       parent_conf) > EPSILON and child_conf > parent_conf:
                    exp_child_parent_incons.append(
                        (exp, child_label, parent_label,
                         (child_conf - parent_conf)))
                    incons_to_count[(parent_label, child_label)] += 1
                    total_n_incons += 1
                    exp_n_incons += 1
                    if child_conf - parent_conf > VERY_INCONS_THRESH:
                        total_n_very_incons += 1
                        very_incons_to_count[(parent_label, child_label)] += 1
                total_edges += 1
    total_fraction_inconsistent = total_n_incons / float(total_edges)
    total_fraction_very_inconsistent = total_n_very_incons / float(total_edges)

    print("Inconsistent edges:")
    for incons, count in sorted([(k, v) for k, v in incons_to_count.items()],
                                key=lambda x: x[1]):
        parent = incons[0]
        child = incons[1]
        print("%s -> %s : %d" %
              (label_to_name[parent], label_to_name[child], count))
    print("Very inconsistent edges:")
    for incons, count in sorted([(k, v)
                                 for k, v in very_incons_to_count.items()],
                                key=lambda x: x[1]):
        parent = incons[0]
        child = incons[1]
        print("%s -> %s : %d" %
              (label_to_name[parent], label_to_name[child], count))

    summary_df = pd.DataFrame(
        data=[[total_n_incons, total_edges, total_fraction_inconsistent],
              [
                  total_n_very_incons, total_edges,
                  total_fraction_very_inconsistent
              ],
              [
                  total_n_very_incons,
                  len(confidence_df.index),
                  (float(total_n_very_incons) / len(confidence_df.index))
              ]],
        columns=["No. enconsistent", "Total edges", "Fraction of total edges"],
        index=[
            "Total edges inconsistent",
            "Total edges inconsistent >%f" % VERY_INCONS_THRESH,
            "Avg. very inconsistent per sample"
        ])
    summary_df.to_csv(join(out_dir,
                           '{}.inconsistent_edges_stats.tsv'.format(prefix)),
                      sep='\t')

    exp_child_parent_incons = sorted(exp_child_parent_incons,
                                     key=lambda x: x[3])
    inconss = []
    n_less_eq = []
    for i, incons_entry in enumerate(exp_child_parent_incons):
        incons = incons_entry[3]
        inconss.append(incons)
        n_less_eq.append(float(i) / len(exp_child_parent_incons))

    fig, axarr = plt.subplots(1, 1, figsize=(3.0, 3.0), squeeze=False)
    axarr[0][0].plot(inconss, n_less_eq, color=vl.NICE_COLORS[1], lw=4)
    axarr[0][0].set_xlabel('Child prob. - Parent prob.')
    axarr[0][0].set_ylabel('Cumulative probability')
    axarr[0][0].set_xlim((0.0, 1.0))
    axarr[0][0].set_ylim((0.0, 1.0))
    out_f = join(out_dir, "{}.CDF_inconsistences".format(prefix))
    fig.savefig("%s.eps" % out_f,
                format='eps',
                bbox_inches='tight',
                dpi=100,
                transparent=True)
    fig.savefig("%s.pdf" % out_f,
                format='pdf',
                bbox_inches='tight',
                dpi=100,
                transparent=True)
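The inconsistency rule in Example #13 can be illustrated on toy numbers; EPSILON is assumed to be a small constant defined elsewhere in the script:

EPSILON = 0.05  # assumed value, for illustration only
parent_conf = 0.30
child_conf = 0.42
is_inconsistent = (child_conf > parent_conf
                   and abs(child_conf - parent_conf) > EPSILON)
# True: the child is scored higher than its parent by a non-negligible margin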
Example #14
def main():
    usage = "usage: %prog <options> <| delimited results files>, <| delimited method names>"
    parser = OptionParser()
    parser.add_option(
        "-o", 
        "--out_dir", 
        help="Directory in which to write the output. If it doesn't exist, create the directory."
    )
    parser.add_option(
        "-f",
        "--config_file",
        help="JSON file with all inputs required to run this analysis"
    )
    parser.add_option(
        "-t",
        "--thresholds",
        help="Either a JSON file mapping each label to a decision threshold or number denoting the threshold to use for all cell types" 
    )
    parser.add_option(
        "-v",
        "--threshold_val",
        help="A number denoting the threshold to use for all cell types"
    )
    parser.add_option(
        "-c", 
        "--conservative_mode", 
        action="store_true", 
        help="Compute conservative metrics"
    )
    (options, args) = parser.parse_args()

    conservative_mode = options.conservative_mode
    if options.threshold_val:
        label_to_thresh = defaultdict(lambda: float(options.threshold_val))
    elif options.thresholds:
        label_to_thresh_df = pd.read_csv(options.thresholds, sep='\t', index_col=0)
        label_to_thresh = {
            label: label_to_thresh_df.loc[label]['threshold']
            for label in label_to_thresh_df.index
        }
    out_dir = options.out_dir
    

    # Parse the input
    if options.config_file:
        config_f = args[0]
        with open(config_f, 'r') as f:
            config = json.load(f)
            label_graph_f = config['label_graph_file']
            labeling_f = config['labeling_file']
            # NOTE: downstream code reads a single results file
            results_f = config['results_files']
            method_name = config['method_name']
    else:
        method_name = args[0]
        results_f = args[1]
        label_graph_f = args[2]     
        labeling_f = args[3]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load the results
    bin_results_df = pd.read_csv(results_f, sep='\t', index_col=0)

    # Create the output directory
    _run_cmd("mkdir -p %s" % out_dir)

    # Compute labels on which we will compute metrics
    include_labels = set(bin_results_df.columns) - BLACKLIST_TERMS

    # Create the assignment matrix where rows are samples, columns
    # are labels, and element (i,j) = True if sample i is annotated
    # with label j
    assignment_df = cm._compute_assignment_matrix(
        bin_results_df,
        exp_to_labels
    )
    #bin_results_da = {}
    #for label in results_df.columns:
    #    if options.thresholds and label not in label_to_thresh:
    #        continue
    #    confs = results_df[label] 
    #    bins = [
    #        (x > label_to_thresh[label])
    #        for x in confs
    #    ]
    #    bin_results_da[label] = bins
    #bin_results_df = pd.DataFrame(
    #    data=bin_results_da,
    #    index=results_df.index
    #)
    assignment_df = assignment_df.loc[bin_results_df.index][bin_results_df.columns]

    metrics_df = cm.compute_label_centric_metrics_binary(
        bin_results_df,
        assignment_df,
        include_labels,
        label_graph=label_graph,
        label_to_name=label_to_name,
        og=og,
        conservative=conservative_mode
    )

    metrics_df.to_csv(join(out_dir, 'binary_cell_type_metrics.tsv'), sep='\t')

    label_to_f1 = {
        label: metrics_df.loc[label]['F1-Score']
        for label in metrics_df.index
    }
    print(label_to_f1)

    # F1-score drawn atop ontology
    draw_collapsed_ontology(
        label_graph,
        label_to_name,
        label_to_f1,
        'F1-Score',
        out_dir
    )
Example #15
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_file", help="Output file")
    (options, args) = parser.parse_args()

    single_cell_exp_list_f = args[0]
    bulk_exp_list_f = args[1]
    single_cell_label_graph_f = args[2]
    bulk_label_graph_f = args[3]

    out_f = options.out_file
    og = the_ontology()

    with open(single_cell_exp_list_f, 'r') as f:
        sc_experiments_data = json.load(f)
    with open(bulk_exp_list_f, 'r') as f:
        bulk_experiments_data = json.load(f)

    single_cell_exps = set(sc_experiments_data['experiments'])
    single_cell_exp_set_name = sc_experiments_data['list_name']
    bulk_exps = set(bulk_experiments_data['experiments'])
    bulk_exp_set_name = bulk_experiments_data['list_name']
    assert single_cell_exp_set_name == "untampered_single_cell_primary_cells_with_data"
    assert bulk_exp_set_name == "untampered_bulk_primary_cells_with_data"

    with open(single_cell_label_graph_f, 'r') as f:
        labels_data = json.load(f)
        sc_label_graph = labels_data['label_graph']
        sc_exp_to_labels = labels_data['labels']
    sc_exp_to_labels = {
        k: set(v) - set(BLACKLIST)
        for k, v in sc_exp_to_labels.items()
    }

    with open(bulk_label_graph_f, 'r') as f:
        labels_data = json.load(f)
        bulk_label_graph = labels_data['label_graph']
        bulk_exp_to_labels = labels_data['labels']
    bulk_exp_to_labels = {
        k: set(v) - set(BLACKLIST)
        for k, v in bulk_exp_to_labels.items()
    }

    # The idea here is that we only want single-cell samples
    # for which ~all~ of its most-specific labels are a subset
    # of one bulk-sample's label-set. Here we collect all of the
    # unique bulk label-sets.
    #
    # For example, suppose a single-cell sample is labeled
    # {embryonic cell, neural cell}, but the bulk data only
    # contains samples labelled {embryonic cell} or {neural cell}.
    # In that case we would discard this cell.
    bulk_label_sets = set()
    for labels in bulk_exp_to_labels.values():
        bulk_label_sets.add(frozenset(labels))

    label_sets_not_in_bulk = set()
    removed_exps = set()
    include_exps = set()
    g = DirectedAcyclicGraph(sc_label_graph)
    for exp, labels in sc_exp_to_labels.items():
        ms_labels = set(g.most_specific_nodes(labels))
        ms_labels -= set(IGNORE)

        # Go through the bulk label-sets and check if the current
        # sample's set of most-specific labels is a subset of any
        # of them. If so, keep it. If not, we discard it.
        found = False
        for label_set in bulk_label_sets:
            if set(ms_labels) <= label_set:
                include_exps.add(exp)
                found = True
                break
        if not found:
            label_sets_not_in_bulk.add(frozenset(ms_labels))
            removed_exps.add(exp)

    print("{} single-cell experiments were removed".format(len(removed_exps)))
    print("Labels that were removed:")
    print(
        json.dumps([[og.id_to_term[x].name for x in label_set]
                    for label_set in label_sets_not_in_bulk],
                   indent=True))

    with open(out_f, 'w') as f:
        f.write(
            json.dumps(
                {
                    "list_name":
                    "untampered_single_cell_primary_cells_with_data_cell_types_in_bulk",
                    "description":
                    "These are all experiments that are in the experiment list '%s' and also share the same set of most-specific labels with at least one experiment in %s"
                    % (single_cell_exp_set_name, bulk_exp_set_name),
                    "experiments":
                    list(include_exps)
                },
                indent=4))
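The filtering criterion in Example #15 keeps a single-cell experiment only when its most-specific label set is a subset of at least one bulk label-set; a minimal sketch with illustrative term IDs:

bulk_label_sets = {frozenset({'CL:0000084'}), frozenset({'CL:0000236'})}
ms_labels = {'CL:0000084'}
keep = any(ms_labels <= label_set for label_set in bulk_label_sets)
# keep is True, so this experiment would be retained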
Example #16
def select_best_most_specific():

    og = the_ontology.the_ontology()
Example #17
def main():
    usage = "usage: %prog <options> <| delimited results files>, <| delimited method names>"
    parser = OptionParser()
    parser.add_option(
        "-o",
        "--out_dir",
        help=
        "Directory in which to write the output. If it doesn't exist, create the directory."
    )
    parser.add_option("-c",
                      "--conservative_mode",
                      action="store_true",
                      help="Compute conservative metrics")
    (options, args) = parser.parse_args()

    conservative_mode = options.conservative_mode
    out_dir = options.out_dir
    method_name = args[0]
    results_f = args[1]
    label_graph_f = args[2]
    labeling_f = args[3]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load the results
    results_df = pd.read_csv(results_f, sep='\t', index_col=0)

    # Create the output directory
    _run_cmd("mkdir -p %s" % out_dir)

    # Compute labels on which we will compute metrics
    include_labels = set(results_df.columns) - BLACKLIST_TERMS

    # Create the assignment matrix where rows are samples, columns
    # are labels, and element (i,j) = True if sample i is annotated
    # with label j
    assignment_df = cm._compute_assignment_matrix(results_df, exp_to_labels)
    assignment_df = assignment_df.loc[results_df.index][results_df.columns]

    precisions, recalls, threshs = cm.compute_joint_metrics(
        results_df,
        assignment_df,
        include_labels,
        label_graph=label_graph,
        label_to_name=label_to_name,
        og=og,
        conservative=conservative_mode)

    with open(join(out_dir, 'joint_pr_curve.json'), 'w') as f:
        json.dump(
            {
                'precisions': precisions,
                'recalls': recalls,
                'thresholds': threshs
            },
            f,
            indent=4)
Example #18
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-b", "--b_descrip", help="This is an argument")
    parser.add_option(
        "-a",
        "--algo_config_dir",
        help="The directory where all the classifier configurations are stored"
    )
    parser.add_option("-r",
                      "--artifacts_dir",
                      help="The directory in which to write temporary files")
    parser.add_option("-o", "--out_file", help="Output file")
    (options, args) = parser.parse_args()

    config_f = args[0]
    dataset_dir = args[1]
    fold_f = args[2]
    out_f = options.out_file

    # Load training configuration
    with open(config_f, 'r') as f:
        training_config = json.load(f)
    params = training_config['params']
    features = training_config['features']
    algorithm = training_config['algorithm']
    preprocessors = None
    preprocessor_params = None
    if 'preprocessors' in training_config:
        assert 'preprocessor_params' in training_config
        preprocessors = training_config['preprocessors']
        preprocessor_params = training_config['preprocessor_params']

    # Load the dataset
    r = load_dataset.load_dataset(dataset_dir, features)
    og = r[0]
    label_graph = r[1]
    label_to_name = r[2]
    the_exps = r[3]
    exp_to_index = r[4]
    exp_to_labels = r[5]
    exp_to_tags = r[6]
    exp_to_study = r[7]
    study_to_exps = r[8]
    exp_to_ms_labels = r[9]
    data_matrix = r[10]
    gene_ids = r[11]

    # Load the fold's study and training/test sets
    with open(fold_f, 'r') as f:
        fold = json.load(f)
    held_exps = fold['experiments']
    held_study = fold['study']
    # Sort for a deterministic row order when indexing the data matrix below
    fold_exps = sorted(set(the_exps) - set(held_exps))

    # Map the fold's training experiments to their
    # label sets
    fold_exp_to_labels = {exp: exp_to_labels[exp] for exp in fold_exps}

    # Build the ontology-graph spanning this
    # fold's training set
    og = the_ontology.the_ontology()
    all_labels = set()
    for labels in fold_exp_to_labels.values():
        all_labels.update(labels)
    fold_label_graph = ontology_utils.ontology_subgraph_spanning_terms(
        all_labels, og)
    fold_label_graph = graph.transitive_reduction_on_dag(fold_label_graph)

    print('Training model...')
    fold_data_df = pd.DataFrame(data=data_matrix,
                                index=the_exps,
                                columns=gene_ids)
    fold_data_df = fold_data_df.loc[fold_exps]
    fold_data_matrix = np.array(fold_data_df)
    out_dir = '.'
    mod = model.train_model(algorithm,
                            params,
                            fold_data_matrix,
                            fold_exps,
                            fold_exp_to_labels,
                            fold_label_graph,
                            item_to_group=None,
                            tmp_dir=join(out_dir, 'tmp'),
                            features=gene_ids,
                            preprocessor_names=preprocessors,
                            preprocessor_params=preprocessor_params)
    print('done.')

    # Apply model on held-out data
    print('Applying model to test set.')
    held_data_df = pd.DataFrame(data=data_matrix,
                                index=the_exps,
                                columns=gene_ids)
    held_data_df = held_data_df.loc[held_exps]
    held_data_matrix = np.array(held_data_df)
    confidence_df, score_df = mod.predict(held_data_matrix, held_data_df.index)
    print('done.')

    # Write output
    confidence_df.to_csv(out_f, sep='\t')
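The fold file read in Example #18 is assumed to describe one held-out study and its experiments; a hypothetical example of its structure, inferred from the keys accessed above:

fold = {
    'study': 'SRP000001',                       # hypothetical held-out study
    'experiments': ['SRX000010', 'SRX000011']   # experiments held out for testing
}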
Example #19
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_dir", help="Output directory")
    (options, args) = parser.parse_args()

    binary_results_f = args[0]
    results_f = args[1]
    label_graph_f = args[2]
    decision_boundary_f = args[3]
    precision_thresh = float(args[4])
    out_dir = options.out_dir

    binary_results_df = pd.read_csv(binary_results_f, sep='\t', index_col=0)
    results_df = pd.read_csv(results_f, sep='\t', index_col=0)
    decision_df = pd.read_csv(decision_boundary_f, sep='\t', index_col=0)

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the label graph
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    label_to_f1 = {
        label: decision_df.loc[label]['F1-score']
        for label in decision_df.index
    }
    label_to_prec = {
        label: decision_df.loc[label]['precision']
        for label in decision_df.index
    }
    label_to_thresh = {
        label: decision_df.loc[label]['empirical_threshold']
        for label in decision_df.index
    }

    # Map each label to its ancestors
    label_to_ancestors = {
        label: label_graph.ancestor_nodes(label)
        for label in label_graph.get_all_nodes()
    }

    # Filter labels according to empirical precision
    hard_labels = set([
        label for label, prec in label_to_prec.items()
        if prec < precision_thresh
    ])

    # Map each experiment to its predicted terms
    print('Mapping each sample to its predicted labels...')
    consider_labels = set(binary_results_df.columns) - hard_labels
    exp_to_pred_labels = {
        exp: [
            label for label in consider_labels
            if binary_results_df.loc[exp][label] == 1
        ]
        for exp in binary_results_df.index
    }

    print('Computing the most-specific predicted labels...')
    exp_to_ms_pred_labels = {
        exp:
        label_graph.most_specific_nodes(set(pred_labels) - QUALIFIER_TERMS)
        for exp, pred_labels in exp_to_pred_labels.items()
    }

    # Select cells with highest probability
    exp_to_select_pred_label = {
        exp:
        max([(label, results_df.loc[exp][label]) for label in ms_pred_labels],
            key=lambda x: x[1])[0]
        for exp, ms_pred_labels in exp_to_ms_pred_labels.items()
        if len(ms_pred_labels) > 0
    }

    exp_to_update_pred = {}
    for exp, select_label in exp_to_select_pred_label.items():
        print('{}: {}'.format(exp, og.id_to_term[select_label].name))
        all_labels = label_to_ancestors[select_label]
        exp_to_update_pred[exp] = all_labels

    # Add qualifier cell types
    for exp in exp_to_update_pred:
        for qual_label in QUALIFIER_TERMS:
            if qual_label in exp_to_pred_labels[exp]:
                all_labels = label_to_ancestors[qual_label]
                exp_to_update_pred[exp].update(all_labels)

    # Create dataframe with filtered results
    da = []
    for exp in binary_results_df.index:
        row = []
        # Some experiments may have no selected prediction; treat them as empty
        pred_labels = exp_to_update_pred.get(exp, set())
        for label in binary_results_df.columns:
            if label in pred_labels:
                row.append(1)
            else:
                row.append(0)
        da.append(row)

    df = pd.DataFrame(data=da,
                      columns=binary_results_df.columns,
                      index=binary_results_df.index)
    df.to_csv(join(
        out_dir, 'filtered_binary_classification_results.prec_{}.tsv'.format(
            str(precision_thresh))),
              sep='\t')
Example #20
def main():
    usage = "usage: %prog <experiment metadata file> <untampered exp list file for experiments that have data> <train-test set partition file>"
    parser = OptionParser(usage=usage)
    parser.add_option("-r",
                      "--train_out_file",
                      help="Training set output experiment list file")
    parser.add_option("-e",
                      "--test_out_file",
                      help="Test set output experiment list file")
    (options, args) = parser.parse_args()

    exp_info_f = args[0]
    untampered_exps_w_data_list_f = args[1]
    train_test_partition_f = args[2]
    train_out_f = options.train_out_file
    test_out_f = options.test_out_file

    og = the_ontology.the_ontology()

    with open(untampered_exps_w_data_list_f, 'r') as f:
        include_experiments_data = json.load(f)
    with open(exp_info_f, 'r') as f:
        exp_to_info = json.load(f)
    with open(train_test_partition_f, 'r') as f:
        partition_data = json.load(f)

    train_studies = partition_data['train_set_studies']
    test_studies = partition_data['test_set_studies']

    include_experiments = set(include_experiments_data['experiments'])
    parent_exp_list_name = include_experiments_data['list_name']
    assert parent_exp_list_name == "all_untampered_bulk_primary_cells_with_data"

    exp_to_study = {
        exp: exp_to_info[exp]['study_accession']
        for exp in exp_to_info
    }
    study_to_exps = defaultdict(lambda: set())
    for exp in include_experiments:
        study = exp_to_study[exp]
        study_to_exps[study].add(exp)

    train_exps = set()
    for study in train_studies:
        train_exps.update(study_to_exps[study])
    test_exps = set()
    for study in test_studies:
        test_exps.update(study_to_exps[study])

    with open(train_out_f, 'w') as f:
        f.write(
            json.dumps(
                {
                    "list_name":
                    "training_set_experiments",
                    "description":
                    "These are a subset of the experiments in the experiment list '%s' cross referenced with the training set partition"
                    % (parent_exp_list_name),
                    "experiments":
                    list(train_exps)
                },
                indent=4))
    with open(test_out_f, 'w') as f:
        f.write(
            json.dumps(
                {
                    "list_name":
                    "test_set_experiments",
                    "description":
                    "These are a subset of the experiments in the experiment list '%s' cross referenced with the test set partition."
                    % (parent_exp_list_name),
                    "experiments":
                    list(test_exps)
                },
                indent=4))