def map_samples_to_terms(restrict_to_samples, metasra_f):
    og = the_ontology.the_ontology()
    query_metasra_mapped_terms_sql = \
        "SELECT sample_accession, term_id FROM mapped_ontology_terms;"
    print("Querying database for sample to terms mappings...")
    sample_to_mapped_terms = defaultdict(lambda: set())
    with sqlite3.connect(metasra_f) as metasra_conn:
        metasra_c = metasra_conn.cursor()
        results = metasra_c.execute(query_metasra_mapped_terms_sql)
        for r in results:
            sample = r[0]
            term_id = r[1]
            if sample in restrict_to_samples:
                sample_to_mapped_terms[sample].add(term_id)

    # Restrict to most-specific terms
    mod_sample_to_mapped_terms = {}
    for sample, terms in sample_to_mapped_terms.items():
        ms_mapped_terms = ontology_graph.most_specific_terms(
            terms, og, sup_relations=["is_a", "part_of"])
        mod_sample_to_mapped_terms[sample] = set(ms_mapped_terms)
    sample_to_mapped_terms = mod_sample_to_mapped_terms
    print("done.")
    return sample_to_mapped_terms
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_file", help="Output file")
    (options, args) = parser.parse_args()
    pr_curves_f = args[0]
    out_f = options.out_file

    og = the_ontology.the_ontology()

    with open(pr_curves_f, 'r') as f:
        method_to_label_to_pr_curves = json.load(f)
    assert len(method_to_label_to_pr_curves) == 1
    method = sorted(method_to_label_to_pr_curves.keys())[0]
    label_to_pr_curves = method_to_label_to_pr_curves[method]

    da = []
    for label, pr in label_to_pr_curves.items():
        precs = pr['precisions']
        recs = pr['recalls']
        threshs = pr['thresholds']
        f1s = map(_compute_f1, zip(precs, recs))
        max_f1_thresh = max(zip(f1s, threshs), key=lambda x: x[0])
        da.append((
            label,
            og.id_to_term[label].name,
            max_f1_thresh[1],
            max_f1_thresh[0]
        ))
    df = pd.DataFrame(
        data=da,
        columns=['label', 'label_name', 'threshold', 'F1-score']
    )
    df.to_csv(out_f, sep='\t', index=False)
    print(df)
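# Note: `_compute_f1` is a helper defined elsewhere in the original scripts.
# The sketch below is a minimal, assumed implementation (the harmonic mean of
# a (precision, recall) pair, with a guard against division by zero); the
# original definition may differ in detail.
def _compute_f1(prec_rec):
    prec, rec = prec_rec
    if prec + rec == 0.0:
        return 0.0
    return 2.0 * prec * rec / (prec + rec)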
def _label_experiments(
        experiment_accs,
        exp_to_info,
        which_terms='mapped_terms'
    ):
    og = the_og.the_ontology()
    exp_to_terms = defaultdict(lambda: set())
    for exp in experiment_accs:
        mapped_terms = set(
            exp_to_info[exp][which_terms]
        )
        # Compute all cell-type terms
        all_terms = set()
        for term in mapped_terms:
            all_terms.update(
                og.recursive_relationship(
                    term,
                    recurs_relationships=['is_a', 'part_of']
                )
            )
        all_terms = [
            x
            for x in all_terms
            if x.split(':')[0] == 'CL'
        ]
        exp_to_terms[exp] = all_terms
    return exp_to_terms
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_dir", help="Directory in which to write output")
    (options, args) = parser.parse_args()
    result_f = args[0]
    out_dir = options.out_dir

    og = the_ontology.the_ontology()

    raw_df = pd.read_csv(result_f, sep='\t', index_col=0)
    raw_df = raw_df.drop(
        [
            'first.labels',
            'tuning.scores.first',
            'tuning.scores.second',
            'labels',
            'pruned.labels'
        ],
        axis=1
    )

    # Get all terms represented in this output
    all_terms = set()
    for label in raw_df.columns:
        if label not in SINGLER_OUTPUT_TO_TERMS:
            print('Skipping column "{}"'.format(label))
            continue
        all_terms.update(SINGLER_OUTPUT_TO_TERMS[label])

    # Map each term to its ancestors
    term_to_ancestors = {
        term: og.recursive_superterms(term)
        for term in all_terms
    }
    for term, ancestors in term_to_ancestors.items():
        all_terms.update(ancestors)
    all_terms = sorted(all_terms)

    # Compute the binary-classification matrix
    da = []
    for cell in raw_df.index:
        preds = [
            (pred, label)
            for pred, label in zip(raw_df.loc[cell], raw_df.columns)
        ]
        max_label = max(preds, key=lambda x: x[0])[1]
        pred_terms = SINGLER_OUTPUT_TO_TERMS[max_label]
        all_pred_terms = set()
        for term in pred_terms:
            all_pred_terms.update(term_to_ancestors[term])
        row = []
        for term in all_terms:
            if term in all_pred_terms:
                row.append(1)
            else:
                row.append(0)
        da.append(row)
    bin_df = pd.DataFrame(data=da, columns=all_terms, index=raw_df.index)
    bin_df.to_csv(join(out_dir, 'binary_classification_results.tsv'), sep='\t')
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_dir", help="Directory in which to write output")
    (options, args) = parser.parse_args()
    result_f = args[0]
    out_dir = options.out_dir

    og = the_ontology.the_ontology()

    scmatch_output_to_all_terms = defaultdict(lambda: set())
    all_terms = set()
    for scmatch_out, terms in SCMATCH_OUTPUT_TO_TERMS.items():
        for term in terms:
            scmatch_output_to_all_terms[scmatch_out].update(
                og.recursive_superterms(term))
            all_terms.update(og.recursive_superterms(term))
    scmatch_output_to_all_terms = dict(scmatch_output_to_all_terms)
    all_terms = sorted(all_terms)

    results_df = pd.read_csv(result_f, index_col=0)
    print(results_df)

    conf_da = []
    bin_da = []
    nonmapped_samples = set()
    for cell in results_df.index:
        scmatch_out = results_df.loc[cell]['top sample'].split(',')[0]
        score = results_df.loc[cell]['top correlation score']
        try:
            terms = scmatch_output_to_all_terms[scmatch_out]
        except KeyError:
            nonmapped_samples.add(scmatch_out)
            terms = []
        term_scores = []
        term_assigns = []
        for term in all_terms:
            if term in terms:
                term_scores.append(score)
                term_assigns.append(1)
            else:
                term_scores.append(float('-inf'))
                term_assigns.append(0)
        conf_da.append(term_scores)
        bin_da.append(term_assigns)
    print('Could not map the following samples to ontology terms:')
    print('\n'.join(nonmapped_samples))

    conf_df = pd.DataFrame(data=conf_da, columns=all_terms, index=results_df.index)
    bin_df = pd.DataFrame(data=bin_da, columns=all_terms, index=results_df.index)
    conf_df.to_csv(join(out_dir, 'classification_results.tsv'), sep='\t')
    bin_df.to_csv(join(out_dir, 'binary_classification_results.tsv'), sep='\t')
def main():
    usage = "usage: %prog <options> <metrics file> <metric name> <label graph file>"
    parser = OptionParser(usage=usage)
    parser.add_option("-o", "--out_file", help="Output file")
    (options, args) = parser.parse_args()

    # Parse the input
    metrics_f = args[0]
    metric_name = args[1]
    label_graph_f = args[2]
    out_f = options.out_file

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    print('Reading label graph from {}.'.format(label_graph_f))
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }
    print('\n'.join(
        set(label_to_name.values()) - set(LABEL_NAME_TO_SUCCINCT.keys())))

    # Topologically sort the labels and assign them numbers
    topo_sort_labels = topological_sort(label_graph)
    label_to_topo_index = {
        label: index
        for index, label in enumerate(topo_sort_labels)
    }

    # Create text legend for graph
    #legend = ''
    #for label, topo_index in label_to_topo_index.items():
    #    legend += '{} {}'.format(topo_index, og.id_to_term[label].name)
    #with open(join(out_dir, 'graph_node_labels.txt'), 'w') as f:
    #    f.write(legend)

    # Load the metrics
    metrics_df = pd.read_csv(metrics_f, sep='\t', index_col=0)

    # Create the output directory
    #_run_cmd("mkdir -p %s" % out_dir)

    label_to_metric = {
        label: metrics_df.loc[label][metric_name]
        for label in metrics_df.index
        if label in label_to_name
    }

    # Metric drawn atop the ontology
    draw_collapsed_ontology(label_graph, label_to_name, label_to_metric,
                            metric_name, out_f)
def main():
    usage = ""
    parser = OptionParser()
    parser.add_option("-s", "--use_supplemental", action="store_true",
                      help="Use supplemental labels")
    parser.add_option("-o", "--out_file", help="Output file")
    (options, args) = parser.parse_args()
    annot_f = args[0]
    exp_set_f = args[1]
    out_f = options.out_file

    # Load metadata
    with open(annot_f, 'r') as f:
        exp_to_info = json.load(f)
    with open(exp_set_f, 'r') as f:
        the_exps = json.load(f)['experiments']

    # Label the experiments
    if options.use_supplemental:
        exp_to_terms = _label_experiments(
            the_exps,
            exp_to_info,
            which_terms='supplemental_mapped_terms')
    else:
        exp_to_terms = _label_experiments(the_exps, exp_to_info)

    # Generate the labelling-graph induced by this dataset
    og = the_og.the_ontology()
    all_terms = set()
    for terms in exp_to_terms.values():
        all_terms.update(terms)
    label_graph = ontology_utils.ontology_subgraph_spanning_terms(
        all_terms, og)
    label_graph = graph.transitive_reduction_on_dag(label_graph)

    # Write output
    exp_set_name = basename(exp_set_f).split('.')[0]
    with open(out_f, 'w') as f:
        f.write(json.dumps(
            {
                'labels_config': {
                    'experiment_set': exp_set_name
                },
                'label_graph': {
                    source: list(targets)
                    for source, targets in label_graph.source_to_targets.items()
                },
                'labels': exp_to_terms
            },
            indent=4,
            separators=(',', ': ')))
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_file", help="File to write labels data")
    (options, args) = parser.parse_args()
    dataset_f = args[0]
    out_f = options.out_file

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the cell_ids and the 10x datasets from which they originate
    with h5py.File(dataset_f, 'r') as f:
        cell_ids = [
            str(x)[2:-1]
            for x in f['experiment'][:]
        ]
        datasets = [
            str(x)[2:-1]
            for x in f['dataset'][:]
        ]

    # Label each cell
    cell_id_to_labels = {}
    all_labels = set()
    for dataset, cell_id in zip(datasets, cell_ids):
        ms_label = DATA_SET_TO_TARGET_TERM[dataset]
        labels = sorted(og.recursive_superterms(ms_label))
        cell_id_to_labels[cell_id] = labels
        all_labels.update(labels)

    # Generate the label-graph
    label_graph = ontology_utils.ontology_subgraph_spanning_terms(
        all_labels, og)
    label_graph = graph.transitive_reduction_on_dag(label_graph)

    # Write output
    with open(out_f, 'w') as f:
        f.write(json.dumps(
            {
                'labels_config': {},
                'label_graph': {
                    source: list(targets)
                    for source, targets in label_graph.source_to_targets.items()
                },
                'labels': cell_id_to_labels
            },
            indent=4,
            separators=(',', ': ')))
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_file", help="File to write labels data")
    (options, args) = parser.parse_args()
    raw_10x_f = args[0]
    out_f = options.out_file

    # Load the ontology
    og = the_ontology.the_ontology()

    h = pd.HDFStore(raw_10x_f, mode='r')
    df = h['DF_ALL']
    cells = df.index.get_level_values('Cell ID')
    cell_types = df.index.get_level_values('CELL_TYPE')
    print(set(cell_types))

    # Label each cell
    cell_id_to_labels = {}
    all_labels = set()
    for cell_id, cell_type in zip(cells, cell_types):
        ms_label = CELL_TYPE_TO_TERM[cell_type]
        labels = sorted(og.recursive_superterms(ms_label))
        cell_id_to_labels[cell_id] = labels
        all_labels.update(labels)

    # Generate the label-graph
    label_graph = ontology_utils.ontology_subgraph_spanning_terms(
        all_labels, og)
    label_graph = graph.transitive_reduction_on_dag(label_graph)

    # Write output
    print("Writing output to {}...".format(out_f))
    with open(out_f, 'w') as f:
        f.write(json.dumps(
            {
                'labels_config': {},
                'label_graph': {
                    source: list(targets)
                    for source, targets in label_graph.source_to_targets.items()
                },
                'labels': cell_id_to_labels
            },
            indent=4,
            separators=(',', ': ')))
def main():
    usage = "usage: %prog <options> <PR-curve file> <label graph file> <labeling file>"
    parser = OptionParser(usage=usage)
    parser.add_option("-o", "--out_file", help="File in which to write output")
    (options, args) = parser.parse_args()
    out_f = options.out_file
    pr_curve_f = args[0]
    label_graph_f = args[1]
    labeling_f = args[2]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load the PR-curves
    with open(pr_curve_f, 'r') as f:
        label_to_pr_curve = json.load(f)

    # Compute the labels on which we will compute metrics
    include_labels = set(label_to_pr_curve.keys()) - BLACKLIST_TERMS

    # Precision-recall curves overlaid on the label-graph
    draw_collapsed_ontology_w_pr_curves(exp_to_labels, label_graph,
                                        label_to_name, label_to_pr_curve,
                                        out_f)
def main():
    usage = "usage: %prog <training set experiment list file> <test set experiment list file>"
    parser = OptionParser(usage=usage)
    parser.add_option("-o", "--out_file",
                      help="Output experiment list file")
    (options, args) = parser.parse_args()
    train_exps_list_f = args[0]
    test_exps_list_f = args[1]
    out_f = options.out_file

    og = the_ontology.the_ontology()

    with open(train_exps_list_f, 'r') as f:
        train_include_experiments_data = json.load(f)
    with open(test_exps_list_f, 'r') as f:
        test_include_experiments_data = json.load(f)
    include_experiments = set(train_include_experiments_data['experiments'])
    include_experiments.update(
        set(test_include_experiments_data['experiments'])
    )
    train_parent_exp_list_name = train_include_experiments_data['list_name']
    test_parent_exp_list_name = test_include_experiments_data['list_name']
    assert train_parent_exp_list_name == "training_set_experiments"
    assert test_parent_exp_list_name == "test_set_experiments"

    with open(out_f, 'w') as f:
        f.write(json.dumps(
            {
                "list_name": "untampered_bulk_primary_cells_with_data",
                "description": "The union of the experiments in the experiment lists '%s' and '%s'." % (
                    train_parent_exp_list_name,
                    test_parent_exp_list_name
                ),
                "experiments": list(include_experiments)
            },
            indent=4
        ))
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_file", help="Output file")
    (options, args) = parser.parse_args()
    pr_curves_f = args[0]
    out_f = options.out_file

    og = the_ontology.the_ontology()

    with open(pr_curves_f, 'r') as f:
        label_to_pr_curves = json.load(f)

    da = []
    for label, pr in label_to_pr_curves.items():
        if label in REMOVE_TERMS:
            continue
        precs = pr[0]
        recs = pr[1]
        threshs = pr[2]
        f1s = map(_compute_f1, zip(precs, recs))
        max_f1_thresh = max(zip(f1s, precs, threshs), key=lambda x: x[0])
        thresh = min([max_f1_thresh[2], 0.5])
        #thresh = max_f1_thresh[2]
        #da.append((label, og.id_to_term[label].name, max_f1_thresh[1], max_f1_thresh[0]))
        da.append((
            label,
            og.id_to_term[label].name,
            thresh,
            max_f1_thresh[2],
            max_f1_thresh[1],
            max_f1_thresh[0]
        ))
    df = pd.DataFrame(
        data=da,
        columns=[
            'label',
            'label_name',
            'threshold',
            'empirical_threshold',
            'precision',
            'F1-score'
        ])
    df.to_csv(out_f, sep='\t', index=False)
    print(df)
def main():
    usage = "usage: %prog <options> <results file> <label graph file> <output file prefix>"
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_dir",
                      help="Directory in which to write the output")
    (options, args) = parser.parse_args()
    results_f = args[0]
    label_graph_f = args[1]
    prefix = args[2]
    out_dir = options.out_dir

    # Load the results
    confidence_df = pd.read_csv(results_f, sep='\t', index_col=0)

    # Map each label to its name
    og = the_ontology.the_ontology()
    label_to_name = {
        label: og.id_to_term[label].name
        for label in confidence_df.columns
    }

    _run_cmd("mkdir -p %s" % out_dir)

    # Load the label-graph
    with open(label_graph_f, 'r') as f:
        labels_data = json.load(f)
    label_graph = DirectedAcyclicGraph(labels_data['label_graph'])

    # Compute the labels over which we will compute metrics.
    # This label set is simply the set of labels for which we have
    # predictions for every sample.
    include_labels = set(confidence_df.columns) - BLACKLIST_TERMS

    total_n_incons = 0
    total_n_very_incons = 0
    total_edges = 0
    incons_to_count = defaultdict(lambda: 0)
    very_incons_to_count = defaultdict(lambda: 0)
    exp_child_parent_incons = []
    for exp in confidence_df.index:
        exp_n_incons = 0
        for parent_label in confidence_df.columns:
            parent_conf = confidence_df.loc[exp][parent_label]
            if parent_label in BLACKLIST_TERMS:
                continue
            for child_label in label_graph.source_to_targets[parent_label]:
                if child_label in BLACKLIST_TERMS:
                    continue
                if child_label not in confidence_df.columns:
                    continue
                child_conf = confidence_df.loc[exp][child_label]
                # Don't consider parent-child edges where the prediction-scores
                # for both nodes are less than 1%
                if child_conf < 0.01 and parent_conf < 0.01:
                    continue
                # We count the edge as inconsistent if BOTH the child's score is
                # greater than its parent's AND that difference is non-negligible
                if abs(child_conf - parent_conf) > EPSILON and child_conf > parent_conf:
                    exp_child_parent_incons.append(
                        (exp, child_label, parent_label,
                         (child_conf - parent_conf)))
                    incons_to_count[(parent_label, child_label)] += 1
                    total_n_incons += 1
                    exp_n_incons += 1
                    if child_conf - parent_conf > VERY_INCONS_THRESH:
                        total_n_very_incons += 1
                        very_incons_to_count[(parent_label, child_label)] += 1
                total_edges += 1
    total_fraction_inconsistent = total_n_incons / float(total_edges)
    total_fraction_very_inconsistent = total_n_very_incons / float(total_edges)

    print("Inconsistent edges:")
    for incons, count in sorted([(k, v) for k, v in incons_to_count.items()],
                                key=lambda x: x[1]):
        parent = incons[0]
        child = incons[1]
        print("%s -> %s : %d" % (label_to_name[parent], label_to_name[child], count))
    print("Very inconsistent edges:")
    for incons, count in sorted([(k, v) for k, v in very_incons_to_count.items()],
                                key=lambda x: x[1]):
        parent = incons[0]
        child = incons[1]
        print("%s -> %s : %d" % (label_to_name[parent], label_to_name[child], count))

    summary_df = pd.DataFrame(
        data=[
            [total_n_incons, total_edges, total_fraction_inconsistent],
            [total_n_very_incons, total_edges, total_fraction_very_inconsistent],
            [
                total_n_very_incons,
                len(confidence_df.index),
                (float(total_n_very_incons) / len(confidence_df.index))
            ]
        ],
        columns=["No. inconsistent", "Total edges", "Fraction of total edges"],
        index=[
            "Total edges inconsistent",
            "Total edges inconsistent >%f" % VERY_INCONS_THRESH,
            "Avg. very inconsistent per sample"
        ])
    summary_df.to_csv(
        join(out_dir, '{}.inconsistent_edges_stats.tsv'.format(prefix)),
        sep='\t')

    exp_child_parent_incons = sorted(exp_child_parent_incons, key=lambda x: x[3])
    inconss = []
    n_less_eq = []
    for i, exp_child_parent_icons in enumerate(exp_child_parent_incons):
        incons = exp_child_parent_icons[3]
        inconss.append(incons)
        n_less_eq.append(float(i) / len(exp_child_parent_incons))
    fig, axarr = plt.subplots(1, 1, figsize=(3.0, 3.0), squeeze=False)
    axarr[0][0].plot(inconss, n_less_eq, color=vl.NICE_COLORS[1], lw=4)
    axarr[0][0].set_xlabel('Child prob. - Parent prob.')
    axarr[0][0].set_ylabel('Cumulative probability')
    axarr[0][0].set_xlim((0.0, 1.0))
    axarr[0][0].set_ylim((0.0, 1.0))
    out_f = join(out_dir, "{}.CDF_inconsistences".format(prefix))
    fig.savefig("%s.eps" % out_f, format='eps', bbox_inches='tight',
                dpi=100, transparent=True)
    fig.savefig("%s.pdf" % out_f, format='pdf', bbox_inches='tight',
                dpi=100, transparent=True)
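# Illustrative example of the inconsistency criterion used above. EPSILON and
# VERY_INCONS_THRESH are module-level constants in the original script; the
# values and probabilities below are hypothetical and for illustration only.
#
#   EPSILON = 0.001, VERY_INCONS_THRESH = 0.1
#   parent_conf = 0.30   (the parent term's predicted probability)
#   child_conf  = 0.45   (the child term's predicted probability)
#   child_conf - parent_conf = 0.15 > EPSILON             -> edge is inconsistent
#   child_conf - parent_conf = 0.15 > VERY_INCONS_THRESH  -> edge is "very" inconsistent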
def main():
    usage = "usage: %prog <options> <method name> <results file> <label graph file> <labeling file>"
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-o",
        "--out_dir",
        help="Directory in which to write the output. If it doesn't exist, create the directory."
    )
    parser.add_option(
        "-f",
        "--config_file",
        help="JSON file with all inputs required to run this analysis"
    )
    parser.add_option(
        "-t",
        "--thresholds",
        help="A TSV file mapping each label to a decision threshold"
    )
    parser.add_option(
        "-v",
        "--threshold_val",
        help="A number denoting the threshold to use for all cell types"
    )
    parser.add_option(
        "-c",
        "--conservative_mode",
        action="store_true",
        help="Compute conservative metrics"
    )
    (options, args) = parser.parse_args()

    conservative_mode = options.conservative_mode
    if options.threshold_val:
        label_to_thresh = defaultdict(lambda: float(options.threshold_val))
    elif options.thresholds:
        label_to_thresh_df = pd.read_csv(options.thresholds, sep='\t', index_col=0)
        label_to_thresh = {
            label: label_to_thresh_df.loc[label]['threshold']
            for label in label_to_thresh_df.index
        }
    out_dir = options.out_dir

    # Parse the input
    if options.config_file:
        config_f = args[0]
        with open(config_f, 'r') as f:
            config = json.load(f)
        label_graph_f = config['label_graph_file']
        labeling_f = config['labeling_file']
        results_fs = config['results_files']
        method_name = config['method_name']
    else:
        method_name = args[0]
        results_f = args[1]
        label_graph_f = args[2]
        labeling_f = args[3]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load the results
    bin_results_df = pd.read_csv(results_f, sep='\t', index_col=0)

    # Create the output directory
    _run_cmd("mkdir -p %s" % out_dir)

    # Compute the labels on which we will compute metrics
    include_labels = set(bin_results_df.columns) - BLACKLIST_TERMS

    # Create the assignment matrix where rows are samples, columns
    # are labels, and element (i,j) = True if sample i is annotated
    # with label j
    assignment_df = cm._compute_assignment_matrix(
        bin_results_df,
        exp_to_labels
    )

    #bin_results_da = {}
    #for label in results_df.columns:
    #    if options.thresholds and label not in label_to_thresh:
    #        continue
    #    confs = results_df[label]
    #    bins = [
    #        (x > label_to_thresh[label])
    #        for x in confs
    #    ]
    #    bin_results_da[label] = bins
    #bin_results_df = pd.DataFrame(
    #    data=bin_results_da,
    #    index=results_df.index
    #)

    assignment_df = assignment_df.loc[bin_results_df.index][bin_results_df.columns]
    metrics_df = cm.compute_label_centric_metrics_binary(
        bin_results_df,
        assignment_df,
        include_labels,
        label_graph=label_graph,
        label_to_name=label_to_name,
        og=og,
        conservative=conservative_mode
    )
    metrics_df.to_csv(join(out_dir, 'binary_cell_type_metrics.tsv'), sep='\t')

    label_to_f1 = {
        label: metrics_df.loc[label]['F1-Score']
        for label in metrics_df.index
    }
    print(label_to_f1)

    # F1-score drawn atop the ontology
    draw_collapsed_ontology(
        label_graph,
        label_to_name,
        label_to_f1,
        'F1-Score',
        out_dir
    )
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_file", help="Output file")
    (options, args) = parser.parse_args()
    single_cell_exp_list_f = args[0]
    bulk_exp_list_f = args[1]
    single_cell_label_graph_f = args[2]
    bulk_label_graph_f = args[3]
    out_f = options.out_file

    og = the_ontology()

    with open(single_cell_exp_list_f, 'r') as f:
        sc_experiments_data = json.load(f)
    with open(bulk_exp_list_f, 'r') as f:
        bulk_experiments_data = json.load(f)
    single_cell_exps = set(sc_experiments_data['experiments'])
    single_cell_exp_set_name = sc_experiments_data['list_name']
    bulk_exps = set(bulk_experiments_data['experiments'])
    bulk_exp_set_name = bulk_experiments_data['list_name']
    assert single_cell_exp_set_name == "untampered_single_cell_primary_cells_with_data"
    assert bulk_exp_set_name == "untampered_bulk_primary_cells_with_data"

    with open(single_cell_label_graph_f, 'r') as f:
        labels_data = json.load(f)
    sc_label_graph = labels_data['label_graph']
    sc_exp_to_labels = labels_data['labels']
    sc_exp_to_labels = {
        k: set(v) - set(BLACKLIST)
        for k, v in sc_exp_to_labels.items()
    }
    with open(bulk_label_graph_f, 'r') as f:
        labels_data = json.load(f)
    bulk_label_graph = labels_data['label_graph']
    bulk_exp_to_labels = labels_data['labels']
    bulk_exp_to_labels = {
        k: set(v) - set(BLACKLIST)
        for k, v in bulk_exp_to_labels.items()
    }

    # The idea here is that we only want single-cell samples for which
    # ~all~ of their most-specific labels are a subset of at least one
    # bulk sample's label-set. Here we collect all of the unique bulk
    # label-sets.
    #
    # For example, say a single-cell sample is labeled as {embryonic cell,
    # neural cell}, but in the bulk data we only have samples labelled as
    # {embryonic cell} or {neural cell}. We would discard this cell.
    bulk_label_sets = set()
    for labels in bulk_exp_to_labels.values():
        bulk_label_sets.add(frozenset(labels))

    label_sets_not_in_bulk = set()
    removed_exps = set()
    include_exps = set()
    g = DirectedAcyclicGraph(sc_label_graph)
    for exp, labels in sc_exp_to_labels.items():
        ms_labels = set(g.most_specific_nodes(labels))
        ms_labels -= set(IGNORE)
        # Go through the bulk label-sets and check whether the current
        # sample's set of most-specific labels is a subset of any of
        # them. If so, keep the sample; if not, discard it.
        found = False
        for label_set in bulk_label_sets:
            if set(ms_labels) <= label_set:
                include_exps.add(exp)
                found = True
                break
        if not found:
            label_sets_not_in_bulk.add(frozenset(ms_labels))
            removed_exps.add(exp)
    print("{} single-cell experiments were removed".format(len(removed_exps)))
    print("Labels that were removed:")
    print(json.dumps(
        [
            [og.id_to_term[x].name for x in label_set]
            for label_set in label_sets_not_in_bulk
        ],
        indent=True))

    with open(out_f, 'w') as f:
        f.write(json.dumps(
            {
                "list_name": "untampered_single_cell_primary_cells_with_data_cell_types_in_bulk",
                "description": "These are all experiments that are in the experiment list '%s' and also share the same set of most-specific labels with at least one experiment in '%s'." % (
                    single_cell_exp_set_name,
                    bulk_exp_set_name
                ),
                "experiments": list(include_exps)
            },
            indent=4))
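# A minimal worked example of the subset test used above, with hypothetical
# term IDs ('CL:A', 'CL:B', 'CL:C') standing in for real ontology terms:
#
#   bulk_label_sets = {frozenset({'CL:A'}), frozenset({'CL:A', 'CL:B'})}
#   ms_labels = {'CL:B'}
#   any(set(ms_labels) <= s for s in bulk_label_sets)   # True  -> keep the cell
#   ms_labels = {'CL:B', 'CL:C'}
#   any(set(ms_labels) <= s for s in bulk_label_sets)   # False -> discard the cell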
def select_best_most_specific():
    og = the_ontology.the_ontology()
def main():
    usage = "usage: %prog <options> <method name> <results file> <label graph file> <labeling file>"
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-o",
        "--out_dir",
        help="Directory in which to write the output. If it doesn't exist, create the directory."
    )
    parser.add_option("-c", "--conservative_mode", action="store_true",
                      help="Compute conservative metrics")
    (options, args) = parser.parse_args()

    conservative_mode = options.conservative_mode
    out_dir = options.out_dir
    method_name = args[0]
    results_f = args[1]
    label_graph_f = args[2]
    labeling_f = args[3]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load the results
    results_df = pd.read_csv(results_f, sep='\t', index_col=0)

    # Create the output directory
    _run_cmd("mkdir -p %s" % out_dir)

    # Compute the labels on which we will compute metrics
    include_labels = set(results_df.columns) - BLACKLIST_TERMS

    # Create the assignment matrix where rows are samples, columns
    # are labels, and element (i,j) = True if sample i is annotated
    # with label j
    assignment_df = cm._compute_assignment_matrix(results_df, exp_to_labels)
    assignment_df = assignment_df.loc[results_df.index][results_df.columns]

    precisions, recalls, threshs = cm.compute_joint_metrics(
        results_df,
        assignment_df,
        include_labels,
        label_graph=label_graph,
        label_to_name=label_to_name,
        og=og,
        conservative=conservative_mode)

    with open(join(out_dir, 'joint_pr_curve.json'), 'w') as f:
        json.dump(
            {
                'precisions': precisions,
                'recalls': recalls,
                'thresholds': threshs
            },
            f,
            indent=4)
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-b", "--b_descrip", help="This is an argument")
    parser.add_option(
        "-a",
        "--algo_config_dir",
        help="The directory where all the classifier configurations are stored"
    )
    parser.add_option("-r", "--artifacts_dir",
                      help="The directory in which to write temporary files")
    parser.add_option("-o", "--out_file", help="Output file")
    (options, args) = parser.parse_args()
    config_f = args[0]
    dataset_dir = args[1]
    fold_f = args[2]
    out_f = options.out_file

    # Load training configuration
    with open(config_f, 'r') as f:
        training_config = json.load(f)
    params = training_config['params']
    features = training_config['features']
    algorithm = training_config['algorithm']
    preprocessors = None
    preprocessor_params = None
    if 'preprocessors' in training_config:
        assert 'preprocessor_params' in training_config
        preprocessors = training_config['preprocessors']
        preprocessor_params = training_config['preprocessor_params']

    # Load the dataset
    r = load_dataset.load_dataset(dataset_dir, features)
    og = r[0]
    label_graph = r[1]
    label_to_name = r[2]
    the_exps = r[3]
    exp_to_index = r[4]
    exp_to_labels = r[5]
    exp_to_tags = r[6]
    exp_to_study = r[7]
    study_to_exps = r[8]
    exp_to_ms_labels = r[9]
    data_matrix = r[10]
    gene_ids = r[11]

    # Load the fold's study and training/test sets
    with open(fold_f, 'r') as f:
        fold = json.load(f)
    held_exps = fold['experiments']
    held_study = fold['study']
    fold_exps = set(the_exps) - set(held_exps)

    # Map the fold's training experiments to their label sets
    fold_exp_to_labels = {exp: exp_to_labels[exp] for exp in fold_exps}

    # Build the ontology-graph spanning this fold's training set
    og = the_ontology.the_ontology()
    all_labels = set()
    for labels in fold_exp_to_labels.values():
        all_labels.update(labels)
    fold_label_graph = ontology_utils.ontology_subgraph_spanning_terms(
        all_labels, og)
    fold_label_graph = graph.transitive_reduction_on_dag(fold_label_graph)

    print('Training model...')
    fold_data_df = pd.DataFrame(data=data_matrix, index=the_exps,
                                columns=gene_ids)
    fold_data_df = fold_data_df.loc[fold_exps]
    fold_data_matrix = np.array(fold_data_df)
    out_dir = '.'
    mod = model.train_model(
        algorithm,
        params,
        fold_data_matrix,
        fold_exps,
        fold_exp_to_labels,
        fold_label_graph,
        item_to_group=None,
        tmp_dir=join(out_dir, 'tmp'),
        features=gene_ids,
        preprocessor_names=preprocessors,
        preprocessor_params=preprocessor_params)
    print('done.')

    # Apply model on held-out data
    print('Applying model to test set.')
    held_data_df = pd.DataFrame(data=data_matrix, index=the_exps,
                                columns=gene_ids)
    held_data_df = held_data_df.loc[held_exps]
    held_data_matrix = np.array(held_data_df)
    confidence_df, score_df = mod.predict(held_data_matrix, held_data_df.index)
    print('done.')

    # Write output
    confidence_df.to_csv(out_f, sep='\t')
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    parser.add_option("-o", "--out_dir", help="Output directory")
    (options, args) = parser.parse_args()
    binary_results_f = args[0]
    results_f = args[1]
    label_graph_f = args[2]
    decision_boundary_f = args[3]
    precision_thresh = float(args[4])
    out_dir = options.out_dir

    binary_results_df = pd.read_csv(binary_results_f, sep='\t', index_col=0)
    results_df = pd.read_csv(results_f, sep='\t', index_col=0)
    decision_df = pd.read_csv(decision_boundary_f, sep='\t', index_col=0)

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the label graph
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    label_to_f1 = {
        label: decision_df.loc[label]['F1-score']
        for label in decision_df.index
    }
    label_to_prec = {
        label: decision_df.loc[label]['precision']
        for label in decision_df.index
    }
    label_to_thresh = {
        label: decision_df.loc[label]['empirical_threshold']
        for label in decision_df.index
    }

    # Map each label to its ancestors
    label_to_ancestors = {
        label: label_graph.ancestor_nodes(label)
        for label in label_graph.get_all_nodes()
    }

    # Filter labels according to empirical precision
    hard_labels = set([
        label
        for label, prec in label_to_prec.items()
        if prec < precision_thresh
    ])

    # Map each experiment to its predicted terms
    print('Mapping each sample to its predicted labels...')
    consider_labels = set(binary_results_df.columns) - hard_labels
    exp_to_pred_labels = {
        exp: [
            label
            for label in consider_labels
            if binary_results_df.loc[exp][label] == 1
        ]
        for exp in binary_results_df.index
    }
    print('Computing the most-specific predicted labels...')
    exp_to_ms_pred_labels = {
        exp: label_graph.most_specific_nodes(set(pred_labels) - QUALIFIER_TERMS)
        for exp, pred_labels in exp_to_pred_labels.items()
    }

    # Select, for each cell, the most-specific predicted label with the
    # highest probability
    exp_to_select_pred_label = {
        exp: max(
            [
                (label, results_df.loc[exp][label])
                for label in ms_pred_labels
            ],
            key=lambda x: x[1])[0]
        for exp, ms_pred_labels in exp_to_ms_pred_labels.items()
        if len(ms_pred_labels) > 0
    }
    exp_to_update_pred = {}
    for exp, select_label in exp_to_select_pred_label.items():
        print('{}: {}'.format(exp, og.id_to_term[select_label].name))
        all_labels = label_to_ancestors[select_label]
        # Copy the ancestor set so that updates below don't mutate the
        # shared sets in label_to_ancestors
        exp_to_update_pred[exp] = set(all_labels)

    # Add qualifier cell types
    for exp in exp_to_update_pred:
        for qual_label in QUALIFIER_TERMS:
            if qual_label in exp_to_pred_labels[exp]:
                all_labels = label_to_ancestors[qual_label]
                exp_to_update_pred[exp].update(all_labels)

    # Create a dataframe with the filtered results
    da = []
    for exp in binary_results_df.index:
        row = []
        for label in binary_results_df.columns:
            if label in exp_to_update_pred[exp]:
                row.append(1)
            else:
                row.append(0)
        da.append(row)
    df = pd.DataFrame(
        data=da,
        columns=binary_results_df.columns,
        index=binary_results_df.index)
    df.to_csv(
        join(out_dir,
             'filtered_binary_classification_results.prec_{}.tsv'.format(
                 str(precision_thresh))),
        sep='\t')
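# Illustrative example of the filtering performed above (hypothetical values):
# suppose a cell's most-specific predicted, non-qualifier labels are
# {'CL:A': 0.62, 'CL:B': 0.35}. The cell is re-assigned only 'CL:A' together
# with all ancestors of 'CL:A' in the label graph, plus any predicted
# qualifier terms and their ancestors.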
def main():
    usage = "usage: %prog <experiment metadata file> <untampered exp list file for experiments that have data> <train-test set partition file>"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--train_out_file",
                      help="Training set output experiment list file")
    parser.add_option("-e", "--test_out_file",
                      help="Test set output experiment list file")
    (options, args) = parser.parse_args()
    exp_info_f = args[0]
    untampered_exps_w_data_list_f = args[1]
    train_test_partition_f = args[2]
    train_out_f = options.train_out_file
    test_out_f = options.test_out_file

    og = the_ontology.the_ontology()

    with open(untampered_exps_w_data_list_f, 'r') as f:
        include_experiments_data = json.load(f)
    with open(exp_info_f, 'r') as f:
        exp_to_info = json.load(f)
    with open(train_test_partition_f, 'r') as f:
        partition_data = json.load(f)
    train_studies = partition_data['train_set_studies']
    test_studies = partition_data['test_set_studies']

    include_experiments = set(include_experiments_data['experiments'])
    parent_exp_list_name = include_experiments_data['list_name']
    assert parent_exp_list_name == "all_untampered_bulk_primary_cells_with_data"

    exp_to_study = {
        exp: exp_to_info[exp]['study_accession']
        for exp in exp_to_info
    }
    study_to_exps = defaultdict(lambda: set())
    for exp in include_experiments:
        study = exp_to_study[exp]
        study_to_exps[study].add(exp)

    train_exps = set()
    for study in train_studies:
        train_exps.update(study_to_exps[study])
    test_exps = set()
    for study in test_studies:
        test_exps.update(study_to_exps[study])

    with open(train_out_f, 'w') as f:
        f.write(json.dumps(
            {
                "list_name": "training_set_experiments",
                "description": "These are a subset of the experiments in the experiment list '%s', cross-referenced with the training set partition." % (parent_exp_list_name),
                "experiments": list(train_exps)
            },
            indent=4))
    with open(test_out_f, 'w') as f:
        f.write(json.dumps(
            {
                "list_name": "test_set_experiments",
                "description": "These are a subset of the experiments in the experiment list '%s', cross-referenced with the test set partition." % (parent_exp_list_name),
                "experiments": list(test_exps)
            },
            indent=4))