def main(): usage = "usage: %prog <options> <| delimited results files>, <| delimited method names>" parser = OptionParser() parser.add_option("-o", "--out_file", help="Output file") (options, args) = parser.parse_args() # Parse the input metrics_f = args[0] metric_name = args[1] label_graph_f = args[2] out_f = options.out_file # Load the ontology og = the_ontology.the_ontology() # Load the labels' data print('Reading label graph from {}.'.format(label_graph_f)) with open(label_graph_f, 'r') as f: label_data = json.load(f) label_graph = DirectedAcyclicGraph(label_data['label_graph']) label_to_name = { x: og.id_to_term[x].name for x in label_graph.get_all_nodes() } print('\n'.join( set(label_to_name.values()) - set(LABEL_NAME_TO_SUCCINCT.keys()))) # Topologically sort the labels and assign them numbers topo_sort_labels = topological_sort(label_graph) label_to_topo_index = { label: index for index, label in enumerate(topo_sort_labels) } # Create text legend for graph #legend = '' #for label, topo_index in label_to_topo_index.items(): # legend += '{} {}'.format(topo_index, og.id_to_term[label].name) #with open(join(out_dir, 'graph_node_labels.txt'), 'w') as f: # f.write(legend) # Load the metrics metrics_df = pd.read_csv(metrics_f, sep='\t', index_col=0) # Create the output directory #_run_cmd("mkdir -p %s" % out_dir) label_to_metric = { label: metrics_df.loc[label][metric_name] for label in metrics_df.index if label in label_to_name } # F1-score drawn atop ontology draw_collapsed_ontology(label_graph, label_to_name, label_to_metric, metric_name, out_f)
def ontology_subgraph_spanning_terms(span_terms, og):
    """
    Builds the ontology subgraph spanning a set of terms.
    """
    # Get most general terms
    most_general_terms = ontology_graph.most_specific_terms(
        span_terms, og, sup_relations=["inv_is_a", "inv_part_of"])
    q = deque(most_general_terms)
    subgraph_source_to_targets = defaultdict(lambda: set())
    relations = ["inv_is_a", "inv_part_of"]
    #visited_ids = set(most_general_terms)
    while len(q) > 0:
        source_t_id = q.popleft()
        for rel in relations:
            if rel in og.id_to_term[source_t_id].relationships:
                for target_t_id in og.id_to_term[source_t_id].relationships[rel]:
                    target_descendants = set(
                        og.recursive_relationship(target_t_id, relations))
                    # There exists a descendant of the target represented in the samples
                    if len(target_descendants.intersection(span_terms)) > 0:
                        subgraph_source_to_targets[source_t_id].add(target_t_id)
                        q.append(target_t_id)
        #visited_ids.add(source_t_id)
    return DirectedAcyclicGraph(subgraph_source_to_targets)
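# A minimal, self-contained sketch of the spanning idea above on a plain
# dict-based DAG. It does not use the project's ontology_graph/og API; the
# toy terms and the helper below are purely illustrative.
def _toy_spanning_subgraph_example():
    from collections import deque, defaultdict

    child_edges = {'cell': {'immune cell', 'neuron'}, 'immune cell': {'T cell'}}
    span_terms = {'T cell'}
    roots = ['cell']

    def reachable(node):
        # All nodes reachable from `node` (including the node itself)
        seen, stack = {node}, [node]
        while stack:
            for child in child_edges.get(stack.pop(), []):
                if child not in seen:
                    seen.add(child)
                    stack.append(child)
        return seen

    kept = defaultdict(set)
    q = deque(roots)
    while q:
        source = q.popleft()
        for target in child_edges.get(source, []):
            # Keep the edge only if the target (or one of its descendants)
            # is among the spanned terms
            if reachable(target) & span_terms:
                kept[source].add(target)
                q.append(target)
    # kept == {'cell': {'immune cell'}, 'immune cell': {'T cell'}}
    return dict(kept)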
def _retrieve_label_graph():
    labels_f = pr.resource_filename(
        resource_package, join("resources", "training_set", "labels.json"))
    with open(labels_f, 'r') as f:
        labels_data = json.load(f)
    source_to_targets = labels_data['label_graph']
    exp_to_labels = labels_data['labels']
    label_graph = DirectedAcyclicGraph(source_to_targets)
    return label_graph
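# A minimal sketch of the labels.json layout that _retrieve_label_graph (and
# the loaders above) assume: a 'label_graph' adjacency map (parent term ->
# child terms) and a 'labels' map (experiment -> annotated terms). The IDs
# below are Cell Ontology-style examples used purely for illustration.
def _example_labels_json():
    import json
    example_labels_data = {
        "label_graph": {
            "CL:0000000": ["CL:0000542"],   # cell -> lymphocyte
            "CL:0000542": []
        },
        "labels": {
            "EXP000001": ["CL:0000000", "CL:0000542"]
        }
    }
    return json.dumps(example_labels_data, indent=4)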
def main():
    usage = "usage: %prog <options> <PR-curve file> <label graph file> <labeling file>"
    parser = OptionParser(usage=usage)
    parser.add_option("-o", "--out_file", help="File in which to write output")
    (options, args) = parser.parse_args()
    out_f = options.out_file
    pr_curve_f = args[0]
    label_graph_f = args[1]
    labeling_f = args[2]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load PR-curves
    with open(pr_curve_f, 'r') as f:
        label_to_pr_curve = json.load(f)

    # Compute labels on which we will compute metrics
    include_labels = set(label_to_pr_curve.keys()) - BLACKLIST_TERMS

    # Precision-recall curves overlaid on label-graph
    draw_collapsed_ontology_w_pr_curves(exp_to_labels, label_graph,
                                        label_to_name, label_to_pr_curve,
                                        out_f)
def main():
    usage = "usage: %prog <options> <method name> <results file> <label graph file> <labeling file>"
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-o",
        "--out_dir",
        help="Directory in which to write the output. If it doesn't exist, create the directory."
    )
    parser.add_option("-c",
                      "--conservative_mode",
                      action="store_true",
                      help="Compute conservative metrics")
    (options, args) = parser.parse_args()
    conservative_mode = options.conservative_mode
    out_dir = options.out_dir
    method_name = args[0]
    results_f = args[1]
    label_graph_f = args[2]
    labeling_f = args[3]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load the results
    results_df = pd.read_csv(results_f, sep='\t', index_col=0)

    # Create the output directory
    _run_cmd("mkdir -p %s" % out_dir)

    # Compute labels on which we will compute metrics
    include_labels = set(results_df.columns) - BLACKLIST_TERMS

    # Create the assignment matrix where rows are samples, columns
    # are labels, and element (i,j) = True if sample i is annotated
    # with label j
    assignment_df = cm._compute_assignment_matrix(results_df, exp_to_labels)
    assignment_df = assignment_df.loc[results_df.index][results_df.columns]

    precisions, recalls, threshs = cm.compute_joint_metrics(
        results_df,
        assignment_df,
        include_labels,
        label_graph=label_graph,
        label_to_name=label_to_name,
        og=og,
        conservative=conservative_mode)

    with open(join(out_dir, 'joint_pr_curve.json'), 'w') as f:
        json.dump(
            {
                'precisions': precisions,
                'recalls': recalls,
                'thresholds': threshs
            },
            f,
            indent=4)
def main():
    usage = "usage: %prog <options> <| delimited results files>, <| delimited method names>"
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-o",
        "--out_dir",
        help="Directory in which to write the output. If it doesn't exist, create the directory."
    )
    parser.add_option(
        "-f",
        "--config_file",
        action="store_true",
        help="Load plotting config from file rather than command line arguments"
    )
    parser.add_option("-c",
                      "--conservative_mode",
                      action="store_true",
                      help="Compute conservative metrics")
    (options, args) = parser.parse_args()
    conservative_mode = options.conservative_mode
    out_dir = options.out_dir

    # Parse the input
    if options.config_file:
        config_f = args[0]
        with open(config_f, 'r') as f:
            config = json.load(f)
            label_graph_f = config['label_graph_file']
            labeling_f = config['labeling_file']
            results_fs = config['results_files']
            method_names = config['method_names']
    else:
        method_names = args[0].split(',')
        results_fs = args[1].split(',')
        label_graph_f = args[2]
        labeling_f = args[3]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load the results
    all_results = []
    for results_f in results_fs:
        all_results.append(pd.read_csv(results_f, sep='\t', index_col=0))
    assert _comparable_results(all_results)

    # Create the output directory
    _run_cmd("mkdir -p %s" % out_dir)

    # Compute labels on which we will compute metrics
    include_labels = set(all_results[0].columns) - BLACKLIST_TERMS

    # Create the assignment matrix where rows are samples, columns
    # are labels, and element (i,j) = True if sample i is annotated
    # with label j
    assignment_df = cm._compute_assignment_matrix(all_results[0],
                                                  exp_to_labels)

    metrics_dfs = []
    label_to_pr_curves = []
    for results_df in all_results:
        results_df = results_df.loc[assignment_df.index][assignment_df.columns]
        metrics_df = cm.compute_label_centric_metrics_binary(
            results_df, assignment_df, include_labels)
        metrics_dfs.append(metrics_df)

    # F1-score barplots overlaid on label-graph
    #draw_collapsed_ontology_w_figures(
    #    exp_to_labels,
    #    label_graph,
    #    label_to_name,
    #    label_to_pr_curves,
    #    [
    #        {
    #            label: metric_df.loc[label]['Avg. Precision']
    #            for label in metric_df.index
    #        }
    #        for metric_df in metrics_dfs
    #    ],
    #    method_names,
    #    out_dir
    #)

    # F1-score box-plots
    gf.draw_boxplot(method_names, metrics_dfs, 'F1-Score',
                    join(out_dir, "f1_scores_boxplot"))
def main(): usage = "" # TODO parser = OptionParser(usage=usage) #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat") parser.add_option("-o", "--out_dir", help="Output directory") (options, args) = parser.parse_args() binary_results_f = args[0] results_f = args[1] label_graph_f = args[2] decision_boundary_f = args[3] precision_thresh = float(args[4]) out_dir = options.out_dir binary_results_df = pd.read_csv(binary_results_f, sep='\t', index_col=0) results_df = pd.read_csv(results_f, sep='\t', index_col=0) decision_df = pd.read_csv(decision_boundary_f, sep='\t', index_col=0) # Load the ontology og = the_ontology.the_ontology() # Load the label graph with open(label_graph_f, 'r') as f: label_data = json.load(f) label_graph = DirectedAcyclicGraph(label_data['label_graph']) label_to_name = { x: og.id_to_term[x].name for x in label_graph.get_all_nodes() } label_to_f1 = { label: decision_df.loc[label]['F1-score'] for label in decision_df.index } label_to_prec = { label: decision_df.loc[label]['precision'] for label in decision_df.index } label_to_thresh = { label: decision_df.loc[label]['empirical_threshold'] for label in decision_df.index } # Map each label to its ancestors label_to_ancestors = { label: label_graph.ancestor_nodes(label) for label in label_graph.get_all_nodes() } # Filter labels according to empiracle precision hard_labels = set([ label for label, prec in label_to_prec.items() if prec < precision_thresh ]) # Map each experiment to its predicted terms print('Mapping each sample to its predicted labels...') consider_labels = set(binary_results_df.columns) - hard_labels exp_to_pred_labels = { exp: [ label for label in consider_labels if binary_results_df.loc[exp][label] == 1 ] for exp in binary_results_df.index } print('Computing the most-specific predicted labels...') exp_to_ms_pred_labels = { exp: label_graph.most_specific_nodes(set(pred_labels) - QUALIFIER_TERMS) for exp, pred_labels in exp_to_pred_labels.items() } # Select cells with highest probability exp_to_select_pred_label = { exp: max([(label, results_df.loc[exp][label]) for label in ms_pred_labels], key=lambda x: x[1])[0] for exp, ms_pred_labels in exp_to_ms_pred_labels.items() if len(ms_pred_labels) > 0 } exp_to_update_pred = {} for exp, select_label in exp_to_select_pred_label.items(): print('{}: {}'.format(exp, og.id_to_term[select_label].name)) all_labels = label_to_ancestors[select_label] exp_to_update_pred[exp] = all_labels # Add qualifier cell types for exp in exp_to_update_pred: for qual_label in QUALIFIER_TERMS: if qual_label in exp_to_pred_labels[exp]: all_labels = label_to_ancestors[qual_label] exp_to_update_pred[exp].update(all_labels) # Create dataframe with filtered results da = [] for exp in binary_results_df.index: row = [] for label in binary_results_df.columns: if label in exp_to_update_pred[exp]: row.append(1) else: row.append(0) da.append(row) df = pd.DataFrame(data=da, columns=binary_results_df.columns, index=binary_results_df.index) df.to_csv(join( out_dir, 'filtered_binary_classification_results.prec_{}.tsv'.format( str(precision_thresh))), sep='\t')
def main():
    usage = "usage: %prog <options> <| delimited results files>, <| delimited method names>"
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-o",
        "--out_dir",
        help="Directory in which to write the output. If it doesn't exist, create the directory."
    )
    parser.add_option(
        "-f",
        "--config_file",
        action="store_true",
        help="Load plotting config from file rather than command line arguments"
    )
    parser.add_option("-c",
                      "--conservative_mode",
                      action="store_true",
                      help="Compute conservative metrics")
    (options, args) = parser.parse_args()
    conservative_mode = options.conservative_mode
    out_dir = options.out_dir

    # Parse the input
    if options.config_file:
        config_f = args[0]
        with open(config_f, 'r') as f:
            config = json.load(f)
            label_graph_f = config['label_graph_file']
            labeling_f = config['labeling_file']
            results_fs = config['results_files']
            method_names = config['method_names']
    else:
        method_names = args[0].split(',')
        results_fs = args[1].split(',')
        label_graph_f = args[2]
        labeling_f = args[3]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load the results
    all_results = []
    for results_f in results_fs:
        all_results.append(pd.read_csv(results_f, sep='\t', index_col=0))
    assert _comparable_results(all_results)

    # Create the output directory
    _run_cmd("mkdir -p %s" % out_dir)

    # Compute labels on which we will compute metrics
    include_labels = set(all_results[0].columns) - BLACKLIST_TERMS

    # Create the assignment matrix where rows are samples, columns
    # are labels, and element (i,j) = True if sample i is annotated
    # with label j
    assignment_df = cm._compute_assignment_matrix(all_results[0],
                                                  exp_to_labels)

    metrics_dfs = []
    label_to_pr_curves = []
    for results_df in all_results:
        results_df = results_df.loc[assignment_df.index][assignment_df.columns]
        if conservative_mode:
            metrics_df, label_to_pr_curve = cm.compute_label_centric_metrics(
                results_df,
                assignment_df,
                include_labels,
                label_graph=label_graph,
                label_to_name=label_to_name,
                og=og,
                conservative=True)
        else:
            metrics_df, label_to_pr_curve = cm.compute_label_centric_metrics(
                results_df, assignment_df, include_labels, conservative=False)
        metrics_dfs.append(metrics_df)
        label_to_pr_curves.append(label_to_pr_curve)

    # Write precision-recall curves to file
    with open(join(out_dir, 'pr_curves.json'), 'w') as f:
        json.dump(
            {
                method_name: {
                    label: {
                        'precisions': pr[0],
                        'recalls': pr[1],
                        'thresholds': pr[2]
                    }
                    for label, pr in label_to_pr_curve.items()
                }
                for method_name, label_to_pr_curve in zip(
                    method_names, label_to_pr_curves)
            },
            f,
            indent=4)

    # Precision-recall curves overlaid on label-graph
    draw_collapsed_ontology_w_pr_curves(
        exp_to_labels, label_graph, label_to_name, label_to_pr_curves,
        [{
            label: metric_df.loc[label]['Avg. Precision']
            for label in metric_df.index
        } for metric_df in metrics_dfs], method_names, out_dir)

    # Average precision box-plots
    draw_boxplot(method_names, metrics_dfs, 'Avg. Precision',
                 join(out_dir, "avg_prec_boxplot"))

    # Achievable recall at 0.9 precision box-plots
    draw_boxplot(method_names, metrics_dfs,
                 'Achievable Recall at 0.9 Precision',
                 join(out_dir, "achievable_recall_boxplot"))

    # Average precision comparison heatmap
    draw_comparison_heatmap(
        method_names, {
            method: metrics_df
            for method, metrics_df in zip(method_names, metrics_dfs)
        }, 'Avg. Precision', metrics_dfs[0].index,
        join(out_dir, 'win_diff_avg_prec_heatmap'))

    # Achievable recall at 0.9 precision comparison heatmap
    draw_comparison_heatmap(
        method_names, {
            method: metrics_df
            for method, metrics_df in zip(method_names, metrics_dfs)
        }, 'Achievable Recall at 0.9 Precision', metrics_dfs[0].index,
        join(out_dir, 'win_diff_achievable_recall_heatmap'))
def main():
    usage = "usage: %prog <options> <method name> <results file> <label graph file> <labeling file>"
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-o",
        "--out_dir",
        help="Directory in which to write the output. If it doesn't exist, create the directory."
    )
    parser.add_option(
        "-f",
        "--config_file",
        help="JSON file with all inputs required to run this analysis"
    )
    parser.add_option(
        "-t",
        "--thresholds",
        help="A TSV file (with a 'threshold' column) mapping each label to a decision threshold"
    )
    parser.add_option(
        "-v",
        "--threshold_val",
        help="A number denoting the threshold to use for all cell types"
    )
    parser.add_option(
        "-c",
        "--conservative_mode",
        action="store_true",
        help="Compute conservative metrics"
    )
    (options, args) = parser.parse_args()
    conservative_mode = options.conservative_mode
    if options.threshold_val:
        label_to_thresh = defaultdict(lambda: float(options.threshold_val))
    elif options.thresholds:
        label_to_thresh_df = pd.read_csv(options.thresholds, sep='\t', index_col=0)
        label_to_thresh = {
            label: label_to_thresh_df.loc[label]['threshold']
            for label in label_to_thresh_df.index
        }
    out_dir = options.out_dir

    # Parse the input
    if options.config_file:
        config_f = options.config_file
        with open(config_f, 'r') as f:
            config = json.load(f)
            label_graph_f = config['label_graph_file']
            labeling_f = config['labeling_file']
            # A single results file is expected downstream
            results_f = config['results_files']
            method_name = config['method_name']
    else:
        method_name = args[0]
        results_f = args[1]
        label_graph_f = args[2]
        labeling_f = args[3]

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load the labels' data
    with open(label_graph_f, 'r') as f:
        label_data = json.load(f)
    label_graph = DirectedAcyclicGraph(label_data['label_graph'])
    label_to_name = {
        x: og.id_to_term[x].name
        for x in label_graph.get_all_nodes()
    }

    # Load the labellings
    with open(labeling_f, 'r') as f:
        labelling = json.load(f)
    exp_to_labels = labelling['labels']

    # Load the results
    bin_results_df = pd.read_csv(results_f, sep='\t', index_col=0)

    # Create the output directory
    _run_cmd("mkdir -p %s" % out_dir)

    # Compute labels on which we will compute metrics
    include_labels = set(bin_results_df.columns) - BLACKLIST_TERMS

    # Create the assignment matrix where rows are samples, columns
    # are labels, and element (i,j) = True if sample i is annotated
    # with label j
    assignment_df = cm._compute_assignment_matrix(
        bin_results_df, exp_to_labels
    )

    #bin_results_da = {}
    #for label in results_df.columns:
    #    if options.thresholds and label not in label_to_thresh:
    #        continue
    #    confs = results_df[label]
    #    bins = [
    #        (x > label_to_thresh[label])
    #        for x in confs
    #    ]
    #    bin_results_da[label] = bins
    #bin_results_df = pd.DataFrame(
    #    data=bin_results_da,
    #    index=results_df.index
    #)

    assignment_df = assignment_df.loc[bin_results_df.index][bin_results_df.columns]
    metrics_df = cm.compute_label_centric_metrics_binary(
        bin_results_df,
        assignment_df,
        include_labels,
        label_graph=label_graph,
        label_to_name=label_to_name,
        og=og,
        conservative=conservative_mode
    )
    metrics_df.to_csv(join(out_dir, 'binary_cell_type_metrics.tsv'), sep='\t')

    label_to_f1 = {
        label: metrics_df.loc[label]['F1-Score']
        for label in metrics_df.index
    }
    print(label_to_f1)

    # F1-score drawn atop ontology
    draw_collapsed_ontology(
        label_graph,
        label_to_name,
        label_to_f1,
        'F1-Score',
        out_dir
    )
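# A hedged sketch of how per-label thresholds could turn a probability matrix
# into the binary matrix this script consumes (it mirrors the commented-out
# block above; the labels, experiments, and thresholds are toy values).
def _toy_binarize_example():
    import pandas as pd
    probs_df = pd.DataFrame(
        {'CL:0000542': [0.9, 0.2], 'CL:0000236': [0.4, 0.7]},
        index=['EXP1', 'EXP2'])
    label_to_thresh = {'CL:0000542': 0.5, 'CL:0000236': 0.5}
    bin_df = pd.DataFrame(
        {
            label: (probs_df[label] > label_to_thresh[label]).astype(int)
            for label in probs_df.columns
        },
        index=probs_df.index)
    # bin_df:
    #       CL:0000542  CL:0000236
    # EXP1           1           0
    # EXP2           0           1
    return bin_df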
def main(): usage = "" # TODO parser = OptionParser(usage=usage) #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat") parser.add_option("-o", "--out_file", help="Output file") (options, args) = parser.parse_args() single_cell_exp_list_f = args[0] bulk_exp_list_f = args[1] single_cell_label_graph_f = args[2] bulk_label_graph_f = args[3] out_f = options.out_file og = the_ontology() with open(single_cell_exp_list_f, 'r') as f: sc_experiments_data = json.load(f) with open(bulk_exp_list_f, 'r') as f: bulk_experiments_data = json.load(f) single_cell_exps = set(sc_experiments_data['experiments']) single_cell_exp_set_name = sc_experiments_data['list_name'] bulk_exps = set(bulk_experiments_data['experiments']) bulk_exp_set_name = bulk_experiments_data['list_name'] assert single_cell_exp_set_name == "untampered_single_cell_primary_cells_with_data" assert bulk_exp_set_name == "untampered_bulk_primary_cells_with_data" with open(single_cell_label_graph_f, 'r') as f: labels_data = json.load(f) sc_label_graph = labels_data['label_graph'] sc_exp_to_labels = labels_data['labels'] sc_exp_to_labels = { k: set(v) - set(BLACKLIST) for k, v in sc_exp_to_labels.items() } with open(bulk_label_graph_f, 'r') as f: labels_data = json.load(f) bulk_label_graph = labels_data['label_graph'] bulk_exp_to_labels = labels_data['labels'] bulk_exp_to_labels = { k: set(v) - set(BLACKLIST) for k, v in bulk_exp_to_labels.items() } # The idea here is that we only want single-cell samples # for which ~all~ of its most-specific labels are a subset # of one bulk-sample's label-set. Here we collect all of the # unique bulk label-sets. # # For example, given a sample labeled as {embryonic cell, # neural cell}, but in the bulk data we only have samples # labelled as {embryonic cell} and {neural cell}. We would # discard this cell. bulk_label_sets = set() for labels in bulk_exp_to_labels.values(): bulk_label_sets.add(frozenset(labels)) label_sets_not_in_bulk = set() removed_exps = set() include_exps = set() g = DirectedAcyclicGraph(sc_label_graph) for exp, labels in sc_exp_to_labels.items(): ms_labels = set(g.most_specific_nodes(labels)) ms_labels -= set(IGNORE) # Go through the bulk label-sets and check if the current # sample's set of most-specific labels is a subset of any # of them. If so, keep it. If not, we discard it. found = False for label_set in bulk_label_sets: if set(ms_labels) <= label_set: include_exps.add(exp) found = True break if not found: label_sets_not_in_bulk.add(frozenset(ms_labels)) removed_exps.add(exp) print("{} single-cell experiments were removed".format(len(removed_exps))) print("Labels that were removed:") print( json.dumps([[og.id_to_term[x].name for x in label_set] for label_set in label_sets_not_in_bulk], indent=True)) with open(out_f, 'w') as f: f.write( json.dumps( { "list_name": "untampered_single_cell_primary_cells_with_data_cell_types_in_bulk", "description": "These are all experiments that are in the experiment list '%s' and also share the same set of most-specific labels with at least one experiment in %s" % (single_cell_exp_set_name, bulk_exp_set_name), "experiments": list(include_exps) }, indent=4))
def load_sparse_dataset(data_dir, features):
    # NOTE: `data_dir` and `features` were undefined in this scope; they are
    # assumed here to be the dataset directory and the name of the expression
    # feature set (the prefix of the HDF5 file), and are taken as parameters.
    labels_f = join(data_dir, 'labels.json')
    studys_f = join(data_dir, 'experiment_to_study.json')
    tags_f = join(data_dir, 'experiment_to_tags.json')
    expr_matrix_f = join(data_dir, '{}.h5'.format(features))

    # Load the ontology
    og = the_ontology.the_ontology()

    # Load labels and labels-graph
    with open(labels_f, 'r') as f:
        labels_data = json.load(f)
    source_to_targets = labels_data['label_graph']
    exp_to_labels = labels_data['labels']
    label_graph = DirectedAcyclicGraph(source_to_targets)

    # Map each ontology term label to its human-readable name
    label_to_name = {
        label: og.id_to_term[label].name
        for label in source_to_targets.keys()
    }

    # Map each experiment to its most-specific labels
    exp_to_ms_labels = {
        exp: label_graph.most_specific_nodes(labels)
        for exp, labels in exp_to_labels.items()
    }

    # Load study metadata
    with open(studys_f, 'r') as f:
        exp_to_study = json.load(f)
    study_to_exps = defaultdict(lambda: set())
    for exp, study in exp_to_study.items():
        study_to_exps[study].add(exp)
    study_to_exps = dict(study_to_exps)

    # Load technical tags
    with open(tags_f, 'r') as f:
        exp_to_tags = json.load(f)

    # Load the data matrix
    print('Loading expression data from {}...'.format(expr_matrix_f))
    with h5py.File(expr_matrix_f, 'r') as f:
        the_exps = [str(x) for x in f['experiment'][:]]
        gene_ids = [str(x) for x in f['gene_id'][:]]
        data_matrix = f['expression'][:]
    print('Loaded matrix of shape {}.'.format(data_matrix.shape))
    print('done.')

    # Map each experiment to its index
    exp_to_index = {exp: ind for ind, exp in enumerate(the_exps)}

    return (
        og,
        label_graph,
        label_to_name,
        the_exps,
        exp_to_index,
        exp_to_labels,
        exp_to_tags,
        exp_to_study,
        study_to_exps,
        exp_to_ms_labels,
        data_matrix,
        gene_ids
    )
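# Hypothetical usage of the loader above. The `data_dir`/`features` parameters
# were introduced here as an assumption, and the path and feature name below
# are placeholders rather than the pipeline's real inputs.
def _example_load_sparse_dataset_usage():
    (og, label_graph, label_to_name, the_exps, exp_to_index, exp_to_labels,
     exp_to_tags, exp_to_study, study_to_exps, exp_to_ms_labels, data_matrix,
     gene_ids) = load_sparse_dataset('path/to/dataset_dir', 'log_tpm')
    print('{} experiments x {} genes'.format(len(the_exps), len(gene_ids)))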
def main(): usage = "usage: %prog <options> <environment dir> <experiment list name> <cross-validation config name>" parser = OptionParser(usage=usage) #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat") parser.add_option("-o", "--out_dir", help="Directory in which to write the output") (options, args) = parser.parse_args() results_f = args[0] label_graph_f = args[1] prefix = args[2] out_dir = options.out_dir # Load the results confidence_df = pd.read_csv(results_f, sep='\t', index_col=0) # Map each label to its name og = the_ontology.the_ontology() label_to_name = { label: og.id_to_term[label].name for label in confidence_df.columns } _run_cmd("mkdir -p %s" % out_dir) # Load the label-graph with open(label_graph_f, 'r') as f: labels_data = json.load(f) label_graph = DirectedAcyclicGraph(labels_data['label_graph']) # Compute the labels for which we will compute metrics over. # This label set is simply the set of labels for which we have # predictions for every sample include_labels = set(confidence_df.columns) - BLACKLIST_TERMS total_n_incons = 0 total_n_very_incons = 0 total_edges = 0 incons_to_count = defaultdict(lambda: 0) very_incons_to_count = defaultdict(lambda: 0) exp_child_parent_incons = [] for exp in confidence_df.index: exp_n_incons = 0 for parent_label in confidence_df.columns: parent_conf = confidence_df.loc[exp][parent_label] if parent_label in BLACKLIST_TERMS: continue for child_label in label_graph.source_to_targets[parent_label]: if child_label in BLACKLIST_TERMS: continue if child_label not in confidence_df.columns: continue child_conf = confidence_df.loc[exp][child_label] # Don't consider parent-child edges where prediction-scores # for both nodes is less than 1% if child_conf < 0.01 and parent_conf < 0.01: continue # We count the edge as inconsistent if BOTH the child's score is # greater than its parents and ALSO that difference is non-negligeble if abs(child_conf - parent_conf) > EPSILON and child_conf > parent_conf: exp_child_parent_incons.append( (exp, child_label, parent_label, (child_conf - parent_conf))) incons_to_count[(parent_label, child_label)] += 1 total_n_incons += 1 exp_n_incons += 1 if child_conf - parent_conf > VERY_INCONS_THRESH: total_n_very_incons += 1 very_incons_to_count[(parent_label, child_label)] += 1 total_edges += 1 total_fraction_inconsistent = total_n_incons / float(total_edges) total_fraction_very_inconsistent = total_n_very_incons / float(total_edges) print("Inconsistent edges:") for incons, count in sorted([(k, v) for k, v in incons_to_count.items()], key=lambda x: x[1]): parent = incons[0] child = incons[1] print("%s -> %s : %d" % (label_to_name[parent], label_to_name[child], count)) print("Very inconsistent edges:") for incons, count in sorted([(k, v) for k, v in very_incons_to_count.items()], key=lambda x: x[1]): parent = incons[0] child = incons[1] print("%s -> %s : %d" % (label_to_name[parent], label_to_name[child], count)) summary_df = pd.DataFrame( data=[[total_n_incons, total_edges, total_fraction_inconsistent], [ total_n_very_incons, total_edges, total_fraction_very_inconsistent ], [ total_n_very_incons, len(confidence_df.index), (float(total_n_very_incons) / len(confidence_df.index)) ]], columns=["No. enconsistent", "Total edges", "Fraction of total edges"], index=[ "Total edges inconsistent", "Total edges inconsistent >%f" % VERY_INCONS_THRESH, "Avg. 
very inconsistent per sample" ]) summary_df.to_csv(join(out_dir, '{}.inconsistent_edges_stats.tsv'.format(prefix)), sep='\t') exp_child_parent_incons = sorted(exp_child_parent_incons, key=lambda x: x[3]) inconss = [] n_less_eq = [] l_less_than_1 = 0 n_great_than_1 = 0 for i, exp_child_parent_icons in enumerate(exp_child_parent_incons): incons = exp_child_parent_icons[3] inconss.append(incons) n_less_eq.append(float(i) / len(exp_child_parent_incons)) fig, axarr = plt.subplots(1, 1, figsize=(3.0, 3.0), squeeze=False) axarr[0][0].plot(inconss, n_less_eq, color=vl.NICE_COLORS[1], lw=4) axarr[0][0].set_xlabel('Child prob. - Parent prob.') axarr[0][0].set_ylabel('Cumulative probability') axarr[0][0].set_xlim((0.0, 1.0)) axarr[0][0].set_ylim((0.0, 1.0)) out_f = join(out_dir, "{}.CDF_inconsistences".format(prefix)) fig.savefig("%s.eps" % out_f, format='eps', bbox_inches='tight', dpi=100, transparent=True) fig.savefig("%s.pdf" % out_f, format='pdf', bbox_inches='tight', dpi=100, transparent=True)
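# A toy illustration (made-up numbers) of the inconsistency rule used above: an
# edge parent -> child is counted as inconsistent when the child's probability
# exceeds the parent's by more than EPSILON (EPSILON_TOY stands in for the
# script's constant).
def _toy_inconsistency_example():
    EPSILON_TOY = 0.001
    toy_edges = [
        # (parent probability, child probability)
        (0.90, 0.95),    # inconsistent: child exceeds parent by 0.05
        (0.80, 0.60),    # consistent
        (0.50, 0.5005),  # child > parent, but within EPSILON_TOY -> not counted
    ]
    n_incons = sum(
        1 for parent_p, child_p in toy_edges
        if child_p > parent_p and abs(child_p - parent_p) > EPSILON_TOY)
    # n_incons == 1, out of 3 edges
    return n_incons, len(toy_edges)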