def add_taxonomy_metadata(non_monophyletic_taxa):
    """Attach a ``name`` and ``rank`` to every broken-taxon record.

    The records live under ``non_monophyletic_taxa['non_monophyletic_taxa']``
    and are keyed by strings from which the literal ``ott`` is stripped to
    obtain an integer id (e.g. ``'ott123'`` -> ``123``).  Names and ranks are
    looked up in a freshly constructed ``ott.OTT()``; ids that are missing
    from a lookup table get ``"no name"`` / ``"no rank"``.  When the looked-up
    name is a tuple, only its first element is kept.

    Each ``(key, name, rank)`` triple is echoed to stdout.  The records are
    modified in place and the same top-level mapping is returned.
    """
    records = non_monophyletic_taxa['non_monophyletic_taxa']
    taxonomy = ott.OTT()
    names_by_id = taxonomy.ott_id_to_names
    ranks_by_id = taxonomy.ott_id_to_ranks
    ott_marker = re.compile(r'ott')

    for ott_key in records:
        numeric_id = int(ott_marker.sub('', ott_key))

        taxon_name = (names_by_id[numeric_id]
                      if numeric_id in names_by_id else "no name")
        if isinstance(taxon_name, tuple):
            # Some entries carry several names; the first one is used.
            taxon_name = taxon_name[0]
        taxon_rank = (ranks_by_id[numeric_id]
                      if numeric_id in ranks_by_id else "no rank")

        print(ott_key, taxon_name, taxon_rank)
        record = records[ott_key]
        record['name'] = taxon_name
        record['rank'] = taxon_rank

    non_monophyletic_taxa['non_monophyletic_taxa'] = records
    return non_monophyletic_taxa
def newly_broken_taxa_report(run1, run2):
    """Report the taxa broken in ``run2`` that were not broken in ``run1``.

    Steps, in order:

    1. Build ``ott.OTT()`` (progress goes to stderr) and collapse every
       tuple-valued entry of its id -> name table to its first element,
       in place.
    2. Write one ``id,name,rank`` line per newly broken taxon to
       ``broken_taxa_report.csv`` in the current directory.
    3. From ``run2.get_taxon_conflict_info()``, tally per input tree (overall
       and per rank) which taxa the tree conflicts with, aligns to
       (``supported_by`` / ``partial_path_of``) or resolves
       (``resolved_by``), plus which conflicts are newly broken.
    4. Print two listings to stdout: first the trees that broke at least one
       new taxon, ordered by how many new taxa they broke; then the remaining
       trees that broke any taxon, ordered by how many taxa they broke.
       Names of the affected taxa are listed only for ranks that sort below
       ``"genus"`` in ``rank_of_rank``.
    """
    print("\nAnalyzing broken taxa:", file=sys.stderr)
    print(" * Loading OTT ... ", end='', flush=True, file=sys.stderr)
    taxonomy = ott.OTT()
    print("done. (Using version {})".format(taxonomy.version),
          flush=True, file=sys.stderr)

    # Keep a single name per id; tuple entries are reduced to their head.
    id2names = taxonomy.ott_id_to_names
    for ott_key, label in id2names.items():
        if isinstance(label, tuple):
            id2names[ott_key] = label[0]
    id2ranks = rank_unranked_nodes(taxonomy)

    # Names broken in run2 but not in run1 are the "newly broken" ones.
    broken_before = set(run1.broken_taxa)
    broken_after = set(run2.broken_taxa)
    diff = broken_after - broken_before
    report_path = 'broken_taxa_report.csv'
    print(" * Printing details of {x} broken taxa to {f}".format(
        x=len(diff), f=report_path))
    conflict_status2 = run2.get_taxon_conflict_info()

    with open(report_path, 'w') as report:
        for ott_label in diff:
            numeric_id = get_id_from_ottnum(ott_label)
            taxon_name = (id2names[numeric_id]
                          if numeric_id in id2names else "no name")
            taxon_rank = (id2ranks[numeric_id]
                          if numeric_id in id2ranks else "no rank")
            report.write("{i},{n},{r}\n".format(
                i=numeric_id, n=taxon_name, r=taxon_rank))

    # For every tree we collect, overall and per rank, the taxa it
    # conflicts with / aligns to / resolves, and which of the conflicts are
    # new in run2.
    victims = defaultdict(set)
    new_victims = defaultdict(set)
    tree_conflict = defaultdict(lambda: defaultdict(set))
    tree_conflict_at_rank = defaultdict(
        lambda: defaultdict(lambda: defaultdict(set)))

    bucket_for_relation = {
        "conflicts_with": "conflicts_with",
        "supported_by": "aligns_to",
        "partial_path_of": "aligns_to",
        "resolved_by": "resolves",
    }

    for ott_node, node_conflict in conflict_status2.items():
        numeric_id = get_id_from_ottnum(ott_node)
        node_rank = id2ranks[numeric_id]
        for relation, tree_nodes in node_conflict.items():
            bucket = bucket_for_relation.get(relation)
            for tree, _nodes in tree_nodes.items():
                if bucket is None:
                    # Relations outside the table are not tallied.
                    continue
                overall = tree_conflict[tree]
                at_rank = tree_conflict_at_rank[tree][node_rank]
                overall[bucket].add(numeric_id)
                at_rank[bucket].add(numeric_id)
                if bucket != "conflicts_with":
                    continue
                victims[tree].add(numeric_id)
                if ott_node in diff:
                    new_victims[tree].add(numeric_id)
                    overall["newly_broken"].add(numeric_id)
                    at_rank["newly_broken"].add(numeric_id)

    def tree_label(tree):
        # Highlight trees that conflict with more taxa than they align to.
        overall = tree_conflict[tree]
        if len(overall["conflicts_with"]) > len(overall["aligns_to"]):
            return bold(red(tree))
        return tree

    def painted(count, paint):
        # Zero counts stay plain; positive counts get coloured.
        return paint(count) if count > 0 else count

    def example_names(rank, taxon_ids):
        # Names are only listed for ranks that sort below "genus".
        if rank_of_rank[rank] < rank_of_rank["genus"]:
            names = {id2names[taxon_id] for taxon_id in taxon_ids}
            if len(names) > 0:
                return '{}'.format(names)
        return ''

    def ranks_in_order(tree):
        return sorted(tree_conflict_at_rank[tree],
                      key=lambda r: rank_of_rank[r])

    # FIXME: write out number of duplicate trees per study
    # NEW: show aligns_to last
    print(
        "Here are the {} trees that broke NEW taxa, starting with the most newly-broken taxa:\n"
        .format(len(new_victims)))
    print("\n\n{}: ({}) {} / {} / {} ".format(
        "tree", bold(yellow("newly_broken")), yellow("conflicts_with"),
        cyan("aligns_to"), green("resolves")))

    trees_by_new_damage = sorted(
        new_victims, key=lambda t: len(new_victims.get(t)), reverse=True)
    for tree in trees_by_new_damage:
        overall = tree_conflict[tree]
        print("\n\n{}: ({}) {} / {} / {} ".format(
            tree_label(tree),
            bold(yellow(len(overall["newly_broken"]))),
            yellow(len(overall["conflicts_with"])),
            cyan(len(overall["aligns_to"])),
            green(len(overall["resolves"]))))

        for rank in ranks_in_order(tree):
            at_rank = tree_conflict_at_rank[tree][rank]
            examples = example_names(rank, at_rank["newly_broken"])

            newly_broken_count = len(at_rank["newly_broken"])
            if newly_broken_count > 0:
                shown_newly_broken = bold(yellow(newly_broken_count))
                shown_rank = get_color_rank(rank)
            else:
                shown_newly_broken = '0'
                shown_rank = rank

            print(" {}: ({}) {} / {} / {} {}".format(
                shown_rank,
                shown_newly_broken,
                painted(len(at_rank["conflicts_with"]), yellow),
                painted(len(at_rank["aligns_to"]), cyan),
                painted(len(at_rank["resolves"]), green),
                examples))

    print(
        "\n\n\nHere are the other {} trees that broke taxa, starting with the most newly-broken taxa:\n"
        .format(len(victims) - len(new_victims)))

    trees_by_damage = sorted(
        victims, key=lambda t: len(victims.get(t)), reverse=True)
    for tree in trees_by_damage:
        if tree in new_victims:
            # Already covered by the first listing.
            continue
        overall = tree_conflict[tree]
        print("\n\n{}: {} / {} / {} ".format(
            tree_label(tree),
            yellow(len(overall["conflicts_with"])),
            cyan(len(overall["aligns_to"])),
            green(len(overall["resolves"]))))

        for rank in ranks_in_order(tree):
            at_rank = tree_conflict_at_rank[tree][rank]
            examples = example_names(rank, at_rank["conflicts_with"])

            conflict_count = len(at_rank["conflicts_with"])
            if conflict_count > 0:
                shown_conflicts = yellow(conflict_count)
                shown_rank = get_color_rank(rank)
            else:
                shown_conflicts = conflict_count
                shown_rank = rank

            print(" {}: {} / {} / {} {}".format(
                shown_rank,
                shown_conflicts,
                painted(len(at_rank["aligns_to"]), cyan),
                painted(len(at_rank["resolves"]), green),
                examples))
def newly_broken_taxa_report(run1, run2):
    """Compare taxon conflicts of ``run2``'s trees against all of ``run1``.

    Steps, in order:

    1. Build ``ott.OTT()`` (progress goes to stdout) and collapse every
       tuple-valued entry of its id -> name table to its first element,
       in place.
    2. Write one ``id,name,rank`` line to ``broken_taxa_report.csv`` for each
       taxon that is in ``run2.broken_taxa`` but not in ``run1.broken_taxa``.
    3. Summarise conflict info per tree for both runs with
       ``get_conflict_info_by_tree`` and merge run1's trees with
       ``union_over_trees``.
    4. Print every run2 tree, ordered by (number of taxa it conflicts with
       that run1 did not, total number of taxa it conflicts with), largest
       first, followed by a per-rank breakdown.  Names of the new conflicts
       are listed only for ranks that sort below ``"genus"`` in
       ``rank_of_rank``.
    5. Print every study that contributes more than one input tree to run2;
       input-tree identifiers are split on ``'@'`` into (study, tree).
    """
    print("\nAnalyzing broken taxa:")
    print(" * Loading OTT ... ", end='', flush=True)
    taxonomy = ott.OTT()
    print("done. (Using version {})".format(taxonomy.version), flush=True)

    # Keep a single name per id; tuple entries are reduced to their head.
    id2names = taxonomy.ott_id_to_names
    for ott_key, label in id2names.items():
        if isinstance(label, tuple):
            id2names[ott_key] = label[0]
    id2ranks = rank_unranked_nodes(taxonomy)

    # Names broken in run2 but not in run1 are the "newly broken" ones.
    broken_before = set(run1.broken_taxa)
    broken_after = set(run2.broken_taxa)
    diff = broken_after - broken_before
    report_path = 'broken_taxa_report.csv'
    print(" * Printing details of {x} broken taxa to {f}".format(
        x=len(diff), f=report_path))

    conflict_status1 = run1.get_taxon_conflict_info()
    conflict_status2 = run2.get_taxon_conflict_info()

    with open(report_path, 'w') as report:
        for ott_label in diff:
            numeric_id = get_id_from_ottnum(ott_label)
            taxon_name = (id2names[numeric_id]
                          if numeric_id in id2names else "no name")
            taxon_rank = (id2ranks[numeric_id]
                          if numeric_id in id2ranks else "no rank")
            report.write("{i},{n},{r}\n".format(
                i=numeric_id, n=taxon_name, r=taxon_rank))

    # Per-tree summaries for both runs; run1 is additionally merged across
    # all of its trees so that each run2 tree is compared with run1 as a
    # whole.
    tree_conflict1, tree_conflict_at_rank1 = get_conflict_info_by_tree(
        conflict_status1, id2ranks)
    conflict1, conflict_at_rank1 = union_over_trees(
        tree_conflict1, tree_conflict_at_rank1)
    tree_conflict2, tree_conflict_at_rank2 = get_conflict_info_by_tree(
        conflict_status2, id2ranks)

    # FIXME: write out number of duplicate trees per study
    # NEW: show aligns_to last
    print("Here are the trees in order of NEW broken taxa, then all broken taxa:\n")
    print("\n\n{}: ({}) {} / ({}) {} / ({}) {} ".format(
        "tree",
        bold(yellow("change")), yellow("conflicts_with"),
        bold(cyan("change")), cyan("aligns_to"),
        bold(green("change")), green("resolves")))

    def damage_key(tree_name):
        # Primary: conflicts not present anywhere in run1; secondary: all
        # conflicts of this tree.
        conflicts = tree_conflict2[tree_name]["conflicts_with"]
        return (len(conflicts - conflict1["conflicts_with"]), len(conflicts))

    for tree in sorted(tree_conflict2, key=damage_key, reverse=True):
        overall = tree_conflict2[tree]
        if len(overall["conflicts_with"]) > len(overall["aligns_to"]):
            # Highlight trees that conflict with more taxa than they align to.
            shown_tree = bold(red(tree))
        else:
            shown_tree = tree
        print("\n\n{}: {}".format(
            shown_tree, conflict_summary_line(conflict1, overall)))

        ranks_in_order = sorted(tree_conflict_at_rank2[tree],
                                key=lambda r: rank_of_rank[r])
        for rank in ranks_in_order:
            c1 = conflict_at_rank1[rank]
            c2 = tree_conflict_at_rank2[tree][rank]
            newly_broken_ids = c2["conflicts_with"] - c1["conflicts_with"]

            examples = ''
            if rank_of_rank[rank] < rank_of_rank["genus"]:
                names = {id2names[taxon_id] for taxon_id in newly_broken_ids}
                if len(names) > 0:
                    examples = '{}'.format(names)

            shown_rank = (get_color_rank(rank)
                          if len(newly_broken_ids) > 0 else rank)
            print(" {}: {} {}".format(
                shown_rank, conflict_summary_line(c1, c2), examples))

    # Find duplicate trees per study
    print("Studies with duplicate trees:\n\n")
    trees_for_study = defaultdict(set)
    for study_tree in run2.read_input_trees()['input_trees']:
        study, tree = study_tree.split('@')
        trees_for_study[study].add(tree)
    for study, trees in trees_for_study.items():
        if len(trees) <= 1:
            continue
        print("{} : {} : {}".format(study, len(trees), trees))
parser.add_argument('-v', dest='verbose', action='store_true', help='print details on diffs; default false') return parser.parse_args() if __name__ == "__main__": args = parse_cmdline() print(" * Synth output: {d}".format(d=args.synth), file=sys.stderr) taxon = args.taxon taxon_id = get_id_from_ottnum(taxon) print(" * Loading OTT ... ", end='', flush=True, file=sys.stderr) taxonomy = ott.OTT() print("done. (Using version {})".format(taxonomy.version), flush=True, file=sys.stderr) # get stats object for each run synth = runStatistics(args.synth) print(" * Finding nodes for taxon {}, id={}".format(args.taxon, taxon_id), file=sys.stderr) desc = get_all_descendants(taxonomy, taxon_id) print(" - Found {} descendants, including original taxon".format( len(desc)), file=sys.stderr) conflict = synth.get_taxon_conflict_info()