def test_unzip(self):
    """Round trip: unzip(zip(*items)) should give back items unchanged"""
    letter_rows = [list("abcde"), list("ghijk")]
    number_rows = [[1, 2, 3, 4, 5], [0, 0, 0, 0, 0]]
    string_rows = [["abcde", "fghij", "klmno"], ["xxxxx"] * 3]
    fixtures = [letter_rows, number_rows, string_rows]

    # Run every fixture through zip -> unzip before asserting anything, so
    # a failure inside unzip surfaces ahead of any comparison failure.
    round_tripped = [unzip(zip(*rows)) for rows in fixtures]

    for observed, expected in zip(round_tripped, fixtures):
        self.assertEqual(observed, expected)
def get_polyphyletic(cons):
    """Collect, per named clade, its parent names and a representative tip

    ``cons`` is a mapping whose items are split with ``unzip`` into tips and
    taxonomy strings, which are used to build a consensus tree.

    Returns a dict keyed by ``(name, Rank)`` for every named internal node.
    Each value is a dict mapping the name of a parent under which that
    (name, rank) occurs to the first cached tip name of the node found
    there. A named node without a parent still gets an (empty) entry.
    NOTE(review): presumably a key with more than one parent entry is what
    marks a group as polyphyletic -- confirm against the caller.
    """
    tip_ids, lineages = unzip(cons.items())
    consensus, _lookup = make_consensus_tree(lineages, False, tips=tip_ids)
    cache_tipnames(consensus)

    groups = {}
    for clade in consensus.non_tips():
        if clade.name is not None:
            # always register the (name, rank) key, even for a parentless node
            by_parent = groups.setdefault((clade.name, clade.Rank), {})
            if clade.parent is not None:
                by_parent[clade.parent.name] = clade.tip_names[0]

    return groups
def name_node_score_fold(tree, score_f=fmeasure, tiebreak_f=min_tips,
                         verbose=False):
    """Score each (rank, name) on internal nodes and keep one node per name

    The tree is walked once. Every (node, score) pair is collected in a dict
    indexed by rank and then by name, so the winning node for a name can be
    chosen from that dict instead of searching the tree again.

    Parameters
    ----------
    tree
        Object providing ``non_tips(include_self=True)``. Each yielded node
        is read for ``RankNames``, ``ValidRelFreq`` and ``ConsensusRelFreq``.
    score_f : callable
        Called as ``score_f(precision, recall)`` for each named rank.
    tiebreak_f : callable
        Called when several nodes share the top score for a name. It gets a
        list with one slot per candidate (best score first): the node itself
        if it is tied for the top score, otherwise ``None``. It returns the
        node to keep.
    verbose : bool
        When true, print a start-up message.

    Returns
    -------
    dict
        Rank index -> list of ``(name, top score)`` tuples.

    Side effects
    ------------
    ``RankNameScores`` is set on every visited node, and ``RankNames[rank]``
    is set to ``None`` on every node that is not kept for its name.
    """
    if verbose:
        print("Starting name_node_score_fold...")

    n_ranks = len(RANK_ORDER)
    candidates = {rank_idx: {} for rank_idx in range(n_ranks)}

    for node in tree.non_tips(include_self=True):
        node.RankNameScores = [None] * n_ranks

        for rank, name in enumerate(node.RankNames):
            if name is None:
                continue

            # precision: read from ValidRelFreq -- the share of informative
            # descending tips that carry this name
            precision = node.ValidRelFreq[rank][name]

            # recall: read from ConsensusRelFreq -- the descending tips with
            # this name relative to all tips in the tree with this name
            recall = node.ConsensusRelFreq[rank][name]

            # store the score at the corresponding rank position so it can
            # be inspected later in other contexts
            score = score_f(precision, recall)
            node.RankNameScores[rank] = score

            candidates[rank].setdefault(name, []).append((node, score))

    # walk the collected candidates and settle on one node per name
    used_scores = {}
    for rank, by_name in candidates.items():
        best_for_rank = []
        used_scores[rank] = best_for_rank

        for name, pairs in by_name.items():
            # ascending stable sort followed by a reversal: best score
            # first, with equal scores in reverse encounter order
            ranked = sorted(pairs, key=itemgetter(1))
            ranked.reverse()

            nodes, scores = unzip(ranked)
            scores = array(scores)
            best_for_rank.append((name, scores[0]))

            is_top = scores == scores[0]
            if sum(is_top) > 1:
                # tie: keep each tied node in its slot and pad the other
                # slots with None, then let the tiebreaker choose
                tie_nodes = [
                    candidate if tied else None
                    for candidate, tied in zip(nodes, is_top)
                ]
                node_to_keep = tiebreak_f(tie_nodes)

                for candidate, _ in ranked:
                    if not (candidate == node_to_keep):
                        candidate.RankNames[rank] = None
            else:
                # clear winner: strip the name from every runner-up
                for candidate, _ in ranked[1:]:
                    candidate.RankNames[rank] = None

    return used_scores