def switch_hosts(self, t0, seed=None):
    """
    Select an extant pathogen lineage at random and reassign its host.

    The chosen lineage is closed off at time ``t0`` and a new single-child
    node, carrying the new host, is placed above it to record the switch.

    :param t0: time of the host-switch event; stored as the ``height`` of
        the new node and used to compute the branch length of the old one.
    :param seed: optional seed for the ``random`` module.  Any value other
        than ``None`` (including ``0``) re-seeds the generator.
    :return: None.  ``self.extant_p`` and ``self.not_extant_p`` are
        updated in place.
    """
    assert len(self.extant_h) > 1, "Error: attempted to switch between one host"

    # ``if seed:`` would silently skip a seed of 0, so test against None.
    if seed is not None:
        random.seed(seed)

    # select an extant pathogen lineage at random
    pick_p = random.choice(self.extant_p)

    # redraw until the new host differs from the current one
    pick_h = pick_p.host
    while pick_h == pick_p.host:
        pick_h = random.choice(self.extant_h)

    # add a node of degree size 2 to annotate host switch event in tree
    pick_p.dist = t0 - pick_p.height
    next_p = Tree(
        name=pick_p.name + '_m%s-%sm' % (pick_p.host.name, pick_h.name),
        dist=0,
    )
    next_p.add_features(host=pick_h, height=t0)
    pick_p.up = next_p
    next_p.children = [pick_p]

    # the new node replaces the old lineage in the extant set
    self.extant_p.remove(pick_p)
    self.extant_p.append(next_p)
    self.not_extant_p.append(pick_p)
def build_conv_topo(annotated_tree, vnodes):
    """Build a modified copy of ``annotated_tree`` around a duplicated subtree.

    Working on a deep copy, the function finds the common ancestor of all
    nodes flagged ``T=True``, duplicates the subtree rooted there, prunes the
    original to its leaves with ``C=False`` and the duplicate to its leaves
    with ``C=True``, and hangs both under a new ``dup_point`` node.  All nodes
    of the resulting tree are then renumbered (``ND``) in postorder.

    :param annotated_tree: tree whose nodes carry ``T``, ``C`` and ``ND``
        features (presumably an ete3 tree -- confirm against the caller).
    :param vnodes: container of ``ND`` values; duplicated nodes whose ``ND``
        is not in it get a near-zero branch length.
    :return: the root of the rebuilt tree.
    """
    tconv = annotated_tree.copy(method="deepcopy")
    # Mark leaves (L=1) and tag every node as belonging to the original (COPY=0).
    for n in tconv.iter_leaves():
        n.add_features(L=1)
    for n in tconv.traverse():
        n.add_features(COPY=0)
    # get the most recent ancestral node of all the convergent clades
    l_convergent_clades = tconv.search_nodes(T=True)
    common_anc_conv=tconv.get_common_ancestor(l_convergent_clades)

    # duplicate it at its same location (branch length = 0). we get
    # a duplicated subtree with subtrees A and B (A == B)
    # NOTE(review): dist_dup and dup_point_root are assigned but never read
    # in this function.
    dist_dup = common_anc_conv.dist
    if not common_anc_conv.is_root():
        # The sister is created before detaching so dup_point stays attached
        # to the rest of the tree at the ancestor's former position.
        dup_point = common_anc_conv.add_sister(name="dup_point",dist=0.000001)
        dup_point_root = False
    else:
        dup_point = Tree()
        dup_point_root = True
        dup_point.dist=0.000001

    dup_point.add_features(ND=0,T=False, C=False, Cz=False)

    common_anc_conv.detach()
    common_anc_conv_copy = common_anc_conv.copy(method="deepcopy")

    # tag duplicated nodes:
    for n in common_anc_conv_copy.traverse():
        n.COPY=1
        if n.ND not in vnodes and not n.is_root():
            n.dist=0.000001

    # prune A (the original subtree) down to its leaves with C=False
    # NOTE(review): the original comment read "pruned A from all branches not
    # leading to any convergent clade"; confirm the meaning of C.
    l_leaves_to_keep_A = common_anc_conv.search_nodes(COPY=0, C=False, L=1)
    #logger.debug("A: %s",l_leaves_to_keep_A)
    common_anc_conv.prune(l_leaves_to_keep_A, preserve_branch_length=True)

    # prune B (the duplicate) down to its leaves with C=True
    # NOTE(review): the original comment read "pruned B from all branches not
    # leading to any non-convergent clade"; confirm the meaning of C.
    l_leaves_to_keep_B = common_anc_conv_copy.search_nodes(COPY=1, C=True, L=1)
    #logger.debug("B : %s", l_leaves_to_keep_B)
    common_anc_conv_copy.prune(l_leaves_to_keep_B, preserve_branch_length=True)

    # Hang the duplicate first, then the original, under the duplication point.
    dup_point.add_child(common_anc_conv_copy)
    dup_point.add_child(common_anc_conv)

    tconv = dup_point.get_tree_root()

    # Renumber every node in postorder, starting from 0.
    nodeId = 0
    for node in tconv.traverse("postorder"):
        node.ND = nodeId
        nodeId += 1

    return tconv
def birth(tree, node):
    """Give ``node`` two new children and return ``tree``.

    Each child is a fresh ``Tree`` with branch length 0 and the feature
    ``extinct=False``.  ``tree`` itself is not touched here; it is handed
    back unchanged so callers can write ``tree = birth(tree, node)``.
    """
    offspring = [Tree(), Tree()]
    for child in offspring:
        child.dist = 0
    for child in offspring:
        child.add_features(extinct=False)

    # attach both descendants to the node where the event occurs
    for child in offspring:
        node.add_child(child)
    return tree
def initialise(rate):
    """Create a starting tree: one root that has undergone a single birth.

    A root with ``extinct=False`` and branch length 0.0 is created, one of
    its leaves is drawn at random and passed to ``birth``, and then every
    non-extinct leaf is lengthened by one waiting time drawn from
    ``random.expovariate(rate)``.

    :param rate: rate parameter passed to ``random.expovariate``.
    :return: the tree returned by ``birth``.
    """
    root = Tree()
    root.add_features(extinct=False)
    root.dist = 0.0

    # draw the leaf on which the first birth event happens
    founder = random.choice(root.get_leaves())
    root = birth(root, founder)

    tips = root.get_leaves()
    waiting_time = random.expovariate(rate)
    for tip in tips:
        if tip.extinct:
            continue
        tip.dist += waiting_time
    return root
def makeNewDistanceMatrix(n, seqStringList, distanceMatrix, i, j, dictPos, dictTree):
    """Build the next, labelled distance matrix after a merge step.

    The result is an ``(n + 1) x (n + 1)`` list of lists whose first row and
    first column hold the labels ``seqStringList[0:n]`` and whose top-left
    cell is ``"~"``.  Distances are copied from ``distanceMatrix`` using the
    old positions stored in ``dictPos``; the diagonal and every cell in row
    or column ``i`` are set to 0.

    :param n: number of labels in the new matrix.
    :param seqStringList: labels of the new matrix; only the first ``n``
        entries are used.
    :param distanceMatrix: previous labelled matrix (row 0 / column 0 hold
        labels).
    :param i: row/column index (in the new matrix) left at 0; also the row
        of ``distanceMatrix`` naming the first merged node.
    :param j: column of ``distanceMatrix`` naming the second merged node.
    :param dictPos: maps label -> ``(old_position, new_position)``; updated
        in place.
    :param dictTree: maps label -> tree node; a new parent node is added for
        any label not yet known to ``dictPos``.
    :return: ``(newMatrix, dictPos, dictTree)``.
    """
    size = n

    # Header row.  While writing it, record each label's position in the new
    # matrix as (old position, new position).
    header = ["~"]
    for column in range(1, size + 1):
        label = seqStringList[column - 1]
        header.append(label)
        if label in dictPos:
            dictPos[label] = (dictPos[label][0], column)
        else:
            # Unknown label: this is the node created by the merge, so a new
            # entry and a new parent tree node are needed.
            dictPos[label] = (column, column)
            merged = Tree()
            merged.add_child(dictTree[distanceMatrix[i][0]])
            merged.add_child(dictTree[distanceMatrix[0][j]])
            merged.add_features(name=label, dist=0)
            dictTree[label] = merged
            # TODO (from the original author): the old values should be
            # deactivated here.

    newMatrix = [header]
    for row in range(1, size + 1):
        row_label = seqStringList[row - 1]
        rowScore = [row_label]
        for column in range(1, size + 1):
            if row != i and column != i and row != column:
                old_row = dictPos[row_label][0]
                old_col = dictPos[seqStringList[column - 1]][0]
                rowScore.append(distanceMatrix[old_row][old_col])
            else:
                rowScore.append(0)
        newMatrix.append(rowScore)

    # Promote the new positions to "old" for the next iteration.  The loop
    # covers exactly the n labels of this matrix; the previous version began
    # at row 0 and therefore also touched seqStringList[-1].
    for index in range(size):
        label = seqStringList[index]
        new_pos = dictPos[label][1]
        dictPos[label] = (new_pos, new_pos)

    return newMatrix, dictPos, dictTree
def parse_tree(json_obj):
    """Build a tree from the ``"tree"`` list of a decoded JSON object.

    Each entry is either a ``"stem"`` (with ``parent`` and ``child`` ids) or
    a ``"leaf"`` (with ``parent`` id and ``label``).  Internal nodes are
    identified by a ``custom_name`` feature holding the id as a string; the
    root has ``custom_name='0'``.  Parents must appear before their children.

    :param json_obj: mapping with a ``"tree"`` key holding the entry list.
    :return: the root of the constructed tree.
    """
    root = Tree()
    root.add_features(custom_name='0')

    for entry in json_obj["tree"]:
        stem = entry.get("stem")
        if stem:
            # internal branch: attach an unnamed child tagged with its id
            if stem["parent"] == 0:
                anchor = root
            else:
                anchor = root.search_nodes(custom_name=str(stem["parent"]))[0]
            new_node = anchor.add_child()
            new_node.add_features(custom_name=str(stem["child"]))
            continue

        leaf = entry.get("leaf")
        if leaf:
            # terminal branch: attach a named leaf under its parent
            anchor = root.search_nodes(custom_name=str(leaf["parent"]))[0]
            anchor.add_child(name=str(leaf["label"]))

    return root
def subtree(clone):
    '''Return the tree node for ``clone`` with all of its subclones attached.

    The branch length is the clone's birthday minus its parent's birthday.
    The node carries ``weight`` (family size, or 10*log10 of it when ``log``
    is set) and ``rgb_color``.  Subclones whose family size is below
    ``det_lim`` are left out when ``det_lim`` is positive.  ``log`` and
    ``det_lim`` are read from the enclosing scope.'''
    branch_length = clone.birthday - clone.parent.birthday
    node = Tree(name=clone.ID, dist=branch_length)

    family = clone.get_family_size()
    weight = 10*np.log10(family) if log == True else family
    node.add_features(weight=weight, rgb_color=clone.rgb_color)

    # work on a fresh list so the clone's own subclones list is never altered
    if det_lim > 0:
        visible = [sub for sub in clone.subclones[:]
                   if sub.get_family_size() >= det_lim]
    else:
        visible = clone.subclones[:]

    # recurse into every retained subclone
    for sub in visible:
        node.add_child(subtree(sub))
    return node
def makeDistanceMatrix(seqList, dictPos, dictTree, createDic):
    """Return an all-zero labelled distance matrix for ``seqList``.

    The matrix has one extra row and column: cell ``[0][0]`` is ``"~"`` and
    the rest of row 0 / column 0 hold the sequence names.  When ``createDic``
    is truthy the initial dictionaries are filled as well: ``dictPos[name]``
    becomes ``(column, column)`` and ``dictTree[name]`` a new ``Tree`` node
    with features ``name`` and ``active=True``.

    :return: ``(distanceMatrix, dictPos, dictTree)``.
    """
    labels = [seq.getName() for seq in seqList]

    # create the INITIAL dictionaries
    if createDic:
        for position, label in enumerate(labels, start=1):
            dictPos[label] = (position, position)
            leaf = Tree()
            leaf.add_features(name=label, active=True)
            dictTree[label] = leaf

    width = len(labels)
    matrix = [["~"] + labels]
    for label in labels:
        matrix.append([label] + [0] * width)
    return matrix, dictPos, dictTree
def partitionTreeSet(N):
    """Return a tuple of trees whose root is labelled ``N``.

    For ``N == 1`` the result is a single one-node tree.  Otherwise, for each
    ``k`` in ``range(lam(N))``, every tree from ``partitionTreeSet(N-(k+1))``
    is paired with every tree from ``partitionTreeSet(k+1)`` under a new root
    carrying ``value=N`` and ``name=str(N)``.  Every node gets a face built
    by ``styleFace`` from its name, and all branch lengths are set to 1.
    """
    if N == 1:
        single = Tree(";", format=100)
        single.add_features(value=N, name=str(N))
        single.add_face(styleFace(single.name), column=0, position="branch-top")
        return (single,)

    template = Tree(";", format=100)
    template.dist = 1

    combined = []
    for k in range(lam(N)):
        left_set = partitionTreeSet(N - (k + 1))
        right_set = partitionTreeSet(k + 1)
        for left in left_set:
            for right in right_set:
                left.dist = 1
                right.dist = 1
                # fresh root for this (left, right) pair; children are copies
                joined = template.copy()
                joined.dist = 1
                joined.add_features(value=N, name=str(N))
                joined.add_child(left.copy())
                joined.add_child(right.copy())
                joined.add_face(styleFace(joined.name), column=0,
                                position="branch-top")
                combined.append(joined)
    return tuple(combined)
# Parse the node information for the root root_info = lines[last_comments:] median = float(root_info.split("age_median=")[1].split(":")[0]) mean = float(root_info.split("age_mean=")[1].split("]")[0]) sd = float(root_info.split("age_sd=")[1].split(":")[0]) min = float(root_info.split("age_range={")[1].split("_")[0]) max = float(root_info.split("age_range={")[1].split("_")[1].split("}")[0]) ciMin = float(root_info.split("age_quant_5_95={")[1].split("_")[0]) ciMax = float( root_info.split("age_quant_5_95={")[1].split("_")[1].split("}")[0]) id = float(root_info.split("id=")[1].split(":")[0]) t.add_features(support=1.0, age_median=median, age_mean=mean, age_sd=sd, age_range="{" + str(min) + "_" + str(max) + "}", age_quant_5_95="{" + str(ciMin) + "_" + str(ciMax) + "}", id=id) ts = TreeStyle() ts.min_leaf_separation = 0 ts.show_scale = False ts.show_leaf_name = False ts.scale = scale #0.3 #0.1# 0.3 # 10 pixels per branch length unit nstyle = NodeStyle() nstyle["size"] = 0.0001 for n in t.traverse(): n.set_style(nstyle) for leaf in t:
def build_tree(population, det_lim=1, log=False):
    '''Build a Tree and a TreeStyle from the clone phylogeny of ``population``.

    The start clone becomes the root (branch length 0); every other clone is
    placed under its parent with a branch length equal to the difference of
    their birthdays.  Clones whose family size is below ``det_lim`` are left
    out (no filtering when ``det_lim`` is not positive).  With ``log`` set,
    node weights are 10*log10 of the family size instead of the size itself.

    Returns the tuple ``(tree, tree_style)``.'''

    def tree_layout(node):
        '''Layout callback: colour and size a node from its features'''
        node.img_style["fgcolor"] = '#%02X%02X%02X' %(node.rgb_color)
        node.img_style["size"] = node.weight

    def make_node(clone, branch_length):
        '''Create the tree node for one clone, with weight and colour set'''
        node = Tree(name=clone.ID, dist=branch_length)
        family = clone.get_family_size()
        weight = 10*np.log10(family) if log == True else family
        node.add_features(weight=weight, rgb_color=clone.rgb_color)
        return node

    def detectable(clones):
        '''Return a new list of the clones at or above the detection limit'''
        candidates = clones[:]  # never filter the caller's list in place
        if det_lim > 0:
            candidates = [c for c in candidates
                          if c.get_family_size() >= det_lim]
        return candidates

    def subtree(clone):
        '''Recursively build the node for ``clone`` and all its subclones'''
        node = make_node(clone, clone.birthday - clone.parent.birthday)
        for sub in detectable(clone.subclones):
            node.add_child(subtree(sub))
        return node

    founder = population.start_clone
    root = make_node(founder, 0)
    for sub in detectable(founder.subclones):
        root.add_child(subtree(sub))

    # Define TreeStyle
    style = TreeStyle()
    style.show_leaf_name = False
    style.show_branch_length = False
    style.show_branch_support = False
    style.rotation = 90  # rotate the tree to get a horizontal one
    style.layout_fn = tree_layout
    return root, style
def main():
    """Run the neighbour-joining pipeline and compare the result to file trees.

    Reads the sequences from "proteines.fa", builds the initial distance
    matrix, repeatedly merges the closest pair until two nodes remain, joins
    those under a final node, prints the tree, and prints the value returned
    by ``robinsonFould`` for each tree read from "arbres.nw".
    """
    separator = " ========================================="

    # Build the Sequence objects (name + content), then strip the gaps
    seqList = removeGaps(readSequences("proteines.fa"))
    print("New sequences")
    printSequences(seqList)
    print(separator)

    # Parse the BLOSUM62 matrix
    blosumMatrix = makeBlosumMatrix()
    dictPos = {}
    dictTree = {}

    # First distance matrix, scored with BLOSUM62
    distanceMatrix, dictPos, dictTree = calculateDistanceMatrix(
        blosumMatrix, seqList, dictPos, dictTree)
    print("Matrice initiale des distances")
    printMatrix(distanceMatrix)
    print(separator)
    print("Matrice pondérée")
    negativeMatrix, posSmallest, dictPos, dictTree = calculateNJMatrix(
        seqList, distanceMatrix, dictPos, dictTree)
    printMatrix(negativeMatrix)
    print("Smallest is: ", posSmallest[0], posSmallest[1])

    # Merge the two closest sequences into a new one: this updates the list
    # of sequences and adds the new node to the tree
    seqListString, distanceMatrix, _, dictPos, dictTree = updateNJTree(
        posSmallest[0], posSmallest[1], len(seqList), distanceMatrix,
        seqList, dictPos, dictTree)
    print(separator)
    print(separator)
    print("Matrice des distances après 1 itération")
    printMatrix(distanceMatrix)

    # Keep merging until only 2 nodes are left in the list of sequences.
    # Those two are then joined under an empty root (NJ yields an unrooted
    # tree).
    while len(seqListString) > 2:
        # seqListString holds plain strings; wrap them in Sequence objects
        seqList = [Sequence(label, "", "") for label in seqListString]

        # Recompute the NJ matrix from the new distance matrix
        negativeMatrix, posSmallest, dictPos, dictTree = calculateNJMatrix(
            seqList, distanceMatrix, dictPos, dictTree)
        print(separator)
        print("Matrice pondérée")
        printMatrix(negativeMatrix)
        print("Smallest is: ", posSmallest[0], posSmallest[1])
        print("")

        # Merge the closest pair, as above
        seqListString, distanceMatrix, _, dictPos, dictTree = updateNJTree(
            posSmallest[0], posSmallest[1], len(seqList), distanceMatrix,
            seqList, dictPos, dictTree)
        print(separator)
        print(separator)
        print("Matrice des distances après itérations")
        printMatrix(distanceMatrix)

    # One last join remains; the matrix always has 3 columns at this point
    t = Tree()
    t.add_child(dictTree[distanceMatrix[0][2]])
    t.add_child(dictTree[distanceMatrix[0][1]])
    t.add_features(name='notaroot')
    t.add_features(dist=0)
    dictTree['notaroot'] = t

    print(separator)
    print("Arbre NJ obtenu")
    print(t)
    print(separator)

    treesFromFile = readTrees("arbres.nw")
    for cpt, fileTree in enumerate(treesFromFile):
        print("Comparaison RF entre Arbre NJ et Arbre ", cpt, " du fichier.")
        print(robinsonFould(fileTree, t))
    distances(t)
dictPos[nameNode] = i t = Tree() a = t.add_child(name=nameNode) a.add_features(active=True) dictTree[nameNode] = a print(a) #Exemple de merge nameNode = 'd' dictPos[nameNode] = 1 #nameList = ['a','d'] noeud = Tree() #a = t.add_child(name=nameNode) noeud.add_child(dictTree[nameList[1]]) noeud.add_child(dictTree[nameList[2]]) noeud.add_features(name=nameNode) dictTree[nameNode] = noeud test = dictTree[nameNode] print(test.get_ascii(show_internal=True)) print(noeud.get_ascii(show_internal=True)) print(dictPos) print(dictTree) for node in t1: if node.is_root(): print("hello") #if not node.is_leaf(): #innerbranch.append(node) #print (node) #for leaf in t1:
def build_tree(data, feature_info, sens, expl, output, metric, conf,
               max_depth, min_leaf_size=100, agg_type='avg', max_bins=10,
               subsample_frac=1.0):
    """
    Builds a decision tree guided towards nodes with high bias

    The root is scored on the whole dataset, then ``rec_build_tree`` grows
    the tree in place by repeatedly asking ``select_best_feature`` for the
    best split until ``max_depth`` is reached, no features remain, or no
    split is found.

    Parameters
    ----------
    data :
        the dataset
    feature_info :
        information about user features
    sens :
        name of the sensitive feature
    expl :
        name of the explanatory feature
    output :
        the target feature
    metric :
        the fairness metric to use
    conf :
        the confidence level
    max_depth :
        maximum depth of the decision-tree
    min_leaf_size :
        minimum size of a leaf
    agg_type :
        aggregation method for children scores
    max_bins :
        maximum number of bins to use when binning continuous features
    subsample_frac :
        forwarded unchanged to ``SplitParams``
        (presumably the fraction of rows sampled when scoring -- confirm)

    Returns
    -------
    tree :
        the tree built by the algorithm
    """
    from ete3 import Tree

    logging.info('Building a Guided Decision Tree')
    tree = Tree()

    # Check if there are multiple labeled outputs
    # targets = data.columns[-output.num_labels:].tolist()
    targets = output.names.tolist()
    logging.debug('Targets: %s', targets)

    # Contextual features: every column except sensitive, explanatory, targets
    features = set(data.columns.tolist()) - set([sens, expl]) - set(targets)
    logging.debug('Contextual Features: %s', features)

    # check the data dimensions
    if metric.dataType == Metric.DATATYPE_CORR:
        if expl:
            dim = (feature_info[expl].arity, 6)
        else:
            dim = 6
    else:
        # get the dimensions of the OUTPUT x SENSITIVE contingency table
        if expl:
            dim = (feature_info[expl].arity, output.arity,
                   feature_info[sens].arity)
        else:
            dim = (output.arity, feature_info[sens].arity)
    logging.debug('Data Dimension for Metric: %s', dim)

    # bin the continuous features
    cont_thresholds = find_thresholds(data, features, feature_info, max_bins)

    score_params = ScoreParams(metric, agg_type, conf)
    split_params = SplitParams(targets, sens, expl, dim, feature_info,
                               cont_thresholds, min_leaf_size, subsample_frac)

    # get a measure for the root
    if metric.dataType == Metric.DATATYPE_CT:
        stats = [count_values(data, sens, targets[0], expl, dim)[0]]
    elif metric.dataType == Metric.DATATYPE_CORR:
        stats = [corr_values(data, sens, targets[0], expl, dim)[0]]
    else:
        stats = [data[targets + [sens]]]

    _, root_metric = score(stats, score_params)
    tree.add_features(metric=root_metric[0])

    #
    # Builds up the tree recursively. Selects the best feature to split on,
    # in order to maximize the average bias (mutual information) in all
    # sub-trees.
    #
    def rec_build_tree(node_data, node, pred, split_features, depth,
                       parent_score, pool):
        """
        Recursive tree building.

        Parameters
        ----------
        node_data :
            the data for the current node
        node :
            the tree node to grow; children are attached to it in place
        pred :
            the predicate defining the current context
            (a list of predicate strings)
        split_features :
            the features on which a split can occur
        depth :
            the current depth
        parent_score :
            the metric score at the parent
        pool :
            the thread pool

        Returns
        -------
        None
            the tree is extended in place under ``node``
        """
        node.add_features(size=len(node_data))

        # make a new leaf if recursion is stopped
        if (depth == max_depth) or (len(split_features) == 0):
            return

        logging.debug('looking for splits at pred %s', pred)

        # select the best feature to split on
        split_score, best_feature, threshold, to_drop, child_metrics = \
            select_best_feature(node_data, split_features, split_params,
                                score_params, parent_score, pool)

        # no split found, make a leaf
        if best_feature is None:
            return

        logging.info('splitting on %s (score=%s) with threshold %s at pred %s',
                     best_feature, split_score, threshold, pred)

        # NOTE(review): this is a truthiness test, so a threshold of 0 would
        # take the categorical branch -- confirm select_best_feature never
        # returns 0 as a threshold.
        if threshold:
            # binary split
            data_left = node_data[node_data[best_feature] <= threshold]
            data_right = node_data[node_data[best_feature] > threshold]

            # predicates for sub-trees
            pred_left = "{} <= {}".format(best_feature, threshold)
            pred_right = "{} > {}".format(best_feature, threshold)

            # add new nodes to the underlying tree structure
            left_child = node.add_child(name=str(pred_left))
            left_child.add_features(feature_type='continuous',
                                    feature=best_feature,
                                    threshold=threshold,
                                    is_left=True,
                                    metric=child_metrics['left'])

            right_child = node.add_child(name=str(pred_right))
            right_child.add_features(feature_type='continuous',
                                     feature=best_feature,
                                     threshold=threshold,
                                     is_left=False,
                                     metric=child_metrics['right'])

            # recursively build the tree
            # (the split feature itself stays available further down unless
            # it is listed in to_drop)
            rec_build_tree(data_left, left_child, pred + [pred_left],
                           split_features - set(to_drop), depth + 1,
                           split_score, pool)
            rec_build_tree(data_right, right_child, pred + [pred_right],
                           split_features - set(to_drop), depth + 1,
                           split_score, pool)
        else:
            # categorical split
            for val in node_data[best_feature].unique():

                # check if this child was pruned or not
                if val in child_metrics:
                    # predicate for the current sub-tree
                    new_pred = "{} = {}".format(best_feature, val)

                    # add a node to the underlying tree structure
                    child = node.add_child(name=str(new_pred))
                    child.add_features(feature_type='categorical',
                                       feature=best_feature,
                                       category=val,
                                       metric=child_metrics[val])

                    child_data = node_data[node_data[best_feature] == val]

                    # recursively build the tree
                    # (a categorical feature is removed from the candidates
                    # once it has been split on)
                    rec_build_tree(
                        child_data, child, pred + [new_pred],
                        split_features - set(to_drop + [best_feature]),
                        depth + 1, split_score, pool)

    #
    # When contextual features are just a few there is
    # no actual benefit out of parallelization. In fact,
    # contention introduces a slight overhead. Hence,
    # use only one thread to score less than 10 features.
    #
    # NOTE(review): both branches currently set pool_size to 1, so the
    # multiprocessing path below is never taken.
    if len(features) < 10:
        pool_size = 1
    else:
        pool_size = 1  # max(1, multiprocessing.cpu_count() - 2)

    if pool_size == 1:
        rec_build_tree(data, tree, [], features, 0, 0, None)
    else:
        pool = multiprocessing.Pool(pool_size)
        rec_build_tree(data, tree, [], features, 0, 0, pool)
        pool.close()
        pool.join()

    return tree