def label_duplications(root, recursive=True):
    """Label duplication nodes with 'D'.

    A node counts as a duplication when at least one name found under its
    first child is also found under its second child. Only the first two
    children are compared. Tips are left untouched.

    Args:
        root: Node to inspect. It must expose ``istip``, ``children`` and
            ``label``. A tip child contributes its own ``label`` as the
            only name on its side; an internal child contributes
            ``read_trees.postorder3(child).bipart_proper``.
        recursive: If true, every descendant of ``root`` is checked as
            well; otherwise only ``root`` itself is examined.

    Returns:
        None. The ``label`` of each duplication node is overwritten in
        place.
    """
    if root.istip:
        return

    children = list(root.children)
    side_names = []
    for side in (children[0], children[1]):
        if side.istip:
            side_names.append([side.label])
        else:
            side_names.append(read_trees.postorder3(side).bipart_proper)
    bipart1, bipart2 = side_names

    # Overlap is symmetric, so a single disjointness test replaces the two
    # mirrored membership loops.
    if not set(bipart1).isdisjoint(bipart2):
        root.label = 'D'

    if recursive:
        for node in children:
            label_duplications(node)
def subtrees_function(root, subtrees=None):
    """Split a tree into the largest subtrees free of internal duplications.

    The names under each of the first two children of ``root`` are
    collected with ``read_trees.postorder3``. A side holding a repeated
    name contains a downstream duplication and is split further by
    recursion; a side without one is kept whole. When neither side has a
    repeat, the two sides are kept separately if they share a name (this
    node is itself a duplication), otherwise ``root`` is kept as a single
    subtree.

    NB: the nodes placed in the list are the ones in the original tree;
    no copies are made.

    Args:
        root: Node of the tree to split.
        subtrees: List to which results are appended. A new list is
            created when omitted.

    Returns:
        The list of subtree root nodes (the same object as ``subtrees``
        when one was passed in).
    """
    if subtrees is None:
        subtrees = []

    kids = list(root.children)
    left, right = kids[0], kids[1]
    left_names = read_trees.postorder3(left).bipart_proper
    right_names = read_trees.postorder3(right).bipart_proper

    # Does *this* node correspond to a duplication, i.e. do the two sides
    # share at least one name?
    shared = any(name in right_names for name in left_names)

    # A repeated name inside one side marks a subduplication further down.
    sides = (
        (left, len(left_names) != len(set(left_names))),
        (right, len(right_names) != len(set(right_names))),
    )
    needs_split = [side for side, has_repeat in sides if has_repeat]

    if needs_split:
        # Keep every clean side whole first, then descend into the sides
        # that still contain a subduplication.
        subtrees.extend(side for side, has_repeat in sides if not has_repeat)
        for side in needs_split:
            subtrees_function(side, subtrees)
    elif shared:
        # No subduplications, but this node is one: keep the sides apart.
        subtrees.append(left)
        subtrees.append(right)
    else:
        # No duplication anywhere at or below this node.
        subtrees.append(root)
    return subtrees
def compare_trees(species_biparts, species_name_array, subtree, extra_names,
                  family, mode, log_name, cutoff, gene_name):
    """Compare a subtree to a species tree already split into biparts.

    Starting at ``subtree``, the function climbs through parents whose
    label is 'D' after ``make_trees.label_duplications`` has been applied
    to them. At the first parent that is not labelled 'D', the names under
    its other children are collected (if there are several such children,
    the last one wins). Those names are appended to the subtree's own name
    list before the biparts are compared with ``comp_biparts``.

    Args:
        species_biparts: Biparts of the species tree.
        species_name_array: Names present in the species tree.
        subtree: Root node of the gene subtree to compare.
        extra_names: Additional names appended to the gene name list when
            ``family`` is "homolog".
        family: Only the value "homolog" is treated specially here.
        mode: "n" or "s" passes the species biparts/names to
            ``comp_biparts`` first; "r" passes the subtree biparts/names
            first.
        log_name: Forwarded unchanged to ``comp_biparts``.
        cutoff: Forwarded unchanged to ``comp_biparts``.
        gene_name: Forwarded unchanged to ``comp_biparts``.

    Returns:
        A ``(conflicts, concordances)`` tuple of lists holding the items
        returned by ``comp_biparts`` whose ``relation`` is "conflict" and
        "concordant" respectively.

    Raises:
        ValueError: If ``mode`` is not "n", "s" or "r".
    """
    conflicts = []
    concordances = []
    current_node = subtree
    new_names = []

    # Add the names that are 'next-door' to this subtree on the overall
    # tree. NOTE(review): the original author flagged the reason for this
    # as unclear -- confirm before relying on it.
    while True:
        parent = current_node.parent
        if parent is None:
            break
        make_trees.label_duplications(parent, recursive=False)
        if parent.label != "D":
            for sibling in parent.children:
                if sibling != current_node:
                    new_names = read_trees.postorder3(sibling).bipart_proper
            break
        # The parent is a duplication: keep climbing.
        current_node = parent

    gene_name_array = read_trees.postorder3(subtree).bipart_proper
    gene_name_array.extend(new_names)

    # If this is a homolog then the earliest node to not have a
    # duplication must be found.
    if family == "homolog":
        gene_name_array.extend(extra_names)

    # Actually making the comparisons.
    subtree_biparts = read_trees.postorder2(subtree, subtrees=True)
    if mode in ("n", "s"):
        rels = comp_biparts(species_biparts, subtree_biparts,
                            species_name_array, gene_name_array,
                            log_name, cutoff, mode, gene_name)
    elif mode == "r":
        rels = comp_biparts(subtree_biparts, species_biparts,
                            gene_name_array, species_name_array,
                            log_name, cutoff, mode, gene_name)
    else:
        # Previously an unknown mode fell through and crashed on the
        # unbound name ``rels``; fail with an explicit message instead.
        raise ValueError(
            "unsupported mode %r: expected 'n', 's' or 'r'" % (mode,))

    for rel in rels:
        if rel.relation == "conflict":
            conflicts.append(rel)
        elif rel.relation == "concordant":
            concordances.append(rel)
    return conflicts, concordances
def conflict_stats(conflicts_dict, tree, outfile):
    """Write a per-node table of conflicts, most common first.

    For every node in ``conflicts_dict`` the conflicts are ordered from
    most to least common (using ``length_of_2nd_entry`` as the sort key)
    and one CSV row per conflict is written to ``outfile``, after a single
    header row.

    Args:
        conflicts_dict: Mapping of node id to a mapping of conflict name
            to the list of conflict objects recorded under that name.
            Each conflict object is read for ``ortholog_bipart`` and
            ``alt_conflict``.
        tree: Tree passed to ``read_trees.node_finder`` to locate each
            node id.
        outfile: Writable text stream receiving the table.

    Returns:
        None. All output goes to ``outfile``.
    """
    # The input is a dictionary because that was easier to build; rows
    # need a defined order, so each node gets a sortable list of
    # [name, conflict_list] pairs.
    stats_dict = {
        node: [[name, conflict_list]
               for name, conflict_list in by_name.items()]
        for node, by_name in conflicts_dict.items()
    }

    outfile.write(
        "node_id,species_bipart,ortholog_bipart,alternative_conflicts,"
        "number_of_conflicts,percentage,genes\n")

    for node, ranked in stats_dict.items():
        node_on_tree = read_trees.node_finder(tree, node)
        node_bipart = read_trees.postorder3(node_on_tree)
        # Order all the conflicts within this node from most to least
        # common.
        ranked.sort(reverse=True, key=length_of_2nd_entry)
        # Total number of conflicts at this node, for the percentages.
        total = sum(len(entry[1]) for entry in ranked)

        for _name, conflict_list in ranked:
            how_common = len(conflict_list)
            percent = float(how_common) / total * 100
            output = [
                str(node),
                ";".join(node_bipart.bipart_proper),
                ";".join(conflict_list[0].ortholog_bipart),
                _alternative_conflicts(conflict_list),
                str(how_common),
                str(percent),
                get_gene_names(conflict_list),
            ]
            outfile.write(",".join(output) + "\n")


def _alternative_conflicts(conflict_list):
    """Return the distinct alternative conflicts of a group as one string.

    The column is left empty when the first conflict of the group has no
    alternative. Otherwise every distinct alternative is listed once, in
    first-seen order, separated by " : ". The earlier implementation
    re-added an alternative whenever it differed from *any* entry already
    listed, which produced duplicates once two distinct alternatives were
    present.
    """
    if not conflict_list[0].alt_conflict:
        return ""
    alternatives = []
    for conflict in conflict_list:
        if conflict.alt_conflict:
            joined = ";".join(sorted(conflict.alt_conflict))
            if joined not in alternatives:
                alternatives.append(joined)
    return " : ".join(alternatives)
print("--species_tree and --gene_folder are required arguments in \ this mode.") sys.exit(0) # In 'return perfect concordances' mode, we need to initialise a list # to hold the files if ret_perf_concord: perf_concords = [] # Making the species tree. tree_file = open(species_tree, "r") for line in tree_file: tree = line species_root, species_name_array = make_trees.build(tree) species_biparts = read_trees.postorder2(species_root) all_taxa = read_trees.postorder3(species_root) species_biparts.append(all_taxa) # We need these later. total_conflicts = [] total_concordances = [] # Making sure the folder name is correct. if gene_folder[-1] == '/': homologs_folder = gene_folder else: homologs_folder = gene_folder + '/' # Because we do all the work of counting conflicts inside this for loop, # we use less memory as we only handle one file at once. file_list = os.listdir(homologs_folder)