Exemple #1
0
def label_duplications(root, recursive=True):
    """Label duplication nodes with 'D'.

    A node counts as a duplication when the label lists of its first two
    children share at least one entry. For a tip child the list is just the
    tip's own label; for an internal child it is the ``bipart_proper`` list
    of ``read_trees.postorder3(child)`` (presumably the tip labels below
    that child -- confirm against read_trees).

    Args:
        root: Node exposing ``istip``, ``children`` and ``label``.
        recursive: If true, every child of ``root`` is processed too. The
            nested calls use the default, so the whole tree below ``root``
            ends up being traversed.

    Returns:
        None. ``root.label`` is overwritten in place with 'D' on
        duplication nodes; every other node keeps its current label.
    """
    if not root.istip:
        children = list(root.children)

        # Only the first two children are compared. A node with fewer than
        # two children cannot be a duplication, so it is left unlabelled
        # instead of failing on a missing second child.
        if len(children) >= 2:
            side_labels = []
            for child in children[:2]:
                if child.istip:
                    side_labels.append({child.label})
                else:
                    side_labels.append(
                        set(read_trees.postorder3(child).bipart_proper))

            # One shared label between the two sides is enough. The check
            # is symmetric, so a single disjointness test covers it.
            if not side_labels[0].isdisjoint(side_labels[1]):
                root.label = 'D'

    if recursive:
        for node in root.children:
            label_duplications(node)
Exemple #2
0
def subtrees_function(root, subtrees=None):
    """Split a tree into the largest subtrees free of internal duplications.

    Starting at ``root``, the two child sides are inspected through the
    ``bipart_proper`` lists returned by ``read_trees.postorder3``. A side
    whose list contains a repeated entry is treated as holding a downstream
    duplication and is descended into; a side without one is collected as a
    subtree.

    Args:
        root: Root node of the tree to split; its first two children are
            the sides that get examined.
        subtrees: Accumulator list used by the recursive calls. A new list
            is created when it is None.

    Returns:
        The list of collected subtree nodes. These are the nodes of the
        original tree itself, not copies.
    """
    if subtrees is None:
        subtrees = []

    kids = list(root.children)
    left, right = kids[0], kids[1]

    left_labels = read_trees.postorder3(left).bipart_proper
    right_labels = read_trees.postorder3(right).bipart_proper

    # A side holds a downstream duplication when one of its labels repeats.
    left_has_dup = len(left_labels) != len(set(left_labels))
    right_has_dup = len(right_labels) != len(set(right_labels))

    if left_has_dup and right_has_dup:
        # Both sides need splitting further down.
        subtrees_function(left, subtrees)
        subtrees_function(right, subtrees)
    elif left_has_dup:
        # The right side is clean: keep it whole and descend on the left.
        subtrees.append(right)
        subtrees_function(left, subtrees)
    elif right_has_dup:
        # Mirror case: keep the left side and descend on the right.
        subtrees.append(left)
        subtrees_function(right, subtrees)
    elif any(label in right_labels for label in left_labels):
        # Neither side has a duplication of its own, but the sides share a
        # label, so this node is the duplication: each side is a subtree.
        subtrees.append(left)
        subtrees.append(right)
    else:
        # No duplication anywhere at or below this node: keep it whole.
        subtrees.append(root)

    return subtrees
def compare_trees(species_biparts, species_name_array, subtree, extra_names, family, mode, log_name, cutoff, gene_name):
    """Compare a subtree to a species tree already split into biparts.

    Args:
        species_biparts: Biparts of the species tree, forwarded to
            ``comp_biparts``.
        species_name_array: Name list of the species tree, forwarded to
            ``comp_biparts``.
        subtree: Root node of the gene subtree to compare.
        extra_names: Names appended to the gene name list when ``family``
            is "homolog".
        family: Only the value "homolog" changes behaviour here (see
            ``extra_names``).
        mode: "n" or "s" passes the species biparts/names first and the
            subtree biparts/names second to ``comp_biparts``; "r" swaps
            the two.
        log_name: Forwarded unchanged to ``comp_biparts``.
        cutoff: Forwarded unchanged to ``comp_biparts``.
        gene_name: Forwarded unchanged to ``comp_biparts``.

    Returns:
        A ``(conflicts, concordances)`` tuple of lists holding the results
        whose ``relation`` is "conflict" and "concordant" respectively.

    Raises:
        ValueError: If ``mode`` is not one of "n", "s" or "r".
    """
    current_node = subtree
    new_names = []

    # The names found 'next door' to this subtree on the overall tree are
    # added to the gene name list (the original author left the rationale
    # as an open question). Walk up through parents labelled as
    # duplications; at the first parent that is not one, take the names
    # below the sibling of the node we came from.
    while True:
        parent = current_node.parent
        if parent is None:
            break
        make_trees.label_duplications(parent, recursive=False)
        if parent.label == "D":
            current_node = parent
            continue
        for sibling in parent.children:
            if sibling != current_node:
                # With more than one sibling only the last one's names
                # are kept, as in the original logic.
                new_names = read_trees.postorder3(sibling).bipart_proper
        break

    # Work on a copy so the list owned by the postorder3 result is not
    # mutated by the extensions below.
    gene_name_array = list(read_trees.postorder3(subtree).bipart_proper)
    gene_name_array.extend(new_names)

    # If this is a homolog then the earliest node to not have a
    # duplication must be found.
    if family == "homolog":
        gene_name_array.extend(extra_names)

    # Actually making the comparisons.
    subtree_biparts = read_trees.postorder2(subtree, subtrees=True)

    if mode in ("n", "s"):
        rels = comp_biparts(species_biparts, subtree_biparts,
                            species_name_array, gene_name_array, log_name, cutoff, mode, gene_name)
    elif mode == "r":
        rels = comp_biparts(subtree_biparts, species_biparts,
                            gene_name_array, species_name_array, log_name, cutoff, mode, gene_name)
    else:
        # Without this branch an unknown mode only surfaced later as an
        # unbound-variable error on ``rels``.
        raise ValueError(
            "mode must be 'n', 's' or 'r', got {!r}".format(mode))

    conflicts = []
    concordances = []
    for rel in rels:
        if rel.relation == "conflict":
            conflicts.append(rel)
        elif rel.relation == "concordant":
            concordances.append(rel)

    return conflicts, concordances
def conflict_stats(conflicts_dict, tree, outfile):
    """This function should take a dictionary from sort_conflicts and 
    calculate the most common conflict at each node, second-most common, 
    etc.
    """

    # We made this as a dictionary earlier because it was easier to do it
    # that way then, but now we want to put things in a defined order so we
    # need a list.
    stats_dict = {}

    for node in conflicts_dict.keys():
        stats_dict[node] = []

        for name in conflicts_dict[node].keys():
            conflict_list = conflicts_dict[node][name]
            new_list = [name, conflict_list]
            stats_dict[node].append(new_list)

    outfile.write(
        "node_id,species_bipart,ortholog_bipart,alternative_conflicts,number_of_conflicts,percentage,genes\n")

    for node in stats_dict.keys():
        # Order all the conflicts within each node from most to least
        # common.
        node_on_tree = read_trees.node_finder(tree, node)
        node_bipart = read_trees.postorder3(node_on_tree)
        stats_dict[node].sort(reverse=True, key=length_of_2nd_entry)

        # Get the total so we can calculate percentages.
        total = 0
        for conflict in stats_dict[node]:
            total += len(conflict[1])

        counter = 0
        cumulative_percent = 0

        for conflict in stats_dict[node]:
            
            how_common = len(conflict[1])
            percent = float(how_common)/total * 100

            # Write each result out to a table. Double-check this!
            output = []
            output.append(str(node))
            output.append(";".join(node_bipart.bipart_proper))
            output.append(";".join(conflict[1][0].ortholog_bipart))


            # Alternative conflicts should be included where they exist.
            if conflict[1][0].alt_conflict:
                alternatives = []
                alternatives.append(
                    ";".join(sorted(conflict[1][0].alt_conflict)))
                for i in conflict[1]:
                    include = False
                    for j in alternatives:
                        if i.alt_conflict:
                            if ";".join(sorted(i.alt_conflict)) != j:
                                include = True
                    if include:
                        alternatives.append(";".join(sorted(i.alt_conflict)))
                output.append(" : ".join(alternatives))
            else:
                output.append("")
            output.append(str(how_common))
            output.append(str(percent))
            
            #get the gene names
            gene_names_joined = ""
            gene_names_joined = get_gene_names(conflict[1])
            output.append(gene_names_joined)
            	
            string = ",".join(output) + "\n"
            outfile.write(string)

            percent = round(percent, 2)
            cumulative_percent += percent
            counter += 1
            print("--species_tree and --gene_folder are required arguments in \
this mode.")
            sys.exit(0)

        # In 'return perfect concordances' mode, we need to initialise a list
        # to hold the files
        if ret_perf_concord:
            perf_concords = []

        # Making the species tree.
        tree_file = open(species_tree, "r")
        for line in tree_file:
            tree = line
        species_root, species_name_array = make_trees.build(tree)
        species_biparts = read_trees.postorder2(species_root)
        all_taxa = read_trees.postorder3(species_root)
        species_biparts.append(all_taxa)

        # We need these later.
        total_conflicts = []
        total_concordances = []

        # Making sure the folder name is correct.
        if gene_folder[-1] == '/':
            homologs_folder = gene_folder
        else:
            homologs_folder = gene_folder + '/'

        # Because we do all the work of counting conflicts inside this for loop,
        # we use less memory as we only handle one file at once.
        file_list = os.listdir(homologs_folder)