import pickle

from ete3 import Tree


def createSubgroupsOfGenes(tree_path, pathForSubgroups):
    """Collect the set of leaf names under every internal node and pickle the list."""
    setOfGenes = []
    tree = Tree(tree_path)
    # keys = node objects, values = sets of leaf nodes under each node
    dicNodesLeaves = tree.get_cached_content()
    for node in tree.traverse("postorder"):
        setOfNames = {leaf.name for leaf in dicNodesLeaves[node]}
        # skip trivial subgroups consisting of a single leaf
        if len(setOfNames) == 1:
            continue
        setOfGenes.append(setOfNames)
    with open(pathForSubgroups, 'wb') as handle:
        pickle.dump(setOfGenes, handle)
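For reference, a small sketch of reading the pickled subgroups back (the file name is illustrative):

import pickle

with open('subgroups.pickle', 'rb') as handle:
    subgroups = pickle.load(handle)
print(len(subgroups), "non-trivial subgroups")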
Example #2
from ete3 import Tree

# args comes from the script's argparse setup (not shown in this snippet)
total_tree_count = 0

# open tree file
with open(args.tree_file, "r") as f:

    # each line is a bootstrap / MCMC tree
    for line in f:

        # update total tree count
        total_tree_count += 1

        # read in tree
        t = Tree(line)

        # cache tree content
        ## keys = node objects, values = sets of leaf names
        cache = t.get_cached_content(store_attr="name")

        # store all leaf names in a set
        all_leaves = set(leaf.name for leaf in t.iter_leaves())

        # traverse nodes in bootstrap tree
        for node in t.traverse("postorder"):

            # skip if leaf node
            if node.is_leaf():
                continue

            # define 3 sets of taxa that emerge from node
            ## taxa associated with branch one:
            children_one = cache.get(node.children[0])
            ## taxa associated with branch two:
            children_two = cache.get(node.children[1])
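As a quick illustration of what the cache above contains (a sketch with a toy tree):

from ete3 import Tree

toy = Tree("((a,b),c);", format=9)
toy_cache = toy.get_cached_content(store_attr="name")
# the root maps to all leaf names, each internal node to its own leaves
print(toy_cache[toy])              # {'a', 'b', 'c'}
print(toy_cache[toy.children[0]])  # {'a', 'b'}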
Example #3
import numpy as np
from random import shuffle

from scipy import stats
from scipy.cluster.hierarchy import linkage, cut_tree
from ete3 import Tree, TextFace, TreeStyle

# linkage_to_newick and find_cluster_roots are helper functions defined
# elsewhere in the original script


class UniFrac(object):
    """
    Note that the association between metadata and leaf nodes works via .loc:
    the nodes are named after the dataframe index, and a node's metadata is
    looked up with df_metadata.loc[node.name]
    """
    def __init__(self, datamatrix, df_metadata):
        super(UniFrac, self).__init__()

        "make sure that the dataframe index is unique"
        assert len(df_metadata) == len(
            set(df_metadata.index)
        ), 'row-index is not unique, but we need uniqueness to associate the metadata with the leaves in the tree'

        self.datamatrix = datamatrix
        self.df_metadata = df_metadata
        self.tree = None
        self._linkage = None  # kept only for the cut_tree call
        self.cluster_roots = None  # cached cluster roots, needed by unifrac_distance()
        self.nodes2leaves = None  # caches node-to-leaf lookups; note that each lookup returns a set!

    def _update_leave_metadata(self):
        "attaches the metadata in self.df_metadata as features to the tree's leaves"
        assert self.tree

        # to speed things up, query the dataframe only once
        leaves = self.tree.get_leaves()
        leavenames = [leave.name for leave in leaves]
        meta = self.df_metadata.loc[
            leavenames].values  # sorts the metadata in the same order as leavenames
        featurenames = self.df_metadata.columns.values
        for i, leaf in enumerate(leaves):
            leaf.add_features(**dict(zip(featurenames, meta[i, :])))
            #TODO: not sure if this overwrites previous features (that's the intent) or just adds new ones

    def build_tree(self, method, metric):
        """
        constructs the hierarchical clustering tree, 
        but no clustering (corresponding to some tree pruning) in here
        """
        self._linkage = linkage(self.datamatrix, method=method, metric=metric)
        # turn it into a ete tree
        leave_labels = self.df_metadata.index.values
        newick_tree = linkage_to_newick(self._linkage, labels=leave_labels)
        self.tree = Tree(newick_tree)
        self.nodes2leaves = self.tree.get_cached_content()  # easy lookup of the leaves under a node
        # populate the leaves with metadata
        self._update_leave_metadata()

    def cluster(self, n_clusters):
        """
        prunes the hierarchical clustering tree to get clusters of data
        this clustering is also added to the metadata
        also adds the self.cluster_roots (caching it, we need it in unifrac calls)
        """
        assert self.tree
        # cut_tree returns an (n_samples, 1) array; flatten it before storing
        clustering_prune = cut_tree(self._linkage, n_clusters).ravel()
        self.df_metadata['clustering'] = clustering_prune
        self._update_leave_metadata()
        self.cluster_roots = find_cluster_roots(self.tree)

        for i, cluster_root in enumerate(self.cluster_roots):
            cluster_root.add_features(
                **{
                    'is_cluster_root': i,
                    'n_datapoints': len(self.nodes2leaves[cluster_root])
                })

    def unifrac_distance(self, group1, group2, randomization=None):
        """
        calculates the UniFrac distance between the two sample groups
        group1: list of node names (i.e. indices of the metadata)
        group2: list of node names, same as group1
        randomization: (int) how many times to compute the randomized UniFrac distance to get a p-value
        """
        assert 'clustering' in self.df_metadata.columns and self.cluster_roots, "run cluster() first"

        # all_leaves = self.tree.get_leaves()  # TODO: this is a performance hog
        the_Root = self.tree.get_tree_root()
        all_leaves = self.nodes2leaves[the_Root]  # for performance reasons this is better than the line above

        # make sure all group elements are in the tree
        leaf_names = [_.name for _ in all_leaves]
        assert all([_ in leaf_names for _ in group1])
        assert all([_ in leaf_names for _ in group2])

        group1_nodes = set(_ for _ in all_leaves if _.name in group1)  # sets for faster `in` lookups
        group2_nodes = set(_ for _ in all_leaves if _.name in group2)  # TODO: replace with search_nodes?

        the_distance = self._unifrac_dist(group1_nodes, group2_nodes)

        if randomization and randomization > 0:
            G1 = len(group1_nodes)
            G2 = len(group2_nodes)
            all_nodes = list(
                group1_nodes |
                group2_nodes)  # union, but turned into a list for partitioning later
            randomized_distances = []

            for i in range(randomization):
                shuffle(all_nodes)  # inplace shuffle
                group1_nodes_random = set(all_nodes[:G1])
                group2_nodes_random = set(all_nodes[G1:])
                randomized_distances.append(
                    self._unifrac_dist(group1_nodes_random,
                                       group2_nodes_random))

            randomized_distances = np.array(randomized_distances)

            # parametric p-value (normal approximation) vs. empirical p-value
            p = 1 - stats.norm(
                loc=randomized_distances.mean(-1),
                scale=randomized_distances.std(-1)).cdf(the_distance)
            p2 = np.sum(randomized_distances > the_distance) / len(
                randomized_distances)
            # the empirical p-value (p2) is the one returned
            return the_distance, randomized_distances, p2
        else:
            return the_distance

    def _unifrac_dist(self, group1_nodes, group2_nodes):
        """
        given two sets of leaf nodes, calculate the (weighted) UniFrac distance:
        sum_i d_i * |Ai/At - Bi/Bt| / sum_i d_i * (Ai/At + Bi/Bt),
        where d_i is the distance from cluster root i to the tree root
        """
        At, Bt = len(group1_nodes), len(group2_nodes)
        nom = {}
        denom = {}
        for i, current_cluster_root in enumerate(self.cluster_roots):
            leafs = list(self.nodes2leaves[current_cluster_root])  # all the datapoints in the cluster
            Ai = len([_ for _ in leafs if _ in group1_nodes])
            Bi = len([_ for _ in leafs if _ in group2_nodes])
            distance2root = current_cluster_root.distance2root  # cached already
            nom[i] = distance2root * np.abs(Ai / At - Bi / Bt)
            denom[i] = distance2root * np.abs(Ai / At + Bi / Bt)

        n_clusters = len(nom)
        summed_nom = sum(nom[i] for i in range(n_clusters))
        summed_denom = sum(denom[i] for i in range(n_clusters))
        return summed_nom / summed_denom

    def visualize(self, group1=None, group2=None):
        import matplotlib
        import matplotlib.pyplot as plt

        # annotate the cluster roots with the group counts among their leaves
        if group1 or group2:
            group1 = group1 or []
            group2 = group2 or []
            for i, cluster_root in enumerate(self.cluster_roots):
                # count how many leaves of this cluster fall into each group
                datapoints_in_cluster = self.nodes2leaves[cluster_root]
                n_1 = sum(1 for leaf in datapoints_in_cluster if leaf.name in group1)
                n_2 = sum(1 for leaf in datapoints_in_cluster if leaf.name in group2)
                cluster_root.add_face(
                    TextFace(f"Group1: {n_1} // Group2: {n_2}"),
                    column=0,
                    position="branch-right")

        def _custom_layout(node):
            cmap_cluster = plt.cm.tab10(
                np.linspace(0, 1, len(self.cluster_roots)))
            cmap_treated = plt.cm.viridis(np.linspace(0, 1, 2))

            if node.is_leaf():
                c_cluster = matplotlib.colors.rgb2hex(
                    cmap_cluster[node.clustering, :])
                c_treat = matplotlib.colors.rgb2hex(
                    cmap_treated[node.treated, :])
                node.img_style["fgcolor"] = c_treat
                node.img_style["bgcolor"] = c_cluster

            if 'is_cluster_root' in node.features:
                c_cluster = matplotlib.colors.rgb2hex(
                    cmap_cluster[node.is_cluster_root, :])
                node.img_style["bgcolor"] = c_cluster
                node.img_style["draw_descendants"] = False
                node.add_face(TextFace(f"#data:{node.n_datapoints}"),
                              column=0,
                              position="branch-right")

        ts = TreeStyle()
        ts.mode = "r"
        ts.show_leaf_name = False
        ts.arc_start = -180  # 0 degrees = 3 o'clock (arc settings only apply in circular mode)
        ts.arc_span = 270
        ts.layout_fn = _custom_layout
        self.tree.show(tree_style=ts)
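A quick usage sketch for the class above, assuming the helpers linkage_to_newick and find_cluster_roots from the full script are available (the 'treated' column and the group split are illustrative):

import numpy as np
import pandas as pd

# toy data: 20 samples, 5 features; 'treated' is an illustrative metadata column
X = np.random.rand(20, 5)
meta = pd.DataFrame({'treated': np.random.randint(0, 2, 20)},
                    index=['sample%d' % i for i in range(20)])

uf = UniFrac(X, meta)
uf.build_tree(method='average', metric='euclidean')
uf.cluster(n_clusters=4)

group1 = meta.index[meta.treated == 0].tolist()
group2 = meta.index[meta.treated == 1].tolist()
dist, rand_dists, pval = uf.unifrac_distance(group1, group2, randomization=100)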
Example #4
    default='EN',
    help='language chosen. FR for french, EN (default) for english',
    choices=['EN', 'FR'])

args = parser.parse_args()

sys.stdout.write("\nLoading tree... \r")
sys.stdout.flush()
t = Tree(args.filename)  #read the input tree.
nbsp = len(t)  ## get nb of tips
sys.stdout.write("Loading tree... DONE [the tree has %d tips] \n" % nbsp)
sys.stdout.flush()
##
sys.stdout.write("Storing tree nodes for faster lookup... \r")
sys.stdout.flush()
node2leaves = t.get_cached_content()
sys.stdout.write("Storing tree nodes for faster lookup... DONE\n")
sys.stdout.flush()

t.x = 6.0
t.y = 9.660254 - 10.0
t.alpha = 30.0
t.ray = 30.0
t.zoomview = np.ceil(np.log2(30 / t.ray))
maxZoomView = 0

##FUNCTIONS
#getattr(t,n)


def rad(deg):
    # convert an angle from degrees to radians
    return deg * np.pi / 180.0
Example #5
        #incomplete line
        print("Incomplete processing of ancestor lineages!")
        print(ancestors_d)

try:
    ancestor = t.get_common_ancestor(ancestors_d[filename])
    #print(ancestor)
    t.set_outgroup(ancestor)
except KeyError:
    print("Root not selected!")
    print(t.get_tree_root())
    quit()
t.ladderize(direction=1)

#select scale 0-1.0 or 0-100 for support values
supportscache = t.get_cached_content(store_attr="support")
supportslist = [x.support for x in supportscache]
if max(supportslist) == 1:
    minsupport = 0.85
else:
    minsupport = 85
find_supported(t, support=minsupport) #find non-terminal nodes with high support
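find_supported is defined elsewhere in this script; a minimal sketch of what such a helper might look like, assuming it simply collects the internal nodes whose support meets the threshold:

def find_supported(tree, support):
    # hypothetical sketch: return internal nodes at or above the support threshold
    return [node for node in tree.traverse()
            if not node.is_leaf() and node.support >= support]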

    
##################
###    MAIN    ###
##################
#create a dictionary of taxon data and a list of all localities
locdata = {}
localization = set()
Example #6
def get_species_tree(biodb):

    from ete3 import Tree, TreeStyle

    server, db = manipulate_biosqldb.load_db(biodb)

    species2n_complete_genomes, species2n_draft_genomes, species2completeness = get_species_data(server,
                                                                                                 biodb)

    sql_tree = 'select tree from reference_phylogeny t1 inner join biodatabase t2 on t1.biodatabase_id=t2.biodatabase_id ' \
               ' where t2.name="%s";' % biodb

    complete_tree = Tree(server.adaptor.execute_and_fetchall(sql_tree,)[0][0])
    R = complete_tree.get_midpoint_outgroup()
    complete_tree.set_outgroup(R)

    sql = 'select distinct taxon_id,species from taxid2species_%s t1 ' \
          ' inner join species_curated_taxonomy_%s t2 on t1.species_id=t2.species_id;' % (biodb,
                                                                                          biodb)

    taxon_id2species_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))

    # replace each taxon id with its species id on the leaves
    for leaf in complete_tree.iter_leaves():
        leaf.name = str(taxon_id2species_id[str(leaf.name)])

    # assign a unique id to each unnamed node;
    # if all of a node's descendants share the same name, use that name as the node name
    n = 0
    for node in complete_tree.traverse():
        if node.name == '':
            desc_list = list(set([i.name for i in node.iter_descendants()]))
            try:
                desc_list.remove('')
            except ValueError:
                pass
            if len(desc_list) != 1:
                node.name = '%sbb' % n
            else:
                node.name = desc_list[0]
            n += 1
 
    # Collapsing nodes while traversing
    # http://etetoolkit.org/docs/latest/tutorial/tutorial_trees.html#collapsing-nodes-while-traversing-custom-is-leaf-definition
    node2labels = complete_tree.get_cached_content(store_attr="name")

    def collapsed_leaf(node):
        # treat a node as a leaf once all of its leaves carry the same name
        return len(node2labels[node]) == 1

    species_tree = Tree(complete_tree.write(is_leaf_fn=collapsed_leaf))

    for lf in species_tree.iter_leaves():

        try:
            n_complete_genomes = species2n_complete_genomes[lf.name]
        except KeyError:
            n_complete_genomes = False
        try:
            n_draft_genomes = species2n_draft_genomes[lf.name]
        except KeyError:
            n_draft_genomes = False

        if n_draft_genomes:
            c1 = round(species2completeness[lf.name][0])
            c2 = round(species2completeness[lf.name][1])
            if c1 == c2:
                completeness = "%s%%" % c1
            else:
                completeness = "%s-%s%%" % (c1, c2)

        if n_complete_genomes and n_draft_genomes:
            lf.name = "%s (%sc/%sd, %s)" % (lf.name,
                                            n_complete_genomes,
                                            n_draft_genomes,
                                            completeness)
        elif n_complete_genomes:
            lf.name = "%s (%sc)" % (lf.name, n_complete_genomes)
        elif n_draft_genomes:
            lf.name = "%s (%sd, %s)" % (lf.name, n_draft_genomes, completeness)

    return complete_tree, species_tree
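The TreeStyle import above suggests the trees are rendered downstream; a hedged usage sketch (the database name and output path are illustrative):

from ete3 import TreeStyle

complete_tree, species_tree = get_species_tree("my_biodb")  # illustrative biodb name
ts = TreeStyle()
ts.show_leaf_name = True
species_tree.render("species_tree.svg", tree_style=ts)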
Example #7
from ete3 import Tree

# ut (utilities) and OrthologyFamily are defined elsewhere in the original package

def get_inconsistent_trees(tree,
                           ali,
                           outgroups,
                           all_families,
                           sfile,
                           octr,
                           otr,
                           oal,
                           stats=None,
                           discard_sp=None):
    """
    For a given ensembl tree, check whether synteny-derived constrained topologies are consistent
    with it. If not, the corresponding constrained trees, ensembl subtrees and ensembl
    sub-alignments will be saved to file.

    Args:
        tree (str): ensembl tree in newick format
        ali (str): ensembl ali in fasta format
        outgroups (list): list of outgroup species used in the synteny-analysis
        all_families (dict of OrthologyFamily instances): for each outgroup genes (key) an
        OrthologyFamily instance (synteny-derived orthogroups and constrained tree topology)
        cfile (str): file to write name of synteny consistent subtrees
        mfile (str): file to write name of multigenic subtrees
        stats (dict, optional): dict to count the number of consistent and inconsistent trees

    """

    whole_tree = Tree(tree, format=1)

    for leaf in whole_tree.get_leaves():

        namesp = leaf.name + '_' + leaf.S
        leaf.prev_name = leaf.name
        leaf.name = namesp

    cached_whole_tree = whole_tree.get_cached_content(
        store_attr=['name', 'prev_name', 'S'])

    outgr_leaves = [
        i for i in cached_whole_tree[whole_tree] if i[2] in outgroups
    ]

    # for each outgroup gene present in the tree
    for outgr_leaf in outgr_leaves:

        #if we have a corresponding constrained tree topology
        if outgr_leaf[1] in all_families:

            ctree = all_families[outgr_leaf[1]].ctree

            cached_ctree = ctree.get_cached_content(store_attr=['name'])

            #fast way to flatten a list of tuples
            ctree_leaves = set(sum(cached_ctree[ctree], ()))

            lca = whole_tree.get_common_ancestor(ctree_leaves)

            leaves = cached_whole_tree[lca]

            leaves_in_fam = [i for i in leaves\
                             if i[1] in all_families[outgr_leaf[1]].genes_in_orthotable\
                             or i[0] in ctree_leaves]

            leavesnames_in_fam = {i[0] for i in leaves_in_fam}

            #keep all genes present in the family
            if len(ctree_leaves) < len(leaves):

                to_replace_inside = leavesnames_in_fam.difference(ctree_leaves)

                all_families[outgr_leaf[1]].update_constrained_tree(
                    to_replace_inside, lca)

            if discard_sp:
                keep = [
                    i for i in ctree_leaves
                    if i.split('_')[-1] not in discard_sp
                ]
                ctree.prune(keep)
                lca = lca.copy()
                lca.prune(keep)
                leavesnames_in_fam = set(keep)

            if len(leavesnames_in_fam) <= 2:
                sfile.write(outgr_leaf[1] + "\t" + "Too few genes" + '\n')
                stats['Too few genes'] = stats.get('Too few genes', 0) + 1
                continue

            comparison = ctree.compare(lca)

            #check if the constraint is present in the tree
            if comparison['source_edges_in_ref'] != 1:

                #check if the family is not too multigenic, in which case correction is difficult
                if not all_families[outgr_leaf[1]].is_multigenic():

                    #we make a copy in case more than one subtree is inconsistent
                    #the "newick-extended" copy is iterative (based on ete3 load/write),
                    #so we do not risk hitting the recursion limit
                    ori_tree = lca.copy("newick-extended")
                    ori_tree.prune(leavesnames_in_fam)

                    #write original subtrees
                    ori_tree.write(outfile=otr+'/'+outgr_leaf[1]+'.nh',\
                                 format=9, features=["D"])

                    #write constrained tree topology
                    ctree.write(outfile=octr + '/C_' + outgr_leaf[1] + '.nh',
                                format=9,
                                features=["D"])

                    #write corresponding sub-alignment
                    gene_species_mapping = dict(
                        (name, sp) for namesp, name, sp in leaves_in_fam)
                    seq = ut.get_subali(ali, gene_species_mapping,
                                        gene_species_mapping)
                    ut.write_fasta(seq, oal + '/' + outgr_leaf[1] + '.fa')
                    sfile.write(outgr_leaf[1] + "\t" + "Inconsistent" + '\n')
                    stats['Inconsistent'] = stats.get('Inconsistent', 0) + 1

                else:
                    sfile.write(outgr_leaf[1] + "\t" +
                                "Inconsistent_multigenic" + '\n')
                    stats['Inconsistent_multigenic'] = stats.get(
                        'Inconsistent_multigenic', 0) + 1

            else:
                sfile.write(outgr_leaf[1] + "\t" + "Consistent" + '\n')
                stats['Consistent'] = stats.get('Consistent', 0) + 1
Example #8
from skbio import TreeNode
from ete3 import Tree

tree = TreeNode.read('/home/meike/tests/Files/tree/test.nwk')

# give every internal node a unique label
i = 1
for node in tree.levelorder():
    if not node.is_tip():
        if node.name is not None and node.name != '':
            node.name = '%s:N%d' % (node.name, i)
        else:
            node.name = 'N%d' % i
        i += 1

t2 = tree.write('/home/meike/tests/Files/tree/test_with_ids.nwk')

# reload the relabeled tree with ete3 (assumed source of t below)
t = Tree('/home/meike/tests/Files/tree/test_with_ids.nwk', format=1)

# We create a cache with every node content
node2labels = t.get_cached_content(store_attr="name")

# a node is collapsed once all of its leaves carry the same name (cf. Example #6)
def collapsed_leaf(node):
    return len(node2labels[node]) == 1

collapsed_nodes = []
for node in t.traverse():
    if collapsed_leaf(node):
        collapsed_nodes.append(node.name)

with open('/home/meike/tests/Files/tree/collapsing.txt', 'w') as f:
    f.write('COLLAPSE\nDATA\n')
    for name in collapsed_nodes:
        f.write(name + '\n')

#t.show()
#t.write(is_leaf_fn=collapsed_leaf)

# We can even load the collapsed version as a new tree
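Following the same pattern as Example #6, the collapsed version could then be loaded as a new tree (a one-line sketch):

collapsed_tree = Tree(t.write(is_leaf_fn=collapsed_leaf))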
Example #9
        values = col[col.nonzero()].data[0]
        counter = Counter(values)
        if counter:
            most_common = float(counter.most_common()[0][1])
            con.append(most_common / len(values))
    con = np.array(con)
    return con.mean(), gappyness


import sys
from collections import Counter

import numpy as np
from ete3 import Tree

# load alignment and tree (load_alg is defined earlier in the full script)
alg, alg_index = load_alg(sys.argv[2])
name2algindex = {name: i for i, name in enumerate(alg_index)}
tree = Tree(sys.argv[1])
tree.set_outgroup(tree.get_midpoint_outgroup())

print(tree)
tree.show()

# Creates a node to sequence cache
for leaf in tree:
    leaf.seqindex = name2algindex[leaf.name]

n2seqs = tree.get_cached_content(store_attr="seqindex")

# Iters each internal node in the tree and calculate sub-alg quality
for n in tree.traverse("level_order"):
    if n.children:
        con = alg_conservation(alg, list(n2seqs[n]))
        print(n, con)
        input()
Example #10
    for r in SeqIO.parse(fname, format="fasta"):
        index.append(r.id)
        alg.append(seq2vector(r.seq))

    named_index = {name: i for i, name in enumerate(index)}
    return np.array(alg), named_index, index


import sys
from collections import Counter

import numpy as np
from Bio import SeqIO
from ete3 import Tree

# seq2vector (used above) is defined earlier in the full script
tree_file = sys.argv[1]
alg_file = sys.argv[2]
thr = float(sys.argv[3])

alg, index, i2name = load_alg(alg_file)
tree = Tree(tree_file)
tree.set_outgroup(tree.get_midpoint_outgroup())
node2content = tree.get_cached_content(store_attr="name")

for n in tree.traverse("levelorder"):
    if n.children:
        ch1 = n.children[0]
        ch2 = n.children[1]

        leaves_left = [index[name] for name in node2content[ch1]]
        leaves_right = [index[name] for name in node2content[ch2]]
        if len(leaves_left) < 3 or len(leaves_right) < 3:
            continue

        rows, cols = alg[leaves_left, :].nonzero()
        colres_left = Counter(cols)
        cols_left = set([c for c, count in colres_left.items()
                         if count >= 2])  # alternative threshold: >= 0.1 * len(leaves_left)
Example #11
import sys
import random
import itertools

from ete3 import Tree

# ut, org, tag_duplicated_species and get_genes_positions are helpers from the original package

def orthologies_with_outgroup(forest, duplicated_sp, outgroup, dict_genes, out):

    """
    Browses a gene tree forest and searches for orthologs with the outgroup.
    Writes genes without phylogenetic orthologs to a file.
    Also writes files of high-confidence orthologs and paralogs, used to optimize the synteny
    support threshold for calling orthology.

    Args:
        forest (str): Name of the gene trees forest file
        duplicated_sp (list of str): List of all duplicated species for the considered WGD
        outgroup (str): Non-duplicated outgroup
        dict_genes (dict of GeneSpeciesPosition tuples): All gene positions for each species
        out (str): Output file to write genes without phylogenetic orthologs

    Returns:
        dict: Orthologs of outgroup genes in each duplicated species

    Note (FIXME): Written to work within scorpios as orthologs and paralogs file names are derived
                  from output file patterns, assuming it contains an '_'.

    """

    ortho = {e: {} for e in duplicated_sp}

    orthofile = out.replace(out.split("/")[-1].split('_')[0], "orthologs")
    parafile = out.replace(out.split("/")[-1].split('_')[0], "paralogs")

    with open(out, 'w') as outfile, open(forest, 'r') as infile, open(parafile, 'w') as out_para,\
         open(orthofile, 'w') as out_ortho:

        sys.stderr.write("Browsing gene trees for orthologies with the outgroup...\n")

        for tree in ut.read_multiple_objects(infile):

            #load tree
            tree = Tree(tree.strip(), format=1)
            node2leaves = tree.get_cached_content()
            leaves = tree.get_leaves()

            #add a tag to genes of duplicated species
            tag_duplicated_species(leaves, duplicated_sp)

            #find all clades with only genes of duplicated species
            subtrees = tree.get_monophyletic(values=["Y"], target_attr="duplicated")

            #find all outgroup genes
            outgroup_genes = [i for i in leaves if i.S == outgroup]

            #search for an ortholog gene in the outgroup for all clades of teleost genes
            for subtree in subtrees:

                seen = {}
                subtree_leaves = subtree.get_leaves()
                found = False

                #browse all outgroup genes
                for j in outgroup_genes:

                    #find the node that splits the outgroup gene and duplicated species genes
                    lca = tree.get_common_ancestor(subtree, j)
                    topo_distance = len(node2leaves[lca])

                    # if it is a speciation or dubious duplication node --> speciation
                    if org.is_speciation(lca):
                        branch_distance = tree.get_distance(subtree, j)
                        if subtree not in seen:
                            seen[subtree] = []
                        seen[subtree].append((topo_distance, branch_distance, j))
                        found = True

                # if no 'true' ortholog
                # check if all descendants include only outgroup + duplicated species
                if not found:
                    for j in outgroup_genes:
                        lca = tree.get_common_ancestor(subtree, j)

                        for gene in lca.get_leaves():
                            if gene.duplicated != "Y" and gene.S != outgroup:
                                break

                        #if no break, it means all descendants are outgroup or dup.
                        else:
                            topo_distance = len(node2leaves[lca])
                            branch_distance = tree.get_distance(subtree, j)
                            seen[subtree] = seen.get(subtree, [])
                            seen[subtree].append((topo_distance, branch_distance, j))


                # if an ortholog was found, add it to the orthology dict
                if seen:
                    content = []
                    seen[subtree].sort(key=lambda x: (x[0], x[1]))
                    outgroup_gene = seen[subtree][0]
                    outgroup_gene = outgroup_gene[2].name
                    for species in duplicated_sp:
                        genes = [i.name for i in subtree_leaves if i.S == species]
                        genes = get_genes_positions(genes, species, dict_genes)

                        ortho[species][outgroup_gene] = ortho[species].get(outgroup_gene, [])
                        ortho[species][outgroup_gene] += genes


                        content += [g.name+'_'+species.replace(' ', '.')+\
                                         '|'+str(g.chromosome)+\
                                         '|'+str(g.index) for g in genes]

                    all_ortho = [i[2].name for i in seen[subtree]]
                    paralogs = [i.name for i in outgroup_genes if i.name not in all_ortho]

                    if paralogs:
                        paralog = random.choice(paralogs)

                        if paralog in dict_genes[outgroup]\
                           and outgroup_gene in dict_genes[outgroup]:

                            tmp_dict = dict_genes[outgroup]

                            out_ortho.write(' '.join(content)+'\t')
                            out_ortho.write(str(outgroup_gene)+'|'+\
                                            str(tmp_dict[outgroup_gene].chromosome)+'|'+\
                                            str(tmp_dict[outgroup_gene].index)+'|'+str(0)+'|'+\
                                            str(0)+'\n')

                            out_para.write(' '.join(content)+'\t')
                            out_para.write(str(paralog)+'|'+\
                                           str(tmp_dict[paralog].chromosome)+'|'+\
                                           str(tmp_dict[paralog].index)+'|'+\
                                           str(0)+'|'+str(0)+'\n')

                # if no ortholog found
                # write genes without ortholog along with all outgroup genes in tree
                # (potential candidate for orthology)
                elif any(i.name in dict_genes[outgroup] for i in outgroup_genes):

                    #genes without orthologs
                    missed_genes = []
                    for species in duplicated_sp:
                        genes = [i.name for i in subtree_leaves if i.S == species]
                        genes = get_genes_positions(genes, species, dict_genes)
                        missed_genes += [g.name+'_'+species.replace(' ', '.')+\
                                         '|'+str(g.chromosome)+\
                                         '|'+str(g.index) for g in genes]

                    if missed_genes:
                        outfile.write(' '.join(missed_genes)+'\t')

                        #candidate orthologs in the outgroup
                        outgr_genes = [i.name for i in outgroup_genes]

                        in_paralogs = []
                        for pair in itertools.combinations(outgr_genes, 2):
                            if tree.get_distance(pair[0], pair[1], topology_only=True) == 1:
                                in_paralogs.append(pair[0]+'|'+pair[1])

                        outgr_write = []
                        genome = dict_genes[outgroup]
                        for gene in outgr_genes:
                            if gene in genome:

                                lca = tree.get_common_ancestor(subtree, gene)
                                branch_distance = tree.get_distance(subtree, gene)
                                topo_distance = len(node2leaves[lca])

                                outgr_write.append(str(gene)+'|'+str(genome[gene].chromosome)+'|'+\
                                                   str(genome[gene].index)+'|'+str(topo_distance)+\
                                                   '|'+str(branch_distance))

                        outfile.write(' '.join(outgr_write)+'\t'+' '.join(in_paralogs)+'\n')

    sys.stderr.write("Phylogenetic orthologies with the outgroup OK\n")

    return ortho
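tag_duplicated_species is an external helper; judging from how the .duplicated leaf attribute is used above, a minimal sketch might look like this:

def tag_duplicated_species(leaves, duplicated_sp):
    # hypothetical sketch: mark each leaf as belonging to a duplicated species ("Y") or not ("N")
    for leaf in leaves:
        leaf.add_features(duplicated="Y" if leaf.S in duplicated_sp else "N")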
Example #12
import numpy as np
from scipy import stats
from ete3 import Tree

# nodes_eval and erase_node are helper functions defined elsewhere in the original script

def Tree_analysis(tree, tabla, out, analysis_type, out2):

	### All subsequent variables can be modified
	binomial_value = float(0.05) #default value for option 2 (binomial) of the core evaluation method
	p_value = float(0.05) #p-value threshold for the binomial method (method 2)
	percentage = float(0.9) #minimum percentage of subjects required to define a core
	taxo_p = float(0.9) #minimum percentage of the same taxonomic group among all OTUs in the same node
	
	output_file = open(out, 'w')
	output_file_2 = open(out2, 'w')

	tree = Tree(tree, quoted_node_names=True, format=1) #Here we load the 97_otus tree
	table = {}
	cont = 1
	for line in open(tabla):
		if (line.startswith('#')):
			output_file_2.write(str(line))
		else:
			fields = list(map(str.strip, line.split('\t'))) #We create a dictionary with all the keys and values of the OTU table against reference
			table[fields[0]] = list(map(float, fields[1:-1]))
	
	table2 = {}
	
	for line in open(tabla):
		if (line.startswith('#')):
			continue
		else:
			fields2 = list(map(str.strip, line.split('\t'))) #Here we load a dictionary with the taxonomy information from the picked OTUs
			table2[fields2[0]] = list(map(str, fields2[(len(fields2)-1):len(fields2)]))
	
	table_final_res = [0.0] * len(fields[1:-1])
	sum_abun_rela = 0
	cores = 0
	
	for leaf in tree:
		if leaf.name not in table:
			leaf.vector = None
		else:
			leaf.vector = table[leaf.name] #Create value vectors for each of the tree tips of the tree with the values of the OTU table previously generated

	node2content = tree.get_cached_content()

	flag = 0
	for node in tree.traverse(): #this loop adds the summed leaf vectors to the internal nodes
		if not node.is_leaf():

			leaf_vectors = np.array([leaf.vector for leaf in node2content[node] if leaf.vector is not None])
			node.vector = leaf_vectors.sum(axis=0)

			if flag == 0:
				save_node1 = node.vector #vector of the first traversed (root) node, used for relative abundances
				total_saved_leaves = np.array([leaf.name for leaf in node2content[node]])
				flag = 1
	
	if analysis_type == 4: #this method only prints the tree information, for inspection purposes
		print(tree.get_ascii(show_internal=True))
		output_file.write(tree.get_ascii(show_internal=True) + '\n' + '\n')
		for node in tree.traverse("preorder"):
			print (node.name, node.vector)
			output_file.write(node.name + '\t' + str(node.vector) + '\n')

	if analysis_type != 4:
		output_file.write("Core" + '\t' + "Prevalence" + '\t' + "Abundance" + '\t' + "Relative abundances" + '\t' + "Min" + '\t' + "Max" + '\t' + "Average" + '\t' + "SD" + '\t' + "Leaves" + '\t' + "Taxonomy" + '\t' + "Leaves number" + '\n') 
	
	
	if analysis_type in (1, 2, 3): #evaluate the tree by traversal using the chosen method: 100% core, binomial or percentage
		for node in tree.traverse("postorder"):
		
			tot_cont = np.count_nonzero(node.vector) #number of subjects with one or more occurrences in this node's vector
			tot_cont2 = np.asarray(node.vector).size #total size of the vector array
			a = stats.binom_test(tot_cont, n=tot_cont2, p=binomial_value, alternative='greater') #binomial test using binomial_value
			rela = (tot_cont / tot_cont2)
			
			if (analysis_type == 1 and np.all(node.vector)) or (analysis_type == 2 and a <= p_value) or (analysis_type == 3 and rela >= percentage): #depending on the method, decide whether this node counts as a core
				
				node.vector=([float(i) for i in node.vector]) #Transform all the values contained in node.vector to float, to perform operations efficiently 
				abundance=node.vector/save_node1 #Relative abundance of each subject in the node over the terminal node (sum of all nodes)
				abundance =([float(i) for i in abundance]) 
				mean_abun=np.mean([float(i) for i in abundance]) #Mean abundance of the node
				std_abun=np.std([float(i) for i in abundance]) #Standard deviation of the node
				abundance_rela=sum(node.vector)/sum(save_node1) #Global relative abundance of the node over the terminal node
				table_final_res=list(map(sum, zip(table_final_res, abundance))) #Getting all the results for each node into a final result table
				sum_abun_rela=sum_abun_rela+abundance_rela #The sum of all global relative abundance
				cores=cores+1 #Total number of cores
				
				node2content = tree.get_cached_content()
				
				output_file_2.write(str(node.name) + '\t')
				for x in range(len(abundance)):
					output_file_2.write(str(abundance[x]) + '\t')
				output_file_2.write('\n')
				
				output_file.write(node.name + '\t' +  str(rela) + '\t' + str(node.vector) + '\t' + str(abundance) + '\t' + str(min(abundance)) + '\t' + str(max(abundance)) + '\t' + str(mean_abun) + '\t' + str(std_abun) + '\t')
								
				conteo_hojas = nodes_eval(node, tree, output_file, table2, taxo_p, total_saved_leaves) #assign a taxonomy to the node from the taxonomy of its OTUs, using the minimum taxonomy percentage set above
				
				output_file.write(str(conteo_hojas) + '\n') #Print the total number of leaves of this node
				
				tree = erase_node(node, tree) #once a node has been evaluated, erase it from the tree to simplify calculations for the following nodes
			
				G = tree.search_nodes(name=node.name)[0]
				removed_node = G.detach()
						
		output_file.write(str(cores) + '\t' + '\t' + '\t' + str(table_final_res) + '\t' + str(min(table_final_res)) + '\t' + str(max(table_final_res)) + '\t' + str(np.mean([float(i) for i in table_final_res])) + '\t' + str(np.std([float(i) for i in table_final_res])) + '\n')
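A hedged usage sketch for Tree_analysis (all file names are placeholders):

# illustrative call: method 3 defines cores via the 90% prevalence threshold
Tree_analysis('97_otus.tree', 'otu_table.txt', 'core_report.txt', 3, 'core_abundances.txt')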