Example #1
0
def get_example_tree():
    """Build a small demo tree whose main clades have coloured backgrounds.

    All branch lengths are set to zero.  For each group of leaves listed
    below, the node returned by ``get_common_ancestor`` gets its own
    background colour; the ``b3``/``b4`` clade sits inside the ``b1``-``b4``
    clade and is coloured separately.

    Returns:
        tuple: ``(t, ts)`` -- the tree and a ``TreeStyle`` with ``mode = "c"``,
        the module-level ``layout`` function and default leaf names hidden.
    """
    t = Tree("((((a1,a2),a3), ((b1,b2),(b3,b4))), ((c1,c2),c3));")
    for node in t.traverse():
        node.dist = 0

    # (leaves defining the clade, background colour of that clade)
    clade_colours = (
        (("a1", "a2", "a3"), "LightSteelBlue"),
        (("b1", "b2", "b3", "b4"), "Moccasin"),
        (("c1", "c2", "c3"), "DarkSeaGreen"),
        (("b3", "b4"), "Khaki"),
    )
    for leaf_names, colour in clade_colours:
        style = NodeStyle()
        style["bgcolor"] = colour
        t.get_common_ancestor(*leaf_names).set_style(style)

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.mode = "c"
    ts.root_opening_factor = 1
    return t, ts
Example #2
0
def get_example_tree():
    """Return a demo tree with four shaded clades and its tree style.

    Branch lengths are zeroed on every node, then the common ancestor of each
    leaf group is given a distinct background colour.

    Returns:
        tuple: ``(t, ts)`` where ``t`` is the styled tree and ``ts`` is a
        ``TreeStyle`` using ``mode = "c"`` and the module-level ``layout``.
    """

    def shaded(colour):
        """Create a node style that only sets the background colour."""
        style = NodeStyle()
        style["bgcolor"] = colour
        return style

    t = Tree("((((a1,a2),a3), ((b1,b2),(b3,b4))), ((c1,c2),c3));")
    for node in t.traverse():
        node.dist = 0

    t.get_common_ancestor("a1", "a2", "a3").set_style(shaded("LightSteelBlue"))
    t.get_common_ancestor("b1", "b2", "b3", "b4").set_style(shaded("Moccasin"))
    t.get_common_ancestor("c1", "c2", "c3").set_style(shaded("DarkSeaGreen"))
    # Nested inside the b1-b4 clade, so it is coloured on top of it.
    t.get_common_ancestor("b3", "b4").set_style(shaded("Khaki"))

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.mode = "c"
    ts.root_opening_factor = 1
    return t, ts
Example #3
0
def get_example_tree():
    """Build a demo tree with emoji leaf names plus a ``TreeStyle`` for it.

    Returns:
        tuple: ``(t, ts)`` -- the tree and its style (``mode = "c"``, the
        module-level ``layout`` function, default leaf names hidden).

    NOTE(review): the clade lookups below use the leaf names ``a1`` ... ``c3``,
    none of which occur in the emoji Newick string.  They look copied from a
    different example and presumably fail at runtime -- confirm, and replace
    them with leaf names that exist in this tree.
    """

    # One background-colour style per clade that should be highlighted
    nst1 = NodeStyle()
    nst1["bgcolor"] = "LightSteelBlue"
    nst2 = NodeStyle()
    nst2["bgcolor"] = "Moccasin"
    nst3 = NodeStyle()
    nst3["bgcolor"] = "DarkSeaGreen"
    nst4 = NodeStyle()
    nst4["bgcolor"] = "Khaki"


    t = Tree("( 🌲,( 🥑,(( 🌷, ( 🌴, ( 🍌, ( 🍍, ( 🌽, ( 🎋, 🌾 )))))),(( 🍇, ((( 🥜, ☘️), ((( 🌹, 🍓 ), (( 🍎, 🍐 ), ( 🍑, (🌸, 🍒) ))), ( 🌰, ( 🎃, ( 🍉, ( 🥒, 🍈)))))), (( 🌺, 🥦 ), (( 🍊, 🍋 ), ( 🍁, 🥭))))),( 🌵, ( 🥝, (( 🍠, ( 🌶️, (🍆, ( 🥔, 🍅)))), ( 🥕,( 🥬, ( 🌻, 🌼)))))))));")
    # Zero every branch length so only the topology is drawn
    for n in t.traverse():
        n.dist = 0

    # Colour the common ancestor of each leaf group (see NOTE in the docstring)
    n1 = t.get_common_ancestor("a1", "a2", "a3")
    n1.set_style(nst1)
    n2 = t.get_common_ancestor("b1", "b2", "b3", "b4")
    n2.set_style(nst2)
    n3 = t.get_common_ancestor("c1", "c2", "c3")
    n3.set_style(nst3)
    n4 = t.get_common_ancestor("b3", "b4")
    n4.set_style(nst4)
    ts = TreeStyle()
    ts.layout_fn = layout
    ts.show_leaf_name = False

    ts.mode = "c"
    ts.root_opening_factor = 1
    return t, ts
Example #4
0
def smart_reroot(treefile, outgroupfile, outfile, format=0):
    """
    Reroot a Newick format tree using ete2.

    The outgroup file lists candidate outgroup name prefixes, one per line.
    Lines are tried in file order; the first prefix that matches at least one
    leaf name wins, and every leaf starting with that prefix becomes part of
    the outgroup.  Blank lines are ignored (an empty prefix would otherwise
    match every leaf in the tree).

    Tree reading format options see here:
    http://packages.python.org/ete2/tutorial/tutorial_trees.html#reading-newick-trees

    Args:
        treefile: Newick tree (anything accepted by ``Tree``).
        outgroupfile: file with one outgroup prefix per line.
        outfile: path the rerooted tree is written to.
        format: ete Newick format code used for reading and writing.

    Returns:
        ``outfile`` when the tree was rerooted, or ``treefile`` unchanged when
        no prefix matched any leaf.
    """
    tree = Tree(treefile, format=format)
    leaves = [t.name for t in tree.get_leaves()][::-1]
    outgroup = []
    for o in must_open(outgroupfile):
        o = o.strip()
        if not o:
            # An empty prefix matches every leaf name; skip blank lines.
            continue
        outgroup = [leaf for leaf in leaves if leaf.startswith(o)]
        if outgroup:
            break

    if not outgroup:
        print("Outgroup not found. Tree {0} cannot be rerooted.".format(treefile), file=sys.stderr)
        return treefile

    try:
        tree.set_outgroup(tree.get_common_ancestor(*outgroup))
    except ValueError:
        # Rooting on the common ancestor was rejected; fall back to rooting
        # on the first matching leaf alone.
        tree.set_outgroup(outgroup[0])
    tree.write(outfile=outfile, format=format)

    logging.debug("Rerooted tree printed to {0}".format(outfile))
    return outfile
def _write_model_with_tree(path, header_lines, tree):
    """Write the model header lines followed by a ``TREE:`` line for ``tree``."""
    with open(path, 'w') as outf:
        outf.writelines(header_lines)
        outf.write('TREE: ' + tree.write(format=1) + '\n')


def rename_model(target, model, accelerated_genomes):
    """Iteratively rename each ancestor of accelerated_genomes, walking down the tree to each ancestor.

    This is a generator.  The last line of ``model`` must contain the tree
    after a ``'TREE: '`` marker.  For the common ancestor of the accelerated
    genomes and every internal node below it, the node is temporarily named,
    the model is rewritten to ``new_model`` with that tree, and
    ``(node_name, new_model)`` is yielded.  After the consumer resumes the
    generator the node name is reset to ``'1'``, so the same output path is
    overwritten on every step and must be used before advancing.

    Args:
        target: object providing ``getGlobalTempDir()`` for the output file.
        model: path of the input model file.
        accelerated_genomes: genome names; those missing from the tree are
            dropped.

    Yields:
        tuple: ``(name, new_model)``.  With a single remaining genome the
        unmodified tree is written once and that genome's name is yielded.
    """
    new_model = os.path.join(target.getGlobalTempDir(),
                             'region_specific_conserved_subtree.mod')
    # Read the model once and close the handle instead of leaking it.
    with open(model) as inf:
        lines = inf.readlines()
    header_lines = lines[:-1]
    t = Tree(lines[-1].split('TREE: ')[1], format=1)
    # this model may not have all of the genomes, if they were not aligned in this region
    accelerated_genomes = list(
        set(t.get_leaf_names()) & set(accelerated_genomes))
    if len(accelerated_genomes) > 1:
        anc = t.get_common_ancestor(accelerated_genomes)
        # Set membership keeps the internal-node filter linear.
        leaves = set(anc.get_leaves())
        internal_nodes = [x for x in anc.get_descendants() if x not in leaves]
        for n in [anc] + internal_nodes:
            # Children still named '1' are unnamed placeholders; ignore them.
            oldest_name = [x.name for x in n.get_children() if x.name != '1']
            if len(oldest_name) == 1:
                n.name = oldest_name[0] + '_Anc'
            else:
                n.name = '_'.join(oldest_name)
            _write_model_with_tree(new_model, header_lines, t)
            yield n.name, new_model
            # Restore the placeholder name before moving to the next node.
            n.name = '1'
    else:  # only one accelerated genome here -- get common ancestor above will return root node
        _write_model_with_tree(new_model, header_lines, t)
        yield accelerated_genomes[0], new_model
Example #6
0
 def remove_outgroups(self, ognames, remove=False):
     """Reroot every stored tree on the outgroup and optionally remove it.

     Each Newick string in ``self.trees`` is parsed, rerooted and written
     back in place.  With a single outgroup name the tree is rooted on that
     leaf; with several it is rooted on their common ancestor unless that
     ancestor is already the root.

     Args:
         ognames: list of outgroup taxon names.
         remove: when true, the outgroup names are removed from
             ``self.taxa_order`` (``self.numtaxa`` is updated) and every
             tree is pruned to the remaining taxa.

     On ``ValueError`` the error is printed and the process exits with a
     non-zero status so that callers and shells can detect the failure.
     """
     self.reroot = False
     try:
         if remove:
             for og in ognames:
                 self.taxa_order.remove(og)
             self.numtaxa = len(self.taxa_order)
         for i, newick in enumerate(self.trees):
             t = Tree(newick)
             if len(ognames) < 2:
                 t.set_outgroup(ognames[0])
             else:
                 ancestor = t.get_common_ancestor(ognames)
                 # Skip rooting when the ancestor is the tree itself.
                 if not t == ancestor:
                     t.set_outgroup(ancestor)
             if remove:
                 t.prune(self.taxa_order, preserve_branch_length=True)
             self.trees[i] = t.write()
     except ValueError as e:
         print(e)
         print(
             "\n Somthing is wrong with the input outgroup names \n Quiting ..."
         )
         # A bare sys.exit() would report success (status 0) after a failure.
         sys.exit(1)
Example #7
0
def make_newick_tree(marker_fasta, tree_outfile, reference):
    """Align a marker FASTA with muscle, build a tree with clustalw and copy it out.

    Intermediate files are written next to ``marker_fasta`` using its name
    without the last extension: ``.aln`` (alignment), ``.ph`` (clustalw tree)
    and ``.new`` (tree with negative branch lengths replaced by ``0.0``).

    Args:
        marker_fasta: path of the input FASTA file.
        tree_outfile: destination path for the final Newick tree.
        reference: reference accession; for ``'NC_000913'`` the tree is
            additionally rooted on the common ancestor of two fixed taxa.

    Raises:
        subprocess.CalledProcessError: if muscle or clustalw exits non-zero.
        OSError: if either program cannot be started.
    """
    # Local imports keep this block self-contained.
    import re
    import subprocess

    stem = '.'.join(marker_fasta.split('.')[:-1])
    temp_align = stem + '.aln'
    # clustalw names its tree after the alignment file, with a .ph extension
    temp_tree = stem + '.ph'
    modify_tree = stem + '.new'

    # Argument lists avoid the shell, so paths containing spaces or shell
    # metacharacters are passed through unchanged, and failures are reported.
    subprocess.check_call(["muscle", "-in", marker_fasta, "-out", temp_align])
    subprocess.check_call(["clustalw", "-infile=" + temp_align, "-tree=1"])
    print(temp_tree)
    print("modifying")

    # Replace negative branch lengths with 0.0 (previously done with sed).
    with open(temp_tree) as src:
        newick = src.read()
    with open(modify_tree, "w") as dst:
        dst.write(re.sub(r":-[0-9.]+", ":0.0", newick))

    if reference == 'NC_000913':
        t = Tree(modify_tree)
        ancestor = t.get_common_ancestor("Campylobacter_jejuni_NC_002163", "Nitrosomonas_europaea_NC_004757")
        t.set_outgroup(ancestor)
        t.write(format=1, outfile=modify_tree)

    # move the created tree file to the requested location
    shutil.copy(modify_tree, tree_outfile)
Example #8
0
def root_tree(tree_path, species_path, output_name):
    """Root a tree on the common ancestor of the listed species and save it.

    Args:
        tree_path: Newick tree passed to ``Tree``.
        species_path: text file with one species name per line.
        output_name: path the rooted tree is written to (format 1).
    """
    with open(species_path, 'r') as handle:
        outgroup_species = handle.read().splitlines()

    tree = Tree(tree_path)
    tree.set_outgroup(tree.get_common_ancestor(outgroup_species))
    tree.write(outfile=output_name, format=1)
def main(argv):
    """Highlight the branches linking selected leaves and render the tree to PNG.

    For every requested leaf, each node on the way up to (but excluding) the
    leaves' common ancestor gets a red, 2-pixel horizontal line style.

    Args:
        argv: sequence where ``argv[1]`` is the Newick tree given to ``Tree``,
            ``argv[2]`` is a comma-separated list of leaf names (parentheses,
            single quotes and spaces are stripped, so ``"('A', 'B')"`` works)
            and ``argv[3]`` is an integer flag: ``1`` renders to
            ``Tree1.png``, any other value to ``Tree2.png``.

    Returns:
        bool: always ``True``.
    """
    highlight = NodeStyle()
    highlight["fgcolor"] = "#0f0f0f"
    highlight["size"] = 0
    highlight["hz_line_color"] = "#ff0000"
    highlight["hz_line_width"] = 2

    tree1 = Tree(str(argv[1]))
    save = int(argv[3])

    # Strip tuple-literal punctuation so only the bare names remain.
    leafs = argv[2]
    for unwanted in ("(", ")", "'", " "):
        leafs = leafs.replace(unwanted, "")
    q = leafs.split(',')

    se = tree1.get_common_ancestor(q)

    for n in q:
        print(n)
        node = tree1 & n
        # Style every node up to the direct child of the common ancestor.
        while node.up != se:
            node.img_style = highlight
            node = node.up
        node.img_style = highlight

    ts = TreeStyle()
    ts.show_leaf_name = True

    if save == 1:
        out_png = "crud/static/crud/Tree1.png"
    else:
        out_png = "crud/static/crud/Tree2.png"
    # Remove any stale image before rendering a fresh one.
    if os.path.exists(out_png):
        os.remove(out_png)
    tree1.render(out_png, tree_style=ts)

    return True
Example #10
0
def main(tree_path, species_path):
    """Return the common-ancestor node of the species listed in a file.

    Args:
        tree_path: Newick tree passed to ``Tree`` (format 1).
        species_path: text file with one species name per line.

    Returns:
        The node returned by ``Tree.get_common_ancestor`` for those species.
    """
    tree = Tree(tree_path, format=1)

    with open(species_path, 'r') as handle:
        species = handle.read().splitlines()

    # first internal node grouping all given species
    return tree.get_common_ancestor(species)
Example #11
0
def add_section_annotations(tree: Tree) -> None:
    """Shade and label each taxonomic section on the tree.

    Strains matching the tree's leaf names are grouped by section name
    (strains whose species epithet contains "FP" are skipped).  For each
    section the common ancestor of its leaves -- or the single leaf itself --
    gets a background colour, cycling through a fixed palette.  A text label
    with the section name is attached to the node found for the first id of
    the section, so its exact position depends on query order.

    Relies on accurate section annotation: a strain assigned to the wrong
    section widens that section's common ancestor.
    """
    palette = (
        "LightSteelBlue",
        "Moccasin",
        "DarkSeaGreen",
        "Khaki",
        "LightSalmon",
        "Turquoise",
        "Thistle",
    )

    sections = defaultdict(list)
    matching_strains = session.query(Strain).filter(
        Strain.id.in_(tree.get_leaf_names()))
    for strain in matching_strains:
        if "FP" not in strain.species.epithet:
            sections[strain.species.section.name].append(str(strain.id))

    for position, (section, ids) in enumerate(sections.items()):
        style = NodeStyle()
        # Wrap around when there are more sections than colours.
        style["bgcolor"] = palette[position % len(palette)]

        if len(ids) > 1:
            tree.get_common_ancestor(*ids).set_style(style)
            label_node = tree.search_nodes(name=ids[0])[0]
        else:
            # A one-strain section styles and labels the same node.
            label_node = tree.search_nodes(name=ids[0])[0]
            label_node.set_style(style)

        label = faces.TextFace(section, fsize=20)
        label_node.add_face(label, column=1, position="aligned")
def prune_species_tree(gene_tree,
                       cached_species_tree=None,
                       keep_polytomies=False):
    """Return the species tree restricted to the species found in a gene tree.

    Gene-tree leaf names are expected to look like ``speciesID_ProteinName``.
    The digits of each species prefix are used as the key into the species
    tree; after pruning, leaves are renamed back to the full species prefix.

    Args:
        gene_tree: Newick gene tree passed to ``Tree``.
        cached_species_tree: already-loaded species tree; when falsy the tree
            is loaded from ``EGGNOGv4_SPECIES_TREE``.  The cached tree is not
            modified -- pruning happens on a copy of the common ancestor.
        keep_polytomies: when false, polytomies are resolved recursively.

    Returns:
        str: Newick string (format 5) of the pruned species tree.
    """
    gene_leaf_names = Tree(gene_tree).get_leaf_names()

    # numeric species id -> species prefix as written in the gene tree
    species_ids = {}
    for prefix in {name.split('_')[0] for name in gene_leaf_names}:
        numeric_id = ''.join(ch for ch in prefix if ch.isdigit())
        species_ids[numeric_id] = prefix

    # big species tree
    species_tree = (cached_species_tree if cached_species_tree
                    else Tree(EGGNOGv4_SPECIES_TREE))

    # work on a copy of the subtree spanning all wanted species
    subtree = species_tree.get_common_ancestor(list(species_ids.keys())).copy()

    # delete every leaf that is not one of the wanted species
    leaf_by_name = {leaf.name: leaf for leaf in subtree.get_leaves()}
    for unwanted in leaf_by_name.keys() - species_ids.keys():
        leaf_by_name[unwanted].delete()
    assert (len(subtree.get_leaf_names()) == len(species_ids))

    # binarize
    if not keep_polytomies:
        subtree.resolve_polytomy(recursive=True)

    # restore the names used in the gene tree
    for leaf in subtree.get_leaves():
        leaf.name = species_ids[leaf.name]

    return subtree.write(format=5)
def open_tsv_population_size(tree_file, tsv_file):
    """Attach population sizes from a TSV table to tree nodes.

    Each row of the header-less, tab-separated table must have five columns;
    the first two are leaf names and the fourth is the population size.  When
    both names are equal the value is stored on that leaf, otherwise on the
    common ancestor of the two leaves.

    Returns:
        tuple: ``(pop_size_dict, t)`` where ``pop_size_dict`` maps
        ``"LogPopulationSize"`` to the log of each node's size relative to the
        root's, in ``t.traverse()`` order, and ``t`` is the annotated tree.
    """
    t = Tree(tree_file, format=1)
    table = pd.read_csv(tsv_file, header=None, sep='\t')
    for _row_label, (leaf_1, leaf_2, _, ne, _) in table.iterrows():
        if leaf_1 != leaf_2:
            target = t.get_common_ancestor([leaf_1, leaf_2])
        else:
            matches = t.get_leaves_by_name(leaf_1)
            assert (len(matches) == 1)
            target = matches[0]
        target.pop_size = ne

    root_pop_size = float(t.pop_size)
    log_ratios = []
    for node in t.traverse():
        log_ratios.append(np.log(float(node.pop_size) / root_pop_size))
    return {"LogPopulationSize": log_ratios}, t
Example #14
0
def get_dates(chain_name):
    """Collect the date rows of a few named clades for one chain.

    Reads ``<chain_name>_sample.labels`` (tree, format 1) and
    ``<chain_name>_sample.dates`` (tab-separated, first column as index).
    Each clade is defined by two genome accessions; the name of their common
    ancestor in the label tree selects the row of the dates table.

    Returns:
        pandas.DataFrame: one row per clade, indexed by the clade's name.
    """
    label_tree = Tree(f"{chain_name}_sample.labels", format=1)
    dates = pd.read_csv(f"{chain_name}_sample.dates", sep='\t', index_col=0)

    # clade name -> "accession|accession" pair spanning the clade
    name2group = {
        "anammox bacteria": "GCA_001828545.1|GCA_004282745.1",
        "root": "GCA_000011385.1|GCA_003576915.1",
        "cyanobacteria": "GCA_000011385.1|GCA_000013205.1",
        "Nostocales": "GCA_000196515.1|GCA_001548455.1",
        "pleurocapsales": "GCA_000317575.1|GCA_000317025.1",
    }

    frames = []
    for clade_name, pair in name2group.items():
        ancestor = label_tree.get_common_ancestor(pair.split('|'))
        clade_dates = dates.loc[[str(ancestor.name)], :]
        clade_dates.index = [clade_name]
        frames.append(clade_dates)

    return pd.concat(frames, axis=0)
def create_tree_data(treename, df):
    """Compute per-row branch-length paths between two species in a tree.

    For every row of ``df`` the columns ``"species"`` and
    ``"homology_species"`` name two leaves.  From each leaf the branch lengths
    are collected while walking up until the parent is the leaves' common
    ancestor; the last branch below the ancestor is not included.

    Returns:
        tuple of numpy arrays: padded branch lengths for the species side,
        padded branch lengths for the homology side, the summed distance of
        both walks, and the number of collected branches on each side.
    """
    t = Tree(treename)

    def climb(leaf_name, ancestor):
        """Branch lengths from the named leaf upwards, stopping below ``ancestor``."""
        lengths = []
        node = t & leaf_name
        while node.up != ancestor:
            lengths.append(node.dist)
            node = node.up
        return lengths

    branch_lengths_s = []
    branch_lengths_hs = []
    dist = []
    ns = []
    nhs = []
    for _, row in progressbar.progressbar(df.iterrows()):
        species = row["species"]
        homology_species = row["homology_species"]
        mca = t.get_common_ancestor(species, homology_species)

        lengths_s = climb(species, mca)
        lengths_hs = climb(homology_species, mca)

        # Accumulate in walk order (species side first, then homology side).
        total = 0
        for length in lengths_s + lengths_hs:
            total += length

        ns.append(len(lengths_s))
        nhs.append(len(lengths_hs))
        branch_lengths_s.append(lengths_s)
        branch_lengths_hs.append(lengths_hs)
        dist.append(total)

    create_branch_length_padding(branch_lengths_s)
    create_branch_length_padding(branch_lengths_hs)
    return (np.array(branch_lengths_s), np.array(branch_lengths_hs),
            np.array(dist), np.array(ns), np.array(nhs))
Example #16
0
def get_sister_species(species_tree, species, anc):
    """
    List `species` followed by the species that branch between it and the ancestor `anc`.

    The species under `anc` are obtained with `get_species`; every other leaf
    below the common ancestor of `species` and those species is a sister.

    Args:
        species_tree (str): tree passed to `get_species` and to ete3 `Tree` (format 1)
        species (str): name of the species
        anc (str): name of the ancestor

    Returns:
        list: `species` first, then the species branching between `species` and `anc`
    """

    duplicated_sp = get_species(species_tree, anc)
    already_known = [species] + duplicated_sp

    tree = Tree(species_tree, format=1)
    lca = tree.get_common_ancestor(list(already_known))

    sisters = []
    for leaf in lca.get_leaves():
        if leaf.name not in already_known:
            sisters.append(leaf.name)

    return [species] + sisters
Example #17
0
# Option controlling the diagnostic output printed to standard error below.
parser.add_argument(
    '--verbose', action='store_true',
    help=('Print information about the outgroup (if any) taxa to standard '
          'error'))

args = parser.parse_args()

# The whole Newick text is read from the already-open treeFile argument.
tree = Tree(args.treeFile.read())

if args.outgroupRegex:
    # NOTE(review): this import shadows the builtin compile() for the rest of
    # the module.
    from re import compile
    regex = compile(args.outgroupRegex)
    # regex.match anchors at the start of the leaf name only.
    taxa = [leaf.name for leaf in tree.iter_leaves() if regex.match(leaf.name)]

    if taxa:
        ca = tree.get_common_ancestor(taxa)
        if args.verbose:
            print('Taxa for outgroup:', taxa, file=sys.stderr)
            print('Common ancestor:', ca.name, file=sys.stderr)
            print('Common ancestor is tree:', tree == ca, file=sys.stderr)

        if len(taxa) == 1:
            # A single matching leaf is used directly as the outgroup.
            tree.set_outgroup(tree & taxa[0])
        else:
            if ca == tree:
                # The matching taxa span the whole tree, so fall back to
                # midpoint rooting.
                tree.set_outgroup(tree.get_midpoint_outgroup())
            else:
                # Same lookup as `ca` above, computed a second time.
                tree.set_outgroup(tree.get_common_ancestor(taxa))

print(tree.get_ascii())
Example #18
0
# Python 2 script fragment (uses the print statement).  `outg` and `outgroups`
# are created before this fragment begins.
for line in outg:
    outgroups.append(line.rstrip())
outg.close()

# Target taxa: one name per line in the file given as the second argument.
target_taxa = []
tt = open(sys.argv[2])
for line in tt:
    target_taxa.append(line.rstrip())
tt.close()

#now read in a collection of trees, calc branch lengths over sample, summarise and print out
branch_lengths = defaultdict(list)  #key = taxa, value = list of brlens
# NOTE(review): treefile is not closed anywhere in the visible code.
treefile = open(sys.argv[3])
for line in treefile:
    # One Newick tree per line.
    curr_tree = Tree(line.rstrip())
    # Reroot on the outgroups' common ancestor unless that is already the root.
    root_node = curr_tree.get_common_ancestor(outgroups)
    if curr_tree != root_node:
        curr_tree.set_outgroup(root_node)
    print curr_tree
    #bundle = curr_tree.check_monophyly(values=outgroups,target_attr='name')
    #print bundle
    #if bundle[0] == False:
    #    continue
    #find common ancestor of the target taxa, and use this as the reference node for calculating branch lengths. This might not always be the measure you want!
    reference_node = curr_tree.get_common_ancestor(target_taxa)
    #if reference_node != curr_tree:
    #    curr_tree.set_outgroup(reference_node)
    #calc distance from the reference node to each taxon of interest
    for taxon in target_taxa:
        dist = curr_tree.get_distance(taxon, reference_node)
        branch_lengths[taxon].append(dist)
                C = RectFace(triangle=True, width=size[mode], height=size[mode], bgcolor=colourDict[metadata[display_name]['prefec']], fgcolor='#FFFFFF')
            else:
                C = RectFace(triangle=True, width=size[mode], height=size[mode], bgcolor='#939393', fgcolor='#FFFFFF')
        n.add_face(C, column=1, position=position)
    tree.render(file_name=sys.argv[1] + '_' + cluster + '.pdf', tree_style=ts, w=width)

# Command line: argv[1] is the tree file, argv[2] the display mode, which is
# used as a key into `size` and compared with 'small' below.
big_tree = Tree(sys.argv[1])
mode = sys.argv[2]
metadata = {}
metadata = get_meta_new(metadata, big_tree)
# `clusters` and `colours` are not defined in the visible code.
colourDict = get_colours(clusters, big_tree, colours)

#remove dodgy sample
# The leaf name includes its surrounding single quotes.
big_tree.search_nodes(name="'EBOV|EMLab-RT|IPDPFHGINSP_GUI_2015_5339||GIN|Conakry|?|MinION_LQ05|2015-04-08'")[0].delete(preserve_branch_length=True)
#root the same as the MCC tree
ancestor = big_tree.get_common_ancestor("'EBOV|EMLab|EM_079422|KR817187|GIN|Macenta|?||2014-03-27'","'EBOV|EMLab|Gueckedou-C05|KJ660348|GIN|Gueckedou|?||2014-03-19'")
big_tree.set_outgroup(ancestor)
big_tree.ladderize()

ts = TreeStyle()
ts.show_leaf_name = False
#ts.show_branch_support = True
# Default scale, overridden for the 'small' mode.
ts.scale = 100000
if mode == 'small':
    ts.scale = 750000

#add legend
# One coloured circle (column 0) and its label (column 1) per colourDict entry.
for each in list(colourDict.keys()):
    ts.legend.add_face(CircleFace(radius=size[mode]/2, color=colourDict[each]), column=0)
    ts.legend.add_face(TextFace(each, ftype="Helvetica", fsize=size[mode]), column=1)
# Extra light-grey legend circle; its label is not in the visible code.
ts.legend.add_face(CircleFace(radius=size[mode]/2, color='#F1F1F1'), column=0)
Example #20
0
#         |                   |          /-L
#         |                    \--------|
# ---------|                              \-M
#         |
#         |                    /-B
#         |          /--------|
#         |         |         |          /-J
#         |         |          \--------|
#          \--------|                    \-K
#                   |
#                   |          /-E
#                    \--------|
#                              \-D
#
# Each main branch of the tree is independently rooted.
# Python 2 fragment (print statement); `t` is created before this fragment.
# Pick the two subtrees by the common ancestor of one leaf pair each.
node1 = t.get_common_ancestor("A", "H")
node2 = t.get_common_ancestor("B", "D")
# set_outgroup is called on the subtree nodes, not on `t` itself.
node1.set_outgroup("H")
node2.set_outgroup("E")
print "Tree after rooting each node independently:"
print t
#
#                              /-F
#                             |
#                    /--------|                    /-L
#                   |         |          /--------|
#                   |         |         |          \-M
#                   |          \--------|
#          /--------|                   |          /-A
#         |         |                    \--------|
#         |         |                              \-C
# Load the tree.  If the file holds several lines, the tree from the last
# line is the one that is kept.
with open('raxml.298.pruned.tre', 'r') as mytree:
    for line in mytree:
        t = Tree(line.strip(), format=1)

# Each calibration line is: species1 <TAB> species2 <TAB> time.  The time is
# attached as a ">time" calibration feature to the common ancestor of the
# two species.
with open('calibrations.tab.txt', 'r') as mycalibs:
    for line in mycalibs:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline at the end of file).
            continue
        info = line.strip().split('\t')
        print(info)
        sp1 = info[0]
        sp2 = info[1]
        thetime = info[2]

        tempnode = t.get_common_ancestor(sp1, sp2)
        # Formatted explicitly so the output is the same on Python 2 and 3.
        print("%s %s %s" % (sp1, sp2, tempnode))
        tempnode.add_features(calibration=">" + thetime)

# Strip the NHX wrapper so only the bare calibration value stays in the Newick
# string.
myoutput = t.write(format=9,
                   features=["calibration"]).replace('[&&NHX:calibration=',
                                                     '').replace(']', '')

# The context manager guarantees the output is flushed and closed.
with open('conus.tree.calibrationsadded.tre', 'w') as out:
    out.write("365 7\n")
    out.write(myoutput + '\n')
    out.write('//end of file')

#print t.write(format=9,features=["calibration"], outfile = "conus.tree.calibrationsadded.tre")

#node1 = t.get_common_ancestor("arcuata", "centurio")
Example #22
0
# intree = './trees/iqtree/over20p_bac120.ufboot'
otree = './bayesTraits_test/test.trees'

# Both paths are taken from the command line at import time; this replaces the
# default `otree` assigned just above and requires exactly two arguments.
intree, otree = sys.argv[1:]

# Comma-separated genome accessions whose common ancestor is used for rooting.
root_with = 'GCA_900097105.1,GCA_000020225.1,GCA_000172155.1,GCA_001318295.1,GCA_001613545.1,GCA_000019665.1,GCA_000019965.1,GCA_001746835.1'
if __name__ == "__main__":
    # A file without any newline character is treated as a single tree.
    # NOTE(review): the handle opened for this check is never closed.
    if len(open(intree).read().split('\n')) == 1:
        t = Tree(intree, format=3)
    else:
        multiple_trees = []
        for row in open(intree):
            row = row.strip('\n')
            multiple_trees.append(Tree(row))

        # NOTE(review): `t` is only assigned in the single-tree branch above,
        # so in the visible code this branch fails with a NameError here and
        # `multiple_trees` is never used.
        LCA = t.get_common_ancestor(root_with.split(','))
        t.set_outgroup(LCA)
        # TODO: finish it.

    # Replace every leaf name with a running number and remember the mapping.
    new_name2old_name = {}
    _count = 0
    for leaf in t.get_leaves():
        new_name2old_name[str(_count)] = leaf.name
        leaf.name = str(_count)
        _count += 1

    # for bayestraits
    nexus_template = """#NEXUS
    begin trees;
                translate
    {translate_text};
Example #23
0
# `cnx`, `PfamTable` and the tree `t` are defined before this fragment.
mycursor = cnx.cursor(buffered=True)
# NOTE(review): the table name is concatenated into the SQL text; make sure
# PfamTable never comes from untrusted input.
PfamExtractionStatement = "SELECT UID,Species,NumberOfAssociatedSpecies FROM " + PfamTable
mycursor.execute(PfamExtractionStatement)
pfamResults = mycursor.fetchall()

# Each pfam is then analyzed so an age can be assigned to it.
for pfam in pfamResults:
    # Row layout follows the SELECT above: (UID, Species, NumberOfAssociatedSpecies)
    speciesList = pfam[1].split(',')
    UID = pfam[0]
    numberOfSpecies = pfam[2]

    # If more than one species exists for a particular pfam, then algorithm [1] is implemented (see the description at
    # the beginning of this script)
    if numberOfSpecies != 1:
        # The subtree is pulled starting with the common ancestor of all species in the list as the root node
        subtree = t.get_common_ancestor(speciesList)
        # The total distance starts at zero and will be added to as each branch in the subtree is searched
        totalDistance = 0
        # We start at the base node and proceed down a branch of the subtree.
        # Because of the break below, only the nodes visited in preorder up to
        # and including the first leaf are summed.
        for node in subtree.iter_descendants("preorder"):
            # So long as our location is not a leaf, we add the relative age of the node to the total and continue to search the tree
            if node.is_leaf() == False:
                totalDistance += node.dist
            # Once we're located on a leaf, we have searched an entire branch of our subtree and, once we add the age of the leaf, we have acquired the
            # total age of our subtree (minus the age of the common ancestor)
            else:
                totalDistance += node.dist
                break
        # The variable maxLength is used to find the node that has the greatest number of children in the tree
        maxLength = 0
        # We then traverse the tree starting from the leaves and working our way toward the root node
class ClusterIdentification(object):
    """Cluster samples whose variants sit close together on a phylogeny.

    ``variantAnalysis`` drives the workflow: it collects variant pairs below
    the patristic-distance threshold (``variantCollection`` / ``Split``),
    classifies each sample pair with a monophyly check (``CheckMono``) and
    merges the accepted pairs into clusters (``ClusterKeys`` /
    ``ClusterKeys2``).

    Besides its own state the class reads module-level settings defined
    outside this class: ``TreeFile``, ``Spectrum``, ``PatDistOutput``,
    ``IntraFile``, ``percentile``, ``supportInput``, ``idStart``, ``idLen``,
    ``outlierFlag``, ``outputPath`` and ``TreeShort``.
    """

    def __init__(self):
        # "sample__sample" -> threshold column value read by Split().
        self.PercentileThreshold = {}
        # "sampleA__sampleB" -> variant names shared between the two samples.
        self.dictSharedReads = {}
        # cluster id -> member sample ids (a string per cluster after ClusterKeys2()).
        self.dictClusters = {}
        # Sample pairs ("A__B") classified as mono- or paraphyletic.
        self.monoFinalRes = []
        # Id of the most recently created cluster.
        self.count = 0
        # sample id -> other sample ids listed on the same line of the intra file.
        self.SerialNodes = {}
        self.t = Tree(TreeFile)
        # Variant names flagged as outliers.
        self.nodesRemoved = []
        # Outlier names already processed by the tree-pruning step.
        self.nodecheck = []

    ##The Split method identifies the percentile threshold for each sample from the results of PatDistSpectrum.py
    ##This threshold is determined from user input in the command line
    def Split(self, infile):
        """Read the per-sample distance threshold for the chosen percentile.

        Args:
            infile: path of the tab-separated spectrum file.  (The original
                code ignored this argument and always opened the global
                ``Spectrum``; the only caller in this class passes exactly
                that global, so behaviour there is unchanged.)

        Returns:
            ``self.PercentileThreshold``, mapping ``"sample__sample"`` to the
            (string) value found in the column selected by the module-level
            ``percentile``.

        Exits with status 1 when ``percentile`` is not a supported value.
        """
        # Percentile label -> column index in the spectrum file.
        Percentiles = {
            "0": "1",
            "1": "2",
            "5": "3",
            "10": "4",
            "20": "5",
            "25": "6",
            "30": "7",
            "35": "8",
            "40": "9",
            "45": "10",
            "50": "11",
            "75": "12",
            "90": "13",
            "99": "14",
            "100": "15",
        }
        # The percentile does not depend on the file content: validate it once
        # instead of once per line.
        if percentile not in Percentiles:
            sys.stdout.write(
                "Please specify the percentile as a number (0,1,5,10,20,25,30,35,40,45,50,75,90,99,100)"
            )
            sys.exit(1)
        cutoff = int(Percentiles[percentile])

        with open(infile, "r") as file1:
            for line in file1:
                # Lines mentioning "Samples" are treated as header lines.
                if "Samples" in line:
                    continue
                linesp = line.replace(" ", "").rstrip("\n").split("\t")
                nodesSp = linesp[0].split("__")
                # Only within-sample rows ("X__X") carry a threshold.
                if nodesSp[0] == nodesSp[1]:
                    combNode = nodesSp[0] + "__" + nodesSp[1]
                    self.PercentileThreshold[combNode] = linesp[cutoff]

        return self.PercentileThreshold

    def _recordSharedPair(self, comb, comb2, node, mate):
        """Store ``node`` and ``mate`` under the sample-pair key already in use.

        The reversed key ``comb2`` is reused when it exists; otherwise the
        variants go under ``comb`` (created on demand).  Duplicates are not
        stored twice.
        """
        key = comb2 if comb2 in self.dictSharedReads else comb
        reads = self.dictSharedReads.setdefault(key, [])
        for read in (node, mate):
            if read not in reads:
                reads.append(read)

    # Identifies all variants passing the threshold defined in the Split method
    def variantCollection(self):
        """Fill ``self.dictSharedReads`` with variant pairs below threshold.

        Each line of ``PatDistOutput`` is ``node,mate,patdist,support``.  A
        pair of variants from two different samples is kept when its
        patristic distance does not exceed the smaller of the two samples'
        thresholds and, unless ``supportInput`` is ``"PASS"``, its support is
        at least ``supportInput`` (rows whose support is ``"None"`` are
        dropped in that case).
        """
        PatDistSpec = self.Split(Spectrum)

        with open(PatDistOutput, "r") as file2:
            for line in file2:
                linesp = line.split(",")
                node = linesp[0]
                mate = linesp[1]
                patdist = linesp[2]
                support = linesp[3].rstrip("\n")
                nodeshort = node[idStart:idLen]
                mateshort = mate[idStart:idLen]
                if nodeshort == mateshort:
                    continue

                # Use the stricter (smaller) of the two per-sample thresholds.
                nodeThreshold = float(PatDistSpec[nodeshort + "__" + nodeshort])
                mateThreshold = float(PatDistSpec[mateshort + "__" + mateshort])
                target = nodeThreshold if nodeThreshold <= mateThreshold else mateThreshold
                if not float(patdist) <= target:
                    continue

                if str(supportInput) == "PASS":
                    supported = True
                elif not support == "None":
                    supported = float(supportInput) <= float(support)
                else:
                    supported = False

                if supported:
                    self._recordSharedPair(
                        nodeshort + "__" + mateshort, mateshort + "__" + nodeshort, node, mate
                    )

    ##Identifying potential outliers is optional (-oR flag from the command line )
    # Based on the retrieve common ancestor function, it identifies outliers as those which contain < 3 intra-variants associated with a given sample
    def PhylyOutlierRem(self, n, node1, node2, OutlierFile, idStart, idLen):
        """Collect the clade around a variant pair and drop outlier variants.

        Args:
            n: the two variant names as a list, passed to
                ``get_common_ancestor``.
            node1, node2: the same two variant names.
            OutlierFile: unused here; kept for interface compatibility.
            idStart, idLen: slice bounds extracting the sample id from a
                variant name.

        Returns:
            Names yielded by iterating the common ancestor of the pair, minus
            the variants newly flagged as outliers.  A variant is an outlier
            when its sample contributes fewer than three variants to that
            clade, unless the two samples of the pair are listed together in
            ``self.SerialNodes``.

        Side effects: newly flagged names are appended to
        ``self.nodesRemoved`` and the matching nodes are deleted from
        ``self.t``.

        Fixes over the previous version: the reversed pair key was built from
        ``node2`` twice; removing from ``ancestorList`` while iterating it
        skipped the element after each removal; and the pruning loop called
        ``delete()`` on the name string (silently swallowed by a bare
        ``except``), so flagged nodes were never removed from the tree.
        """
        ancestorList = []
        node1short = node1[idStart:idLen]
        node2short = node2[idStart:idLen]
        nodecombRev = node2short + "__" + node1short

        # NOTE(review): the original guard was
        # ``not nodecomb or not nodecombRev in self.monoFinalRes``, which
        # parses as ``(not nodecomb) or (nodecombRev not in ...)`` -- only the
        # reversed key is ever consulted.  variantAnalysis applies the same
        # test before calling, so this is kept as-is; also testing the forward
        # key would change which pairs get analysed and needs confirmation.
        if nodecombRev in self.monoFinalRes:
            return ancestorList

        if node1 not in self.nodesRemoved and node2 not in self.nodesRemoved:
            # Count the clade's variants per sample.
            sampleCounts = {}
            for leaf in self.t.get_common_ancestor(n):
                ancestorList.append(leaf.name)
                sample = leaf.name[idStart:idLen]
                sampleCounts[sample] = sampleCounts.get(sample, 0) + 1

            # Samples listed as serial partners of each other are exempt.
            if node1short in self.SerialNodes:
                removable = node2short not in self.SerialNodes[node1short]
            elif node2short in self.SerialNodes:
                removable = node1short not in self.SerialNodes[node2short]
            else:
                removable = True

            if removable:
                outlierSamples = set(
                    sample for sample, total in sampleCounts.items() if total < 3
                )
                # Iterate over a copy: the list is edited inside the loop.
                for item in list(ancestorList):
                    if item[idStart:idLen] in outlierSamples and item not in self.nodesRemoved:
                        ancestorList.remove(item)
                        self.nodesRemoved.append(item)

        # Prune every flagged variant that has not been processed yet.
        for name in self.nodesRemoved:
            if name not in self.nodecheck:
                matches = self.t.search_nodes(name=name)
                if matches:
                    matches[0].delete()
                # Mark as processed either way so the tree is searched once per name.
                self.nodecheck.append(name)

        return ancestorList

    # Create all combinations of intrahost sample identifiers for each respective sequential sample set
    # These results are used to assist in PhylyOutlierRem
    def intraComb(self, infile):
        """Record, for every sample id, the other ids on the same line.

        Args:
            infile: path of a file with one comma-separated list of sample
                ids per line.  (The original code ignored this argument and
                opened the global ``IntraFile``; the caller in this class
                passes that same global.)

        Returns:
            ``self.SerialNodes``: id -> list of the other distinct ids of its
            line, in file order.  An id that appears on several lines keeps
            only the partners of the last one.
        """
        with open(infile) as f:
            for line in f:
                linesp = line.rstrip("\n").split(",")

                # Distinct ids in order of first appearance.  This yields the
                # same lists as scanning every 2-combination of the line, in
                # linear instead of cubic time.
                distinct = []
                for sample in linesp:
                    if sample not in distinct:
                        distinct.append(sample)

                for sample in distinct:
                    self.SerialNodes[sample] = [other for other in distinct if other != sample]

        return self.SerialNodes

    # First step of merging overlapping pairs of connected samples
    def ClusterKeys(self, values, node1, node2):
        """Add the pair (node1, node2) to ``self.dictClusters``.

        Args:
            values: the current cluster member lists (normally
                ``self.dictClusters.values()``), used for membership tests.
            node1, node2: the two sample ids of an accepted pair.

        If neither id is known yet a new cluster is opened under the next
        ``self.count``; otherwise the ids are appended to every cluster that
        already holds one of them.  Members may be appended more than once;
        ``ClusterKeys2`` removes the duplicates.  Returns ``None``.

        The four steps below run in sequence and each re-evaluates
        membership, so an earlier step can enable a later one.  The boolean
        structure of the original conditions is kept exactly; note that
        ``not A or B`` there means ``(not A) or B``.
        """

        def clustered(node):
            # True when ``node`` is a member of any list-typed entry of ``values``.
            return node in [x for v in values if isinstance(v, list) for x in v]

        # node1 known, node2 new: node2 joins node1's clusters.
        if clustered(node1) or node1 in values:
            if not clustered(node2) or node2 in values:
                for members in self.dictClusters.values():
                    if node1 in members:
                        members.append(node2)

        # node2 known, node1 new: node1 joins node2's clusters.
        if clustered(node2) or node2 in values:
            if not clustered(node1) or node1 in values:
                for members in self.dictClusters.values():
                    if node2 in members:
                        members.append(node1)

        # Both known: every cluster holding either id receives both.
        if clustered(node1) or node1 in values:
            if clustered(node2) or node2 in values:
                for members in self.dictClusters.values():
                    if node1 in members:
                        members.append(node2)
                        members.append(node1)

                    if node2 in members:
                        members.append(node2)
                        members.append(node1)

        # Neither known: open a new cluster.
        if not clustered(node1) or node1 in values:
            if not clustered(node2) or node2 in values:
                self.count += 1
                if self.count not in self.dictClusters:
                    self.dictClusters[self.count] = []
                self.dictClusters[self.count].append(node1)
                self.dictClusters[self.count].append(node2.rstrip("\n"))

    # Second step of merging overlapping pairs of connected samples
    def ClusterKeys2(self, dictClusters):
        """Merge overlapping clusters and render each one as a string.

        Args:
            dictClusters: ignored (kept for interface compatibility); the
                method works on ``self.dictClusters``.

        Returns:
            ``self.dictClusters``, where every surviving cluster id maps to
            ``str()`` of its sorted, de-duplicated member list.  Clusters that
            share at least one member, directly or through a chain of other
            clusters, are merged into the one with the lowest id.  The
            placeholder cluster ``0`` is dropped.

        Fixes over the previous version: member sets were only built on
        Python 2.7 (the 2.6 branch kept lists, any other version an empty
        dict); the merge loop removed entries from the list it was iterating
        and compared against a stale snapshot, so overlapping clusters could
        be left unmerged.  Members are now emitted in sorted order, which
        makes the output reproducible.
        """
        memberSets = dict((key, set(members)) for key, members in self.dictClusters.items())
        orderedKeys = sorted(memberSets)
        merged = set()

        for key in orderedKeys:
            if key in merged:
                continue
            # Keep absorbing until no remaining cluster overlaps this one.
            grew = True
            while grew:
                grew = False
                for other in orderedKeys:
                    if other == key or other in merged:
                        continue
                    if memberSets[other].intersection(memberSets[key]):
                        memberSets[key].update(memberSets[other])
                        merged.add(other)
                        grew = True

        for key in merged:
            del self.dictClusters[key]
        # Cluster 0 is the empty placeholder created by variantAnalysis.
        self.dictClusters.pop(0, None)

        for key in list(self.dictClusters):
            self.dictClusters[key] = str(sorted(memberSets[key]))
        return self.dictClusters

    # Retrieve common ancestors
    def CommonAncestor(self, nodes):
        """Return the names yielded by iterating the common ancestor of ``nodes``."""
        return [leaf.name for leaf in self.t.get_common_ancestor(nodes)]

    # Identify poly- , para-, and monophyletic pairs of variants
    def CheckMono(self, ncomb, PhyloVarRemoval, Rejects, monoFinal):
        """Classify a sample pair by the monophyly of its variants.

        Args:
            ncomb: sample-pair key (``"A__B"``).
            PhyloVarRemoval: variant names passed to ``check_monophyly``.
            Rejects: list collecting the pairs that are neither mono- nor
                paraphyletic (updated in place).
            monoFinal: dict ``ncomb -> [clade type, ...]`` (updated in place).

        Returns:
            ``True`` when the variants are monophyletic, a false value
            otherwise.  Mono- and paraphyletic pairs are also recorded in
            ``self.monoFinalRes``.
        """
        result = self.t.check_monophyly(
            values=PhyloVarRemoval, ignore_missing=True, target_attr="name", unrooted=True
        )
        # The second item of the result is the clade type; it was previously
        # recovered by splitting the tuple's string form.
        mR = str(result[1])

        isMono = "monophyletic" in mR
        if isMono or "paraphyletic" in mR:
            monoFinal.setdefault(ncomb, []).append(mR)
            if ncomb not in self.monoFinalRes:
                self.monoFinalRes.append(ncomb)
            return isMono

        if ncomb not in Rejects:
            Rejects.append(ncomb)
        return False

    def _analysePairs(self, clusters, OutlierFile, Rejects, monoFinal):
        """Run the monophyly analysis over all variant pairs of one sample pair.

        ``clusters`` is one value of ``self.dictSharedReads``.  Processing
        stops at the first pair that ``CheckMono`` reports as monophyletic.
        """
        for pair in itertools.combinations(clusters, 2):
            n = list(pair)
            node1, node2 = n
            if node1 in self.nodesRemoved or node2 in self.nodesRemoved:
                continue

            # Sample ids are sliced from the names directly (the same way
            # variantCollection builds its keys) rather than from the string
            # form of the tuple, which broke for names shorter than idLen or
            # containing commas, spaces or quotes.
            node1short = node1[idStart:idLen]
            node2short = node2[idStart:idLen]
            if node1short == node2short:
                continue
            ncomb = node1short + "__" + node2short
            ncombRev = node2short + "__" + node1short

            # NOTE(review): the original guard
            # ``not ncomb or not ncombRev in self.monoFinalRes`` only ever
            # tests the reversed key (``not ncomb`` is always False for a
            # non-empty string).  That effective behaviour is preserved here.
            if ncombRev in self.monoFinalRes:
                continue

            if outlierFlag == "TRUE":
                PhyloVarRemoval = self.PhylyOutlierRem(n, node1, node2, OutlierFile, idStart, idLen)
            else:
                PhyloVarRemoval = self.CommonAncestor(n)

            # Distinct sample ids present in the clade, in order of appearance.
            nodeList = []
            for name in PhyloVarRemoval:
                sample = name[idStart:idLen]
                if sample not in nodeList:
                    nodeList.append(sample)

            # Outlier removal may just have flagged one of the two variants.
            if node1 in self.nodesRemoved or node2 in self.nodesRemoved:
                continue

            if len(nodeList) == 2:
                if self.CheckMono(ncomb, PhyloVarRemoval, Rejects, monoFinal):
                    break
            elif len(nodeList) > 2:
                # Foreign samples: in the clade, not part of the pair and not
                # listed in the serial-sample table.
                nshort = [node1short, node2short]
                nodeRemoval = [
                    sample
                    for sample in set(nodeList)
                    if sample not in nshort and sample not in self.SerialNodes
                ]

                if not nodeRemoval:
                    if len(PhyloVarRemoval) > 1:
                        if self.CheckMono(ncomb, PhyloVarRemoval, Rejects, monoFinal):
                            break
                else:
                    for item in nodeRemoval:
                        # NOTE(review): ``item`` is already a sample id; the
                        # original slices it again, which only leaves it
                        # unchanged when idStart is 0 -- confirm intent.
                        nodeRemovalShort = item[idStart:idLen]
                        relatedKeys = (
                            nodeRemovalShort + "__" + node1short,
                            node1short + "__" + nodeRemovalShort,
                            node2short + "__" + nodeRemovalShort,
                            nodeRemovalShort + "__" + node2short,
                        )
                        if not any(key in self.dictSharedReads for key in relatedKeys):
                            # Drop the variants of all foreign samples.  A new
                            # list is built because removing from the list
                            # while iterating it skipped elements.
                            PhyloVarRemoval = [
                                name
                                for name in PhyloVarRemoval
                                if name[idStart:idLen] not in nodeRemoval
                            ]
                            break
                    if len(PhyloVarRemoval) > 1:
                        if ncomb not in self.monoFinalRes:
                            if self.CheckMono(ncomb, PhyloVarRemoval, Rejects, monoFinal):
                                break

    ##Analysis identifies all ancestors to variants passing the required percentile thresholds
    # Following the removal of outliers, it parses through every combination of these variants to determine whether the pair is polyphyletic or not
    def variantAnalysis(self):
        """Run the full analysis and return the final clusters.

        Returns:
            The dict produced by ``ClusterKeys2`` (cluster id -> member list
            rendered as a string).

        Side effects: prints every accepted pair with its clade types and the
        list of rejected pairs; when ``outlierFlag`` is ``"TRUE"`` writes the
        flagged variants to ``<outputPath><TreeShort>.<percentile>.
        <supportInput>.Outlier.txt``.  Exits with status 1 when no pair was
        accepted for clustering.
        """
        monoFinal = {}
        self.variantCollection()

        OutlierFile = None
        if outlierFlag == "TRUE":
            OutlierFile = open(outputPath + TreeShort + "." + percentile + "." + supportInput + ".Outlier.txt", "w")
        try:
            # The serial-sample table is optional: carry on without it when
            # the file is not configured or cannot be read.
            try:
                self.intraComb(IntraFile)
            except Exception:
                pass
            Rejects = []

            for clusters in list(self.dictSharedReads.values()):
                self._analysePairs(clusters, OutlierFile, Rejects, monoFinal)

            if OutlierFile is not None:
                for i in set(self.nodesRemoved):
                    OutlierFile.write("%s\n" % i)
        finally:
            # The handle used to be left open.
            if OutlierFile is not None:
                OutlierFile.close()

        self.dictClusters = {}
        self.count = 0
        clusteredAny = False
        for i in self.monoFinalRes:
            if "polyphyletic" not in i:
                if self.count not in self.dictClusters:
                    self.dictClusters[self.count] = []
                # Snapshot of the member lists (the lists themselves are shared).
                values = list(self.dictClusters.values())

                isp = i.split("__")
                node1 = isp[0]
                node2 = isp[1].split("\t")[0]
                self.ClusterKeys(values, node1, node2)
                clusteredAny = True

        # Previously detected through a NameError swallowed by a bare except.
        if not clusteredAny:
            print("WARNING: Patristic Distance Data files may be empty...")
            sys.exit(1)
        FinalClustering = self.ClusterKeys2(self.dictClusters)

        for k, v in monoFinal.items():
            print(k + "\t" + str(v))
        print("Clusters that are polyphyletic: " + str(Rejects))
        return FinalClustering
Example #25
0
def build_nj_phylip(alignment, outfile, outgroup, work_dir="."):
    """
    build neighbor joining tree of DNA seqs with PHYLIP in EMBOSS

    PHYLIP manual
    http://evolution.genetics.washington.edu/phylip/doc/

    The alignment is written to ``<work_dir>/work/aln.phy``; all intermediate
    files share that path with a different extension.  Steps: bootstrap
    resampling (fseqboot), distances (fdnadist), NJ trees (fneighbor),
    consensus (fconsense), then branch lengths are fitted on the consensus
    topology with distances from the original alignment (``run_ffitch``).

    Args:
        alignment: alignment object accepted by ``AlignIO.write``.
        outfile: path of the final newick tree.
        outgroup: when truthy, the tree is re-rooted with ``smart_reroot``;
            if that returns the rooted file, ``".unrooted"`` is removed from
            ``outfile``.
        work_dir: directory containing the ``work`` sub-directory.

    Returns:
        ``(outfile, phy_file)`` when a non-empty tree file was written,
        otherwise ``None`` (also when sequence names are repeated).
    """

    phy_file = op.join(work_dir, "work", "aln.phy")
    try:
        # ``open`` in a ``with`` block replaces the Python-2-only ``file()``
        # builtin and guarantees the handle is flushed and closed before the
        # PHYLIP programs read the file.
        with open(phy_file, "w") as phy_handle:
            AlignIO.write(alignment, phy_handle, "phylip")
    except ValueError:
        print("Repeated seq name, possibly due to truncation. NJ tree not built.", file=sys.stderr)
        return None

    # Common path prefix (phy_file without its extension) of all work files.
    prefix = phy_file.rsplit(".", 1)[0]

    seqboot_out = prefix + ".fseqboot"
    seqboot_cl = FSeqBootCommandline(FPHYLIP_BIN("fseqboot"),
        sequence=phy_file, outfile=seqboot_out,
        seqtype="d", reps=100, seed=12345)
    seqboot_cl()
    logging.debug("Resampling alignment: %s" % seqboot_cl)

    dnadist_out = prefix + ".fdnadist"
    dnadist_cl = FDNADistCommandline(FPHYLIP_BIN("fdnadist"),
        sequence=seqboot_out, outfile=dnadist_out, method="f")
    dnadist_cl()
    logging.debug(
        "Calculating distance for bootstrapped alignments: %s" % dnadist_cl)

    neighbor_out = prefix + ".njtree"
    neighbor_log = prefix + ".fneighbor"
    neighbor_cl = FNeighborCommandline(FPHYLIP_BIN("fneighbor"),
        datafile=dnadist_out, outfile=neighbor_log, outtreefile=neighbor_out)
    neighbor_cl()
    logging.debug("Building Neighbor Joining tree: %s" % neighbor_cl)

    consense_out = prefix + ".consensustree.nodesupport"
    consense_log = prefix + ".fconsense"
    consense_cl = FConsenseCommandline(FPHYLIP_BIN("fconsense"),
        intreefile=neighbor_out, outfile=consense_log, outtreefile=consense_out)
    consense_cl()
    logging.debug("Building consensus tree: %s" % consense_cl)

    # distance without bootstrapping
    dnadist_out0 = prefix + ".fdnadist0"
    dnadist_cl0 = FDNADistCommandline(FPHYLIP_BIN("fdnadist"),
        sequence=phy_file, outfile=dnadist_out0, method="f")
    dnadist_cl0()
    logging.debug(
        "Calculating distance for original alignment: %s" % dnadist_cl0)

    # infer branch length on consensus tree
    consensustree1 = prefix + ".consensustree.branchlength"
    run_ffitch(distfile=dnadist_out0, outtreefile=consensustree1,
            intreefile=consense_out)

    # write final tree
    ct_s = Tree(consense_out)

    if outgroup:
        t1 = consensustree1 + ".rooted"
        t2 = smart_reroot(consensustree1, outgroup, t1)
        if t2 == t1:
            outfile = outfile.replace(".unrooted", "")
        ct_b = Tree(t2)
    else:
        ct_b = Tree(consensustree1)

    # In the consensus tree each node's ``dist`` holds the number of
    # bootstrap replicates (out of the 100 requested above) supporting it;
    # key the support by the sorted leaf names below the node.
    nodesupport = {}
    for node in ct_s.traverse("postorder"):
        node_children = tuple(sorted([f.name for f in node]))
        if len(node_children) > 1:
            nodesupport[node_children] = node.dist/100.

    # Transfer the supports onto the tree that carries branch lengths.
    for k, v in nodesupport.items():
        ct_b.get_common_ancestor(*k).support = v
    print(ct_b)
    ct_b.write(format=0, outfile=outfile)

    try:
        s = op.getsize(outfile)
    except OSError:
        s = 0
    if s:
        logging.debug("NJ tree printed to %s" % outfile)
        return outfile, phy_file
    else:
        logging.debug("Something was wrong. NJ tree was not built.")
        return None
Example #26
0
    if len(node) > biggest_other_node:
        biggest_other_node = len(node)
        tree.set_outgroup(node) 
#test the various phylogenetic criteria for LGT.

print "Tree\tResult\tEuksInTree\tSupportEukMonophyly\tEuksInTargetGroup\tDistanceToClosestEukClade\tSupergroupsInTargetGroup"
#euk sequence is a singleton nested within a clade of bacteria, and there is only one eukaryote sequence in the tree
if len(eukaryote_seqs) == 1: #this is, I guess, an LGT candidate
    print sys.argv[1] + "\tSingleton\t1\tN/A\tN/A\tN/A\t1"
#euk sequence is a singleton nested within a clade of bacteria, and the eukaryotes are not monophyletic in the tree
#print len(eukaryote_seqs)
else:
    try:
        answer = tree.check_monophyly(values=eukaryote_seqs, target_attr="name")
        if answer[0] == True:
            ca = tree.get_common_ancestor(eukaryote_seqs)
            target_group_sgs = {}
            for leaf in ca:
                if leaf.name in group_assignments:
                    leaf_supergroup = group_assignments[leaf.name]
                    if leaf_supergroup in euk_supergroups:
                        target_group_sgs[leaf_supergroup] = 1
                else:
                    print "Warning: a sequence in this tree doesn't have a supergroup assignment: " + str(leaf.name)
            num_sgs = len(target_group_sgs.keys())
            print sys.argv[1] + "\tEuks monophyletic\t" + str(len(eukaryote_seqs)) + "\t" + str(ca.support) + "\tN/A\tN/A\t" + str(num_sgs) 
        elif answer[0] == False:
            mono_groups = []
            target_group = ''
            for node in tree.get_monophyletic(values=['Eukaryote'], target_attr="domain"):
                for leaf in node:
Example #27
0
import random
from ete3 import Tree
# Creates a normal tree
t = Tree( '((H:0.3,I:0.1):0.5, A:1, (B:0.4,(C:0.5,(J:1.3, (F:1.2, D:0.1):0.5):0.5):0.5):0.5);' )
print(t)
# Let's locate some nodes using the get common ancestor method
ancestor=t.get_common_ancestor("J", "F", "C")
# the search_nodes method (I take only the first match )
A = t.search_nodes(name="A")[0]
# and using the shortcut to finding nodes by name
C= t&"C"
H= t&"H"
I= t&"I"
# Let's now add some custom features to our nodes. add_features can be
#  used to add many features at the same time.
C.add_features(vowel=False, confidence=1.0)
A.add_features(vowel=True, confidence=0.5)
ancestor.add_features(nodetype="internal")
# Or, using the oneliner notation
(t&"H").add_features(vowel=False, confidence=0.2)
# But we can automatize this. (note that it will overwrite the previous
# values)
for leaf in t.traverse():
    # Compare against the individual letters: the substring test
    # `leaf.name in "AEIOU"` is also true for the empty name of unnamed
    # internal nodes (and for names such as "AE"), which were therefore
    # flagged as vowels.
    if leaf.name in ("A", "E", "I", "O", "U"):
        leaf.add_features(vowel=True, confidence=random.random())
    else:
        leaf.add_features(vowel=False, confidence=random.random())
# Now we use these information to analyze the tree.
# (Single-argument print calls give the same output on Python 2 and 3.)
print("This tree has %s vowel nodes" % len(t.search_nodes(vowel=True)))
print("Which are %s" % ([leaf.name for leaf in t.iter_leaves() if leaf.vowel==True],))
# But features may refer to any kind of data, not only simple
Example #28
0
def _root_child_groups(tree):
    '''Return one list of node names per child of the root of `tree`.

    A leaf child contributes `[its name]`; an internal child contributes the
    names of all its descendants (internal nodes included).
    '''
    groups = []
    for child in tree.children:
        if child.is_leaf():
            groups.append([child.name])
        else:
            groups.append([i.name for i in child.get_descendants()])
    return groups


def caluclate_rootstrap(treeFile, bootFile, is_rooted, out_group):
    '''
    Compute rootstrap support for every branch of a tree and write the
    annotated tree to `<treeFile without extension>.rootstrap` (Nexus).

    Parameters
    ----------
    treeFile: tree in newick format (.treefile in IQ-TREE)
    bootFile: bootstrap trees in newick format, one per line (e.g. .ufboot file in IQ-TREE)
    is_rooted: True if the tree and the bootstrap trees are already rooted.
        If False, the trees are rooted with the outgroup taxa, the outgroup is
        pruned, and the rooted ML tree is written to
        `<treeFile without extension>_rooted.treefile`.
    out_group: a file with outgroup taxa in Nexus format (required when
        is_rooted is False)

    Returns
    -------
    polyphyly: number of bootstrap trees skipped because their ingroup was
        not monophyletic after rooting (always 0 when is_rooted is True)

    Raises
    ------
    SystemExit: if the outgroup is missing or unreadable, or if the ingroup of
        the ML tree is not monophyletic
    '''

    boottrees = []
    trees = []
    polyphyly = 0
    N_boottrees = 0
    if not is_rooted:
        if out_group is None:
            raise SystemExit('Error: Please provide outgroup taxa in Nexus format')
        ML_tree = Tree(treeFile)
        try:
            og = Read_Nex(out_group) #get the outgroup taxa
        except Exception:
            raise SystemExit('Error: Cannot find outgroup taxa')
        if len(og) == 0: #an empty outgroup list was only rejected later, inside the bootstrap loop
            raise SystemExit('Error: Please provide outgroup taxa in Nexus format')
        if len(og) == 1: #if there is one outgroup taxon use it to root the tree
            ML_root = ML_tree.search_nodes(name=og[0])[0]
        else: #if there are more than one outgroup taxon find their common ancestor
            ML_root = ML_tree.get_common_ancestor(og)
        if not ML_root.is_root():
            ML_tree.set_outgroup(ML_root)
        ingroup = [n.name for n in ML_tree.get_leaves() if n.name not in og]
        #check if the ingroup is monophyletic; a failing check counts as "not monophyletic"
        try:
            ml_monophyletic = ML_tree.check_monophyly(values=ingroup, target_attr="name", ignore_missing=True)[0]
        except Exception:
            ml_monophyletic = False
        if not ml_monophyletic:
            raise SystemExit('Error: ML ingroup taxa are not monophyletic')
        #pruning/writing happen outside the try block so that their own errors
        #are no longer reported as a monophyly problem
        ML_tree.prune(ingroup) #prune ingroup taxa only
        rootedMLtree = os.path.splitext(treeFile)[0]+'_rooted.treefile'
        ML_tree.write(outfile=rootedMLtree) #write the rooted ML tree with ingroup taxa only to a file

        with open(bootFile, 'r') as f:
            for tree in f:
                N_boottrees += 1
                t = Tree(tree)
                ingroup = [n.name for n in t.get_leaves() if n.name not in og]
                if len(og) == 1: #if there is one outgroup taxon use it to root the tree
                    root = t.search_nodes(name=og[0])[0]
                else: #if there are more than one outgroup taxon find their common ancestor
                    root = t.get_common_ancestor(og)
                if not root.is_root():
                    t.set_outgroup(root)
                #check if the ingroup is monophyletic
                try:
                    monophyletic = t.check_monophyly(values=ingroup, target_attr="name", ignore_missing=True)[0]
                except Exception:
                    monophyletic = False
                if monophyletic:
                    trees.append(t.write(format=9))
                else:
                    polyphyly += 1

        #NOTE: `ingroup` here is the one computed for the last bootstrap tree
        #(or for the ML tree when bootFile is empty)
        for tree in trees:
            t = Tree(tree)
            t.prune(ingroup)
            boottrees.append(t.write(format=9))
    else: #If you are using rooted ML tree and rooted bootstrap trees (e.g. NR model)
        ML_tree = Tree(treeFile) #parsed here so that an unreadable tree fails before bootFile is processed
        with open(bootFile, 'r') as f:
            for tree in f:
                N_boottrees += 1
                t = Tree(tree)
                boottrees.append(t.write(format=9))

    #runs of identical consecutive bootstrap topologies with their length; a
    #topology may appear in several runs, which is fine because the counts
    #are summed below
    booted = [(g[0], len(list(g[1]))) for g in ite.groupby(boottrees)]
    #[count, root split] for every run
    boottrees = [[count, _root_child_groups(Tree(newick))] for newick, count in booted]

    if is_rooted:
        roots = all_possible_roots(treeFile)
    else:
        roots = all_possible_roots(rootedMLtree)

    rootstrap_value = dict.fromkeys(roots.keys(), 0)
    for node, rooted in roots.items():
        y = [set(i) for i in _root_child_groups(Tree(rooted))]
        for split in boottrees:
            z = [set(i) for i in split[1]]
            #cross off every group of the candidate rooting found in the
            #bootstrap root split; nothing left means the two roots agree
            if len(y) == len(z):
                for group in y:
                    if group in z:
                        z.remove(group)
            if len(z) == 0:
                #float() keeps this a true division on Python 2 as well
                rootstrap_value[node] += split[0] / float(N_boottrees)

    if is_rooted:
        t = Tree(treeFile)
    else:
        t = Tree(rootedMLtree)

    #internal nodes are named n1, n2, ... in traversal order; these names
    #must match the keys returned by all_possible_roots
    k = 1
    for n in t.traverse():
        if not n.is_root():
            if not n.is_leaf():
                n.add_features(name='n'+str(k))
                n.add_features(rootstrap=rootstrap_value[n.name]*100)
                k += 1
            else:
                n.add_features(rootstrap=rootstrap_value[n.name]*100)
    temp = os.path.splitext(treeFile)[0]+'.temp'
    rootstrapTree = os.path.splitext(treeFile)[0]+'.rootstrap'
    t.write(outfile=temp, features =["rootstrap"])
    try:
        #convert the annotated newick file to Nexus
        x = dendropy.Tree.get(path=temp, schema='newick')
        x.write(path=rootstrapTree, schema='nexus')
    finally:
        #remove the intermediate file even if the conversion fails
        os.remove(temp)
    return polyphyly
def reconcile_trees(higher_level, input_trees, computation_method,
                    reconciliation_software, cpu_cores, keep_polytomies,
                    root_notung, infer_transfers, output_reconciliations):
    """Reconcile the input trees against a pruned species tree and write the results.

    Args:
        higher_level: key into the mapping returned by
            ``eu.read_level_species()``; also formatted with ``%d`` in the
            deletion log message.
        input_trees: handed to ``read_trees``, whose result is iterated as
            ``(nog_id, sample_no, tree_nw)`` triples.
        computation_method: ``'cluster'`` submits and collects a task array;
            any other value reconciles locally.
        reconciliation_software: stored in ``reconciliation.NOTUNG_WEB``.
        cpu_cores: values above 1 switch the local steps to a ``Pool``.
        keep_polytomies: forwarded to the species-tree pruning step and to
            the cluster submission.
        root_notung: carried inside every reconciliation job and forwarded
            to the cluster submission.
        infer_transfers: forwarded to the cluster submission.
        output_reconciliations: destination passed to
            ``write_reconciliations``.
    """
    reconciliation.NOTUNG_WEB = reconciliation_software

    tree_computations = read_trees(input_trees)
    eggNOG_level_species = eu.read_level_species()

    # Build the full species tree, then keep only the subtree spanned by the
    # species registered for the requested level.
    full_species_tree = Tree(reconciliation.EGGNOGv4_SPECIES_TREE)
    level_species = [str(x) for x in eggNOG_level_species[higher_level]]
    eggNOG_speciesTree = full_species_tree.get_common_ancestor(
        level_species).detach()

    # Step 1: turn every input tree into a reconciliation job.
    sys.stderr.write('Generating species trees reconciliation...\n')
    if cpu_cores > 1:
        cached_jobs = [
            (nog, sample, newick, eggNOG_speciesTree, keep_polytomies,
             root_notung)
            for nog, sample, newick in tree_computations
        ]
        with Pool(cpu_cores) as pool:
            prepared = pool.imap(reconciliation.prepare_reconciliation_job,
                                 cached_jobs)
            reconciliation_jobs = list(tqdm(prepared, total=len(cached_jobs)))
    else:
        reconciliation_jobs = [
            (nog, sample, newick,
             reconciliation.prune_species_tree(newick, eggNOG_speciesTree,
                                               keep_polytomies),
             root_notung)
            for nog, sample, newick in tqdm(tree_computations)
        ]

    sys.stderr.write('Starting reconciliation for %d jobs...\n' %
                     len(reconciliation_jobs))
    run_job = reconciliation.reconcile

    # Step 2: run the jobs on the cluster, in a local pool, or serially.
    if computation_method == 'cluster':
        reconciliation.submit_taskArray(higher_level, reconciliation_jobs,
                                        keep_polytomies, root_notung,
                                        infer_transfers)
        reconciliations = reconciliation.collect_taskArray(
            higher_level, cpu_cores=cpu_cores)
    elif cpu_cores > 1:
        with Pool(cpu_cores) as pool:
            finished = pool.imap(run_job, reconciliation_jobs)
            reconciliations = list(
                tqdm(finished, total=len(reconciliation_jobs)))
    else:
        reconciliations = [run_job(job) for job in tqdm(reconciliation_jobs)]

    # Step 3: remember the positions of missing entries and replace failed
    # results with a placeholder. Access stays index-based because the type
    # returned by the cluster collector is not visible here.
    missing = []
    for idx in range(len(reconciliations)):
        entry = reconciliations[idx]
        if entry is None:
            missing.append(idx)
            continue
        nog_id, result = entry
        if not result:
            # reconciliation failed
            reconciliations[idx] = (nog_id, ('S', -5.0))

    # Step 4: drop missing entries from the back so that the remaining
    # indices stay valid while deleting.
    for idx in reversed(missing):
        sys.stderr.write(
            'Deleting reconciliation entry %d.%d because empty\n' %
            (higher_level, idx))
        del reconciliations[idx]

    write_reconciliations(output_reconciliations, reconciliations)
Example #30
0
# Show the tree before rerooting (t is expected to be defined earlier in the
# script; its construction is not part of this excerpt).
print t
#          /-A
#         |
#         |          /-H
#---------|---------|
#         |          \-F
#         |
#         |          /-B
#          \--------|
#                   |          /-E
#                    \--------|
#                              \-D
#
# Let's define the ancestor of E and D as the tree outgroup.  Of
# course, the definition of an outgroup will depend on user criteria.
ancestor = t.get_common_ancestor("E","D")
t.set_outgroup(ancestor)
# NOTE(review): the message below contains typos ("rooteda", "that"); it is
# runtime output and is left unchanged here.
print "Tree rooteda at E and D's ancestor is more basal that the others."
print t
#
#                    /-B
#          /--------|
#         |         |          /-A
#         |          \--------|
#         |                   |          /-H
#---------|                    \--------|
#         |                              \-F
#         |
#         |          /-E
#          \--------|
#                    \-D
Example #31
0
# Command-line interface: a Newick tree, a tab-separated table and the two
# leaves whose common ancestor is used as the outgroup.
# (The description string is Japanese for "Reordering of a phylogenetic tree
# and a table".)
parser = argparse.ArgumentParser(description='系統樹と表の並び替え')   
parser.add_argument('-n', '--newick', help='newick file') 
parser.add_argument('-t', '--table', help='table file, sep = tab, first line index')
parser.add_argument('-o1', '--outgroup1', help='set outgroup1')   
parser.add_argument('-o2', '--outgroup2', help='set outgroup2')  
args = parser.parse_args() 

NEWICK = args.newick
g_genome = args.table
OUTG1 = args.outgroup1
OUTG2 = args.outgroup2

# Load the phylogenetic tree and reroot it on the common ancestor of the two
# outgroup leaves
t = Tree(NEWICK , format= 0)
ancestor = t.get_common_ancestor(OUTG1,OUTG2)
t.set_outgroup( ancestor )
ts = TreeStyle()
ts.show_leaf_name = True
ts.show_branch_support = True
# Outputs are written next to the input file: <newick>.png and <newick>.newick
t.render(NEWICK+".png", w=600, units="mm",tree_style=ts)
t.write(format=0, outfile=NEWICK+".newick")


# Load the genome information (tab-separated, first column used as the index)
info = pd.read_table(g_genome, sep='\t', index_col=0)
frame = pd.DataFrame(info)


# Reorder the table: start from the leaf names of the rerooted tree
strain_list = t.get_leaf_names()
Example #32
0
# Build a small example tree from a Newick string and display it.
tree = Tree( '((H:1,I:1):0.5, A:1, (B:1,(C:1,D:1):0.5):0.5);' )
print "this is the original tree:"
print tree
#                    /-H
#          /--------|
#         |          \-I
#         |
#---------|--A
#         |
#         |          /-B
#          \--------|
#                   |          /-C
#                    \--------|
#                              \-D
# Finds the first common ancestor between C and D.
ancestor = tree.get_common_ancestor("D", "C")
print "The ancestor of C and D is:"
print ancestor
#          /-C
#---------|
#          \-D
# You can use more than two nodes in the search
ancestor = tree.get_common_ancestor("B", "C", "D")
print "The ancestor of B, C and D is:"
print ancestor
#          /-B
#---------|
#         |          /-C
#          \--------|
#                    \-D
# Finds the first sister branch of the ancestor node. Because
Example #33
0
        target_leaf = leaf
    else:
        leaf.add_features(domain="Other")
#print eukaryote_seqs
#test the various phylogenetic criteria for LGT.

#euk sequence is a singleton nested within a clade of bacteria, and there is only one eukaryote sequence in the tree
if len(eukaryote_seqs) == 1: #this is, I guess, an LGT candidate
    print sys.argv[1] + "\tSingleton"
#euk sequence is a singleton nested within a clade of bacteria, and the eukaryotes are not monophyletic in the tree
#print len(eukaryote_seqs)
else:
    try:
        answer = tree.check_monophyly(values=eukaryote_seqs, target_attr="name")
        if answer[0] == True:
            ca = tree.get_common_ancestor(eukaryote_seqs)
            print sys.argv[1] + "\tEuks monophyletic\t" + str(len(eukaryote_seqs)) + "\t" + str(ca.support) 
        elif answer[0] == False:
            mono_groups = []
            target_group = ''
            for node in tree.get_monophyletic(values=['Eukaryote'], target_attr="domain"):
                if target_leaf in node:
                    target_group = node
                else:
                    mono_groups.append(node)
            size_target_group = len(target_group)
            #get distance
            shortest_distance = 999999999999999.0
            closest_other_group = ''
            for subtree in mono_groups:
                curr_distance = tree.get_distance(target_group, subtree, topology_only=True)
Example #34
0
    line = line.rstrip()
    (geneFamily, nodeRb, iesColumn, presence) = line.split() # nodes are in rb format
    # find what type of speciation event nodeRb corresponds to
    if float(presence) > cutoff:
        spe = d[geneFamily][nodeRb]
        if spe:
            homies.setdefault((geneFamily,iesColumn), []).append(spe)

# load species tree
# (basePath and asrRun are defined outside this excerpt)
t = Tree(os.path.join(basePath, 'analysis/', 'phyldogT' + asrRun, 'results', 'OutputSpeciesTree_ConsensusNumbered.tree'), format = 1)

# replace species names with speciation events and add 0 to root node label
for l in t.traverse():
    if(l.is_leaf()):
        # a leaf name of the form <x>_<y>_<digits> is reduced to the trailing
        # digits; names that do not match the pattern are left unchanged
        l.name = (re.sub(r'.+_.+_(\d+)', r'\1', l.name))
    elif(l.is_root()):
        if(l.name):
            # abort the script, reporting the unexpected root name
            quit(l.name) # is root named?
        else:
            l.name = '0'


# homies maps (geneFamily, iesColumn) to a list of speciation-event labels;
# emit one tab-separated line per key
for (geneFamily, iesColumn) in homies:
    L = homies[(geneFamily, iesColumn)]
    if(len(L) == 1):
        # if only one node
        print('\t'.join([geneFamily, iesColumn, L[0]]))
    else:
        # several nodes: report the name of their common ancestor in the
        # species tree
        ancestor = t.get_common_ancestor(L)
        print('\t'.join([geneFamily, iesColumn, ancestor.name]))
Example #35
0
# load species tree
t = Tree(
    '/home/dsellis/data/IES/analysis/phyldog/results/OutputSpeciesTree_ConsensusNumbered.tree',
    format=1)

# replace species names with speciation events and add 0 to root node label
for l in t.traverse():
    if (l.is_leaf()):
        # a leaf name of the form <x>_<y>_<digits> is reduced to the trailing
        # digits; names that do not match the pattern are left unchanged
        l.name = (re.sub(r'.+_.+_(\d+)', r'\1', l.name))
    elif (l.is_root()):
        if (l.name):
            # abort the script, reporting the unexpected root name
            quit(l.name)  # is root named?
        else:
            l.name = '0'

# read line by line firstIES.dat
# The file is opened in a `with` block so the handle is closed even if a line
# fails to parse or a node lookup raises.
with open('/home/dsellis/data/IES/analysis/tables/firstIES.dat', 'r') as f:
    # the first line is a header and is not processed as data
    header = f.readline()
    print('\t'.join(['cluster', 'iesColumn', 'spEvent']))
    for line in f:
        line = line.rstrip()
        L = line.split()
        if not L:
            # skip blank lines (e.g. a trailing newline at the end of the
            # file) instead of failing on L.pop(0)
            continue
        # first two columns identify the record; the rest are node labels
        cluster = L.pop(0)
        iesColumn = L.pop(0)
        if (len(L) == 1):
            # if only one node
            print('\t'.join([cluster, iesColumn, L[0]]))
        else:
            # several nodes: report the name of their common ancestor
            ancestor = t.get_common_ancestor(L)
            print('\t'.join([cluster, iesColumn, ancestor.name]))
Example #36
0
    tree.render(file_name=sys.argv[1] + "_" + cluster + ".pdf", tree_style=ts, w=width)


# Command-line arguments: argv[1] is the tree passed to Tree(), argv[2] is a
# mode string that is compared with "small" and used as a key into `size`.
big_tree = Tree(sys.argv[1])
mode = sys.argv[2]
# get_meta_new, get_colours, clusters, colours and size are defined outside
# this excerpt.
metadata = {}
metadata = get_meta_new(metadata, big_tree)
colourDict = get_colours(clusters, big_tree, colours)

# remove dodgy sample
# (search_nodes(...)[0] raises IndexError if no node carries this exact name)
big_tree.search_nodes(name="'EBOV|EMLab-RT|IPDPFHGINSP_GUI_2015_5339||GIN|Conakry|?|MinION_LQ05|2015-04-08'")[0].delete(
    preserve_branch_length=True
)
# root the same as the MCC tree
ancestor = big_tree.get_common_ancestor(
    "'EBOV|EMLab|EM_079422|KR817187|GIN|Macenta|?||2014-03-27'",
    "'EBOV|EMLab|Gueckedou-C05|KJ660348|GIN|Gueckedou|?||2014-03-19'",
)
big_tree.set_outgroup(ancestor)
big_tree.ladderize()

ts = TreeStyle()
ts.show_leaf_name = False
# ts.show_branch_support = True
# default scale, overridden below when mode is "small"
ts.scale = 100000
if mode == "small":
    ts.scale = 750000

# add legend
# one row per colourDict key: a coloured circle in column 0 and the key as
# text in column 1
for each in colourDict.keys():
    ts.legend.add_face(CircleFace(radius=size[mode] / 2, color=colourDict[each]), column=0)
    ts.legend.add_face(TextFace(each, ftype="Helvetica", fsize=size[mode]), column=1)
Example #37
0
                          "grnPRASc_MMETSP0941_Gene.14464-Transcript_5625_Chlorophyta_Prasinococcales"],
               "Clade2": ["crypGONIp_MMETSP0107_Gene.30083-Transcript_20766_Cryptophyta_Cryptomonadales",
                          "Phenylobacterium_sp._RIFCSPHIGHO2_01_FULL_69_31_OHB27812.1"]
              }
#or make them available in a tsv:
# each row contributes ancestors_d[column 0] = [column 1, column 2]
with open('_ancestors.tsv', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    try:
        # The dict comprehension is evaluated completely before update() is
        # called, so a single short row (IndexError) means that no row from
        # the file is added at all.
        ancestors_d.update({r[0]: [r[1], r[2]] for r in reader})
    except IndexError:
        #incomplete line
        print("Incomplete processing of ancestor lineages!")
        print(ancestors_d)

# Root the tree on the common ancestor of the two leaves registered for this
# file (filename and t are defined outside this excerpt).
try:
    ancestor = t.get_common_ancestor(ancestors_d[filename])
    #print(ancestor)
    t.set_outgroup(ancestor)
except KeyError:
    # no ancestors_d entry for this filename: report the current root and stop
    print("Root not selected!")
    print(t.get_tree_root())
    quit()
t.ladderize(direction=1)

#select scale 0-1.0 or 0-100 for support values
# NOTE(review): iterating supportscache presumably yields the tree nodes --
# confirm against the get_cached_content() return type.
supportscache = t.get_cached_content(store_attr="support")
supportslist = [x.support for x in supportscache]
# a maximum of exactly 1 selects the 0-1.0 threshold; anything else selects
# the 0-100 threshold
if max(supportslist) == 1:
    minsupport = 0.85
else:
    minsupport = 85
Example #38
0
# Show the tree before rerooting (t is expected to be defined earlier in the
# script; its construction is not part of this excerpt).
print t
#          /-A
#         |
#         |          /-H
#---------|---------|
#         |          \-F
#         |
#         |          /-B
#          \--------|
#                   |          /-E
#                    \--------|
#                              \-D
#
# Let's define the ancestor of E and D as the tree outgroup.  Of
# course, the definition of an outgroup will depend on user criteria.
ancestor = t.get_common_ancestor("E", "D")
t.set_outgroup(ancestor)
# NOTE(review): the message below contains typos ("rooteda", "that"); it is
# runtime output and is left unchanged here.
print "Tree rooteda at E and D's ancestor is more basal that the others."
print t
#
#                    /-B
#          /--------|
#         |         |          /-A
#         |          \--------|
#         |                   |          /-H
#---------|                    \--------|
#         |                              \-F
#         |
#         |          /-E
#          \--------|
#                    \-D
    tree = Tree(args.tree, format=1)
    nwk = Tree(args.nwk, format=1)
    assert (len(tree) == len(nwk))
    tree_leaf_names = set(tree.get_leaf_names())
    for leaf in nwk:
        if leaf.name not in tree_leaf_names:
            try:
                leaf.name = next(n for n in tree_leaf_names
                                 if leaf.name.split("_")[0] == n.split("_")[0])
            except StopIteration:
                leaf.name = next(n for n in tree_leaf_names
                                 if leaf.name.split("_")[1] == n.split("_")[1])
    assert (set(nwk.get_leaf_names()) == tree_leaf_names)
    root_age = nwk.get_closest_leaf()[1]
    df = [["Root", root_age, root_age, root_age]]
    for n in nwk.iter_descendants(strategy='postorder'):
        if not n.is_leaf():
            name = tree.get_common_ancestor(n.get_leaf_names()).name
            if n.dist <= 0:
                print("Node " + name +
                      " is attached to it's parent, thus it's discarded.")
                continue
            age = n.get_closest_leaf()[1]
            df += [[name, age, age, age]]

    header = ["NodeName", "Age", "LowerBound", "UpperBound"]
    pd.DataFrame(df).to_csv(args.nwk.replace(".nwk", ".tsv"),
                            index=False,
                            header=header,
                            sep="\t")
Example #40
0
# Collect the outgroup names, one per line (outg is opened and outgroups is
# created outside this excerpt).
for line in outg:
    outgroups.append(line.rstrip())
outg.close()

# Target taxa: one name per line of the file given as argv[2]
target_taxa = []
tt = open(sys.argv[2])
for line in tt:
    target_taxa.append(line.rstrip())
tt.close()

#now read in a collection of trees, calc branch lengths over sample, summarise and print out
branch_lengths = defaultdict(list) #key = taxa, value = list of brlens
# NOTE(review): treefile is not closed anywhere in the visible code.
treefile = open(sys.argv[3])
for line in treefile:
    # one tree per line
    curr_tree = Tree(line.rstrip())
    root_node = curr_tree.get_common_ancestor(outgroups)
    # reroot only when the outgroup ancestor is not already the tree itself
    if curr_tree != root_node:
        curr_tree.set_outgroup(root_node)
    print curr_tree
    #bundle = curr_tree.check_monophyly(values=outgroups,target_attr='name')
    #print bundle
    #if bundle[0] == False:
    #    continue
    #find common ancestor of the target taxa, and use this as the reference node for calculating branch lengths. This might not always be the measure you want!
    reference_node = curr_tree.get_common_ancestor(target_taxa)
    #if reference_node != curr_tree:
    #    curr_tree.set_outgroup(reference_node)
    #calc distance from the reference node to each taxon of interest
    for taxon in target_taxa:
        dist = curr_tree.get_distance(taxon, reference_node) 
        branch_lengths[taxon].append(dist)
Example #41
0
#         |                   |          /-L
#         |                    \--------|
#---------|                              \-M
#         |
#         |                    /-B
#         |          /--------|
#         |         |         |          /-J
#         |         |          \--------|
#          \--------|                    \-K
#                   |
#                   |          /-E
#                    \--------|
#                              \-D
#
# Each main branch of the tree is independently rooted.
# node1 and node2 are the common ancestors of (A, H) and (B, D) respectively;
# set_outgroup is then called on those nodes rather than on t itself.
node1 = t.get_common_ancestor("A", "H")
node2 = t.get_common_ancestor("B", "D")
node1.set_outgroup("H")
node2.set_outgroup("E")
print "Tree after rooting each node independently:"
print t
#
#                              /-F
#                             |
#                    /--------|                    /-L
#                   |         |          /--------|
#                   |         |         |          \-M
#                   |          \--------|
#          /--------|                   |          /-A
#         |         |                    \--------|
#         |         |                              \-C
Example #42
0
# Recompute the support of every internal node of main_tree as the percentage
# of bootstrap trees in which the same set of leaves forms a clade.
# (main_tree, bootstrap_trees and logger are defined outside this excerpt.)
for main_node in main_tree.traverse(strategy="levelorder"):
    if main_node.is_leaf():
        continue
    new_support = 0
    # Leaf names of the clade below *this* node. Taking the leaves of the
    # whole main_tree here would compare the full leaf set for every node, so
    # each node would trivially match whenever the trees share their leaves.
    clade_leaf_names = main_node.get_leaf_names()
    # Built once per node; compared against every bootstrap tree below
    clade_leaf_name_set = set(clade_leaf_names)
    # Now check for each bs_tree whether the common ancestor of these same
    # leaves spans exactly this clade or also contains other leaves
    for bs_tree in bootstrap_trees:

        # Get all node objects for all the leaves by name
        clade_leaf_nodes_in_bs_tree = [
            bs_tree.get_leaves_by_name(leaf_name)[0]
            for leaf_name in clade_leaf_names
        ]
        # Get common ancestor in bs_tree
        common_ancestor_node = bs_tree.get_common_ancestor(
            clade_leaf_nodes_in_bs_tree)
        # Get leafnames of the common ancestor node and check if they are the same
        bs_tree_clade_leaf_names = common_ancestor_node.get_leaf_names()
        if clade_leaf_name_set == set(bs_tree_clade_leaf_names):
            # Clades match!
            new_support = new_support + 1
    # Percentage of bootstrap trees containing the clade; the 100.0 factor
    # keeps the division floating-point regardless of interpreter version
    new_support = 100.0 * new_support / len(bootstrap_trees)
    logger.debug("Support for internal node was {}, now is {}".format(
        main_node.support, new_support))
    main_node.support = new_support

# Output the new tree
logger.info("Writing new bootstrapped tree to STDOUT.")
print(main_tree.write())
Example #43
0
# Reroot the tree on the branch separating archaea from bacteria and write
# the result to <argv[1]>_rerooted (tree and id_to_domain are defined outside
# this excerpt).
print tree

archaea = [] #make a list of archaea that are in the tree
bacteria = []
#check the domain of each taxon in the tree
for taxon in tree:
	print taxon.name + "\t" + id_to_domain[taxon.name]
	# every taxon whose domain is not 'Archaea' is counted as bacteria
	if id_to_domain[taxon.name] == 'Archaea':
		archaea.append(taxon.name)
	else:
		bacteria.append(taxon.name)

#first, check if archaea are monophyletic in the tree

if tree.check_monophyly(values=archaea, target_attr="name")[0] == True:

	#find the branch separating archaea and bacteria, and reroot the tree on that
	archaea_ancestor = tree.get_common_ancestor(archaea) 
	tree.set_outgroup(archaea_ancestor)
#otherwise fall back to the bacteria, if they are monophyletic
elif tree.check_monophyly(values=bacteria, target_attr="name")[0] == True:
	bacteria_ancestor = tree.get_common_ancestor(bacteria)
	tree.set_outgroup(bacteria_ancestor)
else:
	#neither archaea nor bacteria were monophyletic, so print some error and quit
	print sys.argv[1] + ": neither A nor B monophyletic."
	quit()

outfile_name = sys.argv[1] + "_rerooted"
tree.write(outfile=outfile_name)