def getTheTrees():
    """Build the NCBI taxonomy as a dictionary of tree nodes keyed by taxid.

    All input files are read from the ``taxo/`` folder, which has to be
    prepared beforehand: the NCBI taxdump is downloaded there and TAXREF
    must be fetched by hand and placed in the same folder.

    Files read (column numbers are 0-based):

    * ``TAXREFv11.txt`` -- tab-separated; column 14 is used as the
      scientific name and column 19 as the French common name.
    * ``ranks_FR.txt`` -- tab-separated; English rank name, French rank name.
    * ``names.dmp`` -- ``|``-separated; taxid, name, (unused), name type.
    * ``nodes.dmp`` -- ``|``-separated; taxid, parent taxid, rank.

    Returns:
        dict mapping each taxid (str) to its ``Tree`` node. Every node
        carries ``name``, ``taxid``, ``sci_name``, ``common_name``,
        ``synonym``, ``authority`` and ``common_name_FR``. ``rank`` and
        ``rank_FR`` are set once the taxid has been seen as a child in
        ``nodes.dmp``.
    """
    print("Getting french translations...")
    # scientific name -> list of French common names
    TRANS = {}
    with open("taxo/TAXREFv11.txt") as f:
        for line in f:
            fields = line.split("\t")
            sciname = fields[14]
            comnameFR = fields[19]
            if comnameFR == '':
                continue
            TRANS.setdefault(sciname, []).append(comnameFR)

    print("Getting rank names in french...")
    # English rank name -> French rank name
    RANKS = {}
    with open("taxo/ranks_FR.txt") as f:
        for line in f:
            fields = line.split("\t")
            # rstrip() drops the trailing newline of the last column
            RANKS[fields[0]] = fields[1].rstrip()

    class TaxonNames:
        # Names collected from names.dmp for a single taxid.
        def __init__(self):
            self.sci_name = ""
            self.authority = ""
            self.synonym = ""
            self.common_name = []
            self.common_name_FR = []

    # taxid -> TaxonNames
    ATTR = {}
    print("Reading NCBI taxonomy...")
    with open("taxo/names.dmp") as f:
        for line in f:
            fields = line.split("|")
            taxid = fields[0].replace("\t", "")
            tid_val = fields[1].replace("\t", "")
            tid_type = fields[3].replace("\t", "")
            # TODO: maybe add small clean-up filters here (e.g. escaping
            # single quotes in the names) -- to be decided.

            record = ATTR.get(taxid)
            if record is None:
                record = ATTR[taxid] = TaxonNames()

            if tid_type == "scientific name":
                record.sci_name = tid_val
                # attach the French translation, if any
                if tid_val in TRANS:
                    record.common_name_FR = TRANS[tid_val]
            elif tid_type == "authority":
                # several authorities are joined with ", "
                if record.authority != "":
                    record.authority = record.authority + ", " + tid_val
                else:
                    record.authority = tid_val
            elif tid_type == "synonym":
                # several synonyms are joined with ", "
                if record.synonym != "":
                    record.synonym = record.synonym + ", " + tid_val
                else:
                    record.synonym = tid_val
            elif tid_type == "common name":
                record.common_name.append(tid_val)

    def annotate(node, taxid):
        # Copy the identifiers and collected names of `taxid` onto `node`.
        names = ATTR[taxid]
        node.name = taxid
        node.taxid = taxid
        node.sci_name = names.sci_name
        node.common_name = names.common_name
        node.synonym = names.synonym
        node.authority = names.authority
        node.common_name_FR = names.common_name_FR
        return node

    T = {}
    print("Building the NCBI taxonomy tree...")
    with open('taxo/nodes.dmp') as fp:
        fp.readline()  # skip the first line (the 1 | 1 edge)
        for line in fp:
            fields = line.split("|")
            son = fields[0].replace("\t", "")
            dad = fields[1].replace("\t", "")
            rank = fields[2].replace("\t", "")

            # A parent seen before its own line gets no rank yet; the rank
            # is filled in when the taxid later shows up as a child.
            if dad not in T:
                T[dad] = annotate(Tree(), dad)

            if son not in T:
                child = Tree()
                child.rank = rank
                child.rank_FR = RANKS[rank]
                T[son] = annotate(child, son)
            elif not hasattr(T[son], 'rank'):
                T[son].rank = rank
                T[son].rank_FR = RANKS[rank]

            T[dad].add_child(T[son])
    return T
    prop = sys.argv[
        8]  # whether the probability of a rate change should be proportional to the branch length (y by default)

# Open the tree file; stop the script if it cannot be opened.
try:
    f = open(treefile, 'r')
except IOError:
    print("Unknown file: ", treefile)
    sys.exit()

# The whole file content is handed to the tree parser as a single string.
line = f.read()
f.close()

# Starting ultrametric tree
ultra_tree = Tree(line, format=0)

# use_bl is switched off only when `prop` is exactly "n".
use_bl = prop != "n"

# The root of the tree starts with a rate of 1.0.
rates = {ultra_tree.get_tree_root(): 1.0}

number_of_small_changes_per_branch = {}
number_of_big_changes_per_branch = {}

average_dist = 0.0
n_branches = 0
for n in ultra_tree.traverse(strategy="preorder"):
    if n != ultra_tree.get_tree_root():
Example #3
0
        for line in f:
            token = line.rstrip().rsplit()
            d[token[0]] = token[1]
    return d


# Global counters, all starting at zero. The code that increments them is
# not part of this excerpt.
number_of_sets = 0
number_of_sets_with_duplication = 0
number_of_sets_with_speciation = 0
number_of_sets_with_relocalisation = 0
number_of_sets_with_duplication_and_relocalisation = 0
number_of_sets_with_speciation_and_relocalisation = 0

# Load Species Tree (the working directory is switched to the folder that
# holds the tree file, which is then opened by its bare file name).
os.chdir("/cellar/rona/Phytozome10/Phyldog")
species_tree = Tree("Phytozome10_constrainedTree_rooted_labelled.tree",
                    format=1)
# One entry per node of the species tree, keyed by node name, holding four
# counters that all start at 0 (their meaning is given on the closing
# bracket below).
number_of_relocalisations_on_species_tree_node = {}
for node in species_tree.traverse():
    number_of_relocalisations_on_species_tree_node[node.name] = [
        0, 0, 0, 0
    ]  # format - node.name: relocalisations following dup, total_dups, relocalisations from spec, total_specs

# Process every orthogroup locus tree found in `location_of_trees`.
os.chdir(location_of_trees)
for filename in glob.glob("OG0*.locus.tree"):
    # Drop the 11-character ".locus.tree" suffix to get the orthogroup id.
    orthogroup = (filename[:-11])
    node_2_node_dict = make_node_2_node_dict(
        orthogroup)  # ortho_node = species_node

    orthogroup_tree = Tree(filename, format=1)
    # NOTE(review): the working directory changes inside the loop while
    # `filename` is a relative name; the rest of the loop body is not visible
    # here -- confirm it changes back to `location_of_trees` before the next
    # iteration opens its tree.
    os.chdir(location_of_dup_reloc_files)
Example #4
0
    index = []
    alg = []
    for r in SeqIO.parse(fname, format="fasta"):
        index.append(r.id)
        alg.append(seq2vector(r.seq))

    named_index = {name: i for i, name in enumerate(index)}
    return np.array(alg), named_index, index


# Command-line arguments: tree file, alignment file and a numeric threshold.
tree_file = sys.argv[1]
alg_file = sys.argv[2]
thr = float(sys.argv[3])

# NOTE(review): judging by the return statement just above this script,
# load_alg yields (array of sequence vectors, name -> row index, list of
# names) -- confirm against the full definition of load_alg.
alg, index, i2name = load_alg(alg_file)
tree = Tree(tree_file)
# Re-root the tree on its midpoint outgroup.
tree.set_outgroup(tree.get_midpoint_outgroup())
# presumably maps every node to the names of the leaves below it -- verify
# against the ete3 documentation of get_cached_content
node2content = tree.get_cached_content(store_attr="name")

for n in tree.traverse("levelorder"):
    if n.children:
        # Only the first two children are considered.
        ch1 = n.children[0]
        ch2 = n.children[1]

        # Row indices in `alg` of the leaves found under each child.
        leaves_left = [index[name] for name in node2content[ch1]]
        leaves_right = [index[name] for name in node2content[ch2]]
        # Skip nodes where either side has fewer than 3 leaves.
        if len(leaves_left) < 3 or len(leaves_right) < 3:
            continue

        # Count, per column, the non-zero entries among the left-side rows.
        rows, cols = alg[tuple(leaves_left), :].nonzero()
        colres_left = Counter(cols)
Example #5
0


#http://stackoverflow.com/questions/23172293/use-python-to-extract-branch-lengths-from-newick-format
# Matches integer or decimal numbers; used below to detect whether a Newick
# string contains any number at all (taken as evidence of branch lengths).
pattern = re.compile(r"\b[0-9]+(?:\.[0-9]+)?\b")


logger.debug("trees:\n  * %s", "\n  * ".join(lnf))
nb_input_tree_before = len(lnf)
logger.debug("%s trees in %s", nb_input_tree_before, args.tree_dir)


# Iterate over a snapshot of the list: removing entries from `lnf` while
# iterating over `lnf` itself makes the loop skip the file that follows each
# removed one, so some invalid trees were never checked.
for treefilename in list(lnf):
    # test if a tree
    try:
        t=Tree(treefilename)
    except Exception:
        logger.warning("%s is not a newick tree, this tree will not be used",treefilename)
        lnf.remove(treefilename)
        t=""
    if t:
        with open(treefilename,"r") as treefile:
            tree=treefile.read().strip()
        #test if branch length
        branch_lengths = pattern.findall(tree)
        if branch_lengths == []:
            logger.warning("No branch length in %s, this tree will not be used",treefilename)
            lnf.remove(treefilename)

nb_input_tree_after = len(lnf)
    t = readTreeFromFile(file)

    index = 0
    for node in t.traverse("postorder"):
        if not node.is_leaf():
            node.name=str(index)
            node.support=str(index)
            index += 1

    id2Height = dict()
    nodeId2LeafListRef = dict()
    leafList2NodeIdRef = dict()
    idToDescendants = dict()
    # Now we want to get the calibrations according to the options that have been user-input.

    t_begin = Tree()

    # Balanced or not?
    if ('y' in balanced):
        # Getting calibrations from both sides of the root
        t_begin = t
    else:
        # Getting calibrations only from one side
        choices = [0,1]
        choice  = random.choice(choices)
        print("Choosing calibrations from subtree: ", choice)
        t_begin = t.get_children()[choice]
    print("Number of nodes in sampled subtree: ", len(t_begin.get_descendants()))

    id2Height = getInternalNodeHeights( t_begin )
    nodeId2LeafListRef, leafList2NodeIdRef, idToDescendants = getNameToLeavesAndIdToDescendantIdsLink( t_begin )
    f = open(file, 'r')
except IOError:
    print("Unknown file: ", file)
    sys.exit()

#File where I store useful functions
# NOTE(review): exec() runs whatever this file contains with the privileges of
# the current process; a regular import of a module would be safer.
with open("/home/boussau/Programming/Notebooks/code/functions.py") as _functions_src:
    exec(_functions_src.read())

# Read one tree per line of the input file.
allTrees = list()
for l in f:
    l2 = l.strip()
    # removing anything within square brackets
    if "[" in l2:
        # Raw string: "\[", "\]" and "\s" are invalid escape sequences in a
        # plain string literal and trigger a warning on recent Pythons.
        l2 = re.sub(r'\[[^\]]+\]\s*', '', l2)
    print(l2)
    t = Tree(l2)
    createNameToLeavesLink(t)
    allTrees.append(t)
f.close()

# Collect the id -> height mapping of every tree.
id2Heights = list()
for t in allTrees:
    node2Height, id2Height = getNodeHeights(t)
    print(id2Height)
    id2Heights.append(id2Height)

#print(len(id2Heights))

# Creating a uniform weight vector
weights = [1] * len(id2Heights)
# The first tree (copied) is passed as the tree argument of the output.
outputsWeightedChronogram(allTrees[0].copy(), id2Heights, out, weights)
Example #8
0
def plot_heat_tree_V1(taxid2n,
                      tree_file,
                      genes,
                      taxid2st=False,
                      leaf_label_conversion_dico=False):
    '''
    Build a midpoint-rooted tree with a heatmap aligned next to its leaves.

    One aligned cell is added per value of the data row of each leaf. Cells
    with a value > 0 are shaded in blue (colour scale normalised between 0.8
    and 1), all other cells are white. Column headers taken from *genes* are
    added to the aligned header of the tree style while the first leaf is
    processed. Nothing is rendered here: the tree and its style are returned
    to the caller.

    TODO: optionally display the value inside each cell.

    Parameters
    ----------
    taxid2n: dict
        Maps ``str(leaf name)`` to the sequence of values to draw for that
        leaf. Leaves without an entry get a single 0 value.

    tree_file: str
        Path to the tree file in Newick format.

    genes: sequence
        Column labels; ``genes[col]`` is the header of column ``col``.

    taxid2st: dict, optional
        Maps a leaf name to a string that is appended, in parentheses, to
        the leaf label. Leaves without an entry are left unchanged.

    leaf_label_conversion_dico: dict, optional
        Maps a leaf name to the label that should replace it. Leaves without
        an entry keep their name.

    Returns
    -------
    tuple
        ``(tree, number of leaves, tree style)``.
    '''

    import matplotlib.cm as cm
    from matplotlib.colors import rgb2hex
    import matplotlib as mpl

    def _cell(color):
        # Blank, margin-less face used as one heatmap cell.
        face = TextFace('  ')
        face.margin_top = 0
        face.margin_right = 0
        face.margin_left = 0
        face.margin_bottom = 0
        face.inner_background.color = color
        face.opacity = 1.
        return face

    t1 = Tree(tree_file)
    tss = TreeStyle()
    # Calculate the midpoint node and set it as tree outgroup
    R = t1.get_midpoint_outgroup()
    t1.set_outgroup(R)

    # Blue colour scale used for the positive cells
    norm = mpl.colors.Normalize(vmin=0.8, vmax=1)
    m2 = cm.ScalarMappable(norm=norm, cmap=cm.Blues)

    leaf_number = 0
    for lf in t1.iter_leaves():
        leaf_number += 1
        lf.branch_vertical_margin = 0

        try:
            data = taxid2n[str(lf.name)]
        except (LookupError, TypeError):
            # no data for this leaf: draw a single empty cell
            data = [0]

        try:
            st = taxid2st[lf.name]
        except (LookupError, TypeError):
            # leaf missing from the mapping, or taxid2st left to its
            # default (False)
            st = False

        # The original body tested an undefined name here
        # (`accession2description`); the parameter meant for relabelling
        # leaves is `leaf_label_conversion_dico`.
        if leaf_label_conversion_dico:
            try:
                lf.name = leaf_label_conversion_dico[lf.name]
            except (LookupError, TypeError):
                pass
        if st:
            lf.name = lf.name + ' (' + st + ')'

        for col, value in enumerate(data):

            if leaf_number == 1:
                # Column headers are created once, while handling the first
                # leaf, so only the columns of that leaf get a header.
                header = TextFace('%s' % (genes[col]), fsize=6)
                header.vt_align = 2
                header.hz_align = 2
                header.rotation = 270
                header.margin_top = 0
                header.margin_right = 0
                header.margin_left = 4
                header.margin_bottom = 0
                header.inner_background.color = "white"
                header.opacity = 1.
                tss.aligned_header.add_face(header, col)

            if value > 0:
                color = rgb2hex(m2.to_rgba(float(value)))
            else:
                color = "white"
            lf.add_face(_cell(color), col, position="aligned")

    return t1, leaf_number, tss
Example #9
0
def deepbiome_draw_phylogenetic_tree(
        log,
        network_info,
        path_info,
        num_classes,
        file_name="%%inline",
        img_w=500,
        branch_vertical_margin=20,
        arc_start=0,
        arc_span=360,
        node_name_on=True,
        name_fsize=10,
        tree_weight_on=True,
        tree_weight=None,
        tree_level_list=['Genus', 'Family', 'Order', 'Class', 'Phylum'],
        weight_opacity=0.4,
        weight_max_radios=10,
        phylum_background_color_on=True,
        phylum_color=[],
        phylum_color_legend=False,
        show_covariates=True,
        verbose=True):
    """
    Draw phylogenetic tree

    The tree is built level by level, starting from the last entry of
    `tree_level_list` (attached directly to the root) down to the first
    one, and is then rendered in circular mode.

    Parameters
    ----------
    log (logging instance) :
        python logging instance for logging
    network_info (dictionary) :
        python dictionary with network_information
    path_info (dictionary):
        python dictionary with path_information
    num_classes (int):
        number of classes for the network. 0 for regression, 1 for binary classification.
    file_name (str):
        name of the figure for save.
        - "*.png", "*.jpg"
        - "%%inline" for notebook inline output.
        default="%%inline"
    img_w (int):
        image width (pt)
        default=500
    branch_vertical_margin (int):
        vertical margin for branch
        default=20
    arc_start (int):
        angle that arc start
        default=0
    arc_span (int):
        total amount of angle for the arc span
        default=360
    node_name_on (boolean):
        show the name of the last leaf node if True
        default=True
    name_fsize (int):
        font size for the name of the last leaf node
        default=10
    tree_weight_on (boolean):
        show the amount and the direction of the weight for each edge in the tree by circle size and color.
        default=True
    tree_weight (ndarray):
        reference tree weights
        default=None
        NOTE(review): the body indexes `tree_weight` unconditionally
        (`tree_weight[-1].columns`), so the default of None cannot work;
        the entries are used like pandas DataFrames (`.columns`, `.index`)
        -- confirm the expected type with the callers.
    tree_level_list (list):
        name of each level of the given reference tree weights
        default=['Genus', 'Family', 'Order', 'Class', 'Phylum']
    weight_opacity  (float):
        opacity for weight circle
        default= 0.4
    weight_max_radios (int):
        maximum radius for weight circle
        default= 10
    phylum_background_color_on (boolean):
        show the background color for each phylum based on `phylum_color`.
        default= True
    phylum_color (list):
        specify the list of background colors for phylum level. If `phylum_color` is empty, it will arbitrarily assign the color for each phylum.
        default= []
    phylum_color_legend (boolean):
        show the legend for the background colors for phylum level
        default= False
    show_covariates (boolean):
        show the effect of the covariates
        default= True
    verbose (boolean):
        show the log if True
        default=True

    Returns
    -------
    The value returned by `t.render(file_name=file_name, w=img_w, tree_style=ts)`
    for the tree built here.

    Examples
    --------
    Draw phylogenetic tree

    deepbiome_draw_phylogenetic_tree(log, network_info, path_info, num_classes, file_name = "%%inline")
    """

    os.environ[
        'QT_QPA_PLATFORM'] = 'offscreen'  # for tree figure (https://github.com/etetoolkit/ete/issues/381)
    # The reader class is looked up by name in the `readers` module.
    reader_class = getattr(readers,
                           network_info['model_info']['reader_class'].strip())
    reader = reader_class(log, path_info, verbose=verbose)
    data_path = path_info['data_info']['data_path']
    # Preferred input: the list of count files (only names containing '.csv'
    # are kept). If anything in that branch fails, fall back to the single
    # `x_path` file located under `data_path`.
    # NOTE(review): the bare `except` also hides unrelated errors (e.g. a
    # malformed list file) -- consider catching KeyError only.
    try:
        count_path = path_info['data_info']['count_path']
        x_list = np.array(
            pd.read_csv(path_info['data_info']['count_list_path'],
                        header=None).iloc[:, 0])
        x_path = np.array([
            '%s/%s' % (count_path, x_list[fold])
            for fold in range(x_list.shape[0]) if '.csv' in x_list[fold]
        ])
    except:
        x_path = np.array([
            '%s/%s' % (data_path, path_info['data_info']['x_path'])
            for fold in range(1)
        ])

    # Only the first input file is read.
    reader.read_dataset(x_path[0], None, 0)

    # The network class is looked up by name in the `build_network` module.
    network_class = getattr(
        build_network, network_info['model_info']['network_class'].strip())
    network = network_class(network_info,
                            path_info,
                            log,
                            fold=0,
                            num_classes=num_classes,
                            tree_level_list=tree_level_list,
                            is_covariates=reader.is_covariates,
                            covariate_names=reader.covariate_names,
                            verbose=False)

    # No colors supplied: draw random CSS4 color names, one per unique
    # phylum (the covariate-extended column is used when covariates are
    # present and shown).
    if len(phylum_color) == 0:
        colors = mcolors.CSS4_COLORS
        colors_name = list(colors.keys())
        if reader.is_covariates and show_covariates:
            phylum_color = np.random.choice(
                colors_name,
                network.phylogenetic_tree_info['Phylum_with_covariates'].
                unique().shape[0])
        else:
            phylum_color = np.random.choice(
                colors_name,
                network.phylogenetic_tree_info['Phylum'].unique().shape[0])

    # Default style shared by the nodes of the tree.
    basic_st = NodeStyle()
    basic_st['size'] = weight_max_radios * 0.5
    basic_st['shape'] = 'circle'
    basic_st['fgcolor'] = 'black'

    # Root node, drawn with size 0.
    t = Tree()
    root_st = NodeStyle()
    root_st["size"] = 0
    t.set_style(root_st)

    # level name -> {node name -> tree node}; 'root' maps to the root itself
    tree_node_dict = {}
    tree_node_dict['root'] = t

    # First layer: the last level of `tree_level_list` hangs directly below
    # the root; its node names are the columns of the last weight table.
    upper_class = 'root'
    lower_class = tree_level_list[-1]
    lower_layer_names = tree_weight[-1].columns.to_list()

    layer_tree_node_dict = {}
    phylum_color_dict = {}
    for j, val in enumerate(lower_layer_names):
        t.add_child(name=val)
        leaf_t = t.get_leaves_by_name(name=val)[0]
        leaf_t.set_style(basic_st)
        layer_tree_node_dict[val] = leaf_t
        if lower_class == 'Phylum' and phylum_background_color_on:
            # independent copy of the style so that each phylum can carry
            # its own background color
            phylum_st = copy.deepcopy(basic_st)
            phylum_st["bgcolor"] = phylum_color[j]
            phylum_color_dict[val] = phylum_color[j]
            leaf_t.set_style(phylum_st)
    tree_node_dict[lower_class] = layer_tree_node_dict
    upper_class = lower_class
    upper_layer_names = lower_layer_names

    # Remaining layers: walk `tree_level_list` backwards and hang each level
    # below the nodes of the level built just before.
    for i in range(len(tree_level_list) - 1):
        lower_class = tree_level_list[-2 - i]
        if upper_class == 'Disease' and show_covariates == False:
            lower_layer_names = network.phylogenetic_tree_info[
                lower_class].unique()
        else:
            lower_layer_names = tree_weight[-i - 1].index.to_list()

        layer_tree_node_dict = {}
        for j, val in enumerate(upper_layer_names):
            parient_t = tree_node_dict[upper_class][val]
            if upper_class == 'Disease':
                # every node of the lower layer is attached below 'Disease'
                child_class = lower_layer_names
            else:
                # children are the lower-level names whose upper-level value
                # equals `val`
                child_class = network.phylogenetic_tree_info[lower_class][
                    network.phylogenetic_tree_info[upper_class] ==
                    val].unique()

            for k, child_val in enumerate(child_class):
                parient_t.add_child(name=child_val)
                leaf_t = parient_t.get_leaves_by_name(name=child_val)[0]
                if lower_class == 'Phylum' and phylum_background_color_on:
                    # NOTE(review): the color is picked with `k`, the
                    # position among the children of this parent, so phyla
                    # under different parents can get the same color --
                    # confirm this is intended.
                    phylum_st = copy.deepcopy(basic_st)
                    phylum_st["bgcolor"] = phylum_color[k]
                    phylum_color_dict[child_val] = phylum_color[k]
                    leaf_t.set_style(phylum_st)
                else:
                    leaf_t.set_style(basic_st)
                if tree_weight_on:
                    # Weights of this layer, rescaled so that the largest
                    # value equals `weight_max_radios`. This is recomputed
                    # for every child although it only depends on `i`.
                    edge_weights = np.array(tree_weight[-1 - i])
                    edge_weights *= (weight_max_radios / np.max(edge_weights))
                    if upper_class == 'Disease':
                        upper_num = 0
                    else:
                        upper_num = network.phylogenetic_tree_dict[
                            upper_class][val]
                    if upper_class == 'Disease' and reader.is_covariates == True and show_covariates:
                        lower_num = network.phylogenetic_tree_dict[
                            '%s_with_covariates' % lower_class][child_val]
                    else:
                        lower_num = network.phylogenetic_tree_dict[
                            lower_class][child_val]
                    # the rescaled weight is stored on the node as its
                    # "weight" feature and read back by layout() below
                    leaf_t.add_features(weight=edge_weights[lower_num,
                                                            upper_num])
                layer_tree_node_dict[child_val] = leaf_t
        tree_node_dict[lower_class] = layer_tree_node_dict
        upper_class = lower_class
        upper_layer_names = lower_layer_names

    def layout(node):
        # Layout function assigned to the tree style below.
        if "weight" in node.features:
            # Creates a sphere face whose size is proportional to node's
            # feature "weight"; RoyalBlue when the weight is > 0, Red
            # otherwise
            color = {1: "RoyalBlue", 0: "Red"}[int(node.weight > 0)]
            C = CircleFace(radius=node.weight, color=color, style="circle")
            # Let's make the sphere transparent
            C.opacity = weight_opacity
            # And place as a float face over the tree
            faces.add_face_to_node(C, node, 0, position="float")

        # NOTE(review): `&` is the bitwise operator; it only behaves like
        # `and` here as long as `node_name_on` is a real boolean.
        if node_name_on & node.is_leaf():
            # Add node name to leaf nodes
            N = AttrFace("name", fsize=name_fsize, fgcolor="black")
            faces.add_face_to_node(N, node, 0)

    ts = TreeStyle()

    # Leaf names are drawn by layout(), so the built-in ones are disabled.
    ts.show_leaf_name = False
    ts.mode = "c"
    ts.arc_start = arc_start
    ts.arc_span = arc_span
    ts.layout_fn = layout
    ts.branch_vertical_margin = branch_vertical_margin
    ts.show_scale = False

    # Optional legend: one colored circle and one label per phylum, sorted
    # by phylum name.
    if phylum_color_legend:
        for phylum_name in np.sort(list(phylum_color_dict.keys())):
            color_name = phylum_color_dict[phylum_name]
            ts.legend.add_face(CircleFace(weight_max_radios * 1, color_name),
                               column=0)
            ts.legend.add_face(TextFace(" %s" % phylum_name, fsize=name_fsize),
                               column=1)

    return t.render(file_name=file_name, w=img_w, tree_style=ts)


# #########################################################################################################################
# if __name__ == "__main__":
#     argdict = argv_parse(sys.argv)
#     try: gpu_memory_fraction = float(argdict['gpu_memory_fraction'][0])
#     except: gpu_memory_fraction = None
#     try: max_queue_size=int(argdict['max_queue_size'][0])
#     except: max_queue_size=10
#     try: workers=int(argdict['workers'][0])
#     except: workers=1
#     try: use_multiprocessing=argdict['use_multiprocessing'][0]=='True'
#     except: use_multiprocessing=False

#     ### Logger ############################################################################################
#     logger = logging_daily.logging_daily(argdict['log_info'][0])
#     logger.reset_logging()
#     log = logger.get_logging()
#     log.setLevel(logging_daily.logging.INFO)

#     log.info('Argument input')
#     for argname, arg in argdict.items():
#         log.info('    {}:{}'.format(argname,arg))

#     ### Configuration #####################################################################################
#     config_data = configuration.Configurator(argdict['path_info'][0], log)
#     config_data.set_config_map(config_data.get_section_map())
#     config_data.print_config_map()

#     config_network = configuration.Configurator(argdict['network_info'][0], log)
#     config_network.set_config_map(config_network.get_section_map())
#     config_network.print_config_map()

#     path_info = config_data.get_config_map()
#     network_info = config_network.get_config_map()
#     test_evaluation, train_evaluation, network = deepbiome_train(log, network_info, path_info, number_of_fold=20)
Example #10
0
def plot_heat_tree(biodb, taxid2n, tree_file):
    '''
    Build a midpoint-rooted tree with one value cell aligned next to each leaf.

    Leaf names of the tree are taxon ids: they are used to look up the value
    to display and are then replaced by the genome description obtained from
    the database. Cells holding a value > 0 get a blue background, the others
    a white one. Nothing is rendered here; the tree is returned to the
    caller.

    Parameters
    ----------
    biodb: str
        Name of the database, passed to ``manipulate_biosqldb.load_db`` and
        ``manipulate_biosqldb.taxon_id2genome_description``.

    taxid2n: dict
        Maps ``str(leaf name)`` to the value shown next to that leaf. Leaves
        without an entry are shown with 0.

    tree_file: str
        Path to the tree file in Newick format. Leaf names must be
        convertible to ``int``.

    Returns
    -------
    tuple
        ``(tree, number of leaves)``.
    '''

    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)
    taxid2organism = manipulate_biosqldb.taxon_id2genome_description(
        server, biodb, True)

    t1 = Tree(tree_file)
    # Root the tree on its midpoint outgroup
    t1.set_outgroup(t1.get_midpoint_outgroup())

    leaf_number = 0
    for leaf_number, lf in enumerate(t1.iter_leaves(), start=1):
        lf.branch_vertical_margin = 0

        # The value must be fetched before the leaf is renamed below.
        try:
            data = [taxid2n[str(lf.name)]]
        except:
            data = [0]

        lf.name = taxid2organism[int(lf.name)]

        for col, value in enumerate(data):
            background = "#81BEF7" if value > 0 else "white"
            cell = TextFace(' %s ' % str(value))
            cell.margin_top = 2
            cell.margin_right = 2
            cell.margin_left = 2
            cell.margin_bottom = 2
            cell.inner_background.color = background
            cell.opacity = 1.
            lf.add_face(cell, col, position="aligned")

    return t1, leaf_number
Example #11
0
def plot_heatmap_tree_locus(biodb,
                            tree_file,
                            taxid2count,
                            taxid2identity=False,
                            taxid2locus=False,
                            reference_taxon=False,
                            n_paralogs_barplot=False):
    '''
    Plot a tree with aligned columns giving the number of homologs per leaf.

    Optional columns:
        - bar plot of the count relative to the largest count on the tree
        - identity of the closest homolog
        - locus tag of the closest homolog

    Parameters
    ----------
    biodb:
        Database name handed to ``manipulate_biosqldb.load_db``.
    tree_file:
        Tree description handed to ``Tree(...)``. Leaf names are converted
        with ``str()`` and used as keys of the ``taxid2*`` mappings.
    taxid2count: dict
        Leaf name -> number of homologs. Leaves missing from the mapping are
        added to it with a count of 0 (the mapping is modified in place).
    taxid2identity: dict, optional
        Leaf name -> identity (0-100). Missing or non-numeric entries are
        displayed as '-'.
    taxid2locus: dict, optional
        Leaf name -> locus tag. Missing entries are displayed as '-'.
    reference_taxon: optional
        Leaf name of the reference; its identity cell is blanked and
        highlighted, its locus tag is written in red.
    n_paralogs_barplot: bool, optional
        If true, add a bar showing the count as a percentage of the largest
        count found on the tree.

    Returns
    -------
    tuple
        ``(tree, leaf_number, tree_style)``; leaves are renamed with the
        descriptions returned by ``taxon_id2genome_description``.
    '''

    from chlamdb.biosqldb import manipulate_biosqldb

    def _make_text_face(text, margin, margin_left, background="white",
                        rotation=None):
        # Build a TextFace with uniform margins except for the left one.
        face = TextFace(text)
        face.margin_top = margin
        face.margin_right = margin
        face.margin_left = margin_left
        face.margin_bottom = margin
        if background is not None:
            face.inner_background.color = background
        face.opacity = 1.
        if rotation is not None:
            face.rotation = rotation
        return face

    server, db = manipulate_biosqldb.load_db(biodb)

    taxid2organism = manipulate_biosqldb.taxon_id2genome_description(
        server, biodb, True)

    t1 = Tree(tree_file)
    ts = TreeStyle()
    ts.draw_guiding_lines = True
    ts.guiding_lines_color = "gray"
    # Calculate the midpoint node
    R = t1.get_midpoint_outgroup()
    # and set it as tree outgroup
    t1.set_outgroup(R)

    leaf_number = 0

    # Leaves without any homolog get an explicit count of 0.
    for lf in t1.iter_leaves():
        if str(lf.name) not in taxid2count:
            taxid2count[str(lf.name)] = 0

    max_count = max([taxid2count[str(lf.name)] for lf in t1.iter_leaves()])

    if taxid2identity:
        # The colour scale is the same for every leaf: build it only once.
        import matplotlib.cm as cm
        from matplotlib.colors import Normalize, rgb2hex

        identity_colors = cm.ScalarMappable(norm=Normalize(vmin=0, vmax=100),
                                            cmap=cm.OrRd)

    for i, lf in enumerate(t1.iter_leaves()):

        # top leaf, add header
        if i == 0:
            ts.aligned_header.add_face(
                _make_text_face('Number of homologs', 1, 20, rotation=-25), 1)
            if taxid2identity:
                ts.aligned_header.add_face(
                    _make_text_face('Protein identity', 1, 20, rotation=-25),
                    2)
            if taxid2locus:
                ts.aligned_header.add_face(
                    _make_text_face('Locus tag', 1, 20, rotation=-25), 3)

        leaf_number += 1

        lf.branch_vertical_margin = 0

        taxon = str(lf.name)
        is_reference = taxon == str(reference_taxon)
        count = taxid2count[taxon]

        # Column 0: number of homologs (same rendering for zero and non-zero).
        lf.add_face(_make_text_face(' %s ' % str(count), 2, 20),
                    0,
                    position="aligned")
        col_index = 0

        # optionally indicate number of paralogs as a barplot
        if n_paralogs_barplot:
            col_index += 1
            # Guard against a tree on which no leaf has any homolog.
            if max_count:
                percent = (float(count) / max_count) * 100
            else:
                percent = 0.0
            n = StackedBarFace([percent, 100 - percent],
                               width=150,
                               height=18,
                               colors=['#6699ff', 'white'],
                               line_color='white')
            n.rotation = 0
            n.inner_border.color = "white"
            n.inner_border.width = 0
            n.margin_right = 15
            n.margin_left = 0
            lf.add_face(n, 1, position="aligned")

        # optionally add additionnal column with identity
        if taxid2identity:
            try:
                identity = round(taxid2identity[taxon], 2)
                # 100 is written "100.0" to keep the column narrow.
                if identity != 100:
                    value = "%.2f" % identity
                else:
                    value = "%.1f" % identity
            except (KeyError, TypeError):
                # no entry (or a non-numeric one) for this leaf
                value = '-'
            if is_reference:
                value = '         '
            n = _make_text_face(' %s ' % value, 2, 20, background=None)
            if not value.isspace() and value != '-':
                n.inner_background.color = rgb2hex(
                    identity_colors.to_rgba(float(value)))
                # dark cells need a light font to stay readable
                if float(value) > 82:
                    n.fgcolor = 'white'
            if is_reference:
                n.inner_background.color = '#800000'

            lf.add_face(n, col_index + 1, position="aligned")

        # optionaly add column with locus name
        if taxid2locus:
            try:
                value = str(taxid2locus[taxon])
            except KeyError:
                value = '-'
            n = _make_text_face(' %s ' % value, 2, 2)
            if is_reference:
                n.fgcolor = '#ff0000'
            lf.add_face(n, col_index + 2, position="aligned")

        lf.name = taxid2organism[taxon]

    return t1, leaf_number, ts
Example #12
0
"python3 draw_tanglegram.py -newick1 ./all_1469_new.newick -newick2 ./nxrA.newick -cf1 ./gene_annotation.txt -cf2 ./phylum_annotate.txt -length 'max' -sep '_' -extra_set 'rename' "
import sys

from .tanglegram import *
from bin.format_newick import sort_tree
from os.path import dirname, join, exists
from ete3 import Tree
import io

# raw gene 2 species tree
example_dir = r"D:\Desktop\OneDrive - The Chinese University of Hong Kong\luo lab\项目\AOB\whole_tree\nxrA"
desktop_NOB_tmp = "/d/Desktop/NOB_HGT"
gene_tree = join(example_dir, 'nxrA.newick')
species_tree = join(example_dir, '..', 'all', 'all_1469_new.newick')
species_tree = Tree(species_tree, format=3)
for l in species_tree.get_leaves():
    l.name = l.name.split('_')[-1].replace('.', 'v')

gene_tree_colors = join(example_dir, 'gene_annotation.txt')
species_tree_colors = join(example_dir, '..', 'all', 'phylum_annotate.txt')

Angst_reconciles = sort_tree(
    Tree("D:/Desktop/NOB_HGT/angst/AnGST.newick", format=3))

fig = main(gene_tree,
           species_tree,
           gene_tree_colors,
           species_tree_colors,
           l_legnth='max',
           sep='_',
Example #13
0
def plot_tree(canopy, folder='results', seed=0):
    """Rebuild a merge tree from a clustering log and display it.

    Reads ``<folder>/log_<canopy>_<seed>.csv`` (comma separated; the first two
    columns are the indices of the nodes joined at each step, the third one is
    not used here) and ``data/rexa/<canopy>/gtClusters.tsv`` (the second
    column is used as an integer label per point). Every point becomes a leaf
    coloured by its label, the merges of the log are replayed, and the
    resulting tree is shown in the interactive ete viewer.

    Parameters
    ----------
    canopy: str
        Name used to build both input paths.
    folder: str
        Directory containing the log file.
    seed: int
        Seed that is part of the log file name.
    """
    t = Tree()
    root_style = NodeStyle()
    root_style['fgcolor'] = 'black'
    root_style['size'] = 0
    t.set_style(root_style)

    def random_channel():
        return np.random.randint(0, 255)

    def get_color():
        return '#%02X%02X%02X' % (random_channel(), random_channel(),
                                  random_channel())

    log_path = folder + '/log_' + canopy + '_' + str(seed) + '.csv'
    # The np.float / np.int aliases were removed from NumPy; the builtins are
    # the exact types they stood for. ndmin=2 keeps a one-row file 2-D.
    log = np.loadtxt(log_path, delimiter=',', dtype=float, ndmin=2)
    labels = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(canopy),
                        delimiter='\t',
                        dtype=int,
                        ndmin=2)[:, 1]

    n = int(np.max(log[:, :2])) + 1
    # Fixed seed so that the label colours are the same on every run.
    np.random.seed(4)
    colors = [get_color() for _ in range(n)]
    nodes = {}
    for i in np.arange(n)[::-1]:
        node = t.add_child(name=str(i))
        leaf_style = NodeStyle()
        leaf_style['fgcolor'] = colors[labels[int(i)]]
        leaf_style['size'] = 5
        node.set_style(leaf_style)
        nodes[i] = node

    for i, j, _link in log:
        i, j = int(i), int(j)

        depth_i = depth(nodes[i])
        depth_j = depth(nodes[j])

        par_i = nup(nodes[i], depth_i - 1)
        par_j = nup(nodes[j], depth_j - 1)

        if depth_i == depth_j:
            left = nodes[i].detach()
            right = nodes[j].detach()
        else:
            left = par_i.detach()
            right = par_j.detach()

        # Join both parts under a new, invisible internal node.
        new_node = t.add_child()
        inner_style = NodeStyle()
        inner_style['fgcolor'] = 'black'
        inner_style['size'] = 0
        new_node.set_style(inner_style)

        new_node.add_child(left, name=str(i))
        new_node.add_child(right, name=str(j))

    max_depth = 0
    for node in t.get_leaves():
        max_depth = max(max_depth, depth(node))
    max_depth += 1

    # Stretch the leaf branches by the depth each leaf is missing.
    for node in t.get_leaves():
        node.dist = max_depth - depth(node)

    ts = TreeStyle()
    ts.show_leaf_name = True
    ts.rotation = 90
    ts.show_scale = False
    t.show(tree_style=ts)
Example #14
0
#!/usr/bin/env python3

# The branch length is retrieved from http://www.timetree.org/

from ete3 import Tree

# Build the tree by hand, from the root ("ancestry4") down to the leaves.
# Every add_child() call gives the branch length (dist) of the new child.
t = Tree(name="ancestry4")

# First split: internal node "ancestry3" versus the leaf "rheMac".
n1 = t.add_child(dist=12.9, name="ancestry3")
t.add_child(name="rheMac", dist=28.1)

# Second split: internal node "ancestry2" versus the leaf "ponAbe".
n2 = n1.add_child(dist=6.59, name="ancestry2")
n1.add_child(name="ponAbe", dist=15.2)

# Third split: internal node "ancestry1" versus the leaf "gorGor".
n3 = n2.add_child(dist=2.21, name="ancestry1")
n2.add_child(name="gorGor", dist=8.61)

# The two leaves below "ancestry1".
n3.add_child(name="hg", dist=6.4)
n3.add_child(name="panTro", dist=6.4)

#print(t)
# Print the tree as returned by Tree.write() with an empty feature list.
print(t.write(features=[]))
Example #15
0
        }
        sequence_data_path = replicate_output_dir + "sequence_data.fas"
        character_data_path = replicate_output_dir + "character_data.fas"
        history_tree_path = replicate_output_dir + "unlabeled_trait_history.nwk"
        true_history_path = replicate_output_dir + "labeled_trait_history.nwk"
        write_simulation_parameters(tree_path, model_parameters,
                                    sequence_data_path, character_data_path,
                                    history_tree_path, true_history_path,
                                    aln_len, simulation_parameters_path)
        simulation_output_log = replicate_output_dir + "simulator_log.txt"
        res = os.system(simulator_path + " param=" +
                        simulation_parameters_path + " > " +
                        simulation_output_log)

        # re-write the history path without internal nodes names
        history_tree = Tree(history_tree_path, format=1)
        history_tree.write(outfile=history_tree_path, format=5)

        # extract the labeling of nodes in the trait history for relax parameters and traitrelax debugging
        label_to_nodes = {"0": [], "1": []}
        true_history = Tree(true_history_path, format=1)
        node_index = 0
        label_regex = re.compile("{(.*?)}")
        for node in true_history.traverse("postorder"):
            if not node.is_root():
                node_id = node_index
                node_index += 1
                node_label = label_regex.search(node.name).group(1)
                label_to_nodes[node_label].append(node_id)
        labels_str = "model1.nodes_id="
        for i in range(len(label_to_nodes["0"]) - 1):
Example #16
0
from ete3 import Tree
t = Tree('((H:0.3,I:0.1):0.5, A:1, (B:0.4,(C:1,D:1):0.5):0.5);')


# Create a small function to filter your nodes
def conditional_function(node):
    """Tell whether ``node.dist`` is strictly greater than 0.3."""
    return bool(node.dist > 0.3)


# Use previous function to find matches. Note that we use the traverse
# method in the filter function. This will iterate over all nodes to
# assess if they meet our custom conditions and will return a list of
# matches.
# filter() returns a lazy iterator on Python 3, so the matches are collected
# in a list before being counted. The %-formatted print() call writes the
# same text on Python 2 and Python 3.
matches = list(filter(conditional_function, t.traverse()))
print("%d nodes have ditance >0.3" % len(matches))
# depending on the complexity of your conditions you can do the same
# in just one line with the help of lambda functions:
matches = list(filter(lambda n: n.dist > 0.3 and n.is_leaf(), t.traverse()))
print("%d nodes have ditance >0.3 and are leaves" % len(matches))
Example #17
0
def scale_tree(input_tree_path, output_tree_path, scaling_factor=1.0):
    """Multiply every branch length of a tree by ``scaling_factor``.

    The tree is read from ``input_tree_path`` and the rescaled tree is
    written to ``output_tree_path``; both use ete3 newick ``format=1``.
    """
    scaled = Tree(input_tree_path, format=1)
    for branch in scaled.traverse():
        branch.dist *= scaling_factor
    scaled.write(outfile=output_tree_path, format=1)
Example #18
0
from ete3 import Tree
# Let's create simple tree
# format=1 is used so that the internal node names (G, O) are read.
t = Tree('((((H,K),(F,I)G),E),((L,(N,Q)O),(P,S)));', format=1)
# Single-argument print() calls behave the same on Python 2 and Python 3.
print("Original tree looks like this:")
print(t)
#
#                                        /-H
#                              /--------|
#                             |          \-K
#                    /--------|
#                   |         |          /-F
#          /--------|          \--------|
#         |         |                    \-I
#         |         |
#         |          \-E
#---------|
#         |                    /-L
#         |          /--------|
#         |         |         |          /-N
#         |         |          \--------|
#          \--------|                    \-Q
#                   |
#                   |          /-P
#                    \--------|
#                              \-S
# Prune the tree in order to keep only some leaf nodes.
t.prune(["H", "F", "E", "Q", "P"])
# Single-argument print() calls behave the same on Python 2 and Python 3.
print("Pruned tree")
print(t)
#
#                              /-F
Example #19
0
from ete3 import Tree
import sys

# Tree given as first command-line argument.
t = Tree(sys.argv[1])
tips = [tip.name for tip in t]
# Set copy of the leaf names for constant-time membership tests.
tip_names = set(tips)

archaea = []
bacteria = []

# Keep only the genome ids (one per line) that are leaves of the tree.
# The files are opened in "with" blocks so that they are closed again.
with open("archaeagenomes.csv") as Archaea:
    for line in Archaea:
        this_id = line.strip()
        if this_id in tip_names:
            archaea.append(this_id)

with open("bacteriagenomes.csv") as Bacteria:
    for line in Bacteria:
        this_id = line.strip()
        if this_id in tip_names:
            bacteria.append(this_id)

all_taxa = archaea + bacteria
# The common ancestors are looked up before the tree is unrooted.
ancestorA = t.get_common_ancestor(archaea)
ancestorB = t.get_common_ancestor(bacteria)
ancestorR = t.get_common_ancestor(all_taxa)

t.unroot()
ab_dist = ancestorA.get_distance(ancestorB)
Example #20
0
    def renderingTreeImage(self):
        """Draw the rooted tree with intron and domain tracks next to each leaf.

        Steps, in order:
          1. read the FASTA file ``Input/ProteinInput`` and append every record
             id / sequence to ``self.input_protein_accession_number`` and
             ``self.input_protein_sequence``;
          2. load the newick tree from ``execs/tmp/rooted_tree.nwk``;
          3. derive an accession string from every leaf name;
          4. fill ``self.accession_dict_with_introns`` from
             ``self.exon_lengths`` and add one "aligned" SeqMotifFace per leaf
             with the intron markers;
          5. add a second SeqMotifFace per leaf with the domains returned by
             ``Protein().Domains()``;
          6. build the legend, show the tree and render it to
             ``CompletedTrees/<self.run_name>.pdf``.

        Note: the code uses Python 2 syntax (print statements, ``iteritems``,
        indexing of ``dict.keys()``).
        """

        path = os.path.join('Input', 'ProteinInput')

        seq_records = SeqIO.parse(path, 'fasta')

        # Store ids and sequences in file order.
        for record in seq_records:
            self.input_protein_accession_number.append(record.id)
            self.input_protein_sequence.append(record.seq)

        with open(os.path.join('execs', 'tmp',
                               "rooted_tree.nwk")) as nwk_tree_handle:
            nwk_tree = nwk_tree_handle.read()
            t = Tree(nwk_tree)
            print(t)
            print '\n'

        # Tree style: title, leaf names and branch support values are shown.
        ts = TreeStyle()
        ts.title.add_face(TextFace(
            'PhyloEpsilon - Protein Ortholog Finding Tool by Bryan Dighera',
            fsize=16,
        ),
                          column=0)
        ts.allow_face_overlap = True
        ts.show_leaf_name = True
        ts.show_branch_support = True

        # Rebuild "<NP|XP>_<digits>" from each leaf name, in leaf order.
        leaf_names = []
        for leaf in t.get_leaf_names():

            np_xp_pattern = re.compile('N[P]|X[P]')
            digits_pattern = re.compile('\d+.\d')

            np_xp_search_obj = re.search(np_xp_pattern, leaf)
            digits_search_obj = re.search(digits_pattern, leaf)

            # NOTE(review): re.search returns None when a leaf name does not
            # match, in which case .group() raises AttributeError.
            np_xp_name = np_xp_search_obj.group()
            digits_name = digits_search_obj.group()
            final_accession = str(np_xp_name + '_' + digits_name)
            print final_accession
            leaf_names.append(final_accession)

        #print 'leaf names: ' + '%s' % leaf_names

        P = Protein()
        protein_domains, domain_colors, unrepeated_domains = P.Domains()
        print domain_colors

        #Creates a dictionary that corresponds the protein accession number to its corresponding introns
        # The two lists are paired by position: entry i of the input accession
        # numbers goes with entry i of self.exon_lengths.
        for i in range(len(leaf_names)):
            self.accession_dict_with_introns[
                self.input_protein_accession_number[i]] = self.exon_lengths[i]

        # Index into self.msa_aligned_protein and into the leaf names; it is
        # only incremented for accessions found in the dictionary.
        i = 0

        print 'protein accession number: ' + '%s' % self.input_protein_accession_number
        print 'Accession dict: ' + '%s' % self.accession_dict_with_introns + '\n'

        #Iterates through the accession numbers that correspond the the order of the leaves of the phylogenetic tree to retrieve introns and build fig
        for accession_number in leaf_names:
            # Every motif list starts with a zero-length white placeholder.
            intron_motifs = [[0, 0, "[]", None, 12, "White", "White", None]]

            #Checks the accession number against the dictionary and retrieves the corresponding introns, if no introns then doesn't append any
            if accession_number in self.accession_dict_with_introns:
                print accession_number, self.accession_dict_with_introns[
                    accession_number]
                exon_list = self.accession_dict_with_introns[accession_number]
                print exon_list

                for exon_length in exon_list:
                    if str(exon_length) != 'NONE':

                        # Each location is split on '-' into two integers.
                        for location in exon_length:
                            split_exon_location = str(location).split('-')
                            # Second coordinate divided by 3 (rounded down) is
                            # used as the position on the protein sequence.
                            protein_seq_exon_location = int(
                                math.floor(int(split_exon_location[1]) / 3))

                            #Calculates the intron phase and then checks the phase to append appropriate color indicating phase on diagram
                            intron_phase = (int(split_exon_location[1]) -
                                            int(split_exon_location[0])) % 3

                            # Marker of width 4 around the position:
                            # phase 0 -> Grey, 1 -> Black, 2 -> Blue.
                            if intron_phase == 0:
                                intron_motifs.append([
                                    protein_seq_exon_location - 2,
                                    protein_seq_exon_location + 2, "[]", None,
                                    5, "Grey", "Grey", None
                                ])
                            elif intron_phase == 1:
                                intron_motifs.append([
                                    protein_seq_exon_location - 2,
                                    protein_seq_exon_location + 2, "[]", None,
                                    5, "Black", "Black", None
                                ])

                            elif intron_phase == 2:
                                intron_motifs.append([
                                    protein_seq_exon_location - 2,
                                    protein_seq_exon_location + 2, "[]", None,
                                    5, "Blue", "Blue", None
                                ])
                    else:
                        print 'NO INTRONS FOUND FOR RECORD'

                print str(intron_motifs) + '\n'
                # strip('-') removes gap characters at both ends only.
                msa_protein_seq = self.msa_aligned_protein[i].strip('-')

                #ete3 module that adds the introns(motifs) to the phylogenetic tree
                seqFace = SeqMotifFace(str(msa_protein_seq),
                                       gapcolor="black",
                                       seq_format='line',
                                       scale_factor=1,
                                       motifs=intron_motifs)
                # NOTE(review): the face goes to the i-th leaf, which is the
                # leaf of accession_number only while no accession has been
                # skipped by the membership test above - confirm.
                (t & t.get_leaf_names()[i]).add_face(seqFace, 0, "aligned")

                i += 1

        # Index of the current leaf for the domain track.
        n = 0

        # Iterates through the accession numbers that correspond to the order of the leaves of the phylogenetic tree and compare to domain dict values
        # TODO: Add the legend and possibly give a number to each of the domains so they can be easily identified in the legend
        for accession_number in leaf_names:

            domain_motifs = [[0, 0, "[]", None, 12, "White", "White", None]]

            # protein_domains is iterated as a sequence of dicts; the first
            # key / value of the matching dict is used.
            for domain in protein_domains:

                if accession_number in domain:

                    print 'leaf accession #: ' + '%s' % accession_number
                    print 'domains accession: ' + '%s' % domain.keys()[0]
                    print domain.values()[0]

                    # each_domain[0] is used as the domain name (and colour
                    # key), each_domain[1] as a "start:end" string.
                    for each_domain in domain.values()[0]:

                        try:

                            domain_motif_color = domain_colors[each_domain[0]]
                            start_domain_loc = int(
                                each_domain[1].split(':')[0])

                            end_domain_loc = int(each_domain[1].split(':')[1])
                            domain_name = str(each_domain[0])

                            domain_motifs.append([
                                start_domain_loc, end_domain_loc, "<>", 20, 20,
                                'Black', domain_motif_color, 'arial|8|black|'
                            ])
                        except ValueError:
                            # int() failed on a coordinate: take the first
                            # regex match in each half of the string instead.

                            domain_motif_color = domain_colors[each_domain[0]]

                            start_pattern = re.compile('(?<!=\W)\d+')
                            start_pattern_search = re.search(
                                start_pattern,
                                str(each_domain[1].split(':')[0]))
                            start_domain_loc = int(
                                start_pattern_search.group())

                            end_pattern = re.compile('(?<!=\W)\d+')
                            end_pattern_search = re.search(
                                end_pattern, str(each_domain[1].split(':')[1]))
                            end_domain_loc = int(end_pattern_search.group())

                            domain_motifs.append([
                                start_domain_loc, end_domain_loc, "<>", 20, 20,
                                'Black', domain_motif_color, 'arial|8|black|'
                            ])

            print domain_motifs

            msa_protein_seq = self.msa_aligned_protein[n].strip('-')
            print msa_protein_seq
            print len(msa_protein_seq)
            print '*' * 100

            # Second aligned face of the leaf: the domain track.
            domainFace = SeqMotifFace(str(msa_protein_seq),
                                      gapcolor="black",
                                      seq_format='line',
                                      scale_factor=1,
                                      motifs=domain_motifs)
            (t & t.get_leaf_names()[n]).add_face(domainFace, 0, "aligned")

            n += 1

        #Creating the legend

        print protein_domains
        # One legend row per entry of domain_colors: its key as text and a
        # sample motif drawn with its colour.
        for single_unrepeat, colors in domain_colors.iteritems():

            ts.legend.add_face(TextFace(single_unrepeat), column=0)
            ts.legend.add_face(SeqMotifFace(
                "A" * 45, [[0, 80, "[]", None, 8, "Black", colors, None]]),
                               column=1)
            ts.legend_position = 1

        #name_of_run = nameOfRun()
        file_name = self.run_name
        # Open the interactive viewer, then write the PDF.
        t.show(tree_style=ts)
        t.render(os.path.join('CompletedTrees', file_name + '.pdf'),
                 tree_style=ts)
            breakdown[info[tax_level]] = 1.0 / float(total_size)
    return breakdown


#compute the most frequent sister group of each (monophyletic?) group on the tree, to identify trends in gene transfers, "unstable" taxa, etc.
labels = {}
name_to_tax_info = defaultdict(dict)
taxa_names = []
summary = defaultdict(dict)
groups = []
clades_per_group = defaultdict(list)

target_label = 'cluster'  #edit this to make the comparisons at a desired taxonomic level

#read the ML tree, set up the taxonomy stuff, and calculate the number of clades per label, and the sizes of those clades (to report at the end)
ml_tree = Tree(sys.argv[1])
for leaf in ml_tree:
    taxonomy = parse_taxonomy(leaf.name)
    name_to_tax_info[leaf.name] = taxonomy
    taxa_names.append(leaf.name)
    leaf.add_feature("tax", taxonomy[target_label])
    labels[taxonomy[target_label]] = 1
groups = labels.keys()

#compute the number of clades per label in the ML tree, and their sizes
ML_groups = defaultdict(
    list
)  #the list is the size of each clade, len(list) is the number of clades for that label in the ML tree
for label in groups:
    for node in ml_tree.get_monophyletic(values=[label], target_attr="tax"):
        size_clade = 0
                leaf_style["hz_line_color"] = "black"
                leaf_style["hz_line_width"] = 5
                leaf_style["vt_line_color"] = "black"
                leaf_style["vt_line_width"] = 5
                bg_color = color_dict[phylum]
                leaf_style["bgcolor"] = bg_color
                leaf.set_style(leaf_style)
                leaf_face = TextFace(leaf_name.strip("'"), fsize=20)
                leaf.add_face(leaf_face, 0, 'aligned')
                break
            except TreeError:
                leaf_name = "'%s'" % leaf_name
                time_redo -= 1
                continue


if __name__ == '__main__':
    # Script entry point: read the parameters from the command line and make
    # sure the output directory exists.
    params = read_params(sys.argv)
    mkdir(params['outdir'])
    # Tree read with ete3 newick format 1.
    t = Tree(params['newick'], format=1)
    # t.show(tree_style=ts)
    # Colour mapping and genus mapping built from the 'tax_ass' parameter.
    color_dict, genus_in_phylum = get_dict(params['tax_ass'])
    set_default_node_style(t)
    set_leaf_style(genus_in_phylum, t)
    ts = get_default_tree_style(color_dict)

    # Render the tree as a PDF, then pass both paths to image_trans for the
    # PNG file.
    pdf_file = '%s/phylo_tree.pdf' % params['outdir']
    png_file = '%s/phylo_tree.png' % params['outdir']
    t.render(pdf_file, tree_style=ts, dpi=300)
    image_trans(pdf_file, png_file)
Example #23
0
if "--all" in sys.argv:
    # Compare the reference tree against every masked-alignment tree.
    tree_file_list = list(
        glob("trees/HIV1_FLT_2018_genome_DNA_mask*.fa.treefile"))
    plot_prefix = "whole_v_allmasks"
if "--mask-as-ref" in sys.argv:
    # Swap the roles: the previous reference tree becomes the only tree to
    # compare, and the mask100 tree becomes the reference. The list has to be
    # built before ref_tree_file is reassigned.
    tree_file_list = [ref_tree_file]
    ref_tree_file = "trees/HIV1_FLT_2018_genome_DNA_mask100.fa.treefile"
    plot_prefix = "masked_v_whole"
    ref_bs_label = "Masked Alignment Bootstrap"
    tree_bs_label = "Whole Alignment Bootstrap"
    pct_masks = [100]
    tree_orientation = 1

ref_tree = Tree(ref_tree_file, format=1)
ref_tree.set_outgroup(outgroup)
add_support_and_subtypes(ref_tree)

# The number following "mask" in a tree file name.
mask_regex = r"mask(\d+)"

shared_edge_support_values = []
ref_only_edge_support_values = []
for tree_file in tree_file_list:
    # File names without a "mask<N>" part are treated as 0.
    pct_mask_match = re.search(mask_regex, tree_file)
    if pct_mask_match is not None:
        pct_mask = int(pct_mask_match.group(1))
    else:
        pct_mask = 0
    print("Adding {} bootstrap values to tree from {}...".format(
        tree_file, ref_tree_file))
Example #24
0
            stack.append(dictionary[current][3])
            result.append("(")
        else:
            result.append(current)
        current_prev = current

    result.pop()
    result.append(")")
    return result


if __name__ == "__main__":
    # Read the input, run UPGMA and turn the resulting cluster into a newick
    # string: the pieces returned by printCluster are joined and terminated
    # with ";".
    matrix, length = readInput()
    dictionary = {}
    finalCluster = upgma(matrix, length, dictionary)
    result = printCluster(dictionary, finalCluster)
    result = ''.join(result)
    result = result + ";"
    
    # ete3 is a toolkit for phylogenetic trees; it is used here to parse the
    # newick string and print the tree.
    
    from ete3 import Tree

    tree = Tree(result)
    print("UPGMA Resultant Clustering:")
    print("")
    print(result)
    print("")
    print(tree)
   
Example #25
0
        if line.startswith('LTR'):
            # record name, divergence, and scaled divergence (IGC)
            (rt_name, classification, I, clust, clustSize, model, div, divc,
             IGC) = line.strip().split('\t')
            divergences[rt_name] = div
            divergencesCorrected[rt_name] = divc
            IGCdct[rt_name] = IGC
            classifDct[rt_name] = classification
# generate color gradient for representing divergence values as colored
# circles at tips of leaves in the rendered tree
num_colors = 20000
# yellow, red, blue, black
color_gradient = polylinear_gradient(
    ['#FAFF00', '#FF1800', '#001EFF', '#000000'], num_colors)
# load the newick tree
t = Tree(tree_flpath)
# for marking which elements did not have information in the divergences
# file for coloring the circles white
NOLTRDIVERGENCES = False
# record the greatest divergence value for automatically setting the
# outgroup as the element with the most divergent LTRs (estimating the
# branch containing the oldest element as the first split in the tree)
greatest_div = {'element': None, 'value': 0}
# scale bootstrap values to percentages
for node in t.traverse():
    node.support = node.support * 100
# assign the colors for the node circles based on divergence
for node in t:
    node_name = str(node).split('-')[-1]
    rt_name = 'LTR_retrotransposon{0}'.format(node_name.split('_')[0])
    if rt_name in divergences:
Example #26
0
def build_nj_phylip(alignment, outfile, outgroup, work_dir="."):
    """
    build neighbor joining tree of DNA seqs with PHYLIP in EMBOSS

    PHYLIP manual
    http://evolution.genetics.washington.edu/phylip/doc/

    Pipeline (every intermediate file lives next to ``<work_dir>/work/aln.phy``):
    fseqboot (100 replicates) -> fdnadist -> fneighbor -> fconsense for the
    node support, then fdnadist on the original alignment and ``run_ffitch``
    for the branch lengths of the consensus topology.

    Parameters
    ----------
    alignment:
        Alignment written with ``AlignIO.write`` in "phylip" format.
    outfile: str
        Path of the final newick tree. When the tree is re-rooted the
        ".unrooted" part of the name is removed.
    outgroup:
        Passed to ``smart_reroot``; a false value skips the re-rooting.
    work_dir: str
        Directory containing the "work" sub-directory.

    Returns
    -------
    tuple or None
        ``(outfile, phy_file)`` when a non-empty tree file was written,
        ``None`` when the alignment could not be written or the output file
        is missing or empty.
    """

    phy_file = op.join(work_dir, "work", "aln.phy")
    # All intermediate files share the alignment path without its extension.
    prefix = phy_file.rsplit(".", 1)[0]
    try:
        # open() instead of the Python 2-only file() builtin; the "with"
        # block closes (and flushes) the file before fseqboot reads it.
        with open(phy_file, "w") as phy_handle:
            AlignIO.write(alignment, phy_handle, "phylip")
    except ValueError:
        print(
            "Repeated seq name, possibly due to truncation. NJ tree not built.",
            file=sys.stderr,
        )
        return None

    seqboot_out = prefix + ".fseqboot"
    seqboot_cl = FSeqBootCommandline(
        FPHYLIP_BIN("fseqboot"),
        sequence=phy_file,
        outfile=seqboot_out,
        seqtype="d",
        reps=100,
        seed=12345,
    )
    seqboot_cl()
    logging.debug("Resampling alignment: %s" % seqboot_cl)

    dnadist_out = prefix + ".fdnadist"
    dnadist_cl = FDNADistCommandline(FPHYLIP_BIN("fdnadist"),
                                     sequence=seqboot_out,
                                     outfile=dnadist_out,
                                     method="f")
    dnadist_cl()
    logging.debug("Calculating distance for bootstrapped alignments: %s" %
                  dnadist_cl)

    neighbor_out = prefix + ".njtree"
    neighbor_log = prefix + ".fneighbor"
    neighbor_cl = FNeighborCommandline(
        FPHYLIP_BIN("fneighbor"),
        datafile=dnadist_out,
        outfile=neighbor_log,
        outtreefile=neighbor_out,
    )
    neighbor_cl()
    logging.debug("Building Neighbor Joining tree: %s" % neighbor_cl)

    consense_out = prefix + ".consensustree.nodesupport"
    consense_log = prefix + ".fconsense"
    consense_cl = FConsenseCommandline(
        FPHYLIP_BIN("fconsense"),
        intreefile=neighbor_out,
        outfile=consense_log,
        outtreefile=consense_out,
    )
    consense_cl()
    logging.debug("Building consensus tree: %s" % consense_cl)

    # distance without bootstrapping
    dnadist_out0 = prefix + ".fdnadist0"
    dnadist_cl0 = FDNADistCommandline(FPHYLIP_BIN("fdnadist"),
                                      sequence=phy_file,
                                      outfile=dnadist_out0,
                                      method="f")
    dnadist_cl0()
    logging.debug("Calculating distance for original alignment: %s" %
                  dnadist_cl0)

    # infer branch length on consensus tree
    consensustree1 = prefix + ".consensustree.branchlength"
    run_ffitch(distfile=dnadist_out0,
               outtreefile=consensustree1,
               intreefile=consense_out)

    # write final tree
    ct_s = Tree(consense_out)

    if outgroup:
        t1 = consensustree1 + ".rooted"
        t2 = smart_reroot(consensustree1, outgroup, t1)
        # The rooted file name coming back means the re-rooting was applied.
        if t2 == t1:
            outfile = outfile.replace(".unrooted", "")
        ct_b = Tree(t2)
    else:
        ct_b = Tree(consensustree1)

    # Copy the support values from the consensus tree onto the tree with
    # branch lengths; clades are matched by their sorted leaf names.
    # NOTE(review): presumably fconsense stores the replicate count in the
    # branch-length field, hence dist / 100 for 100 replicates - confirm.
    nodesupport = {}
    for node in ct_s.traverse("postorder"):
        node_children = tuple(sorted([f.name for f in node]))
        if len(node_children) > 1:
            nodesupport[node_children] = node.dist / 100.0

    for leaf_names, support in nodesupport.items():
        ct_b.get_common_ancestor(*leaf_names).support = support
    print(ct_b)
    ct_b.write(format=0, outfile=outfile)

    try:
        s = op.getsize(outfile)
    except OSError:
        s = 0
    if s:
        logging.debug("NJ tree printed to %s" % outfile)
        return outfile, phy_file
    else:
        logging.debug("Something was wrong. NJ tree was not built.")
        return None
from ete3 import Tree
import os
import re

# Ask for the master tree file and for the directory holding the trees whose
# species are to be looked up in that master tree.
treefilename = input("Enter master tree name: ")
dirname = input("Enter tree file directory: ")
supertree = Tree(treefilename)
exists = False
checkname = ''
newcheckname = ''

# Name of every node visited by a postorder traversal of the master tree.
mastertreenamelist = [node.name for node in supertree.traverse("postorder")]
# Set copy of the same names so the per-node membership test below is O(1);
# the list itself is kept for any later code that relies on it.
mastertreenameset = set(mastertreenamelist)

dirlist = os.listdir(dirname)
for filename in dirlist:
    # os.path.join supplies the path separator when the directory was typed
    # without a trailing one (plain string concatenation produced a wrong
    # path in that case).
    currenttree = Tree(os.path.join(dirname, filename))
    for node in currenttree.traverse("postorder"):
        # Species key: the node name up to and including its first "_", or
        # the whole name when it contains no "_".  It is rebuilt from scratch
        # for every node so that characters from previously visited nodes do
        # not accumulate into the key.
        newcheckname = "".join(node.name.partition("_")[:2])

        # finds species in mastertree
        if newcheckname in mastertreenameset:
            exists = True
Example #28
0
    # Excerpt from the body of a larger function: `params`, `OUT`,
    # `indentLevel` and `indentChar` are defined outside the visible lines.
    print("reading input reconciled trees.")

    # No species tree is set here, and the input is handled as dated
    # (isUndated stays False for everything below).
    spTree = None
    isUndated = False

    # Open the file named by params["-g"] for reading.
    # NOTE(review): no matching IN.close() is visible in this excerpt.
    IN = open(params["-g"], "r")

    l = IN.readline()

    # Loop until readline() returns "" (end of file).
    # NOTE(review): the readline() call that advances `l` inside the loop is
    # not visible in this excerpt -- confirm it exists further down.
    while l != "":

        if l != "\n":

            if l.startswith("("):  ##special ignore white lines

                # Parse the line as a tree description
                # (presumably newick via ete3's Tree -- TODO confirm).
                ALEtree = Tree(l, format=1)

                # Convert the parsed tree into a reconciled-tree object.
                RT = ALEtreeToReconciledTree(ALEtree, isUndated=isUndated)

                # Extra refinement step applied only to undated input.
                if isUndated:
                    refineReconciledTreeWithTransferBack(RT)

                ConvertRTtoLossIndepVersion(RT,
                                            speciesTree=spTree,
                                            keptChildNameSuffix=".c")

                # Write every XML line of the tree to OUT, prefixed with the
                # current indentation and terminated by a newline.
                XMLlines = RT.getTreeRecPhyloXMLLines()

                for xmlline in XMLlines:
                    OUT.write(indentLevel * indentChar + xmlline + "\n")
Example #29
0
from ete3 import Tree

# Minimal ete3 demo: create an empty tree and populate it with size 15
# (per the ete3 documentation this generates a random topology).
t = Tree()
t.populate(15)
# Print the tree to stdout, then call show(), which per the ete3
# documentation starts an interactive visualisation session.
print(t)
t.show()
Example #30
0
def getTheTrees():
    """Build the NCBI taxonomy as a dict of ete3 nodes keyed by taxid.

    The NCBI taxdump and the TAXREF file must be downloaded by hand and
    stored in the ``taxo/`` folder.  Four files are read, relative to the
    current working directory:

    * ``taxo/TAXREFv11.txt`` -- tab separated; field 14 is used as the
      scientific name and field 19 as a French common name (0-based).
    * ``taxo/ranks.txt`` -- tab separated; English rank name, then its
      French translation.
    * ``taxo/names.dmp`` -- ``|`` separated; fields 0, 1 and 3 are used as
      taxid, name and name type.
    * ``taxo/nodes.dmp`` -- ``|`` separated; fields 0, 1 and 2 are used as
      taxid, parent taxid and rank.  Its first line is skipped.

    Returns:
        dict mapping each taxid (str) to its ``ete3.Tree`` node.  Every node
        carries ``name``, ``taxid``, ``sci_name``, ``common_name``,
        ``synonym``, ``authority`` and ``common_name_FR``.  Nodes first met
        as a child also carry ``rank`` and ``rank_FR``; nodes first met as a
        parent receive ``rank`` only, once they later appear as a child.

    Raises:
        KeyError: if ``nodes.dmp`` refers to a taxid that is missing from
            ``names.dmp`` or to a rank that is missing from ``ranks.txt``.
        IndexError: if a line has fewer fields than the ones read from it.
    """

    class Trans:
        """French common names collected for one scientific name."""

        def __init__(self):
            self.common_name_FR = []

    print("Getting french translations...")
    TRANS = {}  # scientific name -> Trans
    with open("taxo/TAXREFv11.txt") as f:
        for line in f:
            fields = line.split("\t")
            sciname = fields[14]
            comnameFR = fields[19]
            # Rows without a French name contribute nothing.
            if comnameFR == '':
                continue
            if sciname not in TRANS:
                TRANS[sciname] = Trans()
            TRANS[sciname].common_name_FR.append(comnameFR)

    # get translation of ranks
    print("\nGetting rank names in french...")
    RANKS = {}  # English rank name -> French rank name
    with open("taxo/ranks.txt") as f:
        for line in f:
            fields = line.split("\t")
            # rstrip() removes the trailing newline of the last field.
            RANKS[fields[0]] = fields[1].rstrip()

    class Taxid:
        """Attributes gathered from names.dmp for one taxid."""

        def __init__(self):
            self.sci_name = ""
            self.authority = ""
            self.synonym = ""
            self.common_name = []
            self.common_name_FR = []

    ATTR = {}  # taxid -> Taxid
    print("Reading NCBI taxonomy...")
    with open("taxo/names.dmp") as f:
        for line in f:
            fields = line.split("|")
            taxid = fields[0].replace("\t", "")
            tid_val = fields[1].replace("\t", "")
            tid_type = fields[3].replace("\t", "")
            if taxid not in ATTR:
                ATTR[taxid] = Taxid()
            attr = ATTR[taxid]
            if tid_type == "scientific name":
                attr.sci_name = tid_val
                # and get the French translation (if any); the list object
                # is shared with TRANS, not copied.
                if tid_val in TRANS:
                    attr.common_name_FR = TRANS[tid_val].common_name_FR
            elif tid_type == "authority":
                # Several authorities are joined into one ", " separated string.
                if attr.authority != "":
                    attr.authority = attr.authority + ", " + tid_val
                else:
                    attr.authority = tid_val
            elif tid_type == "synonym":
                # Same ", " joining as for authorities.
                if attr.synonym != "":
                    attr.synonym = attr.synonym + ", " + tid_val
                else:
                    attr.synonym = tid_val
            elif tid_type in ("common name", "genbank common name"):
                attr.common_name.append(tid_val)

    T = {}  # taxid -> tree node

    from ete3 import Tree

    def new_node(taxid):
        """Create a node for *taxid* carrying its names.dmp attributes."""
        node = Tree()
        node.name = taxid
        node.taxid = taxid
        info = ATTR[taxid]
        node.sci_name = info.sci_name
        node.common_name = info.common_name
        node.synonym = info.synonym
        node.authority = info.authority
        node.common_name_FR = info.common_name_FR
        return node

    print("Building the NCBI taxonomy tree...")
    with open('taxo/nodes.dmp') as fp:
        fp.readline()  # skip the first line (the 1 | 1 edge)
        for line in fp:
            fields = line.split("|")
            dad = fields[1].replace("\t", "")
            son = fields[0].replace("\t", "")
            rank = fields[2].replace("\t", "")
            if dad not in T:
                # The parent's own rank is not known on this line.
                T[dad] = new_node(dad)
            if son not in T:
                rank_fr = RANKS[rank]
                child = new_node(son)
                child.rank = rank
                child.rank_FR = rank_fr
                T[son] = child
            elif not hasattr(T[son], 'rank'):
                # Node was created earlier as somebody's parent.
                # NOTE(review): only `rank` is filled in here; the matching
                # `rank_FR` assignment was commented out in the original, so
                # such nodes have no `rank_FR` -- confirm this is intended.
                T[son].rank = rank
            T[dad].add_child(T[son])
    return T