コード例 #1
0
def pruning(inputtree, inputfasta, tree_outfilename):
    """Prune a tree down to the sequences listed in a FASTA file.

    Args:
        inputtree: Newick tree handed to ``Tree`` (the full tree to prune).
        inputfasta: Path of the FASTA file whose record IDs are kept.
        tree_outfilename: Path the pruned Newick tree is written to.
    """
    # ete3 Tree format - full initial tree, to be pruned
    f = Tree(inputtree)

    # List of IDs to be picked from the full FASTA. The handle is closed as
    # soon as the records are loaded. Plain "r" is used because the "U" mode
    # flag is rejected by Python 3.11+.
    with open(inputfasta, "r") as fasta:
        record_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

    IDlist = []
    for recordID in record_dict.keys():
        print(recordID)
        IDlist.append(recordID)
    print(IDlist)

    print("pruning... %s" % inputfasta)
    f.prune(IDlist, preserve_branch_length=True)
    # Tree.write() is given the output path directly, so no separate handle
    # on tree_outfilename is opened here.
    f.write(format=0, outfile=tree_outfilename)
    print("pruned %s" % inputfasta)
コード例 #2
0
ファイル: v26_to_v27.py プロジェクト: AstrobioMike/anvio
def migrate(db_path):
    """Migrate a profile database from `current_version` to `next_version`.

    Every newick entry of the item-orders and layer-orders tables is parsed
    with ``format=1`` and stored back serialized with ``format=2``; the
    version meta value is then replaced.

    Raises:
        ConfigError: if `db_path` is None or the database version differs
            from `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # sanity check on the file before touching it
    utils.is_profile_db(db_path)

    # open without version enforcement, then verify the version ourselves
    profile_db = db.DB(db_path, None, ignore_version = True)
    found_version = str(profile_db.get_version())
    if found_version != current_version:
        raise ConfigError("Version of this profile database is not %s (hence, this script cannot really do anything)." % current_version)

    # item orders
    for order_name, entry in profile_db.get_table_as_dict(item_orders_table_name).items():
        if entry['type'] != 'newick':
            continue
        converted = Tree(entry['data'], format=1).write(format=2)
        profile_db._exec("""UPDATE %s SET "data" = ? WHERE "name" LIKE ?""" % item_orders_table_name, (converted, order_name))

    # layer orders
    for order_name, entry in profile_db.get_table_as_dict(layer_orders_table_name).items():
        if entry['data_type'] != 'newick':
            continue
        converted = Tree(entry['data_value'], format=1).write(format=2)
        profile_db._exec("""UPDATE %s SET "data_value" = ? WHERE "data_key" LIKE ?""" % layer_orders_table_name, (converted, order_name))

    # bump the stored version
    profile_db.remove_meta_key_value_pair('version')
    profile_db.set_version(next_version)

    # done
    profile_db.disconnect()
    progress.end()
コード例 #3
0
def sanitizeByType(container, sanitizeby='tsv', onlycolumns=False):
    '''Apply sanitizeString to an iterable of strings and print the result.

    sanitizeby selects the unit that gets sanitized:
        'line'   - each whole line,
        'tsv'    - each tab-separated field, or only the 1-based column
                   numbers listed in onlycolumns when it is given,
        'fasta'  - the header (description) of each FASTA record,
        'newick' - each name met while iterating the parsed tree.
    '''

    assert sanitizeby in {'line', 'tsv', 'newick', 'fasta'}

    if sanitizeby == 'line':
        for raw in container:
            print(sanitizeString(raw.strip("\r\n"), False))

    elif sanitizeby == 'tsv':
        for raw in container:
            if onlycolumns:
                fields = raw.strip("\r\n").split("\t")
                # onlycolumns holds 1-based column numbers
                for colnum in onlycolumns:
                    fields[colnum - 1] = sanitizeString(fields[colnum - 1], False)
            else:
                fields = [sanitizeString(cell.strip("\r\n"), False)
                          for cell in raw.split("\t")]
            print("\t".join(fields))

    elif sanitizeby == 'newick':
        from ete3 import Tree
        tree = Tree("".join(container))
        for leaf in tree:
            leaf.name = sanitizeString(leaf.name, False)
        print(tree.write())

    elif sanitizeby == 'fasta':
        from Bio import SeqIO
        from io import StringIO
        from sys import stdout
        records = SeqIO.parse(StringIO("".join(container)), "fasta")
        for record in records:
            # the sanitized full description becomes the id
            record.id = sanitizeString(record.description, False)
            record.description = ''
            SeqIO.write(record, stdout, "fasta")
コード例 #4
0
ファイル: makeTree.py プロジェクト: andrewwhwang/autoblast
def createImg(filename, thres=0, samples=1):
    """Render a tree with per-leaf heatmap profiles to ``phylo.png``.

    The image is written into the directory that contains `filename` (the
    current directory when `filename` has no directory part).

    Args:
        filename: Input file handed to ``parseLineage``.
        thres: Threshold forwarded to ``getSuffixandMatrixandNewick``.
        samples: Sample count forwarded to ``getSuffixandMatrixandNewick``.
    """
    import os

    count = parseLineage(filename)
    suffix, matrix, taxo = getSuffixandMatrixandNewick(count, thres, samples)
    newick = convert(taxo, suffix) + ';'

    t = Tree(newick, format=1)
    ct = ClusterTree(t.write(), text_array=matrix)
    addColors(ct)

    # nodes are linked to the array table
    array = ct.arraytable
    # Stats over the finite matrix values, needed to establish the colour
    # gradient. Rows are iterated directly so this also runs on Python 3
    # (no xrange).
    matrix_dist = [value for row in array.matrix for value in row
                   if np.isfinite(value)]
    matrix_max = np.max(matrix_dist)
    matrix_min = np.min(matrix_dist)
    matrix_avg = (matrix_max + matrix_min) / 2
    # Profile face that represents a node's profile as a heatmap
    profileFace = ProfileFace(matrix_max, matrix_min, matrix_avg, 200, 14,
                              "heatmap", colorscheme=3)

    def mylayout(node):
        # Only leaves get the heatmap profile
        if node.is_leaf():
            add_face_to_node(profileFace, node, 0, aligned=True)
            node.img_style["size"] = 2

    # Use the layout above to draw the tree
    ts = TreeStyle()
    ts.layout_fn = mylayout
    # os.path keeps a bare file name relative; splitting on '/' and
    # re-joining produced the absolute path '/phylo.png' in that case.
    ct.render(os.path.join(os.path.dirname(filename), 'phylo.png'),
              tree_style=ts)
コード例 #5
0
def get_example_tree():
    """Return a random 20-leaf tree and a circular TreeStyle to draw it.

    Every node gets a random integer ``weight`` feature in [0, 50].
    """
    tree = Tree()
    tree.populate(20, random_branches=True)
    for node in tree.traverse():
        node.add_features(weight=random.randint(0, 50))

    style = TreeStyle()
    style.layout_fn = layout          # custom layout function
    style.mode = "c"
    style.show_leaf_name = False      # node names are added manually
    style.show_branch_length = True   # show branch data
    style.show_branch_support = True

    return tree, style
コード例 #6
0
def main():
    """Rescale the support values of a tree and write the result.

    With ``args.decimal`` set, supports >= 1 are multiplied by 0.01;
    otherwise supports <= 1 are multiplied by 100 and truncated to int.
    Values already in the target range are left untouched and reported on
    stderr. The output replaces the input file when ``args.replace`` is
    set, else it goes to ``args.tree`` + extension.
    """
    args = parse_args()

    # Use the extension specified by the user if present
    ext = args.extension if args.extension else '.mod.tre'

    # Load the tree
    t = Tree(args.tree)

    # Iterate over the nodes, convert to desired value
    for node in t.iter_search_nodes():
        if args.decimal:
            if node.support >= 1:
                node.support = float(node.support * 0.01)
            else:
                # One-line message: the old backslash continuation inside
                # the literal embedded the next line's indentation in it.
                sys.stderr.write(
                    'bootstrap value in {} is < 1, ignoring.\n'.format(args.tree))
        else:
            if node.support <= 1:
                node.support = int(node.support * 100)
            else:
                sys.stderr.write(
                    'bootstrap value in {} is > 1, ignoring.\n'.format(args.tree))

    # If the replace flag is set, replace the input file with the output file.
    # Otherwise create a new file with the chosen extension appended.
    out = args.tree if args.replace else args.tree + ext

    t.write(format=0, outfile=out)
コード例 #7
0
ファイル: EXP_MNIST.py プロジェクト: zyzzhaoyuzhe/MDPD
def show_tree(experiment_folder):
    """Draw the saved hierarchical model as a circular tree of image faces.

    Loads ``model.p`` from `experiment_folder`, builds a tree with
    ``model.width`` children per node over ``model.depth`` levels below the
    root, attaches ``images/<idx>_<parent>_<kid>.png`` to every node,
    renders the tree to ``images/tree_plot.png`` and returns the root.
    """
    model = MDPD.Hierachical_MDPD(1)
    model.load(os.path.join(experiment_folder, 'model.p'))
    width, depth = model.width, model.depth

    image_dir = os.path.join(experiment_folder, 'images')
    root = Tree()

    # breadth-first construction: `level` holds (index, node) of one depth
    level = [(0, root)]
    for level_no in range(depth + 1):
        following = []
        for idx, node in level:
            paren = int((idx - 1) / width)
            kid = idx - paren * width
            png = os.path.join(image_dir, '{}_{}_{}.png'.format(idx, paren, kid))
            node.add_face(faces.ImgFace(png), 0)

            if level_no < depth:
                following.extend(
                    (idx * width + k + 1, node.add_child()) for k in range(width))
        level = following

    style = TreeStyle()
    style.mode = "c"

    root.render(os.path.join(image_dir, 'tree_plot.png'), tree_style=style)
    return root
コード例 #8
0
ファイル: node.py プロジェクト: mlberkeley/genetic-algs
    def ete_draw(self, fname=None):
        """ Render the tree to `fname`, or open a viewer when it is None.

            Does nothing when Cfg.USE_ETE3 is false.

            Args:
                fname: filename to save to (default=None)
        """
        if not Cfg.USE_ETE3:
            # TODO maybe throw an error?
            return

        def layout(node):
            # put every node's name to the right of its branch
            faces.add_face_to_node(AttrFace("name"), node, column=0,
                                   position="branch-right")

        style = TreeStyle()
        style.show_leaf_name = False
        style.layout_fn = layout
        style.rotation = 90

        ete_tree = EteTree(self.ete_str(), format=8)

        if fname:
            ete_tree.render(fname, tree_style=style)
        else:
            ete_tree.show(tree_style=style)
コード例 #9
0
def main(treefile, to, metric):
    """Print a normalised copy of every tree in `treefile`.

    Each line of the file is parsed as one tree, passed through
    ``alphbetise_names`` and ``normalise_tree(tree, to, metric)`` and
    printed with ``write(format=5)``.
    """
    with open(treefile) as handle:
        for newick in handle:
            renamed = alphbetise_names(Tree(newick))
            normalised = normalise_tree(renamed, to, metric)
            print(normalised.write(format=5))
コード例 #10
0
ファイル: raysurveyor-gentree.py プロジェクト: zorino/ray
def tree_distances(file):
    """Write the leaf-to-leaf (patristic) distance matrix of a tree as TSV.

    The table goes to ``<file>.patristic-dist.tsv``: a header row of leaf
    names (with a leading tab) followed by one row per leaf. Distances are
    clipped below at 0.0 and formatted with ``%f``.

    Args:
        file: Tree file name handed to ``Tree``; also the output prefix.
    """
    t = Tree(file)
    all_leaves = t.get_leaves()

    # `with` closes the output even if a distance lookup raises.
    with open(file + ".patristic-dist.tsv", "w") as branch_len_out:
        header = "".join("\t" + leaf.name for leaf in all_leaves)
        branch_len_out.write(header + "\n")

        for leaf1 in all_leaves:
            cells = [str(leaf1.name)]
            for leaf2 in all_leaves:
                distance = np.clip(leaf1.get_distance(leaf2), 0.0, 99999999999999999999999999)
                cells.append("%f" % distance)
            branch_len_out.write("\t".join(cells) + "\n")
コード例 #11
0
def get_example_tree():
    """Return a random 10-leaf tree and a rectangular TreeStyle using `layout`."""
    style = TreeStyle()
    style.layout_fn = layout
    style.mode = "r"
    style.show_leaf_name = False

    tree = Tree()
    tree.populate(10)
    return tree, style
コード例 #12
0
ファイル: graph.py プロジェクト: astonshane/davisputnamGo
def parseTree(root):
    """Recursively convert a nested dict into an ete tree.

    Args:
        root: dict with the keys 'Name', 'Split' and 'Children' (a list of
            dicts of the same form, or a falsy value for none).

    Returns:
        The tree node for `root`, with its 'Split' text attached as a red
        face below the branch and one child per entry of 'Children'.
    """
    tree = Tree()
    tree.name = root['Name']
    tree.add_face(TextFace(root['Split'], fgcolor="red"), column=0, position="branch-bottom")
    for child in root['Children'] or ():
        # add_child() also records the parent on the child node; appending
        # to tree.children directly skipped that link.
        tree.add_child(parseTree(child))
    return tree
コード例 #13
0
ファイル: ete_generate.py プロジェクト: Ward9250/ete
def run(args):
    """Generate ``args.number`` random trees of ``args.size`` leaves and dump each."""
    import random
    from ete3 import Tree

    for _ in range(args.number):
        random_tree = Tree()
        random_tree.populate(args.size, random_branches=args.random_branches)
        dump(random_tree)
コード例 #14
0
 def __init__(self, *args, **kargs):
     """Parse the tree with format=1 and derive a ``constraint`` per node.

     A node named "NoName" gets ``constraint = None``. Any other name is
     rewritten: "{" -> "(", "}" -> ")", "@" -> "__target", "|" -> ",".
     """
     kargs["format"] = 1
     Tree.__init__(self, *args, **kargs)
     substitutions = (("{", "("), ("}", ")"), ("@", "__target"), ("|", ","))
     for node in self.traverse():
         if node.name == "NoName":
             node.constraint = None
             continue
         expression = node.name
         for old, new in substitutions:
             expression = expression.replace(old, new)
         node.constraint = expression
コード例 #15
0
def nhx2key(nhxtree):
    """Map each node of a PHYLDOG nhx tree to its ``ND`` value.

    The key of a node is the "|"-joined sorted list of the leaf names
    below it.
    """
    tree = Tree(nhxtree)
    return {
        "|".join(sorted(node.get_leaf_names())): node.ND
        for node in tree.traverse()
    }
コード例 #16
0
def revBayesTree2key(file):
    """Parse a revBayes node index tree file and create a key for each node.

    The key of a node is the "|"-joined sorted list of its leaf names with
    any ``[&index=N]`` annotation removed. The value is
    ``nodeIndexFromString(node.name)``.
    """
    # Raw string: "\[" and "\d" are invalid escapes in a normal literal and
    # trigger a SyntaxWarning on recent Pythons. Compiled once, not per leaf.
    index_tag = re.compile(r'\[&index=\d+\]')

    t = Tree(file, format=1)
    keyD = {}
    for node in t.traverse():
        k = "|".join(sorted(index_tag.sub('', n) for n in node.get_leaf_names()))
        keyD[k] = nodeIndexFromString(node.name)
    return keyD
コード例 #17
0
ファイル: item_faces.py プロジェクト: AlishaMechtley/ete
def get_example_tree():
    """Return a random 8-leaf tree and a TreeStyle with a title, laid out by `master_ly`."""
    tree = Tree()
    tree.populate(8, reuse_names=False)

    style = TreeStyle()
    style.layout_fn = master_ly
    title = faces.TextFace("Drawing your own Qt Faces", fsize=15)
    style.title.add_face(title, 0)
    return tree, style
コード例 #18
0
ファイル: face_rotation.py プロジェクト: AlishaMechtley/ete
def get_example_tree():
    """Return a random 10-leaf tree and a TreeStyle rotated by 45 degrees."""
    tree = Tree()
    tree.populate(10)

    style = TreeStyle()
    style.rotation = 45
    style.show_leaf_name = False
    style.layout_fn = rotation_layout
    return tree, style
コード例 #19
0
def builtTree(phylo_tree_pic, paralogs_file):
    """Align the sequences with ClustalW and render the resulting tree.

    Args:
        phylo_tree_pic: Output image path handed to ``Tree.render``.
        paralogs_file: Sequence file used as ClustalW ``infile``.
    """
    import os

    print("Aligning top 50 genes for phylogenetic tree...")
    # build the alignment
    clustalw_cline = ClustalwCommandline("clustalw", infile=paralogs_file)
    clustalw_cline()
    # Load the tree file: the input name with its extension replaced by
    # ".dnd". splitext handles any extension length; the old [:-6] slice
    # only worked for a 6-character one such as ".fasta".
    tree = Tree(os.path.splitext(paralogs_file)[0] + ".dnd")
    # draw the tree and write it out
    tree.render(phylo_tree_pic)
コード例 #20
0
def get_example_tree():
    """Return a random 100-leaf tree and a circular TreeStyle using `layout`."""
    style = TreeStyle()
    style.layout_fn = layout
    style.mode = "c"
    style.show_leaf_name = True
    style.min_leaf_separation = 15

    tree = Tree()
    tree.populate(100)
    return tree, style
コード例 #21
0
ファイル: node.py プロジェクト: mlberkeley/genetic-algs
 def ete_print(self):
     """ Pretty print.

         With Cfg.USE_ETE3 set, prints an ASCII drawing of the tree
         (internal nodes included) and returns None; otherwise returns
         ``str(self)`` without printing.

         TODO Debug and document better for case USE_ETE3 == False
     """
     if not Cfg.USE_ETE3:
         return str(self)

     ete_tree = EteTree(self.ete_str(), format=1)
     print(ete_tree.get_ascii(show_internal=True))
コード例 #22
0
ファイル: balances.py プロジェクト: mortonjt/canvas
def balanceplot(balances, tree,
                layout=None,
                mode='c'):
    """ Plots balances on tree.

    Parameters
    ----------
    balances : np.array
        A vector of internal nodes and their associated real-valued balances.
        The order of the balances will be assumed to be in level order.
    tree : skbio.TreeNode
        A strictly bifurcating tree defining a hierarchical relationship
        between all of the features within `table`.
    layout : function, optional
        A layout for formatting the tree visualization. Must take a
        `ete.tree` as a parameter. `default_layout` is used when omitted.
    mode : str
        Type of display to show the tree. ('c': circular, 'r': rectangular).

    Returns
    -------
    tuple
        The ete tree, whose internal nodes carry a `weight` feature, and
        the `TreeStyle` configured to draw it.

    Note
    ----
    The `tree` is assumed to be strictly bifurcating and
    its tips are assumed to match `balances`.

    See Also
    --------
    TreeNode.levelorder
    """
    # The names aren't preserved - let's pray that the topology is consistent.
    ete_tree = Tree(str(tree))

    internal_nodes = (n for n in ete_tree.traverse() if not n.is_leaf())
    for i, n in enumerate(internal_nodes):
        # NOTE(review): balances[-i] reads index 0 for the first internal
        # node and then counts back from the end (-1, -2, ...); confirm this
        # is the intended order.
        n.add_features(weight=balances[-i])

    ts = TreeStyle()
    ts.layout_fn = default_layout if layout is None else layout
    ts.mode = mode

    # Node names are added manually by the layout
    ts.show_leaf_name = False
    # Show branch data
    ts.show_branch_length = True
    ts.show_branch_support = True

    return ete_tree, ts
コード例 #23
0
ファイル: phylo.py プロジェクト: tanghaibao/jcvi
def smart_reroot(treefile, outgroupfile, outfile, format=0):
    """
    simple function to reroot Newick format tree using ete2

    The outgroup file is read line by line; the first non-blank line that
    is a prefix of at least one leaf name selects the outgroup (all leaves
    starting with it). The rerooted tree is written to `outfile`.

    Returns `outfile` on success, or `treefile` unchanged when no line of
    the outgroup file matches any leaf.

    Tree reading format options see here:
    http://packages.python.org/ete2/tutorial/tutorial_trees.html#reading-newick-trees
    """
    tree = Tree(treefile, format=format)
    leaves = [t.name for t in tree.get_leaves()][::-1]
    outgroup = []
    for o in must_open(outgroupfile):
        o = o.strip()
        if not o:
            # A blank line is an empty prefix and would match every leaf.
            continue
        outgroup = [leaf for leaf in leaves if leaf.startswith(o)]
        if outgroup:
            break

    if not outgroup:
        print("Outgroup not found. Tree {0} cannot be rerooted.".format(treefile), file=sys.stderr)
        return treefile

    try:
        tree.set_outgroup(tree.get_common_ancestor(*outgroup))
    except ValueError:
        # Fall back to rooting on the first matching leaf alone.
        tree.set_outgroup(outgroup[0])
    tree.write(outfile=outfile, format=format)

    logging.debug("Rerooted tree printed to {0}".format(outfile))
    return outfile
def treeorder(treefile):
    """Return the leaf names of the tree in preorder traversal order."""
    from ete3 import Tree
    from ete3.treeview import faces, TreeStyle, NodeStyle, AttrFace

    root = Tree(treefile).get_tree_root()
    return [
        node.name
        for node in root.iter_descendants("preorder")
        if node.is_leaf()
    ]
コード例 #25
0
    def parse_newick(self):
        """Parse ``self.nw_str`` into ``self.tree`` and initialise node ids.

        The default newick format is tried first, then ``format=1``.
        Returns True on success, or an error message string when both
        attempts raise NewickError.
        """
        failure = None
        for options in ({}, {"format": 1}):
            try:
                self.tree = Tree(self.nw_str, **options)
            except NewickError as e:
                failure = e
            else:
                break
        else:
            # both formats failed; report the error of the last attempt
            return "Newick Parsing Error: "+str(failure)

        self.init_nodeids()
        return True
コード例 #26
0
def replace_names(tree_file, replacer):
    """Rename the tips of a tree via `replacer` and return the newick string.

    Tip names are looked up with surrounding single quotes stripped. A tip
    with no entry keeps its name and is reported on stderr; the tree is
    returned either way.
    """
    tree = Tree(tree_file)
    for tip in tree.iter_leaves():
        try:
            replacement = replacer[tip.name.strip("'")]
        except KeyError:
            print("ERROR: Tip is missing from replacement file: '{}'".format(
                  tip.name), file=sys.stderr)
        else:
            tip.name = replacement
    return tree.write()
コード例 #27
0
ファイル: tree.py プロジェクト: D-PLACE/dplace
def update_newick(t, labels):
    """Prune the newick string of `t` to the labels that belong to it.

    Returns True when ``t.newick_string`` was rewritten, False when no
    label belongs to `t` or a TreeError was raised.
    """
    langs_in_tree = {
        str(label.label) for label in labels if label.languageTree_id == t.id
    }
    if not langs_in_tree:
        return False

    try:
        tree = Tree(t.newick_string, format=1)
        prune(tree, langs_in_tree, const_depth=t.name.startswith('glottolog_'))
        t.newick_string = tree.write(format=1)
    except TreeError:
        return False
    return True
コード例 #28
0
ファイル: ncbiquery.py プロジェクト: Ward9250/ete
def load_ncbi_tree_from_dump(tar):
    """Build the NCBI taxonomy tree from an opened taxdump archive.

    Args:
        tar: Archive object providing ``extractfile`` for "names.dmp" and
            "nodes.dmp" (byte lines with "|"-separated fields).

    Returns:
        (t, synonyms): the node named "1" with every other node attached
        under its parent, and a set of ``(node name, synonym)`` pairs.
    """
    from ete3 import Tree
    # Download: ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz

    # Name classes stored as synonyms. Built once here instead of once per
    # line of names.dmp.
    synonym_types = frozenset(["synonym", "equivalent name", "genbank equivalent name",
                               "anamorph", "genbank synonym", "genbank anamorph", "teleomorph"])

    child2parent = {}
    name2node = {}
    node2taxname = {}
    synonyms = set()
    node2common = {}
    print("Loading node names...")
    for line in tar.extractfile("names.dmp"):
        line = str(line.decode())
        fields = [field.strip() for field in line.split("|")]
        nodename = fields[0]
        name_type = fields[3].lower()
        taxname = fields[1]
        if name_type == "scientific name":
            node2taxname[nodename] = taxname
        if name_type == "genbank common name":
            node2common[nodename] = taxname
        elif name_type in synonym_types:
            synonyms.add( (nodename, taxname) )
    print(len(node2taxname), "names loaded.")
    print(len(synonyms), "synonyms loaded.")

    print("Loading nodes...")
    for line in tar.extractfile("nodes.dmp"):
        line = str(line.decode())
        fields =  line.split("|")
        nodename = fields[0].strip()
        parentname = fields[1].strip()
        n = Tree()
        n.name = nodename
        n.taxname = node2taxname[nodename]
        if nodename in node2common:
            n.common_name = node2common[nodename]
        n.rank = fields[2].strip()
        child2parent[nodename] = parentname
        name2node[nodename] = n
    print(len(name2node), "nodes loaded.")

    print("Linking nodes...")
    for nodename, node in name2node.items():
        if nodename == "1":
            # the node named "1" is returned as the tree root
            t = node
        else:
            name2node[child2parent[nodename]].add_child(node)
    print("Tree is loaded.")
    return t, synonyms
コード例 #29
0
ファイル: plot.py プロジェクト: simonvh/gimmemotifs
def _get_motif_tree(tree, data, circle=True, vmin=None, vmax=None):
    """Build a styled ete3 tree whose branches are coloured by `data`.

    Each node is coloured by the mean of the `data` columns named after
    the leaves below it (RdBu_r colour map), and its line width grows with
    the absolute mean (minimum 5).

    Args:
        tree: Newick tree handed to ``ete3.Tree``.
        data: Table indexable by a list of leaf names, with ``.values``.
        circle: Draw a half-circle layout when True.
        vmin, vmax: Colour scale limits. A limit left as None is derived
            from the first percentile in 90..100 of `data` that is > 0.

    Returns:
        (t, ts): the styled tree and its TreeStyle.
    """
    try:
        from ete3 import Tree, NodeStyle, TreeStyle
    except ImportError:
        print("Please install ete3 to use this functionality")
        sys.exit(1)

    t = Tree(tree)

    # Determine cutoff for color scale. Compared against None so that an
    # explicit limit of 0 is respected instead of being treated as missing.
    if vmin is None or vmax is None:
        for i in range(90, 101):
            minmax = np.percentile(data.values, i)
            if minmax > 0:
                break
        if vmin is None:
            vmin = -minmax
        if vmax is None:
            vmax = minmax

    norm = Normalize(vmin=vmin, vmax=vmax, clip=True)
    mapper = cm.ScalarMappable(norm=norm, cmap="RdBu_r")

    # scale factor from data value to line width
    m = 25 / data.values.max()

    for node in t.traverse("levelorder"):
        val = data[[l.name for l in node.get_leaves()]].values.mean()
        color = to_hex(mapper.to_rgba(val))
        width = max(np.abs(m * val), 5)

        style = NodeStyle()
        style["size"] = 0
        style["hz_line_color"] = color
        style["vt_line_color"] = color
        style["vt_line_width"] = width
        style["hz_line_width"] = width

        node.set_style(style)

    ts = TreeStyle()

    ts.layout_fn = _tree_layout
    ts.show_leaf_name= False
    ts.show_scale = False
    ts.branch_vertical_margin = 10

    if circle:
        ts.mode = "c"
        ts.arc_start = 180 # 0 degrees = 3 o'clock
        ts.arc_span = 180

    return t, ts
コード例 #30
0
ファイル: nwkhandler.py プロジェクト: AdmiralenOla/Scoary
def ReadTreeFromFile(filepath):
    """
    Read a newick tree file with ete3 and convert it to a nested list.

    Exits the program when the tree raises NewickError. Polytomies are
    resolved recursively before ``RecTree2List`` does the conversion.
    Returns the nested list and the members it reports.
    """
    try:
        parsed = Tree(filepath)
    except NewickError as e:
        sys.exit("Corrupted or non-existing custom tree file? %s" % e)

    parsed.resolve_polytomy(recursive=True)
    nested, members = RecTree2List(parsed, Members=None)
    return nested, members
コード例 #31
0
            or row.str.contains('Smith').any()
            or row.str.contains('Branstetter').any()
            or row.str.contains('Crawford').any()
            or row.str.contains('Leache').any()
            or row.str.contains('uce').any()):
        return 'UCE'

    return 'other'


if __name__ == '__main__':
    print(
        "In order to run this script all files must have the same name and extension and they should be saved in directories that have the datasets name. Please see an example below"
    )
    diagram = Tree(
        "((----->alignmentFileName.nex, ----->IQtreeFileName.iqtree)----->dataset1Dir, (----->alignmentFileName.nex, ----->IQtreeFileName.iqtree)----->dataset2Dir, (----->alignmentFileName.nex, ----->IQtreeFileName.iqtree)----->dataset3Dir)rootDir;",
        format=1)
    print(diagram.get_ascii(show_internal=True))
    proceed = input("do you want to proceed? Y/N\n")
    if proceed == 'Y':
        rootDir = '/data/Suha/GTR_parameters_dist/DNA/'  #the rootDir name to the directories that contain the tree files
        IQtreeFileName = 'alignment.nex.iqtree'  #the name of the iqtree file with .iqtree extension
        alignmentFileName = 'alignment.nex'  #the name of the alignment file with extension
        parametersFile = 'GTRparam.csv'  #the name of the GTR parameters output file with .csv extension

        df = pd.DataFrame()

        for DirName, subdirList, fileList in os.walk(rootDir):
            if IQtreeFileName in fileList:
                '''if you didn't allow different GTR models for each partition, please use parameters2 function instead of parametres function'''
                try:
コード例 #32
0
# Walk over every tree file, load the trees it contains, round their branch
# lengths and attach name faces to the named branches.
for file in treefiles:
    # Only files with "newick" in their name are tree files.
    if "newick" not in file:
        continue
    # label: file name up to the first "."; patient: label up to the first "_"
    label = file.split(".")[0]
    patient = label.split("_")[0]
    if "all" not in file and hasSix(patient):
        #Skip 4-tip versions of the trees.
        continue
    # Optional restriction to a chosen subset of labels.
    if onlysomepatients and label not in somepatients:
        continue
    trees = []
    # One tree per line; lines containing "#" are skipped.
    for line in open(treedir + file, "r"):
        if "#" in line:
            continue
        tree = Tree(line.rstrip())
        trees.append(tree)

    for index, tree in enumerate(trees):
        print(tree)
        # Round every branch length to the nearest integer.
        for branch in tree.traverse():
            branch.dist = round(branch.dist)
        line = tree.write(format=1)
        rootlen = round(tree.get_tree_root().dist)
        # Drop the last character of the serialized tree (presumably the
        # terminating ";") and re-append it after the rounded root length.
        line = line[:-1] + ":" + str(rootlen) + ";"
        for branch in tree.traverse():
            if branch.name != "":
                name_face = AttrFace("name", fsize=30)
                branch.add_face(name_face, column=3, position="branch-right")
                # Key: 1-tuple holding the first "_"-separated piece of the
                # part of the branch name that precedes the first "-".
                pngfile = sigpngs[label][tuple(
                    branch.name.split("-")[0].split("_")[0:1])]
コード例 #33
0
#!/homes/carlac/anaconda_ete/bin/python

# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2019] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys, os
from ete3 import Tree

# Read the tree given on the command line, unroot it and print it in
# newick format 5.
if len(sys.argv) < 2:
    sys.stderr.write("Usage: %s <tree_file>\n" % sys.argv[0])
    sys.exit(1)

infile = sys.argv[1]
if not os.path.isfile(infile):
    # write() takes a single string, so the message is formatted first;
    # passing the file name as a second argument raised TypeError.
    sys.stderr.write("File %s not found\n" % infile)
    sys.exit(1)

t = Tree(infile)
root = t.get_tree_root()
root.unroot()
print(root.write(format=5))
コード例 #34
0
def orthologies_with_outgroup(forest, duplicated_sp, outgroup, dict_genes,
                              out):
    """
    Browses a gene tree forest and searches for orthologs with the outgroup.
    Writes genes without phylogenetic orthologs to a file.
    Also writes files with high-confidence orthologs and paralogs to use to optimize the synteny
    support threshold to call orthology.

    Args:
        forest (str): name of the gene trees forest file
        duplicated_sp (list of str): list of all duplicated species for the considered WGD
        outgroup (str): non-duplicated outgroup
        dict_genes (dict of GeneSpeciesPosition tuples): all gene positions for each species
        out (str): output file to write genes without phylogenetic orthologs

    Returns:
        dict: orthologs of outgroup genes in each duplicated species

    Note:
        #FIXME Written to work within scorpios as orthologs and paralogs file names are derived
        from output file patterns, assuming it contains an '_'.

    """

    ortho = {e: {} for e in duplicated_sp}

    # Derive the two extra output names by substituting the part of the output
    # file name that precedes its first '_'.
    # NOTE(review): str.replace substitutes every occurrence of that prefix in
    # the whole path, not only the one in the file name.
    orthofile = out.replace(out.split("/")[-1].split('_')[0], "orthologs")
    parafile = out.replace(out.split("/")[-1].split('_')[0], "paralogs")

    with open(out, 'w') as outfile, open(forest, 'r') as infile, open(parafile, 'w') as out_para,\
         open(orthofile, 'w') as out_ortho:

        sys.stderr.write(
            "Browsing gene trees for orthologies with the outgroup...\n")

        for tree in ut.read_multiple_objects(infile):

            #load tree
            tree = Tree(tree.strip(), format=1)
            node2leaves = tree.get_cached_content()
            leaves = [i for i in tree.get_leaves()]

            #add a tag to genes of duplicated species
            tag_duplicated_species(leaves, duplicated_sp)

            #find all clades with only genes of duplicated species
            subtrees = tree.get_monophyletic(values=["Y"],
                                             target_attr="duplicated")

            #find all outgroup genes
            outgroup_genes = [i for i in leaves if i.S == outgroup]

            #search for an ortholog gene in the outgroup for all clades of teleost genes
            for subtree in subtrees:

                # seen maps the current subtree to its candidate outgroup orthologs,
                # stored as (topo_distance, branch_distance, outgroup leaf) tuples
                seen = {}
                subtree_leaves = subtree.get_leaves()
                found = False

                #browse all outgroup genes
                for j in outgroup_genes:

                    #find the node that splits the outgroup gene and duplicated species genes
                    lca = tree.get_common_ancestor(subtree, j)
                    # topological distance = number of leaves under the common ancestor
                    topo_distance = len(node2leaves[lca])

                    # if it is a speciation or dubious duplication node --> speciation
                    if org.is_speciation(lca):
                        branch_distance = tree.get_distance(subtree, j)
                        if subtree not in seen:
                            seen[subtree] = []
                        seen[subtree].append(
                            (topo_distance, branch_distance, j))
                        found = True

                # if no 'true' ortholog
                # check if all descendants include only outgroup + duplicated species
                if not found:
                    for j in outgroup_genes:
                        lca = tree.get_common_ancestor(subtree, j)

                        for gene in lca.get_leaves():
                            if gene.duplicated != "Y" and gene.S != outgroup:
                                break

                        #if no break, it means all descendants are outgroup or dup.
                        else:
                            topo_distance = len(node2leaves[lca])
                            branch_distance = tree.get_distance(subtree, j)
                            seen[subtree] = seen.get(subtree, [])
                            seen[subtree].append(
                                (topo_distance, branch_distance, j))

                # if an ortholog was found, add it to the orthology dict
                if seen:
                    content = []
                    # best candidate first: fewest leaves under the common ancestor,
                    # ties broken by the smaller branch distance
                    seen[subtree].sort(key=lambda x: (x[0], x[1]))
                    outgroup_gene = seen[subtree][0]
                    outgroup_gene = outgroup_gene[2].name
                    for species in duplicated_sp:
                        genes = [
                            i.name for i in subtree_leaves if i.S == species
                        ]
                        genes = get_genes_positions(genes, species, dict_genes)

                        ortho[species][outgroup_gene] = ortho[species].get(
                            outgroup_gene, [])
                        ortho[species][outgroup_gene] += genes


                        # one "<gene>_<species>|<chromosome>|<index>" entry per gene
                        content += [g.name+'_'+species.replace(' ', '.')+\
                                         '|'+str(g.chromosome)+\
                                         '|'+str(g.index) for g in genes]

                    # outgroup genes of this tree that are not candidate orthologs
                    # of the subtree are treated as paralogs
                    all_ortho = [i[2].name for i in seen[subtree]]
                    paralogs = [
                        i.name for i in outgroup_genes
                        if i.name not in all_ortho
                    ]

                    if paralogs:
                        # one randomly chosen paralog is written per subtree
                        paralog = random.choice(paralogs)

                        # only write the pair when both genes have a known position
                        if paralog in dict_genes[outgroup]\
                           and outgroup_gene in dict_genes[outgroup]:

                            tmp_dict = dict_genes[outgroup]

                            out_ortho.write(' '.join(content) + '\t')
                            out_ortho.write(str(outgroup_gene)+'|'+\
                                            str(tmp_dict[outgroup_gene].chromosome)+'|'+\
                                            str(tmp_dict[outgroup_gene].index)+'|'+str(0)+'|'+\
                                            str(0)+'\n')

                            out_para.write(' '.join(content) + '\t')
                            out_para.write(str(paralog)+'|'+\
                                           str(tmp_dict[paralog].chromosome)+'|'+\
                                           str(tmp_dict[paralog].index)+'|'+\
                                           str(0)+'|'+str(0)+'\n')

                # if no ortholog found
                # write genes without ortholog along with all outgroup genes in tree
                # (potential candidate for orthology)
                elif any(i.name in dict_genes[outgroup]
                         for i in outgroup_genes):

                    #genes without orthologs
                    missed_genes = []
                    for species in duplicated_sp:
                        genes = [
                            i.name for i in subtree_leaves if i.S == species
                        ]
                        genes = get_genes_positions(genes, species, dict_genes)
                        missed_genes += [g.name+'_'+species.replace(' ', '.')+\
                                         '|'+str(g.chromosome)+\
                                         '|'+str(g.index) for g in genes]

                    if missed_genes:
                        outfile.write(' '.join(missed_genes) + '\t')

                        #candidate orthologs in the outgroup
                        outgr_genes = [i.name for i in outgroup_genes]

                        # pairs of outgroup genes at topological distance 1 from
                        # each other are reported as in-paralogs
                        in_paralogs = []
                        for pair in itertools.combinations(outgr_genes, 2):
                            if tree.get_distance(pair[0],
                                                 pair[1],
                                                 topology_only=True) == 1:
                                in_paralogs.append(pair[0] + '|' + pair[1])

                        outgr_write = []
                        genome = dict_genes[outgroup]
                        for gene in outgr_genes:
                            # outgroup genes without a known position are left out
                            if gene in genome:

                                lca = tree.get_common_ancestor(subtree, gene)
                                branch_distance = tree.get_distance(
                                    subtree, gene)
                                topo_distance = len(node2leaves[lca])

                                outgr_write.append(str(gene)+'|'+str(genome[gene].chromosome)+'|'+\
                                                   str(genome[gene].index)+'|'+str(topo_distance)+\
                                                   '|'+str(branch_distance))

                        outfile.write(' '.join(outgr_write) + '\t' +
                                      ' '.join(in_paralogs) + '\n')

    sys.stderr.write("Phylogenetic orthologies with the outgroup OK\n")

    return ortho
コード例 #35
0
ファイル: treeTest.py プロジェクト: IsaacGluck/lab_of_oz
#!/usr/bin/env python
from ete3 import Tree, PhyloTree
from random import *


# GET THE 1st BOOTSRAP SAMPLE TREE
filename = "for_isaac/RAxML_bootstrap.orfg1"
file = open(filename, "r")
first_tree = file.readline()[:-1] # [:-1] Gets ride of newline at the end of the line

# MAKE IT INTO AN ETE TREE
t = Tree(first_tree, format=1)
print "ORIGINAL TREE\n"
print t

# GET A LIST OF THE LEAVES (by name or node class)
print "\n LEAVES"
# leaves = t.get_leaves()
leaves = t.get_leaf_names()
for index, leaf in enumerate(leaves):
	print (index, leaf)

# GET 4 RANDOM INDICES TO PRUNE
indices = sample(range(0, len(leaves)), 4)
print "\nRANDOM 4 INDICES: " + ', '.join(str(x) for x in indices)

# USE THOSE INDICES TO GET 4 RANDOM NODES
to_prune = []
for index in indices:
	to_prune.append(leaves[index])
コード例 #36
0
def Max_cut(taxa, trip_d):
    """Recursively build a tree over `taxa` from the triplets in `trip_d`.

    Args:
        taxa: collection of taxon names. When it holds exactly one name that
            name is removed with ``taxa.pop()`` and returned.
        trip_d: mapping from a comma-separated key of taxon names to a list
            of triplets. Each triplet is indexed as ``tri[0]`` (a single
            taxon) and ``tri[1]`` (a pair of taxa).

    Returns:
        The bare taxon name when `taxa` has one element, otherwise a Tree
        whose children are the recursively built parts.
    """
    t = Tree()

    # Base cases: two taxa form a cherry, one taxon is returned as its name.
    if len(taxa) == 2:
        first, second = list(taxa)
        t.add_child(name=first)
        t.add_child(name=second)
        return t
    if len(taxa) == 1:
        return taxa.pop()

    good, bad = [], []
    # Row/column index of every taxon in the two count matrices.
    d = {taxon: position for position, taxon in enumerate(taxa)}
    size = len(taxa)
    good_mat = [[0] * size for _ in range(size)]
    bad_mat = [[0] * size for _ in range(size)]

    taxon_set = set(taxa)
    for key in trip_d:
        # Only triplets whose key names all lie inside the current taxon set.
        if not set(key.split(',')).issubset(taxon_set):
            continue
        for tri in trip_d[key]:
            pair = tri[1]
            left, right = d[pair[0]], d[pair[1]]
            single = d[tri[0]]

            # An edge is listed once, the first time its count is still zero.
            if bad_mat[left][right] < 1:
                bad.append(pair)
            if good_mat[single][left] < 1:
                good.append((tri[0], pair[0]))
            if good_mat[single][right] < 1:
                good.append((tri[0], pair[1]))

            # Both matrices are kept symmetric.
            bad_mat[left][right] += 1
            bad_mat[right][left] += 1
            good_mat[single][left] += 1
            good_mat[single][right] += 1
            good_mat[left][single] += 1
            good_mat[right][single] += 1

    taxa = set(taxa)
    g = Graph(good, bad, good_mat, bad_mat, d, directed=False)
    parts = clades_from_graph(set(taxa), g)
    # When the graph does not already split into several parts, fall back to
    # findCut to obtain the parts.
    if not len(parts) > 1:
        parts = findCut(set(taxa), g, d)

    for part in parts:
        subtree = Max_cut(part, trip_d)
        if isinstance(subtree, str):
            t.add_child(name=subtree)
        else:
            t.add_child(subtree)
    return t
コード例 #37
0
from ete3 import Tree, NodeStyle, TreeStyle

#output_dir = "C:/Users/ItayM5/Google Drive/MSc/posters and presentations/presentations/supplementary_materials/"
output_dir = "/groups/itay_mayrose/halabikeren/graphics/"
output_name = output_dir + "tree.png"

# The Newick string must end with ");" (the original had ";)"), and it is
# parsed exactly once. format=1 is the ete3 Newick flavour that reads the
# labels after a closing parenthesis (N1, N2) as internal node names.
tree_str = '((S1:1,S2:1)N1:1,(S3:1,S4:1)N2:1);'
tree = Tree(tree_str, format=1)

# Basic tree style
ts = TreeStyle()
ts.show_leaf_name = True
ts.show_scale = True
ts.rotation = 90
ts.branch_vertical_margin = 5

# Node style named for internal nodes: solid silver lines with a small
# silver circle.
internals = NodeStyle()
internals["hz_line_type"] = 0
internals["vt_line_type"] = 0
internals["vt_line_width"] = 2
internals["hz_line_width"] = 2
internals["hz_line_color"] = "Silver"  #454545" darker gray
internals["vt_line_color"] = "Silver"
internals["shape"] = "circle"
internals["size"] = 3
internals["fgcolor"] = "Silver"

# Node style named for clades.
clade = NodeStyle()
clade["hz_line_type"] = 0
clade["vt_line_type"] = 0
clade["vt_line_width"] = 2
コード例 #38
0
    def generateGuestPoints(self, pipeWidth=75):
        """Insert loss nodes into the guest tree and give every guest node a coord.

        Works in four passes: (1) insert "L_" loss nodes into ``self.guest``,
        (2) assign a ``level`` feature to every guest node, (3) count how many
        guest nodes sit at each level of each host node, (4) compute an
        ``(x, y)`` ``coord`` feature for every guest node from the ``coord``
        of its host node.

        :param pipeWidth: horizontal half-width used when spreading the guest
            nodes that share a host node and level.
        """

        #Add in loss nodes
        for leaf in self.guest:
            node = leaf
            while node != self.guest:
                host_me = self.nodemap[node]
                host_parent = self.nodemap[node.up]
                if len(node.children) == 2:
                    # Exactly one of the two children maps to this node's host.
                    lchild = self.nodemap[
                        node.children[0]] == host_me and self.nodemap[
                            node.children[1]] != host_me
                    rchild = self.nodemap[
                        node.children[1]] == host_me and self.nodemap[
                            node.children[0]] != host_me
                if len(node.children) == 2 and (lchild or rchild):
                    # Put a loss node above the child that left this host.
                    if self.nodemap[node.children[0]] == host_me:
                        tofix = node.children[1]
                    else:
                        tofix = node.children[0]
                    temp = Tree()
                    temp.name = "L_" + node.name
                    # The new node is looked up in self.nodemap by the later
                    # passes, so it has to be registered there.
                    self.nodemap[temp] = host_me
                    temp.up = tofix.up
                    temp.children = [tofix]
                    tofix.up = temp
                    if tofix == node.children[0]:
                        node.children[0] = temp
                    else:
                        node.children[1] = temp
                if host_me != host_parent and host_me.up != host_parent:
                    #Add loss nodes in
                    dist = host_parent.get_distance(host_me,
                                                    topology_only=True)
                    guest_parent = node.up
                    curr = node
                    for i in range(int(dist)):
                        temp = Tree()
                        temp.name = "L_" + str(i) + "_" + guest_parent.name
                        # Each inserted node maps one host level higher than
                        # the node below it.
                        self.nodemap[temp] = self.nodemap[curr].up
                        temp.up = curr.up
                        temp.children = [curr]
                        curr.up = temp
                        if curr == guest_parent.children[0]:
                            guest_parent.children[0] = temp
                        else:
                            guest_parent.children[1] = temp
                        curr = temp
                    guest_parent = node
                    # node is deliberately not advanced here; the next
                    # iteration re-examines it with its new parent.
                else:
                    node = node.up

        #Add levels
        for node in self.guest.traverse():
            node.add_feature('level', -1)

        for leaf in self.guest:
            node = leaf
            node.level = 0
            currmap = self.nodemap[node]
            currlevel = 0
            node = node.up
            while node != None:
                mymap = self.nodemap[node]
                if mymap == currmap:
                    # Same host as the node below: at least one level higher.
                    node.level = max(node.level, currlevel + 1)
                else:
                    node.level = max(node.level, 0)
                currlevel = node.level
                currmap = mymap
                node = node.up

        #How many points at each level of a node in the host tree?
        rmap = {}  #map of host -> guest
        for key in self.nodemap:
            rkey = self.nodemap[key]
            if rkey in rmap:
                rmap[rkey].append(key)
            else:
                rmap[rkey] = [key]

        hostlevels = {}  # hostnode -> levelcounts
        usedlevels = {
        }  # same as hostlevels, but will count how many of each level have been used so far
        for key in rmap:
            nodes = rmap[key]

            maxlevel = 0
            for node in nodes:
                maxlevel = max(maxlevel, node.level)

            levelsizes = [0 for _ in range(maxlevel + 1)]
            for node in nodes:
                levelsizes[node.level] += 1

            hostlevels[key] = levelsizes
            usedlevels[key] = [0 for _ in range(maxlevel + 1)]

        #Generate Points - this only works for generateSpeciesTree2
        for node in self.guest.traverse():
            hostnode, level = self.nodemap[node], node.level
            used = usedlevels[hostnode][level]
            maxlevel = len(usedlevels[hostnode])
            usedlevels[hostnode][level] += 1

            bottom = hostnode.coord
            if hostnode == self.host:
                # The host root has no parent; use a point 100 units above it.
                top = list(self.host.coord)
                top[1] -= 100
            else:
                top = hostnode.up.coord

            # y: move up from the host node in proportion to the level.
            ydiff = bottom[1] - top[1]
            yused = ydiff * level / maxlevel
            y = bottom[1] - yused

            # x: interpolate along the host branch, then spread the nodes that
            # share this host and level across a band of width 2 * pipeWidth.
            xlow, xhigh = bottom[0], top[0]
            xmid = int(xlow + (xhigh - xlow) * (yused / float(ydiff)))
            xused = int(pipeWidth * 2 * (used + 1) /
                        (float(hostlevels[hostnode][level]) + 1))
            x = xmid + xused - pipeWidth
            node.add_feature('coord', (x, y))
コード例 #39
0
def changeTree(inTree, outTree):
    """Load `inTree`, set every descendant's branch length to 1, write to `outTree`."""
    tree = Tree(inTree)
    for descendant in tree.iter_descendants():
        descendant.dist = 1
    tree.write(outfile=outTree)
コード例 #40
0
ファイル: cut_subtrees.py プロジェクト: DyogenIBENS/SCORPIOS
def write_resolved_tree(orthog_tree, outgr_gene_name, out):
    """
    Writes the solution tree for an orthogroup with only 2 genes.

    The orthogroup node and a new leaf for the outgroup gene are attached
    under a fresh root, the tree is pruned to its own leaves and written in
    newick format 1.

    Args:
        orthog_tree (ete3.TreeNode): Node with the 2 descendants of the orthogroup.
        outgr_gene_name (str): full outgroup gene name (with species tag).
        out (str): filename to write the tree.
    """
    root = Tree()
    root.add_child(orthog_tree)
    root.add_child(name=outgr_gene_name)

    # Pruning to the full leaf list keeps all leaves.
    root.prune(root.get_leaves())

    root.write(outfile=out, format=1)
コード例 #41
0
def main(arg1, arg2):
    """Count the splits of two trees and return a Robinson-Foulds style distance.

    Both trees are loaded with ``Tree(arg1)`` / ``Tree(arg2)``, re-rooted on a
    node chosen by ``getRandomNode`` and reduced to the second child of their
    root. The leaves of the first tree are numbered in postorder and that
    numbering is copied onto the second tree. Internal nodes are then named
    "[min:max]" after the range of leaf numbers below them, and a node of the
    second tree counts as shared when a node with the same name exists in the
    first tree.

    Prints the leaf names of every split left in ``list_splits1`` together
    with the remaining leaves of the tree, then prints and returns
    ``Num_splits1 + Num_splits2 - 2 * Num_shared``.
    """
    list_splits1 = []
    list_splits2 = []
    t1 = Tree()
    tree1 = Tree(arg1)

    tree2 = Tree(arg2)

    # NOTE(review): the node comes from tree1 but is also handed to
    # tree2.set_outgroup - confirm that is accepted for a different tree.
    node_midpoint = getRandomNode(tree1)

    tree1.set_outgroup(node_midpoint)

    tree2.set_outgroup(node_midpoint)

    # Keep only the second child of each root; the first child is discarded
    # into t1.
    t1, tree2 = tree2.get_tree_root().children
    t1, tree1 = tree1.get_tree_root().children
    count = 0
    # Number the named nodes of tree1 in postorder and copy each number to the
    # node with the same name in tree2; unnamed nodes are renamed "int".
    for leaf in tree1.traverse("postorder"):
        if (leaf.name.strip()):
            count += 1
            leaf.add_features(order=count)
            CurrentNode2 = tree2 & leaf.name
            CurrentNode2.add_features(order=count)

        # NOTE(review): compares a name string with a node object - verify
        # what getRandomNode returns.
        elif (leaf.name != node_midpoint):
            leaf.name = "int"

    for node in tree2.traverse("postorder"):
        if (node.name == ""):
            node.name = "int"

    Num_splits1 = 0
    Num_splits2 = 0
    Num_shared = 0
    # First tree: name each "int" node after the range of leaf numbers below
    # it and record its leaf set.
    for node in tree1.traverse("postorder"):
        list_leaves = []
        if ((node.name == "int")):
            Num_splits1 += 1
            cmin = float("+inf")
            cmax = 0
            # Assumes the node has exactly two children.
            d1, d2 = node.get_children()
            # NOTE(review): add_child presumably re-parents d1/d2 under this
            # temporary root, detaching them from `node` - confirm.
            subtree = Tree()
            subtree.add_child(d1)
            subtree.add_child(d2)

            for leaf in subtree:
                list_leaves.append(leaf.name)
                if ((leaf.name != "int")):
                    CurrentNode2 = tree1 & leaf.name
                    cmin = min(CurrentNode2.order, cmin)
                    cmax = max(CurrentNode2.order, cmax)

            if ((node.is_root() == False)):
                node.name = "[" + str(cmin) + ":" + str(cmax) + "]"
                list_splits1.append(sorted(list_leaves))

    # Second tree: same naming, but only when the leaf numbers below the node
    # form a contiguous range.
    for node in tree2.traverse("postorder"):
        list_leaves = []
        if ((node.name == "int") and (node.is_root() == False)):
            Num_splits2 += 1
            cmin = float("+inf")
            cmax = 0
            size = 0
            d1, d2 = node.get_children()
            subtree2 = Tree()
            subtree2.add_child(d1)
            subtree2.add_child(d2)

            for leaf in subtree2:
                size += 1
                list_leaves.append(leaf.name)
                if ((leaf.name != "int") and (leaf.name != node_midpoint)):
                    CurrentNode2 = tree2 & leaf.name
                    cmin = min(CurrentNode2.order, cmin)
                    cmax = max(CurrentNode2.order, cmax)
            if (size == (cmax - cmin + 1)):
                node.name = "[" + str(cmin) + ":" + str(cmax) + "]"
            if (tree1.search_nodes(name=node.name)):
                Num_shared += 1
                # list.remove raises ValueError if this leaf set was not
                # recorded for the first tree.
                list_splits1.remove(sorted(list_leaves))
            else:
                list_splits2.append(sorted(list_leaves))

    # NOTE(review): ts and style1 are built but not used below; leaf_num is
    # published as a module-level global.
    global leaf_num
    ts = TreeStyle()
    ts.show_leaf_name = True

    style1 = NodeStyle()
    style1["hz_line_color"] = "#ff0000"

    leaf_num = len(tree2.get_leaves())

    rf_dist = Num_splits1 + Num_splits2 - (2 * Num_shared)
    # Reload the first tree to get its complete leaf list for printing the
    # two sides of every split that was not shared.
    tree1 = Tree(arg1)
    L = []
    for leaf in tree1:
        L.append(leaf.name)
    L = sorted(L)
    for i in list_splits1:
        rem = set(L) - set(i)
        print(i, "||", list(rem))

    #print(list_splits1)
    #print(list_splits2)

    print(rf_dist)
    return rf_dist
コード例 #42
0
        newick_file = os.path.join(args.outdir,args.prefix+'_newick.tre')
        fp = open(newick_file,'w')
        fp.write(newick)
        fp.close()
        tr = LoadTree(treestring=newick)
        #dendrogram = UnrootedDendrogram(tr)
        #print dendrogram
        #dendrogram.showFigure()

        print(tr.asciiArt())
        ts = TreeStyle()
        ts.show_leaf_name = True
        ts.show_branch_length = True
        ts.show_branch_support = True
        print('NEWICK='+json.dumps(newick))
        rooted_tree = Tree( newick )
        #svgfile = os.path.join('/Users/avoorhis/programming/jupyter/VAMPS_API',args.prefix+'_dendrogram.svg')
        svgfile = os.path.join(args.outdir,args.prefix+'_dendrogram.svg')
        print(os.getcwd())
        #print svgfile
        print('rendering0')
        rooted_tree.render(svgfile, tree_style=ts)  # writes file to tmp



    if args.function == 'pcoa_3d':
        print('starting pcoa_3d')
        from skbio import DistanceMatrix
        dm = DistanceMatrix(dm1)
        print(dm)
        print('end')
コード例 #43
0
def plot_phylo_tree(rdata, colname, name, workdir, outdir):
    """ Generate the phylogenetic tree (dendrogram) for the PSC method. A
    dendrogram is generated using domain pairwise scores and written in the
    newick format to a file in the outdir. The tree is then rendered as a
    circular figure if the number of domains in the dataset is at most 300.

    :param rdata: (dataframe) Pairwise similarity scores data; must have the
        columns 'dom1', 'dom2', 'cath1' and `colname`
    :param colname: (string) Name of column to take similarity scores from
    :param name: (string) Name of PSC method
    :param workdir: (string) Path to output directory where intermediate processing data files can be stored
    :param outdir: (string) Path to output directory where processed data files can be found
    :returns: True when the dendrogram was written; None when the pivot
        table or the distance matrix could not be built
    """
    print('\t', colname)
    dist_file = '%s%sdist.csv' % (workdir, os.path.sep)
    dendro_path = '%s%s%s_dendro.nw' % (outdir, os.path.sep, name)
    tree_path = "figures%s%s_ptree.png" % (os.path.sep, name)

    # create pivot table for similarity scores of psc method
    try:
        p = rdata.pivot(index='dom1', columns='dom2', values=colname)
    except Exception:
        print('pivot not generated for %s' % colname)
        return
    # Set the diagonal to similarity 1. A single .iloc[i, i] assignment writes
    # into the frame itself; the chained form p.iloc[i][i] may only modify a
    # temporary copy. min(p.shape) keeps the index valid for a non-square
    # pivot.
    for i in range(min(p.shape)):
        p.iloc[i, i] = 1
    # (convert to distance matrix)
    p = 1 - p
    # write pivot table to file
    p.to_csv(dist_file)

    # make name to class matrix
    # (.values replaces DataFrame.as_matrix(), which was removed in pandas 1.0)
    dom_classification = dict(rdata[['dom1', 'cath1']].values)
    classes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']  # SCOP
    cl = [
        'blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'black',
        'black', 'black', 'black', 'black'
    ]
    # zip stops at the shorter list, so the 12th colour is never used.
    class_color = dict(list(zip(classes, cl)))

    # make SCOP Class domains
    # NOTE(review): this mapping is not read again in this function.
    scop_class_domains = {}
    for k, v in dom_classification.items():
        scop_class_domains.setdefault(v[:1], []).append(k)

    # create domain dendrogram from pivot data and store to file
    try:
        with open(dist_file) as dist_handle:
            pdm1 = dendropy.PhylogeneticDistanceMatrix.from_csv(
                src=dist_handle, delimiter=",")
    except Exception:
        print('error reading file', dist_file)
        return
    nj_tree = pdm1.nj_tree()
    with open(dendro_path, 'w') as dendro_handle:
        nj_tree.write(file=dendro_handle, schema='newick')

    if len(p) > 300:
        return True

    # make the tree to visualize if the number of domains is at most 300
    t = Tree(str(nj_tree) + ';')

    # Creates an independent node style for each leaf, which is
    # initialized with a foreground color depending on node class.
    for n in t.traverse():
        if not n.is_leaf():
            continue
        # Leaf names may be quoted in the newick string; strip the quotes
        # before looking the domain up.
        dom_class = dom_classification[n.name.replace('\'', '')]
        nstyle = NodeStyle()
        nstyle["fgcolor"] = class_color[dom_class[0]]
        nstyle["size"] = 25
        n.set_style(nstyle)

    circular_style = TreeStyle()
    circular_style.mode = "c"

    t.render(tree_path, tree_style=circular_style)

    # calculate all-to-all distances
    # same class                       different classes
    # total distance, number of nodes, total distance, number of nodes
    # NOTE(review): f_distances is filled in below but is neither returned
    # nor stored; only classes a-e have a row, so any other class raises
    # KeyError here.
    f_distances = [
        [0, 0, 0, 0],  # a
        [0, 0, 0, 0],  # b
        [0, 0, 0, 0],  # c
        [0, 0, 0, 0],  # d
        [0, 0, 0, 0]
    ]  # e
    class_idx = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}
    for n_start in t.traverse():
        if not n_start.is_leaf():
            continue
        n_start_class = dom_classification[n_start.name.replace('\'', '')][0]
        f_distance = f_distances[class_idx[n_start_class]]
        for n_end in t.traverse():
            if not n_end.is_leaf():
                continue
            if n_start == n_end:
                continue
            d = n_start.get_distance(n_end, topology_only=True)
            n_end_class = dom_classification[n_end.name.replace('\'', '')][0]
            if n_start_class == n_end_class:
                f_distance[0] += d
                f_distance[1] += 1
            else:
                f_distance[2] += d
                f_distance[3] += 1
    return True
コード例 #44
0
def main(arg1, arg2):
    """Build a supertree from a list of input trees via triplet decomposition.

    When ``arg2 == 2`` `arg1` is a path and its lines are read as the input
    trees; otherwise `arg1` itself is used as the indexable sequence of trees
    (presumably newick strings - verify against the caller).

    Every input tree is made binary with ``resolve_polytomy`` and passed to
    ``tp.triplet_decompose``; the collected triplets are grouped by the
    sorted, comma-joined names of their three taxa, and ``Max_cut`` is run on
    the full taxon set.

    Returns the tuple ``(supertree, 0, 0, triplets_dict)``.
    """
    if arg2 == 2:
        with open(arg1) as f:
            content = f.readlines()
    else:
        content = arg1
    ####print(content)
    # you may also want to remove whitespace characters like `\n` at the end of each line
    #content = [x.strip() for x in content]
    # NOTE(review): t2 is not used below.
    t2 = Tree(content[0])
    #print(t2)
    triplets = []
    taxa = []
    good = []
    bad = []
    for i in range(0, len(content)):
        ####print(i)
        t1 = Tree(content[i])
        #t1.show()
        ###print(t1.write(format=9))
        t1.resolve_polytomy()
        for leaf in t1:
            if leaf.is_leaf() is True:
                taxa.append(leaf.name)
        # The running triplet list is passed in and replaced by the returned
        # one - presumably triplet_decompose extends it; verify in tp.
        leaves, triplets = tp.triplet_decompose(t1, triplets)
        #print("leaves",leaves)
        #t2=Tree(content[i])

        ####print(triplets)
        #taxa+=leaves
    taxa = set(taxa)
    #print(triplets)
    #print("taxa",taxa)
    d = {ni: indi for indi, ni in enumerate(taxa)}
    rows, cols = (len(taxa), len(taxa))
    triplets_dict = defaultdict(list)

    for t in triplets:

        # Key: the three taxon names of the triplet, sorted and joined by
        # commas (built by stripping brackets, quotes and spaces from the
        # list's string form).
        trip_key = [t[1][0], t[1][1], t[0]]
        trip_key.sort()

        trip_key = str(trip_key).strip('[]')
        trip_key = trip_key.replace("'", '')
        trip_key = trip_key.replace(" ", "")
        # Store each distinct triplet only once per key.
        if (t in triplets_dict[trip_key]) is False:

            triplets_dict[trip_key].append(t)
    #print(triplets_dict)
    ####print(good)
    ####print(bad)

    ####print(clades)
    ####print("__________________________________________________________")
    ####print(outgroup)
    #g = Graph(connections, outgroup, good, bad, directed=True)
    ####print(g._graph_good)
    ####print(g._weights)
    #triplets_dict=defaultdict(list)
    supertree = Max_cut(taxa, triplets_dict)
    #print("supertree",supertree)
    #supertree.show()
    # The string literal below is disabled code kept as a bare expression; it
    # is never executed.
    '''supertree_triplets=[]
    ST_triplets_dict=defaultdict(list)
    st_leaves,supertree_triplets,ST_triplets_dict=tp_d.triplet_decompose(supertree,supertree_triplets,ST_triplets_dict)
    total=1
    
    overlap=0
    inconsist=0
    for keys in triplets_dict:
        overlap += len(intersection(triplets_dict[keys],ST_triplets_dict[keys]))
        total+=len(triplets_dict[keys])
        if len(triplets_dict[keys])>1:
            inconsist+=len(triplets_dict[keys])
    overlap_per=overlap/total
    inconsist_per=inconsist/total
    #t2=Tree("((ah, ((ae, ab), ai)), ((ag, aa), (ac, (ad, (aj, af)))));")
    ####print(t2)
    #triplet_test.main(arg1,supertree,True,25)
    ##print(supertree)'''
    return supertree, 0, 0, triplets_dict
コード例 #45
0
        for encoded in encodeds:
            alignment = v.decodeSequenceAlignment(encoded)
            score=np.floor((100-alignment.percentIdentity())*
                        len(np.array(alignment))/100)
            print(i,'-',j,':',score)           
        Matrix[i,j]=score

Matrix=Matrix.T
    
#np.save('M.npy',Matrix)
#%%
#Matrix=np.load('M.npy',allow_pickle=True)
#%%
import upgma
tree=upgma.UPGMA(Matrix, name)

from ete3 import Tree
unrooted_tree = Tree(tree+';')
print (unrooted_tree)

#%%
To_plot=0
if To_plot==1:
    from ete3 import Tree, TreeStyle
    ts = TreeStyle()
    ts.show_leaf_name = True
    ts.branch_vertical_margin = 10 
    unrooted_tree.show(tree_style=ts)


コード例 #46
0
class Species(object):
    """Represents a collection of genomes in `path`

    :param path: Path to the directory of related genomes you wish to analyze.
    :param max_unknowns: Number of allowable unknown bases, i.e. not [ATCG]
    :param contigs: Acceptable deviations from median number of contigs
    :param assembly_size: Acceptable deviations from median assembly size
    :param mash: Acceptable deviations from median MASH distances
    :param assembly_summary: a pandas DataFrame with assembly summary information
    """

    path = attr.ib(default=Path(), converter=Path)
    max_unknowns = attr.ib(default=200)
    # TODO These are really about attrib names
    contigs = attr.ib(default=3.0)
    assembly_size = attr.ib(default=3.0)
    mash = attr.ib(default=3.0)
    assembly_summary = attr.ib(default=None)
    metadata = attr.ib(default=None)

    def __attrs_post_init__(self):
        """Set up the logger, QC paths and filter bookkeeping for this species.

        Presumably invoked by attrs after the generated ``__init__`` (the
        class decorator is not visible here - confirm). Existing
        ``stats.csv``, ``tree.nw`` and ``failed.csv`` files in the QC
        directory are loaded; ``failed_report`` is only set when
        ``failed.csv`` exists.
        """
        self.log = logbook.Logger(self.path.name)

        # The label doubles as the name of the results directory.
        thresholds = (self.max_unknowns, self.contigs, self.assembly_size,
                      self.mash)
        self.label = "-".join(str(value) for value in thresholds)
        self.paths = config.Paths(
            root=self.path,
            subdirs=[
                "qc",
                ("results", f"qc/{self.label}"),
                ("passed", f"qc/{self.label}/passed"),
                ".logs",
            ],
        )

        qc_dir = self.paths.qc
        self.stats_path = os.path.join(qc_dir, "stats.csv")
        self.nw_path = os.path.join(qc_dir, "tree.nw")
        self.dmx_path = os.path.join(qc_dir, "dmx.csv")
        self.failed_path = os.path.join(qc_dir, "failed.csv")
        self.summary_path = os.path.join(qc_dir, "qc_summary.txt")
        self.paste_file = os.path.join(qc_dir, "all.msh")

        # Figure out if defining these as None is necessary
        self.tree = None
        self.stats = None
        # Reuse results of an earlier run when they are on disk.
        if os.path.isfile(self.stats_path):
            self.stats = pd.read_csv(self.stats_path, index_col=0)
        if os.path.isfile(self.nw_path):
            self.tree = Tree(self.nw_path, 1)
        if os.path.isfile(self.failed_path):
            self.failed_report = pd.read_csv(self.failed_path, index_col=0)

        self.tolerance = {
            "unknowns": self.max_unknowns,
            "contigs": self.contigs,
            "assembly_size": self.assembly_size,
            "distance": self.mash,
        }
        self.passed = self.stats
        self.failed = {}
        self.med_abs_devs = {}
        self.dev_refs = {}
        self.allowed = {"unknowns": self.max_unknowns}

    def __str__(self):
        """Return a multi-line summary of the species name and its thresholds.

        The individual lines are also stored on ``self.message``.
        """
        self.message = [
            f"Species: {self.path.name}",
            f"Maximum Unknown Bases:  {self.max_unknowns}",
            "Acceptable Deviations:",
            f"Contigs, {self.contigs}",
            f"Assembly Size, {self.assembly_size}",
            f"MASH: {self.mash}",
        ]
        summary = "\n".join(self.message)
        return summary

    @property
    def genome_paths(self, ext="fasta"):
        """Return the paths of all entries in the species directory ending with `ext`

        Accessed as a property, so `ext` always has its default value.

        :param ext: File name suffix of genomes in species directory
        :returns: ``self.path`` joined with every matching entry name
        :rtype: list
        """
        matches = []
        for entry in os.listdir(self.path):
            if entry.endswith(ext):
                matches.append(os.path.join(self.path, entry))
        return matches

    @property
    def sketches(self):
        """Glob the QC directory for entries matching ``GCA*msh``."""
        qc_dir = Path(self.paths.qc)
        return qc_dir.glob("GCA*msh")

    @property
    def total_sketches(self):
        """Number of entries yielded by ``self.sketches``."""
        return sum(1 for _ in self.sketches)

    @property
    def genome_names(self):
        """``pandas.Index`` of the ``name`` of every genome in ``self.genomes``."""
        return pd.Index([genome.name for genome in self.genomes])

    @property
    def biosample_ids(self):
        """List the ``biosample`` values of the assembly summary rows selected by ``self.accession_ids``."""
        summary = self.assembly_summary.df
        return summary.loc[self.accession_ids].biosample.tolist()

    # may be redundant. see genome_names attrib
    @property
    def accession_ids(self):
        """List the ``accession_id`` of every genome that has one (is not None)."""
        ids = []
        for genome in self.genomes:
            if genome.accession_id is not None:
                ids.append(genome.accession_id)
        return ids

    def get_tree(self):
        """Cluster the distance matrix into a tree and write it to ``self.nw_path``.

        ``self.dmx`` is clustered with scipy's ``weighted`` linkage, converted
        to newick through scikit-bio, loaded as an ete3 ``Tree`` on
        ``self.tree`` and midpoint rooted. If rooting raises ``TreeError`` the
        error is logged and the unrooted tree is written instead.
        """
        from ete3.coretype.tree import TreeError
        import numpy as np
        from skbio.tree import TreeNode
        from scipy.cluster.hierarchy import weighted

        ids = self.dmx.index.tolist()
        # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
        # same array on old and new pandas versions.
        triu = np.triu(self.dmx.values)
        # NOTE(review): scipy's linkage functions document their input as a
        # condensed distance vector (or a matrix of observations); a square
        # upper-triangular matrix is passed here - confirm this is intended.
        hclust = weighted(triu)
        t = TreeNode.from_linkage_matrix(hclust, ids)
        # Strip the quotes from the newick string.
        nw = str(t).replace("'", "")
        self.tree = Tree(nw)
        try:
            # midpoint root tree
            self.tree.set_outgroup(self.tree.get_midpoint_outgroup())
        except TreeError:
            self.log.error("Unable to midpoint root tree")
        self.tree.write(outfile=self.nw_path)

    @property
    def stats_files(self):
        """Glob the QC directory for entries matching ``GCA*csv``."""
        qc_dir = Path(self.paths.qc)
        return qc_dir.glob("GCA*csv")

    def MAD(self, df, col):
        """Return the mean of the absolute deviations of ``df[col]`` from its median."""
        values = df[col]
        deviations = abs(values - values.median())
        return deviations.mean()

    @staticmethod
    def MAD_ref(MAD, tolerance):
        """Get the reference value for the deviation: ``MAD * tolerance``.

        Declared static because the function takes no ``self``; without the
        decorator a call on an instance passes the instance as `MAD`.
        """
        dev_ref = MAD * tolerance
        return dev_ref

    @staticmethod
    def bound(df, col, dev_ref):
        """Return ``(lower, upper)``: the median of ``df[col]`` minus and plus `dev_ref`.

        Declared static because the function takes no ``self``; without the
        decorator a call on an instance passes the instance as `df`.
        """
        median = df[col].median()
        lower = median - dev_ref
        upper = median + dev_ref
        return lower, upper

    def filter_unknown_bases(self):
        """Filter out genomes with too many unknown bases.

        Genomes whose ``unknowns`` value exceeds the tolerance are recorded in
        ``self.failed["unknowns"]``; ``self.passed`` becomes ``self.stats``
        without them.
        """
        too_many = self.stats["unknowns"] > self.tolerance["unknowns"]
        self.failed["unknowns"] = self.stats.index[too_many]
        self.passed = self.stats.drop(self.failed["unknowns"])

    # TODO Don't use decorator; perform this logic in self.filter
    def check_passed_count(f):
        """
        Decorator: run the filter `f` only if self.passed has more than five
        genomes.

        Otherwise the allowed and failed entries for the criteria (the first
        positional argument) are set to "" and the skip is logged. The
        wrapper always returns None.
        """
        @functools.wraps(f)
        def wrapper(self, *args):
            if len(self.passed) <= 5:
                criteria = args[0]
                self.allowed[criteria] = ""
                self.failed[criteria] = ""
                self.log.info("Not filtering based on {}".format(f.__name__))
                return
            f(self, *args)

        return wrapper

    # todo remove unnecessary criteria parameter
    @check_passed_count
    def filter_contigs(self, criteria):
        """
        Filter genomes on their number of contigs.

        Only genomes with > 10 contigs take part, to avoid throwing off the
        deviation statistic, which is the mean absolute difference between
        the number of contigs and the median. Genomes with <= 10 contigs are
        set aside and always pass. `criteria` is not used in the body.
        """
        counts = self.passed.contigs
        many = counts[counts > 10]
        few = counts[counts <= 10]

        # TODO Define separate function for this
        median = many.median()
        deviation = abs(many - median)
        med_abs_dev = deviation.mean()
        self.med_abs_devs["contigs"] = med_abs_dev

        # The "deviation reference": largest accepted distance from the median
        dev_ref = med_abs_dev * self.contigs
        self.dev_refs["contigs"] = dev_ref
        self.allowed["contigs"] = median + dev_ref

        self.failed["contigs"] = many[deviation > dev_ref].index
        within = many[deviation <= dev_ref]
        # Add the genomes with few contigs back in after the survivors.
        keep = pd.concat([within, few]).index
        self.passed = self.passed.loc[keep]

    @check_passed_count
    def filter_MAD_range(self, criteria):
        """
        Filter on the distance from the median of column `criteria`.

        Passing values lie within ``tolerance * mean absolute deviation`` of
        the median, on either side. The accepted range is stored in
        ``self.allowed[criteria]`` as "lower-upper" with both ends truncated
        to integers.
        """
        values = self.passed[criteria]
        median = values.median()
        deviation = abs(values - median)

        # Mean absolute deviation from the median, scaled by the tolerance
        dev_ref = deviation.mean() * self.tolerance[criteria]
        lower = median - dev_ref
        upper = median + dev_ref
        self.allowed[criteria] = "-".join(
            str(int(limit)) for limit in (lower, upper))

        self.failed[criteria] = self.passed[deviation > dev_ref].index
        self.passed = self.passed[deviation <= dev_ref]

    @check_passed_count
    def filter_MAD_upper(self, criteria):
        """
        Filter on an upper bound for column `criteria`.

        Passing values are at most ``median + tolerance * mean absolute
        deviation``. The bound is stored in ``self.allowed[criteria]``
        formatted with four decimals.
        """
        values = self.passed[criteria]
        median = values.median()

        # Mean absolute deviation from the median, scaled by the tolerance
        dev_ref = abs(values - median).mean() * self.tolerance[criteria]
        upper = median + dev_ref

        self.failed[criteria] = self.passed[values > upper].index
        self.passed = self.passed[values <= upper]
        self.allowed[criteria] = "{:.4f}".format(upper)

    def base_node_style(self):
        """Apply the default style to every node and label the nodes named like fasta files.

        All nodes share one small black sphere style; nodes whose name
        matches ``.*fasta`` also get a name face in column 0.
        """
        from ete3 import NodeStyle, AttrFace

        default_style = NodeStyle()
        default_style["shape"] = "sphere"
        default_style["size"] = 2
        default_style["fgcolor"] = "black"

        for node in self.tree.traverse():
            node.set_style(default_style)
            if not re.match(".*fasta", node.name):
                continue
            name_face = AttrFace("name", fsize=8)
            name_face.margin_right = 150
            name_face.margin_left = 3
            node.add_face(name_face, column=0)

    # Might be better in a layout function
    def style_and_render_tree(self, file_types=["svg"]):
        """Build the tree style (title and legend) and render the tree.

        The title is the species name from ``snakemake.config``. The legend
        has one column of row headings followed by one column per entry of
        ``CRITERIA``. One ``tree.<type>`` file is rendered into the QC
        directory for every entry of `file_types`.

        :param file_types: File extensions to render the tree to
        """
        from ete3 import TreeStyle, TextFace, CircleFace

        tree_style = TreeStyle()
        species_name = snakemake.config["species"].replace("_", " ")
        title_face = TextFace(species_name, fsize=20)
        title_face.margin_bottom = 10
        tree_style.title.add_face(title_face, column=0)
        tree_style.branch_vertical_margin = 10
        tree_style.show_leaf_name = True

        # Legend: column 1 starts with an empty cell, then the row headings
        tree_style.legend.add_face(TextFace(""), column=1)
        for heading in ["Allowed", "Deviations", "Filtered", "Color"]:
            heading_face = TextFace(heading, fsize=8, bold=True)
            heading_face.margin_bottom = 2
            heading_face.margin_right = 40
            tree_style.legend.add_face(heading_face, column=1)

        # One legend column per criteria, starting at column 2
        for column, criteria in enumerate(CRITERIA, 2):
            title = TextFace(criteria.replace("_", " ").title(),
                             fsize=8,
                             bold=True)
            title.margin_bottom = 2
            title.margin_right = 40

            color_face = CircleFace(4, COLORS[criteria], style="sphere")
            color_face.margin_bottom = 5

            # Number of failed genomes recorded for this criteria
            filtered_count = sum(
                1 for hit in self.failed_report.criteria == criteria if hit)
            filtered = TextFace(filtered_count, fsize=8)
            filtered.margin_bottom = 5

            allowed = TextFace(self.allowed[criteria], fsize=8)
            allowed.margin_bottom = 5
            allowed.margin_right = 25

            # TODO Prevent tolerance from rendering as a float
            tolerance = TextFace(self.tolerance[criteria], fsize=8)
            tolerance.margin_bottom = 5

            # Faces are added in the order of the row headings in column 1.
            for face in (title, allowed, tolerance, filtered, color_face):
                tree_style.legend.add_face(face, column=column)

        for file_type in file_types:
            out_tree = os.path.join(self.paths.qc,
                                    "tree.{}".format(file_type))
            self.tree.render(out_tree, tree_style=tree_style)

    def color_tree(self):
        """
        Highlight failed genomes on the tree and render it.

        After the base style is applied, every genome listed in
        ``self.failed_report`` gets a larger node coloured by the criterion
        it failed (looked up in ``COLORS``).
        """
        from ete3 import NodeStyle

        self.base_node_style()
        report = self.failed_report
        for genome in report.index:
            leaf = self.tree.get_leaves_by_name(genome).pop()
            highlight = NodeStyle()
            highlight["fgcolor"] = COLORS[report.loc[genome, "criteria"]]
            highlight["size"] = 9
            leaf.set_style(highlight)
        self.style_and_render_tree()

    def filter(self):
        """Run every filter in order, then write the summary and the failed report."""
        self.filter_unknown_bases()
        # The order matters: each filter operates on what the previous one
        # left behind.
        for method_name, criteria in (
            ("filter_contigs", "contigs"),
            ("filter_MAD_range", "assembly_size"),
            ("filter_MAD_upper", "distance"),
        ):
            getattr(self, method_name)(criteria)
        self.summary()
        self.write_failed_report()

    def write_failed_report(self):
        """
        Build ``self.failed_report`` and write it to ``self.failed_path``.

        The report is indexed by every genome found in ``self.failed`` and
        has one ``criteria`` column naming the criterion the genome failed.
        Any report file already present is removed first.
        """
        report_path = self.failed_path
        if os.path.isfile(report_path):
            os.remove(report_path)

        failed_ids = chain.from_iterable(list(self.failed.values()))
        report = pd.DataFrame(index=failed_ids, columns=["criteria"])
        self.failed_report = report
        for criteria, genomes in self.failed.items():
            # Only entries that are exactly a pandas Index are labelled
            if type(genomes) == pd.Index:
                report.loc[genomes, "criteria"] = criteria
        report.to_csv(report_path)

    def summary(self):
        """
        Write a plain-text QC summary to ``self.summary_path`` and return it.

        The text starts with ``self.path.name`` and then lists, for each of
        the four criteria, the allowed value, the tolerance and the number
        of filtered genomes.
        """
        sections = (
            ("Unknown Bases", "unknowns"),
            ("Contigs", "contigs"),
            ("Assembly Size", "assembly_size"),
            ("MASH", "distance"),
        )
        lines = [self.path.name]
        for heading, key in sections:
            lines.append(heading)
            lines.append(f"Allowed: {self.allowed[key]}")
            lines.append(f"Tolerance: {self.tolerance[key]}")
            lines.append(f"Filtered: {len(self.failed[key])}")
            # A bare newline entry leaves blank lines between sections
            lines.append("\n")
        text = "\n".join(lines)
        with open(os.path.join(self.summary_path), "w") as handle:
            handle.write(text)
        return text

    def link_genomes(self):
        """
        Symlink every passed genome into ``self.paths.qc``.

        The source is the first match of ``*/*/*/<genome>`` below
        ``self.path``; the link name comes from ``rename_genome``.  Links
        that already exist are left untouched.
        """
        for genome in self.passed.index:
            matches = self.path.glob(f"*/*/*/{genome}")
            source = next(matches).absolute()
            # NOTE(review): bare ``summary`` is looked up outside this
            # method (it is not ``self.summary``) -- confirm which object is
            # meant.
            link_name = rename_genome(genome, summary)
            target = (self.paths.qc / link_name).absolute()
            try:
                target.symlink_to(source)
            except FileExistsError:
                pass

    def qc(self):
        """Run the whole QC pipeline: filter, link, build the tree, colour it."""
        for step in ("filter", "link_genomes", "get_tree", "color_tree"):
            getattr(self, step)()
        self.log.info("QC finished")

    def select_metadata(self, metadata):
        """
        Keep the rows of ``metadata.joined`` for ``self.biosample_ids``.

        The selection is stored in ``self.metadata`` and written to
        ``self.metadata_path``.  A ``KeyError`` is logged, not raised.
        """
        try:
            selected = metadata.joined.loc[self.biosample_ids]
            self.metadata = selected
            selected.to_csv(self.metadata_path)
        except KeyError:
            self.log.exception("Metadata failed")
コード例 #47
0
def precision_matrix(tree, d, branch_length):
    """
    Build the precision matrix of the Gaussian vector induced by a tree,
    invert it and marginalize out the internal nodes.

    :param tree: a tree object, or a string. A string ending in ``.txt`` is
     opened and its first line parsed as newick (format 1); any other string
     is passed to ``Tree(tree, 1)`` directly
    :param d: dimension of latent space
    :param branch_length: constant branch length along the tree, or dict of
     branch lengths keyed by ``str(index)`` of the child node, where the
     index is the node's position in a level-order traversal
    :return: tuple ``(leaves_covariance, full_covariance)``: the covariance
     restricted to the leaves (via ``marginalize_internal``) and the full
     inverse of the constructed precision matrix
    """

    # load tree
    if isinstance(tree, str):
        if tree.split('.')[-1] == "txt":
            with open(tree, "r") as myfile:
                first_line = myfile.readlines()[0]
            tree = Tree(first_line, 1)
        else:
            tree = Tree(tree, 1)

    # introduce an index for all the nodes (level-order position)
    nodes = list(tree.traverse("levelorder"))
    for idx, node in enumerate(nodes):
        node.add_features(index=idx)
    N = len(nodes)

    # ancestor indexing: child index -> parent index
    parents = {n.index: n.up.index for n in nodes if not n.is_root()}

    # the branch length is either constant along the tree, or a dictionary
    per_node_lengths = isinstance(branch_length, dict)
    constant_t = None if per_node_lengths else 1 / branch_length

    # Initialize precision matrix
    inverse_covariance = np.zeros((N * d, N * d))
    eye = np.identity(d)

    # Every edge adds t*I to both diagonal blocks and -t*I to both
    # off-diagonal blocks, with t the inverse branch length of the edge.
    for i, pi_ind in parents.items():
        t = 1 / branch_length[str(i)] if per_node_lengths else constant_t
        child = slice(i * d, (i + 1) * d)
        parent = slice(pi_ind * d, (pi_ind + 1) * d)
        inverse_covariance[child, child] += eye * t
        inverse_covariance[parent, parent] += eye * t
        inverse_covariance[parent, child] += -eye * t
        inverse_covariance[child, parent] += -eye * t

    # extra identity on the root block (index 0 in level order)
    inverse_covariance[0:d, 0:d] += np.identity(d)

    # invert precision matrix
    full_covariance = np.linalg.inv(inverse_covariance)

    leaves_covariance = marginalize_internal(full_covariance, tree, d)

    return leaves_covariance, full_covariance
コード例 #48
0
def _tree_analysis_to_handles(tree, tabla, output_file, output_file_2, analysis_type):
	"""Core of Tree_analysis: evaluate the tree and write to two already-open handles."""

	### All subsequent variables could be modified
	binomial_value = float(0.05) # Default value for option 2 of the core evaluation method for the tree
	p_value = float(0.05) # p-value threshold for the binomial method (method 2)
	percentage = float(0.9) # Minimum fraction of subjects required to define a core
	taxo_p = float(0.9) # Minimum fraction of the same taxonomic group within all OTUs contained in the same node

	tree = Tree(tree, quoted_node_names=True, format=1) # Here we load the 97_otus tree

	# Single pass over the OTU table: '#' lines are echoed to the second
	# output, every other line fills both dictionaries.
	table = {} # first column -> float values of all columns but the first and the last
	table2 = {} # first column -> one-element list with the last column (taxonomy of the picked OTUs)
	with open(tabla) as table_handle:
		for line in table_handle:
			if line.startswith('#'):
				output_file_2.write(str(line))
				continue
			fields = list(map(str.strip, line.split('\t')))
			table[fields[0]] = list(map(float, fields[1:-1]))
			table2[fields[0]] = fields[-1:]

	# One accumulator per value column (width taken from the last data line)
	table_final_res = [0.0] * len(fields[1:-1])
	sum_abun_rela = 0
	cores = 0

	# Value vector for each tip of the tree; None when the tip is not in the table
	for leaf in tree:
		leaf.vector = table.get(leaf.name)

	node2content = tree.get_cached_content()

	flag=0
	for node in tree.traverse(): # Internal nodes get the sum of the vectors of their leaves
		if not node.is_leaf():

			leaf_vectors = np.array([leaf.vector for leaf in node2content[node] if leaf.vector is not None])
			node.vector = leaf_vectors.sum(axis=0)

			# The first internal node visited (the root) provides the
			# totals used as denominator for relative abundances.
			if(flag == 0):
				save_node1=node.vector
				total_saved_leaves = np.array([leaf.name for leaf in node2content[node]])

				flag=1

	if(analysis_type==4): # This method only prints the tree and the vector of each node
		print(tree.get_ascii(show_internal=True))
		output_file.write(tree.get_ascii(show_internal=True) + '\n' + '\n')
		for node in tree.traverse("preorder"):
			print (node.name, node.vector)
			output_file.write(node.name + '\t' + str(node.vector) + '\n')

	if(analysis_type!=4):
		output_file.write("Core" + '\t' + "Prevalence" + '\t' + "Abundance" + '\t' + "Relative abundances" + '\t' + "Min" + '\t' + "Max" + '\t' + "Average" + '\t' + "SD" + '\t' + "Leaves" + '\t' + "Taxonomy" + '\t' + "Leaves number" + '\n')


	if(analysis_type==1 or analysis_type==2 or analysis_type==3): # Evaluate the tree with one of the chosen methods: 100% core, binomial or percentage
		for node in tree.traverse("postorder"):

			tot_cont=np.count_nonzero(node.vector) # Number of subjects with one or more occurrences in the vector of this node
			tot_cont2=np.asarray(node.vector).size # Total size of the vector
			# NOTE(review): stats.binom_test is deprecated in recent SciPy
			# releases in favour of stats.binomtest -- confirm the SciPy
			# version this runs against.
			a=stats.binom_test(tot_cont, n=tot_cont2, p=binomial_value, alternative='greater') # Binomial test that uses binomial_value
			rela=(tot_cont/tot_cont2)

			# The criterion that decides whether the node is a core depends on the method
			if((analysis_type==1 and np.all(node.vector)) or (analysis_type==2 and a <= p_value) or (analysis_type==3 and rela >= percentage)):

				node.vector=([float(i) for i in node.vector]) # Convert all the values in node.vector to float
				abundance=node.vector/save_node1 # Relative abundance of each subject in the node over the totals saved from the first internal node
				abundance =([float(i) for i in abundance])
				mean_abun=np.mean([float(i) for i in abundance]) # Mean abundance of the node
				std_abun=np.std([float(i) for i in abundance]) # Standard deviation of the node
				abundance_rela=sum(node.vector)/sum(save_node1) # Global relative abundance of the node
				table_final_res=list(map(sum, zip(table_final_res, abundance))) # Accumulate the results of each node into the final result table
				sum_abun_rela=sum_abun_rela+abundance_rela # The sum of all global relative abundances
				cores=cores+1 # Total number of cores

				output_file_2.write(str(node.name) + '\t')
				for value in abundance:
					output_file_2.write(str(value) + '\t')
				output_file_2.write('\n')

				output_file.write(node.name + '\t' +  str(rela) + '\t' + str(node.vector) + '\t' + str(abundance) + '\t' + str(min(abundance)) + '\t' + str(max(abundance)) + '\t' + str(mean_abun) + '\t' + str(std_abun) + '\t')

				conteo_hojas=nodes_eval(node,tree,output_file,table2,taxo_p,total_saved_leaves) # Assign a taxonomy to the node based on the taxonomy of its OTUs and the minimum taxonomy fraction set above

				output_file.write(str(conteo_hojas) + '\n') # Print the total number of leaves of this node

				tree=erase_node(node,tree) # Once a node has been evaluated it is erased from the tree to simplify the calculations of the next nodes

				G = tree.search_nodes(name=node.name)[0]
				G.detach()

		output_file.write(str(cores) + '\t' + '\t' + '\t' + str(table_final_res) + '\t' + str(min(table_final_res)) + '\t' + str(max(table_final_res)) + '\t' + str(np.mean([float(i) for i in table_final_res])) + '\t' + str(np.std([float(i) for i in table_final_res])) + '\n')


def Tree_analysis(tree,tabla,out,analysis_type,out2):
	"""
	Evaluate a tree against an OTU table and write the results to two files.

	tree: newick tree handed to ``Tree(..., quoted_node_names=True, format=1)``
	tabla: path of the tab-separated OTU table ('#' lines are headers)
	out: path of the main report
	analysis_type: 1 (node vector has no zero entry), 2 (binomial test),
		3 (minimum fraction of subjects) or 4 (only dump the tree and the
		node vectors)
	out2: path of the second output (table headers plus one line of
		relative abundances per core)

	Both output files are closed when the function returns or raises, so
	buffered results are always flushed to disk.
	"""
	with open(out, 'w') as output_file, open(out2, 'w') as output_file_2:
		_tree_analysis_to_handles(tree, tabla, output_file, output_file_2, analysis_type)
コード例 #49
0
ファイル: treeTest.py プロジェクト: IsaacGluck/lab_of_oz
def readTreeFileFirstLine(filename):
	"""Parse the first line of *filename* as a newick tree (format 1) and return it."""
	with open(filename, "r") as handle:
		# Strip only the line terminator: slicing off the last character
		# unconditionally would drop the closing ';' when the line has no
		# trailing newline.
		first_tree = handle.readline().rstrip("\n")
	return Tree(first_tree, format=1)
コード例 #50
0
            result_trs = perform_SPR(in_tr=tmp_cpy, selection=tmp_lst[i])
            if result_trs == None:
                print("This rearrangement resulted in the same tree")
            else:
                tree_list.extend(result_trs)
    # for Tr in tree_list:
    # print Tr
    # print len(tree_list)
    return tree_list


if __name__ == "__main__":
    #### NOTE: each of the internal nodes must have a name for this rearrangement
    ### Toy example
    ### SPR must return 64 different topologies for this toy example
    # The toy tree is built by hand, one named node at a time:
    #   root -> Z;  Z -> (Y, F);  Y -> (X, V);
    #   V -> (A, B);  X -> (E, W);  W -> (D, C)
    t = Tree(name="root")
    Z = t.add_child(name="Z")
    Y = Z.add_child(name="Y")
    F = Z.add_child(name="F")

    X = Y.add_child(name="X")
    V = Y.add_child(name="V")

    A = V.add_child(name="A")
    B = V.add_child(name="B")

    E = X.add_child(name="E")

    W = X.add_child(name="W")
    D = W.add_child(name="D")
    C = W.add_child(name="C")
コード例 #51
0
#!/usr/bin/env python
"""
cleanup_parsnp_newick.py
Takes a .nwk file produced by parsnp and cleans up the leaf labels
If [regex] is given, will also delete all [regex] matches from leaf labels

USAGE: python cleanup_parsnp_newick.py parsnp.nwk output.nwk [regex]
"""

import sys
import re
from ete3 import Tree

# Both the input and the output path are required
if len(sys.argv) < 3:
    # The parenthesised form is valid on Python 2 and Python 3 alike
    print(__doc__)
    sys.exit(1)

# Optional third argument: an extra pattern to delete from the leaf labels.
# It is compiled once instead of once per leaf.
extra_pattern = re.compile(sys.argv[3]) if len(sys.argv) >= 4 else None

t = Tree(sys.argv[1])
for leaf in t.get_leaves():
    # Drop surrounding single quotes, then every trailing ".ext" suffix
    leaf.name = re.sub(r'(\.\w+)+$', '', leaf.name.strip("'"))
    if extra_pattern is not None:
        leaf.name = extra_pattern.sub('', leaf.name)

t.write(format=0, outfile=sys.argv[2])
コード例 #52
0
import sys
from ete3 import Tree
import re

# Positional command-line arguments; the first two are the tree inputs
# loaded below, the third is not used in this part of the script.
venom = sys.argv[1]
control = sys.argv[2]
out = sys.argv[3]
venomTree = Tree(venom, quoted_node_names=True)
controlTree = Tree(control, quoted_node_names=True)

# NOTE: this binds a second name to the same tree object; it is not a copy,
# so changes made through outputTree are visible through venomTree.
outputTree = venomTree

# Nodes of each tree in level order, so that position i in the lists can be
# compared across trees.
venomList = []
for node in venomTree.traverse(strategy="levelorder"):
    venomList.append(node)

controlList = []
for node in controlTree.traverse(strategy="levelorder"):
    controlList.append(node)

outputList = []
for node in outputTree.traverse(strategy="levelorder"):
    outputList.append(node)

# Report the node count of each list
print(len(venomList))
print(len(controlList))
print(len(outputList))

for i in range(len(venomList)):
    #print(venomList[i].name)
    venomSupp = float(venomList[i].support)
コード例 #53
0
# Model parameter values collected into one dict; 'time_intervals' starts
# at 0 here.
params = {'sigma': sigma, 'beta': beta, 'd': d, 'gamma': gamma, 's': s, 'rho': rho, 'dt': dt, 'time_intervals': 0}

"Provide dict with estimated params"
# One boolean flag per parameter name
est_params = {'sigma': False, 'random_branch_effects': True, 'site_effects': False, 'beta': False, 'd': False, 'gamma': False, 's': False, 'rho': False}

"Import tree and sequences"    
path = './covid-analysis/'

"For spike features"
features_file = path + 'feature-files/hcov_oct2020_bestTree_byRegion_allFeatures.csv'
pastml_path = path + 'pastml/collected_preSep1_dated_pastml/'
tree_file = pastml_path + 'named.tree_phylogeny_mle_cleaned_collected_preSep1_dated_cleaned.nwk'
absolute_time = 2020.67 # absolute time of last sample

"Set up tree for run"
tree = Tree(tree_file, format=1)
tree, tree_times = TreeUtils.add_tree_times(tree)

"Set up time intervals"
# The largest tree time is subtracted from the absolute time of the last
# sample to obtain the root time.
final_time = max(tree_times)
root_time = absolute_time - final_time
date_time_intervals = ['2020-01-01',
                        '2020-02-15',
                        '2020-03-15',
                  '2020-04-15',
                  '2020-05-15',
                  '2020-06-15',
                  '2020-07-15',
                  '2020-08-15']
# Convert the dates with date2FloatYear, then shift them by the root time
time_intervals = date2FloatYear(date_time_intervals)
time_intervals = np.array(time_intervals) - root_time
コード例 #54
0
# Required command-line options: the tree file and the outgroup name
parse.add_argument(
    "--tree",
    type=str,
    help="name of tree file with branch lengths and internal node names",
    required=True)
parse.add_argument("--outgroup",
                   type=str,
                   help="name of outgroup species name in tree",
                   required=True)

args = parse.parse_args()

#Load tree
t = Tree(
    args.tree,
    format=1)  #Format 1 loads tree with branch lengths and internal node names
# NOTE: this message is printed after the tree has already been loaded
print('Loading tree...' + '\n' +
      'Make sure your tree has branch lengths and internal node names')

#Traverse the tree to get a list of internal and tip node names
nodes1 = []
for node in t.traverse("levelorder"):
    nodes1.append(node.name)

#outgroup = str(args.outgroup)

#For each node in the tree get the distance from that node to the outgroup tip node, create a dictionary of this information for each node in the tree
node_age_dict = {}
for node in t.traverse("levelorder"):
    node_name = node.name
コード例 #55
0
if __name__ == '__main__':
    # Command-line entry point: load the host and guest trees, read the
    # guest->host node mapping and hand everything to Main.
    parser = argparse.ArgumentParser()
    parser.add_argument('host',
                        type=str,
                        help='The input host tree in newick format')
    parser.add_argument('guest',
                        type=str,
                        help='The input guest tree in newick format')
    parser.add_argument('mapping',
                        type=str,
                        help='Path to txt file containing guest->host mapping')

    args = parser.parse_args()

    host = Tree(args.host, format=1)
    guest = Tree(args.guest, format=1)

    # One "guest_name<TAB>host_name" pair per line; `tree & name` looks the
    # node up by name in the corresponding tree.
    nodemap = {}
    with open(args.mapping) as mapfile:
        for line in mapfile:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at the end
                # of the file) instead of failing on the unpacking below.
                continue
            gname, hname = line.split('\t')
            nodemap[guest & gname] = host & hname

    Main(host, guest, nodemap)
"""
host = Tree('genes.stree')
guest = Tree('0.nt.raxml.treefix.tree')

#Add names
i = 0
コード例 #56
0
import os
import sys

from ete3 import Tree

# Command-line options: tree file, plus two boolean switches
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tree')
parser.add_argument('-bl', '--branch_lengths', action='store_true')
parser.add_argument('-v', '--verbose', action='store_true')
opts = parser.parse_args(sys.argv[1:])

# Stop with exit status 1 when the tree file does not exist.
# NOTE(review): --tree is not marked as required, so opts.tree is None when
# the option is omitted -- confirm os.path.isfile copes with that here.
if not os.path.isfile(opts.tree):
    sys.stderr.write("File {0} not found".format(opts.tree))
    sys.exit(1)

t = Tree(opts.tree)
if opts.verbose:
    # Topology-only dump (format 9) of the tree as it was loaded
    orig_root = t.get_tree_root()
    sys.stderr.write("ORIGINAL TREE:\n" + orig_root.write(format=9) + "\n\n\n")

# intial unroot
t.unroot()
# reroot by midpoint to force unrooting later
midpoint = t.get_midpoint_outgroup()
t.set_outgroup(midpoint)

if opts.verbose:
    sys.stderr.write("MIDPOINT ROOTING:\n" + t.write(format=9) + "\n\n\n")

# final forced unrooting of tree to be absolutely sure
t.unroot()
コード例 #57
0
from ete3 import Tree
# Load an unrooted tree. Note that three branches hang from the root
# node. This usually means that no information is available about
# which of nodes is more basal.
t = Tree('(A,(H,F),(B,(E,D)));')
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print("Unrooted tree")
print(t)
#          /-A
#         |
#         |          /-H
#---------|---------|
#         |          \-F
#         |
#         |          /-B
#          \--------|
#                   |          /-E
#                    \--------|
#                              \-D
#
# Let's define that the ancestor of E and D as the tree outgroup.  Of
# course, the definition of an outgroup will depend on user criteria.
ancestor = t.get_common_ancestor("E","D")
t.set_outgroup(ancestor)
print("Tree rooted at E and D's ancestor is more basal that the others.")
print(t)
#
#                    /-B
#          /--------|
#         |         |          /-A
#         |          \--------|
#         |                   |          /-H
コード例 #58
0
ファイル: api.py プロジェクト: D-PLACE/pydplace
    def check(self):
        """
        Run consistency checks over datasets, sources and phylogenies.

        Problems are collected as messages of type ``error`` or ``warning``;
        all warnings are printed first, then all errors.

        :return: ``True`` when no error was recorded, ``False`` otherwise
            (warnings do not affect the result).
        """
        # Rows of csv/glottolog.csv keyed by their id
        glottolog = {
            lng.id: lng
            for lng in self.read_csv('csv', 'glottolog.csv', namedtuples=True)
        }
        msgs = {'error': [], 'warning': []}

        def _msg(type_, msg, obj=None):  # pragma: no cover
            # Prefix the message with its type and, when given, the class
            # name and id of the object it refers to.
            obj = '{0.__class__.__name__} {0.id}: '.format(obj) if obj else ''
            msgs[type_].append('%s:%s%s' % (type_.upper(), obj, msg))

        def error(msg, obj=None):  # pragma: no cover
            _msg('error', msg, obj=obj)

        def warning(msg, obj=None):  # pragma: no cover
            _msg('warning', msg, obj=obj)

        sources = set(e.key for e in self.sources.iterentries())
        # socids: society ids seen so far; xdids: xd_id -> glottocodes;
        # gcs: glottocode -> xd_ids; varids: variable id -> list of codes.
        # All four accumulate across datasets.
        socids, xdids, gcs, varids = \
            set(), collections.defaultdict(set), collections.defaultdict(set), {}
        for ds in self.datasets:
            for soc in ds.societies:
                if soc.id in socids:  # pragma: no cover
                    error('duplicate society ID: {0}'.format(soc.id), ds)
                xdids[soc.xd_id].add(soc.glottocode)
                gcs[soc.glottocode].add(soc.xd_id)
                socids.add(soc.id)
                if soc.glottocode not in glottolog:  # pragma: no cover
                    warning(
                        '{0} without valid glottocode {0.glottocode}'.format(
                            soc), ds)
                elif glottolog[
                        soc.
                        glottocode].family_name == 'Bookkeeping':  # pragma: no cover
                    warning(
                        '{0} mapped to Bookkeeping language: {0.glottocode}'.
                        format(soc), ds)
            # are there duplicate variables?
            for var in ds.variables:
                if var.id in varids:  # pragma: no cover
                    error('duplicate variable ID: {0}'.format(var.id), ds)
                # Codes are only recorded for Categorical and Ordinal
                # variables; every other type gets an empty list.
                varids[var.id] = [c.code for c in var.codes] if var.type in [
                    'Categorical', 'Ordinal'
                ] else []

            # are there undefined variables?
            undefined = set(
                [r.var_id for r in ds.data if r.var_id not in varids])
            for u in undefined:  # pragma: no cover
                error('undefined variable ID: {0}'.format(u), ds)

            for d in ds.data:
                # NOTE: an undefined variable id is reported here once per
                # data row, in addition to the per-id report above.
                if d.var_id not in varids:  # pragma: no cover
                    error('undefined variable ID: {0}'.format(d.var_id), ds)
                # Codes are only validated for variables with more than one
                # recorded code.
                elif len(varids[d.var_id]) > 1 \
                        and d.code not in varids[d.var_id]:  # pragma: no cover
                    error(
                        'undefined code for variable {0} and society {1}:{2}'.
                        format(d.var_id, d.soc_id, d.code), ds)
                for ref in d.references:
                    if ref.key not in sources:
                        error(
                            'undefined source key "{0}" referenced in {1}'.
                            format(ref.key, ds.id), ds)

        for xdid, glottocodes in xdids.items():
            # A missing glottocode (None) does not count as a conflict
            if len(glottocodes - {None}) > 1:  # pragma: no cover
                # No xd_id can be linked to more than one Glottocode!
                error('xd_id {0} mapped to multiple glottocodes {1}'.format(
                    xdid, glottocodes))

        for p in self.phylogenies:
            if p.source_id:
                if p.source_id not in sources:  # pragma: no cover
                    error(
                        '{0}: invalid source_id {1}'.format(p.id, p.source_id),
                        p)
            # Taxon labels of this phylogeny, used for the leaf check below
            taxa = set()
            for taxon in p.taxa:
                taxa.add(taxon.taxon)
                if taxon.glottocode and taxon.glottocode not in glottolog:
                    error(
                        '{0}: invalid glottocode {1}'.format(
                            p.id, taxon.glottocode), p)
                for socid in taxon.soc_ids:
                    if socid not in socids:
                        error('{0}: invalid soc_id {1}'.format(p.id, socid), p)
                for xdid in taxon.xd_ids:
                    if xdid not in xdids:
                        error('{0}: invalid xd_id {1}'.format(p.id, xdid), p)

            if not p.nexus:  # pragma: no cover
                error('{0}: unable to load summary.trees'.format(p.id), p)

            # The parsed tree is discarded: constructing it only verifies
            # that the newick string can be parsed.
            try:
                Tree(p.newick, format=1)
            except NewickError as e:  # pragma: no cover
                error(
                    '{0}: invalid newick tree from summary.trees: {1}'.format(
                        p.id, e), p)

            if not p.is_glottolog:
                # Every named leaf must be listed among the taxa
                for node in p.newick_tree.walk():
                    if node.name and node.is_leaf and node.name not in taxa:  # pragma: no cover
                        warning('Leaf label missing in taxa.csv: {0}'.format(
                            node.name),
                                obj=p)

        for key in ['warning', 'error']:
            for msg in msgs[key]:
                print(msg)
        return not bool(msgs['error'])
コード例 #59
0
def add_group_to_tree(group, treefile, outdir, to_compress=False):
    """
    Colour the nodes of a tree by the presence of a homolog group.

    The counts are read from ``homolog_matrix.txt`` in *outdir*: the line
    starting with a tab is the header, the line starting with *group*
    holds one integer per header column.  Leaves are coloured by whether
    their count is positive; internal nodes by the fraction of their
    leaves with a positive count, and sized by the number of leaves.

    :param group: name of the homolog group (matched as a line prefix)
    :param treefile: path of the tree file
    :param outdir: directory containing ``homolog_matrix.txt``
    :param to_compress: comma-separated species names, or a false value.
        An internal node whose leaves belong for more than 95% to one of
        these species is collapsed and renamed ``"<species> clade"``.
    :return: the styled tree
    :raises ValueError: when the header line or the line for *group* is
        missing from the matrix file
    """
    compress = to_compress.split(",") if to_compress else []

    matrix_path = os.path.join(outdir, "homolog_matrix.txt")
    header = None
    vals = None
    # The context manager closes the matrix file even if parsing fails
    with open(matrix_path, 'r') as matrix:
        for line in matrix:
            if line.startswith("\t"):
                header = line.rstrip().split("\t")[1:]
            # NOTE(review): prefix match -- a group name that is a prefix
            # of another one matches both lines and the last one wins;
            # confirm group names cannot collide this way.
            if not line.startswith(group):
                continue
            vals = [int(x) for x in line.rstrip().split("\t")[1:]]
    if header is None or vals is None:
        raise ValueError(
            "header or group {!r} not found in {}".format(group, matrix_path))
    groupdata = dict(zip(header, vals))

    tree = Tree(os.path.abspath(treefile))

    pal = sns.cubehelix_palette(rot=-.4, n_colors=13)
    for node in tree.iter_descendants("preorder"):
        this_node = []
        nstyle = NodeStyle()
        nstyle["shape"] = "circle"

        if node.is_leaf():
            # Leaves missing from the matrix are treated as absent
            present = groupdata.get(node.name, 0) > 0
            nstyle["fgcolor"] = colors.rgb2hex(pal[12] if present else pal[0])
        else:
            # Collect the leaves below this node and count them per species
            # (second "_"-separated field of the leaf name).
            species = {}
            for x in node.iter_descendants("preorder"):
                if x.is_leaf():
                    this_node.append(x.name)
                    s = x.name.split("_")[1]
                    species[s] = species.get(s, 0) + 1
            n_leaves = float(len(this_node))
            for c in compress:
                if species.get(c, 0) / n_leaves > 0.95:
                    nstyle["draw_descendants"] = False
                    node.name = "{} clade".format(c)
            count = sum(1 for t in this_node if groupdata.get(t, 0) > 0)
            # Map the fraction of positive leaves onto palette index 0..12
            v = int(round(float(count) / n_leaves * 12))
            nstyle["fgcolor"] = colors.rgb2hex(pal[v])
            nstyle["size"] = 3 * math.sqrt(len(this_node))
        node.set_style(nstyle)

    return tree
コード例 #60
0
            stack.append(dictionary[current][1])
            stack.append(dictionary[current][2])
            stack.append(dictionary[current][3])
            result.append("(")
        else:
            result.append(current)
        current_prev = current

    result.pop()
    result.append(")")
    return result


if __name__ == "__main__":
    # Cluster the input matrix with WPGMA and print the resulting tree
    matrix, length = readInput()
    dictionary = {}
    finalCluster = wpgma(matrix, length, dictionary)
    # Join the pieces returned by printCluster and append the terminating ';'
    result = ''.join(printCluster(dictionary, finalCluster)) + ";"

    # ete3 is a toolkit for building and drawing phylogenetic trees

    from ete3 import Tree
    tree = Tree(result)
    for output in ("WPGMA Resultant Clustering:", "", result, "", tree):
        print(output)