Ejemplo n.º 1
0
def main(args):
    """Convert a GTR/CDT clustering file pair into a Newick tree file.

    The CDT file is tab-delimited: its first two rows (header and EWEIGHT)
    are skipped, and the first two columns of every remaining row give a
    gene id and its name (stored upper-cased). Every row of the
    tab-delimited GTR file is parsed with ``GTRLine._make`` and describes
    one internal node joining ``left_child`` and ``right_child``. The node
    built from the last GTR row is written out as the tree.

    Args:
        args: sequence of three paths ``(gtr_file, cdt_file, nwk_file)``.

    Raises:
        ValueError: if the GTR file contains no rows.
        AssertionError: if a child is neither a known gene id nor a
            previously defined node.
    """
    gtr_file, cdt_file, nwk_file = args

    # Map gene ids to upper-cased names; the handle is closed on exit.
    gid_to_name = {}
    with open(cdt_file) as cdt_handle:
        reader = csv.reader(cdt_handle, delimiter="\t")
        next(reader)  # header
        next(reader)  # EWEIGHT
        for row in reader:
            gid, name = row[:2]
            gid_to_name[gid] = name.upper()

    nodes = {}  # node name -> (Tree node, value of its "dist" column)
    node = None
    with open(gtr_file) as gtr_handle:
        reader = csv.reader(gtr_handle, delimiter="\t")
        for gtr in map(GTRLine._make, reader):
            node = Tree()
            parent_name, parent_dist = gtr.parent, float(gtr.dist)
            for child in (gtr.left_child, gtr.right_child):
                if child in gid_to_name:
                    # Leaf: branch length measured from 1.
                    node.add_child(name=gid_to_name[child],
                                   dist=1 - parent_dist)
                else:
                    assert child in nodes, child
                    child_node, child_dist = nodes[child]
                    node.add_child(child_node, dist=child_dist - parent_dist)

            nodes[parent_name] = (node, parent_dist)

    if node is None:
        raise ValueError("no rows found in %s" % gtr_file)

    # The node built from the last row is used as the root.
    sys.stderr.write("writing newick tree to %s\n" % nwk_file)
    node.write(format=5, outfile=nwk_file)
Ejemplo n.º 2
0
def main(args):
    """Build a Newick tree from a GTR file and its companion CDT file.

    The tab-delimited CDT file supplies gene names: its first two rows
    (header and EWEIGHT) are skipped, then columns one and two of each row
    are read as gene id and name (the name is upper-cased). The
    tab-delimited GTR file supplies the topology: each row is parsed with
    ``GTRLine._make`` and joins ``left_child`` and ``right_child`` under
    ``parent``. The node created for the last GTR row is written to
    ``nwk_file``.

    Args:
        args: sequence of three paths ``(gtr_file, cdt_file, nwk_file)``.

    Raises:
        ValueError: if the GTR file contains no rows.
        AssertionError: if a child is neither a known gene id nor a
            previously defined node.
    """
    gtr_file, cdt_file, nwk_file = args

    gid_to_name = {}
    # ``with`` guarantees the input handle is released.
    with open(cdt_file) as cdt_handle:
        cdt_rows = csv.reader(cdt_handle, delimiter="\t")
        next(cdt_rows)  # header
        next(cdt_rows)  # EWEIGHT
        for row in cdt_rows:
            gid, name = row[:2]
            gid_to_name[gid] = name.upper()

    nodes = {}  # node name -> (Tree node, value of its "dist" column)
    root = None
    with open(gtr_file) as gtr_handle:
        gtr_rows = csv.reader(gtr_handle, delimiter="\t")
        for gtr in map(GTRLine._make, gtr_rows):
            root = Tree()
            parent_name, parent_dist = gtr.parent, float(gtr.dist)
            for child in (gtr.left_child, gtr.right_child):
                if child in gid_to_name:
                    # Leaf: branch length measured from 1.
                    root.add_child(name=gid_to_name[child],
                                   dist=1 - parent_dist)
                else:
                    assert child in nodes, child
                    child_node, child_dist = nodes[child]
                    root.add_child(child_node, dist=child_dist - parent_dist)

            nodes[parent_name] = (root, parent_dist)

    if root is None:
        raise ValueError("no rows found in %s" % gtr_file)

    # ``root`` now refers to the node built from the last GTR row.
    sys.stderr.write("writing newick tree to %s\n" % nwk_file)
    root.write(format=5, outfile=nwk_file)
Ejemplo n.º 3
0
class K_Graph(object):
    """Tree of topics and points that remembers the last topic added.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Name of the topic most recently passed to add_topic().
        self.topic = ''
        # Root of the topic tree.
        self.theme = Tree()

    def add_point(self, topic, point):
        """Add a child named ``point`` under every node matching ``topic``.

        NOTE(review): the match is ``node.name in topic``, i.e. a
        substring test when both are strings, not equality -- confirm
        this is intended.
        """
        for node in self.theme.traverse():
            if node.name not in topic:
                continue
            node.add_child(name=point)

    def add_topic(self, topic):
        """Attach ``topic`` below the root and make it the current topic."""
        self.topic = topic
        self.theme.add_child(name=topic)

    def getCurrentGraph(self):
        """Return the first traversed node whose name is in the current topic.

        Returns None when no node matches.
        """
        matches = (node for node in self.theme.traverse()
                   if node.name in self.topic)
        return next(matches, None)

    def get_topic(self):
        """Return the current topic name."""
        return self.topic

    def save(self):
        """Pickle this object to ``data.pickle`` in the working directory."""
        with open('data.pickle', 'wb') as handle:
            # Use the highest pickle protocol available.
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def load(self):
        """Return the object unpickled from ``data.pickle``.

        The loaded object is returned; ``self`` is not modified.
        """
        with open('data.pickle', 'rb') as handle:
            # The protocol version is detected automatically.
            restored = pickle.load(handle)
        return restored
Ejemplo n.º 4
0
def neighbor_joining(D, tree, internals):
    # fsum would give better precision when adding distances across sites.
    # Distances are based on PLs, not on mutations.
    """Join the closest pair of active nodes until only two remain.

    Each iteration picks the pair with the smallest Q value, detaches both
    nodes from ``tree``, hangs them under a new node named ``str(l)`` and
    attaches that node to ``tree``. ``D`` is modified in place: row and
    column ``l`` receive the distances of the new node, so ``D`` must
    already contain those rows/columns.

    Args:
        D (np.array): pairwise differences between samples based on PLs
            (passing copy); written to in place
        tree (Tree): tree of class Tree with num tips = num samples
        internals (np.array): array of sample numbers; entries are used as
            indices into D and, as strings, as node names in tree

    Returns:
        D (np.array): updated pairwise differences now there are internal
            nodes to compare
        tree (Tree): the same tree object after joining

    """
    print('neighbor_joining() begin', end=' ', file=sys.stderr)
    m = len(internals)
    while m > 2:  # if m is 2 then only two nodes are connected to the root
        d = D[
            internals[:, None],
            internals]  # sub-matrix of D restricted to the active nodes (NumPy fancy indexing yields a copy, so the writes to D below do not alter d)
        u = d.sum(axis=1) / (m - 2)

        Q = np.zeros(shape=(m, m), dtype=np.longdouble)
        for i, j in itertools.combinations(xrange(m), 2):  # standard Q matrix calculation, mirrored to keep Q symmetric
            Q[i, j] = d[i, j] - u[i] - u[j]
            Q[j, i] = Q[i, j]
        #print(Q.astype(int))
        # Infinite diagonal so argmin never pairs a node with itself.
        np.fill_diagonal(Q, np.inf)
        #print(np.unique(Q, return_counts=True))
        i, j = np.unravel_index(
            Q.argmin(), (m, m)
        )  # location in the matrix of the smallest Q value (i.e. closest nodes/tips)
        # Index of the new internal node; it grows by one per iteration
        # because m shrinks by one per iteration.
        l = len(D) + 2 - m

        # Distances from the new node to every active node; the entries for
        # i and j are overwritten just below with the branch lengths.
        for k in xrange(m):
            D[l, internals[k]] = D[internals[k],
                                   l] = d[i, k] + d[j, k] - d[i, j]
        D[l, internals[i]] = D[internals[i],
                               l] = vi = (d[i, j] + u[i] - u[j]) / 2
        D[l, internals[j]] = D[internals[j],
                               l] = vj = (d[i, j] + u[j] - u[i]) / 2

        # `tree & name` looks the node up by name (ete operator) -- TODO confirm
        ci = tree & str(internals[i])
        cj = tree & str(internals[j])
        ci.detach()
        cj.detach()
        node = Tree(name=str(l))
        # Branch lengths are truncated to integers.
        node.add_child(ci, dist=int(vi))
        node.add_child(cj, dist=int(vj))
        tree.add_child(node)
        #print(tree)

        # Replace the joined pair by the new node in the active set.
        internals = np.delete(internals, [i, j])
        internals = np.append(internals, l)
        m = len(internals)
        print('.', end='', file=sys.stderr)

    print(' done', file=sys.stderr)
    return D, tree
Ejemplo n.º 5
0
def getGenera(taxonomy_queryset, only_id=False):
    """
    ..
    This function generates a Tree object derived from the collapse
    of all *species* under the scope of a spatial queryset.

    Every genus becomes a child of a root named 'genus_root', and every
    species of that genus becomes a child of the genus node.

    Parameters
    ----------
    taxonomy_queryset gbif.models / GeoquerySet
        Must expose ``genera`` (iterable of records with the keys
        'genus_id', 'name', 'ab', 'points') and ``species`` (filterable
        with ``genus_id__exact``; records with the keys 'species_id',
        'name', 'ab', 'points').

    only_id : Boolean (flag)
        False (default) names every node with its full name; species
        names are cut down to their first two space-separated words.
        True names every node with its id instead. Names are strings of
        varying length, so on big data sets using ids reduces the amount
        of memory used.

    Returns
    -------
    :genera_tree: derived from ete2.TreeNode()
    """
    tax = taxonomy_queryset
    sps = tax.species
    genera = tax.genera
    family_tree = Tree(name='genus_root')
    for genus in genera:
        genus_id = genus['genus_id']
        genus_name = genus_id if only_id else genus['name']
        gn_t = Tree(name=genus_name, support=genus['ab'])
        gn_t.add_feature('genus_id', genus_id)
        gn_t.add_feature('level', 'genus')
        gn_t.add_feature('points', genus['points'])
        for specie in sps.filter(genus_id__exact=genus_id):
            species_id = specie['species_id']
            if only_id:
                species_name = species_id
            else:
                # Keep at most the first two words; a one-word name is
                # kept as is instead of raising IndexError.
                species_name = ' '.join(specie['name'].split(' ')[:2])
            s = Tree(name=species_name, support=specie['ab'])
            s.add_feature('species_id', species_id)
            s.add_feature('level', 'species')
            s.add_feature('points', specie['points'])
            gn_t.add_child(child=s)
        family_tree.add_child(child=gn_t)
    return family_tree
def getEte2Tree(hypoTree):
  """Recursively convert a nested-list tree into an ete Tree.

  Entries of ``hypoTree`` that are lists become child subtrees; any other
  entry supplies the node's name through its ``name`` attribute (the last
  such entry wins).
  """
  root = Tree()
  for item in hypoTree:
    if type(item) is not list:
      root.name = item.name
      continue
    root.add_child(getEte2Tree(item))
  return root
Ejemplo n.º 7
0
def getGenera(taxonomy_queryset, only_id=False):
    """
    ..
    This function generates a Tree object derived from the collapse
    of all *species* under the scope of a spatial queryset.

    The result is a root named 'genus_root' with one child per genus;
    each genus node holds one child per species of that genus.

    Parameters
    ----------
    taxonomy_queryset gbif.models / GeoquerySet
        Must expose ``genera`` (iterable of records with the keys
        'genus_id', 'name', 'ab', 'points') and ``species`` (filterable
        with ``genus_id__exact``; records with the keys 'species_id',
        'name', 'ab', 'points').

    only_id : Boolean (flag)
        False (default) uses the full name as node name (for species,
        only the first two space-separated words). True uses the id as
        node name. Names are strings of varying length, so on big data
        sets the ids keep the memory footprint lower.

    Returns
    -------
    :genera_tree: derived from ete2.TreeNode()
    """
    tax = taxonomy_queryset
    sps = tax.species
    genera = tax.genera
    family_tree = Tree(name='genus_root')
    for genus in genera:
        genus_id = genus['genus_id']
        if only_id:
            genus_label = genus_id
        else:
            genus_label = genus['name']
        gn_t = Tree(name=genus_label, support=genus['ab'])
        gn_t.add_feature('genus_id', genus_id)
        gn_t.add_feature('level', 'genus')
        gn_t.add_feature('points', genus['points'])
        sp_by_gns = sps.filter(genus_id__exact=genus_id)
        for specie in sp_by_gns:
            if only_id:
                species_label = specie['species_id']
            else:
                # Slicing (instead of indexing word 0 and word 1) keeps a
                # one-word name intact rather than raising IndexError.
                words = specie['name'].split(' ')
                species_label = ' '.join(words[:2])
            s = Tree(name=species_label, support=specie['ab'])
            s.add_feature('species_id', specie['species_id'])
            s.add_feature('level', 'species')
            s.add_feature('points', specie['points'])
            gn_t.add_child(child=s)
        family_tree.add_child(child=gn_t)
    return family_tree
Ejemplo n.º 8
0
def getClasses(taxonomic_queryset, orders_tree, only_id=False):
    """
    ..
    This function generates a Tree object derived from the collapse
    of all *classes* under the scope of a spatial queryset.

    Every class becomes a child of a root named 'phylum_root'; the order
    branches belonging to the class are taken from ``orders_tree`` and
    attached below it.

    Parameters
    ----------
    taxonomic_queryset gbif.models / GeoquerySet
        Must expose ``classes`` (iterable of records with the keys
        'class_id', 'name', 'ab', 'points') and ``orders`` (filterable
        with ``parent_id__exact``; records with the key 'order_id').
    :orders_tree: Tree derived from getOrders; its direct children must
        carry an ``order_id`` feature.

    only_id : Boolean (flag)
        False (default) names each class node with its full name; True
        names it with its class_id. Names are strings of varying length,
        so on big data sets the ids keep the memory footprint lower.

    Returns
    -------
    :classes_tree: derived from ete2.TreeNode()

    Raises
    ------
    ValueError
        If an order has no matching branch among the children of
        ``orders_tree``.
    """
    tax = taxonomic_queryset
    classes = tax.classes
    orders = tax.orders
    phylumTree = Tree(name='phylum_root')
    logger.info("[gbif.buildtree] Collapsing Classes")
    for class_ in classes:
        class_id = class_['class_id']
        name = class_id if only_id else class_['name']
        classTree = Tree(name=name, support=class_['ab'])
        classTree.add_feature('class_id', class_id)
        classTree.add_feature('level', 'class')
        classTree.add_feature('points', class_['points'])
        for order in orders.filter(parent_id__exact=class_id):
            id_o = order['order_id']
            # Take the first child of orders_tree carrying this order id.
            branch = next(
                (node for node in orders_tree.get_children()
                 if node.order_id == id_o),
                None)
            if branch is None:
                raise ValueError(
                    "no branch with order_id %r in orders_tree" % (id_o,))
            # Attach the branch to the class tree
            classTree.add_child(child=branch)
        phylumTree.add_child(child=classTree)
    return phylumTree
Ejemplo n.º 9
0
def getClasses(taxonomic_queryset, orders_tree, only_id=False):
    """
    ..
    This function generates a Tree object derived from the collapse
    of all *classes* under the scope of a spatial queryset.

    Every class becomes a child of a root named 'phylum_root'; the order
    branches belonging to the class are taken from ``orders_tree`` and
    attached below it.

    Parameters
    ----------
    taxonomic_queryset gbif.models / GeoquerySet
        Must expose ``classes`` (iterable of records with the keys
        'class_id', 'parent_id', 'name', 'ab', 'points') and ``orders``
        (filterable with ``parent_id__exact``; records with the key
        'order_id').
    :orders_tree: Tree derived from getOrders; its direct children must
        carry an ``order_id`` feature.

    only_id : Boolean (flag)
        False (default) names each class node with its full name; True
        names it with its class_id. Names are strings of varying length,
        so on big data sets the ids keep the memory footprint lower.

    Returns
    -------
    :classes_tree: derived from ete2.TreeNode()

    Raises
    ------
    ValueError
        If an order has no matching branch among the children of
        ``orders_tree``.
    """
    tax = taxonomic_queryset
    classes = tax.classes
    orders = tax.orders
    phylumTree = Tree(name='phylum_root')
    logger.info("[gbif.buildtree] Collapsing Classes")
    for class_ in classes:
        phylum_id = class_['parent_id']
        class_id = class_['class_id']
        ab = class_['ab']
        name = class_id if only_id else class_['name']
        classTree = Tree(name=name, support=ab)
        classTree.add_feature('id', class_id)
        classTree.add_feature('abundance', ab)
        classTree.add_feature('parent_id', phylum_id)
        classTree.add_feature('class_id', class_id)
        classTree.add_feature('level', 'class')
        classTree.add_feature('points', class_['points'])
        for order in orders.filter(parent_id__exact=class_id):
            id_o = order['order_id']
            # Take the first child of orders_tree carrying this order id.
            branch = next(
                (node for node in orders_tree.get_children()
                 if node.order_id == id_o),
                None)
            if branch is None:
                raise ValueError(
                    "no branch with order_id %r in orders_tree" % (id_o,))
            # Attach the branch to the class tree
            classTree.add_child(child=branch)
        phylumTree.add_child(child=classTree)
    return phylumTree
Ejemplo n.º 10
0
def getKingdoms(taxonomic_queryset, phyla_tree, only_id=False):
    """
    ...
    This function generates a Tree object derived from the collapse
    of all *kingdoms* under the scope of a spatial queryset.

    Every kingdom becomes a child of a root named 'Life'; the phylum
    branches belonging to the kingdom are taken from ``phyla_tree`` and
    attached below it.

    Parameters
    ----------
        taxonomic_queryset gbif.models / GeoquerySet
            Must expose ``kingdoms`` (iterable of records with the keys
            'kingdom_id', 'name', 'ab', 'points') and ``phyla``
            (filterable with ``parent_id__exact``; records with the key
            'phylum_id').
        :phyla_tree: Tree derived from getPhyla; its direct children
            must carry a ``phylum_id`` feature.

        only_id : Boolean (flag)
            False (default) names each kingdom node with its full name;
            True names it with its kingdom_id. Names are strings of
            varying length, so on big data sets the ids keep the memory
            footprint lower.

    Returns
    -------
    :kingdoms_tree: derived from ete2.TreeNode()

    Raises
    ------
    ValueError
        If a phylum has no matching branch among the children of
        ``phyla_tree``.
    """
    tax = taxonomic_queryset
    kingdoms = tax.kingdoms
    phyla = tax.phyla
    TreeOfLife = Tree(name='Life')
    logger.info("[gbif.buildtree] Collapsing Kingdoms")
    for kingdom in kingdoms:
        kingdom_id = kingdom['kingdom_id']
        name = kingdom_id if only_id else kingdom['name']
        kingdomTree = Tree(name=name, support=kingdom['ab'])
        kingdomTree.add_feature('kingdom_id', kingdom_id)
        kingdomTree.add_feature('level', 'kingdom')
        kingdomTree.add_feature('points', kingdom['points'])
        for phylum in phyla.filter(parent_id__exact=kingdom_id):
            id_p = phylum['phylum_id']
            # Take the first child of phyla_tree carrying this phylum id.
            branch = next(
                (node for node in phyla_tree.get_children()
                 if node.phylum_id == id_p),
                None)
            if branch is None:
                raise ValueError(
                    "no branch with phylum_id %r in phyla_tree" % (id_p,))
            # Attach the branch to the kingdom tree
            kingdomTree.add_child(child=branch)
        TreeOfLife.add_child(child=kingdomTree)
    return TreeOfLife
Ejemplo n.º 11
0
def buildTree(taxid_list, nodes_dict, taxids_remove, cursor):
    """Recursive function, returns an ete tree object from a list of taxids.

    Each call resolves ``taxid_list`` through ``query_a_list``, links every
    taxid to its parent node, and then recurses on the set of parents until
    only taxid 1 (the root of the NCBI tree) is left.

    Requires a cursor connected to a sqlite db built using the script
    /users/rg/didac/NCBI/Taxonomy/update_sqlite_DB.py

    Args:
        taxid_list: taxids to resolve in this step.
        nodes_dict: dict mapping taxid -> tree node; pass an empty dict on
            the first call. It is filled in place.
        taxids_remove: list collecting the taxids that returned no result;
            pass an empty list on the first call. It is extended in place.
        cursor: database cursor handed to ``query_a_list``.

    Returns:
        Tuple ``(root_node, nodes_dict, taxids_remove)`` where ``root_node``
        is ``nodes_dict[1]``.

    Raises:
        KeyError: if no node for taxid 1 was ever created (for instance
            when the first query returns no rows).
    """
    results = query_a_list(taxid_list, cursor)

    # check if all taxids returned a result
    if len(set(taxid_list)) != len(results):
        taxids_with_result = set(row[0] for row in results)
        taxids_remove += list(set(map(int, taxid_list)) - taxids_with_result)

    parent_taxids = set()
    for taxid, parent_taxid, rank, name in results:
        parent_taxids.add(parent_taxid)

        node = nodes_dict.get(taxid)
        if node is None:
            node = nodes_dict[taxid] = Tree()
        # A node first created as somebody's parent only had its taxid;
        # now that its own row is available, fill in name and rank too.
        node.add_features(name=name, taxid=taxid, rank=rank)

        parent = nodes_dict.get(parent_taxid)
        if parent is None:
            # Scientific name and rank of the parent are not known yet; they
            # are set when the parent shows up as a taxid in the next call.
            parent = nodes_dict[parent_taxid] = Tree()
            parent.add_feature('taxid', parent_taxid)
            parent.add_child(node)
        elif not any(taxid == descendant.taxid
                     for descendant in parent.iter_descendants()):
            # Only attach the node if it is not already below the parent.
            parent.add_child(node)

    # "1" is the root of the NCBI tree: it has no parent worth querying, so
    # drop it. discard() is a no-op when 1 is absent.
    parent_taxids.discard(1)

    if parent_taxids:
        return buildTree(list(parent_taxids), nodes_dict, taxids_remove,
                         cursor)

    nodes_dict[1].add_features(name='Root', rank='Root')
    return nodes_dict[1], nodes_dict, taxids_remove
Ejemplo n.º 12
0
def getKingdoms(taxonomic_queryset, phyla_tree, only_id=False):
    """
    ...
    This function generates a Tree object derived from the collapse
    of all *kingdoms* under the scope of a spatial queryset.

    The result is a root named 'Life' with one child per kingdom; below
    each kingdom hang the matching phylum branches of ``phyla_tree``.

    Parameters
    ----------
        taxonomic_queryset gbif.models / GeoquerySet
            Must expose ``kingdoms`` (iterable of records with the keys
            'kingdom_id', 'name', 'ab', 'points') and ``phyla``
            (filterable with ``parent_id__exact``; records with the key
            'phylum_id').
        :phyla_tree: Tree derived from getPhyla; its direct children
            must carry a ``phylum_id`` feature.

        only_id : Boolean (flag)
            False (default) uses the full kingdom name as node name;
            True uses the kingdom_id. Names are strings of varying
            length, so on big data sets the ids keep the memory
            footprint lower.

    Returns
    -------
    :kingdoms_tree: derived from ete2.TreeNode()

    Raises
    ------
    ValueError
        If a phylum has no matching branch among the children of
        ``phyla_tree``.
    """
    tax = taxonomic_queryset
    kingdoms = tax.kingdoms
    phyla = tax.phyla
    TreeOfLife = Tree(name='Life')
    logger.info("[gbif.buildtree] Collapsing Kingdoms")
    for kingdom in kingdoms:
        kingdom_id = kingdom['kingdom_id']
        if only_id:
            name = kingdom_id
        else:
            name = kingdom['name']
        kingdomTree = Tree(name=name, support=kingdom['ab'])
        kingdomTree.add_feature('kingdom_id', kingdom_id)
        kingdomTree.add_feature('level', 'kingdom')
        kingdomTree.add_feature('points', kingdom['points'])
        phyla_by_kingdom = phyla.filter(parent_id__exact=kingdom_id)
        for phylum in phyla_by_kingdom:
            id_p = phylum['phylum_id']
            # First child of phyla_tree whose phylum_id matches.
            candidates = (node for node in phyla_tree.get_children()
                          if node.phylum_id == id_p)
            branch = next(candidates, None)
            if branch is None:
                raise ValueError(
                    "no branch with phylum_id %r in phyla_tree" % (id_p,))
            # Attach the branch to the kingdom tree
            kingdomTree.add_child(child=branch)
        TreeOfLife.add_child(child=kingdomTree)
    return TreeOfLife
Ejemplo n.º 13
0
def neighbor_joining(D, tree, internals):
    # fsum would give better precision when adding distances across sites.
    # Distances are based on PLs, not on mutations.
    """Join the closest pair of active nodes until only two remain.

    Each iteration picks the pair with the smallest Q value, detaches both
    nodes from ``tree``, hangs them under a new node named ``str(l)`` and
    attaches that node to ``tree``. ``D`` is modified in place: row and
    column ``l`` receive the distances of the new node, so ``D`` must
    already contain those rows/columns.

    Args:
        D (np.array): pairwise differences between samples based on PLs
            (passing copy); written to in place
        tree (Tree): tree of class Tree with num tips = num samples
        internals (np.array): array of sample numbers; entries are used as
            indices into D and, as strings, as node names in tree

    Returns:
        D (np.array): updated pairwise differences now there are internal
            nodes to compare
        tree (Tree): the same tree object after joining

    """
    print('neighbor_joining() begin', end=' ', file=sys.stderr)
    m = len(internals)
    while m > 2:  # if m is 2 then only two nodes are connected to the root
        d = D[internals[:,None],internals]  # sub-matrix of D restricted to the active nodes (NumPy fancy indexing yields a copy, so the writes to D below do not alter d)
        u = d.sum(axis=1)/(m-2)

        Q = np.zeros(shape=(m,m), dtype=np.longdouble)
        for i,j in itertools.combinations(xrange(m),2):  # standard Q matrix calculation, mirrored to keep Q symmetric
            Q[i,j] = d[i,j]-u[i]-u[j]
            Q[j,i] = Q[i,j]
        #print(Q.astype(int))
        # Infinite diagonal so argmin never pairs a node with itself.
        np.fill_diagonal(Q, np.inf)
        #print(np.unique(Q, return_counts=True))
        i,j = np.unravel_index(Q.argmin(), (m,m))  # location in the matrix of the smallest Q value (i.e. closest nodes/tips)
        # Index of the new internal node; it grows by one per iteration
        # because m shrinks by one per iteration.
        l = len(D)+2-m

        # Distances from the new node to every active node; the entries for
        # i and j are overwritten just below with the branch lengths.
        for k in xrange(m):
            D[l,internals[k]] = D[internals[k],l] = d[i,k]+d[j,k]-d[i,j]
        D[l,internals[i]] = D[internals[i],l] = vi = (d[i,j]+u[i]-u[j])/2
        D[l,internals[j]] = D[internals[j],l] = vj = (d[i,j]+u[j]-u[i])/2

        # `tree & name` looks the node up by name (ete operator) -- TODO confirm
        ci = tree&str(internals[i])
        cj = tree&str(internals[j])
        ci.detach()
        cj.detach()
        node = Tree(name=str(l))
        # Branch lengths are truncated to integers.
        node.add_child(ci,dist=int(vi))
        node.add_child(cj,dist=int(vj))
        tree.add_child(node)
        #print(tree)

        # Replace the joined pair by the new node in the active set.
        internals = np.delete(internals, [i,j])
        internals = np.append(internals, l)
        m = len(internals)
        print('.', end='', file=sys.stderr)

    print(' done', file=sys.stderr)
    return D,tree
Ejemplo n.º 14
0
 def treeFromQuartet(quartet):
     """Build a rooted tree ((q0, q1), (q2, q3)) from a four-item quartet.

     The root is named "root" and has two children, "Left" (holding the
     first two quartet items as leaves) and "Right" (holding the last
     two). Every descendant's branch length is set to 0.
     """
     root = Tree()
     root.name = "root"
     for label, first, second in (("Left", 0, 1), ("Right", 2, 3)):
         side = root.add_child(name=label)
         side.add_child(name=quartet[first])
         side.add_child(name=quartet[second])
     for node in root.iter_descendants():
         node.dist = 0
     return root
Ejemplo n.º 15
0
def getFamilies(taxonomic_queryset, genera_tree, only_id=False):
    """
    ..
    This function generates a Tree object derived from the collapse
    of all *families* under the scope of a spatial queryset.

    Every family becomes a child of a root named 'order_root'; the genus
    branches belonging to the family are taken from ``genera_tree`` and
    attached below it.

    Parameters
    ----------
    taxonomic_queryset gbif.models / GeoquerySet
        Must expose ``families`` (iterable of records with the keys
        'family_id', 'parent_id', 'name', 'ab', 'points') and ``genera``
        (filterable with ``parent_id__exact``; records with the key
        'genus_id').
    :genera_tree: Tree derived from getGenera; its direct children must
        carry a ``genus_id`` feature.

    only_id : Boolean (flag)
        False (default) names each family node with its full name; True
        names it with its family_id. Names are strings of varying length,
        so on big data sets the ids keep the memory footprint lower.

    Returns
    -------
    :families_tree: derived from ete2.TreeNode()

    Raises
    ------
    ValueError
        If a genus has no matching branch among the children of
        ``genera_tree``.
    """
    tax = taxonomic_queryset
    families = tax.families
    genera = tax.genera
    orders_tree = Tree(name='order_root')
    for family in families:
        order_id = family['parent_id']
        family_id = family['family_id']
        ab = family['ab']
        name = family_id if only_id else family['name']
        famTree = Tree(name=name, support=ab)
        famTree.add_feature('abundance', ab)
        famTree.add_feature('id', family_id)
        famTree.add_feature('parent_id', order_id)
        famTree.add_feature('family_id', family_id)
        famTree.add_feature('level', 'family')
        famTree.add_feature('points', family['points'])
        for genus in genera.filter(parent_id__exact=family_id):
            id_g = genus['genus_id']
            # Take the first child of genera_tree carrying this genus id.
            branch = next(
                (node for node in genera_tree.get_children()
                 if node.genus_id == id_g),
                None)
            if branch is None:
                raise ValueError(
                    "no branch with genus_id %r in genera_tree" % (id_g,))
            # Attach the branch to the family tree
            famTree.add_child(child=branch)
        orders_tree.add_child(child=famTree)
    return orders_tree
Ejemplo n.º 16
0
def getFamilies(taxonomic_queryset, genera_tree, only_id=False):
    """
    ..
    This function generates a Tree object derived from the collapse
    of all *families* under the scope of a spatial queryset.

    The result is a root named 'order_root' with one child per family;
    below each family hang the matching genus branches of
    ``genera_tree``.

    Parameters
    ----------
    taxonomic_queryset gbif.models / GeoquerySet
        Must expose ``families`` (iterable of records with the keys
        'family_id', 'name', 'ab', 'points') and ``genera`` (filterable
        with ``parent_id__exact``; records with the key 'genus_id').
    :genera_tree: Tree derived from getGenera; its direct children must
        carry a ``genus_id`` feature.

    only_id : Boolean (flag)
        False (default) uses the full family name as node name; True uses
        the family_id. Names are strings of varying length, so on big
        data sets the ids keep the memory footprint lower.

    Returns
    -------
    :families_tree: derived from ete2.TreeNode()

    Raises
    ------
    ValueError
        If a genus has no matching branch among the children of
        ``genera_tree``.
    """
    tax = taxonomic_queryset
    families = tax.families
    genera = tax.genera
    orders_tree = Tree(name='order_root')
    for family in families:
        family_id = family['family_id']
        if only_id:
            name = family_id
        else:
            name = family['name']
        famTree = Tree(name=name, support=family['ab'])
        famTree.add_feature('family_id', family_id)
        famTree.add_feature('level', 'family')
        famTree.add_feature('points', family['points'])
        gens_by_fam = genera.filter(parent_id__exact=family_id)
        for genus in gens_by_fam:
            id_g = genus['genus_id']
            # First child of genera_tree whose genus_id matches.
            candidates = (node for node in genera_tree.get_children()
                          if node.genus_id == id_g)
            branch = next(candidates, None)
            if branch is None:
                raise ValueError(
                    "no branch with genus_id %r in genera_tree" % (id_g,))
            # Attach the branch to the family tree
            famTree.add_child(child=branch)
        orders_tree.add_child(child=famTree)
    return orders_tree
Ejemplo n.º 17
0
def getPhyla(taxonomic_queryset, classes_tree, only_id=False):
    """
    ...
    This function generates a Tree object derived from the collapse
    of all *phyla* under the scope of a spatial queryset.

    Every phylum becomes a child of a root named 'kingdom_root'; the
    class branches belonging to the phylum are taken from
    ``classes_tree`` and attached below it.

    Parameters
    ----------
    taxonomic_queryset gbif.models / GeoquerySet
        Must expose ``phyla`` (iterable of records with the keys
        'phylum_id', 'name', 'ab', 'points') and ``classes`` (filterable
        with ``parent_id__exact``; records with the key 'class_id').
    :classes_tree: Tree derived from getClasses; its direct children
        must carry a ``class_id`` feature.

    only_id : Boolean (flag)
        False (default) names each phylum node with its full name; True
        names it with its phylum_id. Names are strings of varying length,
        so on big data sets the ids keep the memory footprint lower.

    Returns
    -------
    :phyla_tree: derived from ete2.TreeNode()

    Raises
    ------
    ValueError
        If a class has no matching branch among the children of
        ``classes_tree``.
    """
    tax = taxonomic_queryset
    phyla = tax.phyla
    classes = tax.classes
    kingdomTree = Tree(name='kingdom_root')
    logger.info("[gbif.buildtree] Collapsing Phyla")
    for phylum in phyla:
        phylum_id = phylum['phylum_id']
        name = phylum_id if only_id else phylum['name']
        phylumTree = Tree(name=name, support=phylum['ab'])
        phylumTree.add_feature('phylum_id', phylum_id)
        phylumTree.add_feature('level', 'phylum')
        phylumTree.add_feature('points', phylum['points'])
        for class_ in classes.filter(parent_id__exact=phylum_id):
            id_c = class_['class_id']
            # Take the first child of classes_tree carrying this class id.
            branch = next(
                (node for node in classes_tree.get_children()
                 if node.class_id == id_c),
                None)
            if branch is None:
                raise ValueError(
                    "no branch with class_id %r in classes_tree" % (id_c,))
            # Attach the branch to the phylum tree
            phylumTree.add_child(child=branch)
        kingdomTree.add_child(child=phylumTree)
    return kingdomTree
Ejemplo n.º 18
0
def init_star_tree(n):
    """Return a star tree: one root with n leaves named "0" .. str(n-1).

    Args:
        n (int): Number of children to hang from the root

    Returns:
        Tree: the root of the new star tree
    """
    star = Tree()
    for index in xrange(n):
        label = str(index)
        star.add_child(name=label)
    return star
def generateTree(tree, wordlist):
  """Build an ete Tree from a list of merge steps and print it as ASCII.

  Step ``i`` of ``tree`` creates a node named ``str(-(i+1))``. The text
  of the step before its first ':' is evaluated to a sequence of numbers:
  a value >= 0 indexes ``wordlist`` and becomes a new leaf; a negative
  value ``-k`` refers to the node built at step ``k`` (1-based). The node
  built by the last step is printed and returned.
  """
  built = []
  for step, entry in enumerate(tree):
    merged = Tree()
    merged.name = str(-(step + 1))
    # SECURITY NOTE(review): eval() executes the text taken from `tree`;
    # only safe if that data is trusted -- consider a literal parser.
    members = eval(str(entry).split(':')[0])
    for member in members:
      if member >= 0:
        leaf = Tree()
        leaf.name = wordlist[member]
        merged.add_child(leaf)
      else:
        merged.add_child(built[int(-member)-1])
    built.append(merged)
  top = built[-1]
  print(top.get_ascii(show_internal=True))
  return top
Ejemplo n.º 20
0
def add_taxa(tree, new_taxa, taxa_in_clade, level):
    """Graft a subtree of new taxa onto ``tree`` and return the result.

    A tree for ``new_taxa`` is built with ``tree_from_taxonomy``. When
    ``stk.get_mrca`` reports 0 for ``taxa_in_clade``, a fresh root is made
    with the new taxa on one side and the original tree on the other, and
    it is returned via ``write(format=9)``. Otherwise the new subtree is
    added at the MRCA node (or, for a single taxon, at a node inserted
    between that taxon and the MRCA) and the stripped Newick string of
    the modified tree is returned.

    Presumably ``tree`` is a Newick string -- it is handed both to
    ``Tree(...)`` and ``stk._parse_tree(...)``; verify against callers.
    """
    # create new tree of the new taxa
    new_subtree = tree_from_taxonomy(level, new_taxa)

    # find mrca parent
    parsed = stk._parse_tree(tree)
    mrca_index = stk.get_mrca(tree, taxa_in_clade)

    if (mrca_index == 0):
        # The additional taxa are being placed at the root, so a brand new
        # tree is needed: one branch per side.
        joined = Tree()
        new_side = joined.add_child()
        old_side = joined.add_child()
        new_side.add_child(Tree(new_subtree))
        old_side.add_child(Tree(tree))
        return joined.write(format=9)

    anchor = parsed.nodes[mrca_index]
    new_subtree = stk._parse_tree(new_subtree)

    if len(taxa_in_clade) == 1:
        # Single taxon: insert a node between it and the MRCA and graft
        # the new subtree there instead.
        only_taxon = parsed.node(taxa_in_clade[0])
        anchor = parsed.addNodeBetweenNodes(only_taxon, anchor)

    # add the new tree at the chosen node
    parsed.addSubTree(anchor, new_subtree, ignoreRootAssert=True)

    return parsed.writeNewick(fName=None, toString=True).strip()
Ejemplo n.º 21
0
def trees():
    """Download Glottolog family trees and write each one as a NEXUS file.

    The tree URLs are the hard-coded 'global' Newick file plus one URL per
    ``<entry>`` of the feed fetched from ``GLOTTOLOG_FAMILIES``. Files in
    ``../trees`` whose name ends with ``SUFFIX`` are deleted first. Each
    tree is then downloaded, parsed, passed through ``clean_tree`` and, if
    that returns a true value, written to ``../trees`` with a taxa block
    and a trees block.
    """
    outdir = os.path.join('..', 'trees')
    urls = {'global': 'http://glottolog.org/static/trees/tree-glottolog-newick.txt'}

    # One extra URL per feed entry, keyed by the entry title.
    for entry in bs(requests.get(GLOTTOLOG_FAMILIES).text, 'html.parser').find_all('entry'):
        urls[entry.find('title').text] = entry.find('id').text

    # Remove previously generated files (the 'global' file uses a
    # different extension and is not matched here).
    for fname in os.listdir(outdir):
        if fname.endswith(SUFFIX):
            os.remove(os.path.join(outdir, fname))

    for family in sorted(urls):
        url = urls[family]
        # Entry ids lack the Newick suffix; the global URL already has it.
        if not url.endswith('newick.txt'):
            url += '.newick.txt'

        filename = os.path.join(outdir, (family + SUFFIX) if family != 'global' else family + '.trees')
        print("%30s <- %s" % (family, url))
        newick = requests.get(url).text.encode('utf-8')
        if family == 'global':
            # The global file holds several Newick trees separated by
            # ';\n'; each becomes a child of one common root.
            tree = Tree()
            for n in newick.split(';\n'):
                subtree = Tree(clean_newick(n + ';'), format=3)
                nodenames = [_n.name for _n in subtree.traverse()]
                # Exactly one duplicated node name is treated as an isolate.
                if len(nodenames) == len(set(nodenames)) + 1:
                    # FIXME: we must include isolates!
                    # just add single child?
                    tree.add_child(child=Tree(name=subtree.name), dist=1.0)
                    print 'adding isolate', subtree.name
                else:
                    tree.add_child(child=subtree, dist=1.0)
        else:
            tree = Tree(clean_newick(newick), format=3)

        # Nothing is written when clean_tree() returns a false value.
        if clean_tree(tree):
            newick_string = str(tree.write(format=3))
            with codecs.open(filename, 'w', encoding="utf-8") as handle:
                handle.write("#NEXUS\nBegin taxa;\n")  # write taxa to file
                # Every traversed node (not only leaves) whose name occurs
                # as a substring of the Newick output is listed as a taxon.
                for leaf in tree.traverse():
                    if str(leaf.name) in newick_string:
                        handle.write(leaf.name)
                        handle.write("\n")
                handle.write(";\nend;")
                # write newick string to file
                handle.write("\nBegin trees;\ntree UNTITLED = ")
                handle.write(newick_string)
                handle.write("\nend;")
Ejemplo n.º 22
0
def tree_generation(entities):
    """Display, for every entity, the chain of its progressively longer word suffixes.

    An entity is split on runs of whitespace and hyphens. Its last word is
    attached under a fresh root, the last two words under that node, and so
    on, until the complete phrase is the deepest node. The chain is then
    displayed starting from the root.

    Args:
        entities: iterable of strings.
    """
    for entity in entities:
        words = split(r'[\s-]+', entity)
        root = Tree()
        cursor = root
        # Walk the start index backwards so each level gains one leading word.
        for start in range(len(words) - 1, -1, -1):
            cursor = cursor.add_child(name=' '.join(words[start:]))
        # Show from the root: `cursor` now points at the deepest node, and
        # showing it would display that single node only.
        root.show()
Ejemplo n.º 23
0
    def quartetPuzzling(self, steps):
        """Run `steps` quartet-puzzling rounds and return their consensus tree.

        With fewer than four sequences a star tree containing every sequence
        id is returned directly. Otherwise each round shuffles the ids, seeds
        a tree from the optimal topology of the first four ids and inserts
        the remaining ids one by one on the edge reported as shortest by
        tree_utils.findShortestEdge. The trees of all rounds are passed to
        tree_utils.findConsensusTree.
        """
        seq_ids = list(self._sequencesDict.keys())
        if len(seq_ids) < 4:
            star = Tree()
            for seq_id in seq_ids:
                star.add_child(name=seq_id)
            return star

        puzzled_trees = []
        for _ in range(steps):
            shuffle(seq_ids)
            seed_topology = self._optimalQuartets[self.getQuartetID(
                seq_ids[0:4])]["topology"]
            rooted_tree = self.treeFromQuartet(seed_topology)
            # Continue with the first root child and hang the second one below it.
            tree = rooted_tree.children[0]
            tree.add_child(rooted_tree.children[1])

            for pos in range(4, len(seq_ids)):
                incoming = seq_ids[pos]
                tree_utils.initEdgeLengths(tree, 0)

                # Every triplet of already placed ids, extended by the incoming id.
                quartets = []
                placed = seq_ids[0:pos]
                for triplet in combination_utils.combinationsGenerator(placed, 3):
                    triplet.append(incoming)
                    quartets.append(tuple(triplet))

                # Each optimal topology raises the path cost at most once.
                seen_topologies = set()
                for quartet in quartets:
                    optimal_id = self._optimalQuartets[
                        self.getQuartetID(quartet)]["topology_id"]
                    candidate_id = self.getTopologyID(quartet)
                    if candidate_id == optimal_id and candidate_id not in seen_topologies:
                        seen_topologies.add(candidate_id)
                        self.increaseCostOnPath(tree, quartet[0], quartet[1])

                # Split the cheapest edge with a new inner node that carries
                # the incoming leaf plus the subtree that used to hang there.
                cheapest = tree_utils.findShortestEdge(tree)
                joint = Tree()
                joint.add_child(name=incoming)
                subtree = cheapest[1].detach()
                cheapest[0].add_child(joint)
                joint.add_child(subtree)

            tree_utils.initEdgeLengths(tree, 1)
            puzzled_trees.append(tree)

        # find consensus tree
        return tree_utils.findConsensusTree(puzzled_trees)
Ejemplo n.º 24
0
def load_label_parentidx_as_tree(label_parentidx, labels):
    """Assemble a tree from labels and a label -> parent-position mapping.

    Args:
        label_parentidx: mapping from each label to the position (within
            `labels`) of its parent, or -1 when the label hangs directly
            below the root.
        labels: sequence of labels. A parent has to appear before its
            children, because parent nodes are looked up as they are created.

    Returns:
        The root node, named 'root'.
    """
    root = Tree()
    root.name = 'root'
    nodes_by_label = {}
    for label in labels:
        parent_pos = label_parentidx[label]
        attach_to = root if parent_pos == -1 else nodes_by_label[labels[parent_pos]]
        nodes_by_label[label] = attach_to.add_child(name=label)
    return root
Ejemplo n.º 25
0
Archivo: cdt.py Proyecto: Hensonmw/jcvi
    def get_gtr_tree(self):
        """Build the tree described by `self.gtrfile` and store it in `self.gtr_tree`.

        Every tab-separated row is converted to a GTRLine giving a parent, its
        two children and a value `dist`. A child found in `self.gnames` becomes
        a named leaf with branch length `1 - dist`; any other child must be the
        parent of an earlier row and is attached with branch length
        `child_dist - dist`. The node built from the last row is the result.

        Raises:
            AssertionError: if a child is neither in `self.gnames` nor the
                parent of an earlier row.
        """
        from ete2 import Tree

        gnames = dict(self.gnames)
        nodes = {}
        # `with` releases the file handle even when a row fails to parse.
        with open(self.gtrfile) as fp:
            reader = csv.reader(fp, delimiter="\t")
            for g in map(GTRLine._make, reader):
                node = Tree()
                parent_name, parent_dist = g.parent, float(g.dist)
                for child in (g.left_child, g.right_child):
                    if child in gnames:
                        node.add_child(name=gnames[child], dist=1 - parent_dist)
                    else:
                        assert child in nodes, child
                        child_node, child_dist = nodes[child]
                        node.add_child(child_node, dist=child_dist - parent_dist)

                # Remember the node so later rows can attach it as a child.
                nodes[parent_name] = (node, parent_dist)

        self.gtr_tree = node
Ejemplo n.º 26
0
def neighbor_joining(D, tree, internals):
    """Join the nodes listed in `internals` two at a time until two remain.

    Each round picks the pair with the smallest Q value, moves the two
    corresponding nodes of `tree` below a freshly created node and records
    the distances from that new node in `D`.

    Args:
        D: square distance matrix indexed by node id; written in place.
           Presumably a numpy array with spare rows/columns for the new
           nodes -- confirm against the caller.
        tree: tree whose nodes are named after the ids in `internals`.
        internals: numpy array of the ids still to be joined.

    Returns:
        The same `tree` object, modified in place.
    """
    print('neighbor_joining() begin', end=' ', file=sys.stderr)
    remaining = len(internals)
    while remaining > 2:
        sub = D[internals[:, None], internals]
        row_avg = sub.sum(axis=1) / (remaining - 2)

        # Symmetric Q matrix, filled from the upper triangle.
        Q = np.zeros(shape=(remaining, remaining), dtype=np.longdouble)
        for a in xrange(remaining):
            for b in xrange(a + 1, remaining):
                Q[a, b] = sub[a, b] - row_avg[a] - row_avg[b]
                Q[b, a] = Q[a, b]
        # A node must never be paired with itself.
        np.fill_diagonal(Q, np.inf)
        i, j = np.unravel_index(Q.argmin(), (remaining, remaining))
        new_id = len(D) + 2 - remaining

        # Distances from the new node to every node still in play ...
        for k, member in enumerate(internals):
            joined = sub[i, k] + sub[j, k] - sub[i, j]
            D[new_id, member] = joined
            D[member, new_id] = joined
        # ... then overwrite the two entries of the joined pair itself.
        vi = (sub[i, j] + row_avg[i] - row_avg[j]) / 2
        D[new_id, internals[i]] = vi
        D[internals[i], new_id] = vi
        vj = (sub[i, j] + row_avg[j] - row_avg[i]) / 2
        D[new_id, internals[j]] = vj
        D[internals[j], new_id] = vj

        # Re-hang both nodes below the new parent.
        first = tree & str(internals[i])
        second = tree & str(internals[j])
        first.detach()
        second.detach()
        parent = Tree(name=str(new_id))
        parent.add_child(first, dist=int(vi))
        parent.add_child(second, dist=int(vj))
        tree.add_child(parent)

        internals = np.append(np.delete(internals, [i, j]), new_id)
        remaining = len(internals)
        print('.', end='', file=sys.stderr)

    print(' done', file=sys.stderr)
    return tree
Ejemplo n.º 27
0
    def get_gtr_tree(self):
        """Build the tree described by `self.gtrfile` and store it in `self.gtr_tree`.

        Every tab-separated row is converted to a GTRLine giving a parent, its
        two children and a value `dist`. A child found in `self.gnames` becomes
        a named leaf with branch length `1 - dist`; any other child must be the
        parent of an earlier row and is attached with branch length
        `child_dist - dist`. The node built from the last row is the result.

        Raises:
            AssertionError: if a child is neither in `self.gnames` nor the
                parent of an earlier row.
        """
        from ete2 import Tree

        gnames = dict(self.gnames)
        nodes = {}
        # `with` releases the file handle even when a row fails to parse.
        with open(self.gtrfile) as fp:
            reader = csv.reader(fp, delimiter="\t")
            for g in map(GTRLine._make, reader):
                node = Tree()
                parent_name, parent_dist = g.parent, float(g.dist)
                for child in (g.left_child, g.right_child):
                    if child in gnames:
                        node.add_child(name=gnames[child], dist=1 - parent_dist)
                    else:
                        assert child in nodes, child
                        child_node, child_dist = nodes[child]
                        node.add_child(child_node, dist=child_dist - parent_dist)

                # Remember the node so later rows can attach it as a child.
                nodes[parent_name] = (node, parent_dist)

        self.gtr_tree = node
Ejemplo n.º 28
0
def load_label_tree(noffset_parentidx, noffsets):
    """Build a label tree rooted at the 'physical_entity.n.01' synset.

    Args:
        noffset_parentidx: mapping from each noffset to the position (within
            `noffsets`) of its parent, or -1 when it hangs directly below
            the root.
        noffsets: sequence of noffsets. A parent has to appear before its
            children, because parent nodes are looked up as they are created.

    Returns:
        A pair: the result of prune_root() applied to the root, and the
        dictionary mapping every noffset to its node.
    """
    root = Tree()
    root_synset = wn.synset('physical_entity.n.01')
    root.name = root_synset.name()
    root.add_feature('synset', root_synset)

    noffset_node = {}
    for noffset in noffsets:
        parent_pos = noffset_parentidx[noffset]
        attach_to = root if parent_pos == -1 else noffset_node[noffsets[parent_pos]]
        noffset_node[noffset] = attach_to.add_child(name=noffset)

    pruned = prune_root(root)
    return pruned, noffset_node
Ejemplo n.º 29
0
def normalize_ranks(t, ranks):
    """Rebuild `t` as a tree that has exactly one level per entry of `ranks`.

    For every leaf of `t` the ancestors are scanned for the requested ranks;
    a rank without a matching ancestor gets the placeholder name
    'No <rank>' and taxid 'No taxid'. The resulting lineage is inserted into
    the new tree level by level, re-using nodes that already exist. Each
    created node carries the features `rank`, `lineage` and `taxid`.

    Args:
        t: source tree; ancestors are read through `rank`, `scientific_name`
           and `taxid`, leaves additionally through `name`.
        ranks: ordered sequence of rank names, outermost first.

    Returns:
        The new normalized tree.

    Raises:
        RuntimeError: if the parent lineage of a node is missing from the new
            tree or matches more than one node.
    """
    normtree = Tree()
    for leaf in t.get_leaves():
        lineage = {}
        taxids = {}
        ancestor = leaf.up
        while ancestor:
            if ancestor.rank in ranks:
                lineage[ancestor.rank] = ancestor.scientific_name
                taxids[ancestor.rank] = ancestor.taxid
            ancestor = ancestor.up

        # Placeholders for ranks that no ancestor provides.
        for rank in ranks:
            if rank not in lineage:
                lineage[rank] = 'No ' + rank
                taxids[rank] = 'No taxid'

        for i, rank in enumerate(ranks):
            if i == 0:  # superkingdom
                if not normtree.search_nodes(name=lineage[rank], rank=rank):
                    child = normtree.add_child(name=lineage[rank])
                    child.add_feature('rank', rank)
                    child.add_feature('lineage', lineage[rank])
                    child.add_feature('taxid', taxids[rank])
            else:
                # A leaf without a species-level ancestor stands in for its own species.
                if rank == 'species' and lineage[rank] == 'No species':
                    lineage['species'] = leaf.name
                parent_node_lineage = ':'.join([lineage[x] for x in ranks[:i]])
                node_lineage = ':'.join([lineage[x] for x in ranks[:i + 1]])
                parents = normtree.search_nodes(lineage=parent_node_lineage)
                if len(parents) != 1:
                    raise RuntimeError(
                        'expected exactly one node with lineage %r, found %d'
                        % (parent_node_lineage, len(parents)))
                if not normtree.search_nodes(name=lineage[rank], lineage=node_lineage):
                    child = parents[0].add_child(name=lineage[rank])
                    child.add_feature('rank', rank)
                    child.add_feature('lineage', node_lineage)
                    child.add_feature('taxid', taxids[rank])
                    if rank == 'species':
                        child.scientific_name = leaf.scientific_name
                        child.taxid = leaf.taxid
    return normtree
Ejemplo n.º 30
0
def main():
	"""Render the authenticated account's friends as a tree in mytree.png.

	The authenticated account becomes the single child of the tree root and
	every friend a child of that node; getFriends() is then called for each
	friend with its node and the value 0.
	"""
	t = Tree()

	auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
	auth.secure = True
	auth.set_access_token(token_key, secret_key)

	api = tweepy.API(auth)

	# Look the account up once and reuse it instead of asking the API twice.
	me = api.me()
	root = t.add_child(name=me.name)

	for friend in me.friends():
		child = root.add_child(name=getTwitterAccountName(friend))
		getFriends(friend, child, 0)

	t.render("mytree.png", w=183, units="mm")
Ejemplo n.º 31
0
def main(args={}):
  """Draw the genomic surroundings of the genes of interest, one tree leaf per gene cluster.

  Options are taken from `args` when it is non-empty, otherwise parsed from
  the command line with command_line(); either way they end up in the global
  `opt`. With option 'legend' only a legend is drawn and the program exits
  through sys.exit(). Otherwise the function loads the genes of interest
  (-i), the full annotation (-a) and the gene -> family table (-f), removes
  overlapping genes, collects the neighbours of every gene of interest into a
  cluster, optionally drops singlets (-rs), merges clusters that share genes
  (unless -dm), assigns colors, writes the optional report files (-of, -oc,
  -ocf, -ocg) and finally renders the tree to -out or opens the viewer.

  Raises:
    notracebackException: for an output option given without a file name, an
      invalid -cr color scheme, or when the available colors run out.
  """
#########################################################
############ loading options
  global opt
  if not args: opt=command_line(def_opt, help_msg, 'iaf', synonyms=command_line_synonyms, strict=1)
  else:  opt=args
  set_MMlib_var('opt', opt)
  global temp_folder; temp_folder=Folder(random_folder(opt['temp'])); test_writeable_folder(temp_folder, 'temp_folder'); set_MMlib_var('temp_folder', temp_folder)

  write("#=============------ "); write(" show_syntheny.py ", how='reverse,bright'); write(" -------=============#", 1)
  ## basic graphics options
  face_width= opt['w'];   font_size= opt['fs']
  ## defining a function that, given a gene, decide what is printed in the face"""
  if not opt['m']:
    def get_text(g):
      if g.id in geneid2family:      return geneid2family[g.id]+':'+g.id
      else:                          return '-'+':'+g.id
  else:
    face_width=30
    def get_text(g):
      if g.id in geneid2family:      return geneid2family[g.id]
      else:                          return ''
  tree_style=TreeStyle(); tree_style.show_leaf_name=False; tree_style.scale=1; tree_style.show_scale=False
  node_style=NodeStyle(); node_style["size"] = 0 #; node_style["fgcolor"] = "darkred"
  node_style_grey=NodeStyle(); node_style_grey["size"] = 0; node_style_grey["bgcolor"] = "lightgrey"
  tree=Tree(); tree.dist=0.0;  tree.set_style(node_style)

  ############################## legend mode only: start
  if opt['legend']: ### totally different program in this case
    for line_index, line in enumerate(open(opt['legend'])):
      try:
        bkg_color='white' if line_index%2 else 'lightgrey'
        # blank lines are detected on the stripped text: ''.split('\t') is [''], which is truthy
        if not line.strip(): continue
        splt=line.strip().split('\t')
        leaf=tree.add_child(name='', dist=0.0);
        leaf.set_style(node_style) if line_index%2 else leaf.set_style(node_style_grey)
        g=gene(strand='+'); g.color, g.color_outline, g.color_box_bkg, g.color_box_line=[x if x!='None' else None for x in splt[1].split()];
        g.text = replace(splt[2], '\\n', '\n')
        title_face=faces.TextFace(splt[0], fsize=font_size); title_face.margin_left=5;  leaf.add_face(title_face, 0, 'branch-right' ) #title left
        arrow_face=syntheny_view([g],   printed={'boundaries': 0, 'text':1, 'id':0},  pen_size=4, font_size=font_size, width=face_width)[0]; leaf.add_face(arrow_face, 1, 'branch-right' )
        for desc_line_index, desc_line in enumerate(splt[3].split('\\n')):
          desc_face=faces.TextFace(desc_line, fsize=font_size);  desc_face.background.color = bkg_color; leaf.add_face(desc_face,  2, 'branch-right' ) #desc_face.margin_down=3; desc_face.margin_up=3;
      except: printerr('-legend ERROR parsing this line: |n{0}'.format(line), 1); raise
    write("Legend mode: {0} entries found. ".format(len(tree)), 1)
    if opt['out']:    write('--> writing output file: {0}'.format(opt['out']), 1);       tree.render(opt['out'], tree_style=tree_style)
    else:             write('-- opening interactive ETE2 environment (PyQt4) -- ', 1);  tree.show(tree_style=tree_style)
    sys.exit()
  ############################## legend mode only: over

  #### checking input
  input_gff_file=opt['i'];      check_file_presence(input_gff_file, 'input_gff_file', notracebackException )
  annotation_gff_file=opt['a']; check_file_presence(annotation_gff_file, 'annotation_gff_file', notracebackException )
  homology_file=opt['f'];       check_file_presence(homology_file, 'homology_file', notracebackException )

  # printing for pretty out
  write('# Input gff file=      {0:<30} (genes of interest)'.format(input_gff_file), 1)
  write('# Annotation gff file= {0:<30} (all genes)'.format(annotation_gff_file), 1)
  write('# Homology tsv file=   {0:<30} (gene families)'.format(homology_file), 1)
  # the three input options are excluded by exact name; a substring test against 'iaf' would also hide option 'af'
  non_def_options_str=join([ '# -{0}  {1}\n'.format(k, opt[k]) for k in opt  if k in def_opt and def_opt[k] != opt[k] and not k in ('i', 'a', 'f') ], '')
  if non_def_options_str:  write('### Non-default options:\n'+non_def_options_str)
  write('', 1)

  # checking output options
  for x in ['of', 'oc', 'ocf', 'ocg']:
    if opt[x] and opt[x]==1: raise notracebackException("ERROR option -{0} must be provided with an argument (which will be used as output file)!".format(x))

  #######
  ### processing options controlling colors
  colors_already_taken={}  # useful for later, when we compute available_colors
  color_genes_of_interest=[None, None, None, None]
  if opt['ci']:
    for index, color in enumerate( opt['ci'].split(',') ):
      if color=='None': color=None
      color_genes_of_interest[index]=color
    colors_already_taken[ join( map(str, color_genes_of_interest), ',') ]=1

  color_singlets=[None, None, None, None]
  if opt['cs']:
    for index, color in enumerate( opt['cs'].split(',') ):
      if color=='None': color=None
      color_singlets[index]=color
    colors_already_taken[ join( map(str, color_singlets), ',') ]=1

  fam2color={}    ## each color is a list [fill, outline, box_bkg, box_outline]  if not defined, it's None
  if opt['cf']:
    ## load color-family file
    for line in open( opt['cf'] ):
      splt=line.strip().split()
      if splt:  #skipping empty lines
        fam=splt[0];  the_colors=[None, None, None, None]
        for index, item in enumerate(splt[1:]):
          if item=='None': item=None
          the_colors[index]=item
        fam2color[fam] = the_colors
        colors_already_taken[ join( map(str, the_colors), ',') ]=1

  color_file=opt['c'];          check_file_presence(color_file, 'color_file', notracebackException )
  color_scheme=opt['cr'];
  if not color_scheme in [0, 1, 2, 3]: raise notracebackException("ERROR invalid color scheme provided with option -cr ! see -help")
  individual_colors=[line.strip() for line in open(color_file) if line.strip()];
  if     color_scheme==0:  available_colors = [[a,None,None,None] for a in individual_colors if not a+',None,None,None' in colors_already_taken]
  elif   color_scheme==1:  available_colors = [[b,   a,None,None] for a in individual_colors for b in individual_colors if not b+','+a+',None,None' in colors_already_taken]
  elif   color_scheme==2:  available_colors = [[c,   b,   a,None] for a in individual_colors for b in individual_colors for c in individual_colors if not (a==b==c) and not c+','+b+','+a+',None' in colors_already_taken]
  elif   color_scheme==3:  available_colors = [[d,   c,   b,   a] for a in individual_colors for b in individual_colors for c in individual_colors for d in individual_colors if not (b==c==d) and not d+','+c+','+b+','+a in colors_already_taken]
  #write('available colors: {0}'.format(len(available_colors)), 1)

  if opt['rc']:   random.shuffle(available_colors)

  ######
  ## loading gff input files    # genes of interest
  # NOTE(review): -if / -af are evaluated with eval(); they must only ever come from a trusted user
  input_get_id_function=None;
  if opt['if']: input_get_id_function=eval('lambda x:'+opt['if'])
  write('Loading genes of interest from {0:<30} ... '.format(input_gff_file))
  genes_of_interest=load_all_genes(input_gff_file, tag='*', get_id=input_get_id_function, is_sorted=True)
  for g_index, g in enumerate(genes_of_interest): g.is_of_interest=g_index+1   ### keeping this as a number so later we can sort output in the same order as input
  write('done. Genes: {0}'.format(len(genes_of_interest)), 1)
                                # gene in global annotation
  annotation_get_id_function=None;
  if opt['af']: annotation_get_id_function=eval('lambda x:'+opt['af'])
  annotation_tag=opt['at']
  write('Loading annotated genes from   {0:<30} ... '.format(annotation_gff_file))
  annotated_genes=load_all_genes(annotation_gff_file, tag=annotation_tag, get_id=annotation_get_id_function)
  for a in annotated_genes: a.is_of_interest=False
  write('done. Genes: {0}'.format(len(annotated_genes)), 1)
  ######

  ## load homology file
  geneid2family={}; families_dict={}
  write('Loading homology families from {0:<30} ... '.format(homology_file))
  for line in open(homology_file):
    strp=line.strip()
    # skipping empty lines; testing the stripped text because ''.split('\t') is [''], which is truthy
    if strp:  geneid, family = strp.split('\t'); geneid2family[geneid]=family; families_dict[family]=0
  write('done.', 1)

  ## print some stats
  for g in annotated_genes:
    if g.id in geneid2family:  families_dict[geneid2family[g.id]]+=1
  n_fam_represented=0; n_genes_with_family=0
  for fam in families_dict:
    if families_dict[fam]>0: n_fam_represented+=1; n_genes_with_family+=families_dict[fam]
  write('N of families: {0} ; {1} families have 1 or more gene(s) found in annotation.\nA total of {2} genes have a family assigned.\n'.format(len(families_dict),n_fam_represented, n_genes_with_family ), 1)
  del families_dict;  #saving memory (almost a joke)
  family2genes_displayed={}      ### later we'll modify geneid2family to avoid displaying useless families

  ## families or genes in the annotation to be ignored
  fams_to_ignore={}
  if opt['rf']:
    check_file_presence(opt['rf'], '-rf file')
    for line in open(opt['rf']):
      strp=line.strip()
      if strp:  fams_to_ignore[strp]=True

  ##############################  start doing things!
  ## finding overlaps
  def scoring_function_for_overlaps(g):    return int (g.is_of_interest)* 10000000 + g.length()
  removed_overlapping_genes=[]
  non_red_genes = remove_overlapping_gene_clusters( genes_of_interest + annotated_genes,  scoring=scoring_function_for_overlaps, phase=True, strand=True, out_removed_genes=removed_overlapping_genes, remember_overlaps=True )
  ### getting all discarded -> kept  relationship, and back
  for g in removed_overlapping_genes:
    if not hasattr( g.overlapping, 'discarded'): g.overlapping.discarded=[]
    g.overlapping.discarded.append( g )
  for g in genes_of_interest:
    if hasattr( g, 'discarded'): #len(g.discarded)>1:
      for d in g.discarded:   write(' Gene: {0:^25} removed overlapping gene: {1}'.format(g.id, d.id), 1)

  non_red_genes.sort(  cmp=order_genes_for_chr_pos  )   #sorting again... not optimized but easy
  ######

  ##############################
  ## building gene clusters to be displayed
  gene_clusters=[]   # list of lists of genes; populating this while parsing the sorted list of genes and looking for the genes of interest.
  max_distance = opt['l']
  index=0
  while index < len(non_red_genes):
    if non_red_genes[index].is_of_interest:
      g=non_red_genes[index]
      verbose('*** Cluster of {0}'.format(g.id), 1)
      gc=gene_cluster();  gc.append(g); gc.link_to_gene(g)

      ## parsing back CAREFUL ASSUMING THERE ARE NO NESTED STRUCTURES WITH EXONS
      other_index=index-1
      while other_index >= 0 and non_red_genes[other_index].chromosome == g.chromosome and  \
        (  (not opt['n'] and abs( g.boundaries()[0] - non_red_genes[other_index].boundaries()[1] ) <= max_distance )  or \
           (    opt['n'] and len(gc)-1 <= opt['n']    )    ):
        if not  non_red_genes[other_index].id in fams_to_ignore and not \
           (non_red_genes[other_index].id in geneid2family and geneid2family[non_red_genes[other_index].id] in fams_to_ignore):
          gc.insert(0, non_red_genes[other_index])
        other_index-=1
      n_genes_added_back=len(gc)-1

      #parsing forward
      other_index=index+1
      while other_index < len(non_red_genes) and non_red_genes[other_index].chromosome == g.chromosome and \
        (   (not opt['n'] and abs( non_red_genes[other_index].boundaries()[0] - g.boundaries()[1] ) <= max_distance )  or \
            (    opt['n'] and len(gc)-1-n_genes_added_back <= opt['n']    )    ):
        if not  non_red_genes[other_index].id in fams_to_ignore and not \
           (non_red_genes[other_index].id in geneid2family and geneid2family[non_red_genes[other_index].id] in fams_to_ignore):
          gc.append(non_red_genes[other_index])
        other_index+=1

      for i in gc:       verbose( i.gff(), 1)
      gene_clusters.append(gc)

    index+=1

  ## populating family2genes_displayed to compress family output
  for gc in gene_clusters:
    for g in gc:
      if g.id in geneid2family:
        fam=geneid2family[g.id]
        if not fam in family2genes_displayed: family2genes_displayed[fam]={}
        family2genes_displayed[fam][g.id]=True

  if opt['rs']:
    ## removing singlets
    n_singlets_removed=0
    for gc in gene_clusters:
      len_gc=len(gc)
      for i in range(len_gc):
        g_index= len_gc-i-1  #parsing in reverse order to make .pop() work
        g= gc[g_index]
        if g.is_of_interest: continue
        fam=geneid2family.get(g.id)    # None when the gene belongs to no family
        # .get() / .pop() with defaults: the gene may have no family at all, or its family entry may
        # already be gone because the same gene was removed from a previous (overlapping) cluster
        if fam is None or len(family2genes_displayed.get(fam, ()))<=1:
          gc.pop(g_index); n_singlets_removed+=1
          family2genes_displayed.pop(fam, None)
    if n_singlets_removed: write('Option -rs: {0} singlets were removed! '.format(n_singlets_removed), 1)

  #### merging clusters that share at least one gene
  if not opt['dm']:
    ## since they are sorted, a cluster can share genes only with its previous or next cluster. Also, if we scan forward, we just check if the last gene in a cluster is contained in the next one
    gc_index=0
    while gc_index+1<len(gene_clusters):   #+1 since, if it's the last one, it's not interesting
      current_gc=gene_clusters[gc_index];    next_gc=gene_clusters[gc_index+1]
      merged=False
      if current_gc[0].chromosome == next_gc[0].chromosome: # and current_gc[-1] in next_gc:     --> in practice this is what we check. but let's do it more efficiently
        try:
          index_pos= next_gc.index(current_gc[-1])  # this cause an exception if not there
          merged=True             #### Yes we're officially merging
          write('Merging the surrounds of {0:>25} and {1:<25}'.format(current_gc.ref_gene.id,  next_gc.ref_gene.id), 1)
          for gc in next_gc[index_pos+1:]:   current_gc.append( gc )
          possible_ref_genes=[]
          for g_index, g in enumerate(current_gc):
            if g.is_of_interest: g.g_index=g_index; possible_ref_genes.append(g)
          # the gene of interest closest to the middle becomes the reference of the merged cluster
          middle_point = (len(current_gc)-1)/2.0
          best_ref_gene = min (possible_ref_genes, key= lambda x:abs(x.g_index-middle_point))
          current_gc.link_to_gene(best_ref_gene)
          gene_clusters.pop(gc_index+1)   #removing next_gc
        except ValueError: pass
      if not merged:    gc_index+=1

  geneid2color={}
  ### parsing each single gene in each cluster. deciding COLORS
  for gc in gene_clusters:
    for g in gc:
      if g.id in fam2color:          geneid2color[g.id]=fam2color[g.id]    # color was specified in -cf using geneId
      elif g.id in geneid2family:
        ## this belongs to a family
        fam = geneid2family[g.id]
        if len(family2genes_displayed[fam]) == 1:
          geneid2color[g.id]= color_singlets   ## singlet being only representative for its family
        else:
          if not fam in fam2color:     # not yet assigned to this family
            try:                fam2color[fam]=available_colors.pop(0)
            except IndexError:  raise notracebackException("ERROR not enough colors are available to display this! Increase the number of colors in the -c file or change the color scheme with -cr")
          geneid2color[g.id]= fam2color[fam]
      else:    geneid2color[g.id]=color_singlets     ## singlet that does not belong to any family
      if g.is_of_interest:
        geneid2color[g.id]= list(geneid2color[g.id])  ## copying list or otherwise we modify in place the color
        for index, color in enumerate(color_genes_of_interest):
          if not color is None:  geneid2color[g.id][index]=color

  ### now sorting gene_clusters so they are in the same order as the input file
  gene_clusters.sort(key=lambda x:x.ref_gene.is_of_interest)

  # report files are written inside `with` so they are flushed and closed even if a line fails
  if opt['of']:
    ### producing an output file with a line for each family
    with open(opt['of'], 'w') as fh:
      for fam in family2genes_displayed: fh.write( fam+'\t'+join( [gid for gid in family2genes_displayed[fam]], '\t')+'\n' )
  if opt['oc']:
    ### producing an output file with a line for each gene cluster
    with open(opt['oc'], 'w') as fh:
      for gc in gene_clusters: fh.write( join( [g.id for g in gc], '\t')+'\n' )
  if opt['ocf']:
    with open(opt['ocf'], 'w') as fh:
      for fam in fam2color:
        if not fam in family2genes_displayed: continue #this is to skip cases in which -cf was provided with geneId instead of fam
        fh.write( fam+'\t'+join(map(str, fam2color[fam]), '\t')+'\n' )
  if opt['ocg']:
    with open(opt['ocg'], 'w') as fh:
      for geneid in geneid2color: fh.write( geneid+'\t'+join(map(str, geneid2color[geneid]), '\t')+'\n' )


  ### preparing ete2 objects
  max_n_genes_in_a_cluster= max ([len(gc) for gc in gene_clusters])
  for gc in gene_clusters:
    name_displayed= gc[0].chromosome + ' : ' +gc.ref_gene.id
    name_displayed+=join([ '\n'+ ' & '+g.id    for g in gc if g.is_of_interest and g != gc.ref_gene], '') #adding a line for other genes of interested merged in this cluster
    leaf=tree.add_child(name=name_displayed, dist=10);   leaf.set_style(node_style)
    leaf_name_face= faces.TextFace(leaf.name, fsize=font_size);     leaf_name_face.margin_left=5;  leaf_name_face.margin_right=1;
    leaf.add_face( leaf_name_face, 0, 'aligned' )
    for g in gc:
      g.color, g.color_outline, g.color_box_bkg, g.color_box_line  = geneid2color[g.id]
      g.text = get_text(g)

    reverse_syntheny_view =  not opt['ks'] and gc.ref_gene.strand=='-'
    # modifying gc inplace to add whitespacers to keep it sortof centered
    while len(gc)<max_n_genes_in_a_cluster:
      if (len(gc) + int(reverse_syntheny_view))%2 :   gc.append('')
      else:           gc.insert(0, '')
    face_list=     syntheny_view(  gc,   printed={'boundaries': int(not opt['m']), 'text':1, 'id':0},  pen_size=4, font_size=font_size, width=face_width, reverse=reverse_syntheny_view)
    for col_index, the_face in enumerate(face_list):   leaf.add_face( the_face, col_index+1, 'aligned' )

  if opt['out']:
    write('--> writing output file: {0}'.format(opt['out']), 1)
    tree.render(opt['out'], tree_style=tree_style)
  else:
    write('-- opening interactive ETE2 environment (PyQt4) --- ', 1)
    tree.show(tree_style=tree_style)

  write('#====----      execution finished, exiting         -----====#', 1)
Ejemplo n.º 32
0
def init_star_tree(n):
    """Return a star tree: one root with `n` children named '0' .. str(n - 1)."""
    star = Tree()
    leaf_id = 0
    while leaf_id < n:
        star.add_child(name=str(leaf_id))
        leaf_id += 1
    return star
import random
import sys
sys.path.insert(0, "../")
from ete2 import Tree, TreeStyle, NodeStyle, PhyloTree
from ete2.treeview.faces import *
from ete2.treeview.main import random_color, _NODE_TYPE_CHECKER, FACE_POSITIONS

sys.path.insert(0, "../examples/treeview")
import face_grid, bubble_map, item_faces, node_style, node_background, face_positions

# Container tree for the demo: every child added below receives one example
# tree, wrapped in a TreeFace, in column 0 of the "aligned" position.
main_tree = Tree()
main_tree.dist = 0

# Example from the face_grid module.
t, ts = face_grid.get_example_tree()
t_grid = TreeFace(t, ts)
n = main_tree.add_child()
n.add_face(t_grid, 0, "aligned")

# Example from the bubble_map module.
t, ts = bubble_map.get_example_tree()
t_bubble = TreeFace(t, ts)
n = main_tree.add_child()
n.add_face(t_bubble, 0, "aligned")

# Example from the item_faces module.
t, ts = item_faces.get_example_tree()
t_items = TreeFace(t, ts)
n = main_tree.add_child()
n.add_face(t_items, 0, "aligned")

# Example from the node_style module.
# NOTE(review): unlike the three examples above, no add_face() call follows
# this add_child() in the visible code -- the script looks cut off here.
t, ts = node_style.get_example_tree()
t_nodest = TreeFace(t, ts)
n = main_tree.add_child()
Ejemplo n.º 34
0
def consensus(trees, weights=[], lim=0):
    '''
    Return the weighted consensus of `trees` as a topology-only newick string.

    Every input is parsed with Tree() and rooted on a common outgroup (the
    second leaf name of the first tree). Each clade, identified by the sorted
    comma-joined names of its leaves, accumulates the weight of every tree it
    occurs in. Clades whose weight reaches the support threshold are then
    assembled into a new tree, which is printed and returned through
    write(format=9).

    Args:
        trees: trees in any form accepted by Tree(); all need the same
            number of leaves.
        weights: one weight per tree; an empty list means weight 1 each.
        lim: minimum support for a clade; raised to the weight of the
            (2 * n_leaves - 3)-th best supported clade when that is larger.

    Exits through exit() when the trees differ in number of leaves.

    TODO: fix glup
    '''

    if weights == []: weights = [1] * len(trees)
    dic = {}
    outgroup_name = Tree(trees[0]).get_leaf_names()[1]
    tlen = 0
    for (tree, weight) in zip(trees, weights):
        tree = Tree(tree)
        if tlen == 0: tlen = len(tree)
        elif len(tree) != tlen: exit('ERROR: trees with different length')
        outgroup = tree.search_nodes(name=outgroup_name)[0]
        tree.set_outgroup(outgroup)
        dad = outgroup.get_sisters()[0]
        for node in dad.traverse():
            if node.is_root(): continue
            cluster = ','.join(sorted(node.get_leaf_names()))
            if cluster in dic:
                dic[cluster] += weight
            else:
                dic[cluster] = weight

    # [weight, cluster] pairs, largest clades first.
    by_size = sorted([(len(c.split(',')), c, dic[c]) for c in dic],
                     reverse=True)
    sorted_nodes = [[w, c] for (_, c, w) in by_size]
    # A tree on tlen leaves holds at most 2 * tlen - 3 clades: never accept
    # less support than the weakest clade within that budget.
    cutoff = sorted(sorted_nodes, reverse=True)[:tlen * 2 - 3][-1][0]
    if lim < cutoff:
        lim = cutoff
    sorted_nodes = [c for (w, c) in sorted_nodes if w >= lim]
    if len(sorted_nodes) > tlen * 2 - 3:
        stderr.write('WARNING: two nodes with same support, will remove: ' +
                     sorted_nodes[-1] + '\n')
        sorted_nodes = sorted_nodes[:-1]
    cons_tree = Tree()
    cons_tree.add_child(name=outgroup_name)
    node = cons_tree.add_child(name='NoName')
    node.add_feature('childrens',
                     set(sorted_nodes.pop(0).split(','))
                     - set([outgroup_name]))
    while len(sorted_nodes) > 0:
        for name in sorted_nodes:
            if not name in sorted_nodes: continue
            for node in cons_tree.traverse(strategy='postorder'):
                if node.is_root(): continue
                # compare by value: identity against a string literal only
                # holds by accident of interning
                if node.name != 'NoName': continue
                if len(node.childrens & set(name.split(','))) == 0:
                    continue
                # check if ther is better solution in one of the child
                for rest in sorted_nodes:
                    if len(set(rest.split(','))) < \
                       len(set(name.split(','))):
                        continue
                    if len(set(rest.split(',')) & set(name.split(','))) > 0:
                        name = rest
                weight = dic[name]
                children = set(name.split(','))
                if len(children) == 1:
                    node.add_child(name=name)
                else:
                    n = node.add_child(name='NoName')
                    n.add_feature('childrens', children)
                    n.support = weight
                break
            sorted_nodes.pop(sorted_nodes.index(name))
            # whatever is left of the parent's leaves forms the sister clade
            sister = node.childrens - children
            name = ','.join(sorted(list(sister)))
            if not name in sorted_nodes:
                continue
            weight = dic[name]
            if len(sister) == 1:
                node.add_child(name=name)
            else:
                n = node.add_child(name='NoName')
                n.add_feature('childrens', sister)
                n.support = weight
            sorted_nodes.pop(sorted_nodes.index(name))
            break
    print(cons_tree)

    return cons_tree.write(format=9)
Ejemplo n.º 35
0
def findConsensusTree(trees, weights=None, lim=0):
    """Build a consensus tree from the leaf clusters shared by several trees.

    Each input tree is re-rooted (in place) on a common outgroup, taken as
    the second leaf name of the first tree.  For every non-root node at or
    below the outgroup's sister, the sorted, comma-joined leaf names form a
    "cluster" key whose support is the summed weight of the trees that
    contain it.  The retained clusters are then nested, largest first, into
    a new tree.

    Parameters:
        trees   -- sequence of tree objects with the same ``len()``; each
                   one is modified by ``set_outgroup``.
        weights -- one weight per tree; every tree counts as 1 when this is
                   omitted or empty.
        lim     -- minimum support for a cluster to be kept.  It is raised
                   to the support of the last of the ``2 * len(tree) - 3``
                   best-supported clusters when that value is larger.

    Returns the consensus ``Tree``.  Internal nodes are named 'NoName';
    the ones added while nesting clusters carry the cluster's summed weight
    in ``support``.

    Raises SystemExit if the trees do not all have the same ``len()``.
    """
    if weights is None or len(weights) == 0:
        weights = [1] * len(trees)
    dic = {}
    outgroup_name = trees[0].get_leaf_names()[1]
    tlen = 0
    for (tree, weight) in zip(trees, weights):
        if tlen == 0:
            tlen = len(tree)
        elif len(tree) != tlen:
            # Same effect as the interactive-only ``exit()`` helper, without
            # depending on the ``site`` module being loaded.
            raise SystemExit('ERROR: trees with different length')
        outgroup = tree.search_nodes(name=outgroup_name)[0]
        tree.set_outgroup(outgroup)
        dad = outgroup.get_sisters()[0]
        for node in dad.traverse():
            if node.is_root():
                continue
            cluster = ','.join(sorted(node.get_leaf_names()))
            dic[cluster] = dic.get(cluster, 0) + weight

    # Largest clusters first (ties broken by name, descending); each entry
    # becomes [support, cluster].  Real lists are built so the code does not
    # depend on map()/filter() returning lists.
    by_size = sorted(((len(c.split(',')), c, dic[c]) for c in dic),
                     reverse=True)
    sorted_nodes = [[support, c] for (_size, c, support) in by_size]
    max_nodes = tlen * 2 - 3
    # Support of the weakest cluster among the max_nodes best-supported ones.
    cutoff = sorted(sorted_nodes, reverse=True)[:max_nodes][-1][0]
    if lim < cutoff:
        lim = cutoff
    sorted_nodes = [c for (support, c) in sorted_nodes if support >= lim]
    if len(sorted_nodes) > max_nodes:
        stderr.write('WARNING: two nodes with same support, will remove: ' +
                     sorted_nodes[-1] + '\n')
        sorted_nodes = sorted_nodes[:-1]

    cons_tree = Tree()
    cons_tree.add_child(name=outgroup_name)
    node = cons_tree.add_child(name='NoName')
    # 'childrens' holds the leaf names that still have to be placed below
    # a 'NoName' node.
    node.add_feature('childrens',
                     set(sorted_nodes.pop(0).split(','))
                     - set([outgroup_name]))
    while len(sorted_nodes) > 0:
        for name in sorted_nodes:
            if not name in sorted_nodes:
                continue
            for node in cons_tree.traverse(strategy='postorder'):
                if node.is_root():
                    continue
                # Compare by value: ``is not`` against a string literal only
                # works when both strings happen to be the same object.
                if node.name != 'NoName':
                    continue
                if len(node.childrens & set(name.split(','))) == 0:
                    continue
                # check if there is a better solution in one of the children
                for rest in sorted_nodes:
                    if len(set(rest.split(','))) < \
                       len(set(name.split(','))):
                        continue
                    if len(set(rest.split(',')) & set(name.split(','))) > 0:
                        name = rest
                weight = dic[name]
                children = set(name.split(','))
                if len(children) == 1:
                    node.add_child(name=name)
                else:
                    n = node.add_child(name='NoName')
                    n.add_feature('childrens', children)
                    n.support = weight
                break
            sorted_nodes.remove(name)
            # Whatever is left under ``node`` forms the sister group; attach
            # it too when it is itself one of the retained clusters.
            sister = node.childrens - children
            name = ','.join(sorted(list(sister)))
            if not name in sorted_nodes:
                continue
            weight = dic[name]
            if len(sister) == 1:
                node.add_child(name=name)
            else:
                n = node.add_child(name='NoName')
                n.add_feature('childrens', sister)
                n.support = weight
            sorted_nodes.remove(name)
            break

    return cons_tree
def main():
    """Select representative OTUs per clade and print their names.

    Uses the module-level ``options`` and ``args``: ``args[0]`` is the
    taxonomy file, ``args[1]`` a whitespace-separated "name depth" table and
    ``args[2]`` a FASTA file.  A taxonomy tree is built, pruned to leaves
    with positive depth, reduced to clades whose best depth reaches the
    threshold interpolated at ``options.Npercent``, and finally to leaves
    that are less similar than ``options.Tsim`` to the ones already chosen.
    Leaf names are printed to stdout; the tree is rendered to
    ``options.png`` when that option is set.
    """
    global options, args

    def progress(step):
        # One timestamped line per stage on stderr, verbose mode only.
        if options.verbose:
            sys.stderr.write("[ %s ] %s\n" % (time.asctime(), step))

    def keep_with_ancestors(keep, leaf):
        # Retain a leaf together with every node on its path to the root.
        keep.add(leaf)
        keep.update(Set(leaf.get_ancestors()))

    # --- taxonomy file -----------------------------------------------------
    progress("parse taxonomy file")
    tax = get_gg_taxonomy(args[0])

    # --- taxonomy tree -----------------------------------------------------
    progress("build taxonomy tree")
    tree = Tree()
    tree_node = {}
    for otu in tax:
        kingdom = tax[otu]['kingdom']
        if not tree.search_nodes(name=kingdom):
            tree_node[kingdom] = tree.add_child(name=kingdom)

    ranks = ['phylum', 'class', 'order', 'family', 'genus', 'species']
    for otu in tax:
        lineage = tax[otu]
        parent_key = lineage['kingdom']
        for level in ranks:
            label = lineage[level]
            if len(label) <= 3:
                # Rank label too short to count as a name: hang the OTU
                # under the deepest named rank and stop descending.
                tree_node[otu] = tree_node[parent_key].add_child(name=otu)
                break
            child_key = parent_key + "_" + label
            if child_key not in tree_node:
                tree_node[child_key] = \
                    tree_node[parent_key].add_child(name=label)
            parent_key = child_key
            if level == 'species':
                tree_node[otu] = tree_node[parent_key].add_child(name=otu)

    # --- depth table -------------------------------------------------------
    progress("parse depth file")
    depth = {}
    for leaf in tree.get_leaves():
        depth[leaf.name] = 0.0
    with open(args[1]) as depth_file:
        for record in depth_file:
            fields = record.split()
            depth[fields[0]] = float(fields[1])

    # --- sequences ---------------------------------------------------------
    from skbio.parse.sequences import parse_fasta
    progress("parse fasta file")
    fasta = {}
    for seq_id, seq in parse_fasta(args[2]):
        fasta[seq_id] = seq

    # --- first pruning: drop leaves without depth or clade label -----------
    retain_nodes = Set()
    for leaf in tree.iter_leaves():
        if depth[leaf.name] > 0 and len(tax[leaf.name][options.clade]) > 3:
            keep_with_ancestors(retain_nodes, leaf)
    tree.prune(retain_nodes)

    # --- best depth per clade ----------------------------------------------
    clade_prefix = options.clade[0]
    clade_depth = {}
    for node in tree.traverse():
        if not node.name.startswith(clade_prefix):
            continue
        best = 0
        for leaf in node.get_leaves():
            if best < depth[leaf.name]:
                best = depth[leaf.name]
        clade_depth[node.name] = best

    # --- depth threshold ---------------------------------------------------
    X = np.array(sorted(depth.values(), reverse=True))
    rX = X.cumsum()*100/X.sum()
    T = np.interp(options.Npercent, rX, X)

    # --- clades reaching the threshold -------------------------------------
    clade_retain = Set()
    for clade_name, clade_best in clade_depth.items():
        if clade_best >= T:
            clade_retain.add(clade_name)

    # --- second pruning: keep only leaves of retained clades ---------------
    retain_nodes = Set()
    for leaf in tree.get_leaves():
        if tax[leaf.name][options.clade] in clade_retain:
            keep_with_ancestors(retain_nodes, leaf)
    tree.prune(retain_nodes)

    # --- representative OTUs per clade -------------------------------------
    retain_nodes = Set()
    for taxon in clade_retain:
        subtree = tree & taxon
        leaf_depth = {leaf: depth[leaf.name] for leaf in subtree.get_leaves()}
        ranked = sorted(leaf_depth.items(), key=operator.itemgetter(1),
                        reverse=True)
        taxon_otus = Set()
        for candidate, value in ranked:
            if value < T:
                continue
            if taxon_otus:
                # Skip candidates too similar to an OTU chosen earlier.
                max_sim = 0
                for chosen in taxon_otus:
                    sim = ssw_similarity(fasta[candidate.name],
                                         fasta[chosen.name])
                    if max_sim < sim:
                        max_sim = sim
                if not max_sim < options.Tsim:
                    continue
            taxon_otus.add(candidate)
        retain_nodes.update(taxon_otus)
        for chosen in taxon_otus:
            retain_nodes.update(Set(chosen.get_ancestors()))

    tree.prune(retain_nodes)

    # --- output ------------------------------------------------------------
    for leaf in tree.get_leaves():
        print(leaf.name)

    for leaf in tree.get_leaves():
        leaf.add_feature("depth", depth[leaf.name])

    if options.png:
        ts = TreeStyle()
        ts.layout_fn = tree_layout
        tree.render(options.png, dpi=1024, tree_style=ts)
Ejemplo n.º 37
0
print D

'''
import matplotlib.pyplot as plt
import numpy as np
from ete2 import Tree

# Read the comma/newline separated tokens of output.txt; the ``with`` block
# closes the file handle, which the bare open().read() chain never did.
with open('output.txt') as handle:
    data = handle.read().replace(',', ' ').replace('\n', ' ')
x = data.split()
ParentChild = np.array(x).astype('str')
# Floor division keeps the row count an int for np.reshape even where ``/``
# is true division.
y = len(ParentChild) // 2
# One (first, second) token pair per row.
ParentChild1 = np.reshape(ParentChild, (y, 2))
#print ParentChild1

t = Tree()  # Creates an empty tree
A = t.add_child(name="A")

# Hard-coded hierarchy below A.
B = A.add_child(name="B")
C = A.add_child(name="C")
D = C.add_child(name="D")
E = A.add_child(name="E")
F = A.add_child(name="F")
G = A.add_child(name="G")
H = F.add_child(name="H")  #6,8
I = A.add_child(name="I")
J = A.add_child(name="J")  #10
K = D.add_child(name="K")
L = D.add_child(name="L")
# NOTE(review): this node is bound to M but named "L", duplicating the node
# above - looks like a copy/paste slip; confirm before renaming.
M = A.add_child(name="L")  #13
N = D.add_child(name="N")  #4,11
O = D.add_child(name="O")  #4,12
Ejemplo n.º 38
0
    def tree(self):
        """Build a tree by repeatedly joining the nearest pair of taxa.

        Starts from a star tree with one leaf per column name of
        ``self._distMatrix``.  While more than two names remain, the pair
        returned by ``getNearestNeigbors()`` is moved under a new internal
        node ``X<k>``, and the distance matrix is updated through
        ``removeData``/``appendData``.  The last two nodes are then joined
        directly.  The branch-length formulas have the shape used by
        neighbor joining, assuming ``getSeparation`` returns the averaged
        row sum (TODO confirm).

        Returns the node that ends up as root of the joined tree.  The
        distance matrix is modified in the process.
        """
        L = self._distMatrix.columnNames
        tree = Tree()
        tree.name = "root"
        tree.dist = 0
        for seq in L:
            tree.add_child(name=seq, dist=0)

        iter_count = 1
        while len(L) > 2:
            nearest_nbs = self._distMatrix.getNearestNeigbors()
            node_i = tree.search_nodes(name=nearest_nbs[0])[0]
            node_j = tree.search_nodes(name=nearest_nbs[1])[0]
            L.remove(nearest_nbs[0])
            L.remove(nearest_nbs[1])

            # New internal node that becomes the parent of the joined pair.
            node_k = Tree()
            node_k.dist = 0
            node_k.name = "X" + str(iter_count)
            d_ij = self._distMatrix.getDistance(node_i.name, node_j.name)
            assert d_ij > 0
            d_ik = 0.5 * d_ij + 0.5 * (self._distMatrix.getSeparation(node_i.name) - self._distMatrix.getSeparation(node_j.name))
            d_jk = 0.5 * d_ij + 0.5 * (self._distMatrix.getSeparation(node_j.name) - self._distMatrix.getSeparation(node_i.name))

            tree.remove_child(node_i)
            tree.remove_child(node_j)
            node_k.add_child(node_i, dist=d_ik)
            node_k.add_child(node_j, dist=d_jk)
            tree.add_child(node_k)

            # Distance from the new node to every remaining taxon.
            d_km = []
            for node_m in L:
                d_mk = 0.5 * (self._distMatrix.getDistance(node_i.name, node_m) + self._distMatrix.getDistance(node_j.name, node_m) - d_ij)
                # Check the value just computed; the former ``d_km > 0``
                # compared the whole list with 0, which is always true on
                # Python 2 and a TypeError on Python 3.
                assert d_mk > 0
                d_km.append(d_mk)

            self._distMatrix.removeData((node_i.name, node_j.name))
            self._distMatrix.appendData(d_km, node_k.name)

            iter_count+=1
            L = self._distMatrix.columnNames

        # Join the two remaining nodes: a leaf (if any) is hung under the
        # internal node, which becomes the new root.
        last_nodes = tree.get_children()
        d_ij = self._distMatrix.getDistance(last_nodes[0].name, last_nodes[1].name)
        leaf = None
        new_root = None
        for node in last_nodes:
            if node.is_leaf():
                node.dist = d_ij
                leaf = node.detach()
            else:
                new_root = node.detach()
        if not leaf:
            # Both remaining nodes are internal: attach the first one under
            # the second.
            leaf = last_nodes[0]
            leaf.dist = d_ij
        new_root.add_child(leaf)

        return new_root

    ##
    # @var _distMatrix
    # the distance matrix in more or less arbitrary form
    # @var _names
    # taxa identification strings
    # @var _alignment
    # multiple sequence alignment
Ejemplo n.º 39
0
"""
import matplotlib.pyplot as plt
import numpy as np
from ete2 import Tree

# Read the comma/newline separated tokens of output.txt; the ``with`` block
# closes the file handle, which the bare open().read() chain never did.
with open("output.txt") as handle:
    data = handle.read().replace(",", " ").replace("\n", " ")
x = data.split()
ParentChild = np.array(x).astype("str")
# Floor division keeps the row count an int for np.reshape even where ``/``
# is true division.
y = len(ParentChild) // 2
# One (first, second) token pair per row.
ParentChild1 = np.reshape(ParentChild, (y, 2))
# print ParentChild1


t = Tree()  # Creates an empty tree
A = t.add_child(name="A")

# Hard-coded hierarchy below A.
B = A.add_child(name="B")
C = A.add_child(name="C")
D = C.add_child(name="D")
E = A.add_child(name="E")
F = A.add_child(name="F")
G = A.add_child(name="G")
H = F.add_child(name="H")  # 6,8
I = A.add_child(name="I")
J = A.add_child(name="J")  # 10
K = D.add_child(name="K")
L = D.add_child(name="L")
# NOTE(review): this node is bound to M but named "L", duplicating the node
# above - looks like a copy/paste slip; confirm before renaming.
M = A.add_child(name="L")  # 13
N = D.add_child(name="N")  # 4,11
O = D.add_child(name="O")  # 4,12
Ejemplo n.º 40
0
	if( randint(0,5) ):
		a = node.add_child(name = randint(1, 10000))
		addChild(a,n,level+1)
	if( randint(0,5) ):
		a = node.add_child(name = randint(1, 10000))
		level = level + 1
		addChild(a,n,level)





# Give the root a random label, then grow two random subtrees below it.
t.name = root = randint(1, 10000)
n = 10
for _ in range(2):
	a = t.add_child(name = randint(1, 10000))
	addChild(a,n,2)

print(t.get_ascii(show_internal=True))

# With exactly two top-level branches, report the size of each one.
p = t.get_children()
if len(p) == 2:
	print("two")
	for branch in p:
		print(len(branch.get_descendants()))
Ejemplo n.º 41
0
class SiteSpider:
    """Recursive crawler that records the links of one site in a tree.

    Pages are loaded through ``driver`` (the calls used here are ``get``,
    ``current_url``, ``find_elements_by_tag_name`` and ``window_handles``).
    Every link found on a page becomes a child node of that page; only
    links on the same domain as ``target_url`` that have not been visited
    are followed.

    Parameters:
        driver     -- browser driver used to load pages.
        target_url -- start page; also defines the domain to stay on.
        depth      -- number of leading path segments compared by
                      ``_url_same``; negative means whole-URL equality.
                      Also passed to ``UrlCache``.
        delay      -- seconds to sleep after each page load.
        mitm       -- when true, a signal URL on 127.0.0.1:8080 is requested
                      before every page.
    """

    def __init__(self, driver, target_url, depth=-1, delay=5, mitm=False):
        self.driver = driver
        self.target_url = target_url
        self.t = Tree()
        self.root = self.t.add_child(name=target_url)
        self.root.add_features(path=target_url, advance=True)
        self.depth = depth
        self.delay = delay
        self.subscribers = []
        self.url_cache = UrlCache(self.depth)
        self.mitm = mitm

    def auth(self, handler):
        """Let ``handler(driver, target_url)`` perform a login step."""
        handler(self.driver, self.target_url)

    def _is_same_domain(self, href):
        """Return True when ``href`` has the same netloc as the target."""
        curr = urlparse(href)
        base = urlparse(self.target_url)
        return curr.netloc == base.netloc

    def _url_same(self, url1, url2):
        """Compare two URLs, by path prefix when ``depth`` is non-negative.

        With a negative depth the URLs must be equal.  Otherwise only the
        first ``depth`` '/'-separated path segments are compared (fewer if
        either path is shorter).
        """
        if self.depth < 0:
            return url1 == url2
        path1 = urlparse(url1).path.split("/")
        path2 = urlparse(url2).path.split("/")
        span = min(self.depth, len(path1), len(path2))
        return path1[:span] == path2[:span]

    def _has_visited(self, url):
        """Return whether the URL cache reports ``url`` as visited."""
        return self.url_cache.has_visited(url)

    def _has_sister(self, node, url):
        """Return True if a child of ``node`` matches ``url``."""
        for sister in node.children:
            if self._url_same(sister.name, url):
                return True
        return False

    def _get_url_path(self, url):
        '''
        Label in tree to use: the path for same-domain URLs, the full URL
        otherwise.
        '''
        if not self._is_same_domain(url):
            return url
        return urlparse(url).path

    def _get_link_url(self, a):
        """Return the href of anchor ``a``, or None if it is unusable."""
        child_url = a.get_attribute("href")
        if not child_url:
            return None

        # A trailing pound sign is treated as a dynamic link and ignored.
        if child_url.endswith("#"):
            logger.debug("Ignoring dynamic link.")
            return None

        return child_url

    def crawl(self):
        """Crawl the site starting from the target URL."""
        self._crawl(self.root)

    def _should_advance(self, child, child_url):
        """Follow only unvisited links on the target's domain."""
        return self._is_same_domain(
            child_url) and not self._has_visited(child_url)

    def _close_windows(self):
        # NOTE(review): only reads the window handles; nothing is closed.
        wins = self.driver.window_handles

    def _call_subscribers(self):
        """Notify every subscriber; one failing subscriber stops nothing."""
        for s in self.subscribers:
            try:
                s.on_page_visited()
            except Exception:
                # Still best-effort, but the failure is now logged instead
                # of silently dropped, and KeyboardInterrupt/SystemExit are
                # no longer swallowed.
                logger.exception("Subscriber %r failed in on_page_visited", s)

    def _crawl(self, node):
        """Load the page of ``node``, record its links, recurse into them."""
        # Make request for the page
        self.url_cache.cache(node.name)

        #Hack to tell the proxy we are requesting a new page
        if self.mitm:
            b64 = base64.b64encode(node.name)
            proxy_signal_url = 'http://127.0.0.1:8080/?page=' + b64
            self.driver.get(proxy_signal_url)
        self.driver.get(node.name)

        # There is an issue if the link is in the same domain but then
        # it does a redirect to a url outside the domain. We wont know until
        # we visit it. If this happens abort and remove from tree
        if not self._is_same_domain(self.driver.current_url):
            node.detach()
            logger.warning("Aborting, not same domain")
            return

        self._call_subscribers()

        time.sleep(self.delay)
        logger.info(self.driver.current_url)

        # Access by index because if we move to the
        # next page the context of the page is lost when we come back
        anchors = self.driver.find_elements_by_tag_name("a")
        for i in range(len(anchors)):
            new_anchors = self.driver.find_elements_by_tag_name("a")
            assert len(new_anchors) == len(anchors)
            a = new_anchors[i]
            child_url = self._get_link_url(a)

            #Only add if its not already there
            if not child_url or self._has_visited(child_url):
                continue

            child = node.add_child(name=child_url)
            child.add_feature("path", self._get_url_path(child_url))

            # Determine if the link should be advanced forward
            # We never want to start crawling other pages
            child.add_feature(
                "advance", bool(self._should_advance(child, child_url)))

        logger.debug(self.url_cache)

        # Process all the found links.  Iterate over a snapshot: a child
        # that redirects off-domain detaches itself, and removing it from
        # the live list during iteration would skip the next sibling.
        for child in list(node.children):
            if child.advance:
                self._crawl(child)

    def get_link_graph(self):
        """Return the tree of crawled links."""
        return self.t

    def add_subscriber(self, subscriber):
        """Register an object whose ``on_page_visited()`` runs per page."""
        self.subscribers.append(subscriber)
Ejemplo n.º 42
0
def main(argv):
	"""Plot a taxonomic profile as a tree image and export it as PhyloXML.

	Options (parsed from ``argv`` with getopt):
		-h / --Help          print usage and exit
		-i / --InputFile     profile file to read
		-t / --Title         title drawn on the figure
		-l                   label the leaves
		-n                   label the internal nodes
		-o / --OutFile       image file to render
		-w / --Width         image width in mm (default 750)
		-x / --OutFileXML    PhyloXML file to write

	Each non-comment line of the input contributes its fourth
	whitespace-separated field (a '|'-separated taxonomy path) and its last
	field (the weight of the path's final name).  Exits with status 2 on a
	usage error or after printing help.
	"""
	input_file=''
	title='Title'
	label_internal_nodes = False
	label_leaves = False
	out_file=''
	width=750
	out_file_xml=''
	try:
		# "h" takes no argument, so a plain -h reaches the help branch below
		# instead of failing option parsing.
		# NOTE(review): the long forms of the -l/-n flags are still declared
		# with "=" and therefore require a value; left as is for
		# compatibility.
		opts, args = getopt.getopt(argv,"hi:t:lno:w:x:",["Help","InputFile=","Title=","LabelLeaves=", "LabelInternalNodes=","OutFile=","Width=","OutFileXML="])
	except getopt.GetoptError:
		print('Unknown option, call using: ./PlotTree.py -i <InputCAMIFile> -t <Title> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>')
		sys.exit(2)
	for opt, arg in opts:
		if opt in ("-h", "--Help"):
			print('./PlotTree.py -i <InputCAMIFile> -t <Title> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -o <OutFile> -x <OutFile.xml> -w <Width>')
			sys.exit(2)
		elif opt in ("-i", "--InputFile"):
			input_file = arg
		elif opt in ("-t", "--Title"):
			title = arg
		elif opt in ("-l", "--LabelLeaves"):
			label_leaves = True
		elif opt in ("-n","--LabelInternalNodes"):
			label_internal_nodes = True
		elif opt in ("-o", "--OutFile"):
			out_file = arg
		elif opt in ("-w", "--Width"):
			width = int(arg)
		elif opt in ("-x", "--OutFileXML"):
			out_file_xml = arg

	#Read the common kmer profile
	ckm_tax_paths = []
	ckm_name_to_perc = dict()
	with open(input_file,'r') as fid:
		profile_lines = fid.readlines()

	#Put placeholders in for missing names like: "||" -> "|NONAME|"
	file_noblank = list()
	for line in profile_lines:
		while "||" in line:
			line = line.replace("||","|NONAME|",1)
		file_noblank.append(line)

	#Get the names and weights
	for line in file_noblank:
		if line.startswith(('#', '@', '\n')): #Don't parse comments or blank lines
			continue
		fields = line.split()
		if not fields: #Whitespace-only line: nothing to parse
			continue
		temp = fields[3] #Get the names
		ckm_tax_paths.append(temp)
		ckm_name_to_perc[temp.split("|")[-1]] = fields[-1] #Get the weights

	#Create the tree
	t=Tree()
	names_to_nodes = dict()
	for tax_path in ckm_tax_paths:
		split_tax_path = tax_path.split("|")
		if len(split_tax_path)==1: #If len==1, then it's a superkingdom
			names_to_nodes[split_tax_path[0]] = t.add_child(name=split_tax_path[0]) #connect directly to tree
		else:
			if split_tax_path[-2] in names_to_nodes: #If the parent is already in the tree, add to tree
				names_to_nodes[split_tax_path[-1]] = names_to_nodes[split_tax_path[-2]].add_child(name=split_tax_path[-1])
			else: #Otherwise iterate up until we have something that is in the tree
				j=2
				while split_tax_path[-j]=="NONAME":
					j = j + 1
				#This skips over the NONAMES
				names_to_nodes[split_tax_path[-1]] = names_to_nodes[split_tax_path[-j]].add_child(name=split_tax_path[-1])

	def layout(node):
		#Draw a circle sized by the node's weight (0 when it has none)
		if node.name in ckm_name_to_perc:
			ckm_perc = float(ckm_name_to_perc[node.name])
		else:
			ckm_perc = 0
		F = CircleFace(radius=3.14*math.sqrt(ckm_perc), color="RoyalBlue", style="sphere")
		F.border.width = None
		F.opacity = 0.6
		faces.add_face_to_node(F,node, 0, position="branch-right")
		if label_internal_nodes:
			faces.add_face_to_node(TextFace(node.name, fsize=7),node, 0, position="branch-top")

	ts = TreeStyle()
	ts.layout_fn = layout
	ts.mode = "r"
	ts.show_leaf_name = label_leaves
	ts.min_leaf_separation = 50
	ts.title.add_face(TextFace(title, fsize=20), column=0)

	#Export the tree to a png image
	t.render(out_file, w=width, units="mm", tree_style=ts)

	#Export the xml file; the with block closes (and flushes) the output
	project = Phyloxml()
	phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
	phylo.phyloxml_phylogeny.set_name(title)
	project.add_phylogeny(phylo)
	with open(out_file_xml,'w') as xml_out:
		project.export(xml_out)
Ejemplo n.º 43
0
Archivo: id3.py Proyecto: palmagro/mrrf
    def execute(self,nodes,path1,node,c,maximo,maxinf,exrel,umbral,padre):
        """Recursively build a decision (sub)tree for ``nodes``.

        Leaves are ``Tree`` objects parsed from a Newick string of the form
        "(<label>*<number of nodes>);".  Internal nodes are named after the
        relationship pattern chosen for the split plus the cluster centers.
        The result (or the last subtree built) is also stored in
        ``self.arbol``.

        Parameters, as far as this method shows:
            nodes  -- items indexable by ``self.target`` and exposing ``.id``.
            path1  -- Cypher path text; it is split at its last ':'.
            node   -- label compared with ``tc.This`` / ``tc.That``.
            c      -- remaining budget; recursion passes ``c-1`` and stops
                      at ``c <= 0``.
            maximo -- raised to ``maxinf`` when that is larger, then passed on.
            maxinf -- information value; 0 or <= ``umbral`` yields a leaf.
            exrel  -- relationship types to skip.
            umbral -- threshold; see the adjustment before the recursion.
            padre  -- label used for the leaf when ``nodes`` is empty.
        """
        # Split the path at its last ':'; unless the part before it ends in
        # "n", a "d" is inserted there and counts are divided by
        # count(distinct(d))+1.
        a,b = path1.rsplit(':', 1)
        if (a[-1:] != "n"):
            path = a+"d:"+b
            cyprop = "/(count(distinct(d))+1)"
        else:
            path = path1
            cyprop = ""
        TC = self.TC
        graph_db = self.graph_db
        # No nodes: leaf labelled with the parent's label.
        if len(nodes) == 0: 
            self.arbol = Tree("("+str(padre)+"*"+str(len(nodes))+");")
            return Tree("("+str(padre)+"*"+str(len(nodes))+");")
 
        # No node has the target value / every node has it: pure leaf.
        if not any(n[self.target] == self.vtarget for n in nodes):
            self.arbol = Tree("(not "+str(self.vtarget)+"*"+str(len(nodes))+");")
            return Tree("(not "+str(self.vtarget)+"*"+str(len(nodes))+");")
        if not any(n[self.target] != self.vtarget for n in nodes):
            self.arbol = Tree("("+str(self.vtarget)+"*"+str(len(nodes))+");")
            return Tree("("+str(self.vtarget)+"*"+str(len(nodes))+");")
        # Stop conditions: leaf labelled with the majority class.
        if (c <= 0 or maxinf == 0 or maxinf <=  umbral or TC == [] or len(nodes) < 2):
            temp = []
            for n in nodes:
                if n[self.target] == self.vtarget:
                    temp.append(self.vtarget)
                else:
                    temp.append("not "+self.vtarget)
            self.arbol = Tree("("+str(max(set(temp), key=temp.count))+"*"+str(len(nodes))+");")
            return Tree("("+str(max(set(temp), key=temp.count))+"*"+str(len(nodes))+");")
        else:
            # Ask the graph (up to 10 attempts, each on a random sample of
            # the nodes) which relationship types from TC occur, in both
            # directions.
            # NOTE(review): the query is assembled by string concatenation
            # from t.To and node ids - confirm these values are trusted.
            posibles = ""
            cont = 0
            while (len(posibles) == 0 and cont < 10):
                cont += 1
                posibles = "MATCH (a)-[r]->(b) WHERE labels(a) <> [] AND labels(b) <> [] AND ( "
                for t in TC:
                    posibles = posibles + "type(r) = '"+t.To+"' OR "
                posibles = str(posibles[:-3]) + ") AND ("
                for z in random.sample(nodes, random.randint(1,(len(nodes)/2))):
                    posibles += "id(a) = " + str(z.id) + " OR "
                posibles = str(posibles[:-3]) + " ) RETURN DISTINCT head(labels(a)) AS This, type(r) as To, head(labels(b)) AS That limit "+str(len(TC))+" UNION ALL MATCH (a)<-[r]-(b) WHERE labels(a) <> [] AND labels(b) <> [] AND ("
                for t in TC:
                    posibles = posibles + "type(r) = '"+t.To+"' OR "
                posibles = str(posibles[:-3]) + ") AND ("
                for z in random.sample(nodes, (random.randint(1,len(nodes)/2))):
                    posibles += "id(a) = "+str(z.id)+" OR "
                posibles = str(posibles[:-3]) + " ) RETURN DISTINCT head(labels(b)) AS This, type(r) as To, head(labels(a)) AS That limit "+str(len(TC))
                posibles = neo4j.CypherQuery(self.graph_db, posibles).execute()
            # Nothing usable found, or fewer than 15 nodes: majority leaf.
            if cont >= 10 or len(posibles)==0 or len(nodes)<15:
                temp = []
                for n in nodes:
                    if n[self.target] == self.vtarget:
                        temp.append(self.vtarget)
                    else:
                        temp.append("not "+self.vtarget)
                self.arbol = Tree("("+str(max(set(temp), key=temp.count))+"*"+str(len(nodes))+");")
                return Tree("("+str(max(set(temp), key=temp.count))+"*"+str(len(nodes))+");")
            maxinf = -1000
            tc_c = posibles[0]

            # Score each candidate relationship touching ``node``.
            for tc in posibles:#random.sample(posibles, random.randint(1,(len(posibles)))):
                cluster_centers = []
                if((tc.This == node or tc.That == node) and tc.To not in exrel):
                    if(tc.That == node):
                        consulta = path + "<-[:"+tc.To+"]-(e:"+tc.This+")"
                    else:
                        consulta = path + "-[:"+tc.To+"]->(e:"+tc.That+")"
                    if self.relValida(graph_db,consulta,nodes,cyprop) :
                        cluster_centers, group = self.centers_y_clusters(graph_db,nodes,consulta,cyprop)

                        newentropy = 0
                        if (len(cluster_centers))> 0:
                            # NOTE(review): len()/len() is integer division
                            # when both are ints under Python 2 - confirm.
                            for idx,v in enumerate(cluster_centers):
                                newentropy += (len(group[idx])/(len(nodes)))*self.entropy(group[idx])
                            information =  self.entropy(nodes) - newentropy
                            temp = []
                            for n in nodes:
                                if n[self.target] == self.vtarget:
                                    temp.append(self.vtarget)
                                else:
                                    temp.append("not "+self.vtarget)
                            # NOTE(review): this unconditional return makes
                            # the maxinf/tc_c update below unreachable, so
                            # the first candidate with clusters always ends
                            # the call with a majority leaf - confirm intent.
                            self.arbol = Tree("("+str(max(set(temp), key=temp.count))+"*"+str(len(nodes))+");")
                            return Tree("("+str(max(set(temp), key=temp.count))+"*"+str(len(nodes))+");")                        

                            if (information >= maxinf):
                                maxinf = information 
                                tc_c = tc
            if maxinf > maximo:
                maximo = maxinf
            # Build the Cypher patterns for the chosen relationship, with
            # (consultacon) and without (consultasin) the "e" binding.
            if (tc_c.That == node):
                consultacon = path + "<-[:"+tc_c.To+"]-(e:"+tc_c.This+")"
                consultasin = path1 + "<-[:"+tc_c.To+"]-(:"+tc_c.This+")"                
                label = "<-[:"+tc_c.To+"]-(:"+tc_c.This+") "
                nextnode = tc_c.This
            else:   
                consultacon = path + "-[:"+tc_c.To+"]->(e:"+tc_c.That+")"
                consultasin = path1 + "-[:"+tc_c.To+"]->(:"+tc_c.That+")"
                nextnode = tc_c.That
                label = "-[:"+tc_c.To+"]->(:"+tc_c.That+")"
            group = []
            neg = []
            suma = 0
            # Collect one count per node for clustering.
            # NOTE(review): ``todo`` and ``rr`` are never assigned in this
            # method; unless they are module globals this raises NameError.
            for n in nodes:
                tiene = neo4j.CypherQuery(graph_db, consultacon+" where id(n) ="+str(n.id)+" return count(distinct(e))"+cyprop+" as cuenta").execute()

                for r in tiene:
                    todo.append([r.cuenta])
                    rr.append(r.cuenta)
            # Cluster the counts and keep the sorted, rounded centers.
            ms = MeanShift(bin_seeding=True)
            ms.fit(np.asarray(todo))
            labels = ms.labels_
            cluster_centers = sorted(ms.cluster_centers_,key=lambda x: x[0])
            for idx,cl in enumerate(cluster_centers):
                cluster_centers[idx] = round(float(cl[0]),3)
            for u in cluster_centers:
                group.append([])
            # Assign every node to the center whose interval (bounded by the
            # midpoints to the neighbouring centers) contains its count.
            for n in nodes:
                tiene = neo4j.CypherQuery(graph_db, consultacon+" where id(n) ="+str(n.id)+" return count(distinct(e))"+cyprop+" as cuenta").execute().data
                for r in tiene:
                    valor = r.cuenta
                    for idx,v in enumerate(cluster_centers):
                        if idx == 0:
                            temp1 = -9999
                        else:
                            temp1 = (cluster_centers[idx-1] + cluster_centers[idx])/2
                        if idx == len(cluster_centers) - 1:
                            temp2 = 99999
                        else:
                            temp2 = (cluster_centers[idx+1] + cluster_centers[idx])/2
                        if temp1 <= valor < temp2:
                            group[idx].append(n)
            # Majority class of this node set, passed down as ``padre``.
            temp = []
            for n in nodes:
                if n[self.target] == self.vtarget:
                    temp.append(self.vtarget)
                else:
                    temp.append("not "+self.vtarget)
            padre1 = str(max(set(temp), key=temp.count))
            t = Tree()
            t.name=label+" "+str(cluster_centers).replace(". ",".0").replace(" ", "").replace("[","").replace("]","").replace("\n",",")
            t = t.search_nodes(name=label+" "+str(cluster_centers).replace(". ",".0").replace(" ", "").replace("[","").replace("]","").replace("\n",","))[0]
            if umbral < 0:
                umbral = umbral - maxinf
            else:
                umbral = 0
            # One recursive subtree per cluster.
            for idx,v in enumerate(cluster_centers):
                t.add_child(self.execute(group[idx],consultasin,str(nextnode),c-1,maximo,maxinf,[],umbral,padre1))
            self.arbol = t
            if maxinf > umbral and maxinf != 0:
                return t
            else:
                # Split not informative enough: return a majority leaf
                # (self.arbol still points at the built subtree).
                temp = []
                for n in nodes:
                    if n[self.target] == self.vtarget:
                        temp.append(self.vtarget)
                    else:
                        temp.append("not "+self.vtarget)
                 
                self.arbol = t
                return Tree("("+str(max(set(temp), key=temp.count))+"*"+str(len(nodes))+");")
Ejemplo n.º 44
0
def tree_from_taxonomy(top_level, tree_taxonomy):
    """Build a tree from per-taxon taxonomy records and return it as Newick.

    Parameters:
        top_level     -- name of the highest rank to include; its distinct
                         values become the children of the root.
        tree_taxonomy -- mapping taxon -> {rank name: value}.  Every taxon
                         must have ``top_level``; lower ranks may be absent.

    Ranks below ``top_level`` (taken from ``tlevels``) are processed from
    the highest to the lowest.  Each distinct name is attached to the value
    of the nearest available rank, looking at most three ranks up, of the
    first taxon that carries the name.

    Returns the tree written with ``Tree.write(format=9)``.  Prints a
    diagnostic and exits with status 1 when no parent rank can be found.
    """
    # Result unused; the lookup is kept so that an unknown rank still raises
    # ValueError here, as before.
    taxonomy_levels.index(top_level)

    tl_types = _uniquify(
        [tree_taxonomy[tt][top_level] for tt in tree_taxonomy])
    levels_to_worry_about = tlevels[0:tlevels.index(top_level) + 1]

    t = Tree()
    # nodes[rank][name] -> tree node created for that name at that rank.
    nodes = {top_level: {}}
    for tl in tl_types:
        nodes[top_level][tl] = t.add_child(name=tl)

    for l in levels_to_worry_about[-2::-1]:
        nodes[l] = {}
        ci = levels_to_worry_about.index(l)
        # Slicing never overruns the list, so ranks near the top simply have
        # fewer candidates (indexing ci + 2 / ci + 3 raised IndexError).
        candidates = levels_to_worry_about[ci + 1:ci + 4]

        names = []
        for tt in tree_taxonomy:
            try:
                names.append(tree_taxonomy[tt][l])
            except KeyError:
                pass
        names = _uniquify(names)

        for n in names:
            # find my parent
            parent = None
            parent_level = None
            for tt in tree_taxonomy:
                try:
                    if tree_taxonomy[tt][l] != n:
                        continue
                except KeyError:
                    continue  # no data at this level for this beastie
                for candidate in candidates:
                    try:
                        parent = tree_taxonomy[tt][candidate]
                    except KeyError:
                        continue
                    parent_level = candidate
                    break
                else:
                    print("ERROR: tried to find some taxonomic info for " + tt + " from tree_taxonomy file/downloaded data and I went two levels up, but failed find any. Looked at:\n")
                    for candidate in candidates:
                        print("\t" + candidate)
                    print("This is the taxonomy info I have for " + tt)
                    print(tree_taxonomy[tt])
                    sys.exit(1)
                break
            # find out where to attach it
            node_id = nodes[parent_level][parent]
            nodes[l][n] = node_id.add_child(name=n.replace(" ", "_"))

    tree = t.write(format=9)

    return tree
Ejemplo n.º 45
0
from re import split

from ete2 import Tree
from nltk.tree import *


def tree_generation(entities):
    """Build and display one chain-shaped tree per entity name.

    Each entity is split on runs of whitespace and hyphens.  The suffixes of
    the resulting word list are nested from shortest to longest, so
    ``'Enterprise Service Bus'`` produces the chain
    ``root -> 'Bus' -> 'Service Bus' -> 'Enterprise Service Bus'``.

    The root of every chain is kept and ``show()`` is called on it, so the
    whole chain is displayed.  (Earlier the only remaining handle was the
    deepest node, so just that leaf was shown, and the ``None`` returned by
    ``show()`` was printed.)

    :param entities: iterable of entity name strings.
    """
    for entity in entities:
        words = split(r'[\s-]+', entity)
        root = Tree()
        node = root
        # Walk the suffixes from the last word alone up to the full name,
        # hanging each one below the previous.
        for start in range(len(words) - 1, -1, -1):
            node = node.add_child(name=' '.join(words[start:]))
        root.show()


# tree_generation(['Enterprise Service Bus'])
# Small scratch tree: a root with two children, one of which has a child.
t1 = Tree()
x = t1.add_child(name="sdfsdf")
# A sister can only be added to a node that has a parent.  The root t1 has
# none (ETE raises TreeError in that case), so "456" is attached as a sibling
# of x, i.e. as a second child of t1.
z = x.add_sister(name="456")
y = x.add_child(name="wef")

# t=Tree()
# t.populate(10)
print(t1)
Ejemplo n.º 46
0
def main():
    """Build a taxonomic tree from a taxonomy file and write it as Newick.

    Command-line arguments (parsed with argparse):

    * ``top_level``    -- the highest taxonomic level to include, e.g. family.
    * ``input_file``   -- taxonomy file, loaded with ``stk.load_taxonomy``.
    * ``output_file``  -- file the resulting tree string is written to.
    * ``-v/--verbose`` -- accepted, but nothing in this function reads it.

    One node is created under the root for every distinct ``top_level`` name.
    The remaining levels (``tlevels`` up to ``top_level``) are then processed
    from the one just below ``top_level`` downwards.  Each distinct name at a
    level is attached beneath the node of the nearest level above it (at most
    three levels up) for which the same taxon has an entry.

    The process exits with status 1 when a taxon has no entry at any of the
    three levels above the one being processed.
    """
    # do stuff
    parser = argparse.ArgumentParser(
        prog="create a tree from a taxonomy file",
        description="Create a taxonomic tree",
    )
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help="Verbose output: mainly progress reports.",
                        default=False)
    parser.add_argument('top_level',
                        nargs=1,
                        help="The top level group to start with, e.g. family")
    parser.add_argument('input_file',
                        metavar='input_file',
                        nargs=1,
                        help="Your taxonomy file")
    parser.add_argument('output_file',
                        metavar='output_file',
                        nargs=1,
                        help="Your new tree file")

    args = parser.parse_args()
    input_file = args.input_file[0]
    output_file = args.output_file[0]
    top_level = args.top_level[0]

    # tree_taxonomy is indexed below as tree_taxonomy[taxon][level] -> name.
    tree_taxonomy = stk.load_taxonomy(input_file)

    tl_types = _uniquify([tree_taxonomy[tt][top_level]
                          for tt in tree_taxonomy])
    print(tl_types)
    # Levels from the start of tlevels up to and including top_level; a
    # parent is always looked up at a higher index than its child.
    levels_to_worry_about = tlevels[0:tlevels.index(top_level) + 1]

    t = Tree()
    # nodes[level] is a list of single-entry dicts {name: tree node}.
    nodes = {top_level: []}
    for tl in tl_types:
        nodes[top_level].append({tl: t.add_child(name=tl)})

    # Everything below top_level, highest level first, so that a parent node
    # always exists before its children are attached.
    for l in levels_to_worry_about[-2::-1]:
        nodes[l] = []
        ci = levels_to_worry_about.index(l)
        names = []
        for tt in tree_taxonomy:
            try:
                names.append(tree_taxonomy[tt][l])
            except KeyError:
                pass
        names = _uniquify(names)
        for n in names:
            # find my parent
            parent = None
            for tt in tree_taxonomy:
                try:
                    if tree_taxonomy[tt][l] != n:
                        continue
                    # Try one, two, then three levels up; the first level
                    # this taxon has an entry for supplies the parent name.
                    for step in (1, 2, 3):
                        try:
                            parent = tree_taxonomy[tt][
                                levels_to_worry_about[ci + step]]
                        except KeyError:
                            continue
                        level = ci + step
                        break
                    else:
                        print("ERROR: tried to find some taxonomic info for " + tt + " from tree_taxonomy file/downloaded data and I went two levels up, but failed find any. Looked at:\n")
                        print("\t" + levels_to_worry_about[ci + 1])
                        print("\t" + levels_to_worry_about[ci + 2])
                        print("\t" + levels_to_worry_about[ci + 3])
                        print("This is the taxonomy info I have for " + tt)
                        print(tree_taxonomy[tt])
                        sys.exit(1)

                    # Position of the parent's entry in that level's list.
                    # If it is missing, parent_id ends up one past the end
                    # and the lookup below fails with IndexError.
                    candidates = nodes[levels_to_worry_about[level]]
                    parent_id = len(candidates)
                    for idx, entry in enumerate(candidates):
                        if parent in entry:
                            parent_id = idx
                            break
                    break
                except KeyError:
                    pass  # no data at this level for this beastie
            # find out where to attach it
            node_id = nodes[levels_to_worry_about[level]][parent_id][parent]
            nd = node_id.add_child(name=n.replace(" ", "_"))
            nodes[l].append({n: nd})

    tree = t.write(format=9)
    # The collapse step is deliberately applied twice, as in the original
    # (presumably one pass can leave nodes that a second pass removes --
    # TODO confirm against stk._collapse_nodes).
    tree = stk._collapse_nodes(tree)
    tree = stk._collapse_nodes(tree)
    # The context manager closes the file even if the write fails.
    with open(output_file, "w") as f:
        f.write(tree)
Ejemplo n.º 47
0
    def createLineageTrees(self, fn=None, width=None, height=None, circular=False, withAppearing=True, from_t=0, to_t=0):
        """Assemble a lineage tree from the tracking events and plot it.

        Objects referenced by a non-appearance event in frame ``from_t``
        become the initial lineages.  The events of frames ``from_t`` to
        ``to_t`` are then replayed in order: a move lengthens the pending
        branch of a lineage, a division or disappearance turns that pending
        branch into a tree node, and (optionally) an appearance starts a new
        lineage.  Lineages still open at the end are closed with a final node
        and the tree is handed to ``self.plotTree``.

        Time stamps used in node names are relative to the replayed slice:
        the initial lineages are named with time 0 and the n-th replayed
        frame with time n.

        :param fn: forwarded to ``plotTree`` as ``out_fn``.
        :param width: forwarded to ``plotTree``.
        :param height: forwarded to ``plotTree``.
        :param circular: forwarded to ``plotTree`` as ``circularTree``.
        :param withAppearing: when true, an appearance event starts a new
            lineage attached to the root; when false appearance events are
            skipped.  (Previously a false value sent every appearance event
            to the "not implemented" error below.)
        :param from_t: index of the first frame to replay.
        :param to_t: index of the last frame to replay (inclusive).
        :raises Exception: for an event type other than appearance,
            disappearance, division or move.
        """
        from ete2 import Tree, NodeStyle

        tree = Tree()
        style = self.getNodeStyle()
        divisionStyle = self.getNodeStyle()

        invisibleNodeStyle = NodeStyle()
        invisibleNodeStyle["hz_line_color"] = "white"
        invisibleNodeStyle["vt_line_color"] = "white"
        invisibleNodeStyle["fgcolor"] = "white"

        distanceFromRoot = 0

        # nodeMap: lineage key -> tree node the next branch will hang from.
        # branchSize: lineage key -> length of the branch not yet drawn.
        nodeMap = {}
        branchSize = {}

        def hide_path_to_root(node):
            # making the branches to the root node invisible
            while node:
                node.set_style(invisibleNodeStyle)
                node = node.up

        events = self.mainOperator.innerOperators[0].events

        # add all nodes which appear in the first frame
        for event in events[from_t]:
            if event.type != pgmlink.EventType.Appearance:
                label = event.traxel_ids[0]
                nodeName = self.getNodeName(0, label)
                appNode = tree.add_child(name=nodeName, dist=distanceFromRoot)
                key = str(nodeName)
                nodeMap[key] = appNode
                branchSize[key] = 0
                hide_path_to_root(appNode)

        # add all lineages
        for t, events_at in enumerate(events[from_t:to_t + 1], 1):
            for event in events_at:
                if event.type == pgmlink.EventType.Appearance:
                    if not withAppearing:
                        # Appearances were explicitly disabled: ignore the
                        # event instead of treating it as unsupported.
                        continue
                    label = event.traxel_ids[0]
                    nodeName = self.getNodeName(t, label)
                    appNode = tree.add_child(name=nodeName, dist=distanceFromRoot + t)
                    key = str(nodeName)
                    nodeMap[key] = appNode
                    branchSize[key] = 0
                    hide_path_to_root(appNode)

                elif event.type == pgmlink.EventType.Disappearance:
                    label = event.traxel_ids[0]
                    nodeName = self.getNodeName(t - 1, str(label))
                    key = str(nodeName)
                    if key not in nodeMap:
                        continue
                    # A lineage with no pending branch is dropped without
                    # drawing a node for it.
                    if branchSize[key] != 0:
                        newNode = nodeMap[key].add_child(name=nodeName, dist=branchSize[key])
                        newNode.set_style(style)
                    del nodeMap[key]
                    del branchSize[key]

                elif event.type == pgmlink.EventType.Division:
                    labelOld = event.traxel_ids[0]
                    labelNew1 = event.traxel_ids[1]
                    labelNew2 = event.traxel_ids[2]
                    oldName = self.getNodeName(t - 1, str(labelOld))
                    oldKey = str(oldName)
                    if oldKey not in nodeMap:
                        continue
                    # NOTE(review): the node name is built by passing the
                    # parent's *name* back into getNodeName, exactly as the
                    # original did -- confirm this double wrapping is intended.
                    newNode = nodeMap[oldKey].add_child(
                        name=self.getNodeName(t - 1, str(oldName)),
                        dist=branchSize[oldKey])
                    del nodeMap[oldKey]
                    del branchSize[oldKey]
                    newNode.set_style(divisionStyle)
                    # Both daughters continue from the division node.
                    for labelNew in (labelNew1, labelNew2):
                        newKey = str(self.getNodeName(t, str(labelNew)))
                        nodeMap[newKey] = newNode
                        branchSize[newKey] = 1

                elif event.type == pgmlink.EventType.Move:
                    labelOld = event.traxel_ids[0]
                    labelNew = event.traxel_ids[1]
                    oldKey = str(self.getNodeName(t - 1, str(labelOld)))
                    if oldKey not in nodeMap:
                        continue
                    newKey = str(self.getNodeName(t, str(labelNew)))
                    # Re-key the lineage to its new label and grow the
                    # pending branch by one frame.
                    nodeMap[newKey] = nodeMap[oldKey]
                    del nodeMap[oldKey]
                    branchSize[newKey] = branchSize[oldKey] + 1
                    del branchSize[oldKey]

                else:
                    raise Exception("lineage tree generation not implemented for event type " + str(event.type))

        # Close every lineage that is still open with a final node.
        for label, node in nodeMap.items():
            node.add_child(name=label, dist=branchSize[label])

        self.plotTree(tree, out_fn=fn, rotation=270, show_leaf_name=False,
                  show_branch_length=False, circularTree=circular, show_division_nodes=False,
                  distance_between_branches=4, width=width, height=height)
Ejemplo n.º 48
0
# Rendering demo: a circular tree style applied to a root with 100 identical
# small subtrees.
ts = TreeStyle()
ts.mode = "c"  # "c" selects ETE's circular layout mode
ts.arc_span = 360
ts.layout_fn = layout  # `layout` is defined outside this snippet
ts.show_leaf_name = False
ts.show_border = True
ts.draw_guiding_lines = False
ts.show_scale = True
#ts.scale = 60
t = Tree()
t.dist = 0

t.size = 0, 0
# Each iteration hangs a chain of three nodes below the root; the last node
# of the chain gets two children (n2, n3), each with one child of its own.
for x in xrange(100):
    n = t.add_child()
    n = n.add_child()
    n = n.add_child()
    n2 = n.add_child()
    n3 = n.add_child()
    n4 = n2.add_child()
    n5 = n3.add_child()
#  n.size = (10, 10)
#  n2.size = (10, 70)
#  n3.size = (40, 40)
#  n4.size = (10, 10)
#n2.size = 10
#n3.size = 10
#n5.size = 10
#n2.dist = 0.1
#n2.size = 1
Ejemplo n.º 49
0
from ete2 import Tree, TreeStyle, NodeStyle, PhyloTree, faces
from ete2.treeview.faces import *
from ete2.treeview.main import random_color, _NODE_TYPE_CHECKER, FACE_POSITIONS

sys.path.insert(0, os.path.join(ETEPATH, "examples/treeview"))
import face_grid, bubble_map, item_faces, node_style, node_background, face_positions, face_rotation, seq_motif_faces, barchart_and_piechart_faces

sys.path.insert(0, os.path.join(ETEPATH, "examples/phylogenies"))
import phylotree_visualization

# Gallery tree: every example module supplies a (tree, tree_style) pair,
# which is wrapped in a TreeFace and attached to its own child of main_tree.
main_tree = Tree()
main_tree.dist = 0

# face_grid example
t, ts = face_grid.get_example_tree()
t_grid = TreeFace(t, ts)
n = main_tree.add_child()
n.add_face(t_grid, 0, "aligned")

# bubble_map example
t, ts = bubble_map.get_example_tree()
t_bubble = TreeFace(t, ts)
n = main_tree.add_child()
n.add_face(t_bubble, 0, "aligned")

# item_faces example
t, ts = item_faces.get_example_tree()
t_items = TreeFace(t, ts)
n = main_tree.add_child()
n.add_face(t_items, 0, "aligned")

# node_style example; the snippet ends before the matching add_face call
t, ts = node_style.get_example_tree()
t_nodest = TreeFace(t, ts)
n = main_tree.add_child()
Ejemplo n.º 50
0
    def createLineageTrees(self,
                           fn=None,
                           width=None,
                           height=None,
                           circular=False,
                           withAppearing=True,
                           from_t=0,
                           to_t=0):
        """Assemble a lineage tree from the tracking events and plot it.

        Objects referenced by a non-appearance event in frame ``from_t``
        become the initial lineages.  The events of frames ``from_t`` to
        ``to_t`` are then replayed in order: a move lengthens the pending
        branch of a lineage, a division or disappearance turns that pending
        branch into a tree node, and (optionally) an appearance starts a new
        lineage.  Lineages still open at the end are closed with a final node
        and the tree is handed to ``self.plotTree``.

        Time stamps used in node names are relative to the replayed slice:
        the initial lineages are named with time 0 and the n-th replayed
        frame with time n.

        :param fn: forwarded to ``plotTree`` as ``out_fn``.
        :param width: forwarded to ``plotTree``.
        :param height: forwarded to ``plotTree``.
        :param circular: forwarded to ``plotTree`` as ``circularTree``.
        :param withAppearing: when true, an appearance event starts a new
            lineage attached to the root; when false appearance events are
            skipped.  (Previously a false value sent every appearance event
            to the "not implemented" error below.)
        :param from_t: index of the first frame to replay.
        :param to_t: index of the last frame to replay (inclusive).
        :raises Exception: for an event type other than appearance,
            disappearance, division or move.
        """
        from ete2 import Tree, NodeStyle

        tree = Tree()
        style = self.getNodeStyle()
        divisionStyle = self.getNodeStyle()

        invisibleNodeStyle = NodeStyle()
        invisibleNodeStyle["hz_line_color"] = "white"
        invisibleNodeStyle["vt_line_color"] = "white"
        invisibleNodeStyle["fgcolor"] = "white"

        distanceFromRoot = 0

        # nodeMap: lineage key -> tree node the next branch will hang from.
        # branchSize: lineage key -> length of the branch not yet drawn.
        nodeMap = {}
        branchSize = {}

        def hide_path_to_root(node):
            # making the branches to the root node invisible
            while node:
                node.set_style(invisibleNodeStyle)
                node = node.up

        events = self.mainOperator.innerOperators[0].events

        # add all nodes which appear in the first frame
        for event in events[from_t]:
            if event.type != pgmlink.EventType.Appearance:
                label = event.traxel_ids[0]
                nodeName = self.getNodeName(0, label)
                appNode = tree.add_child(name=nodeName,
                                         dist=distanceFromRoot)
                key = str(nodeName)
                nodeMap[key] = appNode
                branchSize[key] = 0
                hide_path_to_root(appNode)

        # add all lineages
        for t, events_at in enumerate(events[from_t:to_t + 1], 1):
            for event in events_at:
                if event.type == pgmlink.EventType.Appearance:
                    if not withAppearing:
                        # Appearances were explicitly disabled: ignore the
                        # event instead of treating it as unsupported.
                        continue
                    label = event.traxel_ids[0]
                    nodeName = self.getNodeName(t, label)
                    appNode = tree.add_child(name=nodeName,
                                             dist=distanceFromRoot + t)
                    key = str(nodeName)
                    nodeMap[key] = appNode
                    branchSize[key] = 0
                    hide_path_to_root(appNode)

                elif event.type == pgmlink.EventType.Disappearance:
                    label = event.traxel_ids[0]
                    nodeName = self.getNodeName(t - 1, str(label))
                    key = str(nodeName)
                    if key not in nodeMap:
                        continue
                    # A lineage with no pending branch is dropped without
                    # drawing a node for it.
                    if branchSize[key] != 0:
                        newNode = nodeMap[key].add_child(
                            name=nodeName, dist=branchSize[key])
                        newNode.set_style(style)
                    del nodeMap[key]
                    del branchSize[key]

                elif event.type == pgmlink.EventType.Division:
                    labelOld = event.traxel_ids[0]
                    labelNew1 = event.traxel_ids[1]
                    labelNew2 = event.traxel_ids[2]
                    oldName = self.getNodeName(t - 1, str(labelOld))
                    oldKey = str(oldName)
                    if oldKey not in nodeMap:
                        continue
                    # NOTE(review): the node name is built by passing the
                    # parent's *name* back into getNodeName, exactly as the
                    # original did -- confirm this double wrapping is intended.
                    newNode = nodeMap[oldKey].add_child(
                        name=self.getNodeName(t - 1, str(oldName)),
                        dist=branchSize[oldKey])
                    del nodeMap[oldKey]
                    del branchSize[oldKey]
                    newNode.set_style(divisionStyle)
                    # Both daughters continue from the division node.
                    for labelNew in (labelNew1, labelNew2):
                        newKey = str(self.getNodeName(t, str(labelNew)))
                        nodeMap[newKey] = newNode
                        branchSize[newKey] = 1

                elif event.type == pgmlink.EventType.Move:
                    labelOld = event.traxel_ids[0]
                    labelNew = event.traxel_ids[1]
                    oldKey = str(self.getNodeName(t - 1, str(labelOld)))
                    if oldKey not in nodeMap:
                        continue
                    newKey = str(self.getNodeName(t, str(labelNew)))
                    # Re-key the lineage to its new label and grow the
                    # pending branch by one frame.
                    nodeMap[newKey] = nodeMap[oldKey]
                    del nodeMap[oldKey]
                    branchSize[newKey] = branchSize[oldKey] + 1
                    del branchSize[oldKey]

                else:
                    raise Exception(
                        "lineage tree generation not implemented for event type "
                        + str(event.type))

        # Close every lineage that is still open with a final node.
        for label, node in nodeMap.items():
            node.add_child(name=label, dist=branchSize[label])

        self.plotTree(tree,
                      out_fn=fn,
                      rotation=270,
                      show_leaf_name=False,
                      show_branch_length=False,
                      circularTree=circular,
                      show_division_nodes=False,
                      distance_between_branches=4,
                      width=width,
                      height=height)
Ejemplo n.º 51
0
from ete2 import Tree
t = Tree() # Creates an empty tree
A = t.add_child(name="A") # Adds a new child to the current tree root
                           # and returns it
B = t.add_child(name="B") # Adds a second child to the current tree
                           # root and returns it
C = A.add_child(name="C") # Adds a new child to the A branch
D = C.add_sister(name="D") # Adds a second child to the same branch as
                             # before, this time using an existing child
                             # (C) as the starting point
R = A.add_child(name="R") # Adds a third child to the
                           # branch. Multifurcations are supported
# Next, add 6 random leaves to the R branch. names_library is an
# optional argument; if no names are provided, they will be generated
# randomly.
R.populate(6, names_library=["r1","r2","r3","r4","r5","r6"])
# Prints the tree topology
print t
#                     /-C
#                    |
#                    |--D
#                    |
#           /--------|                              /-r4
#          |         |                    /--------|
#          |         |          /--------|          \-r3
#          |         |         |         |
#          |         |         |          \-r5
#          |          \--------|
# ---------|                   |                    /-r6
#          |                   |          /--------|
#          |                    \--------|          \-r2
Ejemplo n.º 52
0
class SiteSpider:
    """Crawl a site with a browser driver and record its links as a tree.

    The tree root (``self.t``) has a single child for ``target_url``; every
    link found on a crawled page becomes a child of that page's node.  Each
    node carries two features: ``path`` (the label from ``_get_url_path``)
    and ``advance`` (whether the link was considered worth following when it
    was discovered).
    """

    def __init__(self, driver, target_url, depth=-1, delay=5, mitm=False):
        """
        :param driver: browser driver; must provide ``get``, ``current_url``
            and ``find_elements_by_tag_name``.
        :param target_url: URL the crawl starts from; also defines the domain
            the crawl is confined to.
        :param depth: number of leading path components compared by
            ``_url_same``; a negative value means full URL equality.  Also
            passed to ``UrlCache``.
        :param delay: seconds to sleep after each page load.
        :param mitm: when true, a signalling request is sent to a local proxy
            before every page load.
        """
        self.driver = driver
        self.target_url = target_url
        self.t = Tree()
        self.root = self.t.add_child(name=target_url)
        self.root.add_features(path=target_url, advance=True)
        self.depth = depth
        self.delay = delay
        self.subscribers = []
        self.url_cache = UrlCache(self.depth)
        self.mitm = mitm

    def auth(self, handler):
        """Let ``handler(driver, target_url)`` perform any login steps."""
        handler(self.driver, self.target_url)

    def _is_same_domain(self, href):
        """Return True when ``href`` has the same netloc as the target URL."""
        return urlparse(href).netloc == urlparse(self.target_url).netloc

    def _url_same(self, url1, url2):
        """Compare two URLs, honouring ``self.depth``.

        With a negative depth the URLs must be equal.  Otherwise only the
        first ``depth`` ``/``-separated path components are compared (fewer
        if either path is shorter).
        """
        if self.depth < 0:
            return url1 == url2
        path1 = urlparse(url1).path.split("/")
        path2 = urlparse(url2).path.split("/")
        span = min(self.depth, len(path1), len(path2))
        return path1[:span] == path2[:span]

    def _has_visited(self, url):
        """Return whether the URL cache reports ``url`` as visited."""
        return self.url_cache.has_visited(url)

    def _has_sister(self, node, url):
        """Return True if a child of ``node`` matches ``url`` per ``_url_same``."""
        return any(self._url_same(sister.name, url)
                   for sister in node.children)

    def _get_url_path(self, url):
        '''
        Label in tree to use: the path for URLs on the target domain, the
        full URL otherwise.
        '''
        if not self._is_same_domain(url):
            return url
        return urlparse(url).path

    def _get_link_url(self, a):
        """Return the ``href`` of anchor ``a``, or None if it should be ignored."""
        child_url = a.get_attribute("href")
        if not child_url:
            return None

        # A link ending in a pound sign is handled by JavaScript on the page;
        # such links are not followed.
        if child_url.endswith("#"):
            logger.debug("Ignoring dynamic link.")
            return None

        return child_url

    def crawl(self):
        """Crawl the site starting at the target URL."""
        self._crawl(self.root)

    def _should_advance(self, child, child_url):
        """Return True for an unvisited link that stays on the target domain."""
        return self._is_same_domain(child_url) and not self._has_visited(child_url)

    def _close_windows(self):
        # Stub: only reads the driver's window handles; nothing is closed.
        wins = self.driver.window_handles

    def _call_subscribers(self):
        """Notify every subscriber that a page was visited.

        A failing subscriber must not stop the crawl, so its exception is
        logged and the remaining subscribers are still called.
        """
        for s in self.subscribers:
            try:
                s.on_page_visited()
            except Exception:
                logger.exception("Subscriber %r failed in on_page_visited", s)

    def _crawl(self, node):
        """Load the page for ``node``, record its links, and recurse.

        The node is detached from the tree when loading it ends up on a
        different domain.
        """
        # Make request for the page
        self.url_cache.cache(node.name)

        #Hack to tell the proxy we are requesting a new page
        if self.mitm:
            b64 = base64.b64encode(node.name)
            proxy_signal_url = 'http://127.0.0.1:8080/?page=' + b64
            self.driver.get(proxy_signal_url)
        self.driver.get(node.name)

        # There is an issue if the link is in the same domain but then
        # it does a redirect to a url outside the domain. We wont know until
        # we visit it. If this happens abort and remove from tree
        if not self._is_same_domain(self.driver.current_url):
            node.detach()
            logger.warning("Aborting, not same domain")
            return

        self._call_subscribers()

        time.sleep(self.delay)
        logger.info(self.driver.current_url)

        # Anchors are looked up again on every iteration and accessed by
        # index, so that no stale element reference is used.
        anchors = self.driver.find_elements_by_tag_name("a")
        for i in range(len(anchors)):
            new_anchors = self.driver.find_elements_by_tag_name("a")
            if len(new_anchors) != len(anchors):
                # The page changed underneath us, so positions no longer line
                # up.  Keep what was collected so far instead of aborting the
                # whole crawl.
                logger.warning(
                    "Anchor count changed from %d to %d on %s; "
                    "skipping the remaining links of this page",
                    len(anchors), len(new_anchors), self.driver.current_url)
                break
            child_url = self._get_link_url(new_anchors[i])

            #Only add if its not already there
            if not child_url or self._has_visited(child_url):
                continue

            child = node.add_child(name=child_url)
            child.add_feature("path", self._get_url_path(child_url))

            # Determine if the link should be advanced forward
            # We never want to start crawling other pages
            child.add_feature("advance",
                              self._should_advance(child, child_url))

        logger.debug(self.url_cache)

        #Process all the found links
        # Iterate over a snapshot: a child that redirects off-domain detaches
        # itself, and removing it from the live list would skip its next
        # sibling.  A child may also have been reached through another page
        # since it was discovered, so the visited check is repeated here.
        for child in list(node.children):
            if child.advance and not self._has_visited(child.name):
                self._crawl(child)

    def get_link_graph(self):
        """Return the root of the link tree."""
        return self.t

    def add_subscriber(self, subscriber):
        """Register an object whose ``on_page_visited()`` is called per page."""
        self.subscribers.append(subscriber)
Ejemplo n.º 53
0
				curr_cols = curr_cols + 1
				continue
			if "note" ==  _string:
				break

			num = s.cell_value(curr_row, curr_cols)
			if num == '' or type(num) != float:
				curr_cols = curr_cols + 1
				continue

			parent_name = parent_name + '@' +  _string

			curr_cols = curr_cols + 1

		if parent_name == '':
			t.add_child(name = clock_name)
			clock_tree_array.append(copy.deepcopy(t))

#		print clock_name + ':'+ parent_name
		curr_row = curr_row + 1

#find the tree that belongs to this root
	for _table_index in range(len(clock_tree_array)):
		t = clock_tree_array[_table_index]
#		print "doing root name = " + t.children[0].name
		add_child_to_node(s, t.children[0], t.children[0].name)

#find the leaf's all parent, using '@' to split.
	for _table_index in range(len(clock_tree_array)):
		t = clock_tree_array[_table_index]