def main(args):
    """Convert a GTR/CDT clustering file pair into a Newick tree file.

    Args:
        args: sequence of three paths ``(gtr_file, cdt_file, nwk_file)``.
            Both inputs are read as tab-delimited text. The first two rows
            of the CDT file (header and EWEIGHT) are skipped; the first two
            columns of every remaining row are taken as ``(id, name)``.
            Every GTR row is turned into a ``GTRLine`` record.

    Raises:
        ValueError: if the GTR file has no rows, so no tree can be written.
        AssertionError: if a GTR row references a child that is neither a
            CDT id nor a previously defined GTR node.
    """
    gtr_file, cdt_file, nwk_file = args

    # Leaf id -> upper-cased display name.
    gid_to_name = {}
    with open(cdt_file) as cdt_handle:
        reader = csv.reader(cdt_handle, delimiter="\t")
        next(reader)  # header
        next(reader)  # EWEIGHT
        for row in reader:
            gid, name = row[:2]
            gid_to_name[gid] = name.upper()

    # parent name -> (subtree, value of the row's dist column)
    nodes = {}
    node = None
    with open(gtr_file) as gtr_handle:
        reader = csv.reader(gtr_handle, delimiter="\t")
        for gtr in map(GTRLine._make, reader):
            node = Tree()
            parent_name, parent_dist = gtr.parent, float(gtr.dist)
            for child in (gtr.left_child, gtr.right_child):
                if child in gid_to_name:
                    # Leaves hang at 1 - parent_dist below their parent.
                    node.add_child(name=gid_to_name[child],
                                   dist=1 - parent_dist)
                else:
                    assert child in nodes, child
                    child_node, child_dist = nodes[child]
                    node.add_child(child_node, dist=child_dist - parent_dist)
            nodes[parent_name] = (node, parent_dist)

    if node is None:
        raise ValueError("no merge rows found in %s" % gtr_file)

    # The node built from the last GTR row is written out as the tree.
    sys.stderr.write("writing newick tree to %s\n" % nwk_file)
    node.write(format=5, outfile=nwk_file)
def main(args):
    """Build a Newick tree from a GTR merge table and a CDT name table.

    Args:
        args: sequence ``(gtr_file, cdt_file, nwk_file)`` of file paths.
            The CDT file supplies ``id -> name`` pairs (first two columns,
            after skipping the header and EWEIGHT rows); names are
            upper-cased. The GTR file supplies one ``GTRLine`` per row.

    Raises:
        ValueError: if the GTR file is empty.
        AssertionError: if a child id is neither a CDT id nor an earlier
            GTR parent.
    """
    gtr_file, cdt_file, nwk_file = args

    gid_to_name = {}
    # Context managers make sure both inputs are closed once consumed.
    with open(cdt_file) as cdt_handle:
        reader = csv.reader(cdt_handle, delimiter="\t")
        next(reader)  # header
        next(reader)  # EWEIGHT
        for row in reader:
            gid, name = row[:2]
            gid_to_name[gid] = name.upper()

    nodes = {}
    node = None
    with open(gtr_file) as gtr_handle:
        reader = csv.reader(gtr_handle, delimiter="\t")
        for gtr in map(GTRLine._make, reader):
            node = Tree()
            parent_name, parent_dist = gtr.parent, float(gtr.dist)
            for child in (gtr.left_child, gtr.right_child):
                if child in gid_to_name:
                    node.add_child(name=gid_to_name[child],
                                   dist=1 - parent_dist)
                else:
                    assert child in nodes, child
                    child_node, child_dist = nodes[child]
                    # Branch length is the difference between the two
                    # rows' dist values.
                    node.add_child(child_node, dist=child_dist - parent_dist)
            nodes[parent_name] = (node, parent_dist)

    if node is None:
        raise ValueError("no merge rows found in %s" % gtr_file)

    sys.stderr.write("writing newick tree to %s\n" % nwk_file)
    # The last row's node is the one written out.
    node.write(format=5, outfile=nwk_file)
class K_Graph(object):
    """Topic graph backed by a ``Tree``, persisted with pickle.

    ``theme`` is the root tree; ``topic`` is the name given to the most
    recent ``add_topic`` call (the empty string until then).
    """

    def __init__(self):
        self.topic = ''
        self.theme = Tree()

    def add_point(self, topic, point):
        """Add a child named *point* under every node whose name is
        contained in *topic* (``node.name in topic``)."""
        for node in self.theme.traverse():
            if node.name in topic:
                node.add_child(name=point)

    def add_topic(self, topic):
        """Add *topic* as a child of the root and make it the current topic."""
        self.theme.add_child(name=topic)
        self.topic = topic

    def getCurrentGraph(self):
        """Return the first traversed node whose name is contained in the
        current topic, or None when no node qualifies."""
        return next(
            (node for node in self.theme.traverse()
             if node.name in self.topic),
            None)

    def get_topic(self):
        """Return the current topic name."""
        return self.topic

    def save(self):
        """Pickle this object to ``data.pickle`` in the working directory."""
        with open('data.pickle', 'wb') as handle:
            # Highest protocol available, as in the pickle documentation.
            pickle.dump(self, handle, pickle.HIGHEST_PROTOCOL)

    def load(self):
        """Return the object stored in ``data.pickle``.

        The loaded object is returned; this instance is left unchanged.
        """
        # NOTE(review): pickle.load executes code from the file; only load
        # files this program wrote itself.
        with open('data.pickle', 'rb') as handle:
            # The protocol version is detected automatically.
            return pickle.load(handle)
def neighbor_joining(D, tree, internals):
    """Join the closest pair of active nodes repeatedly until two remain.

    Args:
        D (np.array): pairwise differences between samples based on PLs
            (not mutations). It is written to in place: rows/columns for
            each new internal node are filled in, so callers should pass a
            copy. presumably D is pre-sized to hold the internal nodes --
            TODO confirm against caller.
        tree (Tree): tree whose tips are named with the sample numbers
            (the nodes are looked up by ``str(index)``).
        internals (np.array): indices into D of the currently active nodes
            (initially the sample numbers). Only rebound locally.

    Returns:
        tuple: ``(D, tree)`` -- the updated distance matrix and the tree
        with the joined nodes attached.

    Progress is reported on stderr (one '.' per join).
    """
    # NOTE: math.fsum would give better precision when adding distances
    # across sites.
    print('neighbor_joining() begin', end=' ', file=sys.stderr)
    m = len(internals)
    while m > 2:  # when m is 2 only two nodes remain connected to the root
        # Distances restricted to the currently active nodes.
        d = D[internals[:, None], internals]
        # Row sums scaled by (m - 2).
        u = d.sum(axis=1) / (m - 2)
        Q = np.zeros(shape=(m, m), dtype=np.longdouble)
        for i, j in itertools.combinations(xrange(m), 2):  # standard Q matrix calculation
            Q[i, j] = d[i, j] - u[i] - u[j]
            Q[j, i] = Q[i, j]
        # The diagonal must never win the argmin below.
        np.fill_diagonal(Q, np.inf)
        # Position (within the active set) of the smallest Q value, i.e. the
        # pair to join.
        i, j = np.unravel_index(Q.argmin(), (m, m))
        # Index in D used for the new internal node.
        l = len(D) + 2 - m
        for k in xrange(m):
            # NOTE(review): textbook neighbor joining halves this value --
            # confirm the missing /2 is intended.
            D[l, internals[k]] = D[internals[k], l] = d[i, k] + d[j, k] - d[i, j]
        # Overwrite the entries for the joined pair with their branch lengths.
        D[l, internals[i]] = D[internals[i], l] = vi = (d[i, j] + u[i] - u[j]) / 2
        D[l, internals[j]] = D[internals[j], l] = vj = (d[i, j] + u[j] - u[i]) / 2
        # presumably `tree & name` is ETE's lookup-by-name operator -- verify.
        ci = tree & str(internals[i])
        cj = tree & str(internals[j])
        ci.detach()
        cj.detach()
        node = Tree(name=str(l))
        # Branch lengths are truncated to integers.
        node.add_child(ci, dist=int(vi))
        node.add_child(cj, dist=int(vj))
        tree.add_child(node)
        # Replace the joined pair by the new node in the active set.
        internals = np.delete(internals, [i, j])
        internals = np.append(internals, l)
        m = len(internals)
        print('.', end='', file=sys.stderr)
    print(' done', file=sys.stderr)
    return D, tree
def getGenera(taxonomy_queryset, only_id=False):
    """Build a genus -> species tree from a taxonomy queryset.

    Parameters
    ----------
    taxonomy_queryset : gbif.models / GeoquerySet
        Object exposing ``species`` (filterable with ``genus_id__exact``)
        and ``genera`` (iterable of mapping-like rows).
    only_id : bool
        False (default): nodes are named with the taxon name (species use
        at most the first two words of their name). True: nodes are named
        with their id instead, which keeps memory use down on big data sets.

    Returns
    -------
    Tree rooted at ``'genus_root'`` with one child per genus, each holding
    its species as children. Every node carries ``level`` and ``points``
    features plus its own id; ``support`` is set from the row's ``'ab'``.
    """
    species = taxonomy_queryset.species
    genera = taxonomy_queryset.genera
    family_tree = Tree(name='genus_root')
    for genus in genera:
        genus_id = genus['genus_id']
        genus_name = genus_id if only_id else genus['name']
        genus_node = Tree(name=genus_name, support=genus['ab'])
        genus_node.add_feature('genus_id', genus_id)
        genus_node.add_feature('level', 'genus')
        genus_node.add_feature('points', genus['points'])
        for specie in species.filter(genus_id__exact=genus_id):
            if only_id:
                species_name = specie['species_id']
            else:
                # First two words only; the slice also copes with
                # one-word names, which used to raise IndexError.
                species_name = ' '.join(specie['name'].split(' ')[:2])
            species_node = Tree(name=species_name, support=specie['ab'])
            species_node.add_feature('species_id', specie['species_id'])
            species_node.add_feature('level', 'species')
            species_node.add_feature('points', specie['points'])
            genus_node.add_child(child=species_node)
        family_tree.add_child(child=genus_node)
    return family_tree
def getEte2Tree(hypoTree):
    """Recursively convert a nested-list tree into a ``Tree``.

    Every element whose type is exactly ``list`` becomes a child subtree;
    any other element supplies the node name through its ``name``
    attribute (if there are several, the last one wins).
    """
    node = Tree()
    for item in hypoTree:
        if type(item) is not list:
            node.name = item.name
            continue
        node.add_child(getEte2Tree(item))
    return node
def getGenera(taxonomy_queryset, only_id=False):
    """Collapse all species of a spatial queryset into a genus-level tree.

    Parameters
    ----------
    taxonomy_queryset : gbif.models / GeoquerySet
        Must expose ``species`` (supports ``filter(genus_id__exact=...)``)
        and ``genera`` (iterable of mapping-like rows).
    only_id : bool
        When False (default) nodes carry the full taxon name (species: at
        most the first two words). When True nodes are named by id, which
        is lighter on memory for big data sets.

    Returns
    -------
    Tree named ``'genus_root'``; children are genus nodes, grandchildren
    are species nodes. Each node has ``level``, ``points`` and its id as
    features, and ``support`` taken from the row's ``'ab'`` value.
    """
    species = taxonomy_queryset.species
    genera = taxonomy_queryset.genera
    family_tree = Tree(name='genus_root')
    for genus in genera:
        genus_id = genus['genus_id']
        genus_name = genus_id if only_id else genus['name']
        genus_node = Tree(name=genus_name, support=genus['ab'])
        genus_node.add_feature('genus_id', genus_id)
        genus_node.add_feature('level', 'genus')
        genus_node.add_feature('points', genus['points'])
        for specie in species.filter(genus_id__exact=genus_id):
            if only_id:
                species_name = specie['species_id']
            else:
                # Slicing instead of indexing [0] and [1] avoids an
                # IndexError on single-word species names.
                species_name = ' '.join(specie['name'].split(' ')[:2])
            species_node = Tree(name=species_name, support=specie['ab'])
            species_node.add_feature('species_id', specie['species_id'])
            species_node.add_feature('level', 'species')
            species_node.add_feature('points', specie['points'])
            genus_node.add_child(child=species_node)
        family_tree.add_child(child=genus_node)
    return family_tree
def getClasses(taxonomic_queryset, orders_tree, only_id=False):
    """Collapse the orders of a spatial queryset into a class-level tree.

    Parameters
    ----------
    taxonomic_queryset : gbif.models / GeoquerySet
        Must expose ``classes`` (iterable of mapping-like rows) and
        ``orders`` (supports ``filter(parent_id__exact=...)``).
    orders_tree : Tree
        Tree derived from getOrders; its direct children must carry an
        ``order_id`` attribute.
    only_id : bool
        False (default): class nodes are named with the class name.
        True: class nodes are named with the class id, which is lighter
        on memory for big data sets.

    Returns
    -------
    Tree named ``'phylum_root'`` with one child per class; each class node
    holds the matching order branches taken from ``orders_tree``.

    Raises
    ------
    ValueError
        If an order has no branch in ``orders_tree``.
    """
    classes = taxonomic_queryset.classes
    orders = taxonomic_queryset.orders
    phylumTree = Tree(name='phylum_root')
    logger.info("[gbif.buildtree] Collapsing Classes")
    for class_ in classes:
        class_id = class_['class_id']
        name = class_id if only_id else class_['name']
        classTree = Tree(name=name, support=class_['ab'])
        classTree.add_feature('class_id', class_id)
        classTree.add_feature('level', 'class')
        classTree.add_feature('points', class_['points'])
        for order in orders.filter(parent_id__exact=class_id):
            id_o = order['order_id']
            # First branch with this order id. The earlier reduce()/filter()
            # construct only worked when exactly one branch matched.
            branch = next(
                (child for child in orders_tree.get_children()
                 if child.order_id == id_o),
                None)
            if branch is None:
                raise ValueError(
                    "no branch with order_id %r in orders_tree" % (id_o,))
            # Attach the branch to the class tree
            classTree.add_child(child=branch)
        phylumTree.add_child(child=classTree)
    return phylumTree
def getClasses(taxonomic_queryset, orders_tree, only_id=False):
    """Collapse the orders of a spatial queryset into a class-level tree.

    Parameters
    ----------
    taxonomic_queryset : gbif.models / GeoquerySet
        Must expose ``classes`` (iterable of mapping-like rows) and
        ``orders`` (supports ``filter(parent_id__exact=...)``).
    orders_tree : Tree
        Tree derived from getOrders; its direct children must carry an
        ``order_id`` attribute.
    only_id : bool
        False (default): class nodes are named with the class name.
        True: class nodes are named with the class id, which is lighter
        on memory for big data sets.

    Returns
    -------
    Tree named ``'phylum_root'`` with one child per class. Class nodes
    carry the features ``id``, ``abundance``, ``parent_id``, ``class_id``,
    ``level`` and ``points`` and hold the matching order branches.

    Raises
    ------
    ValueError
        If an order has no branch in ``orders_tree``.
    """
    classes = taxonomic_queryset.classes
    orders = taxonomic_queryset.orders
    phylumTree = Tree(name='phylum_root')
    logger.info("[gbif.buildtree] Collapsing Classes")
    for class_ in classes:
        phylum_id = class_['parent_id']
        class_id = class_['class_id']
        name = class_id if only_id else class_['name']
        ab = class_['ab']
        classTree = Tree(name=name, support=ab)
        classTree.add_feature('id', class_id)
        classTree.add_feature('abundance', ab)
        classTree.add_feature('parent_id', phylum_id)
        classTree.add_feature('class_id', class_id)
        classTree.add_feature('level', 'class')
        classTree.add_feature('points', class_['points'])
        for order in orders.filter(parent_id__exact=class_id):
            id_o = order['order_id']
            # First branch with this order id; replaces a reduce()/filter()
            # construct that failed unless exactly one branch matched.
            branch = next(
                (child for child in orders_tree.get_children()
                 if child.order_id == id_o),
                None)
            if branch is None:
                raise ValueError(
                    "no branch with order_id %r in orders_tree" % (id_o,))
            # Attach the branch to the class tree
            classTree.add_child(child=branch)
        phylumTree.add_child(child=classTree)
    return phylumTree
def getKingdoms(taxonomic_queryset, phyla_tree, only_id=False):
    """Collapse the phyla of a spatial queryset into a kingdom-level tree.

    Parameters
    ----------
    taxonomic_queryset : gbif.models / GeoquerySet
        Must expose ``kingdoms`` (iterable of mapping-like rows) and
        ``phyla`` (supports ``filter(parent_id__exact=...)``).
    phyla_tree : Tree
        Tree of phyla (presumably the result of getPhyla); its direct
        children must carry a ``phylum_id`` attribute.
    only_id : bool
        False (default): kingdom nodes are named with the kingdom name.
        True: they are named with the kingdom id, which is lighter on
        memory for big data sets.

    Returns
    -------
    Tree named ``'Life'`` with one child per kingdom; each kingdom node
    holds the matching phylum branches taken from ``phyla_tree``.

    Raises
    ------
    ValueError
        If a phylum has no branch in ``phyla_tree``.
    """
    kingdoms = taxonomic_queryset.kingdoms
    phyla = taxonomic_queryset.phyla
    TreeOfLife = Tree(name='Life')
    logger.info("[gbif.buildtree] Collapsing Kingdoms")
    for kingdom in kingdoms:
        kingdom_id = kingdom['kingdom_id']
        name = kingdom_id if only_id else kingdom['name']
        kingdomTree = Tree(name=name, support=kingdom['ab'])
        kingdomTree.add_feature('kingdom_id', kingdom_id)
        kingdomTree.add_feature('level', 'kingdom')
        kingdomTree.add_feature('points', kingdom['points'])
        for phylum in phyla.filter(parent_id__exact=kingdom_id):
            id_p = phylum['phylum_id']
            # First branch with this phylum id; replaces a reduce()/filter()
            # construct that failed unless exactly one branch matched.
            branch = next(
                (child for child in phyla_tree.get_children()
                 if child.phylum_id == id_p),
                None)
            if branch is None:
                raise ValueError(
                    "no branch with phylum_id %r in phyla_tree" % (id_p,))
            # Attach the branch to the kingdom tree
            kingdomTree.add_child(child=branch)
        TreeOfLife.add_child(child=kingdomTree)
    return TreeOfLife
def buildTree(taxid_list, nodes_dict, taxids_remove, cursor):
    """Recursively build an ete tree from a list of taxids.

    Each call looks up ``taxid_list`` with ``query_a_list`` and links every
    returned taxid under its parent, then recurses on the set of parents
    until only taxid 1 (the root of the NCBI tree) is left.

    Requires a cursor connected to a sqlite db built using the script
    /users/rg/didac/NCBI/Taxonomy/update_sqlite_DB.py

    Args:
        taxid_list: taxids to resolve in this round.
        nodes_dict: dict ``taxid -> Tree node``; pass an empty dict on the
            first call. It is filled in place.
        taxids_remove: list collecting taxids that returned no result;
            pass an empty list on the first call. It is extended in place.
        cursor: database cursor handed to ``query_a_list``.

    Returns:
        tuple ``(root_node, nodes_dict, taxids_remove)`` where
        ``root_node`` is ``nodes_dict[1]``.
    """
    results = query_a_list(taxid_list, cursor)

    # check if all taxids returned a result
    if len(set(taxid_list)) != len(results):
        taxids_with_result = set(x[0] for x in results)
        taxids_remove += list(set(map(int, taxid_list)) - taxids_with_result)

    parent_taxids = set()
    for taxid, parent_taxid, rank, name in results:
        parent_taxids.add(parent_taxid)
        if taxid not in nodes_dict:
            c = Tree()
            c.add_feature('name', name)
            nodes_dict[taxid] = c
        # A node created earlier as somebody's parent only has its taxid;
        # the name and rank become known now that it is queried itself.
        nodes_dict[taxid].add_features(name=name, taxid=taxid, rank=rank)

        # add child to node parent_taxid
        if parent_taxid not in nodes_dict:
            p = Tree()
            p.add_feature('taxid', parent_taxid)
            p.add_child(nodes_dict[taxid])
            nodes_dict[parent_taxid] = p
        else:
            # Add taxid under parent_taxid unless it is already one of its
            # descendants.
            for descendant in nodes_dict[parent_taxid].iter_descendants():
                if taxid == descendant.taxid:
                    break
            else:
                nodes_dict[parent_taxid].add_child(nodes_dict[taxid])

    # 1 is the root of the NCBI tree and has no parent to look up. discard()
    # replaces a bare ``except: pass`` around ``list.remove(1)``.
    parent_taxids.discard(1)
    if parent_taxids:
        return buildTree(list(parent_taxids), nodes_dict, taxids_remove, cursor)

    nodes_dict[1].add_features(name='Root', rank='Root')
    return nodes_dict[1], nodes_dict, taxids_remove
def getKingdoms(taxonomic_queryset, phyla_tree, only_id=False):
    """Collapse the phyla of a spatial queryset into a kingdom-level tree.

    Parameters
    ----------
    taxonomic_queryset : gbif.models / GeoquerySet
        Must expose ``kingdoms`` (iterable of mapping-like rows) and
        ``phyla`` (supports ``filter(parent_id__exact=...)``).
    phyla_tree : Tree
        Tree of phyla (presumably the result of getPhyla); its direct
        children must carry a ``phylum_id`` attribute.
    only_id : bool
        False (default): kingdom nodes are named with the kingdom name.
        True: they are named with the kingdom id, which is lighter on
        memory for big data sets.

    Returns
    -------
    Tree named ``'Life'`` with one child per kingdom; each kingdom node
    holds the matching phylum branches taken from ``phyla_tree``.

    Raises
    ------
    ValueError
        If a phylum has no branch in ``phyla_tree``.
    """
    kingdoms = taxonomic_queryset.kingdoms
    phyla = taxonomic_queryset.phyla
    TreeOfLife = Tree(name='Life')
    logger.info("[gbif.buildtree] Collapsing Kingdoms")
    for kingdom in kingdoms:
        kingdom_id = kingdom['kingdom_id']
        name = kingdom_id if only_id else kingdom['name']
        kingdomTree = Tree(name=name, support=kingdom['ab'])
        kingdomTree.add_feature('kingdom_id', kingdom_id)
        kingdomTree.add_feature('level', 'kingdom')
        kingdomTree.add_feature('points', kingdom['points'])
        for phylum in phyla.filter(parent_id__exact=kingdom_id):
            id_p = phylum['phylum_id']
            # Explicit first-match lookup instead of reduce()/filter(),
            # which failed unless exactly one branch matched.
            branch = next(
                (child for child in phyla_tree.get_children()
                 if child.phylum_id == id_p),
                None)
            if branch is None:
                raise ValueError(
                    "no branch with phylum_id %r in phyla_tree" % (id_p,))
            # Attach the branch to the kingdom tree
            kingdomTree.add_child(child=branch)
        TreeOfLife.add_child(child=kingdomTree)
    return TreeOfLife
def neighbor_joining(D, tree, internals):
    """Join the closest pair of active nodes repeatedly until two remain.

    Args:
        D (np.array): pairwise differences between samples based on PLs
            (not mutations). It is written to in place: rows/columns for
            each new internal node are filled in, so callers should pass a
            copy. presumably D is pre-sized to hold the internal nodes --
            TODO confirm against caller.
        tree (Tree): tree whose tips are named with the sample numbers
            (the nodes are looked up by ``str(index)``).
        internals (np.array): indices into D of the currently active nodes
            (initially the sample numbers). Only rebound locally.

    Returns:
        tuple: ``(D, tree)`` -- the updated distance matrix and the tree
        with the joined nodes attached.

    Progress is reported on stderr (one '.' per join).
    """
    # NOTE: math.fsum would give better precision when adding distances
    # across sites.
    print('neighbor_joining() begin', end=' ', file=sys.stderr)
    m = len(internals)
    while m > 2:  # when m is 2 only two nodes remain connected to the root
        # Distances restricted to the currently active nodes.
        d = D[internals[:,None],internals]
        # Row sums scaled by (m - 2).
        u = d.sum(axis=1)/(m-2)
        Q = np.zeros(shape=(m,m), dtype=np.longdouble)
        for i,j in itertools.combinations(xrange(m),2):  # standard Q matrix calculation
            Q[i,j] = d[i,j]-u[i]-u[j]
            Q[j,i] = Q[i,j]
        # The diagonal must never win the argmin below.
        np.fill_diagonal(Q, np.inf)
        # Position (within the active set) of the smallest Q value, i.e. the
        # pair to join.
        i,j = np.unravel_index(Q.argmin(), (m,m))
        # Index in D used for the new internal node.
        l = len(D)+2-m
        for k in xrange(m):
            # NOTE(review): textbook neighbor joining halves this value --
            # confirm the missing /2 is intended.
            D[l,internals[k]] = D[internals[k],l] = d[i,k]+d[j,k]-d[i,j]
        # Overwrite the entries for the joined pair with their branch lengths.
        D[l,internals[i]] = D[internals[i],l] = vi = (d[i,j]+u[i]-u[j])/2
        D[l,internals[j]] = D[internals[j],l] = vj = (d[i,j]+u[j]-u[i])/2
        # presumably `tree & name` is ETE's lookup-by-name operator -- verify.
        ci = tree&str(internals[i])
        cj = tree&str(internals[j])
        ci.detach()
        cj.detach()
        node = Tree(name=str(l))
        # Branch lengths are truncated to integers.
        node.add_child(ci,dist=int(vi))
        node.add_child(cj,dist=int(vj))
        tree.add_child(node)
        # Replace the joined pair by the new node in the active set.
        internals = np.delete(internals, [i,j])
        internals = np.append(internals, l)
        m = len(internals)
        print('.', end='', file=sys.stderr)
    print(' done', file=sys.stderr)
    return D,tree
def treeFromQuartet(quartet):
    """Build the rooted tree ((q0, q1), (q2, q3)) for a four-item quartet.

    The root is named "root" and its two children "Left" and "Right";
    every descendant gets branch length 0.
    """
    root = Tree()
    root.name = "root"
    sides = (
        ("Left", quartet[0], quartet[1]),
        ("Right", quartet[2], quartet[3]),
    )
    for side_name, first, second in sides:
        side = root.add_child(name=side_name)
        side.add_child(name=first)
        side.add_child(name=second)
    for descendant in root.iter_descendants():
        descendant.dist = 0
    return root
def getFamilies(taxonomic_queryset, genera_tree, only_id=False):
    """Collapse the genera of a spatial queryset into a family-level tree.

    Parameters
    ----------
    taxonomic_queryset : gbif.models / GeoquerySet
        Must expose ``families`` (iterable of mapping-like rows) and
        ``genera`` (supports ``filter(parent_id__exact=...)``).
    genera_tree : Tree
        Tree derived from getGenera; its direct children must carry a
        ``genus_id`` attribute.
    only_id : bool
        False (default): family nodes are named with the family name.
        True: they are named with the family id, which is lighter on
        memory for big data sets.

    Returns
    -------
    Tree named ``'order_root'`` with one child per family. Family nodes
    carry the features ``abundance``, ``id``, ``parent_id``,
    ``family_id``, ``level`` and ``points`` and hold the matching genus
    branches taken from ``genera_tree``.

    Raises
    ------
    ValueError
        If a genus has no branch in ``genera_tree``.
    """
    families = taxonomic_queryset.families
    genera = taxonomic_queryset.genera
    orders_tree = Tree(name='order_root')
    for family in families:
        order_id = family['parent_id']
        family_id = family['family_id']
        name = family_id if only_id else family['name']
        ab = family['ab']
        famTree = Tree(name=name, support=ab)
        famTree.add_feature('abundance', ab)
        famTree.add_feature('id', family_id)
        famTree.add_feature('parent_id', order_id)
        famTree.add_feature('family_id', family_id)
        famTree.add_feature('level', 'family')
        famTree.add_feature('points', family['points'])
        for genus in genera.filter(parent_id__exact=family_id):
            id_g = genus['genus_id']
            # First branch with this genus id; replaces a reduce()/filter()
            # construct that failed unless exactly one branch matched.
            branch = next(
                (child for child in genera_tree.get_children()
                 if child.genus_id == id_g),
                None)
            if branch is None:
                raise ValueError(
                    "no branch with genus_id %r in genera_tree" % (id_g,))
            # Attach the branch to the family tree
            famTree.add_child(child=branch)
        orders_tree.add_child(child=famTree)
    return orders_tree
def getFamilies(taxonomic_queryset, genera_tree, only_id=False):
    """Collapse the genera of a spatial queryset into a family-level tree.

    Parameters
    ----------
    taxonomic_queryset : gbif.models / GeoquerySet
        Must expose ``families`` (iterable of mapping-like rows) and
        ``genera`` (supports ``filter(parent_id__exact=...)``).
    genera_tree : Tree
        Tree derived from getGenera; its direct children must carry a
        ``genus_id`` attribute.
    only_id : bool
        False (default): family nodes are named with the family name.
        True: they are named with the family id, which is lighter on
        memory for big data sets.

    Returns
    -------
    Tree named ``'order_root'`` with one child per family; each family
    node holds the matching genus branches taken from ``genera_tree``.

    Raises
    ------
    ValueError
        If a genus has no branch in ``genera_tree``.
    """
    families = taxonomic_queryset.families
    genera = taxonomic_queryset.genera
    orders_tree = Tree(name='order_root')
    for family in families:
        family_id = family['family_id']
        name = family_id if only_id else family['name']
        famTree = Tree(name=name, support=family['ab'])
        famTree.add_feature('family_id', family_id)
        famTree.add_feature('level', 'family')
        famTree.add_feature('points', family['points'])
        for genus in genera.filter(parent_id__exact=family_id):
            id_g = genus['genus_id']
            # Explicit first-match lookup instead of reduce()/filter(),
            # which failed unless exactly one branch matched.
            branch = next(
                (child for child in genera_tree.get_children()
                 if child.genus_id == id_g),
                None)
            if branch is None:
                raise ValueError(
                    "no branch with genus_id %r in genera_tree" % (id_g,))
            # Attach the branch to the family tree
            famTree.add_child(child=branch)
        orders_tree.add_child(child=famTree)
    return orders_tree
def getPhyla(taxonomic_queryset, classes_tree, only_id=False):
    """Collapse the classes of a spatial queryset into a phylum-level tree.

    Parameters
    ----------
    taxonomic_queryset : gbif.models / GeoquerySet
        Must expose ``phyla`` (iterable of mapping-like rows) and
        ``classes`` (supports ``filter(parent_id__exact=...)``).
    classes_tree : Tree
        Tree derived from getClasses; its direct children must carry a
        ``class_id`` attribute.
    only_id : bool
        False (default): phylum nodes are named with the phylum name.
        True: they are named with the phylum id, which is lighter on
        memory for big data sets.

    Returns
    -------
    Tree named ``'kingdom_root'`` with one child per phylum; each phylum
    node holds the matching class branches taken from ``classes_tree``.

    Raises
    ------
    ValueError
        If a class has no branch in ``classes_tree``.
    """
    phyla = taxonomic_queryset.phyla
    classes = taxonomic_queryset.classes
    kingdomTree = Tree(name='kingdom_root')
    logger.info("[gbif.buildtree] Collapsing Phyla")
    for phylum in phyla:
        phylum_id = phylum['phylum_id']
        name = phylum_id if only_id else phylum['name']
        phylumTree = Tree(name=name, support=phylum['ab'])
        phylumTree.add_feature('phylum_id', phylum_id)
        phylumTree.add_feature('level', 'phylum')
        phylumTree.add_feature('points', phylum['points'])
        for class_ in classes.filter(parent_id__exact=phylum_id):
            id_c = class_['class_id']
            # First branch with this class id; replaces a reduce()/filter()
            # construct that failed unless exactly one branch matched.
            branch = next(
                (child for child in classes_tree.get_children()
                 if child.class_id == id_c),
                None)
            if branch is None:
                raise ValueError(
                    "no branch with class_id %r in classes_tree" % (id_c,))
            # Attach the branch to the phylum tree
            phylumTree.add_child(child=branch)
        kingdomTree.add_child(child=phylumTree)
    return kingdomTree
def init_star_tree(n):
    """Build a star tree: one root with n children named "0" .. "n-1".

    Args:
        n (int): Number of children to attach to the root.

    Returns:
        Tree: the root of the star tree.
    """
    star = Tree()
    for label in map(str, xrange(n)):
        star.add_child(name=label)
    return star
def generateTree(tree, wordlist):
    """Turn a sequence of merge records into a ``Tree`` and print it.

    For record number ``k`` (0-based) a node named ``str(-(k + 1))`` is
    created. The text of the record before its first ':' is evaluated to
    get the merged members: a member >= 0 is an index into ``wordlist``
    and becomes a new leaf; any other member ``m`` refers to the node
    built for record ``-m - 1``.

    Returns the node built for the last record, after printing its ASCII
    rendering.
    """
    built = []
    for idx in range(len(tree)):
        merged = Tree()
        merged.name = str(-(idx + 1))
        # NOTE(review): eval() runs arbitrary code; this is only safe if
        # `tree` comes from a trusted source.
        members = eval(str(tree[idx]).split(':')[0])
        for member in members:
            if member >= 0:
                leaf = Tree()
                leaf.name = wordlist[member]
                merged.add_child(leaf)
            else:
                merged.add_child(built[int(-member) - 1])
        built.append(merged)
    final = built[-1]
    print(final.get_ascii(show_internal=True))
    return final
def add_taxa(tree, new_taxa, taxa_in_clade, level):
    """Graft a subtree built from *new_taxa* into *tree*; return Newick text.

    The subtree comes from ``tree_from_taxonomy(level, new_taxa)``. It is
    attached at the MRCA that ``stk.get_mrca`` reports for
    *taxa_in_clade*. When that MRCA is 0, a new root is created with the
    new subtree on one side and the whole original tree on the other.
    """
    # create new tree of the new taxa
    extra_newick = tree_from_taxonomy(level, new_taxa)

    # find mrca parent
    parsed = stk._parse_tree(tree)
    mrca_index = stk.get_mrca(tree, taxa_in_clade)

    if mrca_index == 0:
        # The additional taxa are being placed at the root, so a new tree
        # is needed.
        joined = Tree()
        left = joined.add_child()
        right = joined.add_child()
        extra_side = Tree(extra_newick)
        original_side = Tree(tree)
        left.add_child(extra_side)
        right.add_child(original_side)
        return joined.write(format=9)

    anchor = parsed.nodes[mrca_index]
    extra_tree = stk._parse_tree(extra_newick)
    if len(taxa_in_clade) == 1:
        # With a single taxon, insert a node between it and the MRCA and
        # attach there instead.
        taxon = parsed.node(taxa_in_clade[0])
        anchor = parsed.addNodeBetweenNodes(taxon, anchor)
    # add the new tree at the chosen node
    parsed.addSubTree(anchor, extra_tree, ignoreRootAssert=True)
    return parsed.writeNewick(fName=None, toString=True).strip()
def trees():
    """Download Glottolog family trees and write one NEXUS file per family.

    The list of family tree URLs is read from the feed at
    ``GLOTTOLOG_FAMILIES`` (every ``entry`` gives a title and an id), plus
    a fixed URL for the 'global' tree. Existing files ending in ``SUFFIX``
    in ``../trees`` are removed first. Each tree that passes
    ``clean_tree`` is written as a NEXUS file with a taxa block and a
    trees block.
    """
    outdir = os.path.join('..', 'trees')
    urls = {'global': 'http://glottolog.org/static/trees/tree-glottolog-newick.txt'}
    # Family title -> URL taken from the entry id.
    for entry in bs(requests.get(GLOTTOLOG_FAMILIES).text, 'html.parser').find_all('entry'):
        urls[entry.find('title').text] = entry.find('id').text
    # Remove the output of earlier runs.
    for fname in os.listdir(outdir):
        if fname.endswith(SUFFIX):
            os.remove(os.path.join(outdir, fname))
    for family in sorted(urls):
        url = urls[family]
        if not url.endswith('newick.txt'):
            url += '.newick.txt'
        # The global tree is written to 'global.trees', every other family
        # to '<family><SUFFIX>'.
        filename = os.path.join(outdir, (family + SUFFIX) if family != 'global' else family + '.trees')
        print("%30s <- %s" % (family, url))
        newick = requests.get(url).text.encode('utf-8')
        if family == 'global':
            # The global download holds one newick string per ';\n'-separated
            # chunk; they are collected under one new root.
            tree = Tree()
            for n in newick.split(';\n'):
                subtree = Tree(clean_newick(n + ';'), format=3)
                nodenames = [_n.name for _n in subtree.traverse()]
                # Exactly one repeated node name is treated as an isolate.
                if len(nodenames) == len(set(nodenames)) + 1:
                    # FIXME: we must include isolates!
                    # just add single child?
                    tree.add_child(child=Tree(name=subtree.name), dist=1.0)
                    print 'adding isolate', subtree.name
                else:
                    tree.add_child(child=subtree, dist=1.0)
        else:
            tree = Tree(clean_newick(newick), format=3)
        # presumably clean_tree prunes the tree in place and reports whether
        # anything is left -- verify against its definition.
        if clean_tree(tree):
            newick_string = str(tree.write(format=3))
            with codecs.open(filename, 'w', encoding="utf-8") as handle:
                handle.write("#NEXUS\nBegin taxa;\n")
                # write taxa to file: every traversed node whose name
                # occurs in the newick string
                for leaf in tree.traverse():
                    if str(leaf.name) in newick_string:
                        handle.write(leaf.name)
                        handle.write("\n")
                handle.write(";\nend;")
                # write newick string to file
                handle.write("\nBegin trees;\ntree UNTITLED = ")
                handle.write(newick_string)
                handle.write("\nend;")
def tree_generation(entities):
    """Build and display a chain of word suffixes for every entity.

    Each entity is split on runs of whitespace and hyphens. Starting from
    an empty root, a child is added for the last word, below it a child
    for the last two words, and so on up to the full phrase.
    """
    for entity in entities:
        words = split(r'[\s-]+', entity)
        cursor = Tree()
        # Walk the suffixes from shortest (last word) to longest.
        for start in range(len(words) - 1, -1, -1):
            cursor = cursor.add_child(name=' '.join(words[start:]))
        # NOTE(review): cursor is the deepest node here, so only that node
        # is shown rather than the whole chain -- confirm this is intended.
        print(cursor.show())
def quartetPuzzling(self, steps):
    """Build `steps` puzzling trees and return their consensus.

    With fewer than four sequences a star tree over all sequence ids is
    returned directly. Otherwise every step shuffles the ids, starts from
    the stored topology of the first four and inserts the remaining ids
    one at a time on the edge that ``tree_utils.findShortestEdge`` picks.

    Args:
        steps: number of puzzling steps, i.e. trees fed to the consensus.

    Returns:
        A star ``Tree`` for fewer than four sequences, otherwise the
        result of ``tree_utils.findConsensusTree``.
    """
    # NOTE(review): shuffle() below needs a list; on Python 3 keys() is a
    # view -- confirm the target interpreter.
    seq_ids = self._sequencesDict.keys()
    if len(seq_ids) < 4:
        tree = Tree()
        for seq_id in seq_ids:
            tree.add_child(name=seq_id)
        return tree
    trees = []
    for step in range(steps):
        shuffle(seq_ids)
        first_quartet = self._optimalQuartets[self.getQuartetID(
            seq_ids[0:4])]["topology"]
        rooted_tree = self.treeFromQuartet(first_quartet)
        # Drop the artificial root: hang the second child under the first.
        tree = rooted_tree.children[0]
        tree.add_child(rooted_tree.children[1])
        for i in range(4, len(seq_ids)):
            # Reset every edge to 0 before costs are accumulated.
            tree_utils.initEdgeLengths(tree, 0)
            # All quartets made of three already placed ids plus the new one.
            quartets = []
            for triplet in combination_utils.combinationsGenerator(
                    seq_ids[0:i], 3):
                triplet.append(seq_ids[i])
                quartets.append(tuple(triplet))
            qt_topos_found = set()
            for quartet in quartets:
                optimal_qt_topo_id = self._optimalQuartets[
                    self.getQuartetID(quartet)]["topology_id"]
                qt_topo_id = self.getTopologyID(quartet)
                # Each matching topology id is counted once per inserted id.
                if qt_topo_id == optimal_qt_topo_id and qt_topo_id not in qt_topos_found:
                    qt_topos_found.add(qt_topo_id)
                    self.increaseCostOnPath(tree, quartet[0], quartet[1])
            # choose edge with minimum cost, delete it and add new leaf seq_ids[i]
            shortest_edge = tree_utils.findShortestEdge(tree)
            new_node = Tree()
            new_node.add_child(name=seq_ids[i])
            # Splice new_node into the chosen edge: shortest_edge[1] is
            # detached and re-attached below new_node.
            detached = shortest_edge[1].detach()
            shortest_edge[0].add_child(new_node)
            new_node.add_child(detached)
        tree_utils.initEdgeLengths(tree, 1)
        trees.append(tree)
    # find consensus tree
    return tree_utils.findConsensusTree(trees)
def load_label_parentidx_as_tree(label_parentidx, labels):
    """Build a tree from labels and their parent indices.

    ``label_parentidx[label]`` is the position in ``labels`` of the
    label's parent, or -1 to hang the label directly under the root.
    A parent must appear in ``labels`` before its children.

    Returns the root node, which is named 'root'.
    """
    root = Tree()
    root.name = 'root'
    node_by_label = {}
    for label in labels:
        parent_index = label_parentidx[label]
        if parent_index == -1:
            attach_to = root
        else:
            attach_to = node_by_label[labels[parent_index]]
        node_by_label[label] = attach_to.add_child(name=label)
    return root
def get_gtr_tree(self):
    """Read ``self.gtrfile`` and store the resulting tree in ``self.gtr_tree``.

    The file is read as tab-delimited text, one ``GTRLine`` per row.
    Children found in ``self.gnames`` become leaves at distance
    ``1 - dist``; other children must be parents of earlier rows and are
    attached with the difference of the two rows' dist values. The node
    built from the last row is stored.

    Raises:
        ValueError: if the file has no rows.
        AssertionError: if a child is neither in ``self.gnames`` nor an
            earlier parent.
    """
    from ete2 import Tree

    nodes = {}
    gnames = dict(self.gnames)
    node = None
    # The context manager closes the handle; it used to be left open.
    with open(self.gtrfile) as fp:
        reader = csv.reader(fp, delimiter="\t")
        for g in map(GTRLine._make, reader):
            node = Tree()
            parent_name, parent_dist = g.parent, float(g.dist)
            for child in (g.left_child, g.right_child):
                if child in gnames:
                    node.add_child(name=gnames[child], dist=1 - parent_dist)
                else:
                    assert child in nodes, child
                    child_node, child_dist = nodes[child]
                    node.add_child(child_node, dist=child_dist - parent_dist)
            nodes[parent_name] = (node, parent_dist)

    if node is None:
        raise ValueError("no merge rows found in %s" % self.gtrfile)
    self.gtr_tree = node
def neighbor_joining(D, tree, internals):
    """Join the closest pair of active nodes repeatedly until two remain.

    Args:
        D (np.array): pairwise distance matrix. It is written to in place:
            rows/columns for each new internal node are filled in.
            presumably D is pre-sized to hold the internal nodes -- TODO
            confirm against caller.
        tree (Tree): tree whose tips are named with the node indices
            (the nodes are looked up by ``str(index)``).
        internals (np.array): indices into D of the currently active nodes.
            Only rebound locally.

    Returns:
        Tree: ``tree`` with the joined nodes attached. Unlike D, which is
        updated in place, nothing else is returned.

    Progress is reported on stderr (one '.' per join).
    """
    print('neighbor_joining() begin', end=' ', file=sys.stderr)
    m = len(internals)
    while m > 2:
        # Distances restricted to the currently active nodes.
        d = D[internals[:,None],internals]
        # Row sums scaled by (m - 2).
        u = d.sum(axis=1)/(m-2)
        Q = np.zeros(shape=(m,m), dtype=np.longdouble)
        for i,j in itertools.combinations(xrange(m),2):
            Q[i,j] = d[i,j]-u[i]-u[j]
            Q[j,i] = Q[i,j]
        # The diagonal must never win the argmin below.
        np.fill_diagonal(Q, np.inf)
        # Position (within the active set) of the pair to join.
        i,j = np.unravel_index(Q.argmin(), (m,m))
        # Index in D used for the new internal node.
        l = len(D)+2-m
        for k in xrange(m):
            # NOTE(review): textbook neighbor joining halves this value --
            # confirm the missing /2 is intended.
            D[l,internals[k]] = D[internals[k],l] = d[i,k]+d[j,k]-d[i,j]
        # Overwrite the entries for the joined pair with their branch lengths.
        D[l,internals[i]] = D[internals[i],l] = vi = (d[i,j]+u[i]-u[j])/2
        D[l,internals[j]] = D[internals[j],l] = vj = (d[i,j]+u[j]-u[i])/2
        # presumably `tree & name` is ETE's lookup-by-name operator -- verify.
        ci = tree&str(internals[i])
        cj = tree&str(internals[j])
        ci.detach()
        cj.detach()
        node = Tree(name=str(l))
        # Branch lengths are truncated to integers.
        node.add_child(ci,dist=int(vi))
        node.add_child(cj,dist=int(vj))
        tree.add_child(node)
        # Replace the joined pair by the new node in the active set.
        internals = np.delete(internals, [i,j])
        internals = np.append(internals, l)
        m = len(internals)
        print('.', end='', file=sys.stderr)
    print(' done', file=sys.stderr)
    return tree
def load_label_tree(noffset_parentidx, noffsets):
    """Build a label tree rooted at the WordNet synset 'physical_entity.n.01'.

    ``noffset_parentidx[noffset]`` is the position in ``noffsets`` of the
    entry's parent, or -1 to attach it directly to the root. A parent must
    appear in ``noffsets`` before its children.

    Returns:
        tuple ``(prune_root(root), noffset_node)`` where ``noffset_node``
        maps every noffset to its tree node.
    """
    root_synset = wn.synset('physical_entity.n.01')
    root = Tree()
    root.name = root_synset.name()
    root.add_feature('synset', root_synset)

    noffset_node = {}
    for noffset in noffsets:
        parent_index = noffset_parentidx[noffset]
        if parent_index == -1:
            attach_to = root
        else:
            attach_to = noffset_node[noffsets[parent_index]]
        noffset_node[noffset] = attach_to.add_child(name=noffset)
    return prune_root(root), noffset_node
def normalize_ranks(t, ranks):
    """Rebuild *t* so that every leaf sits below exactly the given ranks.

    For each leaf of *t* the ancestors whose ``rank`` is in *ranks* give
    the lineage; missing ranks are filled with ``'No <rank>'`` (taxid
    ``'No taxid'``), except a missing species, which takes the leaf's
    name. One node per distinct lineage prefix is created in the new tree;
    each node carries the ``rank``, ``lineage`` (names joined with ':')
    and ``taxid`` features.

    Args:
        t: tree whose nodes have ``rank``, ``scientific_name`` and
            ``taxid`` attributes.
        ranks: ordered sequence of rank names, highest rank first.

    Returns:
        The new normalized ``Tree``.

    Raises:
        RuntimeError: if the parent lineage of a node is missing from the
            new tree or matches more than one node.
    """
    normtree = Tree()
    for l in t.get_leaves():
        lineage = {}
        taxids = {}
        # Collect the ancestors that sit at one of the requested ranks.
        parent = l.up
        while parent:
            if parent.rank in ranks:
                lineage[parent.rank] = parent.scientific_name
                taxids[parent.rank] = parent.taxid
            parent = parent.up
        # Placeholders for ranks the leaf has no ancestor at.
        for rank in ranks:
            if rank not in lineage:
                lineage[rank] = 'No ' + rank
                taxids[rank] = 'No taxid'

        for i, rank in enumerate(ranks):
            if i == 0:
                # The top rank hangs directly under the root.
                if not normtree.search_nodes(name=lineage[rank], rank=rank):
                    child = normtree.add_child(name=lineage[rank])
                    child.add_feature('rank', rank)
                    child.add_feature('lineage', lineage[rank])
                    child.add_feature('taxid', taxids[rank])
            else:
                if rank == 'species' and lineage[rank] == 'No species':
                    lineage['species'] = l.name
                parent_node_lineage = ':'.join([lineage[x] for x in ranks[:i]])
                node_lineage = ':'.join([lineage[x] for x in ranks[:i + 1]])
                parent = normtree.search_nodes(lineage=parent_node_lineage)
                if not parent or len(parent) > 1:
                    # This was a bare ``raise`` with no active exception.
                    raise RuntimeError(
                        "expected exactly one node with lineage %r, found %d"
                        % (parent_node_lineage, len(parent)))
                if not normtree.search_nodes(name=lineage[rank], lineage=node_lineage):
                    child = parent[0].add_child(name=lineage[rank])
                    child.add_feature('rank', rank)
                    child.add_feature('lineage', node_lineage)
                    child.add_feature('taxid', taxids[rank])
                    if rank == 'species':
                        child.scientific_name = l.scientific_name
                        child.taxid = l.taxid
    return normtree
def main():
    """Render the authenticated user's friends as a tree in ``mytree.png``.

    The tree root has one child named after the authenticated account;
    every friend becomes a child of that node and is expanded further by
    ``getFriends``.
    """
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.secure = True
    auth.set_access_token(token_key, secret_key)
    api = tweepy.API(auth)

    tree = Tree()
    me_node = tree.add_child(name=api.me().name)
    for friend in api.me().friends():
        friend_node = me_node.add_child(name=getTwitterAccountName(friend))
        getFriends(friend, friend_node, 0)
    tree.render("mytree.png", w=183, units="mm")
def main(args={}):
  """Entry point of show_syntheny.py: draw the genomic surroundings of genes of interest.

  Steps, in order: load the options; (legend mode only) draw a legend and exit;
  load the gff files and the homology table; remove overlapping genes; build a
  cluster of neighbouring genes around every gene of interest; optionally drop
  singlets and merge clusters that share genes; assign a color to every gene;
  write the optional output tables; build one tree leaf per cluster and either
  render it to opt['out'] or open the interactive viewer.

  args -- dictionary of options. When it is empty (the default) the options are
          obtained from command_line(); otherwise it is used as-is.

  Side effects: sets the globals opt and temp_folder, writes progress messages,
  and calls sys.exit() at the end of legend mode.

  NOTE(review): the default value of args is a mutable dict shared across calls;
  it is only read here, never modified.
  """
  #########################################################
  ############ loading options
  global opt
  if not args: opt=command_line(def_opt, help_msg, 'iaf', synonyms=command_line_synonyms, strict=1)
  else: opt=args
  set_MMlib_var('opt', opt)
  global temp_folder; temp_folder=Folder(random_folder(opt['temp'])); test_writeable_folder(temp_folder, 'temp_folder'); set_MMlib_var('temp_folder', temp_folder)
  write("#=============------ "); write(" show_syntheny.py ", how='reverse,bright'); write(" -------=============#", 1)
  ## basic graphics options
  face_width= opt['w']; font_size= opt['fs']
  ## defining a function that, given a gene, decides what is printed in its face.
  ## get_text reads geneid2family, which is only assigned further down (closure, resolved at call time)
  if not opt['m']:
    def get_text(g):
      if g.id in geneid2family: return geneid2family[g.id]+':'+g.id
      else: return '-'+':'+g.id
  else:
    # with option -m the face width is forced to 30 and only the family name is shown
    face_width=30
    def get_text(g):
      if g.id in geneid2family: return geneid2family[g.id]
      else: return ''
  tree_style=TreeStyle(); tree_style.show_leaf_name=False; tree_style.scale=1; tree_style.show_scale=False
  node_style=NodeStyle(); node_style["size"] = 0 #; node_style["fgcolor"] = "darkred"
  node_style_grey=NodeStyle(); node_style_grey["size"] = 0; node_style_grey["bgcolor"] = "lightgrey"
  tree=Tree(); tree.dist=0.0; tree.set_style(node_style)
  ############################## legend mode only: start
  if opt['legend']: ### totally different program in this case
    # each line of the legend file is tab-separated: title, four colors (space-separated), text, description
    for line_index, line in enumerate(open(opt['legend'])):
      try:
        bkg_color='white' if line_index%2 else 'lightgrey'
        splt=line.strip().split('\t')
        # NOTE(review): str.split('\t') never returns an empty list, so this guard never triggers
        if not splt: continue
        leaf=tree.add_child(name='', dist=0.0); leaf.set_style(node_style) if line_index%2 else leaf.set_style(node_style_grey)
        g=gene(strand='+'); g.color, g.color_outline, g.color_box_bkg, g.color_box_line=[x if x!='None' else None for x in splt[1].split()]; g.text = replace(splt[2], '\\n', '\n')
        title_face=faces.TextFace(splt[0], fsize=font_size); title_face.margin_left=5; leaf.add_face(title_face, 0, 'branch-right' ) #title left
        arrow_face=syntheny_view([g], printed={'boundaries': 0, 'text':1, 'id':0}, pen_size=4, font_size=font_size, width=face_width)[0]; leaf.add_face(arrow_face, 1, 'branch-right' )
        for desc_line_index, desc_line in enumerate(splt[3].split('\\n')):
          desc_face=faces.TextFace(desc_line, fsize=font_size); desc_face.background.color = bkg_color; leaf.add_face(desc_face, 2, 'branch-right' ) #desc_face.margin_down=3; desc_face.margin_up=3;
      # the offending line is reported, then the original exception is re-raised
      except: printerr('-legend ERROR parsing this line: |n{0}'.format(line), 1); raise
    write("Legend mode: {0} entries found. ".format(len(tree)), 1)
    if opt['out']: write('--> writing output file: {0}'.format(opt['out']), 1); tree.render(opt['out'], tree_style=tree_style)
    else: write('-- opening interactive ETE2 environment (PyQt4) -- ', 1); tree.show(tree_style=tree_style)
    sys.exit()
  ############################## legend mode only: over
  #### checking input
  input_gff_file=opt['i']; check_file_presence(input_gff_file, 'input_gff_file', notracebackException )
  annotation_gff_file=opt['a']; check_file_presence(annotation_gff_file, 'annotation_gff_file', notracebackException )
  homology_file=opt['f']; check_file_presence(homology_file, 'homology_file', notracebackException )
  # printing for pretty out
  write('# Input gff file= {0:<30} (genes of interest)'.format(input_gff_file), 1)
  write('# Annotation gff file= {0:<30} (all genes)'.format(annotation_gff_file), 1)
  write('# Homology tsv file= {0:<30} (gene families)'.format(homology_file), 1)
  non_def_options_str=join([ '# -{0} {1}\n'.format(k, opt[k]) for k in opt if k in def_opt and def_opt[k] != opt[k] and not k in 'iaf' ], '')
  if non_def_options_str: write('### Non-default options:\n'+non_def_options_str)
  write('', 1)
  # checking output options: a value of 1 means the option was given without an argument
  for x in ['of', 'oc', 'ocf', 'ocg']:
    # NOTE(review): the message contains {0} but .format(x) is never applied, so the option name is not shown
    if opt[x] and opt[x]==1: raise notracebackException, "ERROR option -{0} must be provided with an argument (which will be used as output file)!"
  #######
  ### processing options controlling colors
  colors_already_taken={} # useful for later, when we compute available_colors. Keys are the four colors joined by ','
  color_genes_of_interest=[None, None, None, None]
  if opt['ci']:
    for index, color in enumerate( opt['ci'].split(',') ):
      if color=='None': color=None
      color_genes_of_interest[index]=color
    colors_already_taken[ join( map(str, color_genes_of_interest), ',') ]=1
  color_singlets=[None, None, None, None]
  if opt['cs']:
    for index, color in enumerate( opt['cs'].split(',') ):
      if color=='None': color=None
      color_singlets[index]=color
    colors_already_taken[ join( map(str, color_singlets), ',') ]=1
  fam2color={} ## each color is a list [fill, outline, box_bkg, box_outline] if not defined, it's None
  if opt['cf']: ## load color-family file: first field is the family (or gene id), the following fields are its colors
    for line in open( opt['cf'] ):
      splt=line.strip().split()
      if splt: #skipping empty lines
        fam=splt[0]; the_colors=[None, None, None, None]
        for index, item in enumerate(splt[1:]):
          if item=='None': item=None
          the_colors[index]=item
        fam2color[fam] = the_colors
        colors_already_taken[ join( map(str, the_colors), ',') ]=1
  color_file=opt['c']; check_file_presence(color_file, 'color_file', notracebackException )
  color_scheme=opt['cr'];
  if not color_scheme in [0, 1, 2, 3]: raise notracebackException, "ERROR invalid color scheme provided with option -cr ! see -help"
  individual_colors=[line.strip() for line in open(color_file) if line.strip()];
  # the color scheme is the number of extra color slots that are combined with the fill color;
  # combinations already reserved in colors_already_taken are left out
  if color_scheme==0: available_colors = [[a,None,None,None] for a in individual_colors if not a+',None,None,None' in colors_already_taken]
  elif color_scheme==1: available_colors = [[b, a,None,None] for a in individual_colors for b in individual_colors if not b+','+a+',None,None' in colors_already_taken]
  elif color_scheme==2: available_colors = [[c, b, a,None] for a in individual_colors for b in individual_colors for c in individual_colors if not (a==b==c) and not c+','+b+','+a+',None' in colors_already_taken]
  elif color_scheme==3: available_colors = [[d, c, b, a] for a in individual_colors for b in individual_colors for c in individual_colors for d in individual_colors if not (b==c==d) and not d+','+c+','+b+','+a in colors_already_taken]
  #write('available colors: {0}'.format(len(available_colors)), 1)
  if opt['rc']: random.shuffle(available_colors)
  ######
  ## loading gff input files
  # genes of interest
  input_get_id_function=None;
  # NOTE(review): the option text is passed to eval() to build the id function; only safe with trusted input
  if opt['if']: input_get_id_function=eval('lambda x:'+opt['if'])
  write('Loading genes of interest from {0:<30} ... '.format(input_gff_file))
  genes_of_interest=load_all_genes(input_gff_file, tag='*', get_id=input_get_id_function, is_sorted=True)
  for g_index, g in enumerate(genes_of_interest): g.is_of_interest=g_index+1 ### keeping this as a number so later we can sort output in the same order as input
  write('done. Genes: {0}'.format(len(genes_of_interest)), 1)
  # gene in global annotation
  annotation_get_id_function=None;
  # NOTE(review): same eval() caveat as for option -if above
  if opt['af']: annotation_get_id_function=eval('lambda x:'+opt['af'])
  annotation_tag=opt['at']
  write('Loading annotated genes from {0:<30} ... '.format(annotation_gff_file))
  annotated_genes=load_all_genes(annotation_gff_file, tag=annotation_tag, get_id=annotation_get_id_function)
  for a in annotated_genes: a.is_of_interest=False
  write('done. Genes: {0}'.format(len(annotated_genes)), 1)
  ######
  ## load homology file: two tab-separated fields per line, gene id and family
  geneid2family={}; families_dict={}
  write('Loading homology families from {0:<30} ... '.format(homology_file))
  for line in open(homology_file):
    splt=line.strip().split('\t')
    # NOTE(review): split('\t') always returns at least [''], so a blank line reaches the unpacking below and raises ValueError
    if splt: geneid, family = splt; geneid2family[geneid]=family; families_dict[family]=0
  write('done.', 1)
  ## print some stats: families_dict counts, per family, the annotated genes that belong to it
  for g in annotated_genes:
    if g.id in geneid2family: families_dict[geneid2family[g.id]]+=1
  n_fam_represented=0; n_genes_with_family=0
  for fam in families_dict:
    if families_dict[fam]>0: n_fam_represented+=1; n_genes_with_family+=families_dict[fam]
  write('N of families: {0} ; {1} families have 1 or more gene(s) found in annotation.\nA total of {2} genes have a family assigned.\n'.format(len(families_dict),n_fam_represented, n_genes_with_family ), 1)
  del families_dict; #saving memory (almost a joke)
  family2genes_displayed={} ### later we'll modify geneid2family to avoid displaying useless families
  ## families or genes in the annotation to be ignored (one id per line in the -rf file)
  fams_to_ignore={}
  if opt['rf']:
    check_file_presence(opt['rf'], '-rf file')
    for line in open(opt['rf']):
      strp=line.strip()
      if strp: fams_to_ignore[strp]=True
  ############################## start doing things!
  ## finding overlaps
  # genes of interest get a large bonus, so they outscore any annotated gene they overlap with
  def scoring_function_for_overlaps(g): return int (g.is_of_interest)* 10000000 + g.length()
  removed_overlapping_genes=[]
  non_red_genes = remove_overlapping_gene_clusters( genes_of_interest + annotated_genes, scoring=scoring_function_for_overlaps, phase=True, strand=True, out_removed_genes=removed_overlapping_genes, remember_overlaps=True )
  ### getting all discarded -> kept relationship, and back
  for g in removed_overlapping_genes:
    if not hasattr( g.overlapping, 'discarded'): g.overlapping.discarded=[]
    g.overlapping.discarded.append( g )
  for g in genes_of_interest:
    if hasattr( g, 'discarded'): #len(g.discarded)>1:
      for d in g.discarded: write(' Gene: {0:^25} removed overlapping gene: {1}'.format(g.id, d.id), 1)
  non_red_genes.sort( cmp=order_genes_for_chr_pos ) #sorting again... not optimized but easy
  ######
  ##############################
  ## building gene clusters to be displayed
  gene_clusters=[] # list of lists of genes; populating this while parsing the sorted list of genes and looking for the genes of interest.
  max_distance = opt['l']
  index=0
  while index < len(non_red_genes):
    if non_red_genes[index].is_of_interest:
      g=non_red_genes[index]
      verbose('*** Cluster of {0}'.format(g.id), 1)
      gc=gene_cluster(); gc.append(g); gc.link_to_gene(g)
      ## parsing back CAREFUL ASSUMING THERE ARE NO NESTED STRUCTURES WITH EXONS
      # extension stops at a chromosome change, or when the distance limit (-l) is exceeded,
      # or, if -n is set, when the count limit is reached. Ignored genes/families are skipped but still walked over
      other_index=index-1
      while other_index >= 0 and non_red_genes[other_index].chromosome == g.chromosome and \
            ( (not opt['n'] and abs( g.boundaries()[0] - non_red_genes[other_index].boundaries()[1] ) <= max_distance ) or \
              ( opt['n'] and len(gc)-1 <= opt['n'] ) ):
        if not non_red_genes[other_index].id in fams_to_ignore and not \
           (non_red_genes[other_index].id in geneid2family and geneid2family[non_red_genes[other_index].id] in fams_to_ignore):
          gc.insert(0, non_red_genes[other_index])
        other_index-=1
      n_genes_added_back=len(gc)-1
      #parsing forward
      other_index=index+1
      while other_index < len(non_red_genes) and non_red_genes[other_index].chromosome == g.chromosome and \
            ( (not opt['n'] and abs( non_red_genes[other_index].boundaries()[0] - g.boundaries()[1] ) <= max_distance ) or \
              ( opt['n'] and len(gc)-1-n_genes_added_back <= opt['n'] ) ):
        if not non_red_genes[other_index].id in fams_to_ignore and not \
           (non_red_genes[other_index].id in geneid2family and geneid2family[non_red_genes[other_index].id] in fams_to_ignore):
          gc.append(non_red_genes[other_index])
        other_index+=1
      for i in gc: verbose( i.gff(), 1)
      gene_clusters.append(gc)
    index+=1
  ## populating family2genes_displayed to compress family output: family -> {gene id: True} for displayed genes only
  for gc in gene_clusters:
    for g in gc:
      if g.id in geneid2family:
        fam=geneid2family[g.id]
        if not fam in family2genes_displayed: family2genes_displayed[fam]={}
        family2genes_displayed[fam][g.id]=True
  if opt['rs']: ## removing singlets
    n_singlets_removed=0
    for gc in gene_clusters:
      len_gc=len(gc)
      for i in range(len_gc):
        g_index= len_gc-i-1 #parsing in reverse order to make .pop() work
        g= gc[g_index]
        if not g.is_of_interest and ( not g.id in geneid2family or len(family2genes_displayed[ geneid2family[g.id] ])==1 ):
          # NOTE(review): when g.id is not in geneid2family the del below raises KeyError on geneid2family[g.id]
          gc.pop(g_index); n_singlets_removed+=1; del family2genes_displayed[ geneid2family[g.id] ]
    if n_singlets_removed: write('Option -rs: {0} singlets were removed! '.format(n_singlets_removed), 1)
  #### merging clusters that share at least one gene
  if not opt['dm']:
    ## since they are sorted, a cluster can share genes only with its previous or next cluster. Also, if we scan forward, we just check if the last gene in a cluster is contained in the next one
    gc_index=0
    while gc_index+1<len(gene_clusters): #+1 since, if it's the last one, it's not interesting
      current_gc=gene_clusters[gc_index]; next_gc=gene_clusters[gc_index+1]
      merged=False
      if current_gc[0].chromosome == next_gc[0].chromosome: # and current_gc[-1] in next_gc: --> in practice this is what we check. but let's do it more efficiently
        try:
          index_pos= next_gc.index(current_gc[-1]) # this cause an exception if not there
          merged=True #### Yes we're officially merging
          write('Merging the surrounds of {0:>25} and {1:<25}'.format(current_gc.ref_gene.id, next_gc.ref_gene.id), 1)
          for gc in next_gc[index_pos+1:]: current_gc.append( gc )
          # the new reference gene is the gene of interest closest to the middle of the merged cluster
          possible_ref_genes=[]
          for g_index, g in enumerate(current_gc):
            if g.is_of_interest: g.g_index=g_index; possible_ref_genes.append(g)
          middle_point = (len(current_gc)-1)/2.0
          best_ref_gene = min (possible_ref_genes, key= lambda x:abs(x.g_index-middle_point))
          current_gc.link_to_gene(best_ref_gene)
          gene_clusters.pop(gc_index+1) #removing next_gc
        except ValueError: pass
      # after a merge the same index is checked again against the new next cluster
      if not merged: gc_index+=1
  geneid2color={}
  ### parsing each single gene in each cluster. deciding COLORS
  for gc in gene_clusters:
    for g in gc:
      if g.id in fam2color: geneid2color[g.id]=fam2color[g.id] # color was specified in -cf using geneId
      elif g.id in geneid2family: ## this belongs to a family
        fam = geneid2family[g.id]
        if len(family2genes_displayed[fam]) == 1: geneid2color[g.id]= color_singlets ## singlet being only representative for its family
        else:
          if not fam in fam2color: # not yet assigned to this family
            try: fam2color[fam]=available_colors.pop(0)
            except IndexError: raise notracebackException, "ERROR not enough colors are available to display this! Increase the number of colors in the -c file or change the color scheme with -cr"
          geneid2color[g.id]= fam2color[fam]
      else: geneid2color[g.id]=color_singlets ## singlet that does not belong to any family
      if g.is_of_interest:
        geneid2color[g.id]= list(geneid2color[g.id]) ## copying list or otherwise we modify in place the color
        # every slot defined with -ci overrides the family color for genes of interest
        for index, color in enumerate(color_genes_of_interest):
          if not color is None: geneid2color[g.id][index]=color
      #write( g.id+' '+str(geneid2color[g.id]) +' '+ str(geneid2color), 1, how='green')
    #write('---', 1, how='reverse')
  ### now sorting gene_clusters so they are in the same order as the input file
  gene_clusters.sort(key=lambda x:x.ref_gene.is_of_interest)
  if opt['of']: ### producing an output file with a line for each family
    fh=open(opt['of'], 'w')
    for fam in family2genes_displayed: print >> fh, fam+'\t'+join( [gid for gid in family2genes_displayed[fam]], '\t')
    fh.close()
  if opt['oc']: ### producing an output file with a line for each gene cluster
    fh=open(opt['oc'], 'w')
    for gc in gene_clusters: print >> fh, join( [g.id for g in gc], '\t')
    fh.close()
  if opt['ocf']: ### output file with a line for each family and its colors
    fh=open(opt['ocf'], 'w')
    for fam in fam2color:
      if not fam in family2genes_displayed: continue #this is to skip cases in which -cf was provided with geneId instead of fam
      print >>fh, fam+'\t'+join(map(str, fam2color[fam]), '\t')
    fh.close()
  if opt['ocg']: ### output file with a line for each gene and its colors
    fh=open(opt['ocg'], 'w')
    for geneid in geneid2color: print >>fh, geneid+'\t'+join(map(str, geneid2color[geneid]), '\t')
    fh.close()
  ### preparing ete2 objects: one leaf per cluster, one aligned face per gene
  max_n_genes_in_a_cluster= max ([len(gc) for gc in gene_clusters])
  for gc in gene_clusters:
    name_displayed= gc[0].chromosome + ' : ' +gc.ref_gene.id
    name_displayed+=join([ '\n'+ ' & '+g.id for g in gc if g.is_of_interest and g != gc.ref_gene], '') #adding a line for other genes of interested merged in this cluster
    leaf=tree.add_child(name=name_displayed, dist=10); leaf.set_style(node_style)
    leaf_name_face= faces.TextFace(leaf.name, fsize=font_size); leaf_name_face.margin_left=5; leaf_name_face.margin_right=1; leaf.add_face( leaf_name_face, 0, 'aligned' )
    for g in gc:
      g.color, g.color_outline, g.color_box_bkg, g.color_box_line = geneid2color[g.id]
      g.text = get_text(g)
    # unless -ks is set, clusters whose reference gene is on the minus strand are drawn reversed
    reverse_syntheny_view = not opt['ks'] and gc.ref_gene.strand=='-'
    # modifying gc inplace to add whitespacers to keep it sortof centered
    while len(gc)<max_n_genes_in_a_cluster:
      if (len(gc) + int(reverse_syntheny_view))%2 : gc.append('')
      else: gc.insert(0, '')
    face_list= syntheny_view( gc, printed={'boundaries': int(not opt['m']), 'text':1, 'id':0}, pen_size=4, font_size=font_size, width=face_width, reverse=reverse_syntheny_view)
    for col_index, the_face in enumerate(face_list): leaf.add_face( the_face, col_index+1, 'aligned' )
  if opt['out']:
    write('--> writing output file: {0}'.format(opt['out']), 1)
    tree.render(opt['out'], tree_style=tree_style)
  else:
    write('-- opening interactive ETE2 environment (PyQt4) --- ', 1)
    tree.show(tree_style=tree_style)
  write('#====---- execution finished, exiting -----====#', 1)
def init_star_tree(n):
    """Return a star tree: a fresh root with ``n`` direct children.

    The children are named with the decimal strings ``"0"`` to ``str(n - 1)``,
    in that order.
    """
    star = Tree()
    for label in map(str, xrange(n)):
        star.add_child(name=label)
    return star
import random
import sys
# Put the parent directory first on the module search path before importing
# ete2 -- presumably so that the checkout in ../ is used instead of an
# installed copy (confirm against the repository layout).
sys.path.insert(0, "../")
from ete2 import Tree, TreeStyle, NodeStyle, PhyloTree
from ete2.treeview.faces import *
from ete2.treeview.main import random_color, _NODE_TYPE_CHECKER, FACE_POSITIONS
# The example modules imported below are looked up in ../examples/treeview.
sys.path.insert(0, "../examples/treeview")
import face_grid, bubble_map, item_faces, node_style, node_background, face_positions

# Container tree: every example tree is wrapped in a TreeFace and attached
# as an aligned face (column 0) of its own child of main_tree.
main_tree = Tree()
main_tree.dist = 0

# face_grid example
t, ts = face_grid.get_example_tree()
t_grid = TreeFace(t, ts)
n = main_tree.add_child()
n.add_face(t_grid, 0, "aligned")

# bubble_map example
t, ts = bubble_map.get_example_tree()
t_bubble = TreeFace(t, ts)
n = main_tree.add_child()
n.add_face(t_bubble, 0, "aligned")

# item_faces example
t, ts = item_faces.get_example_tree()
t_items = TreeFace(t, ts)
n = main_tree.add_child()
n.add_face(t_items, 0, "aligned")

# node_style example: the child is created here; nothing in this section
# attaches t_nodest to it yet.
t, ts = node_style.get_example_tree()
t_nodest = TreeFace(t, ts)
n = main_tree.add_child()
def consensus(trees,weights=[],lim=0): ''' returns weighted consensus tree 50% majority rule TODO: fix glup ''' if weights == []: weights = [1] * len (trees) dic = {} outgroup_name = Tree(trees[0]).get_leaf_names()[1] tlen = 0 for (tree, weight) in zip (trees, weights): tree = Tree(tree) if tlen == 0: tlen = len(tree) elif len (tree) != tlen: exit('ERROR: trees with different length') outgroup = tree.search_nodes(name=outgroup_name)[0] tree.set_outgroup(outgroup) dad = outgroup.get_sisters()[0] for node in dad.traverse(): if node.is_root(): continue cluster = ','.join (sorted (node.get_leaf_names())) if dic.has_key(cluster): dic[cluster] += weight else: dic[cluster] = weight sorted_nodes = map(lambda x: [x[2], x[1]], sorted (\ map (lambda x: (len (x.split(',')), x, dic[x]), \ dic.keys()), reverse = True)) if lim < sorted (sorted_nodes, reverse=True)[:tlen*2 - 3][-1][0]: lim = sorted (sorted_nodes, reverse=True)[:tlen*2 - 3][-1][0] sorted_nodes = filter (lambda x: x[0] >= lim, sorted_nodes) sorted_nodes = map (lambda x: x[1], sorted_nodes) if len (sorted_nodes) > tlen*2 - 3: print >> stderr, \ 'WARNING: two nodes with same support, will remove: ' + \ sorted_nodes[-1] sorted_nodes = sorted_nodes[:-1] cons_tree = Tree() cons_tree.add_child(name=outgroup_name) node = cons_tree.add_child(name='NoName') node.add_feature('childrens', \ set (sorted_nodes.pop(0).split(',')) \ - set([outgroup_name])) while len (sorted_nodes) > 0: for name in sorted_nodes: if not name in sorted_nodes: continue for node in cons_tree.traverse(strategy='postorder'): if node.is_root(): continue if node.name is not 'NoName': continue if len (node.childrens & set(name.split(','))) == 0: continue # check if ther is better solution in one of the child for rest in sorted_nodes: if len (set(rest.split(','))) < \ len (set(name.split(','))): continue if len (set(rest.split(',')) & set(name.split(','))) > 0: name = rest weight = dic[name] children = set(name.split(',')) if len (children) == 1: 
node.add_child(name=name) else: n = node.add_child(name='NoName') n.add_feature('childrens', children) n.support = weight break sorted_nodes.pop(sorted_nodes.index(name)) sister = node.childrens - children name = ','.join (sorted ( list (sister))) if not name in sorted_nodes: continue weight = dic[name] if len (sister) == 1: node.add_child(name=name) else: n = node.add_child(name='NoName') n.add_feature('childrens', sister) n.support = weight sorted_nodes.pop(sorted_nodes.index(name)) break print cons_tree return cons_tree.write(format=9)
def findConsensusTree(trees, weights=[], lim=0):
    """Return the weighted majority-rule consensus tree of ``trees``.

    Every input tree is rooted on the same outgroup (the second leaf name of
    the first tree).  Each clade found below the outgroup's sister is
    identified by the sorted, comma-joined names of its leaves, and the
    weights of the trees containing it are summed up.  The best supported
    clades are then nested into a new tree.

    Args:
        trees: tree objects, all with the same number of leaves.  They are
            re-rooted in place through ``set_outgroup``.
        weights: one weight per tree; every tree counts 1 when empty.
        lim: minimum support for a clade to be kept.  It is raised to the
            support of the (2 * n_leaves - 3)-th best clade when lower.

    Returns:
        The consensus tree.  Internal nodes are named ``'NoName'`` and carry
        a ``childrens`` feature (set of leaf names below them).

    Calls ``exit()`` when the trees do not all have the same number of leaves.
    """
    if weights == []:
        weights = [1] * len(trees)
    dic = {}
    outgroup_name = trees[0].get_leaf_names()[1]
    tlen = 0
    # 1. collect the summed weight of every clade
    for (tree, weight) in zip(trees, weights):
        if tlen == 0:
            tlen = len(tree)
        elif len(tree) != tlen:
            exit('ERROR: trees with different length')
        outgroup = tree.search_nodes(name=outgroup_name)[0]
        tree.set_outgroup(outgroup)
        dad = outgroup.get_sisters()[0]
        for node in dad.traverse():
            if node.is_root():
                continue
            cluster = ','.join(sorted(node.get_leaf_names()))
            dic[cluster] = dic.get(cluster, 0) + weight

    # 2. [support, clade] pairs, biggest clades first
    sorted_nodes = map(lambda x: [x[2], x[1]], sorted (\
        map (lambda x: (len (x.split(',')), x, dic[x]), \
             dic.keys()), reverse = True))
    # support of the last clade that still fits in a fully resolved tree
    cutoff = sorted(sorted_nodes, reverse=True)[:tlen * 2 - 3][-1][0]
    if lim < cutoff:
        lim = cutoff
    sorted_nodes = filter(lambda x: x[0] >= lim, sorted_nodes)
    sorted_nodes = map(lambda x: x[1], sorted_nodes)
    if len(sorted_nodes) > tlen * 2 - 3:
        print >> stderr, \
              'WARNING: two nodes with same support, will remove: ' + \
              sorted_nodes[-1]
        sorted_nodes = sorted_nodes[:-1]

    # 3. build the consensus: 'childrens' holds the leaf names still to be
    #    placed below an internal node
    cons_tree = Tree()
    cons_tree.add_child(name=outgroup_name)
    node = cons_tree.add_child(name='NoName')
    node.add_feature('childrens', \
                     set (sorted_nodes.pop(0).split(',')) - set([outgroup_name]))
    while len(sorted_nodes) > 0:
        for name in sorted_nodes:
            # the list is modified while being iterated: skip stale entries
            if not name in sorted_nodes:
                continue
            for node in cons_tree.traverse(strategy='postorder'):
                if node.is_root():
                    continue
                # compare by value; the original relied on string identity
                # ("is not"), which only works by accident of interning
                if node.name != 'NoName':
                    continue
                if len(node.childrens & set(name.split(','))) == 0:
                    continue
                # check if there is a better solution in one of the children
                # NOTE(review): nesting of this loop and of the two "break"
                # statements was reconstructed from whitespace-collapsed
                # code -- confirm against the original layout.
                for rest in sorted_nodes:
                    if len (set(rest.split(','))) < \
                       len (set(name.split(','))):
                        continue
                    if len(set(rest.split(',')) & set(name.split(','))) > 0:
                        name = rest
                        weight = dic[name]
                        children = set(name.split(','))
                        if len(children) == 1:
                            node.add_child(name=name)
                        else:
                            n = node.add_child(name='NoName')
                            n.add_feature('childrens', children)
                            n.support = weight
                        break
                sorted_nodes.pop(sorted_nodes.index(name))
                # whatever is left below this node forms the sister clade
                sister = node.childrens - children
                name = ','.join(sorted(list(sister)))
                if not name in sorted_nodes:
                    continue
                weight = dic[name]
                if len(sister) == 1:
                    node.add_child(name=name)
                else:
                    n = node.add_child(name='NoName')
                    n.add_feature('childrens', sister)
                    n.support = weight
                sorted_nodes.pop(sorted_nodes.index(name))
                break
    return cons_tree
def main ():
    """Select representative OTUs per clade and print their names.

    Reads three files named on the command line: args[0] is the taxonomy
    (parsed by get_gg_taxonomy), args[1] the per-OTU depth table (two
    whitespace-separated columns: id, depth) and args[2] the fasta file.
    A taxonomy tree is built and pruned in three rounds; the names of the
    leaves that remain are printed to stdout, and the tree is rendered to
    options.png when that option is set.

    Options read: verbose, clade, Npercent, Tsim, png.
    """
    global options, args
    # TODO: read and parse taxonomy file
    if options.verbose: print >> sys.stderr, "[",time.asctime(),"]",
    if options.verbose: print >> sys.stderr, "parse taxonomy file"
    # tax[otu][level] is read below for 'kingdom' ... 'species'
    tax = get_gg_taxonomy(args[0])
    # TODO: construct the taxonomy tree
    if options.verbose: print >> sys.stderr, "[",time.asctime(),"]",
    if options.verbose: print >> sys.stderr, "build taxonomy tree"
    tree = Tree()
    # tree_node maps a lineage key (level names joined by "_", or an OTU id
    # for leaves) to the node created for it
    tree_node = {}
    # first pass: one child of the root per kingdom
    for x in tax:
        n = tree.search_nodes(name=tax[x]['kingdom'])
        if len(n)==0:
            n = tree.add_child(name=tax[x]['kingdom'])
            tree_node[tax[x]['kingdom']] = n
    # second pass: hang every OTU below the deepest level it is assigned to
    for x in tax:
        y_name = tax[x]['kingdom']
        for level in ['phylum','class','order','family','genus','species']:
            # a level name of 3 characters or less is treated as unassigned
            # (presumably a bare prefix such as "p__" -- confirm)
            if len(tax[x][level])>3:
                x_name = y_name+"_"+tax[x][level]
                if x_name not in tree_node:
                    n = tree_node[y_name].add_child(name=tax[x][level])
                    tree_node[x_name] = n
                y_name = x_name
            else:
                # unassigned from this level on: the OTU becomes a leaf here
                n = tree_node[y_name].add_child(name=x)
                tree_node[x] = n
                break
            if level=='species':
                # fully assigned lineage: the OTU is a leaf below its species
                n = tree_node[y_name].add_child(name=x)
                tree_node[x] = n
    # TODO: read and parse depth file
    if options.verbose: print >> sys.stderr, "[",time.asctime(),"]",
    if options.verbose: print >> sys.stderr, "parse depth file"
    # every leaf defaults to depth 0; the file overrides the listed ids
    depth = {}
    for node in tree.get_leaves():
        depth[node.name] = 0.0
    with open(args[1]) as f:
        for line in f:
            item = line.split()
            depth[item[0]] = float(item[1])
    # TODO: read and parse fasta file
    from skbio.parse.sequences import parse_fasta
    if options.verbose: print >> sys.stderr, "[",time.asctime(),"]",
    if options.verbose: print >> sys.stderr, "parse fasta file"
    fasta = {}
    for id,seq in parse_fasta(args[2]):
        fasta[id] = seq
    # TODO: prune the tree to remove empty leaves
    # keep the leaves with a positive depth that are assigned at the
    # options.clade level, together with all their ancestors
    retain_nodes = Set()
    for node in tree.iter_leaves():
        if depth[node.name]>0 and len(tax[node.name][options.clade])>3:
            retain_nodes.add(node)
            retain_nodes.update(Set(node.get_ancestors()))
    tree.prune(retain_nodes)
    # TODO: collect clade statistics
    # a node belongs to the selected level when its name starts with the
    # first letter of options.clade; its statistic is the maximum leaf depth
    clade_depth = {}
    for node in tree.traverse():
        if node.name.startswith(options.clade[0]):
            d = 0
            for n in node.get_leaves():
                if d < depth[n.name]:
                    d = depth[n.name]
            clade_depth[node.name] = d
    # TODO: get the threshold
    # T is the depth at which the cumulative share of total depth (depths
    # sorted in decreasing order) reaches options.Npercent
    X = np.array(sorted(depth.values(),reverse=True))
    rX = X.cumsum()*100/X.sum()
    T = np.interp(options.Npercent,rX,X)
    # TODO: filter out clades below the threshold
    clade_retain = Set()
    for c,d in clade_depth.items():
        if d>=T:
            clade_retain.add(c)
    # TODO: prune tree again
    retain_nodes = Set()
    for node in tree.get_leaves():
        if tax[node.name][options.clade] in clade_retain:
            retain_nodes.add(node)
            retain_nodes.update(Set(node.get_ancestors()))
    tree.prune(retain_nodes)
    # TODO: pick out representative OTUs for a clade
    # within each retained clade, leaves are visited by decreasing depth; a
    # leaf at or above the threshold is kept when its best similarity to the
    # leaves already kept is below options.Tsim
    retain_nodes = Set()
    for taxon in clade_retain:
        t = tree & taxon
        L = {n:depth[n.name] for n in t.get_leaves()}
        l = sorted(L.items(),key=operator.itemgetter(1),reverse=True)
        taxon_otus = Set()
        for n, val in l:
            if val<T: continue
            if not taxon_otus:
                taxon_otus.add(n)
            else:
                max_sim = 0
                for y in taxon_otus:
                    sim = ssw_similarity(fasta[n.name],fasta[y.name])
                    if max_sim < sim:
                        max_sim = sim
                if max_sim < options.Tsim:
                    taxon_otus.add(n)
        retain_nodes.update(taxon_otus)
        for n in taxon_otus:
            retain_nodes.update(Set(n.get_ancestors()))
    # TODO: pick out a representative OTUs for a clade
    # (disabled alternative: keep only the deepest OTU of every clade)
    #retain_nodes = Set()
    #for taxon in clade_retain:
    #    t = tree & taxon
    #    L = {n:depth[n.name] for n in t.get_leaves()}
    #    l = sorted(L.items(),key=operator.itemgetter(1),reverse=True)
    #    retain_nodes.add(l[0][0])
    #    retain_nodes.update(Set(l[0][0].get_ancestors()))
    #
    # NOTE(review): this prune and the add_feature loop further down are taken
    # to be live code that follows a lone "#" separator line -- confirm.
    tree.prune(retain_nodes)
    # TODO: output representative OTUs
    for node in tree.get_leaves():
        print node.name
    #
    # attach the depth to every remaining leaf before the optional rendering
    for node in tree.get_leaves():
        node.add_feature("depth",depth[node.name])
    if options.png:
        ts = TreeStyle()
        ts.layout_fn = tree_layout
        tree.render(options.png,dpi=1024,tree_style=ts)
print D ''' import matplotlib.pyplot as plt import numpy as np from ete2 import Tree data = open('output.txt').read().replace(',', ' ').replace('\n', ' ') x = data.split() ParentChild = np.array(x).astype('str') y = len(ParentChild) / 2 ParentChild1 = np.reshape(ParentChild, (y, 2)) #print ParentChild1 t = Tree() # Creates an empty tree A = t.add_child(name="A") B = A.add_child(name="B") C = A.add_child(name="C") D = C.add_child(name="D") E = A.add_child(name="E") F = A.add_child(name="F") G = A.add_child(name="G") H = F.add_child(name="H") #6,8 I = A.add_child(name="I") J = A.add_child(name="J") #10 K = D.add_child(name="K") L = D.add_child(name="L") M = A.add_child(name="L") #13 N = D.add_child(name="N") #4,11 O = D.add_child(name="O") #4,12
def tree(self):
    """Build a tree from the distance matrix by neighbour joining.

    All taxa start as children of a temporary star-shaped root.  While more
    than two columns remain in the matrix, the nearest neighbours i and j are
    joined under a new internal node ``X<n>``, their rows/columns are removed
    from the matrix and the distances of the new node to every remaining
    taxon are appended.  The two nodes left at the end are connected to each
    other and the resulting tree is returned.

    Side effects: ``self._distMatrix`` is consumed (data removed/appended and
    entries removed from the list returned by ``columnNames``).

    Raises:
        AssertionError: if the distance between two joined neighbours, or a
            distance computed for the new node, is not positive.
    """
    L = self._distMatrix.columnNames
    tree = Tree()
    tree.name = "root"
    tree.dist = 0
    for seq in L:
        tree.add_child(name=seq, dist=0)
    iter_count = 1
    while len(L) > 2:
        nearest_nbs = self._distMatrix.getNearestNeigbors()
        node_i = tree.search_nodes(name=nearest_nbs[0])[0]
        node_j = tree.search_nodes(name=nearest_nbs[1])[0]
        L.remove(nearest_nbs[0])
        L.remove(nearest_nbs[1])
        # new internal node joining i and j
        node_k = Tree()
        node_k.dist = 0
        node_k.name = "X" + str(iter_count)
        d_ij = self._distMatrix.getDistance(node_i.name, node_j.name)
        assert d_ij > 0
        # branch lengths from i and j to the new node k
        d_ik = 0.5 * d_ij + 0.5 * (self._distMatrix.getSeparation(node_i.name) - self._distMatrix.getSeparation(node_j.name))
        d_jk = 0.5 * d_ij + 0.5 * (self._distMatrix.getSeparation(node_j.name) - self._distMatrix.getSeparation(node_i.name))
        tree.remove_child(node_i)
        tree.remove_child(node_j)
        node_k.add_child(node_i, dist=d_ik)
        node_k.add_child(node_j, dist=d_jk)
        tree.add_child(node_k)
        # distances from k to every taxon m that is still in the matrix
        d_km = []
        for node_m in L:
            d_km.append(0.5 * (self._distMatrix.getDistance(node_i.name, node_m) + self._distMatrix.getDistance(node_j.name, node_m) - d_ij) )
        # The original asserted ``d_km > 0`` on the list itself, which never
        # checks the values (always true on Python 2, TypeError on Python 3).
        assert all(d > 0 for d in d_km)
        self._distMatrix.removeData((node_i.name, node_j.name))
        self._distMatrix.appendData(d_km, node_k.name)
        iter_count+=1
        L = self._distMatrix.columnNames
    # two nodes are left below the temporary root: connect them directly
    last_nodes = tree.get_children()
    d_ij = self._distMatrix.getDistance(last_nodes[0].name, last_nodes[1].name)
    leaf = None
    new_root = None
    for node in last_nodes:
        if node.is_leaf():
            node.dist = d_ij
            leaf = node.detach()
        else:
            new_root = node.detach()
    if not leaf:
        # no leaf was found above: the first remaining node is attached
        # below the new root instead
        leaf = last_nodes[0]
        leaf.dist = d_ij
    new_root.add_child(leaf)
    return new_root

##
# @var _distMatrix
# the distance matrix in more or less arbitrary form
# @var _names
# taxa identification strings
# @var _alignment
# multiple sequence alignment
""" import matplotlib.pyplot as plt import numpy as np from ete2 import Tree data = open("output.txt").read().replace(",", " ").replace("\n", " ") x = data.split() ParentChild = np.array(x).astype("str") y = len(ParentChild) / 2 ParentChild1 = np.reshape(ParentChild, (y, 2)) # print ParentChild1 t = Tree() # Creates an empty tree A = t.add_child(name="A") B = A.add_child(name="B") C = A.add_child(name="C") D = C.add_child(name="D") E = A.add_child(name="E") F = A.add_child(name="F") G = A.add_child(name="G") H = F.add_child(name="H") # 6,8 I = A.add_child(name="I") J = A.add_child(name="J") # 10 K = D.add_child(name="K") L = D.add_child(name="L") M = A.add_child(name="L") # 13 N = D.add_child(name="N") # 4,11 O = D.add_child(name="O") # 4,12
if( randint(0,5) ): a = node.add_child(name = randint(1, 10000)) addChild(a,n,level+1) if( randint(0,5) ): a = node.add_child(name = randint(1, 10000)) level = level + 1 addChild(a,n,level) root = randint(1, 10000) t.name = root n = 10 a = t.add_child(name = randint(1, 10000)) addChild(a,n,2) a = t.add_child(name = randint(1, 10000)) addChild(a,n,2) print t.get_ascii(show_internal=True) #print t.name, #zigzag(deque(t.get_children()),0) #print t.get_descendants() p= t.get_children() if( len(p) == 2): print "two" #print p[0] print len( p[0].get_descendants() ) #print p[1] print len( p[1].get_descendants() )
class SiteSpider:
    """Crawl a site with a Selenium driver and record the links in an ETE tree.

    Every page that is requested becomes a node below ``self.root``; each link
    found on the page becomes a child carrying two features: ``path`` (the
    label to display) and ``advance`` (whether the crawler follows the link).
    """

    def __init__(self, driver, target_url, depth=-1, delay=5, mitm=False):
        """Store the crawl settings and create the root node for *target_url*.

        driver     -- Selenium WebDriver used for every request
        target_url -- start page; its network location defines "same domain"
        depth      -- number of leading path components compared by
                      ``_url_same``; a negative value compares whole URLs
        delay      -- seconds to sleep after each page load
        mitm       -- when true, signal the local proxy before each request
        """
        self.driver = driver
        self.target_url = target_url
        self.t = Tree()
        self.root = self.t.add_child(name=target_url)
        self.root.add_features(path=target_url, advance=True)
        self.depth = depth
        self.delay = delay
        self.subscribers = []
        self.url_cache = UrlCache(self.depth)
        self.mitm = mitm

    def auth(self, handler):
        """Let *handler* perform authentication with the driver and start URL."""
        handler(self.driver, self.target_url)

    def _is_same_domain(self, href):
        """Return True when *href* has the same network location as the target."""
        return urlparse(href).netloc == urlparse(self.target_url).netloc

    def _url_same(self, url1, url2):
        """Compare two URLs, honouring the configured depth.

        With a negative depth the URLs must be equal.  Otherwise only the
        first ``self.depth`` path components (as produced by splitting the
        path on "/") are compared, limited to the shorter path.
        """
        if self.depth < 0:
            return url1 == url2
        path1 = urlparse(url1).path.split("/")[:self.depth]
        path2 = urlparse(url2).path.split("/")[:self.depth]
        # zip stops at the shorter path, like the previous index loop did.
        return all(part1 == part2 for part1, part2 in zip(path1, path2))

    def _has_visited(self, url):
        """Return whether the URL cache reports *url* as visited."""
        return self.url_cache.has_visited(url)

    def _has_sister(self, node, url):
        """Return True if a child of *node* already matches *url*."""
        return any(self._url_same(sister.name, url) for sister in node.children)

    def _get_url_path(self, url):
        ''' Label in tree to use '''
        if not self._is_same_domain(url):
            return url
        return urlparse(url).path

    def _get_link_url(self, a):
        """Return the href of anchor element *a*, or None if it is unusable."""
        child_url = a.get_attribute("href")
        if not child_url:
            return None
        # A link ending in "#" is handled by JavaScript on the page; such
        # links are ignored.
        if child_url.endswith("#"):
            logger.debug("Ignoring dynamic link.")
            return None
        return child_url

    def crawl(self):
        """Start crawling at the root node."""
        self._crawl(self.root)

    def _should_advance(self, child, child_url):
        """Follow a link only if it stays on the domain and is not yet visited."""
        return self._is_same_domain(child_url) and not self._has_visited(child_url)

    def _close_windows(self):
        # NOTE(review): only reads the window handles; nothing is closed.
        wins = self.driver.window_handles

    def _call_subscribers(self):
        """Notify every subscriber that a page has been visited.

        A failing subscriber must not stop the crawl, but the failure is
        logged instead of being silently discarded.
        """
        for s in self.subscribers:
            try:
                s.on_page_visited()
            except Exception:
                logger.exception("Subscriber %r failed in on_page_visited", s)

    def _crawl(self, node):
        """Request the page behind *node*, add its links and recurse."""
        # Make request for the page
        self.url_cache.cache(node.name)

        # Hack to tell the proxy we are requesting a new page
        if self.mitm:
            b64 = base64.b64encode(node.name)
            proxy_signal_url = 'http://127.0.0.1:8080/?page=' + b64
            self.driver.get(proxy_signal_url)

        self.driver.get(node.name)

        # There is an issue if the link is in the same domain but then
        # it does a redirect to a url outside the domain. We wont know until
        # we visit it. If this happens abort and remove from tree
        if not self._is_same_domain(self.driver.current_url):
            node.detach()
            logger.warning("Aborting, not same domain")
            return

        self._call_subscribers()
        time.sleep(self.delay)
        logger.info(self.driver.current_url)

        # Access by index and look the anchors up again on every iteration,
        # because element references may not survive between look-ups.
        anchors = self.driver.find_elements_by_tag_name("a")
        for i in range(len(anchors)):
            new_anchors = self.driver.find_elements_by_tag_name("a")
            assert len(new_anchors) == len(anchors)
            child_url = self._get_link_url(new_anchors[i])

            # Only add if its not already there
            if not child_url or self._has_visited(child_url):
                continue

            child = node.add_child(name=child_url)
            child.add_feature("path", self._get_url_path(child_url))
            # Determine if the link should be advanced forward
            # We never want to start crawling other pages
            child.add_feature("advance",
                              self._should_advance(child, child_url))

        logger.debug(self.url_cache)

        # Process all the found links.  Iterate over a copy: a recursive call
        # may detach its node from node.children (redirect to another
        # domain), and removing from the list being iterated would skip the
        # next sibling.
        for child in list(node.children):
            # A page may have been crawled through another link since it was
            # added here; do not request it a second time.
            if child.advance and not self._has_visited(child.name):
                self._crawl(child)

    def get_link_graph(self):
        """Return the tree holding every page and link seen so far."""
        return self.t

    def add_subscriber(self, subscriber):
        """Register an object whose ``on_page_visited()`` is called per page."""
        self.subscribers.append(subscriber)
def main(argv):
    """Plot the taxonomy tree described by a profile file.

    Options (parsed from *argv* with getopt):
        -i / --InputFile   profile to read; the 4th whitespace-separated
                           field of each data line is a "|"-separated
                           taxonomy path and the last field is its weight
        -t / --Title       title drawn above the tree
        -l                 show leaf names
        -n                 label internal nodes
        -o / --OutFile     image file passed to ``render``
        -w / --Width       image width in mm (default 750)
        -x / --OutFileXML  PhyloXML output file (skipped when not given)
        -h / --Help        print the usage line and exit

    Exits with status 2 on unknown options or when help is requested.
    """
    input_file = ''
    title = 'Title'
    label_internal_nodes = False
    label_leaves = False
    out_file = ''
    width = 750
    out_file_xml = ''
    try:
        # "-h"/"--Help" take no argument, so asking for help no longer ends
        # up in the "Unknown option" branch.
        opts, args = getopt.getopt(argv, "hi:t:lno:w:x:", ["Help", "InputFile=", "Title=", "LabelLeaves=", "LabelInternalNodes=", "OutFile=", "Width=", "OutFileXML="])
    except getopt.GetoptError:
        print('Unknown option, call using: ./PlotTree.py -i <InputCAMIFile> -t <Title> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--Help"):
            print('./PlotTree.py -i <InputCAMIFile> -t <Title> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -o <OutFile> -x <OutFile.xml> -w <Width>')
            sys.exit(2)
        elif opt in ("-i", "--InputFile"):
            input_file = arg
        elif opt in ("-t", "--Title"):
            title = arg
        elif opt in ("-l", "--LabelLeaves"):
            label_leaves = True
        elif opt in ("-n", "--LabelInternalNodes"):
            label_internal_nodes = True
        elif opt in ("-o", "--OutFile"):
            out_file = arg
        elif opt in ("-w", "--Width"):
            width = int(arg)
        elif opt in ("-x", "--OutFileXML"):
            out_file_xml = arg

    # Read the common kmer profile
    ckm_tax_paths = []
    ckm_name_to_perc = dict()
    with open(input_file, 'r') as fid:
        for line in fid:
            # Don't parse comments or blank lines (whitespace-only lines
            # used to crash on the field access below).
            if not line.strip() or line[0] in '#@':
                continue
            # Put placeholders in for missing names: "||" -> "|NONAME|".
            # Replace one at a time so that runs such as "|||" are handled.
            while "||" in line:
                line = line.replace("||", "|NONAME|", 1)
            fields = line.split()
            tax_path = fields[3]  # Get the names
            ckm_tax_paths.append(tax_path)
            ckm_name_to_perc[tax_path.split("|")[-1]] = fields[-1]  # Get the weights

    # Create the tree
    t = Tree()
    names_to_nodes = dict()
    for tax_path in ckm_tax_paths:
        split_tax_path = tax_path.split("|")
        name = split_tax_path[-1]
        if len(split_tax_path) == 1:
            # If len==1, then it's a superkingdom: connect directly to tree
            parent_node = t
        else:
            # If the parent is already in the tree, add below it
            parent_node = names_to_nodes.get(split_tax_path[-2])
            if parent_node is None:
                # Otherwise iterate up, skipping the NONAME placeholders,
                # until we have something that is in the tree.  When no
                # ancestor is known the node is attached to the root (this
                # used to raise KeyError/IndexError).
                parent_node = t
                for ancestor in reversed(split_tax_path[:-2]):
                    if ancestor != "NONAME" and ancestor in names_to_nodes:
                        parent_node = names_to_nodes[ancestor]
                        break
        names_to_nodes[name] = parent_node.add_child(name=name)

    def layout(node):
        # Draw a circle whose radius grows with the square root of the
        # node's weight (0 when the node has no weight).
        if node.name in ckm_name_to_perc:
            ckm_perc = float(ckm_name_to_perc[node.name])
        else:
            ckm_perc = 0
        F = CircleFace(radius=3.14*math.sqrt(ckm_perc), color="RoyalBlue", style="sphere")
        F.border.width = None
        F.opacity = 0.6
        faces.add_face_to_node(F, node, 0, position="branch-right")
        if label_internal_nodes:
            faces.add_face_to_node(TextFace(node.name, fsize=7), node, 0, position="branch-top")

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.mode = "r"
    ts.show_leaf_name = label_leaves
    ts.min_leaf_separation = 50
    ts.title.add_face(TextFace(title, fsize=20), column=0)

    # Export the tree to a png image
    t.render(out_file, w=width, units="mm", tree_style=ts)

    # Export the xml file (only when one was requested; opening the empty
    # default path used to fail).
    if out_file_xml:
        project = Phyloxml()
        phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
        phylo.phyloxml_phylogeny.set_name(title)
        project.add_phylogeny(phylo)
        with open(out_file_xml, 'w') as xml_handle:
            project.export(xml_handle)
# execute(nodes, path1, node, c, maximo, maxinf, exrel, umbral, padre)
#
# Recursively builds a decision tree (ETE ``Tree`` objects created from
# Newick strings) over the graph nodes in ``nodes`` and stores the result in
# ``self.arbol``.
#
# What the visible code does:
#   * ``path1`` is split at its last ":"; unless the part before it ends in
#     "n", a "d:" variant of the path is used together with the divisor
#     string ``cyprop``.
#   * Leaves are returned as "(<label>*<number of nodes>);":
#       - no nodes at all                      -> label ``padre``
#       - no node has target == self.vtarget   -> "not <vtarget>"
#       - every node has target == vtarget     -> "<vtarget>"
#       - c <= 0, maxinf == 0, maxinf <= umbral, empty TC or fewer than two
#         nodes                                -> majority label
#   * Otherwise up to 10 attempts are made to fetch candidate relations with
#     a Cypher query built from ``self.TC`` and a random sample of ``nodes``;
#     without candidates, or with fewer than 15 nodes, the majority label is
#     returned.
#   * Candidates touching ``node`` (and not listed in ``exrel``) are scored
#     with ``self.entropy``; the chosen relation is clustered with MeanShift
#     on the per-node counts and ``execute`` is called once per cluster with
#     ``c - 1``.
#
# NOTE(review): this snippet is kept byte-for-byte because its statement
# nesting cannot be recovered reliably from the collapsed text.  Points to
# confirm against the original file:
#   * ``todo`` and ``rr`` are appended to but never initialised in the
#     visible code.
#   * an unconditional majority-label ``return`` appears inside the candidate
#     loop right after ``information`` is computed.
#   * ``len(nodes)/2`` is passed to ``random.randint`` and relies on Python 2
#     integer division.
#   * the Cypher statements are assembled by string concatenation.
def execute(self,nodes,path1,node,c,maximo,maxinf,exrel,umbral,padre): a,b = path1.rsplit(':', 1) if (a[-1:] != "n"): path = a+"d:"+b cyprop = "/(count(distinct(d))+1)" else: path = path1 cyprop = "" TC = self.TC graph_db = self.graph_db if len(nodes) == 0: self.arbol = Tree("("+str(padre)+"*"+str(len(nodes))+");") return Tree("("+str(padre)+"*"+str(len(nodes))+");") if not any(n[self.target] == self.vtarget for n in nodes): self.arbol = Tree("(not "+str(self.vtarget)+"*"+str(len(nodes))+");") return Tree("(not "+str(self.vtarget)+"*"+str(len(nodes))+");") if not any(n[self.target] != self.vtarget for n in nodes): self.arbol = Tree("("+str(self.vtarget)+"*"+str(len(nodes))+");") return Tree("("+str(self.vtarget)+"*"+str(len(nodes))+");") if (c <= 0 or maxinf == 0 or maxinf <= umbral or TC == [] or len(nodes) < 2): temp = [] for n in nodes: if n[self.target] == self.vtarget: temp.append(self.vtarget) else: temp.append("not "+self.vtarget) self.arbol = Tree("("+str(max(set(temp), key=temp.count))+"*"+str(len(nodes))+");") return Tree("("+str(max(set(temp), key=temp.count))+"*"+str(len(nodes))+");") else: posibles = "" cont = 0 while (len(posibles) == 0 and cont < 10): cont += 1 posibles = "MATCH (a)-[r]->(b) WHERE labels(a) <> [] AND labels(b) <> [] AND ( " for t in TC: posibles = posibles + "type(r) = '"+t.To+"' OR " posibles = str(posibles[:-3]) + ") AND (" for z in random.sample(nodes, random.randint(1,(len(nodes)/2))): posibles += "id(a) = " + str(z.id) + " OR " posibles = str(posibles[:-3]) + " ) RETURN DISTINCT head(labels(a)) AS This, type(r) as To, head(labels(b)) AS That limit "+str(len(TC))+" UNION ALL MATCH (a)<-[r]-(b) WHERE labels(a) <> [] AND labels(b) <> [] AND (" for t in TC: posibles = posibles + "type(r) = '"+t.To+"' OR " posibles = str(posibles[:-3]) + ") AND (" for z in random.sample(nodes, (random.randint(1,len(nodes)/2))): posibles += "id(a) = "+str(z.id)+" OR " posibles = str(posibles[:-3]) + " ) RETURN DISTINCT head(labels(b)) AS This, type(r) 
as To, head(labels(a)) AS That limit "+str(len(TC)) posibles = neo4j.CypherQuery(self.graph_db, posibles).execute() if cont >= 10 or len(posibles)==0 or len(nodes)<15: temp = [] for n in nodes: if n[self.target] == self.vtarget: temp.append(self.vtarget) else: temp.append("not "+self.vtarget) self.arbol = Tree("("+str(max(set(temp), key=temp.count))+"*"+str(len(nodes))+");") return Tree("("+str(max(set(temp), key=temp.count))+"*"+str(len(nodes))+");") maxinf = -1000 tc_c = posibles[0] for tc in posibles:#random.sample(posibles, random.randint(1,(len(posibles)))): cluster_centers = [] if((tc.This == node or tc.That == node) and tc.To not in exrel): if(tc.That == node): consulta = path + "<-[:"+tc.To+"]-(e:"+tc.This+")" else: consulta = path + "-[:"+tc.To+"]->(e:"+tc.That+")" if self.relValida(graph_db,consulta,nodes,cyprop) : cluster_centers, group = self.centers_y_clusters(graph_db,nodes,consulta,cyprop) newentropy = 0 if (len(cluster_centers))> 0: for idx,v in enumerate(cluster_centers): newentropy += (len(group[idx])/(len(nodes)))*self.entropy(group[idx]) information = self.entropy(nodes) - newentropy temp = [] for n in nodes: if n[self.target] == self.vtarget: temp.append(self.vtarget) else: temp.append("not "+self.vtarget) self.arbol = Tree("("+str(max(set(temp), key=temp.count))+"*"+str(len(nodes))+");") return Tree("("+str(max(set(temp), key=temp.count))+"*"+str(len(nodes))+");") if (information >= maxinf): maxinf = information tc_c = tc if maxinf > maximo: maximo = maxinf if (tc_c.That == node): consultacon = path + "<-[:"+tc_c.To+"]-(e:"+tc_c.This+")" consultasin = path1 + "<-[:"+tc_c.To+"]-(:"+tc_c.This+")" label = "<-[:"+tc_c.To+"]-(:"+tc_c.This+") " nextnode = tc_c.This else: consultacon = path + "-[:"+tc_c.To+"]->(e:"+tc_c.That+")" consultasin = path1 + "-[:"+tc_c.To+"]->(:"+tc_c.That+")" nextnode = tc_c.That label = "-[:"+tc_c.To+"]->(:"+tc_c.That+")" group = [] neg = [] suma = 0 for n in nodes: tiene = neo4j.CypherQuery(graph_db, consultacon+" where 
id(n) ="+str(n.id)+" return count(distinct(e))"+cyprop+" as cuenta").execute() for r in tiene: todo.append([r.cuenta]) rr.append(r.cuenta) ms = MeanShift(bin_seeding=True) ms.fit(np.asarray(todo)) labels = ms.labels_ cluster_centers = sorted(ms.cluster_centers_,key=lambda x: x[0]) for idx,cl in enumerate(cluster_centers): cluster_centers[idx] = round(float(cl[0]),3) for u in cluster_centers: group.append([]) for n in nodes: tiene = neo4j.CypherQuery(graph_db, consultacon+" where id(n) ="+str(n.id)+" return count(distinct(e))"+cyprop+" as cuenta").execute().data for r in tiene: valor = r.cuenta for idx,v in enumerate(cluster_centers): if idx == 0: temp1 = -9999 else: temp1 = (cluster_centers[idx-1] + cluster_centers[idx])/2 if idx == len(cluster_centers) - 1: temp2 = 99999 else: temp2 = (cluster_centers[idx+1] + cluster_centers[idx])/2 if temp1 <= valor < temp2: group[idx].append(n) temp = [] for n in nodes: if n[self.target] == self.vtarget: temp.append(self.vtarget) else: temp.append("not "+self.vtarget) padre1 = str(max(set(temp), key=temp.count)) t = Tree() t.name=label+" "+str(cluster_centers).replace(". ",".0").replace(" ", "").replace("[","").replace("]","").replace("\n",",") t = t.search_nodes(name=label+" "+str(cluster_centers).replace(". ",".0").replace(" ", "").replace("[","").replace("]","").replace("\n",","))[0] if umbral < 0: umbral = umbral - maxinf else: umbral = 0 for idx,v in enumerate(cluster_centers): t.add_child(self.execute(group[idx],consultasin,str(nextnode),c-1,maximo,maxinf,[],umbral,padre1)) self.arbol = t if maxinf > umbral and maxinf != 0: return t else: temp = [] for n in nodes: if n[self.target] == self.vtarget: temp.append(self.vtarget) else: temp.append("not "+self.vtarget) self.arbol = t return Tree("("+str(max(set(temp), key=temp.count))+"*"+str(len(nodes))+");")
def tree_from_taxonomy(top_level, tree_taxonomy):
    """Return a Newick string (ETE format 9) built from a taxonomy mapping.

    top_level     -- name of the highest taxonomic level to include; must be
                     present in ``taxonomy_levels`` and ``tlevels``
    tree_taxonomy -- mapping ``taxon -> {level name: value}``; every taxon
                     must have a value for *top_level*

    The distinct values found at *top_level* become children of the root.
    Walking down the levels of ``tlevels`` below *top_level*, each distinct
    value is attached to the value of the nearest level above it (at most
    three levels up) for which the taxon has data.  Spaces in the names of
    these lower nodes are replaced by underscores.

    Prints an error and exits with status 1 when no such ancestor exists.
    """
    # Kept for its validation effect: raises ValueError for an unknown level.
    taxonomy_levels.index(top_level)

    tl_types = _uniquify([tree_taxonomy[tt][top_level] for tt in tree_taxonomy])
    levels_to_worry_about = tlevels[0:tlevels.index(top_level) + 1]

    t = Tree()
    # nodes[level][value] -> tree node.  A dict per level replaces the former
    # list of single-key dicts and its linear search.
    nodes = {top_level: {}}
    for tl in tl_types:
        nodes[top_level].setdefault(tl, t.add_child(name=tl))

    # Walk from the level just below top_level down to the first level.
    for ci in range(len(levels_to_worry_about) - 2, -1, -1):
        l = levels_to_worry_about[ci]
        nodes[l] = {}

        names = []
        for tt in tree_taxonomy:
            try:
                names.append(tree_taxonomy[tt][l])
            except KeyError:
                pass
        names = _uniquify(names)

        for n in names:
            # find my parent: use the first taxon carrying this name
            parent = None
            level = None
            for tt in tree_taxonomy:
                try:
                    if tree_taxonomy[tt][l] != n:
                        continue
                except KeyError:
                    continue  # no data at this level for this beastie

                # Look up to three levels above, but never beyond the top
                # level (indexing past the list used to raise an uncaught
                # IndexError).
                candidates = levels_to_worry_about[ci + 1:ci + 4]
                for offset, candidate in enumerate(candidates, 1):
                    try:
                        parent = tree_taxonomy[tt][candidate]
                    except KeyError:
                        continue
                    level = ci + offset
                    break
                if level is None:
                    print("ERROR: tried to find some taxonomic info for " + tt + " from tree_taxonomy file/downloaded data and I went two levels up, but failed find any. Looked at:\n")
                    for candidate in candidates:
                        print("\t" + candidate)
                    print("This is the taxonomy info I have for " + tt)
                    print(tree_taxonomy[tt])
                    sys.exit(1)
                break

            # find out where to attach it
            node_id = nodes[levels_to_worry_about[level]][parent]
            nd = node_id.add_child(name=n.replace(" ", "_"))
            nodes[l].setdefault(n, nd)

    tree = t.write(format=9)
    return tree
from re import split
from nltk.tree import *
# Imported after the nltk wildcard import on purpose: nltk.tree also exports
# a class named Tree, which would otherwise shadow the ETE class used below.
from ete2 import Tree


def tree_generation(entities):
    """Show, for every entity, the chain of its growing word suffixes.

    An entity such as "Enterprise Service Bus" is split on whitespace and
    hyphens and produces the chain
    ``"Bus" -> "Service Bus" -> "Enterprise Service Bus"``, each suffix being
    the only child of the previous one.

    NOTE(review): the previous version called ``show()`` on the node it had
    just descended to and printed the call's result; the whole chain is now
    shown once per entity, starting from its root.
    """
    for entity in entities:
        words = split(r'[\s-]+', entity)
        reversed_words_list = [words[i - 1:] for i in range(len(words), 0, -1)]
        root = Tree()
        node = root
        for word in reversed_words_list:
            node = node.add_child(name=' '.join(word))
        root.show()


# tree_generation(['Enterprise Service Bus'])
t1 = Tree()
x = t1.add_child(name="sdfsdf")
# A sister can only be added to a node that has a parent, so it is attached
# next to x and not to the root t1.
z = x.add_sister(name="456")
y = x.add_child(name="wef")
# t=Tree()
# t.populate(10)
print(t1)
def main():
    """Command-line entry point: write a taxonomic tree for a taxonomy file.

    Positional arguments: ``top_level`` (highest level to include, e.g.
    family), ``input_file`` (taxonomy file read with ``stk.load_taxonomy``)
    and ``output_file`` (receives the Newick string).

    The distinct values found at ``top_level`` become children of the root.
    Walking down the levels of ``tlevels`` below it, each distinct value is
    attached to the value of the nearest level above (at most three levels
    up) for which the taxon has data.  The tree is written in ETE format 9
    after two passes of ``stk._collapse_nodes``.

    Prints an error and exits with status 1 when no such ancestor exists.
    """
    parser = argparse.ArgumentParser(
        prog="create a tree from a taxonomy file",
        description="Create a taxonomic tree",
    )
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="Verbose output: mainly progress reports.",
                        default=False)
    parser.add_argument('top_level', nargs=1,
                        help="The top level group to start with, e.g. family")
    parser.add_argument('input_file', metavar='input_file', nargs=1,
                        help="Your taxonomy file")
    parser.add_argument('output_file', metavar='output_file', nargs=1,
                        help="Your new tree file")
    args = parser.parse_args()
    input_file = args.input_file[0]
    output_file = args.output_file[0]
    top_level = args.top_level[0]

    tree_taxonomy = stk.load_taxonomy(input_file)

    tl_types = _uniquify([tree_taxonomy[tt][top_level] for tt in tree_taxonomy])
    print(tl_types)
    levels_to_worry_about = tlevels[0:tlevels.index(top_level) + 1]

    t = Tree()
    # nodes[level][value] -> tree node.  A dict per level replaces the former
    # list of single-key dicts and its linear search.
    nodes = {top_level: {}}
    for tl in tl_types:
        nodes[top_level].setdefault(tl, t.add_child(name=tl))

    # Walk from the level just below top_level down to the first level.
    for ci in range(len(levels_to_worry_about) - 2, -1, -1):
        l = levels_to_worry_about[ci]
        nodes[l] = {}

        names = []
        for tt in tree_taxonomy:
            try:
                names.append(tree_taxonomy[tt][l])
            except KeyError:
                pass
        names = _uniquify(names)

        for n in names:
            # find my parent: use the first taxon carrying this name
            parent = None
            level = None
            for tt in tree_taxonomy:
                try:
                    if tree_taxonomy[tt][l] != n:
                        continue
                except KeyError:
                    continue  # no data at this level for this beastie

                # Look up to three levels above, but never beyond the top
                # level (indexing past the list used to raise an uncaught
                # IndexError).
                candidates = levels_to_worry_about[ci + 1:ci + 4]
                for offset, candidate in enumerate(candidates, 1):
                    try:
                        parent = tree_taxonomy[tt][candidate]
                    except KeyError:
                        continue
                    level = ci + offset
                    break
                if level is None:
                    print("ERROR: tried to find some taxonomic info for " + tt + " from tree_taxonomy file/downloaded data and I went two levels up, but failed find any. Looked at:\n")
                    for candidate in candidates:
                        print("\t" + candidate)
                    print("This is the taxonomy info I have for " + tt)
                    print(tree_taxonomy[tt])
                    sys.exit(1)
                break

            # find out where to attach it
            node_id = nodes[levels_to_worry_about[level]][parent]
            nd = node_id.add_child(name=n.replace(" ", "_"))
            nodes[l].setdefault(n, nd)

    tree = t.write(format=9)
    tree = stk._collapse_nodes(tree)
    tree = stk._collapse_nodes(tree)

    # The context manager closes the file even if the write fails.
    with open(output_file, "w") as f:
        f.write(tree)
def createLineageTrees(self, fn=None, width=None, height=None, circular=False, withAppearing=True, from_t=0, to_t=0):
    """Build one tree holding all lineages of the tracking events and plot it.

    fn, width, height -- forwarded to ``self.plotTree`` (``fn`` as ``out_fn``)
    circular          -- forwarded to ``self.plotTree`` as ``circularTree``
    withAppearing     -- when False, objects appearing after the first frame
                         do not start a lineage of their own
    from_t, to_t      -- first and last index into the event list

    ``nodeMap`` maps the name of every currently open track to the tree node
    its branch hangs from and ``branchSize`` holds the number of Move events
    seen since that node; a branch is materialised when the track
    disappears, divides, or is still open at the end.

    Raises ``Exception`` for an event type other than Appearance,
    Disappearance, Division or Move.
    """
    from ete2 import Tree, NodeStyle

    tree = Tree()
    style = self.getNodeStyle()
    divisionStyle = self.getNodeStyle()

    # White-on-white style that hides the artificial branches to the root.
    invisibleNodeStyle = NodeStyle()
    invisibleNodeStyle["hz_line_color"] = "white"
    invisibleNodeStyle["vt_line_color"] = "white"
    invisibleNodeStyle["fgcolor"] = "white"

    distanceFromRoot = 0
    nodeMap = {}
    branchSize = {}

    def startLineage(name, dist):
        # Attach a new lineage directly below the root and hide every branch
        # on the way up to the root.
        appNode = tree.add_child(name=name, dist=dist)
        nodeMap[str(name)] = appNode
        branchSize[str(name)] = 0
        n = appNode
        while n is not None:
            n.set_style(invisibleNodeStyle)
            n = n.up

    events = self.mainOperator.innerOperators[0].events

    # add all nodes which appear in the first frame
    for event in events[from_t]:
        if event.type != pgmlink.EventType.Appearance:
            label = event.traxel_ids[0]
            startLineage(self.getNodeName(0, label), distanceFromRoot)

    # add all lineages
    for t, events_at in enumerate(events[from_t:to_t + 1], 1):
        for event in events_at:
            if event.type == pgmlink.EventType.Appearance:
                # With withAppearing=False the event is ignored; it used to
                # fall through to the "not implemented" error below.
                if withAppearing:
                    label = event.traxel_ids[0]
                    startLineage(self.getNodeName(t, label), distanceFromRoot + t)

            elif event.type == pgmlink.EventType.Disappearance:
                label = event.traxel_ids[0]
                name = self.getNodeName(t - 1, str(label))
                key = str(name)
                if key not in nodeMap:
                    continue
                parent = nodeMap.pop(key)
                size = branchSize.pop(key)
                if size == 0:
                    # The track never moved: no branch to draw.
                    continue
                newNode = parent.add_child(name=name, dist=size)
                newNode.set_style(style)

            elif event.type == pgmlink.EventType.Division:
                labelOld = event.traxel_ids[0]
                labelNew1 = event.traxel_ids[1]
                labelNew2 = event.traxel_ids[2]
                oldKey = str(self.getNodeName(t - 1, str(labelOld)))
                if oldKey not in nodeMap:
                    continue
                # NOTE(review): the node name is derived from the already
                # formatted name of the old track, as in the original code.
                newNode = nodeMap[oldKey].add_child(
                    name=self.getNodeName(t - 1, oldKey),
                    dist=branchSize[oldKey])
                del nodeMap[oldKey]
                del branchSize[oldKey]
                newNode.set_style(divisionStyle)
                # Both daughters continue from the division node.
                for labelNew in (labelNew1, labelNew2):
                    newKey = str(self.getNodeName(t, str(labelNew)))
                    nodeMap[newKey] = newNode
                    branchSize[newKey] = 1

            elif event.type == pgmlink.EventType.Move:
                labelOld = event.traxel_ids[0]
                labelNew = event.traxel_ids[1]
                oldKey = str(self.getNodeName(t - 1, str(labelOld)))
                if oldKey not in nodeMap:
                    continue
                # Same track, new name: carry the node over and lengthen the
                # pending branch by one time step.
                newKey = str(self.getNodeName(t, str(labelNew)))
                nodeMap[newKey] = nodeMap[oldKey]
                del nodeMap[oldKey]
                branchSize[newKey] = branchSize[oldKey] + 1
                del branchSize[oldKey]

            else:
                raise Exception("lineage tree generation not implemented for event type " + str(event.type))

    # Close every track that is still open after the last frame.
    for label, node in list(nodeMap.items()):
        node.add_child(name=label, dist=branchSize[label])

    self.plotTree(tree, out_fn=fn, rotation=270, show_leaf_name=False,
                  show_branch_length=False, circularTree=circular,
                  show_division_nodes=False, distance_between_branches=4,
                  width=width, height=height)
# Tree style for the drawing: mode "c" with an arc span of 360, the layout
# function `layout` (defined outside this snippet), leaf names hidden, border
# and scale shown, guiding lines disabled.
ts = TreeStyle()
ts.mode = "c"
ts.arc_span = 360
ts.layout_fn = layout
ts.show_leaf_name = False
ts.show_border = True
ts.draw_guiding_lines = False
ts.show_scale = True
#ts.scale = 60

# Root node with zero branch length and a (0, 0) size attribute.
t = Tree()
t.dist = 0
t.size = 0, 0

# Add 100 branches to the root.  Each branch is a chain of three nodes whose
# last node gets two children (n2, n3), each with one child (n4, n5).
# NOTE(review): the loop extent was reconstructed from collapsed text; every
# add_child() call is assumed to belong to the loop body - confirm.
for x in xrange(100):
    n = t.add_child()
    n = n.add_child()
    n = n.add_child()
    n2 = n.add_child()
    n3 = n.add_child()
    n4 = n2.add_child()
    n5 = n3.add_child()
# n.size = (10, 10)
# n2.size = (10, 70)
# n3.size = (40, 40)
# n4.size = (10, 10)
#n2.size = 10
#n3.size = 10
#n5.size = 10
#n2.dist = 0.1
#n2.size = 1
from ete2 import Tree, TreeStyle, NodeStyle, PhyloTree, faces
from ete2.treeview.faces import *
from ete2.treeview.main import random_color, _NODE_TYPE_CHECKER, FACE_POSITIONS

# Put the example directories below ETEPATH at the front of the module search
# path so that the example modules can be imported by name.
sys.path.insert(0, os.path.join(ETEPATH, "examples/treeview"))
import face_grid, bubble_map, item_faces, node_style, node_background, face_positions, face_rotation, seq_motif_faces, barchart_and_piechart_faces
sys.path.insert(0, os.path.join(ETEPATH, "examples/phylogenies"))
import phylotree_visualization

# Container tree: every child added below carries one example tree, wrapped
# in a TreeFace, in the "aligned" position (column 0).
main_tree = Tree()
main_tree.dist = 0

t, ts = face_grid.get_example_tree()
t_grid = TreeFace(t, ts)
n = main_tree.add_child()
n.add_face(t_grid, 0, "aligned")

t, ts = bubble_map.get_example_tree()
t_bubble = TreeFace(t, ts)
n = main_tree.add_child()
n.add_face(t_bubble, 0, "aligned")

t, ts = item_faces.get_example_tree()
t_items = TreeFace(t, ts)
n = main_tree.add_child()
n.add_face(t_items, 0, "aligned")

t, ts = node_style.get_example_tree()
t_nodest = TreeFace(t, ts)
# NOTE(review): the snippet ends here; the matching add_face() call for
# t_nodest is not visible.
n = main_tree.add_child()
def createLineageTrees(self, fn=None, width=None, height=None, circular=False, withAppearing=True, from_t=0, to_t=0):
    """Build one tree holding all lineages of the tracking events and plot it.

    fn, width, height -- forwarded to ``self.plotTree`` (``fn`` as ``out_fn``)
    circular          -- forwarded to ``self.plotTree`` as ``circularTree``
    withAppearing     -- when False, objects appearing after the first frame
                         do not start a lineage of their own
    from_t, to_t      -- first and last index into the event list

    ``nodeMap`` maps the name of every currently open track to the tree node
    its branch hangs from and ``branchSize`` holds the number of Move events
    seen since that node; a branch is materialised when the track
    disappears, divides, or is still open at the end.

    Raises ``Exception`` for an event type other than Appearance,
    Disappearance, Division or Move.
    """
    from ete2 import Tree, NodeStyle

    tree = Tree()
    style = self.getNodeStyle()
    divisionStyle = self.getNodeStyle()

    # White-on-white style that hides the artificial branches to the root.
    invisibleNodeStyle = NodeStyle()
    invisibleNodeStyle["hz_line_color"] = "white"
    invisibleNodeStyle["vt_line_color"] = "white"
    invisibleNodeStyle["fgcolor"] = "white"

    distanceFromRoot = 0
    nodeMap = {}
    branchSize = {}

    def startLineage(name, dist):
        # Attach a new lineage directly below the root and hide every branch
        # on the way up to the root.
        appNode = tree.add_child(name=name, dist=dist)
        nodeMap[str(name)] = appNode
        branchSize[str(name)] = 0
        n = appNode
        while n is not None:
            n.set_style(invisibleNodeStyle)
            n = n.up

    events = self.mainOperator.innerOperators[0].events

    # add all nodes which appear in the first frame
    for event in events[from_t]:
        if event.type != pgmlink.EventType.Appearance:
            label = event.traxel_ids[0]
            startLineage(self.getNodeName(0, label), distanceFromRoot)

    # add all lineages
    for t, events_at in enumerate(events[from_t:to_t + 1], 1):
        for event in events_at:
            if event.type == pgmlink.EventType.Appearance:
                # With withAppearing=False the event is ignored; it used to
                # fall through to the "not implemented" error below.
                if withAppearing:
                    label = event.traxel_ids[0]
                    startLineage(self.getNodeName(t, label), distanceFromRoot + t)

            elif event.type == pgmlink.EventType.Disappearance:
                label = event.traxel_ids[0]
                name = self.getNodeName(t - 1, str(label))
                key = str(name)
                if key not in nodeMap:
                    continue
                parent = nodeMap.pop(key)
                size = branchSize.pop(key)
                if size == 0:
                    # The track never moved: no branch to draw.
                    continue
                newNode = parent.add_child(name=name, dist=size)
                newNode.set_style(style)

            elif event.type == pgmlink.EventType.Division:
                labelOld = event.traxel_ids[0]
                labelNew1 = event.traxel_ids[1]
                labelNew2 = event.traxel_ids[2]
                oldKey = str(self.getNodeName(t - 1, str(labelOld)))
                if oldKey not in nodeMap:
                    continue
                # NOTE(review): the node name is derived from the already
                # formatted name of the old track, as in the original code.
                newNode = nodeMap[oldKey].add_child(
                    name=self.getNodeName(t - 1, oldKey),
                    dist=branchSize[oldKey])
                del nodeMap[oldKey]
                del branchSize[oldKey]
                newNode.set_style(divisionStyle)
                # Both daughters continue from the division node.
                for labelNew in (labelNew1, labelNew2):
                    newKey = str(self.getNodeName(t, str(labelNew)))
                    nodeMap[newKey] = newNode
                    branchSize[newKey] = 1

            elif event.type == pgmlink.EventType.Move:
                labelOld = event.traxel_ids[0]
                labelNew = event.traxel_ids[1]
                oldKey = str(self.getNodeName(t - 1, str(labelOld)))
                if oldKey not in nodeMap:
                    continue
                # Same track, new name: carry the node over and lengthen the
                # pending branch by one time step.
                newKey = str(self.getNodeName(t, str(labelNew)))
                nodeMap[newKey] = nodeMap[oldKey]
                del nodeMap[oldKey]
                branchSize[newKey] = branchSize[oldKey] + 1
                del branchSize[oldKey]

            else:
                raise Exception("lineage tree generation not implemented for event type " + str(event.type))

    # Close every track that is still open after the last frame.
    for label, node in list(nodeMap.items()):
        node.add_child(name=label, dist=branchSize[label])

    self.plotTree(tree, out_fn=fn, rotation=270, show_leaf_name=False,
                  show_branch_length=False, circularTree=circular,
                  show_division_nodes=False, distance_between_branches=4,
                  width=width, height=height)
from ete2 import Tree
t = Tree()  # Creates an empty tree
A = t.add_child(name="A")  # Adds a new child to the current tree root
                           # and returns it
B = t.add_child(name="B")  # Adds a second child to the current tree
                           # root and returns it
C = A.add_child(name="C")  # Adds a new child to one of the branches
D = C.add_sister(name="D")  # Adds a second child to the same branch as
                            # before, but using a sister as the starting
                            # point
R = A.add_child(name="R")  # Adds a third child to the
                           # branch. Multifurcations are supported
# Next, 6 random leaves are added to the R branch. names_library is an
# optional argument. If no names are provided, they will be generated
# randomly.
R.populate(6, names_library=["r1","r2","r3","r4","r5","r6"])
# Prints the tree topology
print t
# Example output (the diagram is cut off at the end of this snippet):
#                     /-C
#                    |
#                    |--D
#                    |
#           /--------|                              /-r4
#          |         |                    /--------|
#          |         |          /--------|          \-r3
#          |         |         |         |
#          |         |         |          \-r5
#          |          \--------|
# ---------|                   |                    /-r6
#          |                   |          /--------|
#          |                    \--------|          \-r2
class SiteSpider:
    """Crawl a site through a browser driver and record its links in a tree.

    The link graph is an ete ``Tree`` (``self.t``) whose single top-level
    child (``self.root``) is the start URL. Every link found on a page becomes
    a child node named after the link URL, with two extra features:

    * ``path``    -- label for the node (see ``_get_url_path``)
    * ``advance`` -- whether the crawler should follow the link

    Parameters
    ----------
    driver:
        Browser driver. The spider uses ``get``, ``current_url``,
        ``find_elements_by_tag_name`` and ``window_handles`` on it
        (presumably a Selenium WebDriver -- confirm with the caller).
    target_url:
        URL where the crawl starts; its network location defines
        "same domain".
    depth:
        Number of leading path segments compared by ``_url_same``; a negative
        value means whole URLs are compared. Also handed to ``UrlCache``.
    delay:
        Seconds to sleep after each page load.
    mitm:
        When true, a signalling request is sent to the local proxy before
        every page load.
    """

    def __init__(self, driver, target_url, depth=-1, delay=5, mitm=False):
        self.driver = driver
        self.target_url = target_url
        self.t = Tree()
        self.root = self.t.add_child(name=target_url)
        self.root.add_features(path=target_url, advance=True)
        self.depth = depth
        self.delay = delay
        self.subscribers = []
        self.url_cache = UrlCache(self.depth)
        self.mitm = mitm

    def auth(self, handler):
        """Let ``handler(driver, target_url)`` perform any login steps."""
        handler(self.driver, self.target_url)

    def _is_same_domain(self, href):
        """Return True when ``href`` has the same netloc as the start URL."""
        return urlparse(href).netloc == urlparse(self.target_url).netloc

    def _url_same(self, url1, url2):
        """Return True when two URLs count as the same page.

        With a negative ``depth`` the URLs must be equal. Otherwise only the
        first ``depth`` "/"-separated path segments are compared (fewer when
        either path is shorter); the domain is not looked at in that case.
        """
        if self.depth < 0:
            return url1 == url2
        path1 = urlparse(url1).path.split("/")
        path2 = urlparse(url2).path.split("/")
        limit = min(self.depth, len(path1), len(path2))
        return path1[:limit] == path2[:limit]

    def _has_visited(self, url):
        """Return whether the URL cache reports ``url`` as visited."""
        return self.url_cache.has_visited(url)

    def _has_sister(self, node, url):
        """Return True when a child of ``node`` matches ``url`` per ``_url_same``."""
        return any(self._url_same(sister.name, url) for sister in node.children)

    def _get_url_path(self, url):
        """Return the label used in the tree for ``url``.

        Same-domain URLs are labelled with their path only; any other URL is
        labelled with the full URL.
        """
        if not self._is_same_domain(url):
            return url
        return urlparse(url).path

    def _get_link_url(self, a):
        """Return the ``href`` of anchor element ``a``, or None to skip it.

        Anchors without an ``href`` and anchors whose ``href`` ends in "#"
        are skipped.
        """
        child_url = a.get_attribute("href")
        if not child_url:
            return None
        # A trailing "#" means JavaScript handles the link; there is no
        # target page to load, so the link is ignored.
        if child_url.endswith("#"):
            logger.debug("Ignoring dynamic link.")
            return None
        return child_url

    def crawl(self):
        """Crawl the site starting from the root node."""
        self._crawl(self.root)

    def _should_advance(self, child, child_url):
        """Return True when ``child_url`` is on-domain and not yet visited.

        ``child`` is accepted for interface compatibility but not used.
        """
        return self._is_same_domain(child_url) and not self._has_visited(child_url)

    def _close_windows(self):
        """Read the driver's window handles.

        NOTE(review): this looks like an unfinished stub -- the handles are
        fetched but nothing is closed.
        """
        wins = self.driver.window_handles

    def _call_subscribers(self):
        """Notify every subscriber that a page has been visited.

        Notification is best effort: a failing subscriber is logged and the
        remaining subscribers are still called. Only ``Exception`` subclasses
        are contained, so KeyboardInterrupt/SystemExit still propagate.
        """
        for s in self.subscribers:
            try:
                s.on_page_visited()
            except Exception:
                logger.exception("Subscriber %r failed in on_page_visited", s)

    def _proxy_signal_url(self, page_url):
        """Return the proxy URL that announces ``page_url`` is about to load.

        The page URL is base64-encoded into the ``page`` query parameter.
        Text is encoded as UTF-8 first so non-ASCII URLs do not fail, and the
        token is converted back to ``str`` where ``b64encode`` returns bytes.
        """
        raw = page_url if isinstance(page_url, bytes) else page_url.encode("utf-8")
        token = base64.b64encode(raw)
        if not isinstance(token, str):
            token = token.decode("ascii")
        return 'http://127.0.0.1:8080/?page=' + token

    def _crawl(self, node):
        """Load the page for ``node``, record its links, recurse into them.

        The node is detached from the tree when loading it ends up on another
        domain. Raises AssertionError when the number of anchors on the page
        changes while they are being processed.
        """
        self.url_cache.cache(node.name)

        # Hack to tell the proxy we are requesting a new page.
        if self.mitm:
            self.driver.get(self._proxy_signal_url(node.name))
        self.driver.get(node.name)

        # A same-domain link may redirect to a URL outside the domain, which
        # is only known after visiting it. If so, abort and drop the node.
        if not self._is_same_domain(self.driver.current_url):
            node.detach()
            logger.warning("Aborting, not same domain")
            return

        self._call_subscribers()
        time.sleep(self.delay)
        logger.info(self.driver.current_url)

        # Anchors are re-queried and accessed by index on every iteration
        # because element references may lose their page context.
        anchor_count = len(self.driver.find_elements_by_tag_name("a"))
        for index in range(anchor_count):
            new_anchors = self.driver.find_elements_by_tag_name("a")
            # Explicit raise (not `assert`) so the check survives `python -O`.
            if len(new_anchors) != anchor_count:
                raise AssertionError(
                    "anchor count changed from %d to %d while crawling %s"
                    % (anchor_count, len(new_anchors), node.name))

            child_url = self._get_link_url(new_anchors[index])
            # Only add links that have not been visited already.
            if not child_url or self._has_visited(child_url):
                continue

            child = node.add_child(name=child_url)
            child.add_feature("path", self._get_url_path(child_url))
            # Off-domain and already-visited pages are recorded but never
            # crawled.
            child.add_feature("advance", self._should_advance(child, child_url))

        logger.debug(self.url_cache)

        # Iterate over a snapshot: a recursive call may detach its node from
        # `node.children`, and mutating the live list during iteration would
        # silently skip the following sibling.
        for child in list(node.children):
            if child.advance:
                self._crawl(child)

    def get_link_graph(self):
        """Return the tree holding the crawled link graph."""
        return self.t

    def add_subscriber(self, subscriber):
        """Register an object whose ``on_page_visited()`` runs after each page load."""
        self.subscribers.append(subscriber)
curr_cols = curr_cols + 1 continue if "note" == _string: break num = s.cell_value(curr_row, curr_cols) if num == '' or type(num) != float: curr_cols = curr_cols + 1 continue parent_name = parent_name + '@' + _string curr_cols = curr_cols + 1 if parent_name == '': t.add_child(name = clock_name) clock_tree_array.append(copy.deepcopy(t)) # print clock_name + ':'+ parent_name curr_row = curr_row + 1 #find the tree that belongs to this root for _table_index in range(len(clock_tree_array)): t = clock_tree_array[_table_index] # print "doing root name = " + t.children[0].name add_child_to_node(s, t.children[0], t.children[0].name) #find the leaf's all parent, using '@' to split. for _table_index in range(len(clock_tree_array)): t = clock_tree_array[_table_index]