def __init__(self, newick=None, text_array=None,
             fdist=clustvalidation.default_dist):
    """Set up a cluster-tree node, optionally loading a tree and array data.

    newick     -- forwarded unchanged to TreeNode.__init__.
    text_array -- when truthy, handed to self.link_to_arraytable().
    fdist      -- distance function installed through
                  self.set_distance_function(); this only happens when a
                  newick was supplied.  The default is
                  clustvalidation.default_dist (spearman_dist when the
                  scipy module is loaded, otherwise euclidean_dist).
    """
    # The base initializer sets up the basic tree features and loads the
    # newick (if any); it runs before any cluster attribute is assigned.
    TreeNode.__init__(self, newick)

    # Cached per-cluster values all start out unset.
    for private_name in ("_fdist", "_silhouette", "_intercluster_dist",
                         "_intracluster_dist", "_profile", "_std_profile"):
        setattr(self, private_name, None)

    # Register the cluster specific features on this node.
    for feature_name in ("intercluster_dist", "intracluster_dist",
                         "silhouette", "profile", "deviation"):
        self.features.add(feature_name)

    # Attach the array data, if any was given.
    if text_array:
        self.link_to_arraytable(text_array)

    # The distance function is only installed for trees built from a newick.
    if newick:
        self.set_distance_function(fdist)
def __init__(self, newick=None, alignment=None, alg_format="fasta",
             sp_naming_function=_parse_species, format=0):
    """Set up a phylogenetic node, optionally loading a tree and alignment.

    newick             -- forwarded to TreeNode.__init__.
    alignment          -- when truthy, linked to the tree through
                          self.link_to_alignment(alignment, alg_format).
    alg_format         -- alignment format passed to link_to_alignment.
    sp_naming_function -- installed via
                          self.set_species_naming_function(), only when a
                          newick was supplied.
    format             -- forwarded to TreeNode.__init__.
    """
    # Caution! These defaults have to be in place before the native
    # __init__ runs; in particular _speciesFunction must already be None.
    self._name, self._species, self._speciesFunction = "NoName", "Unknown", None
    TreeNode.__init__(self, newick=newick, format=format)

    # Reached only once the whole tree has been read: 'alignment' is not
    # passed to the PhyloNode constructor during parsing.
    if alignment:
        self.link_to_alignment(alignment, alg_format)

    if not newick:
        return
    self.set_species_naming_function(sp_naming_function)
def __init__(self, newick=None, alignment=None, alg_format="fasta",
             sp_naming_function=_parse_species, format=0, **kargs):
    """Set up a phylogenetic node, optionally loading a tree and alignment.

    newick             -- forwarded to TreeNode.__init__.
    alignment          -- when truthy, linked to the tree through
                          self.link_to_alignment(alignment, alg_format).
    alg_format         -- alignment format passed to link_to_alignment.
    sp_naming_function -- installed via
                          self.set_species_naming_function(), only when a
                          newick was supplied.
    format             -- forwarded to TreeNode.__init__.
    **kargs            -- any extra keyword arguments, forwarded untouched
                          to TreeNode.__init__.
    """
    # Caution! These defaults have to be in place before the native
    # __init__ runs; in particular _speciesFunction must already be None.
    self._name, self._species, self._speciesFunction = "NoName", "Unknown", None
    TreeNode.__init__(self, newick=newick, format=format, **kargs)

    # Reached only once the whole tree has been read: 'alignment' is not
    # passed to the PhyloNode constructor during parsing.
    if alignment:
        self.link_to_alignment(alignment, alg_format)

    if not newick:
        return
    self.set_species_naming_function(sp_naming_function)
def buildDSAndSpeciesTree(self):
    """Build a DS-tree for the stored relationships and a consistent species tree.

    Both roots are fresh TreeNode objects carrying a ``set`` attribute: the
    DS-tree root holds a copy of self.genes, the species-tree root holds
    the species that self.treelessGeneSpeciesMapping gives for those genes.
    The actual construction is delegated to self._buildDSAndSpeciesTree.

    Returns the list [dstree, speciesTree] when the helper reports
    success, and False when no such pair of trees can be built.
    The DS-tree built prioritizes dups first.
    """
    initial_genes = self.genes.copy()

    ds_root = TreeNode()
    ds_root.set = set(initial_genes)

    species_root = TreeNode()
    species_root.set = set(
        self.treelessGeneSpeciesMapping[gene] for gene in initial_genes)

    succeeded = self._buildDSAndSpeciesTree(
        initial_genes, [ds_root], species_root)
    if not succeeded:
        return False

    # DS-tree first, species tree second.
    return [ds_root, species_root]
def validate_tree(tree_path, msa_path):
    '''Load a tree and a FASTA MSA and link every leaf to an MSA row.

    A leaf is linked to the first alignment row whose description contains
    the leaf name (substring test).

    Returns the tuple (tree, msa, leaf_information), where
    leaf_information maps each leaf name to its alignment row.

    Raises Exception when a leaf name matches a row but has already been
    linked (the name is not unique), or when no row matches a leaf.
    '''
    tree = TreeNode(newick=tree_path, format=0)
    msa = AlignIO.read(msa_path, 'fasta')

    # leaf accession -> row of the MSA it was matched to
    leaf_information = {}

    for leaf in tree.get_leaves():
        for alignment_row in msa:
            if leaf.name not in alignment_row.description:
                continue
            if leaf.name in leaf_information:
                raise Exception("%s is found in the tree/MSA twice, accessions must be unique" % (leaf.name))
            leaf_information[leaf.name] = alignment_row
            break
        else:
            # The row scan finished without hitting 'break': nothing matched.
            raise Exception("%s is in the tree but not found in the MSA" % (leaf.name))

    return (tree, msa, leaf_information)
def generateRandomProblem(nbGenes, nbSpecies, orthologProb=0.5, paralogProb=0.4):
    """
    Generates a random set of genes, orthologs, paralogs and species tree,
    ready to be input in ConstraintGraph class.

    Returned value has the form
    { "genes" : geneset, "orthologs" : orthologs, "paralogs" : paralogs,
      "speciesTree" : speciesTree }
    geneset items have the form [GENENAME]:[SPECIESNAME].

    Every species first receives one gene; the remaining
    nbGenes - nbSpecies genes get a species drawn at random. Two genes of
    the same species always end up in paralogs. For any other pair a
    single random draw p decides: p < orthologProb makes the pair
    orthologs, orthologProb <= p < orthologProb + paralogProb makes it
    paralogs, and anything above leaves the pair unrelated.

    :argument nbGenes: Number of genes to generate
    :argument nbSpecies: Number of species (leaves populated in the species tree)
    :argument orthologProb: chances for 2 genes to be a pair in orthologs
    :argument paralogProb: chances for 2 genes to be a pair in paralogs
    """
    species_tree = TreeNode()
    species_tree.populate(nbSpecies, range(nbSpecies))
    for node in species_tree:
        if node.name is not None:
            node.name = str(node.name)

    # One gene per species first; its index doubles as the species name.
    gene_list = ["g" + str(idx) + ":" + str(idx) for idx in range(nbSpecies)]

    # The rest of the genes are assigned to random species.
    for _ in range(nbGenes - nbSpecies):
        species_idx = random.randint(0, nbSpecies - 1)
        gene_list.append("g" + str(len(gene_list)) + ":" + str(species_idx))

    # Upper bound of the draw interval that yields a paralogy.
    paralog_ceiling = paralogProb + orthologProb

    # Species part of every gene name, split once up front.
    species_of = [gene.split(":")[1] for gene in gene_list]

    ortholog_pairs = set()
    paralog_pairs = set()
    for i in range(nbGenes):
        for j in range(i + 1, nbGenes):
            pair = (gene_list[i], gene_list[j])
            if species_of[i] == species_of[j]:
                paralog_pairs.add(pair)
                continue
            draw = random.random()
            if draw < orthologProb:
                ortholog_pairs.add(pair)
            elif draw < paralog_ceiling:
                paralog_pairs.add(pair)

    return {
        "genes": set(gene_list),
        "orthologs": ortholog_pairs,
        "paralogs": paralog_pairs,
        "speciesTree": species_tree,
    }
orthologsStr = pz[1] elif arg.startswith("--paralogs="): pz = arg.split("=") paralogsStr = pz[1] elif not arg.startswith('-'): if graphfile1 == '': graphfile1 = arg elif graphfile2 == '': graphfile2 = arg # Map each species name to a leaf. This avoids using search_nodes, which is slow # speciesTree is used in every mode, so we build it right here right now speciesTree = None if speciesTreeStr != '': speciesTree = TreeNode(speciesTreeStr) speciesLeavesList = speciesTree.get_leaves() speciesLeaves = {} for leaf in speciesLeavesList: speciesLeaves[leaf.name] = leaf class ConstraintGraph: """ Given a set of genes, and and two set of tuples that represent orthology and paralogy relationships between genes, builds a graph of orthologies, and a graph of paralogies. It then becomes possible to build a DS-tree that satisifies all these constraints (or detect that it can't be done). """ def __init__(self, genes, orthology_relations, paralogy_relations, speciesTree = None, geneSpeciesMapping = None, treelessGeneSpeciesMapping = None): """Constructor. The 2 graphs are built here, as adjacency lists (self.orthologs and self.paralogs, two dicts with key = gene-string, value = neighbors as a set of gene-strings).
def __init__(self, newick=None, format=0, dist=None, support=None, name=None):
    """Default initializer for the TreeClass.

    Every argument is handed to TreeNode.__init__ under the same keyword;
    nothing else is set up here. This works better than wrapping the
    entire class.
    """
    TreeNode.__init__(
        self,
        newick=newick,
        format=format,
        dist=dist,
        support=support,
        name=name,
    )
#treeid = "ENSGT00390000013823" server = "http://beta.rest.ensembl.org" ext = "/genetree/id/" + treeid + "?" resp, content = http.request(server+ext, method="GET", headers={"Content-Type":"application/json"}) if not resp.status == 200: print "Invalid response: ", resp.status continue #evil continue decoded = json.loads(content) tree = decoded['tree'] geneNodeMap = {} root = TreeNode() visit_json_elem(tree, root, geneNodeMap) orthologs = set() paralogs = set() paralogs_dubious = set() for g1 in geneNodeMap: for g2 in geneNodeMap: if g1 != g2: n1 = geneNodeMap[g1] n2 = geneNodeMap[g2] lca = n1.get_common_ancestor(n2) if lca.name == 'duplication' or lca.name == 'gene_split':
################################## ass_node2parent = {"1": "1"} for nodeid in taxa: parentid = node2parent[nodeid] while nodeid != parentid: #costruiamo un nuovo dizionario per i soli taxa che abbiamo identificato nel campione ass_node2parent[nodeid] = parentid nodeid = parentid parentid = node2parent[nodeid] node2parentid = {} for nodeid in ass_node2parent.keys(): parentid = ass_node2parent[nodeid] # Stores node connections all_ids.update([nodeid, parentid]) # Creates a new TreeNode instance for each new node in file n = TreeNode() # Sets some TreeNode attributes n.add_feature("name", node2name[nodeid]) n.add_feature("taxid", nodeid) n.add_feature("Order", node2order[nodeid]) # updates node list and connections node2parentid[n] = parentid id2node[nodeid] = n print len(id2node) # Reconstruct tree topology from previously stored tree connections print 'Reconstructing tree topology...' for node in id2node.itervalues(): parentid = node2parentid[node] parent = id2node[parentid] # node with taxid=1 is the root of the tree
def load_NCBI(species_file, names_file, nodes_file):
    """Build the NCBI taxonomy tree and resolve a list of wanted species.

    species_file -- text file with one species name per line (blank lines
                    are ignored).
    names_file   -- "names.dmp"-style file; "<names_file>.bz2" is used
                    when the plain file does not exist.
    nodes_file   -- "nodes.dmp"-style file; "<nodes_file>.bz2" is used
                    when the plain file does not exist.

    Returns the tuple (t, id2node, all_wanted_species):
      t                  -- the node whose taxid is "1" (the root),
      id2node            -- dict mapping taxid string -> TreeNode,
      all_wanted_species -- dict mapping species name -> taxid string.

    Exits the process with status 8 when an input file is missing and with
    status 9 when a wanted species name does not occur in names_file.
    Raises KeyError when a node has no "scientific name" entry or refers to
    an unknown parent, and UnboundLocalError when no node has taxid "1".

    Side effect: sys.stdout is replaced by an unbuffered file object.
    """
    if not os.path.isfile(species_file):
        print("ERROR "+species_file+' can\'t be read. Exiting... ')
        sys.exit(8)

    all_wanted_species = {}  # species_name: taxid (string); -1 until resolved
    print("Reading wanted species from file: "+species_file)
    with open(species_file, 'r') as ifile:
        for iline in ifile:
            species_name = iline.strip()
            # A blank line can never match a taxon name; registering it
            # would only guarantee the "not found" exit below.
            if species_name:
                all_wanted_species[species_name] = -1

    # This sets Unbuffered stdout/auto-flush
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

    id2node = {}
    node2parentid = {}
    id2name = {}

    # Both dump files are opened (or the process exits) before any parsing
    # starts, nodes first and names second.
    NODESFILE = _open_taxonomy_dump(nodes_file)
    try:
        NAMESFILE = _open_taxonomy_dump(names_file)
        try:
            # Reads taxid/names translation
            sys.stdout.write('Loading species names from "names.dmp" file... ')
            for line in NAMESFILE:
                fields = [field.strip() for field in line.strip().split("|")]
                nodeid, name = fields[0], fields[1]
                # Any kind of name (synonyms included) resolves a wanted species.
                if name in all_wanted_species:
                    all_wanted_species[name] = nodeid
                # Lines are redundant: synonyms sit on separate lines sharing
                # one id, so only the "scientific name" line names the node.
                if fields[3] == 'scientific name':
                    id2name[nodeid] = name
        finally:
            NAMESFILE.close()
        print(len(id2name))

        missing_species = [species_name for species_name in all_wanted_species
                           if all_wanted_species[species_name] == -1]
        for species_name in missing_species:
            print("ERROR the species name \""+species_name+"\" was not found!")
        if missing_species:
            sys.exit(9)

        # Reads node connections in nodes.dmp
        sys.stdout.write('Loading node connections from "nodes.dmp" file... ')
        for line in NODESFILE:
            fields = [field.strip() for field in line.strip().split("|")]
            nodeid, parentid = fields[0], fields[1]
            if nodeid == "" or parentid == "":
                # Pauses for the operator, then carries on with the record.
                raw_input("Wrong nodeid!")

            # Creates a new TreeNode instance for each new node in file
            n = TreeNode()
            n.add_feature("name", id2name[nodeid])
            n.add_feature("taxid", nodeid)

            # updates node list and connections
            node2parentid[n] = parentid
            id2node[nodeid] = n
        print(len(id2node))
    finally:
        NODESFILE.close()

    # Reconstruct tree topology from previously stored tree connections
    print('Reconstructing tree topology...')
    for node in id2node.itervalues():
        parent = id2node[node2parentid[node]]
        # node with taxid=1 is the root of the tree
        if node.taxid == "1":
            t = node
        else:
            parent.add_child(node)
    return t, id2node, all_wanted_species


def _open_taxonomy_dump(path):
    """Open *path*, or "<path>.bz2" if only that exists; exit(8) if neither does."""
    if os.path.exists(path):
        return open(path)
    compressed_path = path + '.bz2'
    # The existence test must use the same name that is opened afterwards.
    if os.path.exists(compressed_path):
        import bz2
        return bz2.BZ2File(compressed_path)
    print(path+' file is missing. ')
    sys.exit(8)
def __init__(self, newick=None, formating=0):
    """Initialize the base tree, then reset the abundance bookkeeping.

    newick    -- passed positionally to TreeNode.__init__.
    formating -- passed as the second positional argument of
                 TreeNode.__init__.
    """
    TreeNode.__init__(self, newick, formating)

    # Abundance values are unknown until something assigns them.
    self.abundance = self.nabundance = None
    self.fulltree = False
def build_tax_tree():
    """Build the taxonomy tree from dump files in the current directory.

    Reads "names_scientific.dmp" (taxid | name) and "nodes.dmp"
    (taxid | parent taxid | rank); for either file the ".bz2" variant is
    used when the plain file does not exist. One TreeNode is created per
    line of nodes.dmp, carrying the features "name", "taxid" and "rank",
    and every node except the one with taxid "1" (the root) is attached
    to its parent.

    Returns the tuple (id2node, id2name):
      id2node -- dict mapping taxid string -> TreeNode,
      id2name -- dict mapping taxid string -> name.

    Raises IOError when one of the dump files is missing, and KeyError
    when a node has no name entry or refers to an unknown parent.

    Side effect: sys.stdout is replaced by an unbuffered file object.
    """
    import os
    import sys
    from ete2 import TreeNode

    def _open_dump(filename):
        """Open *filename*, or "<filename>.bz2" if only that one exists."""
        if os.path.exists(filename):
            return open(filename)
        if os.path.exists(filename + ".bz2"):
            import bz2
            return bz2.BZ2File(filename + ".bz2")
        # Fail right here instead of continuing with an unopened file and
        # crashing later on an unrelated unbound-name error.
        raise IOError('"%s" file is missing.' % filename)

    # This sets Unbuffered stdout/auto-flush
    # NOTE(review): kept because callers may rely on the side effect.
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

    id2node = {}
    node2parentid = {}
    id2name = {}

    # Loads info from NCBI taxonomy files; both must be available before
    # any parsing starts.
    nodes_handle = _open_dump("nodes.dmp")
    try:
        names_handle = _open_dump("names_scientific.dmp")
        try:
            # Reads taxid/names translation
            for line in names_handle:
                fields = [field.strip() for field in line.strip().split("|")]
                nodeid, name = fields[0], fields[1]
                id2name[nodeid] = name
        finally:
            names_handle.close()

        # Reads node connections in nodes.dmp
        for line in nodes_handle:
            fields = [field.strip() for field in line.strip().split("|")]
            nodeid, parentid, rankid = fields[0], fields[1], fields[2]
            if nodeid == "" or parentid == "":
                # Pauses for the operator, then carries on with the record.
                raw_input("Wrong nodeid!")

            # Creates a new TreeNode instance for each new node in file
            n = TreeNode()
            n.add_feature("name", id2name[nodeid])
            n.add_feature("taxid", nodeid)
            n.add_feature("rank", rankid)

            # updates node list and connections
            node2parentid[n] = parentid
            id2node[nodeid] = n
    finally:
        nodes_handle.close()

    # Reconstruct tree topology from previously stored tree connections
    for node in id2node.itervalues():
        parent = id2node[node2parentid[node]]
        # The node with taxid "1" is the root and is not attached to anything.
        if node.taxid != "1":
            parent.add_child(node)
    return id2node, id2name