Exemple #1
1
def validateInputs(msa, tree=None):
	"""Validate the HYPNO inputs, printing a message and exiting on failure.

	Checks, in order: that the MSA file can be opened and that its first
	record parses as FASTA; when ``tree`` is given, that the tree file can be
	opened and parses as Newick; and that ``internet_connected()`` is true.

	Args:
		msa: path to the alignment file.
		tree: optional path to a tree file; the tree checks are skipped when
			it is falsy.

	Returns:
		0 when every check passes. Any failed check calls ``sys.exit(1)``.
	"""
	# Check for existence and proper FASTA formatting of input MSA
	try:
		msaHandle = open(msa, "rU")
	except (IOError, OSError):
		print('** HYPNO input error: Given MSA file location does not exist or is not accessible: '+msa)
		sys.exit(1)
	try:
		# Pulling the first record is enough to trigger the parser.
		next(AlignIO.parse(msaHandle, "fasta"))
	except Exception:
		print('** HYPNO input error: improper MSA file format, must be aligned FASTA or a2m format: '+msa)
		sys.exit(1)
	finally:
		msaHandle.close()

	if tree:
		try:
			treeHandle = open(tree, "rU")
		except (IOError, OSError):
			print('** HYPNO input error: Given tree file location does not exist or is not accessible: '+tree)
			sys.exit(1)
		try:
			Phylo.read(treeHandle, "newick")
		except Exception:
			# The offending file here is the tree, so report the tree path.
			print('** HYPNO input error: improper tree file format, must be Newick format: '+tree)
			sys.exit(1)
		finally:
			treeHandle.close()

	if not internet_connected():
		print('** HYPNO connection error: Please connect to the internet to enable HYPNO remote database queries.')
		sys.exit(1)

	return 0
Exemple #2
0
def to_Biopython(tree):
	"""Convert ``tree`` into a Bio.Phylo tree, carrying node attributes over.

	``tree`` must provide ``as_newick_string()`` / ``as_string(schema=...)``,
	``leaf_nodes()`` and ``postorder_internal_node_iter()`` (presumably a
	DendroPy tree -- confirm against callers). Every entry of each source
	node's ``__dict__`` is copied onto the matching Bio.Phylo clade, as a
	float when the value converts and unchanged otherwise.

	Returns:
		The Bio.Phylo tree parsed from the Newick rendering of ``tree``.
	"""
	from Bio import Phylo
	from StringIO import StringIO
	from itertools import izip

	def _copy_attributes(node_pairs):
		# Mirror every instance attribute of the source node onto the target.
		for target, source in node_pairs:
			for key, value in source.__dict__.iteritems():
				try:
					setattr(target, key, float(value))
				except:
					setattr(target, key, value)

	try:
		bT = Phylo.read(StringIO(tree.as_newick_string()), 'newick')
	except:
		# Fallback: use as_string() with its first five characters dropped.
		newick_text = tree.as_string(schema='newick')[5:]
		print("raw string:", newick_text)
		print("stringIO output:", StringIO(newick_text).readlines())
		try:
			bT = Phylo.read(StringIO(newick_text), 'newick')
		except:
			# Last resort: retry with one extra closing parenthesis.
			bT = Phylo.read(StringIO(newick_text+')'), 'newick')

	# Leaves first, then internal nodes in postorder, pairing by position.
	_copy_attributes(izip(bT.get_terminals(), tree.leaf_nodes()))
	_copy_attributes(izip(bT.get_nonterminals(order='postorder'), tree.postorder_internal_node_iter()))
	return bT
def annotate_cOTU_tree(cOTU_tree_string,results_list):
    """Write each result's ``fdr_p`` onto the matching clade as its confidence.

    For every dict in ``results_list`` the Newick subtree rebuilt from
    ``'s_nodes'`` supplies a set of tip names; the common ancestor of those
    tips in the main tree receives ``float(result['fdr_p'])`` as confidence.

    Returns:
        The annotated tree serialised as a Newick string.
    """
    from Bio import Phylo
    from StringIO import StringIO

    annotated = Phylo.read(StringIO(cOTU_tree_string), 'newick', rooted=True)

    for result in results_list:
        subtree_newick = load_de_numericized_newick_tree(
            result['s_nodes'], before="cOTU_", after="")
        subtree = Phylo.read(StringIO(subtree_newick), 'newick', rooted=True)

        # Match by tip name, since the subtree's clades are different objects.
        tip_specs = [{"name": tip.name} for tip in subtree.get_terminals()]
        ancestor = annotated.common_ancestor(tip_specs)
        ancestor.confidence = float(result['fdr_p'])

    buffer = StringIO()
    Phylo.write(annotated, buffer, 'newick')
    return buffer.getvalue()
Exemple #4
0
    def test_phylo_read_extra(self):
        """Check root-to-tip distances on trees with negative branch lengths."""
        named_cases = (
            ("(A:1, B:-2, (C:3, D:4):-2)",
             (('A', 1), ('B', -2), ('C', 1), ('D', 2))),
            ("((A:1, B:-2):-5, (C:3, D:4):-2)",
             (('A', -4), ('B', -7), ('C', 1), ('D', 2))),
        )
        for newick, expectations in named_cases:
            tree = Phylo.read(StringIO(newick), 'newick')
            for tip_name, expected in expectations:
                self.assertEqual(tree.distance(tip_name), expected)

        # Same topology with an unnamed tip, once plain and once with the
        # text broken across lines; each tip distance must be a known key.
        unnamed_cases = (
            "((:1, B:-2):-5, (C:3, D:4):-2)",
            "((:\n1\n,\n B:-2):-5, (C:3, D:4):-2);",
        )
        for newick in unnamed_cases:
            tree = Phylo.read(StringIO(newick), 'newick')
            remaining = {-4.0: 1, -7.0: 1, 1: 1, 2: 1}
            for tip in tree.get_terminals():
                key = int(tree.distance(tip))
                remaining[key] -= remaining[key]
                self.assertEqual(remaining[key], 0)
Exemple #5
0
def Main():
    """Run ``felsenstein`` and ``finalProb`` on the three bundled trees.

    Sets the module globals ``alphabet`` (base -> index) and ``rev_alphabet``
    (index -> base), reads ``tree1.txt``..``tree3.txt`` as rooted Newick
    trees, then prints a banner and processes each root clade in turn.
    """
    global alphabet, rev_alphabet
    alphabet = {"A": 0, "C": 1, "G": 2, "T": 3}
    rev_alphabet = dict((index, base) for base, index in alphabet.items())

    # Load every tree before any output so a missing file fails up front.
    trees = []
    for path in ("tree1.txt", "tree2.txt", "tree3.txt"):
        parsed = Phylo.read(path, "newick")
        parsed.rooted = True
        trees.append(parsed)
    roots = [parsed.clade for parsed in trees]

    banners = (
        "------------------ Tree 1 ------------------",
        "\n\n------------------ Tree 2 ------------------",
        "\n\n------------------ Tree 3 ------------------",
    )
    for banner, root in zip(banners, roots):
        print(banner)
        finalProb(felsenstein(root))
Exemple #6
0
def GetExec():
    """Load every tree file found in the current working directory.

    Entries whose extension starts with ``.dnd`` are read as Newick and those
    starting with ``.xml`` as phyloXML; each parsed tree is marked rooted.

    Returns:
        ``[trees, index]`` where ``trees`` is a list of ``[tree, 'ok']`` pairs
        and ``index`` maps each position to ``(position, filename)``.
    """
    # (extension prefix, Bio.Phylo format name), checked in this order
    readers = (('.dnd', "newick"), ('.xml', "phyloxml"))

    trees = []
    index_to_file = dict()
    for entry in os.listdir(os.getcwd()):
        extension = os.path.splitext(entry)[1]
        for prefix, fmt in readers:
            if extension[0:4] == prefix:
                parsed = Phylo.read(entry, fmt)
                parsed.rooted = True
                position = len(trees)
                trees.append([parsed, 'ok'])
                index_to_file[position] = position, str(entry)
                break
    return [trees, index_to_file]
Exemple #7
0
def get_tree(tree_file, name_tree):
    """Return branch lengths keyed by edge for the tree in ``tree_file``.

    Internal nodes are renamed ``N0``, ``N1``, ... in ``get_nonterminals()``
    order. Terminals take their names, by position, from the terminals of
    the tree in ``name_tree``. Every clade's branch length is overwritten
    with its parsed ``confidence`` value.

    Args:
        tree_file: path to the Newick tree carrying the numeric values.
        name_tree: path to a Newick tree supplying the terminal names.

    Returns:
        ``(edge_to_blen, edge_list)``: a dict mapping ``(parent_name,
        child_name)`` to the edge weight, and the list of those edges sorted
        by the integer following the first character of the parent name.
    """
    with open(tree_file, 'r') as handle:
        tree = Phylo.read(handle, "newick")
    with open(name_tree, 'r') as handle:
        tree_name = Phylo.read(handle, "newick")

    # set node number for nonterminal nodes and specify root node
    for index, clade in enumerate(tree.get_nonterminals()):
        clade.name = 'N' + str(index)
        clade.branch_length = clade.confidence

    # Collect both terminal lists once instead of once per iteration.
    terminals = tree.get_terminals()
    named_terminals = tree_name.get_terminals()
    for index, clade in enumerate(terminals):
        clade.branch_length = clade.confidence
        clade.name = named_terminals[index].name
    tree_phy = tree.as_phyloxml(rooted = 'True')
    tree_nx = Phylo.to_networkx(tree_phy)

    # data = True exposes the branch length as the 'weight' edge attribute
    edge_to_blen = {}
    for parent, child, attrs in tree_nx.edges(data = True):
        edge_to_blen[(parent.name, child.name)] = attrs['weight']

    # sorted() returns a real list on both Python 2 and Python 3.
    edge_list = sorted(edge_to_blen, key = lambda edge: int(edge[0][1:]))

    return edge_to_blen, edge_list
Exemple #8
0
 def test_root_with_outgroup(self):
     """Tree.root_with_outgroup: rerooting keeps tip count and tree length."""
     # Large realistic tree: reroot several times and compare to the start.
     big_tree = Phylo.read(EX_APAF, 'phyloxml')
     tip_count = len(big_tree.get_terminals())
     total_len = big_tree.total_branch_length()
     reroot_calls = (
         # deep internal node
         (('19_NEMVE', '20_NEMVE'), {}),
         # external node
         (('1_BRAFL',), {}),
         # an explicit outgroup branch length mustn't change the total size
         (('2_BRAFL',), {'outgroup_branch_length': 0.5}),
         (('36_BRAFL', '37_BRAFL'), {'outgroup_branch_length': 0.5}),
     )
     for targets, options in reroot_calls:
         big_tree.root_with_outgroup(*targets, **options)
         self.assertEqual(tip_count, len(big_tree.get_terminals()))
         self.assertAlmostEqual(total_len, big_tree.total_branch_length())

     # Small contrived trees: reroot at every clade in turn.
     small_newicks = (
         '(A,B,(C,D));',
         '((E,F),((G,H)),(I,J));',
         '((Q,R),(S,T),(U,V));',
         '(X,Y);',
     )
     for newick in small_newicks:
         small_tree = Phylo.read(StringIO(newick), 'newick')
         expected_len = small_tree.total_branch_length()
         for clade in list(small_tree.find_clades()):
             small_tree.root_with_outgroup(clade)
             self.assertAlmostEqual(expected_len,
                                    small_tree.total_branch_length())
Exemple #9
0
    def test_newick_read_scinot(self):
        """Parse Newick branch lengths in scientific notation."""
        scinot_tree = Phylo.read(StringIO("(foo:1e-1,bar:0.1)"), 'newick')
        first_clade = scinot_tree.clade[0]
        self.assertEqual(first_clade.name, 'foo')
        self.assertAlmostEqual(first_clade.branch_length, 0.1)

        # Additional parsing checks: root-to-tip distances with negative
        # branch lengths, first with every tip named.
        named_cases = (
            ("(A:1, B:-2, (C:3, D:4):-2)",
             (('A', 1), ('B', -2), ('C', 1), ('D', 2))),
            ("((A:1, B:-2):-5, (C:3, D:4):-2)",
             (('A', -4), ('B', -7), ('C', 1), ('D', 2))),
        )
        for newick, expectations in named_cases:
            tree = Phylo.read(StringIO(newick), 'newick')
            for tip_name, expected in expectations:
                self.assertEqual(tree.distance(tip_name), expected)

        # Then with an unnamed tip, plain and with embedded line breaks.
        unnamed_cases = (
            "((:1, B:-2):-5, (C:3, D:4):-2)",
            "((:\n1\n,\n B:-2):-5, (C:3, D:4):-2);",
        )
        for newick in unnamed_cases:
            tree = Phylo.read(StringIO(newick), 'newick')
            remaining = {-4.0:1,-7.0:1,1:1,2:1}
            for tip in tree.get_terminals():
                key = int(tree.distance(tip))
                remaining[key] -= remaining[key]
                self.assertEqual(remaining[key],0)
Exemple #10
0
 def is_starting_tree_valid(starting_tree):
     """Return 1 if the Newick file parses with both libraries, else 0.

     The file at ``starting_tree`` is parsed once with ``Phylo.read`` and
     once with ``dendropy.Tree.get_from_path``. If either raises, a message
     is printed and 0 is returned.
     """
     try:
         Phylo.read(starting_tree, "newick")
         dendropy.Tree.get_from_path(starting_tree, "newick", preserve_underscores=True)
     except Exception:
         # Exception (not a bare except) lets Ctrl-C and sys.exit propagate.
         print("Error with the input starting tree: Is it a valid Newick file?")
         return 0
     return 1
Exemple #11
0
 def test_newick_read_single2(self):
     """Read second Newick file with one tree."""
     tree = Phylo.read(EX_NEWICK2, 'newick')
     self.assertEqual(len(tree.get_terminals()), 33)
     # The taxon name was garbled to 'H**o sapiens', which can never match.
     self.assertEqual(tree.find_any('Homo sapiens').comment, 'modern human')
     self.assertEqual(tree.find_any('Equus caballus').comment, "wild horse; also 'Equus ferus caballus'")
     self.assertEqual(tree.root.confidence, 80)
     # With comments_are_confidence=True a different root value is expected.
     tree = Phylo.read(EX_NEWICK2, 'newick', comments_are_confidence=True)
     self.assertEqual(tree.root.confidence, 100)
 def test_draw(self):
     """Lay out both example trees with Phylo.draw without showing them."""
     pyplot.ioff()   # Turn off interactive display
     dollo, apaf = (Phylo.read(path, 'phyloxml')
                    for path in (EX_DOLLO, EX_APAF))
     for example in (dollo, apaf):
         Phylo.draw(example, do_show=False)
     # Fancier options: branch labels given as a dict, then as a callable.
     for labels in ({apaf.root: 'Root'}, lambda c: c.branch_length):
         Phylo.draw(apaf, do_show=False, branch_labels=labels)
    def test_draw_with_label_colors_callable(self):
        """Lay out trees with a callable label_colors; do not display them."""
        def dollo_color(label):
            # Highlight a single label in red, everything else in black.
            return 'r' if label == 'f_50' else 'k'

        def apaf_color(label):
            return 'r'

        pyplot.ioff()   # Turn off interactive display
        dollo = Phylo.read(EX_DOLLO, 'phyloxml')
        apaf = Phylo.read(EX_APAF, 'phyloxml')

        Phylo.draw(dollo, label_colors=dollo_color, do_show=False)
        Phylo.draw(apaf, label_colors=apaf_color, do_show=False)
Exemple #14
0
 def test_newick_write(self):
     """Write a Newick tree and check internal labels survive a round trip."""
     # Tree with internal node labels
     mem_file = StringIO()
     tree = Phylo.read(StringIO("(A,B,(C,D)E)F;"), "newick")
     Phylo.write(tree, mem_file, "newick")
     mem_file.seek(0)
     tree2 = Phylo.read(mem_file, "newick")
     # Sanity check
     self.assertEqual(tree2.count_terminals(), 4)
     # Check internal node labels were retained. A lost label appears as
     # None in the set and fails the comparison below.
     internal_names = set(c.name for c in tree2.get_nonterminals())
     self.assertEqual(internal_names, set(("E", "F")))
Exemple #15
0
def handleData(sample, current, n):
    """Build a ``Node`` for ``sample`` and link it after ``current``.

    Resets the module globals ``total_branch_length`` and ``total_mutations``
    to 0, then calls ``newick(sample, 1)``, ``analysis(sample)`` and
    ``newick(sample, 2)`` in that order. The globals are read again when the
    node is built (presumably ``analysis`` updates them -- confirm).
    """
    global total_branch_length, total_mutations
    total_branch_length = 0
    total_mutations = 0

    time_form = newick(sample, 1)       # tree in terms of time
    # analysis computes 1) heterozygosity, 2) total branch length,
    # 3) number of mutations
    heterozygosity = analysis(sample)
    mutation_form = newick(sample, 2)   # tree in terms of mutations

    texts = [str(form) for form in (time_form, mutation_form)]
    handles = [StringIO(text) for text in texts]
    time_tree, mutation_tree = [Phylo.read(handle, 'newick') for handle in handles]

    current.next = Node(n, total_branch_length, total_mutations,
                        heterozygosity, time_tree, mutation_tree)
def get_pairwise_distances(seq_series, tree_file = None, seq_file = None):
    """Run muscle on the sequences and return pairwise tree distances.

    Each sequence is written as a FASTA record headed ``<pat>-<visit>``,
    taken from the ``(pat, visit)`` entries of ``seq_series.index``. muscle
    is then run with ``-tree2`` and the resulting Newick tree is read back.

    Args:
        seq_series: object whose ``index`` yields ``(pat, visit)`` pairs and
            whose ``values`` yields the matching sequences.
        tree_file: path the tree is written to; a temporary file when None.
        seq_file: path the FASTA input is written to; a temporary file
            when None.

    Returns:
        dict mapping ``(name1, name2)`` to the tree distance between the two
        terminals, with both orderings of each pair present.

    Raises:
        subprocess.CalledProcessError: if muscle exits with non-zero status.
    """
    # Open the FASTA target in both cases; it was previously left undefined
    # whenever seq_file was supplied.
    fasta_handle = NTF() if seq_file is None else open(seq_file, 'w')
    tree_handle = NTF() if tree_file is None else open(tree_file, 'w')
    try:
        for (pat, visit), seq in zip(seq_series.index, seq_series.values):
            nheader = '%s-%s' % (pat, visit)
            fasta_handle.write('>%s\n%s\n' % (nheader, ''.join(seq)))
        # Make sure the records are on disk before muscle reads the file.
        fasta_handle.flush()
        os.fsync(fasta_handle.fileno())
        cmd = 'muscle -in %(ifile)s -tree2 %(treefile)s -gapopen -2.9'
        cmdlist = shlex.split(cmd % {
                                     'ifile':fasta_handle.name,
                                     'treefile':tree_handle.name
                                     })
        check_call(cmdlist)
        # Read the tree before the handles close, which removes temp files.
        with open(tree_handle.name) as result_handle:
            tree = Phylo.read(result_handle, 'newick')
    finally:
        fasta_handle.close()
        tree_handle.close()

    seq_names = tree.get_terminals()
    dmat = {}
    for p1, p2 in combinations(seq_names, 2):
        d = tree.distance(p1, p2)
        dmat[(p1.name, p2.name)] = d
        dmat[(p2.name, p1.name)] = d

    return dmat
Exemple #17
0
    def call_root2tip(self, tree):
        """
        Call jar file that implements a modified version of Andrew Rambaut's
        root-to-tip method (Path-O-Gen).
        :param tree: a Newick tree string
        :return: a dictionary that includes the time-scaled tree under
            "timetree", plus one entry per column of the CSV output (header
            row gives the keys, second row the values, all as strings)
        :raises subprocess.CalledProcessError: if the jar exits non-zero
        """
        # write tree to temporary file
        with open(self.tmpfile, "w") as handle:
            handle.write(tree)

        out1 = os.path.join(self.tmp, "anchre.r2t.timetree")
        out2 = os.path.join(self.tmp, "anchre.r2t.csv")

        # Discard the child's stdout instead of piping it: check_call never
        # reads a PIPE, so a chatty child could block on a full pipe buffer.
        with open(os.devnull, "w") as devnull:
            subprocess.check_call(
                [self.java, "-jar", "java/RLRootToTip.jar", "-timetree", out1, "-newick", self.tmpfile, out2],
                stdout=devnull,
            )

        # read outputs
        with open(out1, "rU") as handle:
            timetree = Phylo.read(handle, "nexus")
        with open(out2, "rU") as handle:
            coef = handle.readlines()

        # convert NEXUS to Newick string
        newick = self.phylo2newick(timetree)
        res = {"timetree": newick}
        keys = coef[0].strip("\n").split(",")
        values = coef[1].strip("\n").split(",")
        for i, key in enumerate(keys):
            res[key] = values[i]

        return res
Exemple #18
0
def tree(alignment,
         run_id = 'T%05i' % (0,),
         bionj = False):
  """Run phyml on ``alignment`` and return the resulting tree.

  The alignment is written in phylip format to ``infile<run_id>`` inside
  ``config.dataPath('phyml')``. phyml is run there with ``-o n`` when
  ``bionj`` is true and ``-o tlr`` otherwise, and
  ``infile<run_id>_phyml_tree.txt`` is parsed as Newick.

  Args:
    alignment: alignment object accepted by ``aio.write``.
    run_id: suffix that keeps the files of different runs apart.
    bionj: selects the ``-o`` option passed to phyml (see above).

  Returns:
    The tree parsed from phyml's output file.
  """
  old_cwd = os.getcwd()
  new_wd = config.dataPath('phyml')
  if not os.path.isdir(new_wd): os.mkdir(new_wd)
  os.chdir(new_wd)
  # Restore the caller's working directory even if a step below fails.
  try:
    infilepath = 'infile{0}'.format(run_id)
    with open(infilepath,'w') as infile:
      aio.write(alignment, infile, 'phylip')

    optimise = 'n' if bionj else 'tlr'
    command = 'phyml --quiet -i {0} -o {1} '.format(infilepath, optimise)
    print(command)
    # Pass an argument list (no shell) and discard stdout: an unread PIPE
    # can stall the child once the pipe buffer fills.
    with open(os.devnull, 'w') as devnull:
      subprocess.call(['phyml', '--quiet', '-i', infilepath, '-o', optimise],
                      stdout = devnull)
    treefilepath = infilepath + '_phyml_tree.txt'
    with open(treefilepath) as treefile:
      return phylo.read(treefile, 'newick')
  finally:
    os.chdir(old_cwd)
Exemple #19
0
	def removeParalogs(self):
		"""Pick one sequence per taxon and delete the other paralogs.

		Reads the RAxML tree for ``self.OG`` and opens the renamed alignment,
		preferring the ``.contrem`` variant. Tree tips are grouped by the
		code looked up in ``self.sequenceDict``; ``pickParalog`` is called
		for every code with more than one sequence, followed by
		``deleteSeqsFromAlignment``. The alignment handle is always closed.
		"""
		self.getseqsfromCodeFile()
		self.uilist = []
		self.tree_in = Phylo.read(self.PathtoOutput + '/BestRaxTrees/' + self.OG + '_outrax.tree','newick')
		try:
			self.alignment = open(self.PathtoOutput + '/RenamedAlignments/' + self.OG + '_renamed.contrem','r')
		except IOError:
			# No contamination-removed alignment; use the plain renamed one.
			self.alignment = open(self.PathtoOutput + '/RenamedAlignments/' + self.OG + '_renamed','r')
		try:
			for seq in self.tree_in.get_terminals():
				print(self.OG)
				try:
					ui = self.sequenceDict[str(seq).split('_')[0]][1] #ui is MC_mc_code
					self.paralogDict[ui].append(str(seq)) # so len is # of paralogs per taxon
					if ui not in self.uilist:
						self.uilist.append(ui)
				except Exception:
					# Best effort: report the tip that failed and continue.
					print('problem with ' + self.OG)

			for ui in self.uilist:
				print('self.paralogDict[ui] ' + str(self.paralogDict[ui]))
				if len(self.paralogDict[ui]) > 1:
					print(ui)
					self.pickParalog(ui)
			print('seq to delete ' + str(self.seqtoDelete))
			self.deleteSeqsFromAlignment()
		finally:
			self.alignment.close()
def make_tree_figure(wanted_seqs, trop_dict, tree_file):
    """Draw the tree for ``wanted_seqs`` as a graph with coloured nodes.

    Unnamed nodes are relabelled ``Clade-<n>`` and drawn white. Named nodes
    are drawn green when ``trop_dict[name]`` is truthy and red otherwise.

    Args:
        wanted_seqs: sequences passed on to ``get_pairwise_distances``.
        trop_dict: mapping from node name to a truthy or falsy flag.
        tree_file: path of the Newick tree that is read back here.
    """
    # Called for its side effect only; presumably this is what writes
    # tree_file (confirm against get_pairwise_distances).
    get_pairwise_distances(wanted_seqs, tree_file = tree_file)
    with open(tree_file) as handle:
        tree = Phylo.read(handle, 'newick')
    net = Phylo.to_networkx(tree)

    node_mapping = {}
    clade = 1
    for node in net.nodes():
        if node.name is None:
            node_mapping[node] = 'Clade-%i' % clade
            clade += 1
        else:
            node_mapping[node] = node.name
    new_net = networkx.relabel_nodes(net, node_mapping)

    colors = []
    for node in new_net.nodes():
        if node.startswith('Clade'):
            colors.append('w')
        else:
            colors.append('g' if trop_dict[node] else 'r')
    pos = networkx.graphviz_layout(new_net, 'twopi')

    networkx.draw_networkx(new_net, pos, with_labels = False, node_color = colors)
def run_paml_per_group(groups, alignment, tree, output_dir, working_dir):
    """
    Run a PAML analysis for each defined group of genomes.

    For every group, the leaves belonging to the group's genomes are marked
    with " #1" in a copy of the tree, which is saved as
    ``<working_dir>/<group>.tre`` and handed to ``paml_run.ma_m1a`` together
    with the alignment.

    The working dir is kept separate from the output dir because PAML runs
    executing at the same time may overwrite each other's files, which
    matters when this script runs on more than one processor.

    Args:
        groups: mapping from group name to the genome names in that group.
        alignment: alignment passed through to ``paml_run.ma_m1a``.
        tree: path to the Newick tree; leaves are named
            ``organism|protein_id``.
        output_dir: passed through to ``paml_run.ma_m1a``.
        working_dir: directory that receives the per-group tree files.

    Returns:
        dict mapping each group name to its PAML result, or to None when the
        group's genomes are not all in the tree or make up the whole tree.
    """
    from Bio import Phylo
    from SelectionAnalysis import paml_run

    cluster_tree = Phylo.read(tree, "newick")  # Read the input tree

    # Names have a pipe sign (|) with the organism|protein_id.
    # Split every leaf label once and reuse the pieces below.
    leaf_labels = [str(clade).split("|") for clade in cluster_tree.get_terminals()]

    # Key is the protein_id, value is the organism
    clades_in_tree_by_gene_id = {label[1]: label[0] for label in leaf_labels}

    species_in_tree = set(label[0] for label in leaf_labels)

    clade_results = dict()

    for group in groups:
        members = groups[group]

        # All group genomes must be in the tree, and must not be the only ones
        if not (set(members).issubset(species_in_tree) and len(species_in_tree) > len(members)):
            clade_results[group] = None
            continue

        dict_new_clade_names = dict()
        for gene_id, genome in clades_in_tree_by_gene_id.items():
            if genome in members:
                leaf_name = genome + "|" + gene_id
                dict_new_clade_names[leaf_name] = leaf_name + " #1"

        # Replace the names in the tree text and save the marked copy
        with open(tree) as tree_handle:
            old_tree_information = tree_handle.read()

        new_tree_information = multiple_replace(dict_new_clade_names, old_tree_information)

        group_tree = working_dir + "/" + group + ".tre"

        with open(group_tree, 'w') as new_tree_file:
            new_tree_file.write(new_tree_information)

        # Run model for the new tree
        clade_results[group] = paml_run.ma_m1a(alignment, group_tree, output_dir, working_dir)

    return clade_results
Exemple #22
0
def genTaxTree(resolver, namesdict, logger, taxonomy=None, draw=False):
    """Return Phylo from TaxonNamesResolver class.

    Query names absent from ``namesdict`` (after whitespace is replaced by
    underscores) are logged at debug level and left out. The tree string
    built by ``taxTree`` is wrapped together with an extra ``outgroup`` tip
    whose branch length is 22.0 without a taxonomy and
    ``len(taxonomy) + 1`` otherwise.

    Args:
        resolver: object whose ``retrieve`` method returns the rank, query
            name and lineage lists.
        namesdict: container of the names to keep.
        logger: logger used for the unresolved-names message.
        taxonomy: optional taxonomy passed to ``TaxDict``.
        draw: when true, print the tree with ``Phylo.draw_ascii``.

    Returns:
        The Bio.Phylo tree parsed from the generated Newick string.
    """
    ranks = resolver.retrieve('classification_path_ranks')
    qnames = resolver.retrieve('query_name')
    lineages = resolver.retrieve('classification_path')
    # replace ' ' with '_' for taxon tree (raw string: "\s" is an invalid
    # escape in a plain string literal)
    qnames = [re.sub(r"\s", "_", e) for e in qnames]
    resolved_names_bool = [e in namesdict for e in qnames]
    ranks = [ranks[ei] for ei, e in enumerate(resolved_names_bool) if e]
    lineages = [lineages[ei] for ei, e in enumerate(resolved_names_bool) if e]
    # identify unresolved names
    unresolved_names = [qnames[ei] for ei, e in enumerate(resolved_names_bool)
                        if not e]
    idents = [qnames[ei] for ei, e in enumerate(resolved_names_bool) if e]
    statement = "Unresolved names: " + "".join(
        " " + each for each in unresolved_names)
    logger.debug(statement)
    # make taxdict
    taxdict = TaxDict(idents=idents, ranks=ranks, lineages=lineages,
                      taxonomy=taxonomy)
    # make treestring
    treestring = taxTree(taxdict)
    if not taxonomy:
        d = 22  # default_taxonomy + 1 in tnr
    else:
        d = len(taxonomy) + 1
    # add outgroup; the last character of treestring is dropped before wrapping
    treestring = '({0},outgroup:{1});'.format(treestring[:-1], float(d))
    tree = Phylo.read(StringIO(treestring), "newick")
    if draw:
        Phylo.draw_ascii(tree)
    return tree
def rootTree(f, root,output):
	"""Reroot the Newick tree in ``f`` and write the result to ``output``.

	``root`` is a single taxon name, or a comma-separated list of names, in
	which case the tree is rooted on their common ancestor.
	"""
	phylogeny = Phylo.read(f,'newick')
	outgroup = phylogeny.common_ancestor(root.split(',')) if ',' in root else root
	phylogeny.root_with_outgroup(outgroup)
	Phylo.write(phylogeny,output,'newick')
def getPhylotasticTree():
    """Fetch a species tree from Phylotastic and report where it was saved.

    Species names are read from ``<prefix>_species_present.txt``, where the
    prefix is ``getFileName()`` minus its last four characters. They are sent
    to the Phylotastic service, and the returned Newick text is stored in
    ``<prefix>_species_tree.txt``.

    Returns:
        ``response.json`` of a dict with ``vizFile`` (web path of the tree
        file), ``vizLabel`` and ``got_nodes`` (number of terminal nodes in
        the tree received).
    """
    absoluteFileName = getFileName()
    filePrefix = absoluteFileName[:-4]

    # Load the kept nodes and create the comma-delimited species
    # string for sending to PTastic; underscores replace spaces.
    with open(filePrefix+'_species_present.txt') as speciesFile:
        speciesList = [line.strip().replace(' ', '_') for line in speciesFile]
    speciesString = ','.join(speciesList)

    phylotasticUrlBase = 'http://phylotastic-wg.nescent.org/script/phylotastic.cgi?species='
    speciesTreeUrl = phylotasticUrlBase+speciesString+'&tree=mammals&format=newick'
    conn = urllib2.urlopen(speciesTreeUrl)
    try:
        speciesTreeString = conn.read().strip()
    finally:
        conn.close()

    # Close the file explicitly so the text is on disk before it is re-read.
    speciesTreeFilename = filePrefix+'_species_tree.txt'
    with open(speciesTreeFilename,'w') as speciesTreeFile:
        speciesTreeFile.write(speciesTreeString)

    # Count the terminal nodes, i.e. the species present in the returned
    # tree; callers can compare this with the number of species requested.
    tree = Phylo.read(speciesTreeFilename, 'newick')
    got_nodes = tree.count_terminals()

    speciesTreeWebFile = getRelativeWebPath('_species_tree.txt')

    return response.json( dict(vizFile = speciesTreeWebFile,
                               vizLabel = "Phylotastic Species Tree",
                               got_nodes=got_nodes
                               ) )
Exemple #25
0
    def reroot_tree_with_outgroup(tree_name, outgroups):
        """Reroot the Newick file ``tree_name`` on ``outgroups``, in place.

        The outgroup taxa are first reduced with
        ``GubbinsCommon.get_monophyletic_outgroup``. The tree is rerooted
        with Biopython and written back. It is then reloaded with DendroPy,
        derooted and rewritten with single quotes stripped from the output.
        """
        clade_outgroups = GubbinsCommon.get_monophyletic_outgroup(tree_name, outgroups)
        outgroups = [{"name": taxon_name} for taxon_name in clade_outgroups]

        tree = Phylo.read(tree_name, "newick")
        tree.root_with_outgroup(*outgroups)
        Phylo.write(tree, tree_name, "newick")

        tree = dendropy.Tree.get_from_path(tree_name, "newick", preserve_underscores=True)
        tree.deroot()
        tree.update_bipartitions()
        output_tree_string = tree.as_string(
            schema="newick",
            suppress_leaf_taxon_labels=False,
            suppress_leaf_node_labels=True,
            suppress_internal_taxon_labels=False,
            suppress_internal_node_labels=False,
            suppress_rooting=True,
            suppress_edge_lengths=False,
            unquoted_underscores=True,
            preserve_spaces=False,
            store_tree_weights=False,
            suppress_annotations=True,
            annotations_as_nhx=False,
            suppress_item_comments=True,
            node_label_element_separator=" ",
        )
        # The with-block closes the file, so no explicit close is needed.
        with open(tree_name, "w+") as output_file:
            output_file.write(output_tree_string.replace("'", ""))
Exemple #26
0
def _newick_to_nx(newick, default_lineages=None):
    """Convert a Newick string into a networkx DiGraph.

    Each node carries the fields parsed from its Newick comment by
    ``_extract_momi_fields``. Each parent -> child edge carries the child's
    ``branch_length``. An unnamed root is named "root" and other unnamed
    clades are named ``node0``, ``node1``, ... in traversal order.

    Args:
        newick: the tree as a Newick string.
        default_lineages: value stored under 'lineages' for nodes whose
            comment does not provide one; ignored when None.

    Returns:
        The ``nx.DiGraph`` built from the tree's edges and node data.
    """
    phy = Phylo.read(StringIO(newick), "newick")
    phy.rooted = True
    phy.root.name = phy.root.name or "root"

    edges = []
    nodes = []
    pending = [phy.root]
    unnamed_count = 0
    while pending:
        clade = pending.pop()
        nd = _extract_momi_fields(clade.comment or "")
        if 'lineages' not in nd and default_lineages is not None:
            nd['lineages'] = default_lineages
        nodes.append((clade.name, nd))
        for c_clade in clade.clades:
            if c_clade.name is None:
                c_clade.name = "node%d" % unnamed_count
                unnamed_count += 1
            ed = {'branch_length': c_clade.branch_length}
            edges.append((clade.name, (c_clade.name), ed))
        # Queue each child exactly once. Extending inside the loop above
        # re-queued every child once per sibling, so subtrees were revisited
        # exponentially often.
        pending.extend(clade.clades)

    t = nx.DiGraph(data=edges)
    t.add_nodes_from(nodes)
    return t
 def test_draw_ascii(self):
     """Render a tree with draw_ascii into an in-memory buffer."""
     buffer = StringIO()
     tree = Phylo.read(EX_APAF, 'phyloxml')
     # Default width first, then an explicit wider layout.
     for options in ({}, {'column_width': 120}):
         Phylo.draw_ascii(tree, file=buffer, **options)
     buffer.close()
Exemple #28
0
 def test_find_elements(self):
     """TreeMixin: find_elements() method."""
     # Class plus attribute filter (the docstring example)
     octopus_hits = list(
         self.phylogenies[5].find_elements(PhyloXML.Taxonomy, code='OCTVU'))
     self.assertEqual(len(octopus_hits), 1)
     hit = octopus_hits[0]
     self.assertTrue(isinstance(hit, PhyloXML.Taxonomy))
     self.assertEqual(hit.code, 'OCTVU')
     self.assertEqual(hit.scientific_name, 'Octopus vulgaris')

     # Iteration with a regular expression as the attribute value
     expected_alts = (472, 10, 452)
     points = self.phylogenies[10].find_elements(geodetic_datum=r'WGS\d{2}')
     for point, alt in zip(points, expected_alts):
         self.assertTrue(isinstance(point, PhyloXML.Point))
         self.assertEqual(point.geodetic_datum, 'WGS84')
         self.assertAlmostEqual(point.alt, alt)

     # Class filter on its own
     events = list(self.phylogenies[4].find_elements(PhyloXML.Events))
     self.assertEqual(len(events), 2)
     self.assertEqual(events[0].speciations, 1)
     self.assertEqual(events[1].duplications, 1)

     # String filter through find_any
     taxonomy = self.phylogenies[3].find_any("B. subtilis")
     self.assertEqual(taxonomy.scientific_name, "B. subtilis")

     # Integer attribute filter
     apaf = Phylo.read(EX_APAF, 'phyloxml')
     domains = list(apaf.find_elements(start=5))
     self.assertEqual(len(domains), 8)
     for domain in domains:
         self.assertEqual(domain.start, 5)
         self.assertEqual(domain.value, 'CARD')
Exemple #29
0
def test_ancestral():
    """Exercise TreeAnc ancestral reconstruction and likelihood normalisation.

    Part one reconstructs ancestral sequences on the bundled h3n2_na example,
    once marginal and once joint, and compares the root sequence with a fixed
    expected string. Part two builds a three-taxon tree and alignment and
    checks that the exponentiated per-site likelihoods sum to 1 within 1e-6.
    """
    import os
    from Bio import AlignIO
    import numpy as np
    from treetime import TreeAnc, GTR
    # Example data is resolved relative to this file's directory.
    root_dir = os.path.dirname(os.path.realpath(__file__))
    fasta = str(os.path.join(root_dir, 'treetime_examples/data/h3n2_na/h3n2_na_20.fasta'))
    nwk = str(os.path.join(root_dir, 'treetime_examples/data/h3n2_na/h3n2_na_20.nwk'))

    # Both reconstruction modes are compared with the same root sequence.
    for marginal in [True, False]:
        print('loading flu example')
        t = TreeAnc(gtr='Jukes-Cantor', tree=nwk, aln=fasta)
        print('ancestral reconstruction' + ("marginal" if marginal else "joint"))
        t.reconstruct_anc(method='ml', marginal=marginal)
        assert "".join(t.tree.root.sequence) == 'ATGAATCCAAATCAAAAGATAATAACGATTGGCTCTGTTTCTCTCACCATTTCCACAATATGCTTCTTCATGCAAATTGCCATCTTGATAACTACTGTAACATTGCATTTCAAGCAATATGAATTCAACTCCCCCCCAAACAACCAAGTGATGCTGTGTGAACCAACAATAATAGAAAGAAACATAACAGAGATAGTGTATCTGACCAACACCACCATAGAGAAGGAAATATGCCCCAAACCAGCAGAATACAGAAATTGGTCAAAACCGCAATGTGGCATTACAGGATTTGCACCTTTCTCTAAGGACAATTCGATTAGGCTTTCCGCTGGTGGGGACATCTGGGTGACAAGAGAACCTTATGTGTCATGCGATCCTGACAAGTGTTATCAATTTGCCCTTGGACAGGGAACAACACTAAACAACGTGCATTCAAATAACACAGTACGTGATAGGACCCCTTATCGGACTCTATTGATGAATGAGTTGGGTGTTCCTTTTCATCTGGGGACCAAGCAAGTGTGCATAGCATGGTCCAGCTCAAGTTGTCACGATGGAAAAGCATGGCTGCATGTTTGTATAACGGGGGATGATAAAAATGCAACTGCTAGCTTCATTTACAATGGGAGGCTTGTAGATAGTGTTGTTTCATGGTCCAAAGAAATTCTCAGGACCCAGGAGTCAGAATGCGTTTGTATCAATGGAACTTGTACAGTAGTAATGACTGATGGAAGTGCTTCAGGAAAAGCTGATACTAAAATACTATTCATTGAGGAGGGGAAAATCGTTCATACTAGCACATTGTCAGGAAGTGCTCAGCATGTCGAAGAGTGCTCTTGCTATCCTCGATATCCTGGTGTCAGATGTGTCTGCAGAGACAACTGGAAAGGCTCCAATCGGCCCATCGTAGATATAAACATAAAGGATCATAGCATTGTTTCCAGTTATGTGTGTTCAGGACTTGTTGGAGACACACCCAGAAAAAACGACAGCTCCAGCAGTAGCCATTGTTTGGATCCTAACAATGAAGAAGGTGGTCATGGAGTGAAAGGCTGGGCCTTTGATGATGGAAATGACGTGTGGATGGGAAGAACAATCAACGAGACGTCACGCTTAGGGTATGAAACCTTCAAAGTCATTGAAGGCTGGTCCAACCCTAAGTCCAAATTGCAGATAAATAGGCAAGTCATAGTTGACAGAGGTGATAGGTCCGGTTATTCTGGTATTTTCTCTGTTGAAGGCAAAAGCTGCATCAATCGGTGCTTTTATGTGGAGTTGATTAGGGGAAGAAAAGAGGAAACTGAAGTCTTGTGGACCTCAAACAGTATTGTTGTGTTTTGTGGCACCTCAGGTACATATGGAACAGGCTCATGGCCTGATGGGGCGGACCTCAATCTCATGCCTATA'

    print('testing LH normalization')
    from Bio import Phylo,AlignIO
    # NOTE(review): StringIO is not imported in this function; it is assumed
    # to be available at module level -- confirm.
    tiny_tree = Phylo.read(StringIO("((A:0.60100000009,B:0.3010000009):0.1,C:0.2):0.001;"), 'newick')
    # 64 columns: A changes base every 16 sites, B every 4 and C every site,
    # so each of the 4**3 three-taxon site patterns occurs exactly once.
    tiny_aln = AlignIO.read(StringIO(">A\nAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTT\n"
                                     ">B\nAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTT\n"
                                     ">C\nACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT\n"), 'fasta')

    # Custom GTR with non-uniform frequencies (pi) and an all-ones W matrix.
    mygtr = GTR.custom(alphabet = np.array(['A', 'C', 'G', 'T']), pi = np.array([0.9, 0.06, 0.02, 0.02]), W=np.ones((4,4)))
    t = TreeAnc(gtr=mygtr, tree=tiny_tree, aln=tiny_aln)
    t.reconstruct_anc('ml', marginal=True, debug=True)
    # Summed over every possible site pattern, the likelihoods must equal 1.
    lhsum =  np.exp(t.sequence_LH(pos=np.arange(4**3))).sum()
    print (lhsum)
    assert(np.abs(lhsum-1.0)<1e-6)

    # Final smoke call; its result is not checked.
    t.optimize_branch_len()
Exemple #30
0
 def test_raxml(self):
     """Run RAxML using the wrapper."""
     cmd = RaxmlCommandline(raxml_exe,
                            sequences=EX_PHYLIP, model="PROTCATWAG",
                            name="test")
     # The parsimony seed should be set automatically
     self.assertIn('-p', str(cmd))
     # Smoke test
     try:
         out, err = cmd()
         # assertTrue/assertEqual replace the deprecated assert_ alias.
         self.assertTrue(len(out) > 0)
         self.assertEqual(len(err), 0)
         # Check the output tree
         tree = Phylo.read('RAxML_result.test', 'newick')
         self.assertEqual(tree.count_terminals(), 4)
     finally:
         # Remove RAxML-generated files, or RAxML will complain bitterly
         # during the next run
         for fname in ['RAxML_info.test',
                       'RAxML_log.test',
                       'RAxML_parsimonyTree.test',
                       'RAxML_result.test',
                       # Present in 7.2.X+  but not 7.0.4:
                       'RAxML_bestTree.test',
                      ]:
             if os.path.isfile(fname):
                 os.remove(fname)
Exemple #31
0
	while len(labels) > 1:
		x,y = lowest_cell(table)
		join_table(table,x,y)
		join_labels(labels,x,y)
	return labels[0]
	
def alpha_labels(start,end):
	"""Return the single-character labels from ``start`` to ``end`` inclusive.

	Characters are enumerated by code point, so an ``end`` that sorts before
	``start`` yields an empty list.
	"""
	first, last = ord(start), ord(end)
	return [chr(code) for code in range(first, last + 1)]
	
M_labels = alpha_labels("A","E")
# Lower-triangular table: row i holds i values, one per earlier label
# (presumably pairwise distances between the labelled taxa).
M = [
	[],
	[0.189],
	[0.110,0.179],
	[0.113,0.192,0.094],
	[0.215,0.211,0.205,0.214]]

# Cluster, then swap each one-letter label for its species name.  The
# substitutions are applied one after another in this exact order.
u = (UPGMA(M,M_labels))
for letter, species in (
	("A","Gorila"),
	("B","Oragontango"),
	("C","Humano"),
	("D","Chimpanze"),
	("E","Gibao"),
):
	u = u.replace(letter,species)

# Parse the resulting text as Newick and display the tree.
handle = StringIO(u)
tree = Phylo.read(handle,"newick")

Phylo.draw(tree)
    dist = distance.euclidean(vec1, vec2)
    return dist


if __name__ == '__main__':
    # Command-line entry point: read two Newick trees, root both on the same
    # outgroup, prune the world tree against the sampled tree, and save the
    # resulting distance data with np.savetxt.
    # NOTE(review): the description and the --output help mention FASTA, but
    # this block reads trees and writes numeric output - confirm the wording.
    parser = argparse.ArgumentParser(
        description="Down sample sequences from FASTA",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    for tree_flag in ("--world_tree", "--sampled_tree"):
        parser.add_argument(tree_flag,
                            type=str,
                            required=True,
                            help="path to a tree file")
    parser.add_argument("--output", required=True, help="FASTA output file")
    args = parser.parse_args()

    tree1 = Phylo.read(args.world_tree, 'newick')
    tree2 = Phylo.read(args.sampled_tree, 'newick')

    # Root both trees on the same outgroup tip, world tree first.
    for loaded_tree in (tree1, tree2):
        loaded_tree.root_with_outgroup({'name': 'Wuhan-Hu-1/2019'})

    tree1 = prune_world_tree(tree1, tree2)

    mapper = map_tree_to_vector_idx(tree2)
    dist = get_trees_distance(tree1, tree2, mapper)
    np.savetxt(args.output, dist)
Exemple #33
0
def main(argv):
    print "AnnotateTreeCmd v1.0"
    if len(argv) == 2 and argv[1] == '-t':
        conduct_tests()
        exit(0)
    elif len(argv) != 7:
        print 'usage python AnnotateTreeCmd.py seqnumfile seqfile treefile cdrfile tag wd.'
        sys.exit(0)

    for file in argv[1:4]:
        check_file(file)

    (seqnumfile, seqfile, treefile, cdrfile, tag, wdir) = argv[1:7]

    if len(cdrfile) > 0:
        check_file(cdrfile)
    else:
        cdrfile = None

    try:
        if not os.path.exists(wdir):
            os.makedirs(wdir)
    except:
        print "Error creating directory %s." % wdir
        sys.exit(0)

    try:
        msa = Alignment()
        msa.read_nt(
            seqfile)  # Check that the sequence comprises a valid set of codons
        for seq in msa:
            if '*' in seq:
                print "Stop codon found in sequence %s." % seq.id
                sys.exit(0)
    except:
        print "Error parsing %s: %s." % (seqfile, sys.exc_info()[1])
        sys.exit(0)

    try:
        seq_pos = msa.read_position_numbers(seqnumfile)
    except:
        print "Error parsing %s: %s." % (seqnumfile, sys.exc_info()[1])
        sys.exit(0)

    if cdrfile is not None:
        try:
            acdr = AnalyseCDR(msa, file_name=cdrfile)
        except:
            print "Error parsing %s: %s." % (cdrfile, sys.exc_info()[1])
            sys.exit(0)

    try:
        seq_align = AlignIO.read(seqfile, "fasta")
    except:
        try:
            seq_align = AlignIO.read(seqfile, "phylip")
        except:
            print "Error parsing %s: %s." % (seqfile, sys.exc_info()[1])
            sys.exit(0)

    try:
        tree = Phylo.read(treefile, "newick")
    except:
        print "Error parsing %s: %s." % (treefile, sys.exc_info()[1])
        sys.exit(0)

    dnaml = Dnaml()

    int_aas = dnaml.run_dnaml(seq_align, tree, seq_pos, cdrfile, wdir, report,
                              tag)

    if int_aas is not None:
        try:
            if cdrfile is not None:
                acdr = AnalyseCDR(int_aas, file_name=cdrfile)
                cdr_output = acdr.analyse()
                fo = open(wdir + "/" + tag + "cdr_analysis.html", "w")
                fo.write(cdr_output)
                fo.close()
        except:
            print "Warning: CDRs were not analysed: " + str(sys.exc_info()[1])

        try:
            gc.collect()
            RenderTree.render_annotate(
                wdir + "/" + tag + "annotated_treefile.new",
                wdir + "/" + tag + "annotated_treefile.png")
            gc.collect()
            RenderTree.render_annotate(
                wdir + "/" + tag + "annotated_treefile.new",
                wdir + "/" + tag + "annotated_treefile.svg")
            gc.collect()
            if cdrfile is not None:
                RenderTree.render_annotate(
                    wdir + "/" + tag + "annotated_treefile_sum.new",
                    wdir + "/" + tag + "annotated_treefile_sum.png")
                gc.collect()
                RenderTree.render_annotate(
                    wdir + "/" + tag + "annotated_treefile_sum.new",
                    wdir + "/" + tag + "annotated_treefile_sum.svg")
                gc.collect()
            RenderTree.render_annotate(
                wdir + "/" + tag + "annotated_treefile_tot.new",
                wdir + "/" + tag + "annotated_treefile_tot.png")
            gc.collect()
            RenderTree.render_annotate(
                wdir + "/" + tag + "annotated_treefile_tot.new",
                wdir + "/" + tag + "annotated_treefile_tot.svg")
            gc.collect()
            RenderTree.render_annotate(
                wdir + "/" + tag + "intermediates_treefile.new",
                wdir + "/" + tag + "intermediates_treefile.png")
            gc.collect()
            RenderTree.render_annotate(
                wdir + "/" + tag + "intermediates_treefile.new",
                wdir + "/" + tag + "intermediates_treefile.svg")
            gc.collect()
        except:
            print "Error rendering trees: " + str(sys.exc_info()[1])

        first = True
        orig_recs = []
        for rec in SeqIO.parse(wdir + "/" + tag + "aa_alignment.fa", "fasta"):
            if not first and "node_" not in rec.id:
                orig_recs.append(rec)
            first = False

        logo_alignment_file = wdir + "/" + tag + "alignment_for_logo.fa"
        SeqIO.write(orig_recs, wdir + "/" + tag + "alignment_for_logo.fa",
                    "fasta")

        with open(wdir + "/" + tag + "weblogo_status.txt", "w") as fo:
            retcode = subprocess.call(
                "seqlogo -f %salignment_for_logo.fa -F PNG -o aa_logo -h 2 -w 20 -acS"
                % tag,
                cwd=wdir,
                shell=True,
                stdout=fo,
                stderr=subprocess.STDOUT)
            if retcode == 1:
                fo.write("Trying seqlogo.pl instead.\n")
                retcode = subprocess.call(
                    "seqlogo.pl -f %salignment_for_logo.fa -F PNG -o aa_logo -h 2 -w 20 -acS"
                    % tag,
                    cwd=wdir,
                    shell=True,
                    stdout=fo,
                    stderr=subprocess.STDOUT)
            if retcode == 1:
                print "Weblogo not installed: logo plot will not be generated."
    # Variables
    version = 'checkTreeFormat v1.0'  # Script version
    arguments = ""  # Arguments from ArgParse
    tree = ""  # Tree variable

    # Grab arguments
    arguments = check_arg(sys.argv[1:])

    # Checking if file exists
    if (not os.path.exists(arguments.tree_file)):
        print("Tree file not found.")
        sys.exit(1)

    # Read file, check it is in the correct format.
    try:
        tree = Phylo.read(arguments.tree_file, arguments.tree_format)
    except:
        if (arguments.tree_format == "newick"):
            print("Tree file not in newick format.")
        elif (arguments.tree_format == "nexus"):
            print("Tree file not in nexus format.")
        raise
        sys.exit(1)

    # If format =! newick convert to canonical format.
    if (arguments.tree_format != "newick"):
        print("Tree file not in canonical format. Converting to newick...")
    else:
        print("Tree is already in newick format, printing...")

    # Writing tree in newick format
Exemple #35
0
""" BioE231
    Vivian Fu, Jessica Wu, Zihui Xu

    Use Biopython's Phylo to visualize tree.nwk. 
    """

from Bio import Phylo
from io import StringIO
import sys

# Parse one Newick tree from standard input and hand it to Phylo.draw
# for display.
tree = Phylo.read(sys.stdin, 'newick')
Phylo.draw(tree)
Exemple #36
0
                    d = 1. - np.mean(temp)
                    Wg = np.exp(-((d**2) / (d0**2)))
                    s = g.Shannon(l)
                    if s != None:
                        entropy.append(Wg * s)
                score.append(sum(entropy))
        ranking[l] = 1. + float(sum(score))
    return ranking


###====================================================================================================
### MAIN
###====================================================================================================
if __name__ == "__main__":
    # Load the alignment and the tree named on the command line.
    msa = MSA(args.msa_file)
    tree = Phylo.read(args.tree_file, 'newick')
    tree.ladderize()  # Flip branches so deeper clades are displayed at top
    clades = list(tree.find_clades(order='level'))

    # Build one Clade record per tree node; terminal nodes are also
    # collected separately in `leaves`.
    subfamily = {}
    leaves = []
    for i, clade in enumerate(clades):
        leaf = True if clade.is_terminal() else False
        if not leaf:
            # Internal nodes are named after their level-order index.
            clade.name = 'N%d' % i
        member_rows = [
            msa.sequence_indices[x.name] for x in clade.get_terminals()
        ]
        subfamily[clade] = Clade(msa, member_rows, clade.branch_length,
                                 clade.name)
        if leaf:
            leaves.append(subfamily[clade])
Exemple #37
0
def run(args):
    """Infer ancestral states for each requested trait and export them.

    For every column in ``args.columns`` the tree in ``args.tree`` is passed
    to ``mugration_inference`` together with the metadata read from
    ``args.metadata``.  The per-node states (and, when ``args.confidence``
    is set, their confidence and entropy) are collected, any inferred GTR
    model is recorded and also dumped to a text file, and everything is
    written out as JSON.

    Parameters
    ----------
    args : namespace
        command line arguments are parsed by argparse
    """
    tree_fname = args.tree
    traits, columns = read_metadata(args.metadata)

    from Bio import Phylo
    T = Phylo.read(tree_fname, 'newick')
    unnamed_internal = [n.name is None for n in T.get_nonterminals()]
    if np.all(unnamed_internal):
        # Warn line by line; the text is emitted exactly as before.
        warning_lines = (
            "\n*** WARNING: Tree has no internal node names!",
            "*** Without internal node names, ancestral traits can't be linked up to the correct node later.",
            "*** If you want to use 'augur export' later, re-run this command with the output of 'augur refine'.",
            "*** If you haven't run 'augur refine', you can add node names to your tree by running:",
            "*** augur refine --tree %s --output-tree <filename>.nwk" %
            (tree_fname),
            "*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'",
        )
        for warning_line in warning_lines:
            print(warning_line)

    mugration_states = defaultdict(dict)
    models = defaultdict(dict)
    # Output prefix: the tree path with its last dot-separated part removed.
    out_prefix = '.'.join(args.tree.split('.')[:-1])
    for column in args.columns:
        T, gtr, alphabet = mugration_inference(
            tree=tree_fname,
            seq_meta=traits,
            field=column,
            confidence=args.confidence,
            sampling_bias_correction=args.sampling_bias_correction)
        if T is None:  # something went wrong
            continue

        for node in T.find_clades():
            # The state itself, followed by the optional confidence/entropy
            # attributes attached under suffixed names.
            suffixes = ('', '_confidence',
                        '_entropy') if args.confidence else ('', )
            for suffix in suffixes:
                key = column + suffix
                mugration_states[node.name][key] = node.__getattribute__(key)

        if gtr:
            # add gtr models to json structure for export
            column_model = models[column]
            column_model['rate'] = gtr.mu
            column_model['alphabet'] = [
                alphabet[code] for code in sorted(alphabet)
            ]
            column_model['equilibrium_probabilities'] = list(gtr.Pi)
            column_model['transition_matrix'] = [list(row) for row in gtr.W]

            # also keep a human-readable copy of the model next to the tree
            model_fname = out_prefix + '%s.mugration_model.txt' % column
            with open(model_fname, 'w') as ofile:
                ofile.write('Map from character to field name\n')
                for code, value in alphabet.items():
                    ofile.write(code + ':\t' + str(value) + '\n')
                ofile.write('\n\n')

                ofile.write(str(gtr))

    out_name = get_json_name(args, out_prefix + '_traits.json')
    write_json({"models": models, "nodes": mugration_states}, out_name)

    print(
        "\nInferred ancestral states of discrete character using TreeTime:"
        "\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
        "\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n",
        file=sys.stdout)

    print("results written to", out_name, file=sys.stdout)
Exemple #38
0
def mugration_inference(tree=None,
                        seq_meta=None,
                        field='country',
                        confidence=True,
                        infer_gtr=True,
                        root_state=None,
                        missing='?',
                        sampling_bias_correction=None):
    """
    Infer likely ancestral states of a discrete character assuming a time reversible model.

    Parameters
    ----------
    tree : str
        name of tree file
    seq_meta : dict
        meta data associated with sequences, keyed by sequence name; each
        value is itself a mapping from field name to value
    field : str, optional
        meta data field to use
    confidence : bool, optional
        calculate confidence values for inferences
    infer_gtr : bool, optional
        infer a GTR model for trait transitions (otherwise uses a flat model with rate 1)
    root_state : None, optional
        force the state of the root node (currently not implemented; when
        given, the value is only added to the set of possible states)
    missing : str, optional
        character that is to be interpreted as missing data, default='?'
    sampling_bias_correction : number, optional
        if given and non-zero, the rate ``mu`` of the inferred GTR model is
        multiplied by this factor and the ancestral states are inferred
        again without re-inferring the GTR model

    Returns
    -------
    T : Phylo.Tree
        Biopython tree
    gtr : treetime.GTR
        GTR model (None when only a single state is present)
    alphabet : dict
        mapping of single-character state codes to the values of `field`
        (plus, when more than one state exists, the code for missing data)

    All three return values are None when no state is found among the tips
    or when there are more than 180 distinct states.
    """
    from treetime import GTR
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio import Phylo

    T = Phylo.read(tree, 'newick')
    nodes = {n.name: n for n in T.get_terminals()}

    # Determine alphabet only counting tips in the tree
    places = set()
    for name, meta in seq_meta.items():
        if field in meta and name in nodes:
            places.add(meta[field])
    if root_state is not None:
        places.add(root_state)

    # construct GTR (flat for now). States are encoded as consecutive
    # characters starting at 'A'; the missing-data symbol is the character
    # right after the last state (chr(65 + nc)).
    places = sorted(places)
    nc = len(places)
    if nc > 180:
        print("ERROR: geo_inference: can't have more than 180 places!",
              file=sys.stderr)
        return None, None, None
    elif nc == 0:
        print("ERROR: geo_inference: list of places is empty!",
              file=sys.stderr)
        return None, None, None
    elif nc == 1:
        # Degenerate case: no inference is run; every node gets the single
        # state with probability 1.
        print(
            "WARNING: geo_inference: only one place found -- set every internal node to %s!"
            % places[0],
            file=sys.stderr)
        alphabet = {'A': places[0]}
        alphabet_values = ['A']
        gtr = None
        for node in T.find_clades():
            node.sequence = ['A']
            node.marginal_profile = np.array([[1.0]])
    else:
        # set up model
        alphabet = {chr(65 + i): place for i, place in enumerate(places)}
        model = GTR.custom(pi=np.ones(nc, dtype=float) / nc,
                           W=np.ones((nc, nc)),
                           alphabet=np.array(sorted(alphabet.keys())))

        # Register the missing-data character with a flat profile over all
        # states and mark it as the model's ambiguous symbol.
        missing_char = chr(65 + nc)
        alphabet[missing_char] = missing
        model.profile_map[missing_char] = np.ones(nc)
        model.ambiguous = missing_char
        alphabet_rev = {v: k for k, v in alphabet.items()}

        # construct pseudo alignment: one single-character sequence per tip
        # that appears in both the metadata and the tree
        pseudo_seqs = []
        for name, meta in seq_meta.items():
            if name in nodes:
                s = alphabet_rev[
                    meta[field]] if field in meta else missing_char
                pseudo_seqs.append(SeqRecord(Seq(s), name=name, id=name))
        aln = MultipleSeqAlignment(pseudo_seqs)

        # set up treetime and infer
        from treetime import TreeAnc
        tt = TreeAnc(tree=tree,
                     aln=aln,
                     gtr=model,
                     convert_upper=False,
                     verbose=0)
        tt.use_mutation_length = False
        tt.infer_ancestral_sequences(infer_gtr=infer_gtr,
                                     store_compressed=False,
                                     pc=1.0,
                                     marginal=True,
                                     normalized_rate=False)

        if sampling_bias_correction:
            # Rescale the inferred rate, then redo the inference with the
            # GTR model held fixed.
            tt.gtr.mu *= sampling_bias_correction
            tt.infer_ancestral_sequences(infer_gtr=False,
                                         store_compressed=False,
                                         marginal=True,
                                         normalized_rate=False)

        # From here on work with the tree and model owned by TreeAnc.
        T = tt.tree
        gtr = tt.gtr
        alphabet_values = tt.gtr.alphabet

    # attach inferred states as e.g. node.region = 'africa'
    for node in T.find_clades():
        node.__setattr__(field, alphabet[node.sequence[0]])

    # if desired, attach entropy and confidence as e.g. node.region_entropy = 0.03
    if confidence:
        for node in T.find_clades():
            pdis = node.marginal_profile[0]
            # TINY is added inside the log so zero probabilities do not
            # produce log(0).
            S = -np.sum(pdis * np.log(pdis + TINY))

            marginal = [(alphabet[alphabet_values[i]], pdis[i])
                        for i in range(len(alphabet_values))]
            marginal.sort(key=lambda x: x[1],
                          reverse=True)  # sort on likelihoods
            marginal = [(a, b) for a, b in marginal if b > 0.001
                        ][:4]  #only take stuff over .1% and the top 4 elements
            conf = {a: b for a, b in marginal}
            node.__setattr__(field + "_entropy", S)
            node.__setattr__(field + "_confidence", conf)

    return T, gtr, alphabet
Exemple #39
0
def get_morpheme_tree(clauses, scenario, tree_name, reconstructed=False):
    set_1 = {}
    for i in clauses:
        set_1[i] = {}
    for entry in function_paradigms():
        if entry["Construction"] in set_1:
            if entry["Function"] not in set_1[entry["Construction"]].keys():
                set_1[entry["Construction"]][entry["Function"]] = [
                    entry["Morpheme"]
                ]
            else:
                set_1[entry["Construction"]][entry["Function"]].append(
                    entry["Morpheme"])
    lang_clauses = {}
    for clause in set_1:
        cons = DBSession.query(Construction).filter(
            Construction.id == clause)[0]
        lang_clauses[cons.language.id] = set_1[clause]
    lang_clauses["kax"] = {
        "3>1+2": [["k-"]],
        "1>2": [["k-"]],
        "2>1": [["k-"]],
        "1>3": [["w-"]],
        "1+2>3": [["k(ɨt)-"]],
        "3>1": [["j-"], ["Ø-"]],
        "3>2": [["o(w)-"]]
    }
    lang_clauses["bak"] = {
        "3>1+2": [["k-"]],
        "1>2": [["ə-"]],
        "2>1": [["j-"]],
        "1>3": [["s-"]],
        "1+2>3": [["kɨd-"]],
        "3>1": [["ɨ-"], ["j-"]],
        "3>2": [["ə-"]]
    }
    lang_clauses["yuk"] = {
        "3>1+2": [["ɨp", "n-"]],
        "1>2": [["aw", "oj-"]],
        "2>1": [["am", "j-"]],
        "1>3": [["aw", "Ø-"]],
        "1+2>3": [["ɨp", "Ø-"]],
        "3>1": [["aw", "j-"]],
        "3>2": [["am", "oj-"]]
    }
    lang_clauses["aku"] = {
        "3>1+2": [["k-"]],
        "1>2": [["k-"]],
        "2>1": [["k-"]],
        "1>3": [["i-"], ["Ø-"]],
        "1+2>3": [["kɨt-"]],
        "3>1": [["jː-"], ["Øː-"]],
        "3>2": [["ə-"]]
    }
    lang_clauses["cum"] = {
        "1>2": [["kaj-"], ["kən-"], ["k-"]],
        "2>1": [["kaj-"], ["k-"]],
        "1>3": [["w-"], ["i-"]],
    }
    lang_clauses["tam"] = {
        "3>1+2": ["?"],
        "1>2": ["?"],
        "2>1": ["?"],
        "1>3": [["t-"]],
        "1+2>3": [["kɨt͡ʃ-"]]
    }
    lang_clauses["car"] = {
        "3>1+2": [["k-"]],
        "1>2": [["k-"]],
        "2>1": [["k-"]],
        "1>3": [["i-"]],
        "1+2>3": [["kɨt-"]],
        "3>1": [["j-"], ["ji-"], ["voice"]],
        "3>2": [["əj-"]]
    }
    lang_clauses["pem"] = {
        "1>3": "s-",
        "1>2": ["?"],
        "2>1": ["?"],
    }
    my_tree = Phylo.read(io.StringIO(Phylogeny.get("matter").newick), "newick")
    for node in my_tree.find_clades():
        if node.name == None:
            continue
        if node.is_terminal():
            node.name = node.name.replace("?", "")
            new_name = "lg:" + node.name
        else:
            new_name = node.name
        if node.name in lang_clauses.keys():
            if scenario in lang_clauses[node.name].keys():
                all_morphs = []
                for morpheme_combo in lang_clauses[node.name][scenario]:
                    this_morph = []
                    for morpheme in morpheme_combo:
                        if DBSession.query(Morpheme).filter(
                                Morpheme.id == morpheme).count() >= 1:
                            if not reconstructed or DBSession.query(
                                    Morpheme).filter(
                                        Morpheme.id == morpheme
                                    )[0].counterparts[0].cognateset.id == "NA":
                                this_morph.append(
                                    "morph:" + morpheme
                                )  #data["Morpheme"][morpheme].name + " "
                            else:
                                # print()
                                for counterpart in DBSession.query(
                                        Morpheme).filter(
                                            Morpheme.id ==
                                            morpheme)[0].counterparts:
                                    this_morph.append(
                                        "cogset:" + counterpart.cognateset.id)
                        else:
                            this_morph.append("obj:" + morpheme)
                    all_morphs.append("£".join(this_morph))
            else:
                all_morphs = ["-"]
        else:
            all_morphs = ["-"]
        node.name = new_name + " " + " OR ".join(all_morphs)
        node.name = generate_markup(node.name)
    return get_clade_as_json(my_tree.clade)
Exemple #40
0
def add_default_branch_lengths(s, branch_length):
    """Return ``s`` with ``:<branch_length>`` inserted before each ``)``, ``,`` and ``;``.

    The three delimiters are processed one after another, in that order.
    """
    suffix = ':' + str(branch_length)
    for delimiter in (')', ',', ';'):
        s = s.replace(delimiter, suffix + delimiter)
    return s


with open(infile, "r") as f:

    # The input is read in groups of three lines: the first of each group
    # is a Newick tree, the second names two nodes, the third is ignored.
    data = [l.strip() for l in f]
    trees = list(zip(data[::3], data[1::3]))

    for tree, nodes in trees:
        # Optional preprocessing, currently disabled:
        #tree = add_default_branch_lengths(tree, 1)

        # Parse the Newick text of this record
        ntree = Phylo.read(StringIO(tree), "newick")

        # The two whitespace-separated node names to measure between
        node1, node2 = nodes.split()

        # Print the distance as an integer, space-separated on one line
        print(int(ntree.distance(node1, node2)), end=" ")

        # To inspect the tree visually:
        # Phylo.draw(tree) # matplotlib
        # nx.draw(G)
Exemple #41
0
		print(node)
		row = ref.loc[ref['saccver'].str.contains(node)] # get the row, for which the node is a substring in the saccver column 
		org_name = row['organism_name'].to_string() # get the organism name 
		org_name = org_name.lstrip('0123456789.- ')
		print(org_name)
		gcf = row['subject_gcf'].to_string()
		gcf = gcf.lstrip('0123456789.- ')
		label = org_name + ' ' + gcf # make the label by concatenating the organism name and gcf
		label = label.replace(' ', '_')
		line = pd.Series({'node': node, 'label': label}) 
		results = results.append(line, ignore_index=True)
		i += 1
		print(i)

	return results
# Load the tree named by the first command-line argument.
tree = Phylo.read(sys.argv[1], 'newick')
# Collect the names from the tree and keep them as a plain list.
names = lookup_by_names(tree)
nm = list(names)

# The query is whatever precedes the first '-' in the tree file argument.
query = sys.argv[1].partition('-')[0]

ref = get_reference_table(query)
print(ref.head())

# Build the node -> label table and show a preview plus its size.
results = node_label_table(nm)
print(results.head(10))
print(len(results))
Exemple #42
0
def midpoint(input_fn, output_fn):
    """Read the Newick tree in ``input_fn``, root it at its midpoint, and write it to ``output_fn``."""
    rerooted = Phylo.read(input_fn, 'newick')
    rerooted.root_at_midpoint()
    Phylo.write(rerooted, output_fn, 'newick')
def align_hits(fasta_file, record_df):
    """Align a gene's FASTA hits with ClustalW and save the top-ranked sequence.

    The gene name is the stem of the directory that contains ``fasta_file``.
    If that directory (searched recursively) already holds an ``.aln`` file
    nothing is done.  Otherwise ClustalW is run, the alignment is cut down
    to the span between the first and last ``*`` of the Clustal consensus
    line, gaps are counted per sequence, and the rows of ``record_df`` for
    this gene are ranked by fewest gaps, then by longest sequence.  The
    best row is written to ``<entrez_id>_sequence.fa`` and the first
    ``.dnd`` tree found is drawn to ``tree.png``.

    REQUIREMENTS - download ClustalW from
    http://www.clustal.org/download/current/ and put the folder in
    Applications

    Input : fasta_file (path object), record_df (data frame with HGNC,
    entrez_id, sequence_length and Sequence columns)

    Output : None
    """

    from Bio.Align.Applications import ClustalwCommandline
    from Bio import AlignIO
    from Bio import Phylo

    workdir = fasta_file.parent
    gene = workdir.stem
    print('analysis {}'.format(gene))

    # Nothing to do when an alignment is already present on disk
    if list(workdir.rglob('*.aln')):
        print('{} alignment already done, nothing to do here'.format(gene))
        return

    # Rows of the data frame that describe this gene
    records = record_df[record_df.HGNC == gene].copy()

    # Run ClustalW on the FASTA file
    clustalw_exe = r"/Applications/clustalw-2.1-macosx/clustalw2"
    run_clustalw = ClustalwCommandline(clustalw_exe,
                                       infile=fasta_file,
                                       stats=workdir / 'stats.txt')
    stdout, stderr = run_clustalw()

    # Locate the files ClustalW produced
    align_file = list(workdir.rglob('*.aln'))[0]
    tree_file = list(workdir.rglob('*.dnd'))[0]

    alignment = AlignIO.read(align_file, "clustal")

    # Spans of every '*' in the consensus line; trim to their outer bounds
    starred = [
        hit.span(0) for hit in re.finditer(
            r"\*", alignment.column_annotations['clustal_consensus'])
    ]
    if starred:
        trimmed = alignment[:, starred[0][0]:starred[-1][1]]
    else:
        trimmed = alignment[:, ::]

    gap_count = {
        sequence.id: sequence.seq.count('-')
        for sequence in trimmed
    }

    # Rank: fewest gaps first, longest sequence as the tie-breaker
    records.loc[:, 'alignment_gaps'] = records.entrez_id.map(gap_count)
    records.sort_values(by=['alignment_gaps', 'sequence_length'],
                        ascending=[True, False],
                        inplace=True)
    records.reset_index(drop=True, inplace=True)

    # Save the top-ranked sequence next to the input file
    best = SeqRecord(Seq(records.Sequence.loc[0],
                         IUPAC.IUPACAmbiguousDNA()),
                     id=records.entrez_id.loc[0],
                     name=gene)
    SeqIO.write(best,
                workdir / '{}_sequence.fa'.format(best.id),
                'fasta')

    # Draw the tree and store the figure beside the tree file
    tree = Phylo.read(tree_file, "newick")
    tree.ladderize()
    Phylo.draw(tree)
    plt.savefig(tree_file.parent / 'tree.png')
    plt.close('all')
Exemple #44
0
## initialize hash tables
# These three dictionaries start empty; nothing in this span fills them
# (presumably they are caches populated by functions elsewhere - confirm).
LOG_TAVARE_CONDITIONAL_LIKELIHOOD_DICT = {}
TIME_BETA_DICT = {}
TIME_ALPHA_DICT = {}

# Accumulators for the per-line processing that follows.
idxsSamplesCorrectlyPolarized = []
individualMargEsts = []  # NOTE: superseded by the np.zeros array assigned below
numSamplesWronglyPolarized = 0
ind_i_hats = []
ind_i_sel_hats = []
branch_lengths = []
# One zero-initialised cell per (importance sample, S_GRID entry, I_SEL
# entry, ds entry, FREQS entry) combination.
individualMargEsts = np.zeros(
    (numImportanceSamples, len(S_GRID), len(I_SEL), len(ds), len(FREQS)))
for (k, line) in enumerate(lines):
    nwk = line.rstrip().split()[-1]
    derTree = Phylo.read(StringIO(nwk), 'newick')
    ancTree = Phylo.read(StringIO(nwk), 'newick')
    mixTree = Phylo.read(StringIO(nwk), 'newick')
    Phylo.read(StringIO(nwk), 'newick')

    n = len(derInds)
    m = len(ancInds)
    if k == 0:
        if args.popFreq != None:
            if args.popFreq != 1:
                discretizedPopFreqIdx = np.digitize(args.popFreq, FREQS)
                hPlus = FREQS[discretizedPopFreqIdx] - args.popFreq
                hMinus = args.popFreq - FREQS[discretizedPopFreqIdx - 1]
                sign = -1 * np.random.binomial(1, hPlus / (hPlus + hMinus))
                discretizedPopFreqIdx += sign
            else:
Exemple #45
0
    # And counter-search to check if it's really a good match, the top hit should be the query sequence
    call('tblastx -db ../db/a_n_genes.fasta -query ' + output + 'b_g_GOI.fa -out ' + output + 'counterBlast.blast -num_threads 4 -max_target_seqs 1 -outfmt "7 sseqid evalue"', shell=True)

################################
# Identify motifs in sequences #
################################

# Do we need to do this?

#################################
# Make a phylogeny of sequences #
#################################
if foundBlu:
    # Append the b_g_GOI sequences to the unaligned FASTA file.  Context
    # managers close both handles even if the read or the write fails.
    with open(output + 'unaligned.fa', 'a') as unaln, \
            open(output + 'b_g_GOI.fa', 'r') as bluFile:
        addition = bluFile.read()
        unaln.write(addition)
    print ('Creating phylogenetic tree...')
    # NOTE(review): the commands below are assembled by string concatenation
    # and run through the shell, so `output` must be a trusted path.
    # Align with Clustal Omega, convert to relaxed PHYLIP, then run PhyML.
    call('clustalo -i ' + output + 'unaligned.fa -o ' + output + 'alignedAll.aln --force --outfmt=clu', shell=True)
    AlignIO.convert(output + 'alignedAll.aln', 'clustal', output + 'phyAlign.phy', 'phylip-relaxed')
    cmdline = PhymlCommandline(input=output + 'phyAlign.phy', alpha='e', bootstrap=1, sequential=False)
    call(str(cmdline), shell=True)
    # Read the tree PhyML wrote next to its input and display it.
    my_tree = Phylo.read(output + "phyAlign.phy_phyml_tree.txt", "newick")
    Phylo.draw(my_tree, show_confidence=False)

# Got to print 'Done' at the end
print ('Done')
    list = args.list
    action = args.action[0]
    index = args.index
    output = args.output


    # input = path + "metadata.tsv"
    # format = 'tsv'
    # list = path + 'seqList.txt'
    # action = 'keep'
    # output = path + 'output_ren.tsv'


    targets = [target.strip() for target in open(list, "r").readlines() if target[0] not in ['\n', '#']]
    if format == 'tree':
        tree = Phylo.read(input, 'newick')
        print('Starting tree file processing...')
        # rename clade names
        if action == 'rename':
            for clade in tree.find_clades():
                for line in targets:
                    oldName = line.split("\t")[0]
                    newName = line.split("\t")[1].strip()
                    if str(clade.name) == oldName:
                        print('Renaming ' + oldName + ' as ' + newName)
                        clade.name = newName

            Phylo.write([tree], output, 'newick')
            print('\nTree file successfully renamed: \'' + output)

Exemple #47
0
        return leftSibling
    #Case 3: Only the right sibling exists so return it
    elif rightSibling != None and rightSibling.genomeFragments != None and len(rightSibling.genomeFragments) > 0:
        return rightSibling
    #Case 4: None of the siblings exist so return NULL
    else:
        return None

######################################################
#                       main
######################################################
print('Starting application...')
startTime = time.time()  # Recorded at start-up; not read again in this span

# Load the input tree and display it before any processing.
print('Reading newick tree from file: %s...' % (newickFileName))
newickTree = Phylo.read(newickFileName, 'newick')
Phylo.draw(newickTree)

globals.initialize() # Initialize the globals module
globals.strains = strains # Expose the strains collection through the globals module so it can be reached from anywhere
createFile(outputFileName, newickTree) # Create the file that results will be written to

# Traverse the Newick tree recursively, starting at the root clade, reconstructing ancestral genomes
print('Traversing newick tree...')
result = traverseNewickTree(newickTree.clade, None)

# Display the Newick tree again after the ancestors have been added to it
Phylo.draw(newickTree)

# Need to traverse the tree to output the appropriate content to file
newickTree.clade.name = '' # Blank the root's name so that nothing is output for the root
Exemple #48
0
def main():
    """Run the budgitree command-line interface.

    The sub-command is taken from ``sys.argv``:

    * ``smuggle`` -- read a Newick tree and, depending on the options,
      collapse low-support nodes, resolve polytomies, rescale branch
      supports and reformat branch lengths.  The resulting tree is printed
      to stdout; progress messages go to stderr.
    * ``version`` -- print the package version.
    * ``test`` -- run the bundled test suite.

    With no sub-command the help text is printed.

    Raises:
        SystemExit: if the tree file given to ``smuggle`` does not exist
            (or when argparse rejects the command line).
    """
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""
            Given a newick tree, use this program to resolve polytomies (convert to
            bifurcating), and/or change the precision of branch lengths,
            and/or collapse.""")
    # Options shared by the tree-processing sub-command; kept on a separate
    # parent parser (add_help=False) so they can be attached via `parents`.
    subparser_args1 = argparse.ArgumentParser(add_help=False)
    subparser_args1.add_argument("tree", help="Input newick tree")
    subparser_args1.add_argument("-p",
                                 "--precision",
                                 help="""Branch length precision
                                                       (i.e., number of decimal places to
                                                       print).""",
                                 default=None,
                                 type=int)
    subparser_args1.add_argument("-m",
                                 "--support_multiplier",
                                 help="""Multiply branch supports
                        by this value.  Use, for example, to convert scale of 0 to 1 to percentages.  """,
                                 default=None,
                                 choices=[0.1, 100],
                                 type=float)
    subparser_args1.add_argument(
        "-b",
        "--dont_bifurcate_polytomies",
        help="Switch off conversion of node polytomies to bifurcating",
        default=False,
        action="store_true")
    subparser_args1.add_argument(
        "-c",
        "--collapse",
        help="Collapse nodes with support values less than this.",
        default=None,
        type=float)
    subparser_modules = parser.add_subparsers(title="Sub-commands help",
                                              help="",
                                              metavar="",
                                              dest="subparser_name")
    subparser_modules.add_parser(
        "smuggle",
        help="Smuggle the budgie.",
        description="Process the tree.",
        parents=[subparser_args1],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    subparser_modules.add_parser("version",
                                 help="Print version.",
                                 description="Print version.")
    subparser_modules.add_parser(
        "test",
        help="Run test suite.",
        description="Run test suite.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    args = parser.parse_args()

    if not args.subparser_name:
        parser.print_help()
    elif args.subparser_name == "version":
        from budgitree import __version__ as version
        print(version)
    elif args.subparser_name == "test":
        import unittest
        from .tests.test_suite import suite
        runner = unittest.TextTestRunner(verbosity=2)
        runner.run(suite())

    elif args.subparser_name == "smuggle":
        import sys
        from pathlib import Path
        # Check if file exists
        if not Path(args.tree).exists():
            sys.exit(
                f"File '{Path(args.tree).absolute()}' not found.  Exiting.")

        # Heavy third-party imports are deferred until they are needed.
        from Bio import Phylo
        from Bio.Phylo.NewickIO import Writer
        from io import StringIO
        # Read the tree
        tree = Phylo.read(args.tree, "newick")
        # Collapse the low-support nodes if requested
        if args.collapse is not None:
            print(f"Collapsing nodes with support < {args.collapse}.",
                  file=sys.stderr)
            tree.collapse_all(lambda c: c.confidence is not None and c.
                              confidence < args.collapse)
        if not args.dont_bifurcate_polytomies:
            print("Removing polytomies.", file=sys.stderr)
            from ete3 import Tree
            # Round-trip through ete3 (Newick text both ways), which
            # performs the standardisation step.
            t = Tree(tree.format("newick"))
            t.standardize()
            tree = Phylo.read(StringIO(t.write(format=0)), "newick")
        if args.support_multiplier is not None:
            print(f"Multiplying branch supports by {args.support_multiplier}",
                  file=sys.stderr)
            for non_terminal in tree.get_nonterminals():
                if non_terminal.confidence is None:
                    continue
                scaled = float(non_terminal.confidence *
                               args.support_multiplier)
                # Round through string formatting: whole numbers for the
                # x100 scale, two decimals for the x0.1 scale.
                if args.support_multiplier == 100:
                    non_terminal.confidence = float("{0:.0f}".format(scaled))
                elif args.support_multiplier == 0.1:
                    non_terminal.confidence = float("{0:.2f}".format(scaled))

        # Polytomies created by collapsing nodes still need to be parseable.
        # Achieve this by scaling the recursion limit with the tree size.
        # Never *lower* the limit: for small trees `count_terminals() * 2`
        # is below the interpreter default (or even below the current call
        # depth), which makes setrecursionlimit() or the subsequent writer
        # fail with RecursionError.
        sys.setrecursionlimit(
            max(sys.getrecursionlimit(), tree.count_terminals() * 2))
        if args.support_multiplier == 100:
            format_confidence = '%1.0f'
        else:
            format_confidence = '%1.2f'
        if args.precision is not None:
            print(
                f"Reformatting branch lengths to {args.precision} decimal places.",
                file=sys.stderr)
            format_branch_length = f"%1.{args.precision}f"
        else:
            format_branch_length = "%g"
        trees = Writer([tree]).to_strings(
            format_branch_length=format_branch_length,
            format_confidence=format_confidence)
        # there is only one tree in trees, so:
        print(next(trees))
import numpy as np
import matplotlib.pyplot as plt
from augur.utils import read_node_data
import argparse
from Bio import Phylo

# Command-line interface: a tree file, a node-data file, an optional
# titer-model file and an output file.
parser = argparse.ArgumentParser(description="Analyze TMRCA.")
parser.add_argument("--tree", help="tree file")
parser.add_argument("--node-data", help="node_data file")
parser.add_argument("--titers", help="titer_model file")
parser.add_argument("--output", help="output file")
args = parser.parse_args()

T = Phylo.read(args.tree, 'newick')
# The titer file is only passed to read_node_data when it was supplied.
of = [args.node_data, args.titers] if args.titers else [args.node_data]
node_data = read_node_data(of)

T.root.up = None
# Post-order traversal: children are visited before their parent, so the
# per-child totals (ntips, tree_length) already exist when the parent sums them.
for n in T.find_clades(order='postorder'):
    # Copy the per-node annotations (looked up by node name) onto the clade.
    n.numdate = node_data["nodes"][n.name]["numdate"]
    if args.titers:
        n.cTiter = node_data["nodes"][n.name]["cTiter"]
        n.dTiter = node_data["nodes"][n.name]["dTiter"]
    if n.is_terminal():
        # A tip counts as one tip and contributes only its own branch.
        n.ntips = 1
        n.tree_length = n.branch_length
        if args.titers:
            n.antigenic_length = n.dTiter
    else:
        # Iterating over a clade yields its direct children.
        n.ntips = np.sum([c.ntips for c in n])
        # NOTE(review): assumes every internal node has a numeric
        # branch_length (a root without one would be None) -- confirm.
        n.tree_length = n.branch_length + np.sum([c.tree_length for c in n])
Exemple #50
0
def subset_tree(args):
    """Extract the sub-tree spanned by a list of taxa and write it as Newick.

    Args:
        args: mapping with the keys
            'tree'  -- path of the input Newick tree,
            'taxon' -- path of a text file with one taxon name per line,
            'out'   -- path the pruned tree is written to.

    The tree is pruned by repeatedly calling ``remove_unwanted_leaf_nodes``
    until a round removes nothing, then clade names containing ':' are
    reduced to the part after the first ':', and leaf names containing ';'
    are replaced by the provided taxon found among their ';'-separated
    components (or by '' when none matches).  Progress and a warning about
    taxa that were not found are printed to stdout.  Two temporary files
    next to the output file are used for intermediate trees and are removed
    afterwards, also when processing fails.
    """

    ################################################# input #################################################

    tree_file_in = args['tree']
    group_to_taxon_file = args['taxon']
    tree_file_out = args['out']

    # define tmp file name
    tree_file_tmp_1 = '%s.tmp_1.tree' % tree_file_out
    tree_file_tmp_2 = '%s.tmp_2.tree' % tree_file_out

    time_format = '[%Y-%m-%d %H:%M:%S] '

    ################################################ store input information ###############################################

    # read in tree
    tree_in = Phylo.read(tree_file_in, 'newick')

    # read in all identified taxons (context manager so the handle is closed)
    with open(group_to_taxon_file) as taxon_handle:
        identified_taxon_list = {each_group.strip() for each_group in taxon_handle}
    print(datetime.now().strftime(time_format) +
          'The number of provided taxon: %s' % len(identified_taxon_list))

    try:
        ########################################## remove unwanted nodes recursively ###########################################

        print(datetime.now().strftime(time_format) +
              'Recursively removing unwanted nodes')
        deleted_leaf_num = 1
        n = 0
        # Work on a copy so the parsed input tree is left untouched.
        tree_in_copy = copy.deepcopy(tree_in)
        while deleted_leaf_num > 0:
            tree_in_copy, deleted_leaf_num = remove_unwanted_leaf_nodes(
                tree_in_copy, identified_taxon_list)
            n += 1
            print(datetime.now().strftime(time_format) +
                  'Removed %s nodes in %sth round' % (deleted_leaf_num, n))

        # write out tree
        Phylo.write(tree_in_copy, tree_file_tmp_1, 'newick')

        ############################################# remove "100:" in clade name ##############################################

        # The tree is deliberately re-read from disk (as in the original
        # workflow) so the names are seen as the Newick parser produces them.
        tree_tmp_1 = Phylo.read(tree_file_tmp_1, 'newick')

        for clade in tree_tmp_1.find_clades():
            clade_name = str(clade.name)
            if ':' in clade_name:
                clade.name = clade_name.split(':')[1]

        Phylo.write(tree_tmp_1, tree_file_tmp_2, 'newick')

        ################################################ rename leaf nodes name ################################################

        tree_tmp_2 = Phylo.read(tree_file_tmp_2, 'newick')

        for leaf_node in tree_tmp_2.get_terminals():
            leaf_node_name_str = str(leaf_node.name)

            if ';' in leaf_node_name_str:
                # Drop one leading and one trailing space per component.
                # startswith/endswith are safe on empty components (e.g. a
                # name ending in ';'), where indexing [0] / [-1] would raise
                # IndexError.
                leaf_node_name_split_no_space = []
                for each_name in leaf_node_name_str.split(';'):
                    if each_name.startswith(' '):
                        each_name = each_name[1:]
                    if each_name.endswith(' '):
                        each_name = each_name[:-1]
                    leaf_node_name_split_no_space.append(each_name)

                # Pick the provided taxon that appears among the components;
                # the name becomes '' when none does.
                leaf_node_name_new = ''
                for identified_taxon in identified_taxon_list:
                    if identified_taxon in leaf_node_name_split_no_space:
                        leaf_node_name_new = identified_taxon

                leaf_node.name = leaf_node_name_new

        # write out tree
        Phylo.write(tree_tmp_2, tree_file_out, 'newick')

        # report
        print(datetime.now().strftime(time_format) +
              'Tree subset exported to: %s' % tree_file_out)

        # print warning message if some provided node(s) were not found
        extracted_leaf_nodes = tree_tmp_2.get_terminals()
        if len(extracted_leaf_nodes) < len(identified_taxon_list):

            extracted_leaf_node_names = {
                str(extracted_leaf_node.name)
                for extracted_leaf_node in extracted_leaf_nodes
            }
            un_extracted_nodes = [
                provided_node for provided_node in identified_taxon_list
                if provided_node not in extracted_leaf_node_names
            ]

            print(datetime.now().strftime(time_format) +
                  'Warning!!! Found %s of %s provided nodes, missed: %s' %
                  (len(extracted_leaf_nodes), len(identified_taxon_list),
                   ', '.join(un_extracted_nodes)))

    finally:
        ################################################### remove tmp files ###################################################

        # Clean up whichever temporary files were actually created, even if
        # an earlier step raised.
        for tmp_file in (tree_file_tmp_1, tree_file_tmp_2):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
Exemple #51
0
# Fitch and Margoliash Method to build tree structure in newick format
#
# Keep a copy of the point dictionary as it is before the run.
# NOTE(review): presumably fm.run() modifies point_dictionary, hence the
# copy -- confirm against FitchMargoliash.
point_dict_copy = point_dictionary.copy()
fm = FitchMargoliash(hamming_table,point_dictionary)
handle = fm.run()
print(handle)
# Print each input file next to the entry at the same position in the copy.
for i in range(len(files)):
    print(files[i],'  ',point_dict_copy[i])

# Phylogenetic tree construction: the string returned by fm.run() is parsed
# as Newick (not phyloXML) through an in-memory text buffer.

from io import StringIO

handle = StringIO(handle)
tree = Phylo.read(handle,'newick')
tree.name = TREE_NAME
tree.id = process_id
# Reorder the clades in place by size for display.
tree.ladderize()

def tabulate_names(tree):
    """Rename every clade of *tree* uniquely and index the clades by name.

    Clades are numbered in the order ``tree.find_clades()`` yields them.
    A named clade becomes ``<idx>_<sequence name>``, where the sequence is
    found by looking up the clade's current name in the module-level
    ``point_dict_copy`` and taking the entry at that position in
    ``sequences``.  An unnamed clade becomes ``<idx>_inner``.

    Returns:
        dict mapping each new clade name to its clade object.
    """
    clades_by_name = {}
    for position, node in enumerate(tree.find_clades()):
        if not node.name:
            label = "{}_inner".format(position)
        else:
            seq_position = point_dict_copy.index(node.name)
            label = '%d_%s' % (position, sequences[seq_position].name)
        node.name = label
        clades_by_name[label] = node
    return clades_by_name
Exemple #52
0
             label=name)
plt.legend(loc='best', fontsize=12)
plt.savefig('offline-scores.png', bbox_inches='tight')

# Second figure: data log likelihood per iteration, one curve per entry of
# `likelihoods`.
plt.figure()
plt.xlim([0, n_iters])
# plt.ylim(ymin=-400)
plt.xlabel("Iterations", fontsize=fontsize)
plt.ylabel("Data Log Likelihood", fontsize=fontsize)
for name, likelihood in likelihoods.items():
    plt.plot(likelihood, label=name)
plt.legend(loc='best', fontsize=12)
plt.savefig('offline-likelihoods.png', bbox_inches='tight')

# For each model: plot it in 2-D, relabel its leaves and render it as a tree.
# NOTE(review): the loop variable `type` shadows the builtin of the same name.
for type, model in models.items():
    # Work on a copy so the relabelling below does not touch `model`.
    final_tree = model.copy()

    plt.figure()
    plot_tree_2d(final_tree, X, pca)

    # Replace each leaf's point by the entry of `y` at that position
    # (presumably a class label -- confirm).
    for node in final_tree.dfs():
        if node.is_leaf():
            node.point = y[node.point]

    # Convert to Newick text and parse it with Bio.Phylo for drawing.
    newick = final_tree.to_newick()
    tree = Phylo.read(StringIO(newick), 'newick')

    # NOTE(review): Phylo.draw_graphviz is not available in recent Biopython
    # releases -- confirm the pinned Biopython version.
    Phylo.draw_graphviz(tree, prog='neato')
    plt.savefig('tree-%s.png' % type, bbox_inches='tight')
plt.show()
Exemple #53
0
    options = get_options()

    import matplotlib
    matplotlib.use('Agg')

    import matplotlib.pyplot as plt
    import seaborn as sns

    sns.set_style('white')

    import os
    import pandas as pd
    import numpy as np
    from Bio import Phylo

    t = Phylo.read(options.tree, 'newick')

    # Max distance to create better plots
    mdist = max([t.distance(t.root, x) for x in t.get_terminals()])

    # Load roary
    roary = pd.read_table(options.spreadsheet, sep=',', low_memory=False)
    # Set index (group name)
    roary.set_index('Gene', inplace=True)
    # Drop the other info columns
    roary.drop(list(roary.columns[:options.skipped_columns - 1]),
               axis=1,
               inplace=True)

    # Transform it in a presence/absence matrix (1/0)
    roary.replace('.{2,100}', 1, regex=True, inplace=True)
Exemple #54
0
#validate gene tree names against the species tree and flag up any discrepancies.
#usage: python check_valid_species_names.py species_tree genetreedir/

from Bio import Phylo
import os, sys, re

spnames = []
odd_names = {}

# Species names are the tip labels of the species tree (first argument).
sptree = Phylo.read(sys.argv[1], "newick")
for taxa in sptree.get_terminals():
    spnames.append(taxa.name)
# Set copy of the names so each membership test below is O(1).
known_species = set(spnames)

#now scan gene trees for any unexpected species names
gene_tree_dir = sys.argv[2]
to_check = [
    fname for fname in os.listdir(gene_tree_dir) if fname.endswith("ufboot")
]
for fname in to_check:
    # os.path.join works whether or not the directory argument ends in a
    # path separator (plain string concatenation required the trailing "/").
    trees = Phylo.parse(os.path.join(gene_tree_dir, fname), "newick")
    for t in trees:
        for tip in t.get_terminals():
            # The species name is the part of the tip label before the
            # first underscore.
            species = tip.name.split("_")[0]
            if species not in known_species:
                odd_names[species] = 1

# Report every unexpected species prefix once; print() with a single
# argument behaves the same under Python 2 and Python 3.
for element in odd_names:
    print(element)
                tmpParam = sampSets[key]
                sampSets[realKey] = {
                    'startTime': tmpParam['startTime'],
                    'endTime': tmpParam['endTime'],
                    'time': [],
                    'lineages': []
                }

            params = sampSets[realKey]

            startTime = params['startTime']
            endTime = params['endTime']
            time = params['time']
            num_lineages = params['lineages']

            T = Phylo.read(treefile, 'newick')
            node_data = read_node_data([branchfile, cladefile])
            #raw_strain_info = collect_strain_info(node_data, metadatafile)
            node_data, node_attrs, node_data_names, metadata_names = parse_node_data_and_metadata(
                T, [branchfile, cladefile], metadatafile)
            rate = node_data['clock']['rate']

            for node in T.find_clades(order='postorder'):
                data = node_data['nodes'][node.name]
                node.clade_membership = data['clade_membership']
                node.date = data['date']
                node.num_date = data['numdate']
                #raw_data = raw_strain_info[node.name]
                raw_data = node_attrs[node.name]
                node.region = raw_data['region'] if 'region' in raw_data else ''
                node.branch_length = data['branch_length'] / rate
Exemple #56
0
                        nargs='?',
                        default='newick',
                        help='phylogeny format (%s)' %
                        (','.join(bp._io.supported_formats.keys())))
    parser.add_argument('-t',
                        '--taxonomy_format',
                        nargs='?',
                        default='newick',
                        help='taxonomy format')
    parser.add_argument('-o',
                        '--output_format',
                        nargs='?',
                        default='newick',
                        help='output format')
    parser.add_argument('-r',
                        '--root',
                        nargs='?',
                        default=None,
                        help='name of OTU to use as root of taxonomy')

    args = parser.parse_args()

    # read in the tree and taxonomy
    phylogeny = bp.read(args.phylogeny_file, args.phylogeny_format)
    taxonomy = bp.read(args.taxonomy_file, args.taxonomy_format)

    label_tree(phylogeny, taxonomy, tax_root=args.root)

    # write output to stdout
    print phylogeny.format(args.output_format)
Exemple #57
0
# makes a data frame where first column are node names, other columns are bbh, psi, and negatives count
def lookup_by_names(tree, df):
    """Replace first-column entries of *df* by the tree node names containing them.

    Args:
        tree: object whose ``find_clades()`` yields clades with a ``name``
            attribute (unnamed clades are ignored).
        df: pandas DataFrame whose first column holds the strings to look
            for; it is modified in place.

    Returns:
        The same DataFrame, with every first-column value that is a
        substring of a node name replaced by that node name.

    Raises:
        ValueError: if two clades of the tree share the same name.
    """
    names = {}
    for clade in tree.find_clades():
        if clade.name:
            if clade.name in names:
                raise ValueError("Duplicate key: %s" % clade.name)
            names[clade.name] = clade
    for node in names:
        # Address rows strictly by position: the index labels produced by
        # iterrows() are not valid for .iloc unless the index happens to be
        # the default 0..n-1 range.  The column is snapshotted once per node
        # so writes made for this node are not re-read in the same pass.
        for pos, value in enumerate(df.iloc[:, 0].tolist()):
            if value in node:
                df.iloc[pos, 0] = node
    return df


tree = Phylo.read('16s-epsilon-outgroup-labelled-r.tree',
                  'newick')  # parse the Newick tree file
# lookup_by_names returns the data frame itself (not a dictionary), with
# first-column entries replaced by the tree node names that contain them.
pfla = lookup_by_names(tree, pfla)
pflb = lookup_by_names(tree, pflb)

# make a column of which results column has most species in it
# (max_col is defined elsewhere -- behaviour as described, TODO confirm)
pfla = max_col(pfla)
pflb = max_col(pflb)


def binary_prot(df):
    df2 = df.iloc[:, :2]
    df2 = df2.reindex(columns=['accession', 'binary', 'binary2'])
    df2.iloc[:, 1:] = np.nan
    print(df2.head())
    for ind, val in df['max'].iteritems():
        if val == 'BBH':
#%% Preparatory file generation and organization.

## Read in the genome_data file.

genome_data = pd.read_csv(ref_dir_domain + 'genome_data.csv',
                          header=0,
                          index_col=0)
genome_data['clade'] = np.nan
genome_data['tip_name'] = np.nan
genome_data['npaths_actual'] = np.nan
genome_data['branch_length'] = np.nan

## Get the clade number of each assembly and add this information to
## genome_data.

tree = Phylo.read(tree, 'phyloxml')

assemblies = []

for clade in tree.get_terminals():
    clade_number = int(clade.confidence)
    print clade_number

    assembly = clade.name
    assembly = assembly.strip('@')

    if domain == 'eukarya':
        assembly = re.split('_', assembly)[0]

    else:
        assembly = re.split('_', assembly)
Exemple #59
0
def phylo_from_str(tree_str):
    """Parse the Newick text *tree_str* and return the resulting Phylo tree."""
    # Phylo.read expects a file name or handle, so wrap the text in an
    # in-memory buffer.
    return Phylo.read(StringIO.StringIO(tree_str), format='newick')
Exemple #60
0
 def test_to_networkx(self):
     """Tree to Graph conversion, if networkx is available."""
     tree = Phylo.read(EX_DOLLO, 'phyloxml')
     G = Phylo.to_networkx(tree)
     self.assertEqual(len(G.nodes()), 659)