Beispiel #1
0
  def reroot_tree_with_outgroup(tree_name, outgroups):
    """Re-root the Newick tree stored at ``tree_name`` and rewrite it in place.

    The taxa actually used for rooting are whatever
    ``GubbinsCommon.get_monophyletic_outgroup`` returns for ``outgroups``.
    The tree is rooted with Bio.Phylo, written back, then re-read with
    dendropy, derooted and serialised again with all single quotes removed.

    :param tree_name: path of the Newick file; it is overwritten twice.
    :param outgroups: candidate outgroup taxa handed to
        ``GubbinsCommon.get_monophyletic_outgroup``.
    """
    clade_outgroups = GubbinsCommon.get_monophyletic_outgroup(tree_name, outgroups)
    outgroups = [{'name': taxon_name} for taxon_name in clade_outgroups]

    # First pass: root with Bio.Phylo and persist the result.
    tree = Phylo.read(tree_name, 'newick')
    tree.root_with_outgroup(*outgroups)
    Phylo.write(tree, tree_name, 'newick')

    # Second pass: reload with dendropy to deroot and control the formatting.
    tree  = dendropy.Tree.get_from_path(tree_name, 'newick',
              preserve_underscores=True)
    tree.deroot()
    tree.update_splits()
    output_tree_string = tree.as_string(
      'newick',
      taxon_set=None,
      suppress_leaf_taxon_labels=False,
      suppress_leaf_node_labels=True,
      suppress_internal_taxon_labels=False,
      suppress_internal_node_labels=False,
      suppress_rooting=True,
      suppress_edge_lengths=False,
      unquoted_underscores=True,
      preserve_spaces=False,
      store_tree_weights=False,
      suppress_annotations=True,
      annotations_as_nhx=False,
      suppress_item_comments=True,
      node_label_element_separator=' ',
      node_label_compose_func=None
      )
    # The original ended with `output_file.closed`, which merely reads an
    # attribute and never closes the handle; the context manager guarantees
    # the data is flushed and the file closed.
    with open(tree_name, 'w+') as output_file:
      output_file.write(output_tree_string.replace('\'', ''))
Beispiel #2
0
def convert_boottrees(fname_trees):
    """Split a multi-tree Newick file into one Newick file per tree.

    Tree number ``i`` (counting from 0) is written to
    ``<fname_trees>.codeml-<i>``.

    :param fname_trees: path of the Newick file holding the trees.
    :return: list of the file names written, in input order.
    """
    written = []
    index = 0
    for boot_tree in Phylo.parse(fname_trees, "newick"):
        target = "%s.codeml-%d" % (fname_trees, index)
        Phylo.write(boot_tree, target, "newick")
        written.append(target)
        index += 1
    return written
Beispiel #3
0
 def phylo2newick(self, t):
     """
     Serialise the Phylo tree ``t`` to a Newick string.

     The tree is written into an in-memory buffer with ``Phylo.write`` and
     the buffer content is returned.
     """
     buffer = StringIO()
     Phylo.write(t, buffer, 'newick')
     newick_text = buffer.getvalue()
     return newick_text
def buildTree(FASTAFile):
    """Build a neighbour-joining tree from a FASTA alignment and return it
    in the structure produced by ``NewicktoRLR``.

    Steps: read the alignment, compute an "identity" distance matrix, build
    an NJ tree, root it at the midpoint and draw it with ``Phylo.draw``.
    The tree is then written as plain Newick into a buffer, the ``Inner<n>``
    labels and the trailing ``;`` are stripped, and the remaining text
    (terminal names wrapped in double quotes, nested parentheses) is parsed
    with ``literal_eval`` before being passed to ``NewicktoRLR``.

    :param FASTAFile: path or handle of the FASTA alignment.
    :return: whatever ``NewicktoRLR`` returns for the parsed tree.
    """
    alignment = AlignIO.read(FASTAFile, "fasta")

    # Tip id -> sequence string; built as in the original, not used further.
    tip_sequences = {entry.id: str(entry.seq) for entry in alignment}

    # Distance matrix, then the tree itself.
    distances = DistanceCalculator("identity").get_distance(alignment)
    nj_tree = DistanceTreeConstructor().nj(distances)
    nj_tree.root_at_midpoint()
    Phylo.draw(nj_tree)

    # Quote every terminal name so the Newick text becomes a valid Python
    # literal of nested tuples of strings.
    for leaf in nj_tree.get_terminals():
        leaf.name = "\"" + leaf.name + "\""

    newick_buffer = cStringIO.StringIO()
    Phylo.write(nj_tree, newick_buffer, 'newick', plain = True)
    newick_text = re.sub(r'Inner\d*', '', newick_buffer.getvalue())
    newick_text = newick_text.replace(";", "")
    nested = literal_eval(newick_text)

    # RLR tree required for maxParsimony function
    return NewicktoRLR(nested)
Beispiel #5
0
    def print_newick_tree(self, root_tee):
        """
        Convert an ElementTree into a ladderized Newick tree.

        The tree is exported with ``self.export_newick_tree``, ladderized via
        Bio.Phylo, stripped of ``:0.00000`` branch lengths, and its ``a<N>``
        labels are renumbered ``a1, a2, ...`` in order of first appearance.

        :param root_tee: object whose ``getroot()`` result is handed to
            ``self.export_newick_tree``.
        :return: the normalised Newick string.
        """
        newick = self.export_newick_tree(root_tee.getroot())

        # load into Phylo so we can sort the tree (i.e. ladderize)
        tree = Phylo.read(StringIO(newick), 'newick')
        tree.ladderize()

        # export the tree back to a string
        fout = StringIO()
        Phylo.write(tree, fout, 'newick')
        newick = fout.getvalue()

        # remove the branch lengths
        newick = newick.replace(':0.00000', '').strip()

        # get the order of admix nodes in the tree
        order = list(OrderedDict.fromkeys(re.findall(r'a\d+', newick)))

        # normalise the node numbering in ONE regex pass. The previous chain of
        # str.replace calls rewrote 'a1' inside 'a12' as well, so trees with
        # ten or more admix nodes could end up with wrong or duplicate labels.
        renumbered = {old: 'n%s' % (i + 1) for i, old in enumerate(order)}
        newick = re.sub(r'a\d+', lambda match: renumbered[match.group(0)], newick)

        # replace n<N> with a<N> (to preserve the existing cache)
        newick = re.sub(r'n([0-9]+)', r'a\1', newick)

        return newick
def annotate_cOTU_tree(cOTU_tree_string,results_list):
    """Write per-node ``fdr_p`` values into a cOTU tree as confidences.

    For every entry of ``results_list`` the Newick text obtained from
    ``load_de_numericized_newick_tree(entry['s_nodes'], ...)`` is parsed, the
    common ancestor of its terminals is looked up (by name) in the main tree,
    and that ancestor's ``confidence`` is set to ``float(entry['fdr_p'])``.

    :param cOTU_tree_string: Newick text of the tree to annotate.
    :param results_list: iterable of dicts with ``'s_nodes'`` and ``'fdr_p'``.
    :return: the annotated tree as a Newick string.
    """
    from Bio import Phylo
    from StringIO import StringIO

    annotated = Phylo.read(StringIO(cOTU_tree_string), 'newick', rooted=True)

    for result in results_list:
        subtree_newick = load_de_numericized_newick_tree(
            result['s_nodes'], before="cOTU_", after="")
        subtree = Phylo.read(StringIO(subtree_newick), 'newick', rooted=True)

        # Name-based targets identifying the same tips in the main tree.
        tip_targets = [{"name": tip.name} for tip in subtree.get_terminals()]

        ancestor = annotated.common_ancestor(tip_targets)
        ancestor.confidence = float(result['fdr_p'])

    buffer = StringIO()
    Phylo.write(annotated, buffer, 'newick')
    return buffer.getvalue()
def main():
    """Read the Eukaryota subtree and node list, prepare the tree, save it.

    Appends ``[id, originaltag, finaltag]`` for every non-empty CSV row to the
    module-level ``nodelist``, calls ``prepare_tree`` on the tree's root clade
    and writes the tree to ``./results/Eukaryota_tree-castor.tre``.
    ``CURRENT_TIME`` is refreshed through ``print_time`` after each stage.
    """
    global START_TIME
    global CURRENT_TIME
    global nodelist

    def banner(text):
        # Green stage header on stdout.
        print(colored(text, "green"))

    banner("---------------- read tree ----------------")
    subtree_path = './data/subtree/Eukaryota.tre'
    tree = Phylo.read(subtree_path, 'newick')
    CURRENT_TIME = print_time(CURRENT_TIME)

    banner("---------------- read nodelist ----------------")
    nodelist_path = './data/nodelist/Eukaryota-castor.csv'
    # CSV columns: id, originaltag, finaltag, depth, heights, nr_children
    # (only the first three are kept).
    with open(nodelist_path, 'r') as csv_file:
        rows = csv.reader(csv_file, delimiter=',')
        next(rows, None)      # skip the header
        for row in rows:
            if not row:
                continue
            nodelist.append([row[0], row[1], row[2]])
    CURRENT_TIME = print_time(CURRENT_TIME)

    banner("---------------- prepare tree ----------------")
    prepare_tree(tree.clade)
    banner("---------------- Save tree ----------------")
    Phylo.write(tree, './results/Eukaryota_tree-castor.tre', 'newick')
    CURRENT_TIME = print_time(CURRENT_TIME)
    banner("--------------------------------")
def tree(option, opt, value, parser):
    """Rename the terminals of a tree and write it in another format.

    The signature matches an option-parser callback: ``value`` supplies the
    input and output format names, ``parser.values`` supplies ``inputfile``,
    ``outputfile``, ``convert`` and (for the reverse direction) ``dictfile``.

    * ``convert == False``: terminals are renamed ``seq1``, ``seq2``, ... and
      the tree is written. The short-name -> original-name mapping is built
      but NOT saved (writing the ``.dict`` file is disabled in this code).
    * ``convert == True``: the mapping is loaded from ``dictfile`` and the
      original names are restored before writing.

    Any other ``convert`` value writes nothing.
    """
    inputfile = parser.values.inputfile
    outputfile = parser.values.outputfile
    inputtype = str(value[0])
    outputtype = str(value[1])
    tree = Phylo.read(inputfile, inputtype)
    records = tree.get_terminals()

    # convert sequences to small names
    if parser.values.convert == False:
        namedict = dict()
        for x, clade in enumerate(records, 1):
            newname = 'seq%i' % x
            namedict[newname] = clade.name
            clade.name = newname
        Phylo.write(tree, outputfile, outputtype)
        # NOTE: persisting `namedict` to outputfile + ".dict" is currently
        # disabled, so the mapping is lost once this function returns.

    # use dictionary to convert taxon back to large names
    elif parser.values.convert == True:
        # Close the dictionary file deterministically (the handle used to leak).
        with open(parser.values.dictfile, "r") as dicthandle:
            dict_text = dicthandle.read()
        # SECURITY NOTE(review): eval() executes arbitrary code found in the
        # dictionary file. Only use files you produced yourself; consider
        # ast.literal_eval if the file is always a plain dict literal.
        namedict = eval(dict_text)
        for clade in records:
            clade.name = namedict[clade.name]
        Phylo.write(tree, outputfile, outputtype)
Beispiel #9
0
def trim_tree(absenteeList, TreeFile):
    """Prune the taxa in ``absenteeList`` from a Newick tree and write it out.

    The tree is read from ``TreeFile``, drawn as ASCII before and after
    pruning, and finally written in Newick format to the module-level
    ``TreeOutFileName``.

    No unrooting is performed here even though the printed message mentions a
    basal trifurcation; that step is disabled in this code.

    :param absenteeList: taxa to remove, each passed to ``tree.prune``.
    :param TreeFile: path or handle of the input Newick tree.
    """
    # All prints use the single-argument call form so the function behaves the
    # same under Python 2 and Python 3 (the original mixed both syntaxes).
    print("\nReading the Tree...")
    # parse the tree using Phylo
    tree = Phylo.read(TreeFile, 'newick')
    print("Here is the starting tree:")
    Phylo.draw_ascii(tree)
    terminals = tree.get_terminals()
    print("\nFound the following {} taxa in the tree:".format(len(terminals)))
    print(terminals)
    # prune away taxa that are not included for this sequence file
    for taxon in absenteeList:
        print("Removing absent")
        tree.prune(taxon)
    print("\nPruned away these species:")
    print(absenteeList)
    print("\nHere is the tree with the missing taxa pruned away:\n")
    Phylo.draw_ascii(tree)

    # Unless you have a clock, PAML requires an unrooted tree (trifurcation at
    # the first node). Collapsing the first internal node is NOT done here.
    print("\nOutputting the following revised tree for the species content of the sequence file")
    print("it should have a trifurcation at the base unless you are using a clock\n")
    Phylo.draw_ascii(tree)
    Phylo.write(tree, TreeOutFileName, "newick")
Beispiel #10
0
def writeTree(filename, tree, format_str='newick'):
    """ Write a tree to file using Biopython.

    :arg filename: name for output file
    :type filename: str

    :arg tree: the tree to write
    :type tree: :class:`~Bio.Phylo.BaseTree.Tree`

    :arg format_str: a string specifying the format for the tree
    :type format_str: str

    :raises ImportError: if Biopython's Phylo module cannot be imported
    :raises TypeError: if an argument has the wrong type
    """
    try:
        from Bio import Phylo
    except ImportError:
        raise ImportError('Phylo module could not be imported. '
            'Reinstall ProDy or install Biopython '
            'to solve the problem.')

    # Validate in the original order: filename, tree, format_str.
    checks = (
        (filename, str, 'filename should be a string'),
        (tree, Phylo.BaseTree.Tree, 'tree should be a Biopython.Phylo Tree object'),
        (format_str, str, 'format_str should be a string'),
    )
    for candidate, expected_type, message in checks:
        if not isinstance(candidate, expected_type):
            raise TypeError(message)

    Phylo.write(tree, filename, format_str)
Beispiel #11
0
def root_tree_with_outgroup(input_file, output_file, outgroup):
    """Root a Newick tree on the taxon named ``outgroup`` and write it out.

    Best effort: if rooting or writing fails, a message is printed and no
    exception is propagated. Reading the input tree is not guarded.

    :param input_file: path or handle of the input Newick tree.
    :param output_file: destination for the rooted tree.
    :param outgroup: name of the taxon to root on.
    """
    input_tree = Phylo.read(input_file, 'newick')
    try:
        input_tree.root_with_outgroup({'name': outgroup})
        Phylo.write(input_tree, output_file, 'newick')
    except Exception:
        # `except Exception` (not a bare except) keeps the best-effort
        # behaviour while letting KeyboardInterrupt/SystemExit through.
        print('Could not root', input_file)
Beispiel #12
0
    def save_treetime_results(self):
        """Write all result files into ``self._root_dir`` and zip them.

        JSON files for the web interface are produced first, then the Newick
        tree and the remaining downloads (alignment, metadata CSV, molecular
        clock CSV, GTR). Finally the listed outputs are bundled into the
        archive named by the module-level ``zipname``.
        """
        from Bio import Align

        def in_root(name):
            # Absolute location of an output file inside the run directory.
            return os.path.join(self._root_dir, name)

        #  files to be displayed in the web interface
        self._tree_to_json()
        self._likelihoods_to_json()

        # files to be downloaded as .zip archive
        Phylo.write(self.tree, in_root(out_tree_nwk), 'newick')
        self._save_alignment()
        self._save_metadata_to_csv()
        self._save_molecular_clock_to_csv()
        self._save_gtr()

        # zip all results to one file (the input config is not included)
        archive_members = (
            out_tree_nwk,
            out_aln_fasta,
            out_metadata_csv,
            out_tree_json,
            out_mol_clock_csv,
            out_likelihoods_json,
            out_gtr,
        )
        with zipfile.ZipFile(in_root(zipname), 'w') as out_zip:
            for member in archive_members:
                out_zip.write(in_root(member), arcname=member)
Beispiel #13
0
def process_fasta(args):
    """Rename alignment records and tree tips, then write both to disk.

    Metadata rows whose ``date`` is one of a fixed set of ambiguous values
    are dropped and a ``numeric_date`` column is added. The alignment is
    renamed through ``rename_aln``; the initial tree is passed through
    ``rescale_tree`` and its tips are renamed with the same mapping. The
    results are written to ``<base_path>/<aln name>renamed.fasta`` and
    ``...renamed.newick``, and those paths are stored on ``args``.

    :param args: namespace with ``metadata``, ``aln``, ``ph``, ``initTree``
        and ``base_path``.
    :return: ``args`` with ``finalAln`` and ``finalTree`` set.
    """
    metadata = pd.read_csv(args.metadata, sep='\t')

    # removes rows with ambiguous dates, then computes the numeric date
    ambiguous_dates = {'2019', '2020', '2020-01', '2020-02', '2020-03',
                       '2020-01-XX', '2020-02-XX', '2020-03-XX',
                       '2020-04-XX'}
    keep_row = ~metadata['date'].isin(ambiguous_dates)
    metadata = metadata[keep_row]
    parsed_dates = pd.to_datetime(metadata['date'])
    metadata['numeric_date'] = parsed_dates.apply(numeric_date)

    records = list(SeqIO.parse(args.aln, "fasta"))
    records, names = rename_aln(records, args.ph, metadata)

    tree = Phylo.read(args.initTree, 'newick')
    tree = rescale_tree(tree)       # TODO WHY DO WE NEED THIS
    for tip in tree.get_terminals():
        tip.name = names[tip.name]

    aln_name = args.aln.split('/')[-1].replace('.fasta', '')
    out_prefix = args.base_path+'/'+aln_name
    renamed_fasta_path = out_prefix+'renamed.fasta'
    renamed_tree_path = out_prefix+'renamed.newick'

    with open(renamed_fasta_path, 'w') as out_fasta:
        SeqIO.write(records, out_fasta, 'fasta')
    with open(renamed_tree_path, 'w') as out_tree:
        Phylo.write(tree, out_tree, 'newick')

    args.finalAln = renamed_fasta_path
    args.finalTree = renamed_tree_path
    return args
Beispiel #14
0
def build_phylogeny_trees():
    """Align every file in ``out/homologous_gene_sequences/`` and build a
    UPGMA tree from each alignment.

    For each input file: Clustal Omega is run through ``os.system``, the
    aligned FASTA is read back, an "identity" distance matrix is computed, a
    UPGMA tree is built, drawn (graphically and as ASCII) and written as
    Nexus to ``out/phylogenetic_trees/<file name>_tree.nex``.
    """
    source_dir = "out/homologous_gene_sequences/"
    aligned_dir = "out/aligned_homologous_gene_sequences/"

    for fasta_name in os.listdir(source_dir):
        unaligned_path = source_dir + fasta_name
        aligned_path = aligned_dir + fasta_name

        # Run the aligner as an external command.
        aligner_cmd = ClustalOmegaCommandline(infile=unaligned_path, outfile=aligned_path, verbose=True, auto=True)
        os.system(str(aligner_cmd))

        alignment = AlignIO.read(aligned_path, 'fasta')

        # Distance matrix -> UPGMA tree
        distances = DistanceCalculator('identity').get_distance(alignment)
        upgma_tree = DistanceTreeConstructor().upgma(distances)

        Phylo.draw(upgma_tree)

        print('\nPhylogenetic Tree\n', fasta_name)
        Phylo.draw_ascii(upgma_tree)
        nexus_path = 'out/phylogenetic_trees/{}_tree.nex'.format(fasta_name)
        Phylo.write([upgma_tree], nexus_path, 'nexus')
Beispiel #15
0
def main():
	"""Run the Monte Carlo tree search and report the result.

	Reads the alignment (``args.a``) and the initial tree (``args.n``),
	selects the neighbour function (SPR if ``args.spr``, else TBR if
	``args.tbr``, else NNI), runs ``MonteCarlo`` and writes the final tree as
	Newick to ``args.o``. Both trees are then printed with their parsimony
	score and drawn, followed by a histogram of the sampled scores.
	"""
	args = parse_arguments()

	msa = ParsimonyTree.read_msa(args.a)
	i_tree = ParsimonyTree.read_tree(args.n)

	if args.spr:
		neighbor_fn = ParsimonyTree.get_spr_neighbors
	elif args.tbr:
		neighbor_fn = ParsimonyTree.get_tbr_neighbors
	else:
		neighbor_fn = ParsimonyTree.get_nni_neighbors

	mcmc = MonteCarlo(msa, i_tree, neighbor_fn, args.r, args.p)
	f_tree = mcmc.get_tree()

	with open(args.o, "w") as outfile:
		Phylo.write(f_tree, outfile, "newick")

	def report(title, candidate):
		# Separator, title, parsimony score, then both drawings of the tree.
		print("\n=========================\n")
		print(title)
		print("Score:", ParsimonyTree.get_parsimony_score(msa, candidate))
		Phylo.draw(candidate)
		Phylo.draw_ascii(candidate)

	report("Original Tree", i_tree)
	report("Final Tree", f_tree)

	histogram_title = "Histogram of Parsimony Scores"
	print("\n=========================\n")
	print(histogram_title)
	plt.title(histogram_title)
	plt.hist(mcmc.get_scores())
	plt.show()
def phylo_tree_score_otus(input_file,
                          tree,
                          path=RESULT_FOLDER,
                          out_tree='out_tree.txt'):
    """Build a phylogenetic tree from a score matrix file and write it out.

    File layout as parsed here: the first line holds the number of rows;
    each following row is ``<otu name> <space separated floats>``, where the
    last space-separated field of the row is discarded.

    :param input_file: matrix file name, resolved inside ``RESULT_FOLDER``.
    :param tree: ``"nj"`` or ``"upgma"``; any other value writes nothing.
    :param path: directory that receives the output tree.
    :param out_tree: file name of the Newick output.
    :raises Exception: "No valid matrix" if the file content cannot be
        parsed, "Phylogenetic Tree Generation Error" if building or writing
        the tree fails.
    """
    # Build the phylogenetic tree
    score = []
    otus = []
    # NOTE(review): the input is read from RESULT_FOLDER even when a different
    # `path` is supplied; only the output honours `path` — confirm intent.
    # The `with` block closes the file even when parsing fails (the handle
    # used to leak on the error path).
    with open(os.path.join(RESULT_FOLDER, input_file), "r") as f:
        try:
            line_count = int(f.readline())
            for _ in range(line_count):
                line_read = f.readline().split(" ", 1)
                otus.append(line_read.pop(0))
                pre_score = line_read.pop(0)
                score.append(list(map(float, pre_score.split(" ")[:-1:])))
        except Exception:
            raise Exception("No valid matrix")
    print("Create Phylogenetic Tree...")
    try:
        if tree == "nj":
            print("nj")
            Phylo.write(makeNj(score, otus), os.path.join(path, out_tree),
                        "newick")
        elif tree == "upgma":
            print("upgma")
            Phylo.write(makeUpgma(score, otus), os.path.join(path, out_tree),
                        "newick")
    except Exception:
        raise Exception("Phylogenetic Tree Generation Error")
Beispiel #17
0
 def phylo2newick(self, t):
     """
     Serialise the Phylo tree ``t`` to a Newick string.

     The tree is written into an in-memory buffer with ``Phylo.write`` and
     the buffer content is returned.
     """
     buffer = StringIO()
     Phylo.write(t, buffer, "newick")
     newick_text = buffer.getvalue()
     return newick_text
Beispiel #18
0
def action(args):
    """Rename the leaves of the first tree in ``args.tree`` and write it.

    The renaming steps are chained as lazy generators and only executed by
    the final ``list(leafs)``:

    1. if ``args.info`` is given, only leaves listed there continue through
       the pipeline and are renamed from the seqname -> newname table
       (for Newick, ``:`` in seqnames is replaced by ``|`` first);
    2. if ``args.remove_word`` is given, that regex is removed from each
       name and the name is stripped;
    3. ``args.add_prefix`` and ``args.add_suffix`` are applied;
    4. for Newick output, spaces become underscores.

    The tree is written to ``args.out`` in format ``args.tree_type``.
    """
    def newname(leaf, newname):
        leaf.name = newname
        return leaf

    # next() instead of the Python-2-only `.next()` method; works on both.
    tree = next(Phylo.parse(args.tree, args.tree_type))
    leafs = (leaf for leaf in tree.get_terminals())

    if args.info:
        info = DictReader(args.info, fieldnames=['seqname', 'newname'])
        info = {i['seqname']: i['newname'] for i in info}

        # for newick trees :s will be replaced by |s
        if args.tree_type == 'newick':
            info = {s.replace(':', '|'): n for s, n in info.items()}

        leafs = (l for l in leafs if l.name in info)
        leafs = (newname(l, info[l.name]) for l in leafs)

    if args.remove_word:
        leafs = (newname(l, re.sub(args.remove_word, '', l.name))
                 for l in leafs)
        leafs = (newname(l, l.name.strip()) for l in leafs)

    leafs = (newname(l, args.add_prefix + l.name) for l in leafs)
    leafs = (newname(l, l.name + args.add_suffix) for l in leafs)

    # do this last
    if args.tree_type == 'newick':
        leafs = (newname(l, l.name.replace(' ', '_')) for l in leafs)

    # execute changes and write tree
    list(leafs)
    Phylo.write(tree, args.out, args.tree_type)
Beispiel #19
0
    def _calculate_gsi(self):
        """
        Build the majority-consensus tree of all gene trees and write it out.

        The number of ``.bcg`` files in the BCG directory is counted and used
        to scale the consensus cutoff. The consensus tree is drawn as ASCII
        and written in Newick format into the alignment output directory.
        :return:
        """
        LOGGER.info("Calculating Gene Support Indices (GSIs)"
                    " from the gene trees..")

        bcg_dir = os.path.join(self._dirpath, self.config.bcg_dir)
        genome_num = sum(1 for entry in os.listdir(bcg_dir)
                         if entry.endswith('.bcg'))

        nwk_file = os.path.join(self._align_output_dir, "all_gene.trees")
        gene_trees = Phylo.parse(nwk_file, 'newick')
        cutoff = (100-self.config.gsi_threshold) * genome_num/100
        consensus = Consensus.majority_consensus(gene_trees, cutoff=cutoff)
        Phylo.draw_ascii(consensus)

        out_name = f'UBCG_gsi({self._bcg_num}){self.config.postfixes.align_tree_const}'
        ubcg_gsi_file = os.path.join(self._align_output_dir, out_name)
        with open(ubcg_gsi_file, 'w') as handle:
            Phylo.write(consensus, handle, 'newick')

        LOGGER.info("The final tree marked with GSI was written"
                    " to %s", ubcg_gsi_file)
Beispiel #20
0
    def reroot_tree_with_outgroup(tree_name, outgroups):
        """Re-root the Newick tree stored at ``tree_name`` and rewrite it in place.

        The taxa actually used for rooting are whatever
        ``GubbinsCommon.get_monophyletic_outgroup`` returns for ``outgroups``.
        The tree is rooted with Bio.Phylo, written back, then re-read with
        dendropy, derooted and serialised again with all single quotes removed.

        :param tree_name: path of the Newick file; it is overwritten twice.
        :param outgroups: candidate outgroup taxa handed to
            ``GubbinsCommon.get_monophyletic_outgroup``.
        """
        clade_outgroups = GubbinsCommon.get_monophyletic_outgroup(tree_name, outgroups)
        outgroups = [{"name": taxon_name} for taxon_name in clade_outgroups]

        # First pass: root with Bio.Phylo and persist the result.
        tree = Phylo.read(tree_name, "newick")
        tree.root_with_outgroup(*outgroups)
        Phylo.write(tree, tree_name, "newick")

        # Second pass: reload with dendropy to deroot and control the formatting.
        tree = dendropy.Tree.get_from_path(tree_name, "newick", preserve_underscores=True)
        tree.deroot()
        tree.update_bipartitions()
        output_tree_string = tree.as_string(
            schema="newick",
            suppress_leaf_taxon_labels=False,
            suppress_leaf_node_labels=True,
            suppress_internal_taxon_labels=False,
            suppress_internal_node_labels=False,
            suppress_rooting=True,
            suppress_edge_lengths=False,
            unquoted_underscores=True,
            preserve_spaces=False,
            store_tree_weights=False,
            suppress_annotations=True,
            annotations_as_nhx=False,
            suppress_item_comments=True,
            node_label_element_separator=" ",
        )
        # The `with` block closes the file; the stray `output_file.closed`
        # expression that followed the write did nothing and was removed.
        with open(tree_name, "w+") as output_file:
            output_file.write(output_tree_string.replace("'", ""))
Beispiel #21
0
def generate_new_files(fname) :
    """Write copies of a FASTA file (and optionally its tree) with short ids.

    Records are renamed ``flyg1``, ``flyg2``, ... in file order, their name
    and description are blanked, and the result is written to ``fname`` with
    every ``.`` replaced by ``_.``. Unless the module-level ``RUN_RAXML`` is
    truthy, the matching Newick tree (``fasta`` -> ``tree`` in the file name)
    is rewritten with the same ids.

    :param fname: path of the input FASTA file.
    :return: path of the renamed FASTA file.
    """
    # to get gene names that slr can handle (short enough)
    newfname = fname.replace(".", "_.")

    # generate a fasta file with new ids
    short_ids = {}
    renamed_records = []
    for number, record in enumerate(SeqIO.parse(fname, 'fasta'), 1):
        short_id = "flyg%s" % number
        short_ids[record.id] = short_id
        record.id = short_id
        record.name = ""
        record.description = ""
        renamed_records.append(record)

    SeqIO.write(renamed_records, newfname, "fasta")

    if RUN_RAXML :
        return newfname

    # generate a treefile with new ids
    treefile = fname.replace("fasta", "tree")
    newtreefile = newfname.replace("fasta", "tree")

    tree = Phylo.read(treefile, 'newick')
    for leaf in tree.get_terminals():
        leaf.name = short_ids[leaf.name]
    Phylo.write(tree, newtreefile, 'newick')

    return newfname
Beispiel #22
0
    def build(self, root='midpoint', raxml=True, raxml_time_limit=0.5):
        """Infer a tree for ``self.aln`` with fasttree, optionally refined by RAxML.

        Works inside ``self.run_dir`` (created, entered, and removed at the
        end). The final Newick file is handed to ``self.tt_from_file`` together
        with ``root``.

        :param root: forwarded unchanged to ``self.tt_from_file``.
        :param raxml: when true, run RAxML on top of the fasttree result.
        :param raxml_time_limit: time budget in hours for the RAxML topology
            search; values <= 0 skip that search and use the fasttree topology.
        """
        from Bio import Phylo, AlignIO
        import subprocess, glob, shutil
        make_dir(self.run_dir)
        os.chdir(self.run_dir)
        # Make record names equal to ids before writing the alignment.
        for seq in self.aln: seq.name=seq.id
        AlignIO.write(self.aln, 'temp.fasta', 'fasta')

        # Initial tree: fasttree, run through the shell so that ">" redirects
        # its output into initial_tree.newick.
        tree_cmd = ["fasttree"]
        if self.nuc: tree_cmd.append("-nt")
        tree_cmd.append("temp.fasta")
        tree_cmd.append(">")
        tree_cmd.append("initial_tree.newick")
        os.system(" ".join(tree_cmd))

        out_fname = "tree_infer.newick"
        if raxml:
            if raxml_time_limit>0:
                # Resolve polytomies (at most 10 extra passes) before the tree
                # is given to RAxML as starting topology.
                tmp_tree = Phylo.read('initial_tree.newick','newick')
                resolve_iter = 0
                resolve_polytomies(tmp_tree)
                while (not tmp_tree.is_bifurcating()) and (resolve_iter<10):
                    resolve_iter+=1
                    resolve_polytomies(tmp_tree)
                Phylo.write(tmp_tree,'initial_tree.newick', 'newick')
                AlignIO.write(self.aln,"temp.phyx", "phylip-relaxed")
                print( "RAxML tree optimization with time limit", raxml_time_limit,  "hours")
                # using exec to be able to kill process
                end_time = time.time() + int(raxml_time_limit*3600)
                process = subprocess.Popen("exec raxml -f d -T " + str(self.nthreads) + " -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True)
                # Poll every 10 s until RAxML has written its result or the
                # time budget is used up, then terminate the process.
                while (time.time() < end_time):
                    if os.path.isfile('RAxML_result.topology'):
                        break
                    time.sleep(10)
                process.terminate()

                # Use the last entry of the candidate list: the final result if
                # present, otherwise the last checkpoint in the order returned
                # by glob (NOTE(review): that order is not sorted here).
                checkpoint_files = glob.glob("RAxML_checkpoint*")
                if os.path.isfile('RAxML_result.topology'):
                    checkpoint_files.append('RAxML_result.topology')
                if len(checkpoint_files) > 0:
                    last_tree_file = checkpoint_files[-1]
                    shutil.copy(last_tree_file, 'raxml_tree.newick')
                else:
                    shutil.copy("initial_tree.newick", 'raxml_tree.newick')
            else:
                shutil.copy("initial_tree.newick", 'raxml_tree.newick')

            # NOTE(review): temp.phyx is only written in the
            # raxml_time_limit>0 branch above, yet it is read here as well.
            # os.system reports failure via its return value rather than by
            # raising, so the except branch is reached only if copying
            # RAxML_result.branches fails.
            try:
                print("RAxML branch length optimization")
                os.system("raxml -f e -T " + str(self.nthreads) + " -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick")
                shutil.copy('RAxML_result.branches', out_fname)
            except:
                print("RAxML branch length optimization failed")
                shutil.copy('raxml_tree.newick', out_fname)
        else:
            shutil.copy('initial_tree.newick', out_fname)
        self.tt_from_file(out_fname, root)
        # Leave the run directory (assumed to sit one level below the previous
        # working directory) and delete it.
        os.chdir('..')
        remove_dir(self.run_dir)
        self.is_timetree=False
def root(tree, clade, filename):
    """
    Root a Newick tree on the outgroup taxa listed in a file.

    The rooted tree is drawn as ASCII and written in Newick format.

    Parameters
    ----------
    tree:
        newick tree file
    clade:
        single column file of outgroup taxa (one name per line;
        blank lines are ignored)
    filename:
        output file name
    """

    # read in tree
    phylo_tree = Phylo.read(tree, 'newick')

    # read the outgroup names; the context manager closes the file, which the
    # previous bare open() call never did
    with open(clade) as clade_file:
        taxon_names = [line.rstrip('\n') for line in clade_file]
    taxon_names = [taxon_name for taxon_name in taxon_names if taxon_name]

    outgroup = [{'name': taxon_name} for taxon_name in taxon_names]

    # `Tree.root` is the root clade attribute, not a method, so the former
    # `tree.root(outgroup)` call could not re-root anything;
    # root_with_outgroup is the Bio.Phylo re-rooting API and accepts a list
    # of targets.
    phylo_tree.root_with_outgroup(outgroup)

    Phylo.draw_ascii(phylo_tree)

    Phylo.write(phylo_tree, filename, 'newick')
def tree(from_cluster,to_cluster, grupa):
    """Build bootstrap-filtered consensus trees for a range of clusters.

    For every cluster index in ``range(from_cluster, to_cluster)`` the
    alignment ``msa\\msa_rodzina_<i>_s.fasta`` is read, 50 neighbour-joining
    bootstrap trees are generated, every bootstrap tree that contains an
    internal node with support below 50 is discarded, and the majority
    consensus of the remaining trees (if any) is collected. All consensus
    trees are written to ``drzewa_wynikowe_<grupa>`` in Newick format.

    :param from_cluster: first cluster index (inclusive).
    :param to_cluster: last cluster index (exclusive).
    :param grupa: suffix of the output file name.
    """
    consensus_trees = []

    for i in range(from_cluster, to_cluster):
        # Same path as before: '\m' is not an escape sequence, so the
        # backslash was always literal; it is now spelled explicitly.
        msa = AlignIO.read('msa\\msa_rodzina_' + str(i) + '_s.fasta', 'fasta')
        print(i)
        calculator = DistanceCalculator('identity')

        try:
            # Result unused; kept so an alignment that cannot yield a
            # distance matrix is still skipped at this point.
            calculator.get_distance(msa)
            constructor = DistanceTreeConstructor(calculator, 'nj')
            trees_list = list(bootstrap_trees(msa, 50, constructor))

            # The support check used to sit outside the loop over trees, so
            # only the LAST bootstrap tree was ever examined. Each tree is
            # now checked individually.
            not_included = set()
            for j, target_tree in enumerate(trees_list):
                support_tree = get_support(target_tree, trees_list)
                if any(node.confidence < 50
                       for node in support_tree.get_nonterminals()):
                    not_included.add(j)

            kept_trees = [candidate for k, candidate in enumerate(trees_list)
                          if k not in not_included]

            if kept_trees:
                consensus_trees.append(majority_consensus(kept_trees))

        except ValueError as err:
            # Was `except: ValueError`, i.e. a bare except swallowing every
            # error. Only ValueError is treated as "skip this cluster" now.
            print('Skipping cluster %s: %s' % (i, err))

    Phylo.write(consensus_trees,"drzewa_wynikowe_" + str(grupa),"newick")
Beispiel #25
0
def ML_tree(infile, outfile, file_type):
    """Build a maximum-likelihood tree with PhyML and save tree and picture.

    The alignment ``infile`` (format ``file_type``: clustal if the clustal
    tool was used, fasta if muscle was used) is converted to PHYLIP, PhyML is
    run on it with the binary matching the module-level ``user_OS``
    ('darwin', 'linux' or 'win32'), and the resulting tree is written to
    ``tree.txt`` and drawn into ``tree.png``. All files live under
    ``static/data/sauvegardes/<dirName>``.

    If ``user_OS`` is none of the three values, no command is assigned and
    the subsequent call fails.

    :param infile: alignment file name inside the save directory.
    :param outfile: base name for the PHYLIP file.
    :param file_type: SeqIO format name of ``infile``.
    """
    save_dir = "static/data/sauvegardes/" + dirName
    phylip_path = save_dir + outfile + ".phylip"

    # convert file to phylip
    records = SeqIO.parse(save_dir + infile, file_type)  # clustal <-> fasta
    count = SeqIO.write(records, phylip_path, "phylip")
    print("Converted %i records" % count)

    # Pick the PhyML binary for the current platform.
    if (user_OS == 'darwin'):
        cmd = PhymlCommandline(cmd='static/tools/MacOS/PhyML-3.1/PhyML-3.1_macOS-MountainLion',
                               input=phylip_path)
    if (user_OS == 'linux'):
        cmd = PhymlCommandline(cmd='static/tools/Linux/PhyML-3.1/PhyML-3.1_linux64',
                               input=phylip_path)
    if (user_OS == 'win32'):
        cmd = PhymlCommandline(cmd= current_path + '/static/tools/Windows/PhyML-3.1/PhyML-3.1_win32.exe',
                               input=phylip_path)

    out_log, err_log = cmd()

    # PhyML writes its tree next to the input file.
    tree = Phylo.read(phylip_path + '_phyml_tree.txt', 'newick')
    Phylo.draw(tree, do_show=False)
    Phylo.write(tree, save_dir + 'tree.txt', "newick")
    picture_path = current_path + '/static/data/sauvegardes/' + dirName + 'tree.png'
    plt.savefig(picture_path)
Beispiel #26
0
def TreeAssembly(StartDIR, outfname, delete_name):
    """Assemble one tree from Newick fragments stored in nested directories.

    Clade names double as directory paths. Starting from ``StartDIR``, each
    pending clade is expanded breadth-first:

    * if ``<dir>/UPSTREAM.nwk`` can be read, its top-level clades are attached
      and its terminals are queued for further expansion;
    * otherwise ``<dir>/TERMINAL.nwk`` is tried: its top-level clades are
      attached, or, if it has none, the clade takes the name of its root;
    * if neither works, ``missing <dir>`` is printed.

    :param StartDIR: directory (and name) of the root clade.
    :param outfname: destination of the assembled Newick tree.
    :param delete_name: the string ``"TRUE"`` blanks all internal node names.
    """
    init_clade = Phylo.BaseTree.Clade(name=StartDIR)
    tree = Phylo.BaseTree.Tree(init_clade)
    pending = [tree.clade]
    while pending:
        cstate = pending.pop(0)
        WD = cstate.name
        # `except Exception` instead of bare excepts: any read/parse failure
        # still falls through to the next option, but KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        try:
            downtree = Phylo.read(WD + "/UPSTREAM.nwk", 'newick')
            cstate.clades.extend(downtree.clade.clades)
            pending.extend(downtree.get_terminals())
        except Exception:
            try:
                downtree = Phylo.read(WD + "/TERMINAL.nwk", 'newick')
                if (downtree.clade.clades != []):
                    cstate.clades.extend(downtree.clade.clades)
                else:
                    cstate.name = downtree.clade.name
            except Exception:
                print("missing " + WD)
    if (delete_name == "TRUE"):
        for internal_node in tree.get_nonterminals():
            internal_node.name = ""
    Phylo.write(tree, outfname, 'newick')
Beispiel #27
0
def make_patient_RNA_DNA_tree(pcode, min_DNA_frac = 0.001):
    ''' make a tree for all RNA/DNA sample of a given patient

    For each sequence category the DNA sequences of all prefixes of the
    patient are pooled with the patient's haplotype alignment, rare DNA
    sequences are pruned, the rest is aligned and a tree is inferred. The
    tree is rooted on the first leaf whose name starts with "days" (ordered
    by the integer in name field 1, then by descending integer in field 3
    without its last character), short internal branches (< 0.001) are
    collapsed, and the ladderized tree is written as Newick.
    '''
    def leaf_order(leaf):
        # Name fields are separated by underscores.
        fields = leaf.name.split('_')
        return (int(fields[1]), -int(fields[3][:-1]))

    for seq_type in ['clustered_good', 'good', 'hyper', 'suspicious']:
        seqs = []
        for outprefix in patient_to_prefix_p17[pcode]:
            with myopen('data/'+outprefix+'_DNA_'+seq_type+save_as) as ifile:
                seqs.extend(list(SeqIO.parse(ifile, 'fasta')))
        p = Patient.load(pcode)
        seqs.extend(p.get_haplotype_alignment(region))
        seqs_pruned = prune_rare_DNA(seqs, min_DNA_frac)

        # Make ids unique by appending the running index.
        for hi, hap in enumerate(seqs_pruned):
            hap.id += '_'+str(hi)
            hap.name = hap.id

        out_stem = 'data/'+pcode+'_RNA_and_DNA_'+seq_type
        outfname = out_stem+'.fasta'
        align(ungap(seqs_pruned), outfname)
        tree = infer_tree(outfname, min_DNA_frac=0.0)

        day_leaves = [leaf for leaf in tree.get_terminals()
                      if leaf.name[:4]=='days']
        leaves = sorted(day_leaves, key=leaf_order)
        tree.root_with_outgroup(leaves[0])
        tree.root.branch_length=0.01

        for branch in tree.get_nonterminals(order='postorder'):
            if branch.branch_length<0.001:
                tree.collapse(branch)
        tree.ladderize()
        Phylo.write(tree, out_stem+'.nwk', 'newick')
Beispiel #28
0
def tree_reconstruction(phy_file, method, model, phyformat):
    '''Construct a distance tree with the given method and model.

    The PHYLIP alignment is read (format ``'phylip-' + phyformat``), a
    distance matrix is computed with ``model`` and a tree is built with
    ``method`` ('upgma' or 'nj'; any other value leaves no tree and the next
    statement fails). The tree is ladderized, names containing "Inner" are
    blanked, and the result is written to ``tree.nwk`` and drawn into
    ``tree.svg`` inside the module-level ``args.output`` directory.
    '''
    alignment = AlignIO.read(phy_file, 'phylip-' + phyformat)

    constructor = DistanceTreeConstructor()
    distances = DistanceCalculator(model).get_distance(alignment)

    if method == 'upgma':
        tree = constructor.upgma(distances)
    elif method == 'nj':
        tree = constructor.nj(distances)

    tree.ladderize()

    # Drop the auto-generated names of internal nodes.
    for node in tree.find_clades():
        if 'Inner' in node.name:
            node.name = ''

    Phylo.write(tree, args.output + '/tree.nwk', 'newick')

    # Plot styling, applied in the original order.
    plt.rcParams['font.style'] = 'italic'
    rc_settings = (
        ('font', {'size': 8}),
        ('axes', {'titlesize': 14}),
        ('xtick', {'labelsize': 10}),
        ('ytick', {'labelsize': 10}),
        ('figure', {'titlesize': 18}),
    )
    for group, options in rc_settings:
        plt.rc(group, **options)

    draw(tree, do_show=False)
    plt.savefig(args.output + "/tree.svg", format='svg', dpi=1200)
Beispiel #29
0
    def summarise_dist(self, rf_results: RfResults, dir_out):
        """Write an NJ tree and a clustered heatmap of the RF distances.

        Runs twice, first with the normalised and then with the raw
        Robinson-Foulds value of every tree pair in ``rf_results.data``
        (keys ``(tid_a, tid_b)``, values ``(rf, norm_rf)``). Each run writes
        a Newick tree built by neighbour joining and an SVG heatmap into
        ``dir_out``.
        """
        variants = (
            (True, 'rf_normed.tree', 'rf_normed_heatmap.svg',
             'Normalised Robinson-Foulds Distance'),
            (False, 'rf_un_normed.tree', 'rf_un_normed_heatmap.svg',
             '(un)Normalised Robinson-Foulds Distance'),
        )

        for use_norm, tree_name, heatmap_name, plt_title in variants:
            path_out = os.path.join(dir_out, tree_name)
            path_hm = os.path.join(dir_out, heatmap_name)

            # Symmetric lookup of the selected distance for every pair.
            pairwise = defaultdict(dict)
            seen = set()
            for (tid_a, tid_b), (rf, norm_rf) in rf_results.data.items():
                value = norm_rf if use_norm else rf
                pairwise[tid_a][tid_b] = value
                pairwise[tid_b][tid_a] = value
                seen.add(tid_a)
                seen.add(tid_b)

            labels = sorted(seen)
            size = len(labels)

            # Lower-triangular rows for the tree builder, full square matrix
            # for the heatmap.
            lower_rows = []
            square = np.zeros((size, size))
            for i, tid_a in enumerate(labels):
                row = []
                for j in range(i + 1):
                    tid_b = labels[j]
                    if tid_a == tid_b:
                        row.append(0.0)
                        continue
                    row.append(pairwise[tid_a][tid_b])
                    square[i, j] = pairwise[tid_a][tid_b]
                lower_rows.append(row)
            square = square + square.T

            # Newick
            dm = DistanceMatrix(names=labels, matrix=lower_rows)
            nj_tree = DistanceTreeConstructor().nj(dm)
            Phylo.write(nj_tree, path_out, 'newick')

            # Heatmap
            cmap = sns.cubehelix_palette(100, reverse=True)
            sns.set(font_scale=1)
            fig_size = (15, 15)

            rf_df = pd.DataFrame(square, columns=labels, index=labels)
            grid = sns.clustermap(rf_df,
                                  annot=True,
                                  fmt='.3f',
                                  cmap=cmap,
                                  figsize=fig_size)
            grid.fig.suptitle(plt_title)
            plt.savefig(path_hm)
Beispiel #30
0
    def distance_matrix(cls, cluster_list):
        """Build a neighbour-joining tree for ``cluster_list`` and return it
        as a Newick string.

        Pairwise distances are loaded from ``Distance`` rows whose two
        accession numbers are both in ``cluster_list``; a pair may be stored
        in either order. The lower-triangular matrix (zero diagonal) is fed
        to the NJ constructor and the ladderized tree is serialised.

        :param cluster_list: list of accession numbers.
        :return: Newick text of the tree.
        :raises LookupError: if no distance is stored for some pair.
        """
        print(cluster_list)
        dists = Distance.objects.filter(rep_accnum1__in=cluster_list, rep_accnum2__in=cluster_list)

        # Tuple keys instead of "a_b" strings: joined strings are ambiguous
        # when an accession itself contains an underscore.
        distance_pairs = {(g.rep_accnum1, g.rep_accnum2): g.distance for g in dists.all()}

        matrix = []
        for i, name_i in enumerate(cluster_list):
            matrix_iteration = []
            for name_j in cluster_list[:i]:
                if (name_i, name_j) in distance_pairs:
                    matrix_iteration.append(distance_pairs[(name_i, name_j)])
                elif (name_j, name_i) in distance_pairs:
                    matrix_iteration.append(distance_pairs[(name_j, name_i)])
                else:
                    # Was `raise("...")`: raising a plain string is itself an
                    # error and produced an unrelated TypeError.
                    raise LookupError("Error, can't find pair! (%s, %s)"
                                      % (name_i, name_j))
            # diagonal element
            matrix_iteration.append(0)
            matrix.append(matrix_iteration)

        cluster_list = [s.encode('ascii', 'ignore') for s in cluster_list]
        matrix_obj = _DistanceMatrix(names=cluster_list, matrix=matrix)
        constructor = DistanceTreeConstructor()
        tree = constructor.nj(matrix_obj)
        tree.ladderize()
        output = StringIO.StringIO()
        Phylo.write(tree, output, 'newick')
        tree_str = output.getvalue()

        return tree_str
Beispiel #31
0
def write_xml(fname, E, C, l):
    """Write the tree encoded by an adjacency matrix to a PhyloXML file.

    The tree is rebuilt top-down starting from node ``n - 1`` (the last row
    of ``E``), serialized to Newick and re-read with Biopython so that it can
    be converted to PhyloXML.

    Args:
        fname: path of the PhyloXML file to write.
        E: 2-D array; ``E[i, j] == 1`` makes node ``j`` a child of node ``i``.
        C: 2-D array with one row per node; the columns from ``l`` onwards of
            a parent and its child are compared (L1 norm) to get the branch
            length.
        l: index of the first column of ``C`` used for branch lengths.
    """
    n, _ = E.shape

    root = Tree()
    root.name = str(n - 1)
    stack = [root]
    while stack:
        cur = stack.pop()
        i = int(cur.name)
        child_idxs = np.where(E[i, :] == 1)[0]
        for ci in child_idxs:
            child = cur.add_child(name=str(ci))
            child.dist = np.linalg.norm(np.subtract(C[i, l:], C[ci, l:]),
                                        ord=1)
            stack.append(child)

    # format=1 gives branch lengths and names for all nodes (leaves and
    # internal); format_root_node=True also puts the root name in the string.
    newick_str = root.write(
        features=['name'], format=1, format_root_node=True
    )
    newick_tree = Phylo.read(StringIO(newick_str), 'newick')

    # Phylo.read() interprets the names of internal nodes as confidences for
    # Newick strings, so move them back into the name field.
    for clade in newick_tree.find_clades():
        if clade.confidence is not None:
            clade.name = clade.confidence
            clade.confidence = None
    xmltree = newick_tree.as_phyloxml()  # convert to PhyloXML.Phylogeny type

    # Close the output file deterministically instead of leaking the handle.
    with open(fname, 'w') as handle:
        Phylo.write(xmltree, handle, 'phyloxml')
Beispiel #32
0
def ete3_evol_prepare(
    tree_in_fn,
    alignment_in_fn,
    tree_out_fn,
    foreground_list,
    min_foreground=2,
    min_background=2,
):
    """
    Read a species tree (Newick) and an alignment (FASTA), rename the tree
    with ``rename_tree`` and write it out if the alignment holds enough
    foreground and background sequences.

    Args:
        tree_in_fn: path of the input Newick tree.
        alignment_in_fn: path of the input FASTA alignment.
        tree_out_fn: path the renamed tree is written to (Newick).
        foreground_list: foreground taxa, passed on to
            ``has_enough_by_background_and_foreground``.
        min_foreground: minimum number of foreground sequences required.
        min_background: minimum number of background sequences required.

    Nothing is written when the check fails.
    """
    # Read inputs; the alignment handle is closed as soon as it is parsed.
    tree_in = Phylo.read(file=tree_in_fn, format="newick")
    with open(alignment_in_fn, "r") as alignment_handle:
        alignment_in = AlignIO.read(handle=alignment_handle, format="fasta")

    # Slice and rename leafs in tree
    tree_out = rename_tree(tree=tree_in, alignment=alignment_in)

    # Check that there are enough sequences
    if has_enough_by_background_and_foreground(
        alignment_in, foreground_list, min_foreground, min_background
    ):
        Phylo.write(trees=tree_out, file=tree_out_fn, format="newick")
def export_gain_loss(tree, path):
    '''
    Export the final tree and the per-branch gene gain/loss patterns.

    Three files are written below ``<path>/geneCluster/``:
    ``tree_result.newick`` (the tree), ``dt_geneEvents.cpk`` (pickled dict
    mapping gene index to the number of gain/loss events over all branches)
    and ``geneGainLossEvent.json`` (node name -> pattern string).

    A pattern string has one digit per gene, computed for every node that has
    a parent as ``2 * ancestral + derived`` from the parent's and the node's
    ``genepresence``; the digits 1 and 2 are the codes counted as events.
    '''
    from collections import defaultdict

    out_dir = path.rstrip('/') + '/' + 'geneCluster/'

    # write final tree with internal node names as assigned by treetime
    Phylo.write(tree.tree, out_dir + '/' + 'tree_result.newick', 'newick')

    patterns = defaultdict(str)
    # traversal order does not matter much here
    for node in tree.tree.find_clades(order='preorder'):
        if node.up is not None:
            codes = []
            for ancestral, derived in zip(node.up.genepresence,
                                          node.genepresence):
                codes.append(str(int(ancestral) * 2 + int(derived)))
            patterns[node.name] = "".join(codes)

    code_matrix = np.array(
        [list(pattern) for pattern in patterns.values()], dtype=int)
    # 1 and 2 are codes for gain/loss events
    is_event = (code_matrix == 1) | (code_matrix == 2)
    events_dict = dict(enumerate(is_event.sum(axis=0)))
    write_pickle(out_dir + '/' + 'dt_geneEvents.cpk', events_dict)

    # export gene loss dict to json for visualization
    write_json(patterns, out_dir + '/' + 'geneGainLossEvent.json', indent=1)
Beispiel #34
0
def write_clusters(seqfname, tree, clusters, unclustered):
    """Write each cluster and the unclustered rest as FASTA, and the tree as phyloXML.

    Clusters are numbered from the largest to the smallest; cluster ``i`` goes
    to ``<basename>.<i>``, unclustered sequences to ``<basename>.Unique`` and
    the tree to ``<basename>.xml``, all in the current directory. The clade of
    every cluster is given its own branch color and a width of 2 beforehand.
    Input is read as Clustal when ``seqfname`` ends in ``.aln``, otherwise as
    FASTA.
    """
    is_aln = seqfname.endswith('.aln')
    seq_format = 'clustal' if is_aln else 'fasta'
    seq_idx = SeqIO.to_dict(SeqIO.parse(seqfname, seq_format))

    def write_cluster(cluster, fname):
        """Write the sequences named in ``cluster`` (sorted by id) to ``fname``."""
        records = [seq_idx[seqid] for seqid in sorted(cluster)]
        with open(fname, 'w+') as handle:
            for rec in records:
                write_fasta(rec, handle, do_ungap=is_aln)
        logging.info("Wrote %s (%d sequences)", fname, len(records))

    palette = ColorSpiral().get_colors(len(clusters))
    colors = [BranchColor(*[int(channel * 255) for channel in rgb])
              for rgb in palette]

    by_size = sorted(clusters.items(), key=lambda kv: len(kv[1]),
                     reverse=True)
    for i, (clade, cluster) in enumerate(by_size):
        write_cluster(cluster, os.path.basename(seqfname) + '.' + str(i))
        clade.color = colors[i]
        clade.width = 2

    if unclustered:
        write_cluster(unclustered, os.path.basename(seqfname) + '.Unique')

    treefname = os.path.basename(seqfname) + '.xml'
    Phylo.write(tree, treefname, 'phyloxml')
    logging.info("Wrote %s", treefname)
Beispiel #35
0
def tiny_tree(INPUTfile, OUTPUTnwk, file_format="fasta"):
    """Write a trivial Newick tree for an input that holds one or two taxa.

    Names are collected from ``INPUTfile`` (opened through gzip when the name
    ends in ``.gz``), skipping any entry called ``root``. One name yields a
    single-node tree; two names yield an unnamed root with two children.

    Args:
        INPUTfile: path of the input file.
        OUTPUTnwk: path of the Newick file to write.
        file_format: ``"fasta"`` (record ids are the names) or ``"edit"``
            (first whitespace-separated field of each line is the name).

    Raises:
        ValueError: if the number of names is not 1 or 2. Previously this
            case printed a message and then crashed on an unbound ``tree``.
    """
    is_gzipped = (INPUTfile.split(".")[-1] == "gz")
    opener = gzip.open(INPUTfile, 'rt') if is_gzipped else open(INPUTfile, 'r')

    names = []
    # The context manager closes the input even when parsing fails.
    with opener as handle:
        if file_format == "fasta":
            for record in SeqIO.parse(handle, "fasta"):
                if record.id != "root":
                    names.append(record.id)
        elif file_format == "edit":
            for line in handle:
                name = line.split()[0]
                if name != "root":
                    names.append(name)

    if len(names) == 1:
        tree = Phylo.BaseTree.Tree(Phylo.BaseTree.Clade(name=names[0]))
    elif len(names) == 2:
        tree = Phylo.BaseTree.Tree(Phylo.BaseTree.Clade())
        tree.clade.clades.extend(
            Phylo.BaseTree.Clade(name=name) for name in names)
    else:
        print("tiny_tree() Error : len(names)=")
        print(len(names))
        raise ValueError(
            "tiny_tree() needs exactly 1 or 2 names, got %d" % len(names))

    Phylo.write(tree, OUTPUTnwk, 'newick')
Beispiel #36
0
    def write(self, phytrees_file):
        """
        Save the trees of this PhyTrees object in Newick format together with
        a report file.

        The trees go to 'phytrees_file' (made absolute with get_abspath()); the
        report goes to the same path with its extension replaced by ".rep" and
        lists the number of trees followed by the history entries. Existing
        files are overwritten without warning.

        Arguments :
            phytrees_file  ( string )
                New PhyTrees tree file.

        Raises :
            IOError
                Re-raised unchanged; no clean-up is attempted in this case.
            Any other exception
                Re-raised after removing whichever of the two output files
                exists.
        """
        data_filepath = get_abspath(phytrees_file)
        stem, _ = os.path.splitext(data_filepath)
        report_filepath = stem + '.rep'
        # One line per report entry, fields separated by four spaces
        history = '\n'.join('    '.join(entry) for entry in self._report)
        try:
            Phylo.write(self.data, data_filepath, 'newick')
            with open(report_filepath, 'w') as report_file:
                template = 'Num. trees: {:d}\nHistory:\n{:s}'
                report_file.write(template.format(len(self), history))
        except IOError:
            raise
        except:
            # Do not leave half-written output behind
            for leftover in (data_filepath, report_filepath):
                if os.path.isfile(leftover):
                    os.remove(leftover)
            raise
Beispiel #37
0
def action(args):
    """Rename the leaves of the first tree in ``args.tree`` and write it out.

    Steps applied to each leaf, in this order:

    1. If ``args.info`` is given, it is read as ``seqname,newname`` rows;
       leaves listed there are renamed and all other leaves are left
       completely untouched (none of the later steps apply to them).
    2. If ``args.remove_word`` is given, matches of that regular expression
       are removed and the name is stripped.
    3. ``args.add_prefix`` is prepended and ``args.add_suffix`` appended.
    4. For newick trees, spaces become underscores.

    The tree is written to ``args.out`` in format ``args.tree_type``.
    """
    tree = next(Phylo.parse(args.tree, args.tree_type))
    terminals = tree.get_terminals()
    is_newick = args.tree_type == 'newick'

    renames = None
    if args.info:
        reader = DictReader(args.info, fieldnames=['seqname', 'newname'])
        renames = {row['seqname']: row['newname'] for row in reader}

        # for newick trees :s will be replaced by |s
        if is_newick:
            renames = {old.replace(':', '|'): new
                       for old, new in renames.items()}

    for leaf in terminals:
        if renames is not None:
            if leaf.name not in renames:
                continue
            leaf.name = renames[leaf.name]

        if args.remove_word:
            leaf.name = re.sub(args.remove_word, '', leaf.name)
            leaf.name = leaf.name.strip()

        leaf.name = args.add_prefix + leaf.name
        leaf.name = leaf.name + args.add_suffix

        # do this last
        if is_newick:
            leaf.name = leaf.name.replace(' ', '_')

    Phylo.write(tree, args.out, args.tree_type)
Beispiel #38
0
    def serialize_trees(self, tree_uri='', format='newick', trees=None, handle=None):
        '''Retrieve trees serialized to any format supported by Biopython.

        Current options include 'newick', 'nexus', 'phyloxml', 'nexml', and
        'cdao'; 'ascii' draws the first tree as ASCII art instead.

        Arguments:
            tree_uri: identifier of the tree; when non-empty it is resolved
                with uri_from_id() before use.
            format: output format name.
            trees: trees to serialize; when None, the first tree returned by
                get_trees(tree_uri) is used.
            handle: optional file-like object to write to.

        Returns the serialized text, or None when `handle` was given (the
        output is then written to the handle).

        Raises Exception if there is no tree to serialize.

        Example:
        >>> treestore.serialize_trees('http://www.example.org/test/')
        '''

        if handle: s = handle
        else: s = StringIO()

        if tree_uri: tree_uri = self.uri_from_id(tree_uri)

        if trees is None:
            # Use a default so that an empty result reaches the explicit
            # "not found" error below instead of leaking StopIteration.
            first_tree = next(iter(self.get_trees(tree_uri)), None)
            trees = [] if first_tree is None else [first_tree]
        if not trees:
            raise Exception('Tree to be serialized not found.')

        if format == 'cdao':
            bp.write(trees, s, format, tree_uri=tree_uri)
        elif format == 'ascii':
            bp._utils.draw_ascii(next(iter(trees)), file=s)
        else:
            bp.write(trees, s, format)

        if handle: return
        return s.getvalue()
Beispiel #39
0
def pruneNewick(gt, protein_id):
    """Prune one leaf from a gene tree and overwrite its Newick file.

    The file is ``/SplitTrees/<prefix>/<gt>.newick``, where ``<prefix>`` is the
    part of ``gt`` before its first underscore; ``protein_id`` is the target
    handed to ``tree.prune``.
    """
    family = gt.partition("_")[0]
    newick_path = "/SplitTrees/" + family + '/' + gt + ".newick"

    tree = Phylo.read(newick_path, 'newick')
    tree.prune(target=protein_id)
    Phylo.write(tree, newick_path, 'newick')
Beispiel #40
0
def measure_D_net(G,qmod,qcon):
    D_net_dic = {}
    D_net_ret = {}
    D_net = []
    for u in G: D_net_dic[u] = {}

    for u in sorted(G):
        key1 = "Taxon" + str(u)
        tmp_row = []
        for v in sorted(G):
            key2 = "Taxon" + str(v)
            if u < v: continue
            D_net_dic[u][v] = 1.0 - G.dmc_likelihood(u,v,qmod,qcon)
            tmp_row.append(D_net_dic[u][v])

            print D_net_dic[u][v],
        D_net.append(tmp_row)
        print '\n'


    names = []
    for u in G: names.append('Taxon'+str(u))
    print names 
    print D_net
    D_net_final = _DistanceMatrix(names,D_net)
    #print D_net_final.names 

    constructor = DistanceTreeConstructor()
    tree_dmc = constructor.upgma(D_net_final)
    #print tree_dmc
    Phylo.write(tree_dmc,'ph_dmc.nre','newick')
    
    return D_net_final
Beispiel #41
0
def draw_tree():
    """Build, save and draw a bootstrap consensus tree of the padded alignment.

    Reads ``outfile_padded.aln`` (Clustal), builds a majority-rule consensus
    over 1000 bootstrap replicates using BLOSUM62 distances, colours the root
    green and the clade of ``PC_00004|DNA`` salmon, writes the tree to
    ``TreeToCutOff.xml`` (phyloXML) and saves the drawing to
    ``phylo-dot.png``.

    The unused identity-distance matrix and the two unused bootstrap
    generators of the earlier version were dropped; they did not influence
    the result.
    """
    alignment = AlignIO.read('outfile_padded.aln', 'clustal')  # reading the alignment file
    constructor = DistanceTreeConstructor(DistanceCalculator('blosum62'))

    consensus_tree = bootstrap_consensus(alignment, 1000, constructor, majority_consensus)
    consensus_tree.ladderize()
    consensus_tree.root.color = "green"
    mrca = consensus_tree.common_ancestor({"name": "PC_00004|DNA"})
    mrca.color = "salmon"

    Phylo.write(consensus_tree, 'TreeToCutOff.xml', 'phyloxml')

    Phylo.draw(consensus_tree, show_confidence=True)
    pylab.gcf().set_dpi(300)
    pylab.savefig("phylo-dot.png")
    pylab.clf()
Beispiel #42
0
def main():
    """Colour the leaves of a tree by group and save it as phyloXML.

    The tree is read from ``args.input_file`` in format ``args.input_type``.
    ``args.zchemat_kolorowania`` selects the colour/group provider: ``eba``,
    ``fungi`` or ``opisto``. Confidences of internal branches are tagged as
    bootstrap values and copied into the branch names. Each leaf name is
    reduced to a key (see below) and the leaf takes the colour of the last
    group that contains that key. The result is written to
    ``args.output_file``.

    Raises:
        ValueError: if the colouring scheme is not one of the three above
            (previously this surfaced later as an unbound-name error).
    """
    args = parse_args()
    tree = Phylo.read(args.input_file, args.input_type)
    tree = tree.as_phyloxml()
    if args.zchemat_kolorowania == 'eba':
        get_colors_and_groups = get_eukariota_group
    elif args.zchemat_kolorowania == 'fungi':
        get_colors_and_groups = get_fungus_groups
    elif args.zchemat_kolorowania == 'opisto':
        get_colors_and_groups = get_opisto_groups
    else:
        raise ValueError(
            "unknown colouring scheme: %r" % (args.zchemat_kolorowania,))

    for branch in tree.get_nonterminals():
        try:
            branch.confidence.type = "bootstrap"
            branch.name = branch.confidence.value
        except AttributeError:
            # Branch without a confidence object: nothing to relabel.
            pass

    colors, list_of_groups = get_colors_and_groups()

    for leaf in tree.get_terminals():
        name = leaf.name.strip()
        try:
            index = name.index(".")
        except ValueError:
            # No dot in the name: use its last two underscore-separated fields.
            name = "_".join(name.split("_")[-2:])
        else:
            # Keep what starts three characters after the first dot.
            name = name[index + 3:]
        for color, members in zip(colors, list_of_groups):
            if name in members:
                leaf.color = color
    Phylo.write(tree, args.output_file, "phyloxml")
Beispiel #43
0
def rootTree(f, root,output):
	"""Re-root a Newick tree on an outgroup and write the result.

	``f`` is the input tree, ``output`` the destination. ``root`` is either
	a single outgroup or a comma-separated list of taxa, in which case the
	tree is rooted on their common ancestor.
	"""
	tree = Phylo.read(f, 'newick')
	outgroup = tree.common_ancestor(root.split(',')) if ',' in root else root
	tree.root_with_outgroup(outgroup)
	Phylo.write(tree, output, 'newick')
Beispiel #44
0
def main():
    """Label the input tree and write the transmission edges.

    Command-line arguments are parsed into the module-level ``args``. Without
    ``args.times`` the edges of the first labeled tree are written in the old
    TNet format; with it, a summary of edge counts over all labeled trees is
    written. ``args.labeledtrees`` additionally dumps the labeled trees to
    ``<OUTPUT_FILE>.tree`` in Newick format.
    """
    global args
    args = parse_arguments()

    # set up the input tree and its leaf / internal nodes
    input_tree = initialize_tree(args.INPUT_TREE_FILE)
    initialize_leaf_nodes(input_tree)
    initialize_internal_nodes(input_tree)

    # internal-node labeling is seeded for reproducibility
    np.random.seed(args.seed)
    labeled_trees = get_labeled_trees(input_tree)

    if args.times:
        # summary output
        edge_count = get_transmission_edge_count(labeled_trees)
        write_transmission_edges_summary(edge_count)
    else:
        # old TNet output format
        first_tree = labeled_trees[0]
        transmission_edges = get_transmission_edges(first_tree)
        write_transmission_edges(args.OUTPUT_FILE, first_tree.root.name,
                                 transmission_edges)

    # optional output
    if args.labeledtrees:
        Phylo.write(labeled_trees, args.OUTPUT_FILE + '.tree', 'newick')
Beispiel #45
0
def labeler(files, etalon_tree, tree_path=".", rebuild=False):
    """
    Label each alignment with the reconstruction method whose tree is closest
    (symmetric difference) to the reference tree.

    :param files: sequence of paths to FASTA alignments; it is iterated more
        than once and passed to len(), so a one-shot iterator will not work
    :param etalon_tree: path to the reference tree, joined onto tree_path
    :param tree_path: directory where built trees are stored and read from
    :param rebuild: set it True to (re)build the UPGMA, NJ and RAxML trees
        before comparing
    :return: 1-D numpy array with one label per file:
        0 = NJ, 1 = UPGMA, 2 = RAxML
    """
    tree_path = osp.abspath(tree_path)  # raxml needs absolute paths
    if rebuild:
        calculator = TreeConstruction.DistanceCalculator('blosum62')
        dist_constructor = TreeConstruction.DistanceTreeConstructor()

        # construct all trees with UPGMA, NJ and raxml
        for aln_path in files:
            name = aln_path.split("/")[-1].split(".")[0]
            aln = AlignIO.read(aln_path, 'fasta')
            # The distance matrix is the expensive part; compute it once and
            # share it (upgma() and nj() each work on their own copy).
            distance_matrix = calculator.get_distance(aln)
            tree = dist_constructor.upgma(distance_matrix)
            Phylo.write(tree, osp.join(tree_path, 'upgma_{}.tre'.format(name)),
                        'newick')
            tree = dist_constructor.nj(distance_matrix)
            Phylo.write(tree, osp.join(tree_path, 'nj_{}.tre'.format(name)),
                        'newick')
            raxml = RaxmlCommandline(sequences=osp.abspath(aln_path),
                                     model='PROTCATWAG',
                                     name='{}.tre'.format(name),
                                     threads=3,
                                     working_dir=tree_path)
            _, stderr = raxml()
            print(stderr)
            print('{} finished'.format(name))

    # get best tree
    tns = dendropy.TaxonNamespace()

    def load_tree(file_name):
        """Read a Newick tree from tree_path into the shared taxon namespace."""
        return dendropy.Tree.get_from_path(osp.join(tree_path, file_name),
                                           "newick",
                                           taxon_namespace=tns)

    act_tree = load_tree(etalon_tree)
    act_tree.encode_bipartitions()

    # Column order of the distance table: NJ, UPGMA, RAxML.
    patterns = ("nj_{}.tre", "upgma_{}.tre", "RAxML_bestTree.{}.tre")
    distances = np.zeros(shape=(len(files), 3))
    for i, aln_path in enumerate(files):
        name = aln_path.split("/")[-1].split(".")[0]
        candidates = [load_tree(pattern.format(name)) for pattern in patterns]
        for column, candidate in enumerate(candidates):
            distances[i, column] = \
                dendropy.calculate.treecompare.symmetric_difference(
                    candidate, act_tree)
    return distances.argmin(1)
Beispiel #46
0
    def taxid2tree(self, taxid_list, out_fmt="newick"):
        """ Build a tree from the paths of a list of taxids and return it
            serialized as text.

            For each taxid, self.get_path(taxid) gives its path; the first
            element of every path is taken as the root and must be the same
            for all of them. Every element of a path becomes a node whose
            parent is the element before it.

            taxid_list : taxids to include (must not be empty)
            out_fmt    : newick / phyloxml ... (passed to the tree writer)

            Raises ValueError if taxid_list is empty and AssertionError if
            the paths do not share one root.
        """
        # get pathes for a list of taxid
        path_list = [";".join([str(item) for item in self.get_path(taxid)])
                     for taxid in taxid_list]
        if not path_list:
            # Without any path there is no root node to build a tree from.
            raise ValueError("taxid_list is empty, no tree can be built")

        nodes = {}  # data format {"node_name": Clade_object}
        root = None

        # parse the paths one by one
        for i, path in enumerate(path_list):
            line = path.strip().split(";")
            if root is None:
                root = line[0]
            else:
                assert root == line[0], "The %d-th line is from a different root"%(i+1)

            # walk from leaf to root so that every node knows its parent
            leaf2root = line[::-1]

            for j, child_node in enumerate(leaf2root):
                if child_node in nodes:
                    continue
                # the root node's parent is itself
                if j == len(line)-1:
                    parent_node = child_node
                else:
                    parent_node = leaf2root[j+1]
                nodes[child_node] = Newick.Clade(name=child_node)
                # remember the parent's name until the tree is wired up
                nodes[child_node].parent = parent_node

        for node_name, node_clade in nodes.items():
            # find the root node, its parent is itself
            if node_name == node_clade.parent:
                root_node = node_clade
                print("root node is %s, constructing tree ..."%(str(node_name)))
            # any other node is attached to its parent's clades
            else:
                parent_node = nodes[node_clade.parent]
                parent_node.clades.append(node_clade)
            # the helper attribute is not part of the final tree
            del node_clade.parent

        tree = Newick.Tree(root = root_node)
        treeFile = StringIO()
        bp.write(tree, treeFile, out_fmt)
        treeStr = treeFile.getvalue()
        return treeStr
def trim_tree(absenteeList, TreeFile, Inclusive):
    """Prune species missing from the sequence file, unroot the tree and mark
    the foreground branches, then write the tree in Newick format.

    Uses the module-level names CladeList, Model and TreeOutFileName. Pruned
    taxa are also removed from CladeList (unless CladeList is the string
    "none"). When Model is "2", "#1" marks the foreground: for a clade of
    several species the common ancestor is named "#1" and, unless Inclusive is
    'no', every clade leaf gets "#1" appended; for a single remaining species
    that leaf gets "#1" appended. Progress is printed along the way.
    """
    print("\nReading the Tree...")
    # parse the tree using Phylo
    tree = Phylo.read(TreeFile, 'newick')
    print("Here is the starting tree:")
    Phylo.draw_ascii(tree)
    terminals = tree.get_terminals()
    print("\nFound the following {} taxa in the tree:".format(len(terminals)))
    print(terminals)

    # prune away taxa that are not included for this sequence file
    for taxon in absenteeList:
        tree.prune(taxon)
        if CladeList != "none" and taxon in CladeList:
            CladeList.remove(taxon)
    print("\nPruned away these species:")
    print(absenteeList)
    print("\nHere is the tree with the missing taxa pruned away:\n")
    Phylo.draw_ascii(tree)

    # Without a clock PAML wants an unrooted tree, i.e. a trifurcation at the
    # first node, so collapse the second internal node of a bifurcating tree.
    internal_nodes = tree.get_nonterminals()
    if internal_nodes[0].is_bifurcating():
        tree.collapse(internal_nodes[1])

    # Foreground branches belong to the monophyletic group given with -clade.
    # By default only the branch leading to the clade gets "#1"; with -inc
    # other than 'no' the terminal branches of the clade get it as well.
    if Model == "2":
        print("\nAssigning the foreground branches in the tree based on the species given in the clade file...")
        print("These species make up the forground clade:")
        for spp in CladeList:
            print(spp)

        several_species = len(CladeList) > 1
        if several_species:
            # the common ancestor of the clade is the foreground lineage for
            # the branch sites model
            tree.common_ancestor(CladeList).name = "#1"
        # A sole remaining species always represents the lineage itself;
        # several species are only marked individually when inclusive.
        if not several_species or Inclusive != 'no':
            for leaf in tree.get_terminals():
                if leaf.name in CladeList:
                    leaf.name += "#1"

    # for any other model the pruned tree is written as is
    print("\nOutputting the following revised tree for the species content of the sequence file")
    print("it should have a trifurcation at the base unless you are using a clock\n")
    Phylo.draw_ascii(tree)
    Phylo.write(tree, TreeOutFileName, "newick")
 def test_consensus(self):
     """Write 100 copies of one phylogeny, run the consensus and check that
     consensus.tre exists afterwards."""
     replicates = [self.phylo] * 100
     with open('distribution.tre', 'w') as handle:
         Phylo.write(replicates, handle, 'newick')
     ptools.consensus(outdir='.', min_freq=0.5,
                      is_rooted=True, trees_splits_encoded=False)
     self.assertTrue(os.path.isfile('consensus.tre'))
 def test_built_tree(self):
     """The tree built from the alignment matches the first line of the
     reference Newick file."""
     tree = self.constructor.build_tree(self.aln)
     self.assertTrue(isinstance(tree, BaseTree.Tree))
     tree_file = StringIO.StringIO()
     Phylo.write(tree, tree_file, 'newick')
     # Close the reference file even when the comparison fails.
     with open('./TreeConstruction/nj.tre') as ref_tree:
         self.assertEqual(tree_file.getvalue(), ref_tree.readline())
Beispiel #50
0
 def test_format_branch_length(self):
     """Branch lengths honour a custom Newick format string."""
     mem_file = StringIO()
     tree = Phylo.read(StringIO("A:0.1;"), "newick")
     Phylo.write(tree, mem_file, "newick", format_branch_length="%.0e")
     written = mem_file.getvalue().strip()
     # Py2.5 compat: Windows with Py2.5- pads the exponent to three digits
     # (1e-001); every other platform writes 1e-01.
     accepted = ["A:1e-01;", "A:1e-001;"]
     self.assertTrue(written in accepted)
 def build_nj_tree(self):
     """Return the neighbour-joining tree of self.distance_matrix() as a
     Newick string."""
     matrix = self.distance_matrix()
     nj_tree = DistanceTreeConstructor().nj(matrix)
     sink = StringIO.StringIO()
     Phylo.write(nj_tree, sink, 'newick')
     newick_text = sink.getvalue()
     sink.close()
     return newick_text
Beispiel #52
0
	def export(self):
		"""Draw the tree to image files, write it as Newick and export for auspice.

		The tree is converted with to_Biopython(), ladderized and drawn with
		muttree_draw(); one image ``tree.<fmt>`` is saved under self.outdir for
		every entry of self.formats. Afterwards the mutations of each node are
		folded into its name and the tree is written to ``tree.nwk``. Finally
		export_to_auspice() is called.
		"""
		from bio_draw import muttree_draw
		def select_fontsize(n):
			"""Return a font size that shrinks as n grows: 12, 10 or 8."""
			if n<10:
				return 12
			elif n<50:
				return 10
			else:
				return 8

		def branch_label_func(n):
			"""Return the branch label of node n: its comma-separated mutations
			(aa_muts if present, else nuc_muts), cut off after max_muts entries
			with a "+ N others" suffix."""
			max_muts = 15
			if hasattr(n,'aa_muts'):
				muts = n.aa_muts
			else:
				muts = n.nuc_muts
			tmp = muts.split(',')
			if len(tmp)>max_muts:
				return ', '.join(tmp[:max_muts])+' + '+str(len(tmp)-max_muts)+' others'
			else:
				return ', '.join(tmp)

		from Bio import Phylo
		import matplotlib.pyplot as plt
		# font size is chosen from the number of viruses
		plt.rcParams.update({'font.size':select_fontsize(len(self.viruses))})
		plt.ioff()
		from tree_util import to_Biopython
		tmp_tree = to_Biopython(self.tree)
		tmp_tree.ladderize()
		# NOTE(review): presumably opens and closes any existing 'Tree' figure so
		# that the figsize below applies to a fresh one -- confirm
		fig = plt.figure('Tree')
		plt.close()
		# figure height grows with the number of viruses
		fig = plt.figure('Tree', figsize = (15,2+len(self.viruses)/5))
		ax = plt.subplot('111')

		muttree_draw(tmp_tree, axes=ax, show_confidence=False, do_show=False,
			label_func = lambda x: x.name,
			branch_labels = branch_label_func
			)
		ax.invert_yaxis()
		# scale bar: half the spacing between the first two x ticks, drawn at
		# y = len(self.viruses) and labelled with its length
		tl = np.diff(ax.get_xticks())[0]
		lengthbar = tl/2
		plt.plot( [0,lengthbar],[len(self.viruses),len(self.viruses)], lw=10, c='k')
		plt.text(lengthbar/2, len(self.viruses)+0.1, str(lengthbar),horizontalalignment='center',fontsize=16)
		ax.set_axis_off()
		# one image per requested format
		for fmt in self.formats:
			plt.savefig(self.outdir+'tree.'+fmt)

		# append each node's mutations (joined by '_') to its name, separated
		# from a non-empty name by '-' when there are mutations
		for t in tmp_tree.find_clades():
			if t.name is None:
				t.name=''
			muts = t.aa_muts if hasattr(t,'aa_muts') else t.nuc_muts
			if len(t.name) and len(muts): t.name+='-'
			t.name+='_'.join(muts.split(','))

		Phylo.write(tmp_tree, self.outdir+'tree.nwk', 'newick')

		self.export_to_auspice(tree_fields = ['aa_muts','num_date']+self.fasta_fields.values())
Beispiel #53
0
    def dump(self, treefile, nodefile):
        """Write the tree as Newick and pickle selected node attributes.

        Args:
            treefile: destination of the Newick tree.
            nodefile: destination of the pickle; it holds a dict that maps
                each node name to a dict of those attributes listed in
                ``self.dump_attr`` that the node actually has.
        """
        from Bio import Phylo
        Phylo.write(self.tree, treefile, 'newick')
        node_props = {}
        for node in self.tree.find_clades():
            # getattr() resolves attributes the same way the hasattr() guard
            # does, unlike calling __getattribute__ directly.
            node_props[node.name] = {attr: getattr(node, attr)
                                     for attr in self.dump_attr
                                     if hasattr(node, attr)}

        with myopen(nodefile, 'w') as nfile:
            pickle.dump(node_props, nfile)
Beispiel #54
0
def main():
    """Fetch the fly gene tree and alignment that correspond to a beetle MSA.

    Command line: <orthologues_input.json> <msa_fname.fasta> <fly output folder>.
    The digits found in the MSA file name number the outputs
    ``fly<digits>.fasta`` and ``fly<digits>.tree`` in the output folder. The
    first gene reported for the MSA is looked up in the orthologues; the
    program exits with status 1 (silently) when it is missing or has fewer
    than ``minimum_species`` entries. Otherwise the gene tree of the first
    orthologue is fetched, alignment and tree are written, and
    ``get_rid_of_bootstrap`` is run on the tree file.

    Returns 0 on success.
    """
    if len(sys.argv) != 4:
        sys.stderr.write("Usage: %s <orthologues_input.json> <msa_fname.fasta> <fly output folder>" % sys.argv[0] + "\n")
        sys.exit(1)

    json_input, msa_fname, fly_directory = sys.argv[1:4]
    msa_number = "".join(ch for ch in msa_fname if ch.isdigit())
    fly_fasta_path = ("%s/fly%s.fasta") % (fly_directory, msa_number)
    fly_tree_path = ("%s/fly%s.tree") % (fly_directory, msa_number)
    minimum_species = 10  # arbitrary

    global the_name
    the_name = msa_fname

    if not os.path.exists(msa_fname):
        sys.stderr.write("Error: %s does not exist!" % msa_fname + "\n")
        sys.exit(1)

    orthologues = get_homology_information_fromfile(json_input)

    # The species list of the MSA is not used: the check for a minimum number
    # of beetle species is disabled.
    _, tc_genes = get_msa_species(msa_fname)
    tc_gene = tc_genes[0]

    # missing from the orthologues: give up quietly
    if tc_gene not in orthologues:
        sys.exit(1)

    # not enough fly species: give up quietly
    fly_orthologues = orthologues[tc_gene]
    if len(fly_orthologues) < minimum_species:
        sys.exit(1)

    first_species = next(iter(fly_orthologues))
    first_gene = fly_orthologues[first_species][0]
    _, fly_alignment, tree = get_genetree(first_species, first_gene)

    AlignIO.write(fly_alignment, fly_fasta_path, 'fasta')
    Phylo.write(tree, fly_tree_path, 'newick')
    get_rid_of_bootstrap(fly_tree_path)

    return 0
Beispiel #55
0
def runBSSC(workingFile, srp_tree_file, debug):
    """Run BSSC and derive the reference sequence, seq-gen input and tree file.

    BSSC is run (repeatedly, until ``<workingFile>_BSSC.paup`` exists) on
    ``<workingFile>_BSSC.par``. From the PAUP result the tree on the
    ``tree true_tree_1`` line is read, node names matching MATCH_TREE_NODE
    are renamed to ``hap_<index - 1>``, and the first sequence of the
    alignment is used as ancestor. Three files are written:
    ``<workingFile>.cons`` (FASTA reference), ``<workingFile>_seqgen.phylip``
    (ancestor sequence plus tree) and ``srp_tree_file`` (tree only).

    Args:
        workingFile: path prefix of all input and output files.
        srp_tree_file: path of the Newick tree file to write.
        debug: unused.

    Raises:
        ValueError: if the PAUP result holds no ``tree true_tree_1`` line
            (previously an unbound-name error).
    """
    bssc = runExtProg(bsscDir + "BSSC_original", pdir=bsscDir, length=3)
    bssc.set_param_at("-f", 1)
    bssc.set_param_at(workingFile + "_BSSC.par", 2)
    bssc.set_param_at(1, 3)

    bssc_paup_result = workingFile + "_BSSC.paup"

    # Remove a stale result so that the loop below waits for a fresh one.
    try:
        os.remove(bssc_paup_result)
    except OSError:
        pass

    while not os.path.exists(bssc_paup_result):
        bssc.run()

    tree = None
    with open(bssc_paup_result, "rU") as input_handle:
        for line in input_handle:
            if line.find("tree true_tree_1") > 0:
                line = line.strip()
                # the Newick string starts three characters after "U]"
                start = line.index("U]") + 3
                treeString = line[start:]
                tree = Phylo.read(StringIO(treeString), 'newick')
    if tree is None:
        raise ValueError(
            "no 'tree true_tree_1' line found in %s" % bssc_paup_result)

    for clade in tree.find_clades():
        if clade.name:
            match = re.match(MATCH_TREE_NODE, clade.name)
            if match:
                index = match.group(2)
                clade.name = "hap_" + str(int(index) - 1)

    with open(bssc_paup_result, "rU") as input_handle:
        sequences = AlignIO.read(input_handle, "nexus")
    seq = sequences[0]

    with open(workingFile + ".cons", "w") as ref_handle:
        ref_handle.write(">%s\n%s\n" % ("Ref", seq.seq))

    with open(workingFile + "_seqgen.phylip", "w") as output_handle:
        output_handle.write("1 1200\n")
        output_handle.write("%s %s\n" % ("ancestor", seq.seq))
        output_handle.write("1\n")
        Phylo.write(tree, output_handle, "newick")

    with open(srp_tree_file, "w") as output_handle:
        Phylo.write(tree, output_handle, "newick")
 def save_treeanc_results(self):
     """Write tree, alignment and GTR model to the root directory and bundle
     the three files into a zip archive.

     The file names come from the module-level out_tree_nwk, out_aln_fasta,
     out_gtr and zipname; inside the archive the files are stored under
     their bare names.
     """
     #  files to be displayed in the web interface
     Phylo.write(self.tree, os.path.join(self._root_dir, out_tree_nwk), 'newick')
     self._save_alignment()
     self._save_gtr()
     with zipfile.ZipFile(os.path.join(self._root_dir, zipname), 'w') as out_zip:
         for member in (out_tree_nwk, out_aln_fasta, out_gtr):
             out_zip.write(os.path.join(self._root_dir, member), arcname=member)
Beispiel #57
0
def D_seq_matrix(fasta_file):
    """Return the identity distance matrix of a FASTA alignment.

    As a side effect the UPGMA tree built from the matrix is written to
    ``ph_seq.nre`` (Newick) and the taxon names are printed.
    """
    alignment = AlignIO.read(fasta_file, 'fasta')
    dm = DistanceCalculator('identity').get_distance(alignment)
    upgma_tree = DistanceTreeConstructor().upgma(dm)
    Phylo.write(upgma_tree, 'ph_seq.nre', 'newick')
    print(dm.names)
    return dm
Beispiel #58
0
    def test_convert_phyloxml_to_newick_branch_length_only(self):
        """Write phyloxml with bootstrap values to newick format using branch_length_only=True"""
        trees = Phylo.parse(EX_APAF, "phyloxml")
        tmp_filename = tempfile.mktemp()

        try:
            Phylo.write(trees, tmp_filename, "newick", branch_length_only=True)
        except TypeError:
            self.fail()
        finally:
            # Clean up on every path, including a failed write.
            if os.path.exists(tmp_filename):
                os.remove(tmp_filename)