Beispiel #1
0
 def test_raxml(self):
     """Smoke-test the RAxML wrapper on the example PHYLIP alignment."""
     cmd = RaxmlCommandline(
         raxml_exe,
         sequences=EX_PHYLIP,
         model="PROTCATWAG",
         name="test",
     )
     # No seed was passed above, yet the wrapper is expected to supply
     # the parsimony seed option on its own.
     rendered = str(cmd)
     self.assertIn("-p", rendered)

     # Files RAxML may leave behind for the run named "test".  They are
     # deleted whatever the outcome, because RAxML complains during the
     # next run if they still exist.
     generated = (
         "RAxML_info.test",
         "RAxML_log.test",
         "RAxML_parsimonyTree.test",
         "RAxML_result.test",
         # Present in 7.2.X+  but not 7.0.4:
         "RAxML_bestTree.test",
     )
     try:
         out, err = cmd()
         self.assertTrue(len(out) > 0)
         self.assertEqual(len(err), 0)
         # The resulting tree must contain the four expected leaves.
         tree = Phylo.read("RAxML_result.test", "newick")
         self.assertEqual(tree.count_terminals(), 4)
     finally:
         for leftover in filter(os.path.isfile, generated):
             os.remove(leftover)
Beispiel #2
0
def SH_raxml(reftree, querytree, phy_file, shout="SH_out.txt"):
    """
    SH test using RAxML

    querytree can be a single tree or a bunch of trees (eg. from bootstrapping)

    RAxML is run with algorithm "h" and model GTRGAMMA in a "raxml_work"
    directory created next to `phy_file`, with `reftree` as the starting
    tree and `querytree` as the bipartition file.  The first
    "Significantly...:..." line found in RAxML's standard output is appended
    to `shout` as "<basename of querytree>\t<line>"; when no such line is
    found, "SH test failed." is written to stderr instead.

    Returns the name of the output file, whether or not a result was found.
    """
    assert op.isfile(reftree)
    shout = must_open(shout, "a")

    # Everything below runs under try/finally so the output handle is
    # closed even when RAxML itself cannot be run.
    try:
        raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
        mkdir(raxml_work)
        raxml_cl = RaxmlCommandline(
            cmd=RAXML_BIN("raxmlHPC"),
            sequences=phy_file,
            algorithm="h",
            model="GTRGAMMA",
            name="SH",
            starting_tree=reftree,
            bipartition_filename=querytree,
            working_dir=raxml_work,
        )

        logging.debug("Running SH test in RAxML: %s", raxml_cl)
        o, _ = raxml_cl()

        # hard coded: the result is recognised purely by this text pattern
        match = re.search('(Significantly.*:.*)', o)
        if match is None:
            # Explicit "no match" check instead of a bare except, which
            # also hid KeyboardInterrupt and genuine programming errors.
            sys.stderr.write("SH test failed.\n")
        else:
            pval = match.group(0).strip()
            pval = pval.replace("\t", " ").replace("%", "\\%")
            shout.write("{0}\t{1}\n".format(op.basename(querytree), pval))
            logging.debug("SH p-value appended to %s", shout.name)
    finally:
        shout.close()

    return shout.name
Beispiel #3
0
def labeler(files, etalon_tree, tree_path=".", rebuild=False):
    """
    Constructs labels for given files. (Best phylogeny reconstruction method)

    For every alignment, the NJ, UPGMA and RAxML trees stored in `tree_path`
    are compared with the etalon tree by symmetric difference.  The label is
    the column index of the smallest distance: 0 = NJ, 1 = UPGMA, 2 = RAxML.

    :param files: an iterable with file paths to alignments
    :param etalon_tree: the path to etalon tree (joined onto `tree_path`)
    :param tree_path: a directory, where built trees will be stored
    :param rebuild: set it True, if you need to rebuild trees or build them from scratch
    :return: array with one label per input file (result of ``argmin``)
    """
    # The paths are counted and walked more than once; materialise them so
    # that one-shot iterables (e.g. generators) work as the docstring promises.
    files = list(files)
    tree_path = osp.abspath(tree_path)  # raxml needs absolute paths

    def base_name(path):
        # Last "/"-separated component, cut at its first dot.
        return path.split("/")[-1].split(".")[0]

    if rebuild:
        calculator = TreeConstruction.DistanceCalculator('blosum62')
        dist_constructor = TreeConstruction.DistanceTreeConstructor()

        # construct all trees with UPGMA, NJ and raxml
        for path in files:
            aln = AlignIO.read(path, 'fasta')
            name = base_name(path)
            tree = dist_constructor.upgma(calculator.get_distance(aln))
            Phylo.write(tree, osp.join(tree_path, 'upgma_{}.tre'.format(name)),
                        'newick')
            tree = dist_constructor.nj(calculator.get_distance(aln))
            Phylo.write(tree, osp.join(tree_path, 'nj_{}.tre'.format(name)),
                        'newick')
            raxml = RaxmlCommandline(sequences=osp.abspath(path),
                                     model='PROTCATWAG',
                                     name='{}.tre'.format(name),
                                     threads=3,
                                     working_dir=tree_path)
            _, stderr = raxml()
            print(stderr)
            print('{} finished'.format(name))

    # get best tree
    tns = dendropy.TaxonNamespace()

    def load_tree(tree_file):
        # All trees share one taxon namespace so they can be compared.
        return dendropy.Tree.get_from_path(osp.join(tree_path, tree_file),
                                           "newick",
                                           taxon_namespace=tns)

    act_tree = load_tree(etalon_tree)
    act_tree.encode_bipartitions()

    # Column order defines the meaning of the returned labels.
    templates = ("nj_{}.tre", "upgma_{}.tre", "RAxML_bestTree.{}.tre")
    distances = np.zeros(shape=(len(files), len(templates)))
    for row, path in enumerate(files):
        name = base_name(path)
        # Load all candidate trees first, then compare them to the etalon.
        candidates = [load_tree(template.format(name))
                      for template in templates]
        for col, candidate in enumerate(candidates):
            distances[row, col] = (
                dendropy.calculate.treecompare.symmetric_difference(
                    candidate, act_tree))
    return distances.argmin(1)
Beispiel #4
0
def build_ml_raxml(alignment, outfile, work_dir=".", **kwargs):
    """
    build maximum likelihood tree of DNA seqs with RAxML

    The alignment is written in relaxed PHYLIP format to
    ``<work_dir>/work/aln.phy``.  RAxML is then run in
    ``<work_dir>/work/raxml_work`` with algorithm "a", model GTRGAMMA,
    100 replicates and fixed seeds.  Extra keyword arguments are forwarded
    to ``RaxmlCommandline``.

    Returns ``(outfile, phy_file)`` when RAxML produced
    ``RAxML_bipartitions.aln`` (copied to `outfile`), otherwise ``None``.
    The ``raxml_work`` directory is removed in both cases.
    """
    work_dir = op.join(work_dir, "work")
    mkdir(work_dir)
    phy_file = op.join(work_dir, "aln.phy")
    # `file()` only exists on Python 2 and the handle was never closed
    # explicitly.  The context manager works on both major versions and
    # guarantees the alignment is on disk before RAxML reads it.
    with open(phy_file, "w") as handle:
        AlignIO.write(alignment, handle, "phylip-relaxed")

    raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
    mkdir(raxml_work)
    raxml_cl = RaxmlCommandline(
        cmd=RAXML_BIN("raxmlHPC"),
        sequences=phy_file,
        algorithm="a",
        model="GTRGAMMA",
        parsimony_seed=12345,
        rapid_bootstrap_seed=12345,
        num_replicates=100,
        name="aln",
        working_dir=raxml_work,
        **kwargs
    )

    logging.debug("Building ML tree using RAxML: %s", raxml_cl)
    raxml_cl()

    # NOTE(review): the paths below are interpolated into shell commands
    # unquoted, so directories containing spaces will not work.
    tree_file = "{0}/RAxML_bipartitions.aln".format(raxml_work)
    if not op.exists(tree_file):
        sys.stderr.write("***RAxML failed.\n")
        sh("rm -rf %s" % raxml_work, log=False)
        return None
    sh("cp {0} {1}".format(tree_file, outfile), log=False)

    logging.debug("ML tree printed to %s", outfile)
    sh("rm -rf %s" % raxml_work)

    return outfile, phy_file
def generate_phylo():
    """Build a tree from ``trimmed_muscleout.aln`` with three RAxML runs.

    Any ``RAxML_*`` files in the current directory are deleted first.  The
    runs are then:

    1. ``T1`` - likelihood search (GTRGAMMA, ``BOOTSTRAP`` replicates).
    2. ``T2`` - the same with a bootstrap seed, i.e. the bootstrap search.
    3. ``T3.nwk`` - algorithm "b" (GTRCAT), with ``RAxML_bestTree.T1`` as
       the starting tree and ``RAxML_bootstrap.T2`` as the bipartition file.

    If any run raises, the error is printed and the process exits with
    status 1.
    """
    import sys

    # Plain loop: the removals are side effects, not a list to be built.
    for stale in glob.glob("RAxML_*"):
        os.remove(stale)

    def run_raxml(**options):
        # Input, thread count and executable are common to all three runs.
        cline = RaxmlCommandline(sequences="trimmed_muscleout.aln",
                                 threads=NPROC,
                                 cmd=raxml_exe,
                                 **options)
        return cline()

    print("Generating phylogenetic tree using the multiple algnment output using RAxML...")
    try:
        #best_likelihood
        print("1/3")
        run_raxml(model="GTRGAMMA",
                  name="T1",
                  parsimony_seed=12345,
                  num_replicates=BOOTSTRAP)
        #bootstrap search
        print("2/3")
        run_raxml(model="GTRGAMMA",
                  name="T2",
                  parsimony_seed=12345,
                  num_replicates=BOOTSTRAP,
                  bootstrap_seed=12345)
        #draw bipartition
        print("3/3")
        run_raxml(model="GTRCAT",
                  name="T3.nwk",
                  algorithm="b",
                  starting_tree="RAxML_bestTree.T1",
                  bipartition_filename="RAxML_bootstrap.T2")
    except Exception as e:
        print(str(e))
        # A failed run must not look like success to the calling shell.
        # sys.exit is also available where the site-provided exit() is not.
        sys.exit(1)
Beispiel #6
0
def _gap_free_regions(records, length):
    """Return half-open (start, end) column ranges where no record has '-'."""
    regions = []
    start = None
    for col in range(length):
        if any(rec[col] == '-' for rec in records):
            if start is not None:
                regions.append((start, col))
            start = None
        elif start is None:
            start = col
    if start is not None:
        # The open region runs through the last column, so its exclusive
        # end is `length`, consistent with the regions closed by a gap.
        regions.append((start, length))
    return regions


def create_tree_from_seqs(filename,
                          out,
                          outgroup='',
                          model='GTRGAMMA',
                          tail_deletion=False,
                          n=40,
                          delete_outgroup=True):
    """Build a tree from a FASTA alignment with RAxML and return it.

    RAxML is run with `model`, a fixed parsimony seed and `n` replicates,
    under the run name obtained from ``parse_path(out)``.  Among the
    ``RAxML_*`` files in the current directory whose name contains that run
    name, the best tree is moved to `out` and the others are deleted.
    ``tmp.dnd`` is removed if present.  The tree is then loaded with ete3,
    passed to ``outgroup_tree`` unless `outgroup` is None, written back to
    `out` and returned freshly loaded from there.

    :param filename: path of the FASTA alignment
    :param out: path the final tree is written to
    :param outgroup: forwarded to ``outgroup_tree``
    :param model: RAxML substitution model
    :param tail_deletion: if True, drop the leading alignment columns up to
        the start of the best-scoring gap-free region before running RAxML.
        The score is the region length divided by (start + 1).  A temporary
        ``<filename>.tmp.fasta`` is used for that run.
    :param n: passed to RAxML as ``num_replicates``
    :param delete_outgroup: forwarded to ``outgroup_tree``
    :raises ValueError: with `tail_deletion`, when the alignment has no
        gap-free column
    """
    _, output_filename = parse_path(out)
    if tail_deletion:
        alns = AlignIO.read(filename, 'fasta')
        regions = _gap_free_regions(alns, len(alns[0]))
        if not regions:
            raise ValueError(
                "no gap-free column in %s; tail deletion is not possible"
                % filename)
        s, _ = max(regions, key=lambda reg: (reg[1] - reg[0]) / (reg[0] + 1))
        alns = MSA([aln[s:] for aln in alns])
        filename = filename + '.tmp.fasta'
        AlignIO.write(alns, filename, 'fasta')
    try:
        RaxmlCommandline(sequences=filename,
                         model=model,
                         parsimony_seed=2018,
                         num_replicates=n,
                         name=output_filename)()
    finally:
        # Do not leave the trimmed copy behind when RAxML fails.
        if tail_deletion:
            os.remove(filename)
    for produced in glob.glob("RAxML_*"):
        if output_filename not in produced:
            continue
        if produced.endswith('bestTree.' + output_filename):
            os.rename(produced, out)
        else:
            os.remove(produced)
    if os.path.isfile('tmp.dnd'):
        os.remove('tmp.dnd')
    tree = ete3.Tree(out)
    os.remove(out)
    # NOTE(review): the default outgroup is '' rather than None, so this
    # branch is also taken when no outgroup is given - confirm that
    # outgroup_tree handles an empty string as intended.
    if outgroup is not None:
        outgroup_tree(tree, outgroup, delete_outgroup)
    tree.write(outfile=out)
    return ete3.Tree(out)
Beispiel #7
0
 def generate_dist(self):
     """Align ``self.fasta_seq``, build a RAxML tree and export its distances.

     Files written (paths relative to the current working directory):

     * ``testing/alignment.phy`` - MAFFT alignment (PHYLIP output)
     * ``testing/align.fasta`` - the same alignment converted to FASTA
     * RAxML output for the run ``reversatest`` in ``self.cwPath``
     * ``distance.csv`` - phylogenetic distance matrix of the result tree
     """
     mafft_cline = MafftCommandline(input=self.fasta_seq,
                                    maxiterate=1000,
                                    localpair=True,
                                    phylipout=True)
     alignment_text, _ = mafft_cline()
     #Save alignments into  FASTA and PHYLIP format
     phyFile = 'testing/alignment.phy'
     # Context manager: the handle is closed even if the write fails.
     with open(phyFile, 'w') as outPhy:
         outPhy.write(alignment_text)
     fastaFile = 'testing/align.fasta'
     SeqIO.convert(phyFile, 'phylip', fastaFile, 'fasta')
     #Create phylogenetic tree of the original sequences
     raxml_cline = RaxmlCommandline(sequences=phyFile,
                                    model='GTRGAMMA',
                                    name='reversatest',
                                    working_dir=self.cwPath)
     raxml_cline()
     #Calculate the phylo distances between each branch of the tree
     # NOTE(review): RAxML writes into self.cwPath while the tree is read
     # from "testing/" - presumably the same directory; confirm.
     tree = dendropy.Tree.get_from_path("testing/RAxML_result.reversatest",
                                        "newick")
     pdm = tree.phylogenetic_distance_matrix()
     pdm.write_csv('distance.csv')
Beispiel #8
0
            time.sleep(3)
            sys.exit()

        print("Multiple sequence alignment complete")

        #code block executes or skips trimal
        if trimal == 'yes':
            print("Optimization in progress")
            subprocess.call(['trimal', '-in', out_file, '-out', out_file2])
            print("Alignment optimization complete")
        elif trimal == 'no':
            out_file2 = out_file
        else:
            print("error: invalid input, terminating program")
            time.sleep(3)
            sys.exit()

        print("Beginning phylogenetic tree construction")

        #code block executes RAxML tree construction
        raxml_cline = RaxmlCommandline(sequences=out_file2, model="PROTCATWAG", name=f+".nwk")
        child = subprocess.call(str(raxml_cline), shell=(sys.platform!=platform))

        #i variable is used to count the number of files that are iterated over
        i = i + 1
# Final summary: the number of input files handled by the loop above,
# framed by the separator output of twoLines().
twoLines()
print("Total number of files analyzed: {}".format(i))
twoLines()

#sys.exit() is not used, this way commands and selections can be copied and pasted for documentation
Beispiel #9
0
    def roundTwo(self):
        """Leave-one-out round: place each query against a reduced reference.

        After running ``roundOne``, every record of every file in
        ``self.query_name`` is processed.  The text before the first "_" of
        the record id names the sequence to leave out, and the third
        "_"-separated field of the file path is used as cluster id.

        For each left-out sequence, unless
        ``testing/alignment minus<seq>.phy`` (without the space) already
        exists, the reference alignment ``testing/alignment.phy`` is
        rewritten without that sequence, converted to FASTA, and a RAxML
        reference tree is built from it.  The query file is then added to
        that FASTA alignment with mafft and placed with pplacer, unless the
        corresponding ``.jplace`` file is already present under
        ``pplacer/``.

        Returns ``(self.jfile, self.jfileMinus)``; the second lists the
        jplace file names requested from pplacer in this call.
        """
        self.roundOne()
        self.jfileMinus = []
        for j in self.query_name:
            clust_id = j.split('_')[2]
            for query in SeqIO.parse(j, 'fasta'):
                seq = query.id.split('_')[0]
                #Create special identifier for each round of files
                number = 'minus%s' % seq
                id_jfile = '%s_minus%s' % (clust_id, seq)

                rax_name = 'reversatest%s' % number
                fasta_name = 'testing/align%s.fasta' % number
                phy_name = 'testing/alignment%s.phy' % number

                if not os.path.isfile(phy_name):
                    # Read inside a context manager so the reference
                    # alignment handle is closed again.
                    with open('testing/alignment.phy') as reference:
                        record = AlignIO.read(reference, 'phylip')
                    edited = MultipleSeqAlignment([])
                    for entry in record:
                        if entry.id != seq:
                            edited.append(entry)

                    #write the alignment minus a sequence
                    with open(phy_name, 'w') as out:
                        AlignIO.write(edited, out, 'phylip')
                    #convert PHYLIP to FASTA format
                    SeqIO.convert(
                        phy_name,
                        'phylip',
                        fasta_name,
                        'fasta',
                    )

                    #Create reference tree
                    raxml_line = RaxmlCommandline(sequences=phy_name,
                                                  model='GTRGAMMA',
                                                  name=rax_name,
                                                  working_dir=self.cwPath)
                    raxml_line()

                #Add query sequences to the previous alignment
                multiali_name = 'testing/multiple_ali%s.fasta' % id_jfile

                # NOTE(review): nothing in this method writes the file
                # tested here, so the check looks like it always passes -
                # confirm which file was meant.
                if not os.path.isfile('testing/alignment%s.phy' % id_jfile):
                    # NOTE(review): file names are interpolated into shell
                    # commands unquoted; they must not contain spaces or
                    # shell metacharacters.
                    os.system('mafft --add %s --quiet --reorder %s >%s' %
                              (j, fasta_name, multiali_name))

                    jason_name = 'multiple_ali%s.jplace' % id_jfile

                    #wrap pplacer
                    if not os.path.isfile('pplacer/%s' % jason_name):
                        self.jfileMinus.append(jason_name)
                        os.system(
                            'pplacer --out-dir pplacer  -p -t testing/RAxML_result.%s -s testing/RAxML_info.%s %s'
                            % (rax_name, rax_name, multiali_name))
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print(self.jfile)
        print(self.jfileMinus)

        return self.jfile, self.jfileMinus
Beispiel #10
0
#Muscle Alignment
from Bio.Align.Applications import MuscleCommandline
from Bio.Phylo.Applications import RaxmlCommandline

#Path relative to project
in_file = input('file_name= ')
out_file = "align.fasta"
muscle_exe = "muscle"
#using default fasta output
cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file)
# Building the command line object does not run MUSCLE.  It has to be
# called, otherwise out_file never exists when RAxML is started.
cline()

#creating best tree file
raxml_cline = RaxmlCommandline(sequences=out_file,
                               model="PROTCATWAG",
                               name="tree")
# The alignment is already supplied through `sequences`.  A positional
# argument to the call would be sent to the program's standard input.
raxml_cline()