Example #1
0
def tajimas_d(char_matrix, ignore_uncertain=True):
    """Deprecated entry point that forwards to ``popgenstat.tajimas_d``.

    A DendroPy deprecation warning describing the module move is issued
    first; then ``char_matrix`` and ``ignore_uncertain`` are passed through
    unchanged and the delegate's result is returned.
    """
    warning_details = {
        "preamble": "The 'dendropy.popgenstat' module has moved to 'dendropy.calculate.popgenstat'.",
        "old_construct": "from dendropy import popgenstat\npopgenstat.tajimas_d(...)",
        "new_construct": "from dendropy.calculate import popgenstat\npopgenstat.tajimas_d(...)",
    }
    deprecate.dendropy_deprecation_warning(**warning_details)
    return popgenstat.tajimas_d(
        char_matrix=char_matrix,
        ignore_uncertain=ignore_uncertain,
    )
Example #2
0
def tajimas_d(char_matrix, ignore_uncertain=True):
    """Warn that this module has moved, then delegate to ``popgenstat.tajimas_d``.

    Both arguments are forwarded by keyword exactly as received, and the
    value returned by the delegate is returned to the caller.
    """
    moved_notice = "The 'dendropy.popgenstat' module has moved to 'dendropy.calculate.popgenstat'."
    legacy_usage = "from dendropy import popgenstat\npopgenstat.tajimas_d(...)"
    current_usage = "from dendropy.calculate import popgenstat\npopgenstat.tajimas_d(...)"
    deprecate.dendropy_deprecation_warning(
        preamble=moved_notice,
        old_construct=legacy_usage,
        new_construct=current_usage,
    )
    result = popgenstat.tajimas_d(char_matrix=char_matrix, ignore_uncertain=ignore_uncertain)
    return result
Example #3
0
 def test_tajimas_d(self):
     """Check Tajima's D of ``self.data`` (ignore_uncertain=True) against 1.12467."""
     expected = 1.12467
     observed = popgenstat.tajimas_d(self.data, ignore_uncertain=True)
     self.assertAlmostEqual(observed, expected, 4)
Example #4
0
 def testTajimasD(self):
     """Check Tajima's D of ``self.matrix`` against the reference value."""
     reference_value = -1.44617198561
     computed_value = popgenstat.tajimas_d(self.matrix)
     self.assertAlmostEqual(computed_value, reference_value, 4)
Example #5
0
 def test_tajimas_d(self):
     """Tajima's D for ``self.data``, with uncertain states ignored, should be ~1.12467."""
     d_statistic = popgenstat.tajimas_d(self.data, ignore_uncertain=True)
     # Third positional argument is the number of places used for the comparison.
     self.assertAlmostEqual(d_statistic, 1.12467, 4)
Example #6
0
 def testTajimasD(self):
     """Tajima's D for ``self.matrix`` should be ~-1.44617198561."""
     d_statistic = popgenstat.tajimas_d(self.matrix)
     self.assertAlmostEqual(d_statistic, -1.44617198561, 4)
 def testTajimasD_with_missing(self):
     """Check Tajima's D of ``self.matrix_with_missing`` against the reference value."""
     reference_value = -1.44617198561
     computed_value = popgenstat.tajimas_d(self.matrix_with_missing)
     self.assertAlmostEqual(computed_value, reference_value, 4)
def main():
    """Build BPP inputs, job scripts and a CSV manifest for each source tree.

    Command-line arguments select the source tree files (with '-' meaning
    "read additional paths from standard input"), the tree schema, a random
    seed, a run title, and the simulation parameters.

    For every source tree, numbered ``<title>_<NNNNN>``, this:

    1. copies selected tree annotations and the run parameters into a
       manifest entry;
    2. generates contained gene trees and writes ``.input.imap.txt``;
    3. generates character matrices with SeqGen, writes them in PHYLIP
       format to ``.input.chars.txt``, and reports nucleotide diversity and
       Tajima's D for each locus on standard error;
    4. derives the theta and tau prior parameters and writes the formatted
       ``BPP_TEMPLATE`` to ``.input.bpp.ctl``;
    5. writes a ``.job.sge`` shell script that invokes ``bpp`` on that
       control file.

    Finally all manifest entries are written to ``<title>_manifest.csv``.
    Exits through ``parser.error`` if no source tree paths were supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_trees",
            metavar="SOURCE_TREEFILE [SOURCE_TREEFILE [SOURCE_TREEFILE]]",
            nargs="+",
            help="Path to containing tree files. Specify '-' to read from standard input.")
    # Fixed: the placeholder was written as '$(default)s', which argparse
    # prints literally instead of substituting the default value.
    parser.add_argument("-f", "--input-format",
            default="nexus",
            dest="schema",
            help="Input trees format (default: %(default)s).")
    parser.add_argument("-z", "--random-seed",
            type=int,
            default=None,
            help="Seed for random number generator engine.")
    parser.add_argument("-t", "--title",
            default="bpprun",
            help="Run title (default: '%(default)s')")
    data_options = parser.add_argument_group("Data Options")
    # NOTE(review): the declared type is int but the default is the float 1.0;
    # left as-is because changing either would alter the values written out.
    data_options.add_argument("--population-size",
            type=int,
            default=1.0,
            help="Population size (default: %(default)s).")
    data_options.add_argument("--num-individuals-per-population",
            type=int,
            default=4,
            help="Number of individuals sampled per incipient species lineage (default: %(default)s).")
    data_options.add_argument("--num-loci-per-individual",
            type=int,
            default=10,
            help="Number of loci sampled per individual (default: %(default)s).")
    data_options.add_argument("--num-characters-per-locus",
            type=int,
            default=1000,
            help="Number of characters sampled per locus (default: %(default)s).")
    data_options.add_argument("--mutation-rate-per-site",
            type=float,
            # default=0.00001,
            default=1e-8,
            help="Per-site mutation rate (default: %(default)s).")
    parser.add_argument("--no-scale-tree-by-mutation-rate",
            action="store_true",
            help="Do not scale tree by mutation rate.")
    args = parser.parse_args()

    if args.random_seed is None:
        random_seed = random.randint(0, sys.maxsize-1)
    else:
        random_seed = args.random_seed
    rng = random.Random(random_seed)
    _log("Random seed: {}".format(random_seed))

    sg = seqgen.SeqGen()
    sg.seq_len = args.num_characters_per_locus
    sg.scale_branch_lens = args.mutation_rate_per_site

    if "-" in args.source_trees:
        # Drop blank lines: a trailing newline on stdin would otherwise yield
        # an empty path that cannot be opened.
        filepaths = [path for path in sys.stdin.read().split("\n") if path.strip()]
        args.source_trees.remove("-")
    else:
        filepaths = []
    filepaths.extend(args.source_trees)
    if not filepaths:
        # Without at least one tree there is no manifest header to derive.
        parser.error("no source tree files specified")

    # Tree annotations copied into the manifest (in this order) after an
    # attempted float conversion.
    float_annotation_keys = (
        "speciation_initiation_from_orthospecies_rate",
        "speciation_initiation_from_incipient_species_rate",
        "speciation_completion_rate",
        "orthospecies_extinction_rate",
        "incipient_species_extinction_rate",
        "max_time",
        "max_extant_orthospecies",
        "num_extant_lineages",
        "num_extant_orthospecies",
    )

    manifest_entries = []
    for idx, filepath in enumerate(filepaths):
        job_title = "{}_{:05d}".format(args.title, idx+1)
        manifest_entry = collections.OrderedDict()
        _log("{} of {}: {}: {}".format(idx+1, len(filepaths), job_title, filepath))
        source_tree = dendropy.Tree.get(
                path=filepath,
                schema=args.schema,
                extract_comment_metadata=True,
                preserve_underscores=True,
                )

        for key in float_annotation_keys:
            manifest_entry[key] = try_to_coerce_to_float(source_tree.annotations[key].value)
        manifest_entry["source_tree_type"] = source_tree.annotations["tree_type"].value
        manifest_entry["population_size"] = args.population_size
        manifest_entry["num_individuals_per_population"] = args.num_individuals_per_population
        manifest_entry["num_loci_per_individual"] = args.num_loci_per_individual
        manifest_entry["mutation_rate_per_site"] = args.mutation_rate_per_site

        source_tree.calc_node_ages()

        gene_trees = generate_contained_trees(
                containing_tree=source_tree,
                num_individuals_per_population=args.num_individuals_per_population,
                num_gene_trees=args.num_loci_per_individual,
                population_size=args.population_size,
                rng=rng,
                )

        # One line per gene-tree taxon: the part of the label after the
        # first "^", followed by the taxon's population label.
        imap_filepath = "{}.input.imap.txt".format(job_title)
        with open(imap_filepath, "w") as imap_file:
            for taxon in gene_trees.taxon_namespace:
                imap_file.write("{}    {}\n".format(taxon.label.split("^")[1], taxon.population_label))
                # imap_file.write("{}    {}\n".format(taxon.label.split("^")[0], taxon.population_label))
                # imap_file.write("{}    {}\n".format(taxon.label, taxon.population_label))
            imap_file.write("\n")

        d0 = sg.generate(gene_trees)
        chars_filepath = "{}.input.chars.txt".format(job_title)
        with open(chars_filepath, "w") as chars_file:
            for cm_idx, cm in enumerate(d0.char_matrices):
                sys.stderr.write("Locus {}: pi = {}, Tajima's D = {}\n".format(
                    cm_idx+1,
                    popgenstat.nucleotide_diversity(cm),
                    popgenstat.tajimas_d(cm)))
                cm.write(file=chars_file, schema="phylip")
                chars_file.write("\n")

        out_filepath = "{}.results.out.txt".format(job_title)
        mcmc_filepath = "{}.results.mcmc.txt".format(job_title)
        num_species = len(source_tree.taxon_namespace)
        species_labels = " ".join(t.label for t in source_tree.taxon_namespace)
        num_individuals_per_species = " ".join(
                [str(args.num_individuals_per_population)] * num_species)

        # Inverse Gamma Prior
        # IG(a,b), with mean given by b/(a-1)
        # So,
        #   thetaprior 3 0.002
        # has a mean of
        #   0.002/(3-1) = 0.001
        theta_prior_mean = args.population_size * 4 * args.mutation_rate_per_site
        theta_prior_a = 3.0
        theta_prior_b = theta_prior_mean * (theta_prior_a - 1)
        if args.no_scale_tree_by_mutation_rate:
            tau_prior_mean = source_tree.seed_node.age
        else:
            # tau_prior_mean = source_tree.seed_node.age * args.population_size * 4 * args.mutation_rate_per_site
            tau_prior_mean = source_tree.seed_node.age * args.mutation_rate_per_site * (1.0 / (args.num_loci_per_individual * args.num_characters_per_locus))
            # tau_prior_mean = source_tree.seed_node.age / 100000
        tau_prior_a = 3.0
        tau_prior_b = tau_prior_mean * (tau_prior_a - 1)

        # Fixed: this previously stored len(species_labels), i.e. the number
        # of characters in the space-joined label string, not a lineage count.
        manifest_entry["num_input_lineages"] = num_species
        manifest_entry["theta"] = theta_prior_mean
        manifest_entry["theta_prior_a"] = theta_prior_a
        manifest_entry["theta_prior_b"] = theta_prior_b
        manifest_entry["root_age"] = source_tree.seed_node.age
        manifest_entry["tau_prior_a"] = tau_prior_a
        manifest_entry["tau_prior_b"] = tau_prior_b

        species_tree = source_tree.as_string(
                schema="newick",
                suppress_leaf_taxon_labels=False,
                suppress_leaf_node_labels=True,
                suppress_internal_taxon_labels=True,
                suppress_internal_node_labels=True,
                suppress_rooting=True,
                suppress_edge_lengths=True,
                unquoted_underscores=True,
                preserve_spaces=True,
                store_tree_weights=False,
                suppress_annotations=True,
                suppress_item_comments=True,
                )
        bpp_config = BPP_TEMPLATE.format(
                chars_filepath=chars_filepath,
                imap_filepath=imap_filepath,
                out_filepath=out_filepath,
                mcmc_filepath=mcmc_filepath,
                num_species=num_species,
                species_labels=species_labels,
                num_individuals_per_species=num_individuals_per_species,
                species_tree=species_tree,
                theta_prior_mean=theta_prior_mean,
                theta_prior_a=theta_prior_a,
                theta_prior_b=theta_prior_b,
                tau_prior_mean=tau_prior_mean,
                tau_prior_a=tau_prior_a,
                tau_prior_b=tau_prior_b,
                num_loci=args.num_loci_per_individual,
                root_age=source_tree.seed_node.age
                )
        bpp_ctl_filepath = "{}.input.bpp.ctl".format(job_title)
        with open(bpp_ctl_filepath, "w") as ctl_file:
            ctl_file.write(bpp_config)
            ctl_file.write("\n")

        with open("{}.job.sge".format(job_title), "w") as jobf:
            jobf.write("#! /bin/bash\n")
            jobf.write("#$ -cwd\n")
            jobf.write("#$ -V\n")
            jobf.write("#$ -S /bin/bash\n")
            jobf.write("#$ -l h_vmem=12G\n")
            jobf.write("#$ -l virtual_free=12G\n")
            jobf.write("bpp --cfile {}\n".format(bpp_ctl_filepath))

        manifest_entry["source_tree_path"] = filepath
        manifest_entry["results_filepath"] = out_filepath
        manifest_entry["mcmc_filepath"] = mcmc_filepath
        manifest_entries.append(manifest_entry)

    out = _open_output_file_for_csv_writer(
            filepath="{}_manifest.csv".format(args.title),
            append=False)
    with out:
        # Column order follows the insertion order of the first entry.
        writer = csv.DictWriter(
                out,
                fieldnames=manifest_entries[0].keys(),
                restval="NA",
                delimiter=",",
                lineterminator=os.linesep,
                )
        writer.writeheader()
        writer.writerows(manifest_entries)
 def testTajimasD_with_missing(self):
     """Tajima's D for ``self.matrix_with_missing`` should be ~-1.44617198561."""
     d_statistic = popgenstat.tajimas_d(self.matrix_with_missing)
     self.assertAlmostEqual(d_statistic, -1.44617198561, 4)
Example #10
0
# -*- coding: utf-8 -*-

"""Run as: python estimate_SFS_stats.py $inputfile

Appends one tab-separated line of statistics to 'popgen_stats.txt'.
"""

import dendropy
from dendropy.calculate import popgenstat
import os
import sys

# Load the FASTA alignment named by the first command-line argument.
seqs = dendropy.DnaCharacterMatrix.get(path=sys.argv[1], schema="fasta")

# The labels come from the second "/"-separated component of the input path,
# split on "_": field 0 is the gene, field 1 is fbtr, and field 2 (up to its
# first ".") is the population.
name_fields = sys.argv[1].split("/")[1].split("_")
pop = name_fields[2].split(".")[0]
gene = name_fields[0]
fbtr = name_fields[1]

td = popgenstat.tajimas_d(seqs)
tw = popgenstat.wattersons_theta(seqs)
tp = popgenstat.average_number_of_pairwise_differences(seqs)
ss = popgenstat.num_segregating_sites(seqs)

# Append one tab-separated record; the context manager guarantees the handle
# is flushed and closed (the original left it open until interpreter exit).
with open("popgen_stats.txt", "a") as out:
    out.write("\t".join(
        str(field) for field in (pop, gene, fbtr, td, tw, tp, ss)) + "\n")