def tajimas_d(char_matrix, ignore_uncertain=True):
    """
    Deprecated forwarding shim for ``popgenstat.tajimas_d``.

    Emits a DendroPy deprecation warning telling the caller that
    'dendropy.popgenstat' has moved to 'dendropy.calculate.popgenstat',
    then forwards both arguments, by keyword, to the ``tajimas_d`` of the
    ``popgenstat`` module in scope and returns its result unchanged.
    """
    relocation_notice = dict(
        preamble="The 'dendropy.popgenstat' module has moved to 'dendropy.calculate.popgenstat'.",
        old_construct="from dendropy import popgenstat\npopgenstat.tajimas_d(...)",
        new_construct="from dendropy.calculate import popgenstat\npopgenstat.tajimas_d(...)",
    )
    deprecate.dendropy_deprecation_warning(**relocation_notice)
    result = popgenstat.tajimas_d(
        char_matrix=char_matrix,
        ignore_uncertain=ignore_uncertain,
    )
    return result
def tajimas_d(char_matrix, ignore_uncertain=True):
    """
    Backward-compatibility wrapper around ``popgenstat.tajimas_d``.

    A deprecation warning is issued first (the 'dendropy.popgenstat' module
    has moved to 'dendropy.calculate.popgenstat'); the call is then delegated
    with the same ``char_matrix`` and ``ignore_uncertain`` values.
    """
    # Message pieces shown to the user: what moved, the old usage, the new usage.
    moved_message = "The 'dendropy.popgenstat' module has moved to 'dendropy.calculate.popgenstat'."
    old_usage = "from dendropy import popgenstat\npopgenstat.tajimas_d(...)"
    new_usage = "from dendropy.calculate import popgenstat\npopgenstat.tajimas_d(...)"
    deprecate.dendropy_deprecation_warning(
        preamble=moved_message,
        old_construct=old_usage,
        new_construct=new_usage,
    )
    forwarded_kwargs = {
        "char_matrix": char_matrix,
        "ignore_uncertain": ignore_uncertain,
    }
    return popgenstat.tajimas_d(**forwarded_kwargs)
def test_tajimas_d(self):
    """Tajima's D of ``self.data`` (ignore_uncertain=True) matches 1.12467 to 4 places."""
    observed = popgenstat.tajimas_d(self.data, ignore_uncertain=True)
    self.assertAlmostEqual(observed, 1.12467, places=4)
def testTajimasD(self):
    """Tajima's D of ``self.matrix`` matches the reference value to 4 places."""
    expected_d = -1.44617198561
    computed_d = popgenstat.tajimas_d(self.matrix)
    self.assertAlmostEqual(computed_d, expected_d, 4)
def test_tajimas_d(self):
    """Compare Tajima's D for ``self.data`` against 1.12467 (4 decimal places)."""
    reference_value = 1.12467
    decimal_places = 4
    self.assertAlmostEqual(
        popgenstat.tajimas_d(self.data, ignore_uncertain=True),
        reference_value,
        decimal_places,
    )
def testTajimasD_with_missing(self):
    """Tajima's D of ``self.matrix_with_missing`` matches the reference value to 4 places."""
    statistic = popgenstat.tajimas_d(self.matrix_with_missing)
    self.assertAlmostEqual(statistic, -1.44617198561, places=4)
def main():
    """
    Build BPP analysis inputs and SGE job scripts from containing trees.

    Command-line driven. For each source tree file (given as arguments, and/or
    listed one per line on standard input when '-' is among the arguments):

    1. Read the tree and copy selected tree annotations into a manifest row.
    2. Generate contained gene trees with ``generate_contained_trees``.
    3. Write ``<job>.input.imap.txt`` (one line per gene-tree taxon).
    4. Generate character matrices with SeqGen and write them, in PHYLIP
       format, to ``<job>.input.chars.txt``; per-locus nucleotide diversity
       and Tajima's D are reported on standard error.
    5. Fill ``BPP_TEMPLATE`` and write it to ``<job>.input.bpp.ctl``.
    6. Write an SGE job script ``<job>.job.sge`` that runs ``bpp`` on it.

    Finally, all manifest rows are written to ``<title>_manifest.csv``.

    Exits through ``parser.error`` if no source tree paths remain after
    reading the arguments and standard input.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "source_trees",
        metavar="SOURCE_TREEFILE [SOURCE_TREEFILE [SOURCE_TREEFILE]]",
        nargs="+",
        help="Path to containing tree files. Specify '-' to read from standard input.")
    parser.add_argument(
        "-f", "--input-format",
        default="nexus",
        dest="schema",
        # Fixed: was "$(default)s", which argparse printed literally.
        help="Input trees format (default: %(default)s).")
    parser.add_argument(
        "-z", "--random-seed",
        type=int,
        default=None,
        help="Seed for random number generator engine.")
    parser.add_argument(
        "-t", "--title",
        default="bpprun",
        help="Run title (default: '%(default)s')")
    data_options = parser.add_argument_group("Data Options")
    # NOTE(review): type=int combined with a float default (1.0); values given
    # on the command line are parsed as int. Left unchanged so that manifest
    # output and downstream arithmetic stay exactly as before.
    data_options.add_argument(
        "--population-size",
        type=int,
        default=1.0,
        help="Population size (default: %(default)s).")
    data_options.add_argument(
        "--num-individuals-per-population",
        type=int,
        default=4,
        help="Number of individuals sampled per incipient species lineage (default: %(default)s).")
    data_options.add_argument(
        "--num-loci-per-individual",
        type=int,
        default=10,
        help="Number of loci sampled per individual (default: %(default)s).")
    data_options.add_argument(
        "--num-characters-per-locus",
        type=int,
        default=1000,
        help="Number of characters sampled per locus (default: %(default)s).")
    data_options.add_argument(
        "--mutation-rate-per-site",
        type=float,
        default=1e-8,
        help="Per-site mutation rate (default: %(default)s).")
    parser.add_argument(
        "--no-scale-tree-by-mutation-rate",
        action="store_true",
        help="Do not scale tree by mutation rate.")
    args = parser.parse_args()

    # Always log the seed actually used so a run can be reproduced.
    if args.random_seed is None:
        random_seed = random.randint(0, sys.maxsize-1)
    else:
        random_seed = args.random_seed
    rng = random.Random(random_seed)
    _log("Random seed: {}".format(random_seed))

    sg = seqgen.SeqGen()
    sg.seq_len = args.num_characters_per_locus
    sg.scale_branch_lens = args.mutation_rate_per_site

    if "-" in args.source_trees:
        # One path per line on stdin; blank lines (including the one produced
        # by a trailing newline) are skipped instead of being treated as paths.
        filepaths = [line.strip() for line in sys.stdin.read().splitlines() if line.strip()]
        args.source_trees.remove("-")
    else:
        filepaths = []
    filepaths.extend(args.source_trees)
    if not filepaths:
        parser.error("no source tree files specified")

    # Tree annotations copied verbatim (coerced to float where possible) into
    # each manifest row, in this column order.
    annotation_keys = (
        "speciation_initiation_from_orthospecies_rate",
        "speciation_initiation_from_incipient_species_rate",
        "speciation_completion_rate",
        "orthospecies_extinction_rate",
        "incipient_species_extinction_rate",
        "max_time",
        "max_extant_orthospecies",
        "num_extant_lineages",
        "num_extant_orthospecies",
    )

    manifest_entries = []
    for idx, filepath in enumerate(filepaths):
        job_title = "{}_{:05d}".format(args.title, idx+1)
        manifest_entry = collections.OrderedDict()
        _log("{} of {}: {}: {}".format(idx+1, len(filepaths), job_title, filepath))
        source_tree = dendropy.Tree.get(
            path=filepath,
            schema=args.schema,
            extract_comment_metadata=True,
            preserve_underscores=True,
        )
        for key in annotation_keys:
            manifest_entry[key] = try_to_coerce_to_float(source_tree.annotations[key].value)
        manifest_entry["source_tree_type"] = source_tree.annotations["tree_type"].value
        manifest_entry["population_size"] = args.population_size
        manifest_entry["num_individuals_per_population"] = args.num_individuals_per_population
        manifest_entry["num_loci_per_individual"] = args.num_loci_per_individual
        manifest_entry["mutation_rate_per_site"] = args.mutation_rate_per_site

        source_tree.calc_node_ages()
        gene_trees = generate_contained_trees(
            containing_tree=source_tree,
            num_individuals_per_population=args.num_individuals_per_population,
            num_gene_trees=args.num_loci_per_individual,
            population_size=args.population_size,
            rng=rng,
        )

        # Imap file: second '^'-separated field of each gene-tree taxon label,
        # followed by that taxon's population label.
        imap_filepath = "{}.input.imap.txt".format(job_title)
        with open(imap_filepath, "w") as imap_file:
            for taxon in gene_trees.taxon_namespace:
                imap_file.write("{} {}\n".format(taxon.label.split("^")[1], taxon.population_label))
            imap_file.write("\n")

        # Character data: one PHYLIP matrix per locus, each followed by a
        # blank line.
        d0 = sg.generate(gene_trees)
        chars_filepath = "{}.input.chars.txt".format(job_title)
        with open(chars_filepath, "w") as chars_file:
            for cm_idx, cm in enumerate(d0.char_matrices):
                sys.stderr.write("Locus {}: pi = {}, Tajima's D = {}\n".format(
                    cm_idx+1,
                    popgenstat.nucleotide_diversity(cm),
                    popgenstat.tajimas_d(cm)))
                cm.write(file=chars_file, schema="phylip")
                chars_file.write("\n")

        out_filepath = "{}.results.out.txt".format(job_title)
        mcmc_filepath = "{}.results.mcmc.txt".format(job_title)
        num_species = len(source_tree.taxon_namespace)
        species_labels = " ".join(t.label for t in source_tree.taxon_namespace)
        num_individuals_per_species = " ".join(
            [str(args.num_individuals_per_population)] * num_species)

        # Inverse Gamma Prior
        # IG(a,b), with mean given by b/(a-1)
        # So,
        #   thetaprior 3 0.002
        # has a mean of
        #   0.002/(3-1) = 0.001
        theta_prior_mean = args.population_size * 4 * args.mutation_rate_per_site
        theta_prior_a = 3.0
        theta_prior_b = theta_prior_mean * (theta_prior_a - 1)
        if args.no_scale_tree_by_mutation_rate:
            tau_prior_mean = source_tree.seed_node.age
        else:
            tau_prior_mean = source_tree.seed_node.age * args.mutation_rate_per_site * (1.0 / (args.num_loci_per_individual * args.num_characters_per_locus))
        tau_prior_a = 3.0
        tau_prior_b = tau_prior_mean * (tau_prior_a - 1)

        # Fixed: previously len(species_labels), i.e. the character count of
        # the space-joined label string rather than the number of lineages.
        manifest_entry["num_input_lineages"] = num_species
        manifest_entry["theta"] = theta_prior_mean
        manifest_entry["theta_prior_a"] = theta_prior_a
        manifest_entry["theta_prior_b"] = theta_prior_b
        manifest_entry["root_age"] = source_tree.seed_node.age
        manifest_entry["tau_prior_a"] = tau_prior_a
        manifest_entry["tau_prior_b"] = tau_prior_b

        species_tree = source_tree.as_string(
            schema="newick",
            suppress_leaf_taxon_labels=False,
            suppress_leaf_node_labels=True,
            suppress_internal_taxon_labels=True,
            suppress_internal_node_labels=True,
            suppress_rooting=True,
            suppress_edge_lengths=True,
            unquoted_underscores=True,
            preserve_spaces=True,
            store_tree_weights=False,
            suppress_annotations=True,
            suppress_item_comments=True,
        )
        bpp_config = BPP_TEMPLATE.format(
            chars_filepath=chars_filepath,
            imap_filepath=imap_filepath,
            out_filepath=out_filepath,
            mcmc_filepath=mcmc_filepath,
            num_species=num_species,
            species_labels=species_labels,
            num_individuals_per_species=num_individuals_per_species,
            species_tree=species_tree,
            theta_prior_mean=theta_prior_mean,
            theta_prior_a=theta_prior_a,
            theta_prior_b=theta_prior_b,
            tau_prior_mean=tau_prior_mean,
            tau_prior_a=tau_prior_a,
            tau_prior_b=tau_prior_b,
            num_loci=args.num_loci_per_individual,
            root_age=source_tree.seed_node.age
        )
        bpp_ctl_filepath = "{}.input.bpp.ctl".format(job_title)
        with open(bpp_ctl_filepath, "w") as ctl_file:
            ctl_file.write(bpp_config)
            ctl_file.write("\n")

        # SGE submission script that runs bpp on the control file.
        with open("{}.job.sge".format(job_title), "w") as jobf:
            jobf.write("#! /bin/bash\n")
            jobf.write("#$ -cwd\n")
            jobf.write("#$ -V\n")
            jobf.write("#$ -S /bin/bash\n")
            jobf.write("#$ -l h_vmem=12G\n")
            jobf.write("#$ -l virtual_free=12G\n")
            jobf.write("bpp --cfile {}\n".format(bpp_ctl_filepath))

        manifest_entry["source_tree_path"] = filepath
        manifest_entry["results_filepath"] = out_filepath
        manifest_entry["mcmc_filepath"] = mcmc_filepath
        manifest_entries.append(manifest_entry)

    # Every row is built with the same keys in the same order, so the first
    # row defines the CSV header.
    out = _open_output_file_for_csv_writer(
        filepath="{}_manifest.csv".format(args.title),
        append=False)
    with out:
        writer = csv.DictWriter(
            out,
            fieldnames=list(manifest_entries[0].keys()),
            restval="NA",
            delimiter=",",
            lineterminator=os.linesep,
        )
        writer.writeheader()
        writer.writerows(manifest_entries)
# -*- coding: utf-8 -*-
"""
Run as: python estimate_SFS_stats.py $inputfile

Reads a FASTA alignment of DNA sequences, computes Tajima's D, Watterson's
theta, the average number of pairwise differences and the number of
segregating sites, and appends them as one tab-separated line to
"popgen_stats.txt" in the current working directory.

The population, gene and "fbtr" labels written alongside the statistics are
parsed from the second "/"-separated component of the input path, which is
expected to have the form <gene>_<fbtr>_<pop>.<extension>.
"""

import dendropy
from dendropy.calculate import popgenstat
import os
import sys

if len(sys.argv) < 2:
    sys.exit("usage: python estimate_SFS_stats.py $inputfile")

input_path = sys.argv[1]
seqs = dendropy.DnaCharacterMatrix.get(path=input_path, schema="fasta")

# Split the file-name component once instead of re-splitting per field.
name_fields = input_path.split("/")[1].split("_")
pop = name_fields[2].split(".")[0]
gene = name_fields[0]
fbtr = name_fields[1]

td = popgenstat.tajimas_d(seqs)
tw = popgenstat.wattersons_theta(seqs)
tp = popgenstat.average_number_of_pairwise_differences(seqs)
ss = popgenstat.num_segregating_sites(seqs)

# Append mode: successive runs accumulate rows in the same file. The context
# manager guarantees the row is flushed and the handle closed.
with open("popgen_stats.txt", "a") as out:
    out.write("\t".join(str(v) for v in (pop, gene, fbtr, td, tw, tp, ss)) + "\n")