Example #1
0
def seqgen_to_file(files, seqgen_vals):
    """Simulate sequences with Seq-Gen for each tree file and save them.

    Every path in ``files`` is read as a single tree, using the file
    extension as the dendropy schema name.  Sequences are generated on that
    tree with branch lengths scaled by 0.1, and the first resulting
    character matrix is written, in the same schema, to a file named
    ``seq_<original file name>`` in the same directory as the input.

    Args:
        files: Iterable of tree file paths.  The extension of each path
            (the text after the last ``.`` of the file name) is passed to
            dendropy as the schema for both reading and writing.
        seqgen_vals: Dict that is modified in place: each value ``v`` is
            replaced by ``seqgen.SeqGen(v)``.  The replaced values are not
            used for the simulation performed here.
    """
    import os  # local import: only the path handling below needs it

    generator = seqgen.SeqGen()
    generator.scale_branch_lens = 0.1
    # NOTE(review): this overwrites the caller's dict values with SeqGen
    # objects and nothing below reads them.  Kept for backward
    # compatibility; confirm whether the intent was to configure
    # ``generator`` from these values instead.
    for key, value in list(seqgen_vals.items()):
        seqgen_vals[key] = seqgen.SeqGen(value)
    for tree_path in files:
        # Split on the real extension so that paths such as "./x.nexus" or
        # "run.1/x.nexus" are not mis-parsed by a naive split on ".".
        directory, basename = os.path.split(tree_path)
        schema = os.path.splitext(basename)[1].lstrip(".")
        tree = dendropy.Tree.get(path=tree_path, schema=schema)
        out_path = os.path.join(directory, "seq_{}".format(basename))
        dataset = generator.generate(tree)
        with open(out_path, "w") as out:
            out.write(dataset.char_matrices[0].as_string(schema))
Example #2
0
    def generate_sequences(self,
                           species_name,
                           samples_per_pop=10,
                           seq_len=2000,
                           use_seq_gen=True):
        """Simulate ``seq_len`` characters on ``self.mutation_tree``.

        Calls ``self.generate_pop_tree`` and ``self.generate_gene_tree`` for
        the given species first, then fills a new ``dendropy.DataSet`` built
        on the mutation tree's taxon namespace.

        When ``self.use_seq_gen`` is exactly ``True`` the data come from a
        ``seqgen.SeqGen`` run configured with an HKY model, this object's
        ``kappa`` (halved to give the ti/tv value), ``base_freqs``, ``rng``
        and ``seqgen_path``.  Otherwise ``discrete.hky85_chars`` is used
        with a mutation rate of 1.0, kappa of 1.0 and equal base
        frequencies.

        NOTE(review): the ``use_seq_gen`` argument is accepted but never
        read; only the ``self.use_seq_gen`` attribute selects the branch.

        Returns:
            The data set holding the simulated characters.
        """
        tree_kwargs = dict(species_name=species_name,
                           samples_per_pop=samples_per_pop)
        self.generate_pop_tree(**tree_kwargs)
        self.generate_gene_tree(**tree_kwargs)
        dataset = dendropy.DataSet(self.mutation_tree.taxon_namespace)

        if self.use_seq_gen is not True:
            # Built-in simulator with fixed, uniform HKY85 settings.
            simulated = discrete.hky85_chars(
                seq_len=seq_len,
                tree_model=self.mutation_tree,
                mutation_rate=1.0,
                kappa=1.0,
                base_freqs=[0.25, 0.25, 0.25, 0.25],
                root_states=None,
                rng=self.rng)
            dataset.add_char_matrix(simulated)
            return dataset

        # Seq-Gen wrapper path: one quiet replicate on the mutation tree.
        simulator = seqgen.SeqGen()
        simulator.seqgen_path = self.seqgen_path
        simulator.num_replicates = 1
        simulator.quiet = True
        simulator.rng = self.rng
        simulator.seq_len = seq_len
        simulator.char_model = 'HKY'
        simulator.ti_tv = float(self.kappa) / 2
        simulator.state_freqs = self.base_freqs
        simulator.trees = [self.mutation_tree]
        return simulator.generate_dataset(dataset=dataset)
Example #3
0
def simulate_gtr_matrix(tree, seq_length, frequencies, rates, branch_scale):
    """Simulate an alignment on ``tree`` under GTR and return it as FASTA.

    Args:
        tree: Tree handed to ``SeqGen.generate``.
        seq_length: Value assigned to the generator's ``seq_len``.
        frequencies: Value assigned to the generator's ``state_freqs``.
        rates: Value assigned to the generator's ``general_rates``.
        branch_scale: Value assigned to the generator's
            ``scale_branch_lens``.

    Returns:
        The first character matrix of the generated data set, rendered
        with the ``fasta`` schema.
    """
    simulator = seqgen.SeqGen()
    settings = (
        ("char_model", seqgen.SeqGen.GTR),
        ("state_freqs", frequencies),
        ("general_rates", rates),
        ("scale_branch_lens", branch_scale),
        ("seq_len", seq_length),
    )
    for attr_name, attr_value in settings:
        setattr(simulator, attr_name, attr_value)
    dataset = simulator.generate(tree)
    return dataset.char_matrices[0].as_string('fasta')
Example #4
0
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import dendropy
from dendropy.interop import seqgen

trees = dendropy.TreeList.get(path="pythonidae.mcmc.nex", schema="nexus")
s = seqgen.SeqGen()

# generate one alignment per tree
# as substitution model is not specified, defaults to a JC model
# will result in a DataSet object with one DnaCharacterMatrix per input tree
d0 = s.generate(trees)
print(len(d0.char_matrices))
print(d0.char_matrices[0].as_string("nexus"))

# instruct Seq-Gen to scale branch lengths by factor of 0.1
# note that this does not modify the input trees
s.scale_branch_lens = 0.1

# more complex model
s.char_model = seqgen.SeqGen.GTR
s.state_freqs = [0.4, 0.4, 0.1, 0.1]
s.general_rates = [0.8, 0.4, 0.4, 0.2, 0.2, 0.1]
d1 = s.generate(trees)
# report the data set just generated under the GTR settings (d1), not the
# earlier JC result (d0)
print(len(d1.char_matrices))
print(d1.char_matrices[0].as_string("nexus"))
def main():
    """Build BPP inputs, SGE job scripts and a CSV manifest for source trees.

    Source tree paths come from the command line; if ``-`` is among them,
    additional paths are read from standard input, one per line.  For each
    source tree this:

    * reads the tree (with comment metadata) and copies its simulation
      annotations plus the data options into a manifest entry;
    * generates contained gene trees with ``generate_contained_trees`` and
      simulates character matrices on them with Seq-Gen;
    * writes ``<job>.input.imap.txt``, ``<job>.input.chars.txt``,
      ``<job>.input.bpp.ctl`` and ``<job>.job.sge``, where ``<job>`` is the
      run title followed by a 1-based, zero-padded index.

    After all trees are processed, the manifest entries are written to
    ``<title>_manifest.csv``.  Exits with a message if no source tree path
    is available.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_trees",
            metavar="SOURCE_TREEFILE [SOURCE_TREEFILE [SOURCE_TREEFILE]]",
            nargs="+",
            help="Path to containing tree files. Specify '-' to read from standard input.")
    parser.add_argument("-f", "--input-format",
            default="nexus",
            dest="schema",
            help="Input trees format (default: %(default)s).")
    parser.add_argument("-z", "--random-seed",
            type=int,
            default=None,
            help="Seed for random number generator engine.")
    parser.add_argument("-t", "--title",
            default="bpprun",
            help="Run title (default: '%(default)s')")
    data_options = parser.add_argument_group("Data Options")
    # NOTE(review): the default is the float 1.0 while command-line values
    # are parsed as int; left unchanged, confirm which type is intended.
    data_options.add_argument("--population-size",
            type=int,
            default=1.0,
            help="Population size (default: %(default)s).")
    data_options.add_argument("--num-individuals-per-population",
            type=int,
            default=4,
            help="Number of individuals sampled per incipient species lineage (default: %(default)s).")
    data_options.add_argument("--num-loci-per-individual",
            type=int,
            default=10,
            help="Number of loci sampled per individual (default: %(default)s).")
    data_options.add_argument("--num-characters-per-locus",
            type=int,
            default=1000,
            help="Number of characters sampled per locus (default: %(default)s).")
    data_options.add_argument("--mutation-rate-per-site",
            type=float,
            default=1e-8,
            help="Per-site mutation rate (default: %(default)s).")
    parser.add_argument("--no-scale-tree-by-mutation-rate",
            action="store_true",
            help="Do not scale tree by mutation rate.")
    args = parser.parse_args()

    if args.random_seed is None:
        random_seed = random.randint(0, sys.maxsize-1)
    else:
        random_seed = args.random_seed
    rng = random.Random(random_seed)
    _log("Random seed: {}".format(random_seed))

    sg = seqgen.SeqGen()
    sg.seq_len = args.num_characters_per_locus
    sg.scale_branch_lens = args.mutation_rate_per_site

    if "-" in args.source_trees:
        # Ignore blank lines (e.g. the one produced by a trailing newline)
        # so that they are not treated as file paths.
        filepaths = [line.strip() for line in sys.stdin.read().split("\n") if line.strip()]
        args.source_trees.remove("-")
    else:
        filepaths = []
    filepaths.extend(args.source_trees)
    if not filepaths:
        # Only reachable when '-' was given and standard input listed nothing.
        sys.exit("No source tree files specified.")

    # Tree annotations copied into the manifest after float coercion.
    float_annotation_keys = (
        "speciation_initiation_from_orthospecies_rate",
        "speciation_initiation_from_incipient_species_rate",
        "speciation_completion_rate",
        "orthospecies_extinction_rate",
        "incipient_species_extinction_rate",
        "max_time",
        "max_extant_orthospecies",
        "num_extant_lineages",
        "num_extant_orthospecies",
    )

    manifest_entries = []
    for idx, filepath in enumerate(filepaths):
        job_title = "{}_{:05d}".format(args.title, idx+1)
        manifest_entry = collections.OrderedDict()
        _log("{} of {}: {}: {}".format(idx+1, len(filepaths), job_title, filepath))
        source_tree = dendropy.Tree.get(
                path=filepath,
                schema=args.schema,
                extract_comment_metadata=True,
                preserve_underscores=True,
                )

        for key in float_annotation_keys:
            manifest_entry[key] = try_to_coerce_to_float(source_tree.annotations[key].value)
        manifest_entry["source_tree_type"] = source_tree.annotations["tree_type"].value
        manifest_entry["population_size"] = args.population_size
        manifest_entry["num_individuals_per_population"] = args.num_individuals_per_population
        manifest_entry["num_loci_per_individual"] = args.num_loci_per_individual
        manifest_entry["mutation_rate_per_site"] = args.mutation_rate_per_site

        source_tree.calc_node_ages()

        gene_trees = generate_contained_trees(
                containing_tree=source_tree,
                num_individuals_per_population=args.num_individuals_per_population,
                num_gene_trees=args.num_loci_per_individual,
                population_size=args.population_size,
                rng=rng,
                )

        # Individual-to-population map: second "^"-separated field of the
        # taxon label, followed by the taxon's population label.
        imap_filepath = "{}.input.imap.txt".format(job_title)
        with open(imap_filepath, "w") as imap_file:
            for taxon in gene_trees.taxon_namespace:
                imap_file.write("{}    {}\n".format(taxon.label.split("^")[1], taxon.population_label))
            imap_file.write("\n")

        d0 = sg.generate(gene_trees)
        chars_filepath = "{}.input.chars.txt".format(job_title)
        with open(chars_filepath, "w") as chars_file:
            for cm_idx, cm in enumerate(d0.char_matrices):
                sys.stderr.write("Locus {}: pi = {}, Tajima's D = {}\n".format(
                    cm_idx+1,
                    popgenstat.nucleotide_diversity(cm),
                    popgenstat.tajimas_d(cm)))
                cm.write(file=chars_file, schema="phylip")
                chars_file.write("\n")

        out_filepath = "{}.results.out.txt".format(job_title)
        mcmc_filepath = "{}.results.mcmc.txt".format(job_title)
        num_species = len(source_tree.taxon_namespace)
        species_labels = " ".join(t.label for t in source_tree.taxon_namespace)
        num_individuals_per_species = " ".join(
                [str(args.num_individuals_per_population)] * num_species)

        # Inverse Gamma Prior
        # IG(a,b), with mean given by b/(a-1)
        # So,
        #   thetaprior 3 0.002
        # has a mean of
        #   0.002/(3-1) = 0.001
        theta_prior_mean = args.population_size * 4 * args.mutation_rate_per_site
        theta_prior_a = 3.0
        theta_prior_b = theta_prior_mean * (theta_prior_a - 1)
        if args.no_scale_tree_by_mutation_rate:
            tau_prior_mean = source_tree.seed_node.age
        else:
            tau_prior_mean = source_tree.seed_node.age * args.mutation_rate_per_site * (1.0 / (args.num_loci_per_individual * args.num_characters_per_locus))
        tau_prior_a = 3.0
        tau_prior_b = tau_prior_mean * (tau_prior_a - 1)

        # Count of species, not the character length of the joined labels.
        manifest_entry["num_input_lineages"] = num_species
        manifest_entry["theta"] = theta_prior_mean
        manifest_entry["theta_prior_a"] = theta_prior_a
        manifest_entry["theta_prior_b"] = theta_prior_b
        manifest_entry["root_age"] = source_tree.seed_node.age
        manifest_entry["tau_prior_a"] = tau_prior_a
        manifest_entry["tau_prior_b"] = tau_prior_b

        species_tree = source_tree.as_string(
                schema="newick",
                suppress_leaf_taxon_labels=False,
                suppress_leaf_node_labels=True,
                suppress_internal_taxon_labels=True,
                suppress_internal_node_labels=True,
                suppress_rooting=True,
                suppress_edge_lengths=True,
                unquoted_underscores=True,
                preserve_spaces=True,
                store_tree_weights=False,
                suppress_annotations=True,
                suppress_item_comments=True,
                )
        bpp_config = BPP_TEMPLATE.format(
                chars_filepath=chars_filepath,
                imap_filepath=imap_filepath,
                out_filepath=out_filepath,
                mcmc_filepath=mcmc_filepath,
                num_species=num_species,
                species_labels=species_labels,
                num_individuals_per_species=num_individuals_per_species,
                species_tree=species_tree,
                theta_prior_mean=theta_prior_mean,
                theta_prior_a=theta_prior_a,
                theta_prior_b=theta_prior_b,
                tau_prior_mean=tau_prior_mean,
                tau_prior_a=tau_prior_a,
                tau_prior_b=tau_prior_b,
                num_loci=args.num_loci_per_individual,
                root_age=source_tree.seed_node.age
                )
        bpp_ctl_filepath = "{}.input.bpp.ctl".format(job_title)
        with open(bpp_ctl_filepath, "w") as ctl_file:
            ctl_file.write(bpp_config)
            ctl_file.write("\n")

        with open("{}.job.sge".format(job_title), "w") as jobf:
            jobf.write("#! /bin/bash\n")
            jobf.write("#$ -cwd\n")
            jobf.write("#$ -V\n")
            jobf.write("#$ -S /bin/bash\n")
            jobf.write("#$ -l h_vmem=12G\n")
            jobf.write("#$ -l virtual_free=12G\n")
            jobf.write("bpp --cfile {}\n".format(bpp_ctl_filepath))

        manifest_entry["source_tree_path"] = filepath
        manifest_entry["results_filepath"] = out_filepath
        manifest_entry["mcmc_filepath"] = mcmc_filepath
        manifest_entries.append(manifest_entry)

    out = _open_output_file_for_csv_writer(
            filepath="{}_manifest.csv".format(args.title),
            append=False)
    with out:
        writer = csv.DictWriter(
                out,
                fieldnames=manifest_entries[0].keys(),
                restval="NA",
                delimiter=",",
                lineterminator=os.linesep,
                )
        writer.writeheader()
        writer.writerows(manifest_entries)
Example #6
0
def main():
    """Run protracted-speciation replicates and log the setup of each.

    Parses the model, termination, data and run options, exits unless at
    least one termination condition is given, and then, for each of
    ``--nreps`` replicates:

    * draws a lineage tree and an orthospecies tree from a
      ``ProtractedSpeciationProcess`` seeded with the (given or randomly
      drawn) random seed;
    * labels both trees and writes them to ``x1.nexus`` and ``x2.nexus``
      (the same two paths for every replicate);
    * writes ``<run title>_<NNN>.setup.log`` describing the command line,
      seed, tree profiles and all parameters.

    A Seq-Gen wrapper is configured with the per-site mutation rate as the
    branch-length scaling factor, but it is not used further in this
    function.
    """
    parser = argparse.ArgumentParser()
    parameter_options = parser.add_argument_group("Model Parameters")
    parameter_options.add_argument("--b1", "--speciation_initiation_from_orthospecies_rate",
            type=float,
            dest="speciation_initiation_from_orthospecies_rate",
            default=1.0,
            help="Rate at which orthospecies give rise to new incipient species [default: %(default)s].")
    parameter_options.add_argument("--b2", "--speciation_initiation_from_incipient_species_rate",
            type=float,
            dest="speciation_initiation_from_incipient_species_rate",
            default=1.0,
            help="Rate at which incipient species give rise to new incipient species [default: %(default)s].")
    parameter_options.add_argument("--c1", "--speciation-completion-rate",
            type=float,
            dest="speciation_completion_rate",
            default=1.0,
            help="Rate at which incipient species become orthospecies [default: %(default)s].")
    parameter_options.add_argument("--e1", "--orthospecies-extinction-rate",
            type=float,
            dest="orthospecies_extinction_rate",
            default=1.0,
            help="Rate at which orthospecies go extinct [default: %(default)s].")
    parameter_options.add_argument("--e2", "--incipient-species-extinction-rate",
            type=float,
            dest="incipient_species_extinction_rate",
            default=1.0,
            help="Rate at which incipient species go extinct [default: %(default)s].")
    termination_options = parser.add_argument_group("Simulation Termination Conditions")
    termination_options.add_argument("--max-time",
            type=float,
            default=None,
            help="Maximum length of time to to run (default: %(default)s).")
    termination_options.add_argument("--max-extant-orthospecies",
            type=int,
            default=None,
            help="Maximum number of orthospecies to generate (default: %(default)s).")
    termination_options.add_argument("--max-extant-lineages",
            type=int,
            default=None,
            help="Maximum number of lineages to generate (default: %(default)s).")
    data_options = parser.add_argument_group("Data Options")
    data_options.add_argument("--population-size",
            type=int,
            default=10000,
            help="Population size (default: %(default)s).")
    data_options.add_argument("--num-individuals-per-population",
            type=int,
            default=4,
            help="Number of individuals sampled per incipient species lineage (default: %(default)s).")
    data_options.add_argument("--num-loci-per-individual",
            type=int,
            default=10,
            help="Number of loci sampled per individual (default: %(default)s).")
    data_options.add_argument("--num-characters-per-locus",
            type=int,
            default=10,
            help="Number of characters sampled per locus (default: %(default)s).")
    data_options.add_argument("--mutation-rate-per-site",
            type=float,
            default=0.00001,
            help="Per-site mutation rate (default: %(default)s).")
    run_options = parser.add_argument_group("Run Options")
    run_options.add_argument("-t", "--run-title",
            default="run",
            help="Run title (default: '%(default)s')")
    run_options.add_argument("-n", "--nreps",
            type=int,
            default=10,
            help="Number of replicates (default: %(default)s).")
    run_options.add_argument("-z", "--random-seed",
            type=int,
            default=None,
            help="Seed for random number generator engine.")
    args = parser.parse_args()

    if not args.max_time and not args.max_extant_orthospecies and not args.max_extant_lineages:
        sys.exit("Need to specify termination condition, at least one of: '--max-time', '--max-extant-orthospecies', '--max-extant-lineages'")
    if args.random_seed is None:
        # sys.maxsize exists on both Python 2.6+ and Python 3, whereas
        # sys.maxint is Python 2 only.
        random_seed = random.randint(0, sys.maxsize-1)
    else:
        random_seed = args.random_seed

    _log("Random seed: {}".format(random_seed))
    rng = random.Random(random_seed)
    psm = protractedspeciation.ProtractedSpeciationProcess(
            speciation_initiation_from_orthospecies_rate=args.speciation_initiation_from_orthospecies_rate,
            orthospecies_extinction_rate=args.orthospecies_extinction_rate,
            speciation_initiation_from_incipient_species_rate=args.speciation_initiation_from_incipient_species_rate,
            speciation_completion_rate=args.speciation_completion_rate,
            incipient_species_extinction_rate=args.incipient_species_extinction_rate,
            rng=rng,)
    sg = seqgen.SeqGen()
    # The SeqGen wrapper's option is spelled ``scale_branch_lens``; assigning
    # to any other name has no effect on the generated sequences.
    sg.scale_branch_lens = args.mutation_rate_per_site
    for rep in range(args.nreps):
        job_title = "{}_{:03d}".format(args.run_title, rep+1)
        _log("Replicate {} of {}: {}".format(rep+1, args.nreps, job_title))
        lineage_tree, orthospecies_tree = psm.generate_sample(
                max_time=args.max_time,
                max_extant_orthospecies=args.max_extant_orthospecies,
                max_extant_lineages=args.max_extant_lineages,
                is_initial_lineage_orthospecies=True,
                )
        label_lineage_tree(lineage_tree)
        label_orthospecies_tree(orthospecies_tree)
        # Fixed output paths: each replicate overwrites the previous trees.
        lineage_tree.write(path="x1.nexus", schema="nexus")
        orthospecies_tree.write(path="x2.nexus", schema="nexus")

        # The context manager guarantees the log is flushed and closed even
        # if describing a tree raises.
        with open("{}.setup.log".format(job_title), "w") as logf:
            logf.write("-  Replicate {} of {} generated by command: {}\n".format(rep+1, args.nreps, " ".join(sys.argv)))
            logf.write("\n")
            logf.write("-  Random seed used: {}\n".format(random_seed))
            logf.write("\n")
            describe_tree(logf, lineage_tree, "-  Lineage Tree Profile")
            logf.write("\n")
            describe_tree(logf, orthospecies_tree, "-  Orthospecies Tree Profile")
            logf.write("\n")
            logf.write("-  Protracted Speciation Model Parameters\n")
            logf.write("   -       Speciation initiation from orthospecies rate: {}\n".format(args.speciation_initiation_from_orthospecies_rate))
            logf.write("   -  Speciation initiation from incipient species rate: {}\n".format(args.speciation_initiation_from_incipient_species_rate))
            logf.write("   -                         Speciation completion rate: {}\n".format(args.speciation_completion_rate))
            logf.write("   -                       Orthospecies extinction rate: {}\n".format(args.orthospecies_extinction_rate))
            logf.write("   -                  Incipient species extinction rate: {}\n".format(args.incipient_species_extinction_rate))
            logf.write("   -               Termination: Maximum simulation time: {}\n".format(args.max_time))
            logf.write("   -                  Termination: Maximum orthospecies: {}\n".format(args.max_extant_orthospecies))
            logf.write("   -                      Termination: Maximum lineages: {}\n".format(args.max_extant_lineages))
            logf.write("\n")
            logf.write("-  Data Generation Parameters\n")
            logf.write("   -                                    Population size: {}\n".format(args.population_size))
            logf.write("   -                 Individuals per species/population: {}\n".format(args.num_individuals_per_population))
            logf.write("   -                      Number of loci per individual: {}\n".format(args.num_loci_per_individual))
            logf.write("   -                             Per-site mutation rate: {}\n".format(args.mutation_rate_per_site))
            logf.write("\n")
Example #7
0
def main():
    """Build BPP input files and SGE job scripts for a set of source trees.

    Source tree paths come from the command line; if ``-`` is among them,
    additional paths are read from standard input, one per line.  Source
    files that fail to load with ``OSError`` or ``dendropy.DataError`` are
    logged and skipped.  For each remaining tree this:

    * copies the tree's simulation annotations and the data options into a
      manifest entry;
    * obtains a containing tree and gene trees from
      ``generate_contained_trees`` and simulates character matrices on the
      gene trees with Seq-Gen, using the requested number of characters per
      locus and the per-site mutation rate as branch-length scale;
    * writes ``<job>.input.imap.txt``, ``<job>.input.chars.txt``,
      ``<job>.input.bpp.ctl`` and ``<job>.job.sge``, where ``<job>`` is the
      run title followed by a 1-based, zero-padded index.

    The manifest entries are collected in a list; nothing in this function
    writes them out.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_trees",
            metavar="SOURCE_TREEFILE [SOURCE_TREEFILE [SOURCE_TREEFILE]]",
            nargs="+",
            help="Path to source of tree files. Specify '-' to read from standard input.")
    parser.add_argument("-f", "--input-format",
            default="nexus",
            dest="schema",
            help="Input trees format (default: %(default)s).")
    parser.add_argument("-z", "--random-seed",
            type=int,
            default=None,
            help="Seed for random number generator engine.")
    parser.add_argument("-t", "--title",
            default="bpprun",
            help="Run title (default: '%(default)s')")
    data_options = parser.add_argument_group("Data Options")
    data_options.add_argument("--population-size",
            type=int,
            default=10000,
            help="Population size (default: %(default)s).")
    data_options.add_argument("--total-number-of-individuals",
            type=int,
            default=200,
            help="Number of individuals sampled across all populations (default: %(default)s).")
    data_options.add_argument("--num-loci-per-individual",
            type=int,
            default=10,
            help="Number of loci sampled per individual (default: %(default)s).")
    data_options.add_argument("--num-characters-per-locus",
            type=int,
            default=1000,
            help="Number of characters sampled per locus (default: %(default)s).")
    data_options.add_argument("--mutation-rate-per-site",
            type=float,
            default=0.00001,
            help="Per-site mutation rate (default: %(default)s).")
    args = parser.parse_args()

    if args.random_seed is None:
        # sys.maxsize exists on both Python 2.6+ and Python 3, whereas
        # sys.maxint is Python 2 only.
        random_seed = random.randint(0, sys.maxsize-1)
    else:
        random_seed = args.random_seed
    rng = random.Random(random_seed)
    s00.log("Random seed: {}".format(random_seed))

    sg = seqgen.SeqGen()
    # Apply the requested locus length; previously the option was parsed
    # but never passed on to Seq-Gen.
    sg.seq_len = args.num_characters_per_locus
    # The SeqGen wrapper's option is spelled ``scale_branch_lens``; assigning
    # to any other name has no effect on the generated sequences.
    sg.scale_branch_lens = args.mutation_rate_per_site

    if "-" in args.source_trees:
        # Ignore blank lines (e.g. the one produced by a trailing newline)
        # so that they are not treated as file paths.
        filepaths = [line.strip() for line in sys.stdin.read().split("\n") if line.strip()]
        args.source_trees.remove("-")
    else:
        filepaths = []
    filepaths.extend(args.source_trees)

    # Tree annotations copied into the manifest after float coercion.
    float_annotation_keys = (
        "speciation_initiation_from_orthospecies_rate",
        "speciation_initiation_from_incipient_species_rate",
        "speciation_completion_rate",
        "orthospecies_extinction_rate",
        "incipient_species_extinction_rate",
        "max_time",
        "max_extant_orthospecies",
        "num_extant_lineages",
        "num_extant_orthospecies",
    )

    manifest_entries = []
    for idx, filepath in enumerate(filepaths):
        job_title = "{}_{:05d}".format(args.title, idx+1)
        manifest_entry = collections.OrderedDict()
        s00.log("{} of {}: {}: {}".format(idx+1, len(filepaths), job_title, filepath))
        try:
            source_tree = dendropy.Tree.get(
                    path=filepath,
                    schema=args.schema,
                    extract_comment_metadata=True,
                    preserve_underscores=True,
                    )
        except (OSError, dendropy.DataError):
            # A tuple is required to catch both types; the unparenthesised
            # form is a syntax error on Python 3.
            s00.log("Skipping failed file: {}".format(filepath))
            continue

        for key in float_annotation_keys:
            manifest_entry[key] = try_to_coerce_to_float(source_tree.annotations[key].value)
        manifest_entry["source_tree_type"] = source_tree.annotations["tree_type"].value
        manifest_entry["population_size"] = args.population_size
        manifest_entry["total_number_of_individuals"] = args.total_number_of_individuals
        manifest_entry["num_loci_per_individual"] = args.num_loci_per_individual
        manifest_entry["mutation_rate_per_site"] = args.mutation_rate_per_site

        source_tree.calc_node_ages()

        containing_tree, gene_trees = generate_contained_trees(
                containing_tree=source_tree,
                total_number_of_individuals=args.total_number_of_individuals,
                num_gene_trees=args.num_loci_per_individual,
                population_size=args.population_size,
                rng=rng,
                )

        # Individual-to-population map: second "^"-separated field of the
        # taxon label, followed by the taxon's population label.
        imap_filepath = "{}.input.imap.txt".format(job_title)
        with open(imap_filepath, "w") as imap_file:
            for taxon in gene_trees.taxon_namespace:
                imap_file.write("{}    {}\n".format(taxon.label.split("^")[1], taxon.population_label))
            imap_file.write("\n//end of file")

        d0 = sg.generate(gene_trees)
        chars_filepath = "{}.input.chars.txt".format(job_title)
        with open(chars_filepath, "w") as chars_file:
            for cm in d0.char_matrices:
                # Write each matrix once; writing the whole data set here
                # would repeat every matrix once per loop iteration.
                cm.write(file=chars_file, schema="phylip")
                chars_file.write("\n")

        out_filepath = "{}.results.out.txt".format(job_title)
        mcmc_filepath = "{}.results.mcmc.txt".format(job_title)

        # Species labels and per-species sample counts, in leaf order.
        leaf_nodes = list(containing_tree.leaf_node_iter())
        final_containing_tree_num_species = len(leaf_nodes)
        final_containing_tree_species_labels = " ".join(
                nd.taxon.label for nd in leaf_nodes)
        final_containing_tree_num_individuals_per_species = " ".join(
                str(nd.num_individuals_sampled) for nd in leaf_nodes)

        theta_prior_mean = args.population_size * 4 * args.mutation_rate_per_site
        theta_prior_a = 2.0
        theta_prior_b = theta_prior_a/theta_prior_mean
        tau_prior_mean = containing_tree.seed_node.age
        tau_prior_a = 2.0
        tau_prior_b = tau_prior_a/tau_prior_mean

        manifest_entry["num_input_lineages"] = final_containing_tree_num_species
        manifest_entry["theta"] = theta_prior_mean
        manifest_entry["theta_prior_a"] = theta_prior_a
        manifest_entry["theta_prior_b"] = theta_prior_b
        manifest_entry["root_age"] = containing_tree.seed_node.age
        manifest_entry["tau_prior_a"] = tau_prior_a
        manifest_entry["tau_prior_b"] = tau_prior_b

        species_tree = containing_tree.as_string(
                schema="newick",
                suppress_leaf_taxon_labels=False,
                suppress_leaf_node_labels=True,
                suppress_internal_taxon_labels=True,
                suppress_internal_node_labels=True,
                suppress_rooting=True,
                suppress_edge_lengths=True,
                unquoted_underscores=True,
                preserve_spaces=True,
                store_tree_weights=False,
                suppress_annotations=True,
                suppress_item_comments=True,
                )
        bpp_config = BPP_TEMPLATE.format(
                chars_filepath=chars_filepath,
                imap_filepath=imap_filepath,
                out_filepath=out_filepath,
                mcmc_filepath=mcmc_filepath,
                num_species=final_containing_tree_num_species,
                species_labels=final_containing_tree_species_labels,
                num_individuals_per_species=final_containing_tree_num_individuals_per_species,
                species_tree=species_tree,
                theta_prior_a=theta_prior_a,
                theta_prior_b=theta_prior_b,
                tau_prior_a=tau_prior_a,
                tau_prior_b=tau_prior_b,
                num_loci=args.num_loci_per_individual,
                )
        bpp_ctl_filepath = "{}.input.bpp.ctl".format(job_title)
        with open(bpp_ctl_filepath, "w") as ctl_file:
            ctl_file.write(bpp_config)
            ctl_file.write("\n")

        with open("{}.job.sge".format(job_title), "w") as jobf:
            jobf.write("#! /bin/bash\n")
            jobf.write("#$ -cwd\n")
            jobf.write("#$ -V\n")
            jobf.write("#$ -S /bin/bash\n")
            jobf.write("#$ -l h_vmem=12G\n")
            jobf.write("#$ -l virtual_free=12G\n")
            jobf.write("bpp {}\n".format(bpp_ctl_filepath))

        manifest_entry["source_tree_path"] = filepath
        manifest_entry["results_filepath"] = out_filepath
        manifest_entry["mcmc_filepath"] = mcmc_filepath
        manifest_entries.append(manifest_entry)
Example #8
0
def main():
    """
    Main CLI handler.

    Reads genealogies from every ``--tree-files`` source (``-`` meaning
    standard input) into one tree list, then, for each replicate, simulates
    character matrices on those trees with Seq-Gen and writes them to
    ``<output_prefix>.<NNN>.chars`` plus a format-specific extension.  For
    the ``bpp`` format every taxon label is prefixed with ``^`` and the
    matrices are written one after another in PHYLIP format, each followed
    by a blank line.

    Exits with a message when no tree file is given.

    NOTE(review): ``--scale-branch-lengths``, ``--concatenate`` and
    ``--random-seed`` are parsed but not used below; the branch-length
    scale passed to Seq-Gen is ``--mutation-rate-per-site``.
    """

    parser = argparse.ArgumentParser(description=__description__)
    parser.add_argument("--version",
                        action="version",
                        version="%(prog)s " + __version__)
    parser.add_argument("output_prefix")
    parser.add_argument(
        "-t",
        "--tree-files",
        action="append",
        type=str,
        metavar="TREEFILE",
        help="Path to tree files (specify '-' to read from standard input).")
    parser.add_argument("-f",
                        "--input-format",
                        type=str,
                        default="newick",
                        choices=["nexus", "newick"],
                        help="Input data format (default='%(default)s')")
    parser.add_argument(
        "-n",
        "--num-characters-per-locus",
        type=int,
        default=1000,
        help="Number of characters sampled per locus (default: %(default)s).")
    parser.add_argument("--mutation-rate-per-site",
                        type=float,
                        default=0.00001,
                        help="Per-site mutation rate (default: %(default)s).")
    parser.add_argument(
        "-s",
        "--scale-branch-lengths",
        action="store",
        type=float,
        default=1.0,
        help="Scale branch lengths by this factor [default=%(default)s].")
    parser.add_argument("--num-replicates",
                        type=int,
                        default=1,
                        help="Number of replicates (default: %(default)s).")
    parser.add_argument("-F",
                        "--output-format",
                        type=str,
                        default="bpp",
                        choices=["bpp", "nexus", "phylip"],
                        help="Output data format (default='%(default)s')")
    parser.add_argument(
        "--concatenate",
        action="store_true",
        default=False,
        help="Concatenate the alignments across all genealogies")
    parser.add_argument("-z",
                        "--random-seed",
                        type=int,
                        default=None,
                        help="Seed for random number generator engine.")

    args = parser.parse_args()
    if not args.tree_files:
        sys.exit("Please specify path(s) to genealogy tree file(s)")
    sg = seqgen.SeqGen()
    sg.seq_len = args.num_characters_per_locus
    sg.scale_branch_lens = args.mutation_rate_per_site
    gene_trees = dendropy.TreeList()
    for src_path in args.tree_files:
        if src_path == "-":
            # Read standard input without closing it afterwards.
            gene_trees.read(file=sys.stdin,
                            schema=args.input_format,
                            rooting="force-rooted")
        else:
            with open(src_path) as src:
                gene_trees.read(file=src,
                                schema=args.input_format,
                                rooting="force-rooted")
    if args.output_format == "bpp":
        for t in gene_trees.taxon_namespace:
            t.label = "^{}".format(t.label)
    for rep_idx in range(args.num_replicates):
        d0 = sg.generate(gene_trees)
        chars_filepath = "{}.{:03d}.chars".format(args.output_prefix,
                                                  rep_idx + 1)
        if args.output_format == "nexus":
            chars_filepath += ".nex"
            d0.write(path=chars_filepath, schema="nexus")
        elif args.output_format == "phylip":
            chars_filepath += ".phylip"
            d0.write(path=chars_filepath, schema="phylip")
        elif args.output_format == "bpp":
            chars_filepath += ".txt"
            # Close the file per replicate so the data are flushed to disk.
            with open(chars_filepath, "w") as f:
                for cm in d0.char_matrices:
                    cm.write(file=f, schema="phylip")
                    f.write("\n")
        else:
            raise NotImplementedError