Example #1
0
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

# This script is an example in the simuPOP user's guide. Please refer to
# the user's guide (http://simupop.sourceforge.net/manual) for a detailed
# description of this example.
#

import simuPOP as sim
def _print_ind_ids(population):
    """Print the integer ``ind_id`` of each individual yielded by
    ``population.individuals()`` as a single list."""
    ids = []
    for member in population.individuals():
        ids.append(int(member.ind_id))
    print(ids)


# Ten individuals carrying an 'ind_id' information field; ancGen=1 is passed
# so that one ancestral generation can be revisited after evolving.
pop = sim.Population(10, infoFields='ind_id', ancGen=1)

# Evolve a single generation: IDs are tagged once up front, and again for
# every offspring produced by the cloning mating scheme.
pop.evolve(
    initOps=sim.IdTagger(),
    matingScheme=sim.RandomSelection(
        ops=[sim.CloneGenoTransmitter(), sim.IdTagger()]),
    gen=1,
)

# IDs seen in the current generation.
_print_ind_ids(pop)

# Switch to ancestral generation 1 and show the IDs seen there.
pop.useAncestralGen(1)
_print_ind_ids(pop)

sim.tagID(pop)  # re-assign ID
_print_ind_ids(pop)
Example #2
0
def _read_float_table(path):
    '''Parse a whitespace-delimited text file into a list of rows of floats.

    Blank lines are skipped. Lines are split on any whitespace, so the line
    terminator never has to be sliced off by hand (the final line of a file
    often has none, and slicing would then cut into the last number).
    '''
    with open(path, "r") as fd:
        return [[float(field) for field in line.split()]
                for line in fd
                if line.strip()]


def runSimulation(scenario_id, sub_population_size, minMatingAge, maxMatingAge,
                  gen):
    '''Evolve an age-structured population, sample it and write result files.

    scenario_id           Identifies the batch of input files: the
                          mitochondrial haplotype frequencies are read from
                          mtdna_<scenario_id>.txt and the SNP frequencies
                          from snp_<scenario_id>.txt.
    sub_population_size   A vector giving the population sizes for each
                          sub-population. The subpopulations determine which
                          breeding ground an individual belongs to.
    minMatingAge          minimal mating age.
    maxMatingAge          maximal mating age. Individuals older than this are
                          effectively dead.
    gen                   number of generations (years) to simulate.

    Besides its parameters the function reads these names from module scope:
    nb_loci, numberOfFeedingGrounds, sample_count,
    init_native_breeding_grounds, configure_new_population_size,
    postop_processing, drawRandomSample and add.

    Writes freq.txt, mixfile.txt and haploiso.txt in the current directory
    and returns the sample produced by drawRandomSample.

    Raises ValueError when the haplotype file does not describe the same
    number of sub-populations as sub_population_size.
    '''

    # The haplotype file holds one row per haplotype and one column per
    # sub-population. We need the transpose: haplotype_frequencies[0] is the
    # vector of all frequencies for population 0, haplotype_frequencies[1]
    # the corresponding vector for population 1, and so on.
    mitochondrial_file = "mtdna_" + scenario_id + ".txt"
    haplotype_frequencies = [
        list(column) for column in zip(*_read_float_table(mitochondrial_file))
    ]

    if len(haplotype_frequencies) != len(sub_population_size):
        raise ValueError(
            'The number of populations in the population size vector and the number of populations deduced from the haplotype file are different'
        )

    # Now read the SNP data. This builds a 2D array indexed as snp[locus][population]
    snp_file = "snp_" + scenario_id + ".txt"
    snp = _read_float_table(snp_file)

    sub_population_count = len(sub_population_size)
    print()
    print(sub_population_count, "subpopulations detected")

    # Give each sub-population a one-letter name, starting from 'A' (chr(65)).
    sub_population_names = list(map(chr, range(65, 65 + sub_population_count)))

    # We have two chromosomes. The first is an autosome with nb_loci loci, and
    # the second is the mitochondrial chromosome with 1 locus.
    pop = simuPOP.Population(
        sub_population_size,
        ploidy=2,
        loci=[nb_loci, 1],
        ancGen=2,
        infoFields=[
            'age', 'ind_id', 'father_id', 'mother_id', 'nitrogen', 'carbon',
            'feeding_ground', 'native_breeding_ground', 'migrate_to'
        ],
        subPopNames=sub_population_names,
        chromTypes=[simuPOP.AUTOSOME, simuPOP.MITOCHONDRIAL])
    sub_population_names = tuple(sub_population_names)

    # sub_population_size is a vector, so the total head count is its sum.
    individual_count = sum(sub_population_size)

    # Assign a random age in [0, maxMatingAge] to each individual
    pop.setIndInfo(
        [random.randint(0, maxMatingAge) for x in range(individual_count)],
        'age')
    # Assign a random feeding ground to each individual
    pop.setIndInfo([
        random.randint(0, numberOfFeedingGrounds - 1)
        for x in range(individual_count)
    ], 'feeding_ground')

    # Virtual subpopulations (VSPs). The age splitter uses two cutoffs:
    #   age < minMatingAge                         -> juvenile
    #   minMatingAge <= age < maxMatingAge + 0.1   -> mature (age <= maxMatingAge)
    #   age >= maxMatingAge + 0.1                  -> dead
    # Combined with the sex splitter this gives VSPs 0-5, named below. vspMap
    # then adds two unions of those: VSP 6 ('Not dead yet', juveniles and
    # mature of both sexes) and VSP 7 ('Active', mature of both sexes).
    #
    # Ideally we would want something like this:
    # 1) Immature
    # 2) Receptive female (every 3 years)
    # 3) Non-receptive female
    # 4) Mature male
    # 5) Dead
    #
    # Note that we use a cutoff InfoSplitter here, it is also possible to
    # provide a list of values, each corresponding to a virtual subpopulation.
    pop.setVirtualSplitter(
        simuPOP.CombinedSplitter(
            [
                simuPOP.ProductSplitter([
                    simuPOP.SexSplitter(),
                    simuPOP.InfoSplitter(
                        'age',
                        cutoff=[minMatingAge, maxMatingAge + 0.1],
                        names=['juvenile', 'mature', 'dead'])
                ])
            ],
            vspMap=[[0], [1], [2], [3], [4], [5], [0, 1, 3, 4], [1, 4]],
            names=[
                'Juvenile Male', 'Mature Male', 'Dead Male',
                'Juvenile Female', 'Mature Female', 'Dead Female',
                'Not dead yet', 'Active'
            ]))

    pop.evolve(
        initOps=[
            simuPOP.InitSex(),
            simuPOP.IdTagger(),
            simuPOP.PyOperator(func=init_native_breeding_grounds)
        ] + [
            # Mitochondrial locus (index nb_loci): per-population haplotype
            # frequencies.
            simuPOP.InitGenotype(subPops=sub_population_names[i],
                                 freq=haplotype_frequencies[i],
                                 loci=[nb_loci])
            for i in range(0, sub_population_count)
        ] + [
            # Autosomal SNP loci: two alleles with frequency p and 1 - p.
            # NOTE(review): range(0, nb_loci - 1) leaves autosomal locus
            # nb_loci - 1 without an explicit initialisation - confirm
            # whether that is intended or an off-by-one.
            simuPOP.InitGenotype(subPops=sub_population_names[i],
                                 freq=[snp[n][i], 1 - snp[n][i]],
                                 loci=[n])
            for i in range(0, sub_population_count)
            for n in range(0, nb_loci - 1)
        ],
        # increase age by 1
        preOps=[simuPOP.InfoExec('age += 1')],
        matingScheme=simuPOP.HeteroMating(
            [
                # subPops is a list of (subPopulation, virtualSubPopulation)
                # pairs that take part in a mating scheme.
                # First, clone everybody in VSP 6 ('Not dead yet') of every
                # subpopulation into the next generation (weight=-1).
                simuPOP.CloneMating(
                    ops=[simuPOP.CloneGenoTransmitter(chroms=[0, 1])],
                    subPops=[
                        (sub_population, 6)
                        for sub_population in range(0, sub_population_count)
                    ],
                    weight=-1),
                # Then random mating restricted to VSP 7 ('Active', i.e.
                # mature individuals) within each subpopulation
                # (breeding/winter grounds).
                simuPOP.RandomMating(
                    ops=[
                        simuPOP.MitochondrialGenoTransmitter(),
                        simuPOP.MendelianGenoTransmitter(),
                        simuPOP.IdTagger(),
                        simuPOP.InheritTagger(mode=simuPOP.MATERNAL,
                                              infoFields=['feeding_ground']),
                        simuPOP.InheritTagger(
                            mode=simuPOP.MATERNAL,
                            infoFields=['native_breeding_ground']),
                        simuPOP.PedigreeTagger()
                    ],
                    subPops=[
                        (sub_population, 7)
                        for sub_population in range(0, sub_population_count)
                    ],
                    weight=1)
            ],
            subPopSize=configure_new_population_size),
        postOps=[
            # Determine the isotopic ratios in individuals
            simuPOP.PyOperator(func=postop_processing),
            simuPOP.Migrator(mode=simuPOP.BY_IND_INFO),

            # Calculate the Fst every 10 generations.
            # FIXME: How does this actually work? Does it work for > 2 populations? I don't really understand it yet
            # ELC: it is a calculation that partitions variance among and between populations, and can be calculated as a
            # global statistic or on a pairwise basis. We use it as an indication of genetic differentiation.
            simuPOP.Stat(structure=range(1),
                         subPops=sub_population_names,
                         suffix='_AB',
                         step=10),
            simuPOP.PyEval(r"'Fst=%.3f \n' % (F_st_AB)", step=10)
        ],
        # Use the caller-supplied generation count. The previous code read
        # an unrelated name `years` here and silently ignored the parameter.
        gen=gen)

    # NOTE(review): `ped` is not used below - confirm whether building the
    # pedigree is still needed.
    ped = simuPOP.Pedigree(pop)
    print("This is the pedigree stuff")
    simuPOP.dump(pop)

    # Now sample the individuals: sample_count from every sub-population.
    sample = drawRandomSample(pop, sizes=[sample_count] * sub_population_count)

    # Print out the allele frequency data: one line per polymorphic autosomal
    # locus, "<running index> <freq> <freq> ... ".
    simuPOP.stat(sample, alleleFreq=simuPOP.ALL_AVAIL)
    frequencies = sample.dvars().alleleFreq
    with open('freq.txt', 'w') as freqfile:
        index = 0
        for locus in frequencies:
            # Skip the mitochondrial locus and loci with fewer than two
            # alleles present in the sample.
            if locus == nb_loci or len(frequencies[locus]) < 2:
                continue
            fields = [index] + [
                frequencies[locus][allele] for allele in frequencies[locus]
            ]
            print(*fields, end=' \n', file=freqfile)
            index = index + 1

    # We want to remove monoallelic loci, i.e. positions at which every
    # sampled individual carries the same value on both chromosome copies.
    # Summing the alleles per locus over all individuals and both copies gives
    # 0 when everybody has type 0, or 2 * number of individuals when everybody
    # has type 1.
    genome_width = nb_loci + 1  # autosomal loci plus the mitochondrial locus
    geno_sum = [0] * genome_width * 2
    for individual in sample.individuals():
        geno_sum = list(map(add, geno_sum, individual.genotype()))
    # Fold the second chromosome copy onto the first.
    final_sum = list(
        map(add, geno_sum[:genome_width], geno_sum[genome_width:]))

    all_type_one = sample_count * sub_population_count * 2
    # A set keeps the per-genotype membership tests below O(1).
    monoallelic_loci = {
        i
        for i in range(0, nb_loci) if final_sum[i] in (0, all_type_one)
    }

    nb_ignored_loci = len(monoallelic_loci)
    # Generate the two files
    with open('mixfile.txt', 'w') as mixfile, \
            open('haploiso.txt', 'w') as haplofile:
        print(sub_population_count,
              nb_loci - nb_ignored_loci,
              2,
              1,
              file=mixfile)
        # NOTE(review): the header names a native_ground column, but the rows
        # written below only contain sex, haplotype, carbon and nitrogen.
        print("sex, haplotype, iso1, iso2, native_ground", file=haplofile)
        for i in range(0, nb_loci - nb_ignored_loci):
            print('Loc', i + 1, sep='_', file=mixfile)
        for individual in sample.individuals():
            genotype = individual.genotype()
            print(1 if individual.sex() == 1 else 0,
                  genotype[nb_loci],
                  individual.info('carbon'),
                  individual.info('nitrogen'),
                  file=haplofile,
                  sep=' ')
            # Breeding grounds are written 1-based.
            print(int(individual.info('native_breeding_ground') + 1),
                  end=' ',
                  file=mixfile)
            for i in range(0, nb_loci):
                if i not in monoallelic_loci:
                    # Both copies of the locus, shifted to 1-based alleles
                    # and written without a separator between them.
                    print(genotype[i] + 1,
                          genotype[i + nb_loci + 1] + 1,
                          ' ',
                          end='',
                          sep='',
                          file=mixfile)
            print(file=mixfile)
    return sample
Example #3
0
def simulateBySimuPOP():
    """Simulate crosses between the ancestral genomes and write the offspring.

    Reads the ancestral SNP genomes with ``parseSNPInput``, loads them into a
    simuPOP population, runs one cloning generation followed by ``gen``
    rounds of (mutation + random selection) and (sex assignment + random
    mating with recombination), then samples offspring from the final
    generation and writes them, together with the ancestors, through
    ``snp.restructure`` / ``snp.outputGriggFormat``.

    All inputs (paths, population sizes, number of rounds) are the constants
    set at the top of the function. Besides the output file, a parent guide
    (``parents.txt``) and the pedigree written by the PedigreeTagger
    (``pedigree.txt``) are produced in the same directory. Returns None.
    """
    #starting variables
    directory = '/data/new/javi/toxo/simulations4/'
    input_path = os.path.join(directory, 'Toxo20.txt')
    output_path = os.path.join(directory, 'SimulatedToxo.txt')
    parents_path = os.path.join(directory, 'parents.txt')
    pedigree_path = os.path.join(directory, 'pedigree.txt')

    number_of_ancestors = 3
    expansion_pop_size = 15
    offsprings_sampled = number_of_ancestors + expansion_pop_size
    gen = 3
    translate_mode = 'toxoplasma'
    structure_mode = 'simupop'

    #parsing input
    init_info = parseSNPInput(input_path, number_of_ancestors)
    ancestral_genomes = init_info[0]
    # A real list is required: dict views have no .index() on Python 3.
    ancestor_names = list(ancestral_genomes.keys())
    loci_positions = init_info[1]
    chromosome_names = sorted(loci_positions.keys(),
                              key=lambda x: cns.getValue(x, translate_mode))
    list_of_loci = [len(loci_positions[name]) for name in chromosome_names]
    # Flatten the per-chromosome positions in chromosome order.
    lociPos = [
        position for name in chromosome_names
        for position in loci_positions[name]
    ]

    sp.turnOnDebug(code="DBG_GENERAL")

    #initializing
    print('Initializaing Population')
    population = sp.Population(
        size=[number_of_ancestors],
        loci=list_of_loci,
        ancGen=5,
        lociPos=lociPos,
        chromNames=chromosome_names,
        lociNames=[],
        alleleNames=['A', 'T', 'G', 'C'],
        infoFields=['name', 'ind_id', 'father_id', 'mother_id'])

    # Load each ancestral genome into one individual. The 'name' field holds
    # the ancestor's position in ancestor_names (see the parent guide below).
    for ind_id, (individual, sample) in enumerate(
            zip(population.individuals(), ancestral_genomes)):
        individual.setInfo(ancestor_names.index(sample), 'name')
        individual.setInfo(ind_id, 'ind_id')
        for chrom_index, chrom_name in enumerate(chromosome_names):
            individual.setGenotype(ancestral_genomes[sample][chrom_name],
                                   chroms=[chrom_index])

    #Alternating rounds of recombination with clonal expansion. Clonal expansion gives + 2.
    #Mutation prior to each round

    simulator = sp.Simulator(population)
    rate_matrix = createRateMatrix(len(ancestor_names),
                                   0.0002)  #10,000 times the mutation rate.
    id_tagger = sp.IdTagger()
    ped_tagger = sp.PedigreeTagger(output='>>' + pedigree_path,
                                   outputFields=['name', 'ind_id'])
    inherit_tagger = sp.InheritTagger(infoFields='name')

    initOps1 = [sp.PyExec('print("Starting random selection")'), ped_tagger]
    initOps2 = [sp.PyExec('print("Starting random mating")'), ped_tagger]
    preOps1 = [sp.MatrixMutator(rate=rate_matrix)]
    preOps2 = [sp.InitSex(sex=[sp.MALE, sp.FEMALE])]

    matingScheme1 = sp.RandomSelection(
        ops=[sp.CloneGenoTransmitter(), inherit_tagger, id_tagger, ped_tagger],
        subPopSize=expansion_pop_size)
    matingScheme2 = sp.RandomMating(
        ops=[
            sp.Recombinator(intensity=0.01 / 105000,
                            convMode=(sp.GEOMETRIC_DISTRIBUTION, 0.001,
                                      0.01)),  #10x normal
            sp.PyTagger(func=addNames),
            id_tagger,
            ped_tagger
        ],
        subPopSize=expansion_pop_size)

    postOps = []
    finalOps = []

    print('Starting Evolution Cycles.')

    # The pedigree tagger appends ('>>'), so drop any file left by an earlier
    # run. Best effort: a missing or undeletable file is not fatal, but
    # unrelated exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
    try:
        os.remove(pedigree_path)
    except OSError:
        pass

    simulator.evolve(
        initOps=[id_tagger, ped_tagger],
        matingScheme=sp.CloneMating(ops=[
            sp.CloneGenoTransmitter(), ped_tagger, id_tagger, inherit_tagger
        ]),
        gen=1)

    for _ in range(gen):
        simulator.evolve(initOps=initOps1,
                         preOps=preOps1,
                         matingScheme=matingScheme1,
                         postOps=postOps,
                         finalOps=finalOps,
                         gen=1)
        simulator.evolve(initOps=initOps2,
                         preOps=preOps2,
                         matingScheme=matingScheme2,
                         postOps=postOps,
                         finalOps=finalOps,
                         gen=1)

    final_population = simulator.population(0)

    # Map a generated offspring label (<name><random id><ind_id>) to the
    # individual's ind_id.
    offsprings = {
        ''.join([
            str(int(x.info('name'))),
            generateID(3),
            str(int(x.info('ind_id')))
        ]): x.info('ind_id')
        for x in final_population.individuals()
    }
    # random.sample needs a sequence (dict views are rejected on Python
    # 3.11+) and raises ValueError when asked for more items than exist, so
    # sample from a list and never request more than is available.
    offspring_labels = list(offsprings)
    sampled_ones = rand.sample(
        offspring_labels, min(offsprings_sampled, len(offspring_labels)))

    #reorganizes the offspring genome. Extract info by chr.
    offspring_genomes = {name: {} for name in sampled_ones}
    for name in sampled_ones:
        offspring = final_population.indByID(offsprings[name],
                                             idField='ind_id')
        for chrom_index, chrom_name in enumerate(chromosome_names):
            offspring_genomes[name][chrom_name] = offspring.genotype(
                ploidy=0, chroms=[chrom_index])

    # The ancestors are written out alongside the sampled offspring.
    offspring_genomes.update(ancestral_genomes)

    # "<index> : <ancestor name>" lines, shown on screen and saved to disk.
    guide_lines = [
        " : ".join([str(index), str(ancestor)])
        for index, ancestor in enumerate(ancestor_names)
    ]

    print('Parent Guide:')
    for line in guide_lines:
        print(line)

    print('Complete. Generating Output.')

    with open(parents_path, 'w') as parent_output:
        parent_output.write('Parent Guide:\n')
        for line in guide_lines:
            parent_output.write(line + '\n')

    #output
    offspring_genomes = snp.restructure((offspring_genomes, loci_positions),
                                        structure_mode)
    snp.outputGriggFormat(offspring_genomes, output_path)
    print('Simulation Complete.')