# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# This script is an example in the simuPOP user's guide. Please refer to
# the user's guide (http://simupop.sourceforge.net/manual) for a detailed
# description of this example.
#
import simuPOP as sim


def _current_ids(population):
    """Return ``ind_id`` (as int) of every individual in the active generation."""
    return [int(individual.ind_id) for individual in population.individuals()]


# Ten individuals carrying an 'ind_id' field; one ancestral generation is kept.
pop = sim.Population(10, infoFields='ind_id', ancGen=1)

# Tag the founders, then evolve one generation in which each offspring is a
# clone of a randomly selected parent and is given an ID by IdTagger.
pop.evolve(
    initOps=sim.IdTagger(),
    matingScheme=sim.RandomSelection(
        ops=[
            sim.CloneGenoTransmitter(),
            sim.IdTagger(),
        ],
    ),
    gen=1,
)

# IDs of the present generation.
print(_current_ids(pop))

# Switch to the stored parental generation and show its IDs.
pop.useAncestralGen(1)
print(_current_ids(pop))

sim.tagID(pop)  # re-assign ID
print(_current_ids(pop))
def _read_float_rows(path):
    '''
    Parse a whitespace-separated text file of numbers.

    path
        Name of the file to read.

    Returns a list with one list of floats per non-blank line.
    '''
    with open(path, "r") as fd:
        # split() with no argument already discards the line terminator, so
        # the last line is parsed correctly even when the file does not end
        # with a newline (slicing off the final character would eat a digit).
        # Blank lines hold no data and are skipped.
        return [[float(field) for field in line.split()]
                for line in fd if line.strip()]


def runSimulation(scenario_id, sub_population_size, minMatingAge, maxMatingAge, gen):
    '''
    Build an age-structured population from the scenario's frequency files,
    evolve it, draw a random sample and write the sample to 'freq.txt',
    'mixfile.txt' and 'haploiso.txt' in the current directory.

    scenario_id
        Describes the batch of files to load: the mitochondrial haplotype
        frequencies are read from mtdna_<scenario_id>.txt and the SNP
        frequencies from snp_<scenario_id>.txt.
    sub_population_size
        A vector giving the population sizes for each sub-population.
        The subpopulations determine which breeding ground an individual belongs to
    minMatingAge
        minimal mating age.
    maxMatingAge
        maximal mating age. Individuals older than this are effectively dead
    gen
        number of generations (years) to simulate

    Returns the sampled population produced by drawRandomSample.

    Raises ValueError when the haplotype file does not have one column per
    entry of sub_population_size.

    Relies on names defined elsewhere in the module: nb_loci,
    numberOfFeedingGrounds, sample_count, init_native_breeding_grounds,
    configure_new_population_size, postop_processing and drawRandomSample.
    '''
    # Read the mitochondrial haplotype frequencies. The file has one line per
    # haplotype and one column per population, but we want the transpose:
    # haplotype_frequencies[0] is the vector of all frequencies for population
    # 0, haplotype_frequencies[1] the corresponding vector for population 1,
    # and so on. zip(*rows) performs that transposition.
    mitochondrial_file = "mtdna_" + scenario_id + ".txt"
    haplotype_frequencies = [
        list(column) for column in zip(*_read_float_rows(mitochondrial_file))
    ]

    if len(haplotype_frequencies) != len(sub_population_size):
        raise ValueError(
            'The number of populations in the population size vector and the number of populations deduced from the haplotype file are different'
        )

    # Now read the SNP data. This builds a 2D array indexed as snp[locus][population]
    snp_file = "snp_" + scenario_id + ".txt"
    snp = _read_float_rows(snp_file)

    sub_population_count = len(sub_population_size)
    print()
    print(sub_population_count, "subpopulations detected")

    # Now we can create the population. We want to give each population a
    # population name, starting from A
    sub_population_names = list(map(chr, range(65, 65 + sub_population_count)))

    # We have two chromosomes. The first is an autosome with nb_loci loci, and
    # the second is the mitochondrial chromosome with 1 locus
    pop = simuPOP.Population(
        sub_population_size,
        ploidy=2,
        loci=[nb_loci, 1],
        ancGen=2,
        infoFields=[
            'age', 'ind_id', 'father_id', 'mother_id', 'nitrogen', 'carbon',
            'feeding_ground', 'native_breeding_ground', 'migrate_to'
        ],
        subPopNames=sub_population_names,
        chromTypes=[simuPOP.AUTOSOME, simuPOP.MITOCHONDRIAL])
    sub_population_names = tuple(sub_population_names)

    # sub_population_size is a vector - the size of each population. We have
    # to sum these to get the total number of individuals
    individual_count = sum(sub_population_size)

    # Assign a random age between 0 and maxMatingAge to each individual
    pop.setIndInfo(
        [random.randint(0, maxMatingAge) for _ in range(individual_count)],
        'age')

    # Assign a random feeding ground to each individual
    pop.setIndInfo(
        [random.randint(0, numberOfFeedingGrounds - 1)
         for _ in range(individual_count)],
        'feeding_ground')

    # The product of the sex splitter and the age splitter gives six base
    # virtual subpopulations (0-5), with these age classes:
    #   age < minMatingAge                           (juvenile)
    #   minMatingAge <= age < maxMatingAge + 0.1     (mature)
    #   age >= maxMatingAge + 0.1                    (dead)
    # vspMap then adds two merged groups:
    #   6 'Not dead yet' = juveniles and mature individuals of both sexes
    #   7 'Active'       = mature males and mature females
    #
    # Ideally we would want something like this:
    # 1) Immature
    # 2) Receptive female (every 3 years)
    # 3) Non-receptive female
    # 4) Mature male
    # 5) Dead
    #
    # Note that we use a cutoff InfoSplitter here, it is also possible to
    # provide a list of values, each corresponding to a virtual subpopulation.
    pop.setVirtualSplitter(
        simuPOP.CombinedSplitter(
            [
                simuPOP.ProductSplitter([
                    simuPOP.SexSplitter(),
                    simuPOP.InfoSplitter(
                        'age',
                        cutoff=[minMatingAge, maxMatingAge + 0.1],
                        names=['juvenile', 'mature', 'dead'])
                ])
            ],
            vspMap=[[0], [1], [2], [3], [4], [5], [0, 1, 3, 4], [1, 4]],
            names=[
                'Juvenile Male', 'Mature Male', 'Dead Male',
                'Juvenile Female', 'Mature Female', 'Dead Female',
                'Not dead yet', 'Active'
            ]))

    pop.evolve(
        initOps=[
            simuPOP.InitSex(),
            simuPOP.IdTagger(),
            simuPOP.PyOperator(func=init_native_breeding_grounds)
        ] + [
            # Mitochondrial locus (index nb_loci), one operator per population
            simuPOP.InitGenotype(subPops=sub_population_names[i],
                                 freq=haplotype_frequencies[i],
                                 loci=[nb_loci])
            for i in range(sub_population_count)
        ] + [
            # NOTE(review): range(nb_loci - 1) leaves autosomal locus
            # nb_loci - 1 without an InitGenotype operator. Kept as in the
            # original; confirm whether the SNP file really holds nb_loci - 1
            # rows or whether this is an off-by-one.
            simuPOP.InitGenotype(subPops=sub_population_names[i],
                                 freq=[snp[n][i], 1 - snp[n][i]],
                                 loci=[n])
            for i in range(sub_population_count)
            for n in range(nb_loci - 1)
        ],
        # increase age by 1
        preOps=[simuPOP.InfoExec('age += 1')],
        matingScheme=simuPOP.HeteroMating(
            [
                # subPops is a list of tuples that will participate in mating.
                # The tuple is a pair (subPopulation, virtualSubPopulation).
                # First, we propagate (clone) everybody in VSP 6
                # ('Not dead yet') of every subpopulation to the next
                # generation (weight=-1).
                simuPOP.CloneMating(
                    ops=[simuPOP.CloneGenoTransmitter(chroms=[0, 1])],
                    subPops=[(sub_population, 6)
                             for sub_population in range(sub_population_count)],
                    weight=-1),
                # Then we simulate random mating only in VSP 7 ('Active', ie
                # reproductively mature individuals) within each
                # subpopulation (breeding/winter grounds)
                simuPOP.RandomMating(
                    ops=[
                        simuPOP.MitochondrialGenoTransmitter(),
                        simuPOP.MendelianGenoTransmitter(),
                        simuPOP.IdTagger(),
                        simuPOP.InheritTagger(mode=simuPOP.MATERNAL,
                                              infoFields=['feeding_ground']),
                        simuPOP.InheritTagger(
                            mode=simuPOP.MATERNAL,
                            infoFields=['native_breeding_ground']),
                        simuPOP.PedigreeTagger()
                    ],
                    subPops=[(sub_population, 7)
                             for sub_population in range(sub_population_count)],
                    weight=1)
            ],
            subPopSize=configure_new_population_size),
        postOps=[
            # Determine the isotopic ratios in individuals
            simuPOP.PyOperator(func=postop_processing),
            simuPOP.Migrator(mode=simuPOP.BY_IND_INFO),
            # Every 10 generations, calculate and print the Fst.
            # ELC: it is a calculation that partitions variance among and
            # between populations, and can be calculated as a global statistic
            # or on a pairwise basis. We use it as an indication of genetic
            # differentiation.
            simuPOP.Stat(structure=range(1),
                         subPops=sub_population_names,
                         suffix='_AB',
                         step=10),
            simuPOP.PyEval(r"'Fst=%.3f \n' % (F_st_AB)", step=10)
        ],
        # Use the caller-supplied generation count. The original passed the
        # undefined name `years` here and ignored the `gen` parameter.
        gen=gen)

    # NOTE(review): the Pedigree object is not used afterwards; the
    # construction is kept in case it is relied upon as a sanity check.
    ped = simuPOP.Pedigree(pop)
    print("This is the pedigree stuff")
    simuPOP.dump(pop)

    # Now sample the individuals
    sample = drawRandomSample(pop, sizes=[sample_count] * sub_population_count)

    # Print out the allele frequency data: one line per polymorphic autosomal
    # locus, consisting of a running index followed by the allele frequencies.
    simuPOP.stat(sample, alleleFreq=simuPOP.ALL_AVAIL)
    frequencies = sample.dvars().alleleFreq
    with open('freq.txt', 'w') as freqfile:
        index = 0
        for locus in frequencies:
            locus_frequencies = frequencies[locus]
            # Skip the mitochondrial locus and loci with a single allele
            if locus == nb_loci or len(locus_frequencies) < 2:
                continue
            print(index, end=' ', file=freqfile)
            index += 1
            for allele in locus_frequencies:
                print(locus_frequencies[allele], end=' ', file=freqfile)
            print(file=freqfile)

    # We want to remove monoallelic loci. This means a position in the
    # genotype for which all individuals have the same value in both alleles.
    # To implement this we build up the set of loci that get ignored when we
    # dump out the file. Generally speaking, if we add all the values up then
    # either they will sum to 0 (if all individuals have type 0) or to the
    # number of individuals * 2 (if all individuals have type 1)
    loci_per_copy = nb_loci + 1
    geno_sum = [0] * loci_per_copy * 2
    for individual in sample.individuals():
        geno_sum = [
            total + allele
            for total, allele in zip(geno_sum, individual.genotype())
        ]
    # Fold the two homologous copies together so there is one total per locus
    final_sum = [
        first + second
        for first, second in zip(geno_sum[:loci_per_copy],
                                 geno_sum[loci_per_copy:])
    ]
    all_ones_total = sample_count * sub_population_count * 2
    # A set gives O(1) membership tests in the per-individual loop below
    monoallelic_loci = {
        i for i in range(nb_loci) if final_sum[i] in (0, all_ones_total)
    }
    nb_ignored_loci = len(monoallelic_loci)

    # Generate the two files
    with open('mixfile.txt', 'w') as mixfile, \
            open('haploiso.txt', 'w') as haplofile:
        print(sub_population_count, nb_loci - nb_ignored_loci, 2, 1,
              file=mixfile)
        print("sex, haplotype, iso1, iso2, native_ground", file=haplofile)
        for i in range(nb_loci - nb_ignored_loci):
            print('Loc', i + 1, sep='_', file=mixfile)
        for individual in sample.individuals():
            genotype = individual.genotype()
            # NOTE(review): the header above names five columns but only four
            # values are written per individual (native ground is omitted).
            print(1 if individual.sex() == 1 else 0,
                  genotype[nb_loci],
                  individual.info('carbon'),
                  individual.info('nitrogen'),
                  file=haplofile,
                  sep=' ')
            print(int(individual.info('native_breeding_ground') + 1),
                  end=' ',
                  file=mixfile)
            for i in range(nb_loci):
                if i not in monoallelic_loci:
                    # Both copies of the locus, shifted so alleles are 1-based
                    print(genotype[i] + 1,
                          genotype[i + loci_per_copy] + 1,
                          ' ',
                          end='',
                          sep='',
                          file=mixfile)
            print(file=mixfile)
    return sample
def simulateBySimuPOP():
    '''
    Simulate offspring genomes from the ancestral SNP genomes in Toxo20.txt.

    The ancestors are loaded into a simuPOP population, which is then put
    through `gen` alternating rounds of clonal expansion (with mutation) and
    random mating (with recombination). A random subset of the final
    generation is combined with the ancestral genomes and written to
    SimulatedToxo.txt; a parent index guide goes to parents.txt and the
    pedigree tagger appends to pedigree.txt. All paths live under the
    hard-coded `directory`.

    Takes no arguments and returns None.

    Relies on names defined elsewhere in the module: sp, snp, cns, rand,
    parseSNPInput, createRateMatrix, addNames and generateID.
    '''
    #starting variables
    directory = '/data/new/javi/toxo/simulations4/'
    input_path = os.path.join(directory, 'Toxo20.txt')
    output_path = os.path.join(directory, 'SimulatedToxo.txt')
    parents_path = os.path.join(directory, 'parents.txt')
    pedigree_path = os.path.join(directory, 'pedigree.txt')
    number_of_ancestors = 3
    expansion_pop_size = 15
    offsprings_sampled = number_of_ancestors + expansion_pop_size
    gen = 3
    translate_mode = 'toxoplasma'
    structure_mode = 'simupop'

    #parsing input
    init_info = parseSNPInput(input_path, number_of_ancestors)
    ancestral_genomes = init_info[0]
    # Materialise the keys: a dict view has no positional access and we need
    # a stable, indexable ordering of the ancestors.
    ancestor_names = list(ancestral_genomes.keys())
    loci_positions = init_info[1]
    chromosome_names = sorted(
        loci_positions.keys(),
        key=lambda name: cns.getValue(name, translate_mode))
    list_of_loci = [len(loci_positions[name]) for name in chromosome_names]
    # Positions of all loci, chromosome after chromosome
    lociPos = [
        position for name in chromosome_names
        for position in loci_positions[name]
    ]

    sp.turnOnDebug(code="DBG_GENERAL")

    #initializing
    print('Initializaing Population')
    population = sp.Population(
        size=[number_of_ancestors],
        loci=list_of_loci,
        ancGen=5,
        lociPos=lociPos,
        chromNames=chromosome_names,
        lociNames=[],
        alleleNames=['A', 'T', 'G', 'C'],
        infoFields=['name', 'ind_id', 'father_id', 'mother_id'])

    # Pair each individual with one ancestor. The ancestor's position in
    # ancestor_names is stored both as its 'name' and as its 'ind_id'.
    for ancestor_index, (individual, sample) in enumerate(
            zip(population.individuals(), ancestor_names)):
        individual.setInfo(ancestor_index, 'name')
        individual.setInfo(ancestor_index, 'ind_id')
        for chrom_index, chrom_name in enumerate(chromosome_names):
            individual.setGenotype(ancestral_genomes[sample][chrom_name],
                                   chroms=[chrom_index])

    #Alternating rounds of recombination with clonal expansion. Clonal expansion gives + 2.
    #Mutation prior to each round
    simulator = sp.Simulator(population)
    rate_matrix = createRateMatrix(len(ancestor_names), 0.0002)  #10,000 times the mutation rate.
    id_tagger = sp.IdTagger()
    ped_tagger = sp.PedigreeTagger(output='>>' + pedigree_path,
                                   outputFields=['name', 'ind_id'])
    inherit_tagger = sp.InheritTagger(infoFields='name')

    initOps1 = [sp.PyExec('print("Starting random selection")'), ped_tagger]
    initOps2 = [sp.PyExec('print("Starting random mating")'), ped_tagger]
    preOps1 = [sp.MatrixMutator(rate=rate_matrix)]
    preOps2 = [sp.InitSex(sex=[sp.MALE, sp.FEMALE])]

    matingScheme1 = sp.RandomSelection(
        ops=[sp.CloneGenoTransmitter(), inherit_tagger, id_tagger, ped_tagger],
        subPopSize=expansion_pop_size)
    matingScheme2 = sp.RandomMating(
        ops=[
            sp.Recombinator(
                intensity=0.01 / 105000,
                convMode=(sp.GEOMETRIC_DISTRIBUTION, 0.001, 0.01)),  #10x normal
            sp.PyTagger(func=addNames),
            id_tagger,
            ped_tagger
        ],
        subPopSize=expansion_pop_size)

    postOps = []
    finalOps = []

    print('Starting Evolution Cycles.')

    # The pedigree tagger appends ('>>'), so remove any file left over from a
    # previous run. Best effort: a missing or unremovable file is not fatal.
    try:
        os.remove(pedigree_path)
    except OSError:
        pass

    simulator.evolve(
        initOps=[id_tagger, ped_tagger],
        matingScheme=sp.CloneMating(ops=[
            sp.CloneGenoTransmitter(), ped_tagger, id_tagger, inherit_tagger
        ]),
        gen=1)

    for _ in range(gen):
        # clonal expansion, preceded by mutation
        simulator.evolve(initOps=initOps1,
                         preOps=preOps1,
                         matingScheme=matingScheme1,
                         postOps=postOps,
                         finalOps=finalOps,
                         gen=1)
        # sexual round with recombination
        simulator.evolve(initOps=initOps2,
                         preOps=preOps2,
                         matingScheme=matingScheme2,
                         postOps=postOps,
                         finalOps=finalOps,
                         gen=1)

    final_population = simulator.population(0)
    # Label -> ind_id for every individual of the final generation. The label
    # is built from the 'name' info field, generateID(3) and the ind_id.
    offsprings = {
        ''.join([
            str(int(individual.info('name'))),
            generateID(3),
            str(int(individual.info('ind_id')))
        ]): individual.info('ind_id')
        for individual in final_population.individuals()
    }
    # random.sample needs a real sequence and raises ValueError when asked
    # for more items than exist, so pass a list and cap the request at the
    # number of offspring actually available.
    sampled_ones = rand.sample(list(offsprings),
                               min(offsprings_sampled, len(offsprings)))

    #reorganizes the offspring genome. Extract info by chr.
    offspring_genomes = {name: {} for name in sampled_ones}
    for name in sampled_ones:
        # Look the individual up once, then read every chromosome from it
        offspring = final_population.indByID(offsprings[name],
                                             idField='ind_id')
        for chrom_index, chrom_name in enumerate(chromosome_names):
            offspring_genomes[name][chrom_name] = offspring.genotype(
                ploidy=0, chroms=[chrom_index])

    offspring_genomes.update(ancestral_genomes)

    print('Parent Guide:')
    for ancestor_index, ancestor in enumerate(ancestor_names):
        print(" : ".join([str(ancestor_index), str(ancestor)]))

    print('Complete. Generating Output.')

    with open(parents_path, 'w') as parent_output:
        parent_output.write('Parent Guide:\n')
        for ancestor_index, ancestor in enumerate(ancestor_names):
            parent_output.write(
                " : ".join([str(ancestor_index), str(ancestor)]) + '\n')

    #output
    offspring_genomes = snp.restructure((offspring_genomes, loci_positions),
                                        structure_mode)
    snp.outputGriggFormat(offspring_genomes, output_path)
    print('Simulation Complete.')