def test_decode_errors(self):
    """Check that each ``pyslim.decode_*`` function rejects invalid input.

    Every decoder is called first with a bare float and then with a plain
    list of numbers; each call must raise ``ValueError``.
    """
    # (decoder, invalid inputs), listed in the order the checks run.
    rejected = (
        (pyslim.decode_mutation, (2.0, [2.0, 3.0])),
        (pyslim.decode_node, (2.0, [1, 2])),
        (pyslim.decode_individual, (3.0, [1, 2])),
        (pyslim.decode_population, (1.0, [2, 3])),
    )
    for decoder, bad_inputs in rejected:
        for bad in bad_inputs:
            with self.assertRaises(ValueError):
                decoder(bad)
def test_mutation_derived_info(self):
    """Check that exposed mutation metadata matches a manual decode.

    For every mutation of every SLiM example, the metadata is sliced out of
    the mutation table with the ``metadata_offset`` column, decoded with
    ``pyslim.decode_mutation`` and compared with both ``mut.metadata`` and
    ``ts.mutation(j).metadata``.
    """
    for ts in self.get_slim_examples():
        # Read the mutation table once per tree sequence instead of going
        # through ``ts.tables`` three times for every single mutation.
        mutation_table = ts.tables.mutations
        offsets = mutation_table.metadata_offset
        packed = mutation_table.metadata
        for j, mut in enumerate(ts.mutations()):
            raw_md = packed[offsets[j]:offsets[j + 1]]
            md = pyslim.decode_mutation(raw_md)
            self.assertEqual(mut.metadata, md)
            self.assertEqual(ts.mutation(j).metadata, md)
def verify_mutation_decoding(self, t):
    """Compare the metadata of ``t`` with the output of the legacy decoder.

    A copy of ``t`` whose metadata schema is replaced by
    ``tskit.MetadataSchema(None)`` is walked in step with ``t``. The copy's
    metadata is passed to ``pyslim.decode_mutation``, which must emit a
    ``DeprecationWarning``; the decoded entries, converted with
    ``asdict()``, must equal the ``mutation_list`` stored in ``t``.
    """
    null_schema = tskit.MetadataSchema(None)
    schemaless = t.copy()
    schemaless.metadata_schema = null_schema
    for row, schemaless_row in zip(t, schemaless):
        expected = row.metadata
        with self.assertWarns(DeprecationWarning):
            decoded = pyslim.decode_mutation(schemaless_row.metadata)
        as_dicts = [entry.asdict() for entry in decoded]
        self.assertEqual(expected, {"mutation_list": as_dicts})
def test_legacy_errors(self):
    """Check that the legacy encode/decode functions reject default metadata.

    Each ``decode_*`` and ``encode_*`` function is called with the value
    returned by ``pyslim.default_slim_metadata`` for the matching table and
    must raise a ``ValueError`` whose message matches "legacy".
    """
    defaults = pyslim.default_slim_metadata
    # (legacy function, table name), listed in the order the checks run.
    cases = (
        (pyslim.decode_mutation, 'mutation'),
        (pyslim.decode_population, 'population'),
        (pyslim.decode_individual, 'individual'),
        (pyslim.decode_node, 'node'),
        (pyslim.encode_mutation, 'mutation'),
        (pyslim.encode_population, 'population'),
        (pyslim.encode_individual, 'individual'),
        (pyslim.encode_node, 'node'),
    )
    for legacy_fn, table_name in cases:
        with self.assertRaisesRegex(ValueError, "legacy"):
            legacy_fn(defaults(table_name))
def test_decode_already_mutation(self):
    """Decoding already-decoded mutation metadata returns an equal list.

    Four ``MutationMetadata`` objects that differ only in ``population``
    are passed to ``pyslim.decode_mutation``; the result must be of type
    ``list`` and compare equal element by element.
    """
    originals = []
    for pop in range(4):
        originals.append(
            pyslim.MutationMetadata(
                mutation_type=0,
                selection_coeff=0.2,
                population=pop,
                slim_time=130,
                nucleotide=2,
            )
        )
    decoded = pyslim.decode_mutation(originals)
    self.assertEqual(type(decoded), list)
    for before, after in zip(originals, decoded):
        self.assertEqual(before, after)
def test_decode_already_mutation(self):
    """Decoding already-decoded mutation metadata warns and returns it.

    Four ``MutationMetadata`` objects that differ only in ``population``
    are passed to ``pyslim.decode_mutation``, which must emit a
    ``FutureWarning``; the result must be a list that compares equal to
    the input element by element.
    """
    originals = []
    for pop in range(4):
        originals.append(
            pyslim.MutationMetadata(
                mutation_type=0,
                selection_coeff=0.2,
                population=pop,
                slim_time=130,
                nucleotide=2,
            )
        )
    with self.assertWarns(FutureWarning):
        decoded = pyslim.decode_mutation(originals)
    self.assertIsInstance(decoded, list)
    for before, after in zip(originals, decoded):
        self.assertEqual(before, after)
def test_mutation_metadata(self):
    """Round-trip mutation metadata lists through encode and decode.

    Lists of 0, 1 and 5 ``MutationMetadata`` objects are encoded with
    ``pyslim.encode_mutation`` and decoded again; the decoded list must
    have the same length and equal elements.
    """
    for how_many in (0, 1, 5):
        originals = []
        for j in range(how_many):
            originals.append(
                pyslim.MutationMetadata(
                    mutation_type=j,
                    selection_coeff=0.5,
                    population=j,
                    slim_time=10 + j,
                    nucleotide=(j % 5) - 1,
                )
            )
        encoded = pyslim.encode_mutation(originals)
        round_tripped = pyslim.decode_mutation(encoded)
        self.assertEqual(len(originals), len(round_tripped))
        for before, after in zip(originals, round_tripped):
            self.assertEqual(before, after)
def test_annotate_mutations(self):
    """Re-annotating mutations with their own metadata leaves tables equal.

    For each SLiM example every packed mutation metadata entry is decoded
    and re-encoded (the re-encoded value must equal the original), the
    decoded entries are written back into a second ``ts.tables`` object
    with ``pyslim.annotate_mutation_metadata``, and the two table
    collections must compare equal.
    """
    for ts in self.get_slim_examples():
        reference = ts.tables
        rewritten = ts.tables
        packed_entries = tskit.unpack_bytes(
            reference.mutations.metadata,
            reference.mutations.metadata_offset,
        )
        decoded_entries = []
        for raw in packed_entries:
            decoded = pyslim.decode_mutation(raw)
            re_encoded = pyslim.encode_mutation(decoded)
            self.assertEqual(raw, re_encoded)
            decoded_entries.append(decoded)
        pyslim.annotate_mutation_metadata(rewritten, decoded_entries)
        self.assertEqual(reference, rewritten)
def test_annotate_mutations(self):
    """Overwrite selection coefficients by annotation and read them back.

    Each msprime example is given default SLiM annotations, the extracted
    mutation metadata receives random selection coefficients, the tables
    are re-annotated and reloaded, and the decoded metadata of every
    mutation must report the coefficient that was assigned to it.
    """
    for ts in get_msprime_examples():
        annotated = pyslim.annotate_defaults(
            ts, model_type="nonWF", slim_generation=1)
        tables = annotated.tables
        records = list(pyslim.extract_mutation_metadata(tables))
        self.assertEqual(len(records), annotated.num_mutations)
        # Draw all coefficients first, then assign them in a second pass.
        drawn = [random.uniform(0, 1) for _ in records]
        for record, coeff in zip(records, drawn):
            record.selection_coeff = coeff
        pyslim.annotate_mutation_metadata(tables, records)
        reloaded = pyslim.load_tables(tables)
        for idx, mutation in enumerate(reloaded.mutations()):
            decoded = pyslim.decode_mutation(mutation.metadata)
            self.assertEqual(decoded.selection_coeff, drawn[idx])
def main():
    """Run a genotype-environment association (GEA) scan on a tree sequence.

    Driven entirely by command-line arguments. The function:

    1. loads the tree sequence given by ``--trees`` and, when ``--nPops`` is
       non-zero, down-samples it with ``getSample``;
    2. reads one environment value per line from ``--environments``, or from
       ``--optima`` when no environment file is given;
    3. groups the individuals alive at time 0 by population (population 999
       is skipped);
    4. keeps a +/-0.01 window around each retained variant, overlays neutral
       mutations with ``msprime.mutate`` and writes one CSV row per
       biallelic, non-fixed site to ``<output>.csv``.

    Raises:
        SystemExit: if a genotype code of 2 is found at a site that passed
            the biallelic filter.
    """
    # Local import so this function does not depend on the file's import block.
    from contextlib import ExitStack

    ## Define command line args
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--trees", required=True, dest="trees", type=str,
                        help="Coalescent trees for the simulation")
    parser.add_argument("--optima", required=True, dest="optima", type=str,
                        help="The file of phenotypic optima for the simulation")
    parser.add_argument("--output", required=True, dest="output", type=str,
                        help="What name do you want to give to the output file? [DON'T GIVE FILE EXTENSION, THE SCRIPT MAKES TWO OUTPUT FILES")
    parser.add_argument("--nPops", required=False, dest="nPops", type=int,
                        help="The number of subPops to downsample to",
                        default=0)
    parser.add_argument("--nInds", required=False, dest="nInds", type=int,
                        help="The number of individuals to grab from each subPop",
                        default=0)
    parser.add_argument("--bayPass", required=False, dest="bayPass",
                        action="store_true",
                        help="Do you want to spit out a BayPass config?")
    parser.add_argument("--directional", required=False, dest="directional",
                        action="store_true",
                        help="Are these simulations modelling directional selection?")
    parser.add_argument("--intervals", required=False, dest="intervals",
                        type=int,
                        help="The width of the intervals you want to analyse",
                        default=0)
    # NOTE(review): this help text duplicates --directional; confirm what
    # --s is meant to control before rewording it.
    parser.add_argument("--s", required=False, dest="s", type=int,
                        help="Are these simulations modelling directional selection?")
    parser.add_argument("--environments", required=False, dest="environments",
                        type=str,
                        help="The file with environments in it - if not given, optima will be used")
    args = parser.parse_args()

    print("analysing", args.trees)

    ## Read in the tree sequence file (down-sampled when --nPops is given)
    ts = pyslim.load(args.trees)
    if args.nPops != 0:
        ts = getSample(ts, args.nPops, args.nInds)

    ## Map line number -> environment value. The optima file doubles as the
    ## environment file when --environments is not supplied. Only the
    ## environment values are used below, so nothing else is read.
    env_path = args.optima if args.environments is None else args.environments
    with open(env_path) as env_file:
        enviDict = {row: float(line.strip())
                    for row, line in enumerate(env_file)}

    ## Group the individuals alive at time 0 by population
    subPopDict = {}
    count = 0
    for i in ts.individuals_alive_at(0):
        count += 1
        deme = ts.individual(i).population
        if deme == 999:
            continue
        subPopDict.setdefault(deme, []).append(i)
    print(count, "individuals")

    envi_pops = [enviDict[i] for i in sorted(subPopDict.keys())]
    print(envi_pops)

    ## Keep a narrow window around each variant that passes the count filter.
    ## With the default --nPops/--nInds of 0 the denominator is zero; the
    ## filter is then skipped, which keeps every variant exactly as the
    ## former inf/nan comparison did, but without a division warning.
    n_sampled = args.nPops * args.nInds
    to_keep = []
    for v in ts.variants():
        if n_sampled and v.genotypes.sum() / n_sampled < 0.3:
            continue
        to_keep.append([v.position - 0.01, v.position + 0.01])
    ts = ts.keep_intervals(np.array(to_keep))

    ## Sanity check: there should be 2 * sites + 1 trees after trimming
    if ts.num_sites * 2 + 1 != ts.num_trees:
        print("WTF, this is error 1")

    ## Get the sample size
    N = ts.get_sample_size()

    nPops = len(subPopDict)
    diploids_per_pop = len(next(iter(subPopDict.values())))
    print(diploids_per_pop, "diploid individuals per population")
    print(nPops, "populations")
    print(envi_pops, len(envi_pops))

    ## Set the mutation rate for neutral mutations
    mut_rate = 5  ## HARD CODED

    ## Sprinkle neutral mutations onto the coalescent tree
    print('Sprinkling mutations onto trees')
    sprinkled = msprime.mutate(ts, rate=mut_rate, keep=True,
                               random_seed=12345)

    print('extracting segregating sites from trees...')

    header = ["position",
              "gene",
              "selCoeff",
              "geno_spearman_correlation",
              "geno_spearman_correlation_2",
              "geno_spearman_pvalue",
              "geno_k_tau",
              "geno_k_tau_p_value",
              "pbar",
              "pbar_qbar",
              "maf",
              "LA"]

    count = 0
    # ExitStack closes every opened output file, also on errors.
    with ExitStack() as stack:
        outputDF = stack.enter_context(open(args.output + '.csv', 'w'))
        outputDF.write(",".join(header) + "\n")

        if args.bayPass:
            # The BayPass file is created but never written by this
            # function; the (empty) file is kept as an output.
            stack.enter_context(open(args.output + ".bayPass.txt", "w"))

        ## Iterating over all variants in the population we now perform the actual GEA
        for variant in sprinkled.variants():
            if variant.num_alleles > 2:
                continue

            md = pyslim.decode_mutation(variant.site.mutations[0].metadata)
            if len(md) > 0:
                selCoeff = md[0].selection_coeff
            else:
                selCoeff = 0

            count += 1
            if count % 1000 == 0:
                print('extracted', count, 'neutral sites')

            ## Calculating the genotype freqs this way assumes that the
            ## genotypes are coded as 0s and 1s only
            if 2 in variant.genotypes:
                print(variant)
                print(list(variant.genotypes))
                print("!!!!!!!!!!!!!!")
                # This used to call an undefined close() and die with a
                # NameError; abort explicitly instead.
                raise SystemExit(
                    "genotype code 2 found at a biallelic site; aborting")

            genotypes_as_freqs = (variant.genotypes[::2]
                                  + variant.genotypes[1::2]) / 2

            if variant.genotypes.sum() == N:
                continue  # Ignore fixed mutations

            freqs_per_pop = [i.mean() for i in
                             chunks(genotypes_as_freqs, int(diploids_per_pop))]

            pbar = sum(freqs_per_pop) / len(freqs_per_pop)
            LA = -99

            if pbar == 1:
                continue
            ## This can be triggered by mutation stacking and infinite sites funniness
            if pbar > 1:
                print(variant)
                print("WTF, what is going on here: pbar > 1")
                print(variant.genotypes)
                continue

            pbar_qbar = pbar * (1 - pbar)
            maf = min(pbar, 1 - pbar)

            if pbar_qbar < 0:
                print("WTF, what is going on here: pbar_qbar < 1")
                print(variant.genotypes)
                continue

            geno_spearman = scipy.stats.spearmanr(envi_pops, freqs_per_pop)
            geno_k_tau, geno_k_tau_p_value = scipy.stats.kendalltau(
                envi_pops, freqs_per_pop)

            # NOTE(review): this replaces the coefficient decoded from the
            # metadata above with a position-based constant; confirm which
            # of the two values is the intended one.
            if variant.position == int(variant.position):
                selCoeff = 0.003
            else:
                selCoeff = 0.0

            gene = int(round(variant.position / 10) * 10)

            outline = [variant.position,
                       gene,
                       selCoeff,
                       geno_spearman.correlation,
                       geno_spearman.correlation**2,
                       geno_spearman.pvalue,
                       geno_k_tau,
                       geno_k_tau_p_value,
                       pbar,
                       pbar_qbar,
                       maf,
                       LA]
            outputDF.write(",".join(map(str, outline)) + "\n")
tables = ts.dump_tables() x = [[], [], []] y = [[], [], []] c = [[], [], []] varit = ts.variants() colors = ['m', 'c', 'y'] ind_loc = np.array( [i.location[0] + np.random.uniform(-0.5, 0.5) for i in ts.individuals()]) phenotypes = np.array([0.0 for _ in ts.individuals()]) populations = np.array( [ts.node(ind.nodes[0]).population for ind in ts.individuals()]) for v in varit: mut_meta = pyslim.decode_mutation(v.site.mutations[0].metadata)[0] selectionCoeff = mut_meta.selection_coeff mutType = mut_meta.mutation_type color = None dominance = None if (mutType == 1 or mutType == 2): color = 'm' dominance = 'dom' elif (mutType == 4 or mutType == 5): color = 'c' dominance = 'rec' else: color = 'y' dominance = 'add' for ind in ts.individuals():
def main():
    """Run a genotype-environment association (GEA) scan gene by gene.

    Driven entirely by command-line arguments. The function:

    1. loads the tree sequence given by ``--trees`` and, when ``--nPops`` is
       non-zero, down-samples it with ``getSample``;
    2. reads one optimum per line from ``--optima`` and one environment per
       line from ``--environments`` (the optima are reused when no
       environment file is given);
    3. groups the individuals alive at time 0 by population (population 999
       is skipped) and, with ``--bayPass``, writes the per-population
       environments to ``<output>.bayPass.pc1``;
    4. determines the locally adapted genes from the full tree sequence with
       ``getPVE_stabilising`` or ``getPVE_Directional``;
    5. optionally trims each gene to a central window (``--intervals``),
       overlays neutral mutations with ``msprime.mutate`` and writes one row
       per biallelic, non-fixed site inside a gene to ``<output>.csv``
       (allele-frequency rows instead of GEA statistics with ``--demo``).

    Returns early, after printing a message, when ``--intervals 1`` is given
    or when ``findGene`` returns -99.

    Raises:
        SystemExit: if a genotype code of 2 is found at a site that passed
            the biallelic filter.
    """
    # Local import so this function does not depend on the file's import block.
    from contextlib import ExitStack

    def _read_values(path):
        """Return {line number: float value} for a one-value-per-line file."""
        with open(path) as handle:
            return {row: float(line.strip())
                    for row, line in enumerate(handle)}

    ## Define command line args
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--trees", required=True, dest="trees", type=str,
                        help="Coalescent trees for the simulation")
    parser.add_argument("--optima", required=True, dest="optima", type=str,
                        help="The file of phenotypic optima for the simulation")
    parser.add_argument("--output", required=True, dest="output", type=str,
                        help="What name do you want to give to the output file? [DON'T GIVE FILE EXTENSION, THE SCRIPT MAKES TWO OUTPUT FILES")
    parser.add_argument("--nPops", required=False, dest="nPops", type=int,
                        help="The number of subPops to downsample to",
                        default=0)
    parser.add_argument("--nInds", required=False, dest="nInds", type=int,
                        help="The number of individuals to grab from each subPop",
                        default=0)
    parser.add_argument("--bayPass", required=False, dest="bayPass",
                        action="store_true",
                        help="Do you want to spit out a BayPass config?")
    parser.add_argument("--directional", required=False, dest="directional",
                        action="store_true",
                        help="Are these simulations modelling directional selection?")
    parser.add_argument("--intervals", required=False, dest="intervals",
                        type=int,
                        help="The width of the intervals you want to analyse",
                        default=0)
    # NOTE(review): this help text duplicates --directional; confirm what
    # --s is meant to control before rewording it.
    parser.add_argument("--s", required=False, dest="s", type=int,
                        help="Are these simulations modelling directional selection?")
    parser.add_argument("--environments", required=False, dest="environments",
                        type=str,
                        help="The file with environments in it - if not given, optima will be used")
    parser.add_argument("--demo", required=False, dest="demo",
                        action="store_true",
                        help="Do you want to make a file with allele frequency info for demonstration purposes?")
    args = parser.parse_args()

    print("analysing", args.trees)

    ## Read in the tree sequence file; ts_raw is always the full data set
    ts_raw = pyslim.load(args.trees)
    if args.nPops == 0:
        ts = ts_raw
    else:
        ts = getSample(ts_raw, args.nPops, args.nInds)

    ## The simulations have a common genome structure,
    ## mimicking a group of species with syntenic genomes
    genome = genomeMaker()
    genome['names'] = np.array(
        ['gene' + str(i) for i in range(genome.shape[0])])

    ## Get the sample size
    N = ts.get_sample_size()

    ## Make dicts of the optima and of the (optional) environments
    optimaDict = _read_values(args.optima)
    if args.environments is None:
        enviDict = optimaDict.copy()
    else:
        enviDict = _read_values(args.environments)

    ## Group the individuals alive at time 0 by population
    subPopDict = {}
    count = 0
    for i in ts.individuals_alive_at(0):
        count += 1
        deme = ts.individual(i).population
        if deme == 999:
            continue
        subPopDict.setdefault(deme, []).append(i)
    print(count, "individuals")

    envi_pops = [enviDict[i] for i in sorted(subPopDict.keys())]
    print(envi_pops)

    nPops = len(subPopDict)
    diploids_per_pop = len(next(iter(subPopDict.values())))
    print(diploids_per_pop, "diploid individuals per population")
    print(nPops, "populations")
    print(envi_pops, len(envi_pops))

    if args.bayPass:
        with open(args.output + ".bayPass.pc1", "w") as bayPassEnvs:
            bayPassEnvs.write("".join(str(e) + " " for e in envi_pops))

    ## Figure out from the whole population, which genes are involved in
    ## local adaptation or not
    if not args.directional:
        adapGenes = getPVE_stabilising(ts_raw, optimaDict, genome)
    else:
        adapGenes = getPVE_Directional(ts_raw, diploids_per_pop, optimaDict,
                                       genome)
    print(adapGenes)

    ## Set the mutation rate for neutral mutations
    mut_rate = 1e-8  ## HARD CODED

    ## If you specify an interval, the following will cut down the tree to
    ## focus on sub-windows within each 10,000bp window.
    ## This is a way of getting low recombination results on the cheap
    if args.intervals != 0:
        if args.intervals == 1:
            print("I have not put in support for single base pair intervals")
            return
        intervalDiff = (10000 - args.intervals) / 2
        genome["interval_start"] = genome[0] + intervalDiff
        genome["interval_end"] = genome[1] - intervalDiff
        intervalArray = np.array(genome[["interval_start", "interval_end"]])
        print(genome)
        ts = ts.keep_intervals(intervalArray)
        ## Reset the mutation rate for neutral mutations to achieve the same
        ## net mutation rate
        mut_rate = 1e-8 * (10000 / args.intervals)  # HARD CODED, SO CHANGE IF NECESSARY

    ## Sprinkle neutral mutations onto the coalescent tree
    print('Sprinkling mutations onto trees')
    sprinkled = msprime.mutate(ts, rate=mut_rate, keep=True,
                               random_seed=12345)

    print('extracting segregating sites from trees...')

    count = 0
    # ExitStack closes every opened output file, also on the early return
    # inside the loop and on errors.
    with ExitStack() as stack:
        outputDF = stack.enter_context(open(args.output + '.csv', 'w'))

        if args.demo:
            print("I'm not performing GEA analysis, I'm make an allele frequency SNP table for demonstration purposes")
            with open("environments." + args.output + ".csv", "w") as envOutput:
                envOutput.write(",".join(map(str, envi_pops)) + "\n")
        else:
            print("Not demo!")
            header = ["position",
                      "gene",
                      "selCoeff",
                      "geno_spearman_correlation",
                      "geno_spearman_correlation_2",
                      "geno_spearman_pvalue",
                      "geno_k_tau",
                      "geno_k_tau_p_value",
                      "pbar",
                      "pbar_qbar",
                      "maf",
                      "LA"]
            outputDF.write(",".join(header) + "\n")

        bayPassConfig = None
        if args.bayPass:
            bayPassConfig = stack.enter_context(
                open(args.output + ".bayPass.txt", "w"))

        ## Iterating over all variants in the population we now perform the actual GEA
        for variant in sprinkled.variants():
            if variant.num_alleles > 2:
                continue

            gene = findGene(variant.position, genome)
            if not gene:
                continue
            elif gene == -99:
                print(gene)
                print('FAILED')
                return

            md = pyslim.decode_mutation(variant.site.mutations[0].metadata)
            if len(md) > 0:
                selCoeff = md[0].selection_coeff
            else:
                selCoeff = 0

            count += 1
            if count % 1000 == 0:
                print('extracted', count, 'neutral sites')

            ## Calculating the genotype freqs this way assumes that the
            ## genotypes are coded as 0s and 1s only
            if 2 in variant.genotypes:
                print(variant)
                print(list(variant.genotypes))
                print("!!!!!!!!!!!!!!")
                # This used to call an undefined close() and die with a
                # NameError; abort explicitly instead.
                raise SystemExit(
                    "genotype code 2 found at a biallelic site; aborting")

            genotypes_as_freqs = (variant.genotypes[::2]
                                  + variant.genotypes[1::2]) / 2

            if variant.genotypes.sum() == N:
                continue  # Ignore fixed mutations

            freqs_per_pop = [i.mean() for i in
                             chunks(genotypes_as_freqs, int(diploids_per_pop))]

            if bayPassConfig is not None:
                counts_per_pop = [int(i.sum() * 2) for i in
                                  chunks(genotypes_as_freqs,
                                         int(diploids_per_pop))]
                # One row per site: " alt ref alt ref ... \n"
                bayPassConfig.write(
                    " "
                    + "".join(str(c) + " " + str(diploids_per_pop * 2 - c) + " "
                              for c in counts_per_pop)
                    + "\n")

            LA = adapGenes.get(gene[0], 0)

            pbar = sum(freqs_per_pop) / len(freqs_per_pop)

            if pbar == 1:
                continue
            ## This can be triggered by mutation stacking and infinite sites funniness
            if pbar > 1:
                print(variant)
                print("WTF, what is going on here: pbar > 1")
                print(variant.genotypes)
                continue

            pbar_qbar = pbar * (1 - pbar)
            maf = min(pbar, 1 - pbar)

            if pbar_qbar < 0:
                print("WTF, what is going on here: pbar_qbar < 1")
                print(variant.genotypes)
                continue

            if args.demo:
                outline = ["chr" + str(1 + math.floor(variant.position / 2000000)),
                           int(variant.position),
                           gene[0]] + freqs_per_pop
                outputDF.write(",".join(map(str, outline)) + "\n")
                continue

            geno_spearman = scipy.stats.spearmanr(envi_pops, freqs_per_pop)
            geno_k_tau, geno_k_tau_p_value = scipy.stats.kendalltau(
                envi_pops, freqs_per_pop)

            outline = [variant.position,
                       gene[0],
                       selCoeff,
                       geno_spearman.correlation,
                       geno_spearman.correlation**2,
                       geno_spearman.pvalue,
                       geno_k_tau,
                       geno_k_tau_p_value,
                       pbar,
                       pbar_qbar,
                       maf,
                       LA]
            outputDF.write(",".join(map(str, outline)) + "\n")
import pyslim
import msprime

# Load the example tree sequence and show its tables.
ts = pyslim.load("simple.trees")
tables = ts.tables
print(tables)

# mutations: each packed metadata entry must survive decode -> encode
# unchanged; the decoded entries are then written back into the tables.
mut_metadata = []
packed_mutations = msprime.unpack_bytes(tables.mutations.metadata,
                                        tables.mutations.metadata_offset)
for raw in packed_mutations:
    decoded = pyslim.decode_mutation(raw)
    re_encoded = pyslim.encode_mutation(decoded)
    assert raw == re_encoded
    mut_metadata.append(decoded)
pyslim.annotate_mutations(tables, mut_metadata)

# nodes: the same round trip for the node metadata.
node_metadata = []
packed_nodes = msprime.unpack_bytes(tables.nodes.metadata,
                                    tables.nodes.metadata_offset)
for raw in packed_nodes:
    decoded = pyslim.decode_node(raw)
    re_encoded = pyslim.encode_node(decoded)
    assert raw == re_encoded
    node_metadata.append(decoded)
pyslim.annotate_nodes(tables, node_metadata)
def main():
    """Run an individual-level GEA scan and write CSV and BayPass outputs.

    Driven entirely by command-line arguments. The function:

    1. loads the tree sequence given by ``--trees`` and reads one integer
       environment per line from ``--optima``;
    2. groups the individuals alive at time 0 by population and records the
       environment of every individual;
    3. collects the segregating sites already present in the tree sequence
       (marking their genes as locally adapted), then overlays neutral
       mutations with ``msprime.mutate`` and collects those sites too;
    4. computes Spearman and Kendall correlations between the per-individual
       environments and allele counts, and writes ``<output>.csv`` and
       ``<output>.bayPass.txt``.

    Returns early, after printing a message, when the number of populations
    differs from the number of optima or when ``findGene`` returns -99.
    """
    ## Define command line args
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--trees", required=True, dest="trees", type=str,
                        help="Coalescent trees for the simulation")
    parser.add_argument("--optima", required=True, dest="optima", type=str,
                        help="The file of phenotypic optima for the simulation")
    parser.add_argument("--output", required=True, dest="output", type=str,
                        help="What name do you want to give to the output file? [DON'T GIVE FILE EXTENSION, THE SCRIPT MAKES TWO OUTPUT FILES")
    args = parser.parse_args()

    ts = pyslim.load(args.trees)
    N = ts.get_sample_size()
    alive = ts.individuals_alive_at(0)

    ## Make a dict of the environments: line number -> optimum
    with open(args.optima) as optima_file:
        enviDict = {row: int(line.strip())
                    for row, line in enumerate(optima_file)}

    ## Group the individuals by deme and record each individual's environment
    subPopDict = {}
    envi = []
    for a in alive:
        deme = ts.individual(a).population
        # setdefault replaces a bare ``except:`` that could hide any error.
        subPopDict.setdefault(deme, []).append(a)
        envi.append(enviDict[deme])

    if len(enviDict) != len(subPopDict):
        print("Uneven numbers of subpopulations in the simulation and the optima file that was given")
        return

    nPops = len(enviDict)
    diploids_per_pop = len(subPopDict[0])
    print(diploids_per_pop, "diploid individuals per population")
    print(nPops, "populations")

    ## The simulations have a common genome structure,
    ## mimicking a group of species with syntenic genomes
    genome = genomeMaker()
    genome['names'] = np.array(
        ['gene' + str(i) for i in range(genome.shape[0])])

    chunk_size = int(diploids_per_pop / 2)

    def _allele_counts(dosages):
        """Return [A count, a count, ...] for successive chunks of dosages."""
        counts = []
        for g in chunks(dosages, chunk_size):
            alt = int(g.sum())  # The number of a alleles
            counts.append(len(g) * 2 - alt)  # The number of A alleles
            counts.append(alt)
        if sum(counts) != len(dosages) * 2:
            print('something went haywire with the genotype counts')
        return counts

    ## position -> [per-individual allele counts, s, per-chunk allele counts]
    variants = {}

    ## Get the allele frequencies for all segregating sites at QTL
    print('extracting segregating sites from trees...')

    # Genes that carry mutations affecting the phenotype
    adapGenes = {}

    ## Iterate over all variants present on the tree from SLiM
    for vs in ts.variants():
        # Sum the two genomes of each individual into one count
        alleleFreqs = vs.genotypes[::2] + vs.genotypes[1::2]
        total = vs.genotypes.sum()
        if total == N or total == 0:
            continue  # Remove fixed mutations
        gene = findGene(vs.position, genome)
        if len(vs.site.mutations) > 1:
            print('This site has more than a single variant:', vs.position)
        md = pyslim.decode_mutation(vs.site.mutations[0].metadata)
        selCoeff = md[0].selection_coeff

        # The per-individual counts add up to the same total as the
        # genotypes, so the total is reused here.
        if total == N / 2:
            continue
        adapGenes[gene[0]] = 1

        variants[vs.position] = [alleleFreqs, selCoeff,
                                 _allele_counts(alleleFreqs)]

    print(len(variants), 'segregating sites affecting QTL')

    ## Set the mutation rate for neutral mutations
    mut_rate = 5e-9  # HARD CODED, SO CHANGE IF NECESSARY

    ## Sprinkle mutations onto the coalescent tree
    print('Sprinkling mutations onto trees')
    sprinkled = msprime.mutate(ts, rate=mut_rate, keep=True)

    ## Iterate over all polymorphisms in the sprinkled tree sequence
    count = 0
    for variant in sprinkled.variants():
        count += 1
        if count % 1000 == 0:
            print('extracted', count, 'neutral sites')
        # Positions already recorded from the SLiM variants are flagged -99
        s = -99 if variant.position in variants else 0.0
        alleleFreqs = variant.genotypes[::2] + variant.genotypes[1::2]
        if variant.genotypes.sum() == N:
            continue  # Remove fixed mutations
        variants[variant.position] = [alleleFreqs, s,
                                      _allele_counts(alleleFreqs)]

    ## For each variant, identify the gene it is within,
    ## calculate the Spearman's Rho as well as pq
    ## These data are then collated and made into a dataFrame
    print(count, "neutral segregating sites")
    print('Performing GEA on the resulting data')

    columns = ['pos', 'gene', 's', 'spearman_rho', 'spearman_rho2',
               'spearman_rho_pval', 'kendall_tau', 'kendall_tau_pval',
               'pbar', 'pbar_qbar', 'maf', 'LA']
    data = []
    scanners = []
    for v, record in variants.items():
        s = record[1]
        alleleFreqs = np.array(record[0])
        gene = findGene(v, genome)
        if not gene:
            continue
        elif gene == -99:
            print(gene)
            print('FAILED')
            return

        LA = adapGenes.get(gene[0], 0)

        pbar = alleleFreqs.sum() / (len(alleleFreqs) * 2)
        if pbar == 1:
            continue
        if pbar > 1:
            print("WTF, what is going on here: pbar > 1")
            print(alleleFreqs)
            continue

        pbar_qbar = pbar * (1 - pbar)
        maf = min(pbar, 1 - pbar)

        if pbar_qbar < 0:
            print("WTF, what is going on here: pbar_qbar < 1")
            print(alleleFreqs)
            continue

        spearman = scipy.stats.spearmanr(envi, alleleFreqs)
        k_tau, k_tau_p_value = scipy.stats.kendalltau(envi, alleleFreqs)
        if spearman.correlation == 0:
            continue

        data.append({'pos': v,
                     'gene': gene[0],
                     's': s,
                     'spearman_rho': spearman.correlation,
                     'spearman_rho2': spearman.correlation**2,
                     'spearman_rho_pval': spearman.pvalue,
                     'kendall_tau': k_tau,
                     'kendall_tau_pval': k_tau_p_value,
                     'pbar': pbar,
                     'pbar_qbar': pbar_qbar,
                     'maf': maf,
                     'LA': LA})
        scanners.append(v)

    ## convert the GEA data points into a summary CSV. Naming the columns
    ## keeps sort_values working (header-only file) when no site was kept.
    pd.DataFrame(data, columns=columns).sort_values(by=['pos']).to_csv(
        args.output + '.csv', index=False)

    ## Make a BayScan input file from the per-chunk allele counts
    BayScan = [variants[b][2] for b in sorted(scanners)]
    pd.DataFrame(BayScan).to_csv(args.output + '.bayPass.txt',
                                 index=False,
                                 header=False,
                                 sep=' ')
# Keywords: Python, tree-sequence recording, tree sequence recording

import msprime
import pyslim

ts = pyslim.load("recipe_16.7.trees").simplify()

# selection coefficients of all mutations whose decoded metadata contains
# at least one non-zero selection coefficient
coeffs = []
for mut in ts.mutations():
    md = pyslim.decode_mutation(mut.metadata)
    sel = [x.selection_coeff for x in md]
    if any(s != 0 for s in sel):
        coeffs += sel

b = [x for x in coeffs if x > 0]
d = [x for x in coeffs if x < 0]

# Report NaN rather than dividing by zero when a class has no mutations.
mean_b = sum(b) / len(b) if b else float("nan")
mean_d = sum(d) / len(d) if d else float("nan")

print("Beneficial: " + str(len(b)) + ", mean " + str(mean_b))
print("Deleterious: " + str(len(d)) + ", mean " + str(mean_d))
def main():
    """Run a GEA scan on a single tree taken from each gene.

    Driven entirely by command-line arguments. The function:

    1. loads the tree sequence given by ``--trees`` and, when ``--nPops`` is
       non-zero, down-samples it with ``getSample``;
    2. reads one integer environment per line from ``--optima`` and groups
       the individuals alive at time 0 by population (population 999 is
       skipped);
    3. determines the locally adapted genes (``getPVE``/``getAdapGenes`` or
       ``getAdapGenesDirectional``);
    4. for each gene, keeps the one-base interval at ``gene start + 4997``,
       overlays neutral mutations on it with ``msprime.mutate`` and writes
       one row per biallelic, non-fixed site to ``<output>.csv`` (plus
       allele counts to ``<output>.bayPass.txt`` with ``--bayPass``).

    Returns early, after printing a message, when ``findGene`` returns -99.

    Raises:
        SystemExit: if a genotype code of 2 is found at a site that passed
            the biallelic filter.
    """
    # Local import so this function does not depend on the file's import block.
    from contextlib import ExitStack

    ## Define command line args
    parser = argparse.ArgumentParser(
        description="Perform GEA on SNPs present within a single Tree from each gene in the simulated genome")
    parser.add_argument("--trees", required=True, dest="trees", type=str,
                        help="Coalescent trees for the simulation")
    parser.add_argument("--optima", required=True, dest="optima", type=str,
                        help="The file of phenotypic optima for the simulation")
    parser.add_argument("--output", required=True, dest="output", type=str,
                        help="What name do you want to give to the output file? [DON'T GIVE FILE EXTENSION, THE SCRIPT MAKES TWO OUTPUT FILES")
    parser.add_argument("--nPops", required=False, dest="nPops", type=int,
                        help="The number of subPops to downsample to",
                        default=0)
    parser.add_argument("--nInds", required=False, dest="nInds", type=int,
                        help="The number of individuals to grab from each subPop",
                        default=0)
    parser.add_argument("--bayPass", required=False, dest="bayPass",
                        action="store_true",
                        help="Do you want to spit out a BayPass config?")
    parser.add_argument("--directional", required=False, dest="directional",
                        action="store_true",
                        help="Are these simulations modelling directional selection?")
    args = parser.parse_args()

    print("analysing", args.trees)

    ## Read in the tree sequence file (down-sampled when --nPops is given)
    ts = pyslim.load(args.trees)
    if args.nPops != 0:
        ts = getSample(ts, args.nPops, args.nInds)

    N = ts.get_sample_size()

    ## Make a dict of the environments: line number -> optimum
    with open(args.optima) as optima_file:
        enviDict = {row: int(line.strip())
                    for row, line in enumerate(optima_file)}

    ## Group the individuals alive at time 0 by deme
    subPopDict = {}
    count = 0
    for i in ts.individuals_alive_at(0):
        count += 1
        deme = ts.individual(i).population
        if deme == 999:
            continue
        subPopDict.setdefault(deme, []).append(i)
    print(count, "individuals")

    envi_pops = [enviDict[i] for i in sorted(subPopDict.keys())]

    if args.bayPass:
        with open(args.output + ".bayPass.pc1", "w") as bayPassEnvs:
            bayPassEnvs.write("".join(str(e) + " " for e in envi_pops))

    nPops = len(subPopDict)
    diploids_per_pop = len(next(iter(subPopDict.values())))
    print(diploids_per_pop, "diploid individuals per population")
    print(nPops, "populations")

    ## The simulations have a common genome structure,
    ## mimicking a group of species with syntenic genomes
    genome = genomeMaker()
    genome['names'] = np.array(
        ['gene' + str(i) for i in range(genome.shape[0])])

    header = ["position",
              "gene",
              "selCoeff",
              "geno_spearman_correlation",
              "geno_spearman_correlation_2",
              "geno_spearman_pvalue",
              "geno_k_tau",
              "geno_k_tau_p_value",
              "pop_spearman_correlation",
              "pop_spearman_correlation_2",
              "pop_spearman_pvalue",
              "pop_k_tau",
              "pop_k_tau_p_value",
              "pbar",
              "pbar_q_bar",
              "maf",
              "LA"]

    ## Set the mutation rate for neutral mutations
    mut_rate = 1e-8 * 10000  # HARD CODED, SO CHANGE IF NECESSARY

    # ExitStack closes every opened output file, also on the early return
    # inside the loops and on errors.
    with ExitStack() as stack:
        outputDF = stack.enter_context(open(args.output + '.csv', 'w'))
        outputDF.write(",".join(header) + "\n")

        ## Work out which genes are involved in local adaptation
        if not args.directional:
            winVars, winCovars = getPVE(ts, envi_pops)
            print(winVars)
            adapGenes = getAdapGenes(winCovars)
        else:
            adapGenes = getAdapGenesDirectional(ts, diploids_per_pop,
                                                envi_pops, genome)
        print(adapGenes)

        # Open the BayPass file once. It used to be reopened in "w" mode for
        # every gene, which discarded the rows of all previous genes.
        bayPassConfig = None
        if args.bayPass:
            bayPassConfig = stack.enter_context(
                open(args.output + ".bayPass.txt", "w"))

        for pos in np.array(genome[0]) + 4997:
            # Keep only the one-base interval at this position. This used to
            # run before the loop (where ``pos`` is undefined) and the result
            # was never used: mutations were sprinkled onto an undefined
            # ``pos_tree``.
            pos_ts = ts.keep_intervals(np.array([[pos, pos + 1]]))

            ## Sprinkle mutations onto the coalescent tree
            print('Sprinkling mutations onto trees')
            sprinkled = msprime.mutate(pos_ts, rate=mut_rate, keep=True)

            print('extracting segregating sites from trees...')

            count = 0
            for variant in sprinkled.variants():
                if variant.num_alleles > 2:
                    continue

                gene = findGene(variant.position, genome)
                if not gene:
                    continue
                elif gene == -99:
                    print(gene)
                    print('FAILED')
                    return

                md = pyslim.decode_mutation(
                    variant.site.mutations[0].metadata)
                if len(md) > 0:
                    selCoeff = md[0].selection_coeff
                else:
                    selCoeff = 0

                count += 1
                if count % 1000 == 0:
                    print('extracted', count, 'neutral sites')

                ## Calculating the genotype freqs this way assumes that the
                ## genotypes are coded as 0s and 1s only
                if 2 in variant.genotypes:
                    print(variant)
                    print(list(variant.genotypes))
                    print("!!!!!!!!!!!!!!")
                    # This used to call an undefined close() and die with a
                    # NameError; abort explicitly instead.
                    raise SystemExit(
                        "genotype code 2 found at a biallelic site; aborting")

                genotypes_as_freqs = (variant.genotypes[::2]
                                      + variant.genotypes[1::2]) / 2

                if variant.genotypes.sum() == N:
                    continue  # Ignore fixed mutations

                freqs_per_pop = [i.mean() for i in
                                 chunks(genotypes_as_freqs,
                                        int(diploids_per_pop))]

                if bayPassConfig is not None:
                    counts_per_pop = [int(i.sum() * 2) for i in
                                      chunks(genotypes_as_freqs,
                                             int(diploids_per_pop))]
                    # One row per site: " alt ref alt ref ... \n"
                    bayPassConfig.write(
                        " "
                        + "".join(str(c) + " "
                                  + str(diploids_per_pop * 2 - c) + " "
                                  for c in counts_per_pop)
                        + "\n")

                LA = adapGenes.get(gene[0], 0)

                pbar = sum(freqs_per_pop) / len(freqs_per_pop)

                if pbar == 1:
                    continue
                ## This can be triggered by mutation stacking and infinite sites funniness
                if pbar > 1:
                    print("WTF, what is going on here: pbar > 1")
                    print(variant.genotypes)
                    continue

                pbar_qbar = pbar * (1 - pbar)
                maf = min(pbar, 1 - pbar)

                if pbar_qbar < 0:
                    print("WTF, what is going on here: pbar_qbar < 1")
                    print(variant.genotypes)
                    continue

                geno_spearman = scipy.stats.spearmanr(envi_pops,
                                                      freqs_per_pop)
                geno_k_tau, geno_k_tau_p_value = scipy.stats.kendalltau(
                    envi_pops, freqs_per_pop)
                # NOTE(review): the "pop_*" columns are computed from exactly
                # the same inputs as the "geno_*" columns, so the values are
                # identical; confirm whether the "geno_*" statistics were
                # meant to use per-individual data.
                pop_spearman = geno_spearman
                pop_k_tau, pop_k_tau_p_value = geno_k_tau, geno_k_tau_p_value

                outline = [variant.position,
                           gene[0],
                           selCoeff,
                           geno_spearman.correlation,
                           geno_spearman.correlation**2,
                           geno_spearman.pvalue,
                           geno_k_tau,
                           geno_k_tau_p_value,
                           pop_spearman.correlation,
                           pop_spearman.correlation**2,
                           pop_spearman.pvalue,
                           pop_k_tau,
                           pop_k_tau_p_value,
                           pbar,
                           pbar_qbar,
                           maf,
                           LA]
                outputDF.write(",".join(map(str, outline)) + "\n")
# Keywords: Python, tree-sequence recording, tree sequence recording

import msprime
import pyslim

ts = pyslim.load("recipe_17.7.trees").simplify()

# selection coefficients of all mutations whose decoded metadata contains
# at least one non-zero selection coefficient
coeffs = []
for mut in ts.mutations():
    md = pyslim.decode_mutation(mut.metadata)
    sel = [x.selection_coeff for x in md]
    if any(s != 0 for s in sel):
        coeffs += sel

b = [x for x in coeffs if x > 0]
d = [x for x in coeffs if x < 0]

# Report NaN rather than dividing by zero when a class has no mutations.
mean_b = sum(b) / len(b) if b else float("nan")
mean_d = sum(d) / len(d) if d else float("nan")

print("Beneficial: " + str(len(b)) + ", mean " + str(mean_b))
print("Deleterious: " + str(len(d)) + ", mean " + str(mean_d))