def test_decode_errors(self):
     """Each ``pyslim.decode_*`` function rejects malformed input.

     Every decoder is called once with a bare float and once with a list
     of numbers; both calls must raise ``ValueError``.
     """
     bad_calls = (
         ("decode_mutation", 2.0),
         ("decode_mutation", [2.0, 3.0]),
         ("decode_node", 2.0),
         ("decode_node", [1, 2]),
         ("decode_individual", 3.0),
         ("decode_individual", [1, 2]),
         ("decode_population", 1.0),
         ("decode_population", [2, 3]),
     )
     for decoder_name, bogus in bad_calls:
         with self.assertRaises(ValueError):
             # The decoder is looked up here, inside the context manager,
             # so the lookup happens at the same point as a direct call.
             getattr(pyslim, decoder_name)(bogus)
Beispiel #2
0
 def test_mutation_derived_info(self):
     """Mutation metadata exposed by the tree sequence matches the raw table.

     For every example tree sequence and every mutation ``j``, the bytes
     between ``metadata_offset[j]`` and ``metadata_offset[j + 1]`` in the
     mutation table are decoded with ``pyslim.decode_mutation`` and
     compared with both ``mut.metadata`` (from iteration) and
     ``ts.mutation(j).metadata`` (from direct lookup).
     """
     for ts in self.get_slim_examples():
         # Hoisted out of the inner loop: the table is invariant for a
         # given tree sequence, and ``ts.tables`` may copy the whole table
         # collection on every access.
         mutation_table = ts.tables.mutations
         offsets = mutation_table.metadata_offset
         raw_metadata = mutation_table.metadata
         for j, mut in enumerate(ts.mutations()):
             md = pyslim.decode_mutation(
                 raw_metadata[offsets[j]:offsets[j + 1]])
             self.assertEqual(mut.metadata, md)
             self.assertEqual(ts.mutation(j).metadata, md)
Beispiel #3
0
 def verify_mutation_decoding(self, t):
     """Compare schema-decoded metadata of ``t`` with ``decode_mutation``.

     A copy of ``t`` is given an empty ``tskit.MetadataSchema`` and its
     rows are walked in lockstep with the rows of ``t``. Decoding a row of
     the copy with ``pyslim.decode_mutation`` must emit a
     ``DeprecationWarning``, and the result (via ``asdict()``) must equal
     the ``"mutation_list"`` of the corresponding row of ``t``.
     """
     schemaless = t.copy()
     schemaless.metadata_schema = tskit.MetadataSchema(None)
     for with_schema, without_schema in zip(t, schemaless):
         expected = with_schema.metadata
         with self.assertWarns(DeprecationWarning):
             decoded = pyslim.decode_mutation(without_schema.metadata)
         as_dicts = [entry.asdict() for entry in decoded]
         self.assertEqual(expected, {"mutation_list": as_dicts})
Beispiel #4
0
 def test_legacy_errors(self):
     """Legacy encode/decode functions reject default (dict-style) metadata.

     For each of the four record kinds, both ``pyslim.decode_<kind>`` and
     ``pyslim.encode_<kind>`` are called with
     ``pyslim.default_slim_metadata(<kind>)`` and must raise a
     ``ValueError`` whose message matches ``"legacy"``.
     """
     defaults = pyslim.default_slim_metadata
     kinds = ('mutation', 'population', 'individual', 'node')
     # All decoders are exercised first, then all encoders.
     for prefix in ("decode_", "encode_"):
         for kind in kinds:
             with self.assertRaisesRegex(ValueError, "legacy"):
                 getattr(pyslim, prefix + kind)(defaults(kind))
Beispiel #5
0
 def test_decode_already_mutation(self):
     """Decoding already-decoded mutation metadata gives back an equal list.

     Builds four ``pyslim.MutationMetadata`` objects that differ only in
     ``population``, passes the list to ``pyslim.decode_mutation`` and
     checks that the result is a plain ``list`` with equal entries.
     """
     m = [pyslim.MutationMetadata(mutation_type=0,
                                  selection_coeff=0.2,
                                  population=k,
                                  slim_time=130,
                                  nucleotide=2) for k in range(4)]
     dm = pyslim.decode_mutation(m)
     self.assertEqual(type(dm), list)
     # zip() stops at the shorter sequence, so without this check a result
     # with missing entries would pass the element-wise comparison.
     self.assertEqual(len(m), len(dm))
     for a, b in zip(m, dm):
         self.assertEqual(a, b)
Beispiel #6
0
 def test_decode_already_mutation(self):
     """Decoding already-decoded mutation metadata warns and round-trips.

     Builds four ``pyslim.MutationMetadata`` objects that differ only in
     ``population``; passing the list to ``pyslim.decode_mutation`` must
     emit a ``FutureWarning`` and return a list with equal entries.
     """
     m = [pyslim.MutationMetadata(mutation_type=0,
                                  selection_coeff=0.2,
                                  population=k,
                                  slim_time=130,
                                  nucleotide=2) for k in range(4)]
     with self.assertWarns(FutureWarning):
         dm = pyslim.decode_mutation(m)
     # assertIsInstance reports the offending type on failure.
     self.assertIsInstance(dm, list)
     # zip() stops at the shorter sequence, so without this check a result
     # with missing entries would pass the element-wise comparison.
     self.assertEqual(len(m), len(dm))
     for a, b in zip(m, dm):
         self.assertEqual(a, b)
Beispiel #7
0
 def test_mutation_metadata(self):
     """``encode_mutation`` followed by ``decode_mutation`` is lossless.

     Exercised for metadata lists of length 0, 1 and 5; the decoded list
     must have the same length and equal entries.
     """
     for md_length in (0, 1, 5):
         originals = []
         for j in range(md_length):
             originals.append(pyslim.MutationMetadata(
                 mutation_type=j,
                 selection_coeff=0.5,
                 population=j,
                 slim_time=10 + j,
                 nucleotide=(j % 5) - 1))
         encoded = pyslim.encode_mutation(originals)
         decoded = pyslim.decode_mutation(encoded)
         self.assertEqual(len(originals), len(decoded))
         for before, after in zip(originals, decoded):
             self.assertEqual(before, after)
Beispiel #8
0
    def test_annotate_mutations(self):
        """Re-annotating mutations with their own decoded metadata is a no-op.

        Each raw metadata entry is decoded, checked to re-encode to the
        same bytes, and collected; the collected list is then written into
        a second set of tables with ``pyslim.annotate_mutation_metadata``
        and the two table collections are compared for equality.
        """
        for ts in self.get_slim_examples():
            tables = ts.tables
            new_tables = ts.tables
            raw_entries = tskit.unpack_bytes(
                tables.mutations.metadata,
                tables.mutations.metadata_offset)
            metadata = []
            for raw in raw_entries:
                decoded = pyslim.decode_mutation(raw)
                reencoded = pyslim.encode_mutation(decoded)
                self.assertEqual(raw, reencoded)
                metadata.append(decoded)

            pyslim.annotate_mutation_metadata(new_tables, metadata)
            self.assertEqual(tables, new_tables)
Beispiel #9
0
 def test_annotate_mutations(self):
     """Selection coefficients written via annotation survive a reload.

     Each msprime example is annotated with SLiM defaults, every mutation
     metadata entry gets a random selection coefficient, the metadata is
     written back with ``pyslim.annotate_mutation_metadata``, and the
     tables are reloaded to check the coefficients mutation by mutation.
     """
     for ts in get_msprime_examples():
         slim_ts = pyslim.annotate_defaults(
             ts, model_type="nonWF", slim_generation=1)
         tables = slim_ts.tables
         metadata = list(pyslim.extract_mutation_metadata(tables))
         self.assertEqual(len(metadata), slim_ts.num_mutations)
         # One random coefficient per metadata entry, drawn in order.
         selcoefs = [random.uniform(0, 1) for _ in metadata]
         for entry, coeff in zip(metadata, selcoefs):
             entry.selection_coeff = coeff
         pyslim.annotate_mutation_metadata(tables, metadata)
         new_ts = pyslim.load_tables(tables)
         for j, mutation in enumerate(new_ts.mutations()):
             decoded = pyslim.decode_mutation(mutation.metadata)
             self.assertEqual(decoded.selection_coeff, selcoefs[j])
def main():
    """Run a genotype-environment association (GEA) scan on a SLiM tree sequence.

    Steps, in order:

    1. Parse the command line.
    2. Load the tree sequence with ``pyslim.load`` and, when ``--nPops`` is
       non-zero, down-sample it with ``getSample``.
    3. Read one float per line from ``--environments`` (or from ``--optima``
       when no environment file is given), keyed by line index.
    4. Group the individuals alive at time 0 by population, skipping
       population 999.
    5. Keep only narrow windows around the retained variants, then overlay
       mutations with ``msprime.mutate``.
    6. For every non-fixed variant with at most two alleles, write one row
       of summary statistics to ``<output>.csv``.

    With ``--bayPass`` the file ``<output>.bayPass.txt`` is created, but
    this function writes nothing to it.

    Raises:
        ValueError: if a variant with at most two alleles nevertheless
            carries the genotype code 2.
    """
    from contextlib import ExitStack

    ## Define command line args
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--trees",
                        required=True,
                        dest="trees",
                        type=str,
                        help="Coalescent trees for the simulation")
    parser.add_argument("--optima",
                        required=True,
                        dest="optima",
                        type=str,
                        help="The file of phenotypic optima for the simulation")
    parser.add_argument("--output",
                        required=True,
                        dest="output",
                        type=str,
                        help="What name do you want to give to the output file? [DON'T GIVE FILE EXTENSION, THE SCRIPT MAKES TWO OUTPUT FILES")
    parser.add_argument("--nPops",
                        required=False,
                        dest="nPops",
                        type=int,
                        help="The number of subPops to downsample to",
                        default=0)
    parser.add_argument("--nInds",
                        required=False,
                        dest="nInds",
                        type=int,
                        help="The number of individuals to grab from each subPop",
                        default=0)
    parser.add_argument("--bayPass",
                        required=False,
                        dest="bayPass",
                        action="store_true",
                        help="Do you want to spit out a BayPass config?")
    parser.add_argument("--directional",
                        required=False,
                        dest="directional",
                        action="store_true",
                        help="Are these simulations modelling directional selection?")
    parser.add_argument("--intervals",
                        required=False,
                        dest="intervals",
                        type=int,
                        help="The width of the intervals you want to analyse",
                        default=0)
    parser.add_argument("--s",
                        required=False,
                        dest="s",
                        type=int,
                        help="Are these simulations modelling directional selection?")
    parser.add_argument("--environments",
                        required=False,
                        dest="environments",
                        type=str,
                        help="The file with environments in it - if not given, optima will be used")
    args = parser.parse_args()

    print("analysing", args.trees)

    ## Read in the tree sequence file
    ts = pyslim.load(args.trees)
    if args.nPops != 0:
        ts = getSample(ts, args.nPops, args.nInds)

    ## Make a dict of the environments: one float per line, keyed by the
    ## zero-based line number. The optima file doubles as the environment
    ## file when --environments is not given. (The optima themselves are
    ## not used anywhere else in this function, so they are not re-read.)
    envi_path = args.optima if args.environments is None else args.environments
    with open(envi_path) as handle:
        enviDict = {idx: float(line.strip()) for idx, line in enumerate(handle)}

    ## Let's make a dict of all the demes and the individuals from the tree being analysed
    subPopDict = {}
    count = 0
    for i in ts.individuals_alive_at(0):
        count += 1
        population = ts.individual(i).population
        if population == 999:
            continue
        subPopDict.setdefault(population, []).append(i)

    print(count, "individuals")
    envi_pops = [enviDict[i] for i in sorted(subPopDict.keys())]
    print(envi_pops)

    ## Keep a +/- 0.01 window around every variant whose summed genotype,
    ## scaled by nPops * nInds, is at least 0.3.
    # NOTE(review): with the default --nPops/--nInds of 0 this denominator
    # is zero -- confirm the intended behaviour when not down-sampling.
    to_keep = []
    for v in ts.variants():
        if v.genotypes.sum() / (args.nPops * args.nInds) < 0.3:
            continue
        to_keep.append([v.position - 0.01, v.position + 0.01])

    ts = ts.keep_intervals(np.array(to_keep))

    ## Sanity check: the number of trees should be twice the number of
    ## variants plus one after trimming.
    n_variants = sum(1 for _ in ts.variants())
    n_trees = sum(1 for _ in ts.trees())
    if n_variants * 2 + 1 != n_trees:
        print("WTF, this is error 1")

    ## Get the sample size
    N = ts.get_sample_size()

    nPops = len(subPopDict)
    diploids_per_pop = len(subPopDict[list(subPopDict.keys())[0]])

    print(diploids_per_pop, "diploid individuals per population")
    print(nPops, "populations")

    print(envi_pops, len(envi_pops))

    ## Set the mutation rate for neutral mutations
    mut_rate = 5  ## HARD CODED

    ## Sprinkle neutral mutations onto the coalescent tree
    print('Sprinkling mutations onto trees')

    sprinkled = msprime.mutate(ts, rate=mut_rate, keep=True, random_seed=12345)

    print('extracting segregating sites from trees...')

    count = 0

    header = ["position",
              "gene",
              "selCoeff",
              "geno_spearman_correlation",
              "geno_spearman_correlation_2",
              "geno_spearman_pvalue",
              "geno_k_tau",
              "geno_k_tau_p_value",
              "pbar",
              "pbar_qbar",
              "maf",
              "LA"]

    # ExitStack closes every opened file on any exit path, including
    # exceptions raised inside the variant loop.
    with ExitStack() as stack:
        outputDF = stack.enter_context(open(args.output + '.csv', 'w'))
        outputDF.write(",".join(header) + "\n")

        if args.bayPass:
            # Created (and left empty) exactly as before; nothing in this
            # function writes to it.
            stack.enter_context(open(args.output + ".bayPass.txt", "w"))

        ## Iterating over all variants in the population we now perform the actual GEA
        for variant in sprinkled.variants():
            if variant.num_alleles > 2:
                continue

            # NOTE(review): the coefficient decoded here is overwritten
            # further down by a position-based value; the decode is kept so
            # that malformed metadata still surfaces as an error.
            md = pyslim.decode_mutation(variant.site.mutations[0].metadata)
            if len(md) > 0:
                selCoeff = md[0].selection_coeff
            else:
                selCoeff = 0

            count += 1
            if count % 1000 == 0:
                print('extracted', count, 'neutral sites')

            ## Calculating the genotype freqs this way assumes that the ref and alt
            ## are coded as 1s and 0s, respectively.
            if 2 in variant.genotypes:
                print(variant)
                print(list(variant.genotypes))
                print("!!!!!!!!!!!!!!")
                # The original called an undefined ``close()`` here, which
                # aborted with a NameError; abort explicitly instead.
                raise ValueError(
                    "genotype code 2 found at a site with at most two alleles")

            # Consecutive pairs of haploid genotypes are averaged into one
            # value per diploid individual.
            genotypes_as_freqs = (variant.genotypes[::2] + variant.genotypes[1::2]) / 2

            if variant.genotypes.sum() == N:
                continue  # Ignore fixed mutations

            freqs_per_pop = [i.mean() for i in chunks(genotypes_as_freqs, int(diploids_per_pop))]

            pbar = sum(freqs_per_pop) / len(freqs_per_pop)

            LA = -99

            if pbar == 1:
                continue  ## This can be triggered by mutation stacking and infinite sites funniness
            if pbar > 1:
                print(variant)
                print("WTF, what is going on here: pbar > 1")
                # The original printed an undefined name ``genotypes``.
                print(list(variant.genotypes))
                continue
            pbar_qbar = pbar * (1 - pbar)
            maf = min(pbar, 1 - pbar)
            if pbar_qbar < 0:
                print("WTF, what is going on here: pbar_qbar < 1")
                print(list(variant.genotypes))
                continue

            geno_spearman = scipy.stats.spearmanr(envi_pops, freqs_per_pop)
            geno_k_tau, geno_k_tau_p_value = scipy.stats.kendalltau(envi_pops, freqs_per_pop)

            # Variants at integer positions are given a fixed coefficient;
            # all others are treated as neutral.
            if variant.position == int(variant.position):
                selCoeff = 0.003
            else:
                selCoeff = 0.0
            gene = int(round(variant.position / 10) * 10)

            outline = [variant.position,
                       gene,
                       selCoeff,
                       geno_spearman.correlation,
                       geno_spearman.correlation**2,
                       geno_spearman.pvalue,
                       geno_k_tau,
                       geno_k_tau_p_value,
                       pbar,
                       pbar_qbar,
                       maf,
                       LA]

            outputDF.write(",".join(map(str, outline)) + "\n")
# Fragment of a larger script: sets up per-individual arrays and classifies
# each variant by the mutation type stored in its SLiM metadata. The code that
# consumes these variables lies beyond the visible part of the snippet.
tables = ts.dump_tables()
# Three parallel groups of three empty lists each -- presumably one slot per
# colour in `colors` below; TODO confirm against the code that fills them.
x = [[], [], []]
y = [[], [], []]
c = [[], [], []]
varit = ts.variants()
colors = ['m', 'c', 'y']

# First location coordinate of every individual, jittered by a uniform random
# offset in [-0.5, 0.5).
ind_loc = np.array(
    [i.location[0] + np.random.uniform(-0.5, 0.5) for i in ts.individuals()])
# One phenotype slot per individual, initialised to 0.0.
phenotypes = np.array([0.0 for _ in ts.individuals()])
# Population of each individual, taken from its first node.
populations = np.array(
    [ts.node(ind.nodes[0]).population for ind in ts.individuals()])

for v in varit:
    # Only the first metadata entry of the first mutation at the site is used.
    mut_meta = pyslim.decode_mutation(v.site.mutations[0].metadata)[0]
    selectionCoeff = mut_meta.selection_coeff
    mutType = mut_meta.mutation_type
    color = None
    dominance = None
    # Mutation types 1/2 -> 'dom', 4/5 -> 'rec', anything else -> 'add'; each
    # class gets its own colour code.
    if (mutType == 1 or mutType == 2):
        color = 'm'
        dominance = 'dom'
    elif (mutType == 4 or mutType == 5):
        color = 'c'
        dominance = 'rec'
    else:
        color = 'y'
        dominance = 'add'

    for ind in ts.individuals():
Beispiel #12
0
def main():
    """Run a gene-aware GEA scan (or a demo allele-frequency dump) on a tree sequence.

    Steps, in order:

    1. Parse the command line.
    2. Load the tree sequence with ``pyslim.load`` and, when ``--nPops`` is
       non-zero, down-sample it with ``getSample``.
    3. Build the genome table with ``genomeMaker`` and name its genes.
    4. Read environments and optima (one float per line, keyed by line
       index); the optima double as environments when ``--environments``
       is not given.
    5. Group the individuals alive at time 0 by population, skipping
       population 999; with ``--bayPass`` write the environments to
       ``<output>.bayPass.pc1``.
    6. Compute ``adapGenes`` from the full (not down-sampled) tree sequence
       with ``getPVE_stabilising`` or ``getPVE_Directional``.
    7. Optionally trim to ``--intervals``, then overlay mutations with
       ``msprime.mutate``.
    8. For each non-fixed variant with at most two alleles that
       ``findGene`` assigns to a gene, write a row to ``<output>.csv``:
       per-population frequencies with ``--demo``, GEA statistics
       otherwise. With ``--bayPass`` allele counts are also written to
       ``<output>.bayPass.txt``.

    Returns early, after printing a message, when ``--intervals 1`` is
    requested or when ``findGene`` returns ``-99``.

    Raises:
        ValueError: if a variant with at most two alleles nevertheless
            carries the genotype code 2.
    """
    from contextlib import ExitStack

    ## Define command line args
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--trees",
                        required=True,
                        dest="trees",
                        type=str,
                        help="Coalescent trees for the simulation")
    parser.add_argument(
        "--optima",
        required=True,
        dest="optima",
        type=str,
        help="The file of phenotypic optima for the simulation")
    parser.add_argument(
        "--output",
        required=True,
        dest="output",
        type=str,
        help=
        "What name do you want to give to the output file? [DON'T GIVE FILE EXTENSION, THE SCRIPT MAKES TWO OUTPUT FILES"
    )
    parser.add_argument("--nPops",
                        required=False,
                        dest="nPops",
                        type=int,
                        help="The number of subPops to downsample to",
                        default=0)
    parser.add_argument(
        "--nInds",
        required=False,
        dest="nInds",
        type=int,
        help="The number of individuals to grab from each subPop",
        default=0)
    parser.add_argument("--bayPass",
                        required=False,
                        dest="bayPass",
                        action="store_true",
                        help="Do you want to spit out a BayPass config?")
    parser.add_argument(
        "--directional",
        required=False,
        dest="directional",
        action="store_true",
        help="Are these simulations modelling directional selection?")
    parser.add_argument("--intervals",
                        required=False,
                        dest="intervals",
                        type=int,
                        help="The width of the intervals you want to analyse",
                        default=0)
    parser.add_argument(
        "--s",
        required=False,
        dest="s",
        type=int,
        help="Are these simulations modelling directional selection?")
    parser.add_argument(
        "--environments",
        required=False,
        dest="environments",
        type=str,
        help=
        "The file with environments in it - if not given, optima will be used")
    parser.add_argument(
        "--demo",
        required=False,
        dest="demo",
        action="store_true",
        help=
        "Do you want to make a file with allele frequency info for demonstration purposes?"
    )
    args = parser.parse_args()

    print("analysing", args.trees)

    ## Read in the tree sequence file. ``ts_raw`` is always the full tree
    ## sequence; ``ts`` is the (optionally down-sampled) one analysed below.
    ts_raw = pyslim.load(args.trees)
    if args.nPops == 0:
        ts = ts_raw
    else:
        ts = getSample(ts_raw, args.nPops, args.nInds)

    ## The simulations have a common genome structure,
    ## mimicking a group of species with syntenic genomes,
    genome = genomeMaker()
    genome['names'] = np.array(
        ['gene' + str(i) for i in range(genome.shape[0])])

    ## Get the sample size
    N = ts.get_sample_size()

    def read_values(path):
        """Return ``{line_index: float(line)}`` for the file at ``path``."""
        with open(path) as handle:
            return {idx: float(line.strip()) for idx, line in enumerate(handle)}

    ## Make a dict of the environments from the optima and (optional) environment files
    if args.environments is None:
        enviDict = read_values(args.optima)
        optimaDict = enviDict.copy()
    else:
        enviDict = read_values(args.environments)
        optimaDict = read_values(args.optima)

    ## Let's make a dict of all the demes and the individuals from the tree being analysed
    subPopDict = {}
    count = 0
    for i in ts.individuals_alive_at(0):
        count += 1
        population = ts.individual(i).population
        if population == 999:
            continue
        subPopDict.setdefault(population, []).append(i)

    print(count, "individuals")
    envi_pops = [enviDict[i] for i in sorted(subPopDict.keys())]
    print(envi_pops)

    nPops = len(subPopDict)
    diploids_per_pop = len(subPopDict[list(subPopDict.keys())[0]])

    print(diploids_per_pop, "diploid individuals per population")
    print(nPops, "populations")

    print(envi_pops, len(envi_pops))
    if args.bayPass:
        with open(args.output + ".bayPass.pc1", "w") as bayPassEnvs:
            for e in envi_pops:
                bayPassEnvs.write(str(e) + " ")

    ## Figure out from the whole population, which genes are involved in local adaptation or not
    if args.directional:
        adapGenes = getPVE_Directional(ts_raw, diploids_per_pop, optimaDict,
                                       genome)
    else:
        adapGenes = getPVE_stabilising(ts_raw, optimaDict, genome)

    print(adapGenes)

    ## Set the mutation rate for neutral mutations
    mut_rate = 1e-8  ## HARD CODED

    ## If you specify an interval, the following will cut down the tree to focus sub-windows within each 10,000bp window
    ## This is a way of getting low recombination results on the cheap
    if args.intervals != 0:
        intervalDiff = (10000 - args.intervals) / 2
        if args.intervals == 1:
            print("I have not put in support for single base pair intervals")
            return
        genome["interval_start"] = genome[0] + intervalDiff
        genome["interval_end"] = genome[1] - intervalDiff
        intervalArray = np.array(genome[["interval_start", "interval_end"]])
        print(genome)
        ts = ts.keep_intervals(intervalArray)

        ## Reset the mutation rate for neutral mutations to achieve the same net mutation rate
        ## HARD CODED, SO CHANGE IF NECESSARY
        mut_rate = 1e-8 * (10000 / args.intervals)

    ## Sprinkle neutral mutations onto the coalescent tree
    print('Sprinkling mutations onto trees')

    sprinkled = msprime.mutate(ts, rate=mut_rate, keep=True, random_seed=12345)

    print('extracting segregating sites from trees...')

    count = 0

    # ExitStack closes every opened file on any exit path: normal
    # completion, the early ``return`` below, or an exception.
    with ExitStack() as stack:
        outputDF = stack.enter_context(open(args.output + '.csv', 'w'))

        if args.demo:
            print(
                "I'm not performing GEA analysis, I'm make an allele frequency SNP table for demonstration purposes"
            )
            with open("environments." + args.output + ".csv", "w") as envOutput:
                envOutput.write(",".join(map(str, envi_pops)) + "\n")
        else:
            print("Not demo!")
            header = [
                "position", "gene", "selCoeff", "geno_spearman_correlation",
                "geno_spearman_correlation_2", "geno_spearman_pvalue",
                "geno_k_tau", "geno_k_tau_p_value", "pbar", "pbar_qbar", "maf",
                "LA"
            ]
            outputDF.write(",".join(header) + "\n")

        bayPassConfig = None
        if args.bayPass:
            bayPassConfig = stack.enter_context(
                open(args.output + ".bayPass.txt", "w"))

        ## Iterating over all variants in the population we now perform the actual GEA
        for variant in sprinkled.variants():
            if variant.num_alleles > 2:
                continue
            gene = findGene(variant.position, genome)

            if not gene:
                continue
            elif gene == -99:
                print(gene)
                print('FAILED')
                return

            md = pyslim.decode_mutation(variant.site.mutations[0].metadata)
            if len(md) > 0:
                selCoeff = md[0].selection_coeff
            else:
                selCoeff = 0

            count += 1
            if count % 1000 == 0:
                print('extracted', count, 'neutral sites')

            ## Calculating the genotype freqs this way assumes that the ref and alt
            ## are coded as 1s and 0s, respectively.
            if 2 in variant.genotypes:
                print(variant)
                print(list(variant.genotypes))
                print("!!!!!!!!!!!!!!")
                # The original called an undefined ``close()`` here, which
                # aborted with a NameError; abort explicitly instead.
                raise ValueError(
                    "genotype code 2 found at a site with at most two alleles")

            # Consecutive pairs of haploid genotypes are averaged into one
            # value per diploid individual.
            genotypes_as_freqs = (variant.genotypes[::2] +
                                  variant.genotypes[1::2]) / 2

            if variant.genotypes.sum() == N:
                continue  # Ignore fixed mutations

            freqs_per_pop = [
                i.mean()
                for i in chunks(genotypes_as_freqs, int(diploids_per_pop))
            ]

            if args.bayPass:
                # Per population: count of the "1" allele followed by the
                # count of the other allele.
                counts_per_pop = [
                    int(i.sum() * 2)
                    for i in chunks(genotypes_as_freqs, int(diploids_per_pop))
                ]
                bayPassConfig.write(" ")
                for c in counts_per_pop:
                    bayPassConfig.write(
                        str(c) + " " + str(diploids_per_pop * 2 - c) + " ")
                bayPassConfig.write("\n")

            try:
                LA = adapGenes[gene[0]]
            except KeyError:
                LA = 0

            pbar = sum(freqs_per_pop) / len(freqs_per_pop)

            if pbar == 1:
                continue  ## This can be triggered by mutation stacking and infinite sites funniness
            if pbar > 1:
                print(variant)
                print("WTF, what is going on here: pbar > 1")
                # The original printed an undefined name ``genotypes``.
                print(list(variant.genotypes))
                continue
            pbar_qbar = pbar * (1 - pbar)
            maf = min(pbar, 1 - pbar)
            if pbar_qbar < 0:
                print("WTF, what is going on here: pbar_qbar < 1")
                print(list(variant.genotypes))
                continue

            if args.demo:
                outline = [
                    "chr" + str(1 + math.floor(variant.position / 2000000)),
                    int(variant.position), gene[0]
                ] + freqs_per_pop
                outputDF.write(",".join(map(str, outline)) + "\n")
                continue

            geno_spearman = scipy.stats.spearmanr(envi_pops, freqs_per_pop)
            geno_k_tau, geno_k_tau_p_value = scipy.stats.kendalltau(
                envi_pops, freqs_per_pop)

            outline = [
                variant.position, gene[0], selCoeff, geno_spearman.correlation,
                geno_spearman.correlation**2, geno_spearman.pvalue, geno_k_tau,
                geno_k_tau_p_value, pbar, pbar_qbar, maf, LA
            ]

            outputDF.write(",".join(map(str, outline)) + "\n")
Beispiel #13
0
import pyslim
import msprime

ts = pyslim.load("simple.trees")
tables = ts.tables
print(tables)

# mutations: decode every raw metadata entry, confirm that re-encoding it
# reproduces the original bytes, and collect the decoded form

mut_metadata = []
mutation_table = tables.mutations
for md in msprime.unpack_bytes(mutation_table.metadata,
                               mutation_table.metadata_offset):
    dm = pyslim.decode_mutation(md)
    edm = pyslim.encode_mutation(dm)
    assert md == edm
    mut_metadata.append(dm)

pyslim.annotate_mutations(tables, mut_metadata)

# nodes: same round-trip check, using the node decoder and encoder

node_metadata = []
node_table = tables.nodes
for md in msprime.unpack_bytes(node_table.metadata,
                               node_table.metadata_offset):
    dn = pyslim.decode_node(md)
    edn = pyslim.encode_node(dn)
    assert md == edn
    node_metadata.append(dn)

pyslim.annotate_nodes(tables, node_metadata)
def main():
    """Run a per-individual GEA on a SLiM tree sequence and write CSV outputs.

    Steps, in order:

    1. Parse ``--trees``, ``--optima`` and ``--output``.
    2. Load the tree sequence and read one integer optimum per line from
       ``--optima``, keyed by line index.
    3. Group the individuals alive at time 0 by population and record the
       optimum of each individual's population in ``envi``.
    4. Collect genotype sums and allele counts for the variants already on
       the tree sequence, marking each variant's gene in ``adapGenes``.
    5. Overlay mutations with ``msprime.mutate`` and collect the same
       information for every variant of the result.
    6. For every collected variant that ``findGene`` assigns to a gene,
       compute Spearman and Kendall correlations against ``envi`` and
       write the rows, sorted by position, to ``<output>.csv``; the allele
       counts of those variants go to ``<output>.bayPass.txt``.

    Returns early, after printing a message, when the number of optima
    differs from the number of populations or when ``findGene`` returns
    ``-99``.
    """
    ## Define command line args
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--trees",
                        required=True,
                        dest="trees",
                        type=str,
                        help="Coalescent trees for the simulation")
    parser.add_argument(
        "--optima",
        required=True,
        dest="optima",
        type=str,
        help="The file of phenotypic optima for the simulation")
    parser.add_argument(
        "--output",
        required=True,
        dest="output",
        type=str,
        help=
        "What name do you want to give to the output file? [DON'T GIVE FILE EXTENSION, THE SCRIPT MAKES TWO OUTPUT FILES"
    )
    args = parser.parse_args()

    ts = pyslim.load(args.trees)
    N = ts.get_sample_size()

    alive = ts.individuals_alive_at(0)

    ## Make a dict of the environments
    with open(args.optima) as handle:
        enviDict = {idx: int(line.strip()) for idx, line in enumerate(handle)}

    ## Let's make a dict of all the demes and the individuals from each
    subPopDict = {}
    envi = []
    for a in alive:
        population = ts.individual(a).population
        # setdefault replaces a bare ``except:`` that would also have
        # swallowed unrelated errors.
        subPopDict.setdefault(population, []).append(a)
        envi.append(enviDict[population])

    if len(enviDict) != len(subPopDict):
        print(
            "Uneven numbers of subpopulations in the simulation and the optima file that was given"
        )
        return

    nPops = len(enviDict)

    diploids_per_pop = len(subPopDict[0])

    print(diploids_per_pop, "diploid individuals per population")
    print(nPops, "populations")

    ## The simulations have a common genome structure,
    ## mimicking a group of species with syntenic genomes,
    genome = genomeMaker()
    genome['names'] = np.array(
        ['gene' + str(i) for i in range(genome.shape[0])])

    def count_alleles(alleleFreqs):
        """Return ``[A count, a count, ...]`` for successive chunks of ``alleleFreqs``.

        Prints a warning when the counts do not add up to twice the number
        of entries in ``alleleFreqs``.
        """
        genos = []
        for g in chunks(alleleFreqs, int(diploids_per_pop / 2)):
            a2 = g.sum()  # The number of a alleles
            a1 = (len(g) * 2) - a2  # The number of A alleles
            genos.append(int(a1))
            genos.append(a2)
        if sum(genos) != len(alleleFreqs) * 2:
            print('something went haywire with the genotype counts')
        return genos

    # position -> [per-individual genotype sums, selection coefficient, allele counts]
    variants = {}

    ## Get the allele frequencies for all segregating sites at QTL
    print('extracting segregating sites from trees...')

    # Make a little dict that will be populated by the genes that contribute to LA
    adapGenes = {}

    ## Iterate over all variants present on the tree from SLiM
    for vs in ts.variants():
        # Sum consecutive pairs of haploid genotypes into one value per pair.
        alleleFreqs = vs.genotypes[::2] + vs.genotypes[1::2]
        total = vs.genotypes.sum()
        if total == N or total == 0:
            continue  # Remove fixed mutations

        gene = findGene(vs.position, genome)

        if len(vs.site.mutations) > 1:
            print('This site has more than a single variant:', vs.position)

        md = pyslim.decode_mutation(vs.site.mutations[0].metadata)

        selCoeff = md[0].selection_coeff

        if sum(alleleFreqs) == N / 2:
            continue

        ## This is to keep track of which genes have mutations that affect the phenotype
        adapGenes[gene[0]] = 1

        genos = count_alleles(alleleFreqs)
        variants[vs.position] = [alleleFreqs, selCoeff, genos]

    print(len(variants), 'segregating sites affecting QTL')

    ## Set the mutation rate for neutral mutations
    mut_rate = 5e-9  # HARD CODED, SO CHANGE IF NECESSARY

    ## Sprinkle mutations onto the coalescent tree
    print('Sprinkling mutations onto trees')
    sprinkled = msprime.mutate(ts, rate=mut_rate, keep=True)

    ## Iterate over all polymorphisms in the tree
    count = 0

    for variant in sprinkled.variants():
        count += 1
        if count % 1000 == 0:
            print('extracted', count, 'neutral sites')
        # Positions already recorded above are flagged with -99; all other
        # positions get a coefficient of 0.
        if variant.position in variants:
            s = -99
        else:
            s = 0.0
        alleleFreqs = variant.genotypes[::2] + variant.genotypes[1::2]
        if variant.genotypes.sum() == N:
            continue  # Remove fixed mutations

        genos = count_alleles(alleleFreqs)
        variants[variant.position] = [alleleFreqs, s, genos]

    ## For each variant, identify the gene it is within,
    ## calculate the Spearman's Rho  as well as pq
    ## These data are then collated and made into a dataFrame
    print(count, "neutral segregating sites")
    data = []
    print('Performing GEA on the resulting data')
    scanners = []
    for v, record in variants.items():
        s = record[1]
        alleleFreqs = np.array(record[0])

        gene = findGene(v, genome)

        if not gene:
            continue
        elif gene == -99:
            print(gene)
            print('FAILED')
            return

        try:
            LA = adapGenes[gene[0]]
        except KeyError:
            LA = 0

        pbar = alleleFreqs.sum() / (len(alleleFreqs) * 2)

        if pbar == 1:
            continue
        if pbar > 1:
            print("WTF, what is going on here: pbar > 1")
            print(alleleFreqs)
            continue
        pbar_qbar = pbar * (1 - pbar)
        maf = min(pbar, 1 - pbar)
        if pbar_qbar < 0:
            print("WTF, what is going on here: pbar_qbar < 1")
            print(alleleFreqs)
            continue

        spearman = scipy.stats.spearmanr(envi, alleleFreqs)
        k_tau, k_tau_p_value = scipy.stats.kendalltau(envi, alleleFreqs)

        if spearman.correlation == 0:
            continue

        data.append({
            'pos': v,
            'gene': gene[0],
            's': s,
            'spearman_rho': spearman.correlation,
            'spearman_rho2': spearman.correlation**2,
            'spearman_rho_pval': spearman.pvalue,
            'kendall_tau': k_tau,
            'kendall_tau_pval': k_tau_p_value,
            'pbar': pbar,
            'pbar_qbar': pbar_qbar,
            'maf': maf,
            'LA': LA
        })
        scanners.append(v)

    ## convert the GEA data points into a summary CSV
    pd.DataFrame(data).sort_values(by=['pos']).to_csv(args.output + '.csv',
                                                      index=False)

    ## Make a BayPass input file (allele counts, in position order) from
    ## the variants that made it into the summary CSV
    BayScan = [variants[b][2] for b in sorted(scanners)]

    pd.DataFrame(BayScan).to_csv(args.output + '.bayPass.txt',
                                 index=False,
                                 header=False,
                                 sep=' ')
# Keywords: Python, tree-sequence recording, tree sequence recording

import msprime, pyslim

# Load the SLiM tree sequence and simplify it before analysis.
ts = pyslim.load("recipe_16.7.trees").simplify()

# Selection coefficients of the mutations stacked at every tree-sequence
# mutation that carries at least one non-neutral (non-zero) coefficient.
# When such a stack is kept, all of its coefficients are kept, so zeros can
# appear in `coeffs`; they are dropped again by the sign filters below.
coeffs = []
for mut in ts.mutations():
    md = pyslim.decode_mutation(mut.metadata)
    sel = [x.selection_coeff for x in md]
    if any(s != 0 for s in sel):
        coeffs += sel

b = [x for x in coeffs if x > 0]  # beneficial coefficients
d = [x for x in coeffs if x < 0]  # deleterious coefficients

# An empty class has no defined mean: report NaN instead of crashing with
# ZeroDivisionError. Output is unchanged whenever the class is non-empty.
print("Beneficial: " + str(len(b)) + ", mean " +
      str(sum(b) / len(b) if b else float("nan")))
print("Deleterious: " + str(len(d)) + ", mean " +
      str(sum(d) / len(d) if d else float("nan")))
def main():
    """Run a per-gene genotype-environment association (GEA) scan.

    Steps, in order:

    1. Parse the command line (``--trees``, ``--optima``, ``--output`` are
       required; ``--nPops``/``--nInds`` optionally down-sample through
       ``getSample``).
    2. Load the tree sequence with ``pyslim.load`` and group the individuals
       alive at time 0 by population.
    3. Read one integer environment value per line from the optima file and
       line them up with the sorted population ids.
    4. For every position in ``genome[0] + 4997``: keep only the one-base
       interval at that position, overlay neutral mutations with
       ``msprime.mutate`` and write one CSV row per biallelic, non-fixed
       variant to ``<output>.csv``.
    5. With ``--bayPass``, also write per-population allele counts to
       ``<output>.bayPass.txt`` and the environments to
       ``<output>.bayPass.pc1``.

    Returns:
        None. Returns early (after printing ``FAILED``) when ``findGene``
        reports ``-99``.

    Raises:
        ValueError: if a variant that passed the biallelic filter still
            carries genotype code 2.
    """
    ## Define command line args
    parser = argparse.ArgumentParser(
        description=
        "Perform GEA on SNPs present within a single Tree from each gene in the simulated genome"
    )
    parser.add_argument("--trees",
                        required=True,
                        dest="trees",
                        type=str,
                        help="Coalescent trees for the simulation")
    parser.add_argument(
        "--optima",
        required=True,
        dest="optima",
        type=str,
        help="The file of phenotypic optima for the simulation")
    parser.add_argument(
        "--output",
        required=True,
        dest="output",
        type=str,
        help=
        "What name do you want to give to the output file? [DON'T GIVE FILE EXTENSION, THE SCRIPT MAKES TWO OUTPUT FILES"
    )
    parser.add_argument("--nPops",
                        required=False,
                        dest="nPops",
                        type=int,
                        help="The number of subPops to downsample to",
                        default=0)
    parser.add_argument(
        "--nInds",
        required=False,
        dest="nInds",
        type=int,
        help="The number of individuals to grab from each subPop",
        default=0)
    parser.add_argument("--bayPass",
                        required=False,
                        dest="bayPass",
                        action="store_true",
                        help="Do you want to spit out a BayPass config?")
    parser.add_argument(
        "--directional",
        required=False,
        dest="directional",
        action="store_true",
        help="Are these simulations modelling directional selection?")

    args = parser.parse_args()

    print("analysing", args.trees)

    # nPops == 0 means "use every population"; otherwise down-sample.
    if args.nPops == 0:
        ts = pyslim.load(args.trees)
    else:
        ts_raw = pyslim.load(args.trees)
        ts = getSample(ts_raw, args.nPops, args.nInds)

    N = ts.get_sample_size()

    ## Make a dict of the environments: line index -> integer optimum
    with open(args.optima) as file:
        enviDict = {idx: int(line.strip()) for idx, line in enumerate(file)}

    ## Make a dict of all the demes and the individuals from each
    subPopDict = {}
    count = 0
    for i in ts.individuals_alive_at(0):
        count += 1
        population = ts.individual(i).population
        # NOTE(review): population 999 is skipped; presumably a placeholder
        # deme that is not part of the analysis - confirm against the model.
        if population == 999:
            continue
        subPopDict.setdefault(population, []).append(i)

    print(count, "individuals")
    envi_pops = [enviDict[i] for i in sorted(subPopDict)]

    if args.bayPass:
        with open(args.output + ".bayPass.pc1", "w") as bayPassEnvs:
            for e in envi_pops:
                bayPassEnvs.write(str(e) + " ")

    nPops = len(subPopDict)

    # Every population is assumed to hold as many individuals as the first.
    diploids_per_pop = len(subPopDict[list(subPopDict.keys())[0]])

    print(diploids_per_pop, "diploid individuals per population")
    print(nPops, "populations")

    ## The simulations have a common genome structure,
    ## mimicking a group of species with syntenic genomes,
    genome = genomeMaker()
    genome['names'] = np.array(
        ['gene' + str(i) for i in range(genome.shape[0])])

    header = [
        "position", "gene", "selCoeff", "geno_spearman_correlation",
        "geno_spearman_correlation_2", "geno_spearman_pvalue", "geno_k_tau",
        "geno_k_tau_p_value", "pop_spearman_correlation",
        "pop_spearman_correlation_2", "pop_spearman_pvalue", "pop_k_tau",
        "pop_k_tau_p_value", "pbar", "pbar_q_bar", "maf", "LA"
    ]

    ## Set the mutation rate for neutral mutations
    mut_rate = 1e-8 * 10000  # HARD CODED, SO CHANGE IF NECESSARY

    # Both output files are opened once and closed in `finally`, so they stay
    # writable for every gene and are released on every exit path.
    outputDF = open(args.output + '.csv', 'w')
    bayPassConfig = None
    try:
        outputDF.write(",".join(header) + "\n")

        if not args.directional:
            winVars, winCovars = getPVE(ts, envi_pops)
            print(winVars)
            adapGenes = getAdapGenes(winCovars)
        else:
            adapGenes = getAdapGenesDirectional(ts, diploids_per_pop,
                                                envi_pops, genome)
            print(adapGenes)

        if args.bayPass:
            bayPassConfig = open(args.output + ".bayPass.txt", "w")

        # NOTE(review): the +4997 offset presumably targets a fixed point
        # inside each gene - confirm against genomeMaker().
        for pos in np.array(genome[0]) + 4997:
            # Keep only the one-base interval at this position, i.e. the
            # single tree covering it.
            pos_tree = ts.keep_intervals(np.array([[pos, pos + 1]]))

            ## Sprinkle mutations onto the coalescent tree
            print('Sprinkling mutations onto trees')
            sprinkled = msprime.mutate(pos_tree, rate=mut_rate, keep=True)

            ## Get the allele frequencies for all segregating sites at QTL
            print('extracting segregating sites from trees...')
            count = 0

            for variant in sprinkled.variants():
                # Only biallelic sites are analysed.
                if variant.num_alleles > 2:
                    continue
                gene = findGene(variant.position, genome)

                if not gene:
                    continue
                if gene == -99:
                    print(gene)
                    print('FAILED')
                    return

                md = pyslim.decode_mutation(
                    variant.site.mutations[0].metadata)
                # Empty decoded metadata is treated as selection coeff 0.
                if len(md) > 0:
                    selCoeff = md[0].selection_coeff
                else:
                    selCoeff = 0

                count += 1
                if count % 1000 == 0:
                    print('extracted', count, 'neutral sites')

                ## Calculating the genotype freqs this way assumes that the
                ## ref and alt are coded as 1s and 0s, respectively.
                if 2 in variant.genotypes:
                    print(variant)
                    print(list(variant.genotypes))
                    print("!!!!!!!!!!!!!!")
                    raise ValueError(
                        "unexpected genotype code 2 at position " +
                        str(variant.position))

                if variant.genotypes.sum() == N:
                    continue  # Ignore fixed mutations

                # Average the two haploid genomes of each diploid, then
                # split into per-population blocks of diploids_per_pop.
                genotypes_as_freqs = (variant.genotypes[::2] +
                                      variant.genotypes[1::2]) / 2
                pop_chunks = list(
                    chunks(genotypes_as_freqs, int(diploids_per_pop)))
                freqs_per_pop = [i.mean() for i in pop_chunks]

                if bayPassConfig is not None:
                    # One line per SNP: "<count> <other count>" pairs, one
                    # pair per population.
                    counts_per_pop = [int(i.sum() * 2) for i in pop_chunks]
                    bayPassConfig.write(" ")
                    for c in counts_per_pop:
                        bayPassConfig.write(
                            str(c) + " " + str(diploids_per_pop * 2 - c) +
                            " ")
                    bayPassConfig.write("\n")

                try:
                    LA = adapGenes[gene[0]]
                except KeyError:
                    LA = 0

                pbar = sum(freqs_per_pop) / len(freqs_per_pop)

                if pbar == 1:
                    continue  ## This can be triggered by mutation stacking and infinite sites funniness, fix this part of the script
                if pbar > 1:
                    print("WTF, what is going on here: pbar > 1")
                    print(variant.genotypes)
                    continue
                pbar_qbar = pbar * (1 - pbar)
                maf = min(pbar, 1 - pbar)
                if pbar_qbar < 0:
                    print("WTF, what is going on here: pbar_qbar < 1")
                    print(variant.genotypes)
                    continue

                # NOTE(review): the geno_* and pop_* columns were computed
                # from identical inputs, so the statistics are computed once
                # and written to both sets of columns - confirm whether
                # geno_* was meant to use individual-level genotypes.
                spearman = scipy.stats.spearmanr(envi_pops, freqs_per_pop)
                k_tau, k_tau_p_value = scipy.stats.kendalltau(
                    envi_pops, freqs_per_pop)
                stats_columns = [
                    spearman.correlation, spearman.correlation**2,
                    spearman.pvalue, k_tau, k_tau_p_value
                ]

                outline = ([variant.position, gene[0], selCoeff] +
                           stats_columns + stats_columns +
                           [pbar, pbar_qbar, maf, LA])
                outputDF.write(",".join(map(str, outline)) + "\n")
    finally:
        outputDF.close()
        if bayPassConfig is not None:
            bayPassConfig.close()
# Keywords: Python, tree-sequence recording, tree sequence recording

import msprime, pyslim

# Load the SLiM tree sequence and simplify it before analysis.
ts = pyslim.load("recipe_17.7.trees").simplify()

# Selection coefficients of the mutations stacked at every tree-sequence
# mutation that carries at least one non-neutral (non-zero) coefficient.
# When such a stack is kept, all of its coefficients are kept, so zeros can
# appear in `coeffs`; they are dropped again by the sign filters below.
coeffs = []
for mut in ts.mutations():
    md = pyslim.decode_mutation(mut.metadata)
    sel = [x.selection_coeff for x in md]
    if any(s != 0 for s in sel):
        coeffs += sel

b = [x for x in coeffs if x > 0]  # beneficial coefficients
d = [x for x in coeffs if x < 0]  # deleterious coefficients

# An empty class has no defined mean: report NaN instead of crashing with
# ZeroDivisionError. Output is unchanged whenever the class is non-empty.
print("Beneficial: " + str(len(b)) + ", mean " +
      str(sum(b) / len(b) if b else float("nan")))
print("Deleterious: " + str(len(d)) + ", mean " +
      str(sum(d) / len(d) if d else float("nan")))