Esempio n. 1
0
def execute_tpr(args):
    """Estimate the true positive rate for region detection.

    For every enabled selected-allele-frequency bin, gather the replicate
    files (named by ``get_sel_repfile_name`` with ``normed=True``) that exist
    on disk, read each one with ``read_cms_repfile``, select a score via
    ``args.score``, compute per-replicate values with ``check_rep_windows``
    and summarize them with ``calc_pr``.

    Attributes read from ``args``: ``model``, ``regionlen``, ``thresshold``,
    ``cutoff``, ``nrep``, ``simpop``, ``suffix``, ``writedir``, ``score`` and
    ``saveLog``. When ``args.saveLog`` is not None, the rate and the run
    parameters are written to ``<saveLog>_<bin label>``.

    Returns None; results are printed and optionally written to disk.
    """
    model = args.model
    regionlen = args.regionlen
    thresshold = args.thresshold
    cutoff = args.cutoff
    numReps = args.nrep
    pop = args.simpop
    suffix = args.suffix
    writedir = args.writedir
    takeScore = args.score

    # Selected-allele-frequency bins and their labels, index-aligned.
    dafbins = [
        ['0.10', '0.20', '0.30', '0.40', '0.50', '0.60', '0.70', '0.80', '0.90'],
        ['0.10', '0.20', '0.30'],
        ['0.40', '0.50', '0.60'],
        ['0.70', '0.80', '0.90'],
        ['0.90'],
    ]
    daflabels = ['all', 'lo', 'mid', 'hi', 'highest']

    for ibin in [3]:  # only the 'hi' bin is enabled; e.g. [1, 2, 3, 4] runs each sub-bin
        thesebins, thislabel = dafbins[ibin], daflabels[ibin]

        # Accumulators are per bin so that the rate reported (and saved under
        # this bin's label) never mixes in replicates from another bin.
        all_scores = []
        all_percentages = []

        allrepfilenames = []
        for selbin in thesebins:
            for irep in range(1, numReps + 1):
                repfilename = get_sel_repfile_name(model, irep, pop, selbin, normed=True, suffix=suffix, basedir=writedir)
                if irep == 1:
                    # Echo one example path per frequency bin.
                    print(repfilename)
                if os.path.isfile(repfilename):
                    allrepfilenames.append(repfilename)
        print('loaded ' + str(len(allrepfilenames)) + " replicates...")

        for repfilename in allrepfilenames:
            # NOTE: these local names are part of the contract with
            # ``args.score`` -- the eval() below resolves the requested score
            # against them, so they must not be renamed or moved out of scope.
            (physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed,
             xpehh_normed, fst, deldaf, cms_unnormed, cms_normed) = read_cms_repfile(repfilename)
            # SECURITY NOTE(review): eval() executes whatever expression is
            # supplied in ``args.score``. Only run this with trusted input.
            these_scores = eval(takeScore)
            if len(these_scores) > 0:
                all_scores.append(these_scores)
                rep_percentages = check_rep_windows(physpos, these_scores, regionlen, cutoff=cutoff)
                all_percentages.append(rep_percentages)

        print('loaded ' + str(len(all_scores)) + " replicates populations for model " + model + "...")
        tpr = calc_pr(all_percentages, thresshold)
        print('true positive rate: ' + str(tpr) + "\n")

        if args.saveLog is not None:
            writefilename = args.saveLog + "_" + thislabel
            # Context manager guarantees the handle is closed even if a write fails.
            with open(writefilename, 'w') as writefile:
                writefile.write(str(tpr) + '\n')
                writefile.write(model + "\t" + str(regionlen) + "\t" + str(thresshold) + '\t' + str(cutoff) + '\n')
            print('wrote to :  ' + str(writefilename))
    return
Esempio n. 2
0
def execute_cdf(args):
    """Plot, per population, a CDF of the causal variant's rank in simulations.

    For populations 1-4 and a fixed set of selected-allele-frequency
    scenarios, each existing replicate file (named by
    ``get_sel_repfile_name`` with ``normed=False``) is read with
    ``read_cms_repfile``. When ``args.selPos`` is present among the physical
    positions, the rank of its unnormalized score is obtained from
    ``get_causal_rank`` and kept unless it is NaN. One curve per population
    with at least one rank is drawn from ``get_cdf_from_causal_ranks`` and
    the figure is saved to ``args.savefilename``.

    Attributes read from ``args``: ``nrep``, ``savefilename``, ``writedir``,
    ``model``, ``selPos`` and ``suffix``.

    Returns None.
    """
    reps = args.nrep
    savefilename = args.savefilename
    writedir = args.writedir
    scenars = ['0.70', '0.80', '0.90']
    model = args.model
    causalPos = args.selPos
    suffix = args.suffix

    # Population id -> line colour, in plotting order.
    pop_colors = [(1, "yellow"), (2, "blue"), (3, "green"), (4, "purple")]
    # One list of causal ranks per population (replaces eval() on a
    # dynamically built variable name).
    causal_ranks = {pop: [] for pop, _ in pop_colors}

    for pop, _ in pop_colors:
        for scenar in scenars:
            for irep in range(1, reps + 1):
                cmsfilename = get_sel_repfile_name(model, irep, pop, scenar, normed=False, basedir=writedir, suffix=suffix)
                if not os.path.isfile(cmsfilename):
                    print("missing; " + cmsfilename)
                    continue

                (physpos, genpos, seldaf, ihs_normed, delihh_normed, nsl_normed,
                 xpehh_normed, fst, deldaf, cms_unnormed, cms_normed) = read_cms_repfile(cmsfilename)
                if causalPos not in physpos:
                    # Replicates without the causal position contribute nothing.
                    continue

                causal_unnormed = cms_unnormed[physpos.index(causalPos)]
                causal_rank = get_causal_rank(cms_unnormed, causal_unnormed)
                if not np.isnan(causal_rank):
                    causal_ranks[pop].append(causal_rank)

    for pop, _ in pop_colors:
        print("for pop " + str(pop) + ", loaded " + str(len(causal_ranks[pop])) + " replicates.")

    cdf_fig, cdf_ax = plt.subplots()
    for pop, color in pop_colors:
        ranks = causal_ranks[pop]
        if len(ranks) > 0:
            cdf_bins, cdf = get_cdf_from_causal_ranks(ranks)
            # The first bin edge is dropped so x and y have matching lengths
            # (presumably bins are histogram edges -- confirm in the helper).
            cdf_ax.plot(cdf_bins[1:], cdf, color=color)
    cdf_ax.set_xlim([0, 50])
    plt.title(model)
    plt.ylabel('probability that the causal variant is captured')
    plt.xlabel('significance thresshold (i.e., examining the top x variants)')
    plt.savefig(savefilename)
    plt.close()
    print('plotted to ' + savefilename)
    return
Esempio n. 3
0
def _normsims_read_log_scores(path):
    """Return the natural log of the last column of every data row in *path*.

    The first line is treated as a header and skipped; blank rows are ignored.
    """
    scores = []
    with open(path, 'r') as infile:
        infile.readline()  # header
        for line in infile:
            entries = line.split()
            if not entries:
                continue
            scores.append(np.log(float(entries[-1])))
    return scores


def _normsims_write_normed(rawfile, mean, sd):
    """Write ``rawfile + ".norm"`` with a normalized score appended to each row.

    The header line is copied unchanged. For every non-blank data row the log
    of the last column is passed to ``normalize`` together with *mean* and
    *sd*, and the result is appended as a new tab-separated column.
    Returns the path of the file that was written.
    """
    normedfile = rawfile + ".norm"
    with open(rawfile, 'r') as infile, open(normedfile, 'w') as outfile:
        outfile.write(infile.readline())
        for line in infile:
            entries = line.split()
            if not entries:
                continue
            rawscore = np.log(float(entries[-1]))
            normalized = normalize(rawscore, mean, sd)
            outfile.write(line.strip('\n') + "\t" + str(normalized) + "\n")
    return normedfile


def execute_normsims_genomewide(args):
    """Normalize all simulated replicates to the neutral score distribution.

    The log of the last column of every existing neutral replicate file is
    pooled; NaN and infinite values are discarded and the mean and standard
    deviation of the remainder are computed. Each existing neutral and
    selection replicate file is then rewritten to ``<file>.norm`` with a
    normalized score appended to every row.

    Attributes read from ``args``: ``model``, ``simpop``, ``nrep_sel``,
    ``nrep_neut``, ``writedir`` and ``runSuffix``.

    Raises:
        ValueError: if no finite neutral score could be loaded, since the
            normalization parameters would be undefined.

    Returns None.
    """
    sel_freq_bins = [
        '0.10', '0.20', '0.30', '0.40', '0.50', '0.60', '0.70', '0.80', '0.90'
    ]
    model = args.model
    selpop = args.simpop
    numPerBin_sel = args.nrep_sel
    numPerBin_neut = args.nrep_neut
    writedir = args.writedir
    suffix = args.runSuffix

    ##############################
    ## LOAD STATS FROM NEUT SIMS #
    ##############################
    neut_files = []
    values = []
    for irep in range(1, numPerBin_neut + 1):
        outfile = get_neut_repfile_name(model,
                                        irep,
                                        selpop,
                                        suffix=suffix,
                                        normed=False,
                                        basedir=writedir)
        if os.path.isfile(outfile):
            neut_files.append(outfile)
            values.extend(_normsims_read_log_scores(outfile))
        else:
            print('missing: ' + outfile)

    print('loaded ' + str(len(values)) + ' values from neutral sims...')

    # Drop NaN and +/-inf in one pass (log of non-positive scores).
    values = np.array(values, dtype=float)
    values = values[np.isfinite(values)]
    if values.size == 0:
        raise ValueError(
            "no finite neutral scores were loaded; cannot estimate "
            "normalization parameters")

    mean = np.mean(values)
    var = np.var(values)
    sd = np.sqrt(var)

    print("max: " + str(values.max()))
    print("min: " + str(values.min()))
    print("mean: " + str(mean))
    print("var: " + str(var))

    ############################
    ## NORMALIZE NEUTRAL SIMS ##
    ############################
    last_neut_normed = None
    for outfile in neut_files:
        last_neut_normed = _normsims_write_normed(outfile, mean, sd)
    if last_neut_normed is not None:
        print("wrote to eg: " + last_neut_normed)

    ########################
    ## NORMALIZE SEL SIMS ##
    ########################
    last_sel_normed = None
    for sel_freq_bin in sel_freq_bins:
        for irep in range(1, numPerBin_sel + 1):
            rawfile = get_sel_repfile_name(model,
                                           irep,
                                           selpop,
                                           sel_freq_bin,
                                           suffix=suffix,
                                           normed=False,
                                           basedir=writedir)
            if os.path.isfile(rawfile):
                last_sel_normed = _normsims_write_normed(rawfile, mean, sd)
    # Only report a selection file when one was actually written, instead of
    # echoing the neutral file name left over from the previous phase.
    if last_sel_normed is not None:
        print("wrote to eg: " + last_sel_normed)
    return