def generate_homhet_cohort(num_cases, num_conts, num_snps, ps, ps_stratif, h_sq, het=False):
    """Simulate a case/control cohort, optionally diluting the cases.

    SNP properties come from ``generate_snp_props`` and the cohort from
    ``generate_cohort``, using as threshold the upper 1% quantile of a
    standard normal (``prev = 0.01``).

    When ``het`` is true, only the first ``int(num_cases * 0.5)`` simulated
    cases are kept; the remainder is filled with rows taken from a shuffled
    batch produced by ``generate_stratified_controls``.

    Args:
        num_cases: Number of cases requested from ``generate_cohort``.
        num_conts: Number of controls requested; also the size of the batch
            drawn from ``generate_stratified_controls`` when ``het`` is true.
        num_snps: Number of SNPs to simulate.
        ps: Frequencies handed to ``generate_snp_props``.
        ps_stratif: Stratified frequencies, forwarded unchanged to
            ``generate_cohort`` and ``generate_stratified_controls``.
        h_sq: Variance explained, forwarded to ``generate_snp_props`` and
            ``generate_cohort``.
        het: If true, replace part of the cases as described above.

    Returns:
        Tuple ``(cases, conts)``.
    """
    # NOTE: ps is still needed for homogeneous cases, because the betas are
    # generated from it.
    prevalence = 0.01
    cutoff = norm.ppf(1 - prevalence, loc=0, scale=1)

    ps, betas = generate_snp_props(num_snps, ps, h_sq)

    # No explicit per-SNP `freqs` is passed: frequencies are sampled randomly
    # when generating stratification.
    cases, conts = generate_cohort(
        num_cases=num_cases,
        num_conts=num_conts,
        num_snps=num_snps,
        betas=betas,
        ps=ps,
        ps_stratif=ps_stratif,
        h_sq=h_sq,
        thresh=cutoff,
    )

    if not het:
        return cases, conts

    # Heterogeneous cohort: keep a fraction `mix` of the true cases and top
    # up with control-like samples.
    mix = 0.5
    keep_true = int(num_cases * mix)
    take_filler = int(num_cases * (1 - mix))

    filler = generate_stratified_controls(num_conts, num_snps, ps_stratif)
    np.random.shuffle(filler)  # in-place shuffle along the first axis

    cases = np.concatenate((cases[:keep_true], filler[:take_filler]), axis=0)
    return cases, conts
def run_liability():
    """Run (or plot) the two-subtype liability-scaling simulation.

    If the cache file ``subtype_frac_liab.p`` exists, its contents are
    unpickled and passed to ``plot_results``.  Otherwise the simulation is
    run and its results are pickled to that file; nothing is plotted on
    that pass.

    Simulation outline: for every fraction in ``h_sq_frac_range`` a second
    set of subtype effect sizes is generated with ``h_sq * fraction``.  In
    each trial half of the cases are generated with the first subtype's
    betas and half with the second subtype's betas, and ``heterogeneity``
    is evaluated on the concatenated cases against the controls from the
    first ``generate_cohort`` call.

    Returns:
        None.
    """
    FILE_PATH = "subtype_frac_liab.p"

    if os.path.exists(FILE_PATH):
        # The cache is produced by this function; pickle must only ever be
        # used on trusted files.
        with open(FILE_PATH, "rb") as cache:
            h_sq_frac_range, results, hetsc_exp = pickle.load(cache)
        plot_results(h_sq_frac_range, results, hetsc_exp,
                     outname="subtype_scale_liability.eps",
                     xaxis_label='Fraction of subtype variance explained')
        return

    num_cases = 30000
    num_conts = 30000
    num_snps = 100
    fixed_ps = np.array([0.2] * num_snps)
    num_trials = 20
    h_sq = 0.05
    h_sq_frac_range = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
    half_cases = int(num_cases / 2)

    sub_betas_list, _ = generate_snps_splits(num_sub_phenos=2,
                                             frac_shared_effects=0,
                                             num_snps=num_snps,
                                             ps=fixed_ps,
                                             h_sq=h_sq)

    # Subtype 1 properties (only the betas are used below).
    _, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
    prev = 0.01
    thresh = norm.ppf(1 - prev, loc=0, scale=1)

    # Expected score if all cases followed subtype 1.
    hetsc_exp = heterogeneity_expected_corr(ncases=num_cases,
                                            nconts=num_conts,
                                            effects=betas,
                                            thresh=thresh,
                                            freqs=fixed_ps,
                                            heritability=h_sq,
                                            verbose=False)
    print(hetsc_exp)

    results = []
    for h_sq_frac in h_sq_frac_range:
        scores = []
        # Generate subtype 2 properties with the scaled variance explained.
        sub_betas_list2, _ = generate_snps_splits(num_sub_phenos=2,
                                                  frac_shared_effects=0,
                                                  num_snps=num_snps,
                                                  ps=fixed_ps,
                                                  h_sq=h_sq * h_sq_frac)
        for nt in range(num_trials):
            print("h_sq_frac: %s, trial: %s" % (h_sq_frac, nt))
            cases_sub1, conts = generate_cohort(num_cases=half_cases,
                                                num_conts=num_conts,
                                                freqs=fixed_ps,
                                                betas=sub_betas_list[0],
                                                h_sq=h_sq,
                                                thresh=thresh)
            # Controls from this second call are discarded.
            cases_sub2, _ = generate_cohort(num_cases=half_cases,
                                            num_conts=num_conts,
                                            freqs=fixed_ps,
                                            betas=sub_betas_list2[1],
                                            h_sq=h_sq * h_sq_frac,
                                            thresh=thresh)
            cases = np.concatenate((cases_sub1, cases_sub2), axis=0)
            scores.append(heterogeneity(cases, conts))
        results.append(scores)

    # Context manager guarantees the cache is flushed and closed.
    with open(FILE_PATH, "wb") as cache:
        pickle.dump((h_sq_frac_range, results, hetsc_exp), cache)
def run_heritability():
    """Plot or simulate heterogeneity score versus variance explained.

    If ``simulate_results_varexp.p`` exists, the cached per-trial scores
    are summarised (mean and standard deviation per ``h_sq``), plotted as
    error bars together with the expected score, saved to
    ``simulate_basic_heritability.eps`` and shown.  Otherwise the
    simulation is run and its results are pickled to that file.

    NOTE(review): another ``run_heritability`` is defined further down in
    this file; if both are at module level, that later definition replaces
    this one.

    Returns:
        None.
    """
    FILE_PATH = "simulate_results_varexp.p"
    fig, ax = plt.subplots(1, 1)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    if os.path.exists(FILE_PATH):
        # The cache is produced by this function; pickle must only ever be
        # used on trusted files.
        with open(FILE_PATH, "rb") as cache:
            pckl = pickle.load(cache)

        hetsc_means_hom = [np.mean(x) for x in pckl["hetscs_homs"]]
        hetsc_stds_hom = [np.std(x) for x in pckl["hetscs_homs"]]
        hetsc_means_het = [np.mean(x) for x in pckl["hetscs_hets"]]
        hetsc_stds_het = [np.std(x) for x in pckl["hetscs_hets"]]
        hetsc_means_cont = [np.mean(x) for x in pckl["hetscs_conts"]]
        hetsc_stds_cont = [np.std(x) for x in pckl["hetscs_conts"]]
        hetsc_exps = pckl["hetsc_exps"]
        h_sqs = pckl["h_sqs"]

        plt.errorbar(h_sqs, hetsc_means_hom, yerr=hetsc_stds_hom,
                     color='red', capsize=5)
        plt.errorbar(h_sqs, hetsc_means_het, yerr=hetsc_stds_het,
                     color='green', capsize=5)
        plt.errorbar(h_sqs, hetsc_means_cont, yerr=hetsc_stds_cont,
                     color='black', capsize=5)
        plt.plot(h_sqs, hetsc_exps, color='blue')
        plt.xlabel("Variance explained by modeled SNPs", fontsize=14)
        plt.ylabel("Heterogeneity Score", fontsize=14)

        # Label the CLiP score: a double-headed arrow between the expected
        # curve and the heterogeneous-cases curve, plus a text label.
        xloc = 0.8 * (h_sqs[-1] - h_sqs[0])
        yloc1 = np.interp(xloc, h_sqs, hetsc_exps)
        yloc2 = np.interp(xloc, h_sqs, hetsc_means_het)
        # The annotation text is passed positionally: the `s=` keyword was
        # renamed to `text` in Matplotlib 3.3 and is rejected by newer
        # releases, whereas the positional form works everywhere.
        ax.annotate('', xy=(xloc, yloc1), xycoords='data',
                    xytext=(xloc, yloc2), textcoords='data',
                    arrowprops=dict(arrowstyle="<->", color='gray'))
        ax.annotate('CLiP Score',
                    xy=(xloc * 1.02, yloc1 + (yloc2 - yloc1) * 0.75),
                    xycoords='data', fontsize=14.0, textcoords='data',
                    ha='left', color='gray')
        plt.xticks(fontsize=11)
        plt.yticks(fontsize=11)
        plt.savefig("simulate_basic_heritability.eps", format="eps", dpi=1000)
        plt.show()
    else:
        hetscs_homs = []
        hetscs_hets = []
        hetscs_conts = []
        hetsc_exps = []
        h_sqs = [0.001, 0.01, 0.02, 0.03, 0.05, 0.075, 0.1]
        num_trials = 20
        num_cases = 30000
        num_conts = 30000
        num_snps = 100
        prev = 0.01
        thresh = norm.ppf(1 - prev, loc=0, scale=1)

        for h_sq in h_sqs:
            fixed_ps = np.array([0.2] * num_snps)
            hetscs_hom = []
            hetscs_het = []
            hetscs_cont = []

            # Expected score, homogeneous cases.
            ps, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
            hetsc_exp = heterogeneity_expected_corr(ncases=num_cases,
                                                    nconts=num_conts,
                                                    effects=betas,
                                                    thresh=thresh,
                                                    freqs=ps,
                                                    heritability=h_sq,
                                                    verbose=False)
            hetsc_exps.append(hetsc_exp)

            for i in range(num_trials):
                print("h_sq: %s, trial: %s" % (h_sq, i))

                # NOTE(review): generate_homhet_cohort as defined in this
                # file takes (num_cases, num_conts, num_snps, ps,
                # ps_stratif, h_sq, het=False).  The two calls below pass
                # only five positional arguments, so h_sq lands in the
                # ps_stratif slot and the required h_sq is missing.  Left
                # as-is because the intended ps_stratif is not known here.

                # Homogeneous cases, controls.
                cases, conts = generate_homhet_cohort(num_cases, num_conts,
                                                      num_snps, ps, h_sq)
                hetscs_hom.append(heterogeneity(cases, conts))

                # Heterogeneous cases, controls.
                cases, conts = generate_homhet_cohort(num_cases, num_conts,
                                                      num_snps, ps, h_sq,
                                                      het=True)
                hetscs_het.append(heterogeneity(cases, conts))

                # Controls, controls.
                # NOTE(review): elsewhere in this file generate_controls is
                # called with four arguments -- confirm this signature.
                cases, conts = generate_controls(num_cases, num_conts,
                                                 num_snps, ps, h_sq)
                hetscs_cont.append(heterogeneity(cases, conts))

            hetscs_homs.append(hetscs_hom)
            hetscs_hets.append(hetscs_het)
            hetscs_conts.append(hetscs_cont)

        # Context manager guarantees the cache is flushed and closed.
        with open(FILE_PATH, "wb") as cache:
            pickle.dump({"hetscs_homs": hetscs_homs,
                         "hetscs_hets": hetscs_hets,
                         "hetscs_conts": hetscs_conts,
                         "hetsc_exps": hetsc_exps,
                         "h_sqs": h_sqs}, cache)


# The string literal below wraps a disabled `run_sample_size`; it is never
# executed and its text is kept verbatim.
"""
def run_sample_size(): FILE_PATH = "simulate_results_samplesize.p" fig, ax = plt.subplots(1,1) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) if os.path.exists(FILE_PATH): pckl = pickle.load(open(FILE_PATH, "rb")) # hetsc_means_hom = pckl["hetsc_means_hom"] # hetsc_stds_hom = pckl["hetsc_stds_hom"] # hetsc_means_het = pckl["hetsc_means_het"] # hetsc_stds_het = pckl["hetsc_stds_het"] # hetsc_means_cont = pckl["hetsc_means_cont"] # hetsc_stds_cont = pckl["hetsc_stds_cont"] hetsc_means_hom = [np.mean(x) for x in pckl["hetscs_homs"]] hetsc_stds_hom = [np.std(x) for x in pckl["hetscs_homs"]] hetsc_means_het = [np.mean(x) for x in pckl["hetscs_hets"]] hetsc_stds_het = [np.std(x) for x in pckl["hetscs_hets"]] hetsc_means_cont = [np.mean(x) for x in pckl["hetscs_conts"]] hetsc_stds_cont = [np.std(x) for x in pckl["hetscs_conts"]] hetsc_exps = pckl["hetsc_exps"] sample_size = pckl["sample_size"] plt.errorbar(sample_size, hetsc_means_hom, yerr=hetsc_stds_hom, color='red', capsize=5) plt.errorbar(sample_size, hetsc_means_het, yerr=hetsc_stds_het, color='green', capsize=5) plt.errorbar(sample_size, hetsc_means_cont, yerr=hetsc_stds_cont, color='black', capsize=5) # plot values for i in range(len(sample_size)): print("sample size: %s, score: %s, std dev: %s" % (sample_size[i], hetsc_means_hom[i], hetsc_stds_hom[i])) plt.plot(sample_size, hetsc_exps, color='blue') plt.xlabel("Number of simulated cases and controls", fontsize=14) plt.ylabel("Heterogeneity Score", fontsize=14) # label CLiP score xloc = 0.8 * (sample_size[-1] - sample_size[0]) yloc1 = np.interp(xloc, sample_size, hetsc_exps) yloc2 = np.interp(xloc, sample_size, hetsc_means_het) ax.annotate(s='',xy=(xloc, yloc1), xycoords='data', xytext=(xloc, yloc2),textcoords='data', arrowprops=dict(arrowstyle="<->", color='gray')) ax.annotate(s='CLiP Score',xy=(xloc * 1.02, yloc1 + (yloc2-yloc1)*0.75), xycoords='data',fontsize=14.0,textcoords='data', ha='left', color='gray') plt.xticks(fontsize=11) 
plt.yticks(fontsize=11) plt.savefig("simulate_basic_num_inds.eps", format="eps", dpi=1000) plt.show() else: # hetsc_means_hom = [] # hetsc_stds_hom = [] # hetsc_means_het = [] # hetsc_stds_het = [] # hetsc_means_cont = [] # hetsc_stds_cont = [] hetscs_homs = [] hetscs_hets = [] hetscs_conts = [] hetsc_exps = [] sample_sizes = [1000, 5000, 10000, 20000, 30000, 50000] # sample_sizes = [5000, 10000] # sample_sizes = [1000, 5000, 10000] num_trials = 20 for sample_size in sample_sizes: num_cases = sample_size num_conts = sample_size num_snps = 100 h_sq = 0.034 fixed_ps = np.array([0.2]*num_snps) hetscs_hom = [] hetscs_het = [] hetscs_cont = [] # expected score, homogeneous cases ps, betas = generate_snp_props(num_snps, fixed_ps, h_sq) prev = 0.01 thresh = norm.ppf(1-prev, loc=0, scale=1) hetsc_exp = heterogeneity_expected_corr(ncases = num_cases, nconts = num_conts, effects = betas, thresh = thresh, freqs = ps, heritability = h_sq, verbose=False) hetsc_exps.append(hetsc_exp) for i in range(num_trials): print("sample_size: %s, trial: %s" % (sample_size, i)) # homogeneous cases, controls cases,conts = generate_homhet_cohort(num_cases, num_conts, num_snps, fixed_ps, h_sq) score = heterogeneity(cases,conts) hetscs_hom.append(score) # heterogeneous cases, controls cases,conts = generate_homhet_cohort(num_cases, num_conts, num_snps, fixed_ps, h_sq, het=True) score = heterogeneity(cases,conts) hetscs_het.append(score) # controls, controls cases,conts = generate_controls(num_cases, num_conts, num_snps, fixed_ps, h_sq) score = heterogeneity(cases,conts) hetscs_cont.append(score) # hetsc_means_hom.append(np.mean(hetscs_hom)) # hetsc_stds_hom.append(np.std(hetscs_hom)) # hetsc_means_het.append(np.mean(hetscs_het)) # hetsc_stds_het.append(np.std(hetscs_het)) # hetsc_means_cont.append(np.mean(hetscs_cont)) # hetsc_stds_cont.append(np.std(hetscs_cont)) hetscs_homs.append(hetscs_hom) hetscs_hets.append(hetscs_het) hetscs_conts.append(hetscs_cont) 
pickle.dump({"hetscs_homs":hetscs_homs, "hetscs_hets":hetscs_hets, "hetscs_conts":hetscs_conts, "hetsc_exps":hetsc_exps, "sample_size":sample_sizes}, open(FILE_PATH, "wb")) """
def run_heritability():
    """Simulate heterogeneity scores under population stratification.

    For every Fst value in ``fsts`` an allele-frequency offset of
    ``0.5 * sqrt(fst)`` is applied around a base frequency of 0.5 to build
    two mirrored frequency vectors (``ps_stratif``).  For each variance
    explained ``h_sq`` and each trial, ``heterogeneity`` is evaluated on

    * homogeneous cases vs. controls,
    * heterogeneous cases vs. controls (``het=True``), and
    * controls vs. controls.

    Results for one Fst are pickled to ``population_stratif_out_<fst>.p``;
    an Fst whose output file already exists is skipped.  After the loop,
    ``plot_results(fsts)`` is called.

    Returns:
        None.
    """
    fsts = np.array([0.001, 0.005, 0.01, 0.05, 0.1])
    pop_allele_diffs = 0.5 * np.sqrt(fsts)

    # Simulation constants (identical for every Fst / h_sq combination).
    num_snps = 100
    half_snps = int(num_snps / 2)
    fixed_ps_val = 0.5
    num_cases = 30000
    num_conts = 30000
    num_trials = 30
    h_sqs = [0.001, 0.01, 0.02, 0.03, 0.05, 0.075, 0.1]
    prev = 0.01
    thresh = norm.ppf(1 - prev, loc=0, scale=1)

    for fst, allele_diff in zip(fsts, pop_allele_diffs):
        FILE_PATH = "population_stratif_out_%s.p" % (fst)
        if os.path.exists(FILE_PATH):
            # Results for this Fst are already on disk.
            continue

        fixed_ps = np.array([fixed_ps_val] * num_snps)

        # Generate stratified populations: two frequency vectors that are
        # shifted in opposite directions on each half of the SNPs.
        # (presumably one vector per sub-population -- confirm against
        # generate_cohort / generate_stratified_controls)
        low = fixed_ps_val - allele_diff
        high = fixed_ps_val + allele_diff
        ps_stratif = [[low] * half_snps + [high] * half_snps,
                      [high] * half_snps + [low] * half_snps]

        hetscs_homs = []
        hetscs_hets = []
        hetscs_conts = []
        hetsc_exps = []
        # NOTE(review): originally commented as "allele difference between
        # homogeneous cases and controls", but the value is computed from
        # the heterogeneous cohort below -- confirm which is intended.
        mean_allele_diffs = []

        for h_sq in h_sqs:
            hetscs_hom = []
            hetscs_het = []
            hetscs_cont = []
            mean_allele_diff = []

            # Expected score, homogeneous cases.
            ps, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
            hetsc_exp = heterogeneity_expected_corr(ncases=num_cases,
                                                    nconts=num_conts,
                                                    effects=betas,
                                                    thresh=thresh,
                                                    freqs=ps,
                                                    heritability=h_sq,
                                                    verbose=False)
            hetsc_exps.append(hetsc_exp)

            for i in range(num_trials):
                print("allele_diff: %s, h_sq: %s, trial: %s"
                      % (allele_diff, h_sq, i))

                # Homogeneous cases, controls.
                cases, conts = generate_homhet_cohort(num_cases, num_conts,
                                                      num_snps, ps,
                                                      ps_stratif, h_sq)
                hetscs_hom.append(heterogeneity(cases, conts))

                # Heterogeneous cases, controls.
                cases, conts = generate_homhet_cohort(num_cases, num_conts,
                                                      num_snps, ps,
                                                      ps_stratif, h_sq,
                                                      het=True)
                hetscs_het.append(heterogeneity(cases, conts))
                mean_allele_diff.append(np.mean(cases) - np.mean(conts))

                # Controls, controls.
                cases, conts = generate_controls(num_cases, num_conts,
                                                 num_snps, ps_stratif)
                hetscs_cont.append(heterogeneity(cases, conts))

            hetscs_homs.append(hetscs_hom)
            hetscs_hets.append(hetscs_het)
            hetscs_conts.append(hetscs_cont)
            mean_allele_diffs.append(mean_allele_diff)

        # Context manager guarantees the results file is flushed and closed.
        with open(FILE_PATH, "wb") as out:
            pickle.dump({"hetscs_homs": hetscs_homs,
                         "hetscs_hets": hetscs_hets,
                         "hetscs_conts": hetscs_conts,
                         "hetsc_exps": hetsc_exps,
                         "mean_allele_diffs": mean_allele_diffs,
                         "h_sqs": h_sqs}, out)

    plot_results(fsts)