def test_n_splits(num_sub_phenos, frac_shared_effects, num_snps=100,
                  num_cases=5000, num_conts=5000):
    """Simulate a cohort split into sub-phenotypes and return its heterogeneity score."""
    num_snps_shared = int(num_snps * frac_shared_effects)
    # ensure an equal number of non-shared SNPs across sub-phenotypes
    assert (num_snps - num_snps_shared) % num_sub_phenos == 0
    num_snps_exclsv = (num_snps - num_snps_shared) // num_sub_phenos
    h_sq = 0.034
    ps = np.array([0.2] * num_snps)
    cases = np.zeros((0, num_snps))
    conts = np.zeros((0, num_snps))
    for i in range(num_sub_phenos):
        num_sub_cases = int(num_cases / num_sub_phenos)
        num_sub_conts = num_sub_cases
        # indicator vector: shared SNPs plus the block exclusive to sub-phenotype i
        sub_betas = np.array([
            int(j < num_snps_shared or
                (j >= num_snps_shared + i * num_snps_exclsv and
                 j < num_snps_shared + (i + 1) * num_snps_exclsv))
            for j in range(num_snps)])
        print(num_snps_shared)
        print(num_snps_exclsv)
        print(sub_betas)
        # scale effect sizes so the active subset explains h_sq of the variance
        beta_val = np.sqrt(
            h_sq / np.sum(np.multiply(2 * np.multiply(ps, 1 - ps), sub_betas)))
        sub_betas = beta_val * sub_betas
        print(i, num_sub_cases)
        print(sub_betas)
        print("subset variance explained:",
              np.dot(np.square(sub_betas), 2 * np.multiply(ps, 1 - ps)))
        # generate sub-cohort under the liability threshold model
        prev = 0.01
        thresh = norm.ppf(1 - prev, loc=0, scale=1)
        sub_cases, sub_conts = generate_cohort(num_cases=num_sub_cases,
                                               num_conts=num_sub_conts,
                                               freqs=ps, betas=sub_betas,
                                               h_sq=h_sq, thresh=thresh)
        cases = np.concatenate((cases, sub_cases), axis=0)
        conts = np.concatenate((conts, sub_conts), axis=0)
    score = heterogeneity(cases, conts)
    return score
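# A minimal usage sketch, assuming this module's helpers (generate_cohort,
# heterogeneity) and the usual imports (numpy as np, scipy.stats.norm) are
# available; the sub-phenotype counts are chosen to keep the assert above satisfied.
if __name__ == "__main__":
    for n_sub in [2, 5, 10]:
        score = test_n_splits(num_sub_phenos=n_sub, frac_shared_effects=0.5)
        print("sub-phenotypes: %s, heterogeneity score: %s" % (n_sub, score))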
def run_liability():
    FILE_PATH = "subtype_frac_liab.p"
    if os.path.exists(FILE_PATH):
        h_sq_frac_range, results, hetsc_exp = pickle.load(open(FILE_PATH, "rb"))
        plot_results(h_sq_frac_range, results, hetsc_exp,
                     outname="subtype_scale_liability.eps",
                     xaxis_label='Fraction of subtype variance explained')
    else:
        num_cases = 30000
        num_conts = 30000
        num_snps = 100
        fixed_ps = np.array([0.2] * num_snps)
        # num_trials = 3
        num_trials = 20
        h_sq = 0.05
        h_sq_frac_range = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
        sub_betas_list, _ = generate_snps_splits(num_sub_phenos=2,
                                                 frac_shared_effects=0,
                                                 num_snps=num_snps,
                                                 ps=fixed_ps, h_sq=h_sq)
        # subtype 1 properties
        ps, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
        prev = 0.01
        thresh = norm.ppf(1 - prev, loc=0, scale=1)
        # expected score if all cases followed subtype 1
        hetsc_exp = heterogeneity_expected_corr(ncases=num_cases,
                                                nconts=num_cases,
                                                effects=betas,
                                                thresh=thresh,
                                                freqs=fixed_ps,
                                                heritability=h_sq,
                                                verbose=False)
        print(hetsc_exp)
        results = []
        for h_sq_frac in h_sq_frac_range:
            scores = []
            # generate subtype 2 properties
            # _, betas_sub2 = generate_snp_props(num_snps, fixed_ps, h_sq*h_sq_frac)
            sub_betas_list2, _ = generate_snps_splits(num_sub_phenos=2,
                                                      frac_shared_effects=0,
                                                      num_snps=num_snps,
                                                      ps=fixed_ps,
                                                      h_sq=h_sq * h_sq_frac)
            for nt in range(num_trials):
                print("h_sq_frac: %s, trial: %s" % (h_sq_frac, nt))
                cases_sub1, conts = generate_cohort(
                    num_cases=int(num_cases / 2), num_conts=num_conts,
                    freqs=fixed_ps,  # freqs=ps,
                    betas=sub_betas_list[0],  # betas=betas,
                    h_sq=h_sq, thresh=thresh)
                cases_sub2, _ = generate_cohort(
                    num_cases=int(num_cases / 2), num_conts=num_conts,
                    freqs=fixed_ps,  # freqs=ps,
                    betas=sub_betas_list2[1],  # betas=betas_sub2,
                    h_sq=h_sq * h_sq_frac, thresh=thresh)
                cases = np.concatenate((cases_sub1, cases_sub2), axis=0)
                score = heterogeneity(cases, conts)
                scores.append(score)
            results.append(scores)
        pickle.dump((h_sq_frac_range, results, hetsc_exp),
                    open(FILE_PATH, "wb"))
def run_logistic():
    FILE_PATH = "subtype_frac_logit.p"
    if os.path.exists(FILE_PATH):
        OR_val_frac_range, results, hetsc_exp = pickle.load(open(FILE_PATH, "rb"))
        plot_results(OR_val_frac_range, results, hetsc_exp,
                     outname="subtype_scale_logit.eps",
                     xaxis_label='Fraction of subtype odds ratio magnitude')
    else:
        num_cases = 30000
        num_conts = 30000
        num_snps = 100
        fixed_ps = np.array([0.2] * num_snps)
        # num_trials = 3
        num_trials = 20
        OR_val = 1.1
        OR_val_frac_range = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
        prev = 0.01
        # subtype 1 properties
        # ORs = np.array([OR_val]*num_snps)
        ORs = np.array([OR_val] * int(num_snps / 2) + [1.0] * int(num_snps / 2))
        # expected score if all cases followed subtype 1
        hetsc_exp = heterogeneity_expected_corr_logit(ncases=num_cases,
                                                      nconts=num_cases,
                                                      ORs=ORs,
                                                      freqs=fixed_ps,
                                                      prev=prev,
                                                      verbose=False)
        print(hetsc_exp)
        results = []
        for OR_val_frac in OR_val_frac_range:
            scores = []
            # generate subtype 2 properties
            ORs_frac_val = 1.0 + (OR_val - 1) * OR_val_frac
            # ORs_frac = np.array([ORs_frac_val]*num_snps)
            ORs2 = np.array([1.0] * int(num_snps / 2) +
                            [ORs_frac_val] * int(num_snps / 2))
            for nt in range(num_trials):
                print("OR_val_frac: %s, trial: %s" % (OR_val_frac, nt))
                cases_sub1, conts = generate_cohort_logistic(
                    num_cases=int(num_cases / 2), num_conts=num_conts,
                    freqs=fixed_ps, ORs=ORs, prev=prev)
                cases_sub2, _ = generate_cohort_logistic(
                    num_cases=int(num_cases / 2), num_conts=num_conts,
                    freqs=fixed_ps, ORs=ORs2, prev=prev)
                cases = np.concatenate((cases_sub1, cases_sub2), axis=0)
                score = heterogeneity(cases, conts)
                scores.append(score)
            results.append(scores)
        pickle.dump((OR_val_frac_range, results, hetsc_exp),
                    open(FILE_PATH, "wb"))
def run_heritability():
    FILE_PATH = "simulate_results_varexp.p"
    fig, ax = plt.subplots(1, 1)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    if os.path.exists(FILE_PATH):
        pckl = pickle.load(open(FILE_PATH, "rb"))
        # hetsc_means_hom = pckl["hetsc_means_hom"]
        # hetsc_stds_hom = pckl["hetsc_stds_hom"]
        # hetsc_means_het = pckl["hetsc_means_het"]
        # hetsc_stds_het = pckl["hetsc_stds_het"]
        # hetsc_means_cont = pckl["hetsc_means_cont"]
        # hetsc_stds_cont = pckl["hetsc_stds_cont"]
        hetsc_means_hom = [np.mean(x) for x in pckl["hetscs_homs"]]
        hetsc_stds_hom = [np.std(x) for x in pckl["hetscs_homs"]]
        hetsc_means_het = [np.mean(x) for x in pckl["hetscs_hets"]]
        hetsc_stds_het = [np.std(x) for x in pckl["hetscs_hets"]]
        hetsc_means_cont = [np.mean(x) for x in pckl["hetscs_conts"]]
        hetsc_stds_cont = [np.std(x) for x in pckl["hetscs_conts"]]
        hetsc_exps = pckl["hetsc_exps"]
        h_sqs = pckl["h_sqs"]
        plt.errorbar(h_sqs, hetsc_means_hom, yerr=hetsc_stds_hom,
                     color='red', capsize=5)
        plt.errorbar(h_sqs, hetsc_means_het, yerr=hetsc_stds_het,
                     color='green', capsize=5)
        plt.errorbar(h_sqs, hetsc_means_cont, yerr=hetsc_stds_cont,
                     color='black', capsize=5)
        plt.plot(h_sqs, hetsc_exps, color='blue')
        plt.xlabel("Variance explained by modeled SNPs", fontsize=14)
        plt.ylabel("Heterogeneity Score", fontsize=14)
        # label CLiP score
        xloc = 0.8 * (h_sqs[-1] - h_sqs[0])
        yloc1 = np.interp(xloc, h_sqs, hetsc_exps)
        yloc2 = np.interp(xloc, h_sqs, hetsc_means_het)
        ax.annotate('', xy=(xloc, yloc1), xycoords='data',
                    xytext=(xloc, yloc2), textcoords='data',
                    arrowprops=dict(arrowstyle="<->", color='gray'))
        ax.annotate('CLiP Score',
                    xy=(xloc * 1.02, yloc1 + (yloc2 - yloc1) * 0.75),
                    xycoords='data', textcoords='data',
                    fontsize=14.0, ha='left', color='gray')
        plt.xticks(fontsize=11)
        plt.yticks(fontsize=11)
        plt.savefig("simulate_basic_heritability.eps", format="eps", dpi=1000)
        plt.show()
    else:
        # hetsc_means_hom = []
        # hetsc_stds_hom = []
        # hetsc_means_het = []
        # hetsc_stds_het = []
        # hetsc_means_cont = []
        # hetsc_stds_cont = []
        hetscs_homs = []
        hetscs_hets = []
        hetscs_conts = []
        hetsc_exps = []
        h_sqs = [0.001, 0.01, 0.02, 0.03, 0.05, 0.075, 0.1]
        num_trials = 20
        for h_sq in h_sqs:
            num_cases = 30000
            num_conts = 30000
            num_snps = 100
            fixed_ps = np.array([0.2] * num_snps)
            hetscs_hom = []
            hetscs_het = []
            hetscs_cont = []
            # expected score, homogeneous cases
            ps, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
            prev = 0.01
            thresh = norm.ppf(1 - prev, loc=0, scale=1)
            hetsc_exp = heterogeneity_expected_corr(ncases=num_cases,
                                                    nconts=num_conts,
                                                    effects=betas,
                                                    thresh=thresh,
                                                    freqs=ps,
                                                    heritability=h_sq,
                                                    verbose=False)
            hetsc_exps.append(hetsc_exp)
            for i in range(num_trials):
                print("h_sq: %s, trial: %s" % (h_sq, i))
                # homogeneous cases, controls
                cases, conts = generate_homhet_cohort(num_cases, num_conts,
                                                      num_snps, ps, h_sq)
                score = heterogeneity(cases, conts)
                hetscs_hom.append(score)
                # heterogeneous cases, controls
                cases, conts = generate_homhet_cohort(num_cases, num_conts,
                                                      num_snps, ps, h_sq,
                                                      het=True)
                score = heterogeneity(cases, conts)
                hetscs_het.append(score)
                # controls, controls
                cases, conts = generate_controls(num_cases, num_conts,
                                                 num_snps, ps, h_sq)
                score = heterogeneity(cases, conts)
                hetscs_cont.append(score)
            # hetsc_means_hom.append(np.mean(hetscs_hom))
            # hetsc_stds_hom.append(np.std(hetscs_hom))
            # hetsc_means_het.append(np.mean(hetscs_het))
            # hetsc_stds_het.append(np.std(hetscs_het))
            # hetsc_means_cont.append(np.mean(hetscs_cont))
            # hetsc_stds_cont.append(np.std(hetscs_cont))
            hetscs_homs.append(hetscs_hom)
            hetscs_hets.append(hetscs_het)
            hetscs_conts.append(hetscs_cont)
        pickle.dump({"hetscs_homs": hetscs_homs,
                     "hetscs_hets": hetscs_hets,
                     "hetscs_conts": hetscs_conts,
                     "hetsc_exps": hetsc_exps,
                     "h_sqs": h_sqs},
                    open(FILE_PATH, "wb"))


"""
def run_sample_size():
    FILE_PATH = "simulate_results_samplesize.p"
    fig, ax = plt.subplots(1, 1)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    if os.path.exists(FILE_PATH):
        pckl = pickle.load(open(FILE_PATH, "rb"))
        # hetsc_means_hom = pckl["hetsc_means_hom"]
        # hetsc_stds_hom = pckl["hetsc_stds_hom"]
        # hetsc_means_het = pckl["hetsc_means_het"]
        # hetsc_stds_het = pckl["hetsc_stds_het"]
        # hetsc_means_cont = pckl["hetsc_means_cont"]
        # hetsc_stds_cont = pckl["hetsc_stds_cont"]
        hetsc_means_hom = [np.mean(x) for x in pckl["hetscs_homs"]]
        hetsc_stds_hom = [np.std(x) for x in pckl["hetscs_homs"]]
        hetsc_means_het = [np.mean(x) for x in pckl["hetscs_hets"]]
        hetsc_stds_het = [np.std(x) for x in pckl["hetscs_hets"]]
        hetsc_means_cont = [np.mean(x) for x in pckl["hetscs_conts"]]
        hetsc_stds_cont = [np.std(x) for x in pckl["hetscs_conts"]]
        hetsc_exps = pckl["hetsc_exps"]
        sample_size = pckl["sample_size"]
        plt.errorbar(sample_size, hetsc_means_hom, yerr=hetsc_stds_hom,
                     color='red', capsize=5)
        plt.errorbar(sample_size, hetsc_means_het, yerr=hetsc_stds_het,
                     color='green', capsize=5)
        plt.errorbar(sample_size, hetsc_means_cont, yerr=hetsc_stds_cont,
                     color='black', capsize=5)
        # print values
        for i in range(len(sample_size)):
            print("sample size: %s, score: %s, std dev: %s"
                  % (sample_size[i], hetsc_means_hom[i], hetsc_stds_hom[i]))
        plt.plot(sample_size, hetsc_exps, color='blue')
        plt.xlabel("Number of simulated cases and controls", fontsize=14)
        plt.ylabel("Heterogeneity Score", fontsize=14)
        # label CLiP score
        xloc = 0.8 * (sample_size[-1] - sample_size[0])
        yloc1 = np.interp(xloc, sample_size, hetsc_exps)
        yloc2 = np.interp(xloc, sample_size, hetsc_means_het)
        ax.annotate('', xy=(xloc, yloc1), xycoords='data',
                    xytext=(xloc, yloc2), textcoords='data',
                    arrowprops=dict(arrowstyle="<->", color='gray'))
        ax.annotate('CLiP Score',
                    xy=(xloc * 1.02, yloc1 + (yloc2 - yloc1) * 0.75),
                    xycoords='data', textcoords='data',
                    fontsize=14.0, ha='left', color='gray')
        plt.xticks(fontsize=11)
        plt.yticks(fontsize=11)
        plt.savefig("simulate_basic_num_inds.eps", format="eps", dpi=1000)
        plt.show()
    else:
        # hetsc_means_hom = []
        # hetsc_stds_hom = []
        # hetsc_means_het = []
        # hetsc_stds_het = []
        # hetsc_means_cont = []
        # hetsc_stds_cont = []
        hetscs_homs = []
        hetscs_hets = []
        hetscs_conts = []
        hetsc_exps = []
        sample_sizes = [1000, 5000, 10000, 20000, 30000, 50000]
        # sample_sizes = [5000, 10000]
        # sample_sizes = [1000, 5000, 10000]
        num_trials = 20
        for sample_size in sample_sizes:
            num_cases = sample_size
            num_conts = sample_size
            num_snps = 100
            h_sq = 0.034
            fixed_ps = np.array([0.2] * num_snps)
            hetscs_hom = []
            hetscs_het = []
            hetscs_cont = []
            # expected score, homogeneous cases
            ps, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
            prev = 0.01
            thresh = norm.ppf(1 - prev, loc=0, scale=1)
            hetsc_exp = heterogeneity_expected_corr(ncases=num_cases,
                                                    nconts=num_conts,
                                                    effects=betas,
                                                    thresh=thresh,
                                                    freqs=ps,
                                                    heritability=h_sq,
                                                    verbose=False)
            hetsc_exps.append(hetsc_exp)
            for i in range(num_trials):
                print("sample_size: %s, trial: %s" % (sample_size, i))
                # homogeneous cases, controls
                cases, conts = generate_homhet_cohort(num_cases, num_conts,
                                                      num_snps, fixed_ps, h_sq)
                score = heterogeneity(cases, conts)
                hetscs_hom.append(score)
                # heterogeneous cases, controls
                cases, conts = generate_homhet_cohort(num_cases, num_conts,
                                                      num_snps, fixed_ps, h_sq,
                                                      het=True)
                score = heterogeneity(cases, conts)
                hetscs_het.append(score)
                # controls, controls
                cases, conts = generate_controls(num_cases, num_conts,
                                                 num_snps, fixed_ps, h_sq)
                score = heterogeneity(cases, conts)
                hetscs_cont.append(score)
            # hetsc_means_hom.append(np.mean(hetscs_hom))
            # hetsc_stds_hom.append(np.std(hetscs_hom))
            # hetsc_means_het.append(np.mean(hetscs_het))
            # hetsc_stds_het.append(np.std(hetscs_het))
            # hetsc_means_cont.append(np.mean(hetscs_cont))
            # hetsc_stds_cont.append(np.std(hetscs_cont))
            hetscs_homs.append(hetscs_hom)
            hetscs_hets.append(hetscs_het)
            hetscs_conts.append(hetscs_cont)
        pickle.dump({"hetscs_homs": hetscs_homs,
                     "hetscs_hets": hetscs_hets,
                     "hetscs_conts": hetscs_conts,
                     "hetsc_exps": hetsc_exps,
                     "sample_size": sample_sizes},
                    open(FILE_PATH, "wb"))
"""
# frac_shared_effects_list = [0, .25, .5, .75, 1]
frac_shared_effects_list = [0, .5, 1]
split_results = {}
for i in num_subpheno_list:
    split_results[i] = {}
    for j in frac_shared_effects_list:
        split_results[i][j] = []
for nsubph in num_subpheno_list:
    for fsheff in frac_shared_effects_list:
        for nt in range(num_trials):
            print(nsubph, fsheff, nt)
            h_sq = 0.05
            sub_betas_list, ps = generate_snps_splits(
                num_sub_phenos=nsubph, frac_shared_effects=fsheff,
                num_snps=num_snps, ps=ps, h_sq=h_sq)
            cases, conts = generate_cohort_splits(
                num_sub_phenos=nsubph, sub_betas_list=sub_betas_list,
                ps=ps, num_cases=num_cases, num_conts=num_cases,
                h_sq=h_sq)
            score = heterogeneity(cases, conts)
            split_results[nsubph][fsheff].append(score)
pickle.dump(split_results, open("split_results.p", "wb"))
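# A minimal follow-up sketch, assuming split_results as built above: collapse each
# (num_sub_phenos, frac_shared_effects) cell into a mean and standard deviation,
# e.g. for tabulating or plotting the scan.
split_summary = {
    nsubph: {fsheff: (np.mean(scores), np.std(scores))
             for fsheff, scores in by_frac.items()}
    for nsubph, by_frac in split_results.items()}
print(split_summary)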
# pickle file for saving correlation tests
corr_matrices = []
# for display table and FDR calculation
table_vals = []
# for plotting score vs num_cases
plot_vals = []
score_std = 1
for fl in files:
    cht_name, FILE_PATH, SAMPLE_PATH = fl
    # try:
    cases, conts, cases_abr, conts_abr, snpsdel, ors, frqs = extract(
        SNP_PATH, FILE_PATH, SAMPLE_PATH)
    hetsc = heterogeneity(cases_abr, conts_abr)
    # convert odds ratios to liability threshold ratios
    betas = convertORs(ors, prev)
    # variance explained: sum over SNPs of 2 p (1 - p) beta^2
    h_sq = np.sum(
        np.multiply(np.square(betas), 2 * np.multiply(frqs, 1 - frqs)))
    # print("h_sq:", h_sq)
    expected_score = heterogeneity_expected_corr(ncases=cases.shape[0],
                                                 nconts=conts.shape[0],
                                                 effects=betas,
                                                 thresh=thresh,
                                                 freqs=frqs,
                                                 heritability=h_sq,
                                                 verbose=False)
    if snpsdel == 0:
        # store for combined set
        super_betas = betas
def run_heritability():
    # FILE_PATH = "population_stratif_out.p"
    # pop_allele_diffs = [0.0, 0.05, 0.1, 0.15, 0.2]
    fsts = np.array([0.001, 0.005, 0.01, 0.05, 0.1])
    # With a mean frequency of 0.5, a symmetric per-population shift of d gives an
    # across-population frequency variance of d^2, so Fst = d^2 / (0.5 * 0.5) = 4 d^2,
    # i.e. d = 0.5 * sqrt(Fst), which is what the scaling below computes.
    pop_allele_diffs = 0.5 * np.sqrt(fsts)
    # for allele_diff in pop_allele_diffs:
    #     FILE_PATH = "population_stratif_out_%s.p" % (allele_diff)
    for fst, allele_diff in zip(fsts, pop_allele_diffs):
        FILE_PATH = "population_stratif_out_%s.p" % (fst)
        if not os.path.exists(FILE_PATH):
            num_snps = 100
            fixed_ps_val = 0.5
            fixed_ps = np.array([fixed_ps_val] * num_snps)
            # generate stratified populations
            ps_stratif = [[fixed_ps_val - allele_diff] * int(num_snps / 2) +
                          [fixed_ps_val + allele_diff] * int(num_snps / 2),
                          [fixed_ps_val + allele_diff] * int(num_snps / 2) +
                          [fixed_ps_val - allele_diff] * int(num_snps / 2)]
            hetscs_homs = []
            hetscs_hets = []
            hetscs_conts = []
            hetsc_exps = []
            # allele difference between homogeneous cases and controls
            mean_allele_diffs = []
            h_sqs = [0.001, 0.01, 0.02, 0.03, 0.05, 0.075, 0.1]
            # h_sqs = [0.001, 0.025, 0.05, 0.075, 0.1]
            num_trials = 30
            for h_sq in h_sqs:
                num_cases = 30000
                num_conts = 30000
                hetscs_hom = []
                hetscs_het = []
                hetscs_cont = []
                mean_allele_diff = []
                # expected score, homogeneous cases
                ps, betas = generate_snp_props(num_snps, fixed_ps, h_sq)
                prev = 0.01
                thresh = norm.ppf(1 - prev, loc=0, scale=1)
                hetsc_exp = heterogeneity_expected_corr(ncases=num_cases,
                                                        nconts=num_conts,
                                                        effects=betas,
                                                        thresh=thresh,
                                                        freqs=ps,
                                                        heritability=h_sq,
                                                        verbose=False)
                hetsc_exps.append(hetsc_exp)
                for i in range(num_trials):
                    print("allele_diff: %s, h_sq: %s, trial: %s"
                          % (allele_diff, h_sq, i))
                    # homogeneous cases, controls
                    cases, conts = generate_homhet_cohort(num_cases, num_conts,
                                                          num_snps, ps,
                                                          ps_stratif, h_sq)
                    score = heterogeneity(cases, conts)
                    hetscs_hom.append(score)
                    # heterogeneous cases, controls
                    cases, conts = generate_homhet_cohort(num_cases, num_conts,
                                                          num_snps, ps,
                                                          ps_stratif, h_sq,
                                                          het=True)
                    score = heterogeneity(cases, conts)
                    hetscs_het.append(score)
                    mean_allele_diff.append(np.mean(cases) - np.mean(conts))
                    # controls, controls
                    # cases, conts = generate_controls(num_cases, num_conts, num_snps, ps, h_sq)
                    cases, conts = generate_controls(num_cases, num_conts,
                                                     num_snps, ps_stratif)
                    score = heterogeneity(cases, conts)
                    hetscs_cont.append(score)
                hetscs_homs.append(hetscs_hom)
                hetscs_hets.append(hetscs_het)
                hetscs_conts.append(hetscs_cont)
                mean_allele_diffs.append(mean_allele_diff)
            pickle.dump({"hetscs_homs": hetscs_homs,
                         "hetscs_hets": hetscs_hets,
                         "hetscs_conts": hetscs_conts,
                         "hetsc_exps": hetsc_exps,
                         "mean_allele_diffs": mean_allele_diffs,
                         "h_sqs": h_sqs},
                        open(FILE_PATH, "wb"))
    plot_results(fsts)
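# A minimal invocation sketch, assuming this script is run directly: the Fst scan
# above writes one pickle per Fst value and then plots them via plot_results.
if __name__ == "__main__":
    run_heritability()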