def simulate_mult_n(file_path,
                    runs=1,
                    num_snps=100,
                    h=0.3,
                    num_inds_vals=(1000, 5000, 10000, 20000, 50000, 100000),
                    bzs=(-0.6, -0.3, -0.1, 0, 0.1, 0.3, 0.6, 0.9, 1.2, 1.5)):
    """Score simulated cohorts at several sample sizes and pickle the results.

    Two SNP models are drawn once with ``generate_pss_model_simple``. For
    every sample size in ``num_inds_vals`` and every replicate, one
    "independent" and one "hetero" population are generated, their
    phenotypes are mean-centered, and the second half of the hetero
    phenotypes is overwritten with normal noise matching the cohort's mean
    and standard deviation. Both cohorts are then scored with every
    continuous weight function and every binary threshold in ``bzs``.

    Args:
        file_path: Destination of the pickled result dictionary.
        runs: Number of replicate populations per sample size.
        num_snps: Number of SNPs passed to ``generate_pss_model_simple``.
        h: Forwarded to the model/population generators and to
            ``get_exp_heterogeneity`` (presumably heritability -- confirm
            against those helpers).
        num_inds_vals: Sample sizes to simulate.
        bzs: Thresholds forwarded as ``z`` to ``run_heterogeneity_on_pop``.

    The pickled dictionary has the keys ``hom_thresh_runs``,
    ``het_thresh_runs`` (threshold -> sample size -> list of scores),
    ``hom_continuous_runs``, ``het_continuous_runs`` (weight-function name
    -> sample size -> list of scores) and ``hom_contin_exp_runs``
    (weight-function name -> sample size -> value returned by
    ``get_exp_heterogeneity``).
    """
    independent_snps = generate_pss_model_simple(num_snps, h)
    hetero_snps = generate_pss_model_simple(num_snps, h)
    weight_funcs = {
        "log1": log1,
        "log1p5": log1p5,
        "log3": log3,
        "sigmoid": sigmoid,
        "linear": linear,
        "step": step,
        "polynom2": polynom2,
        "polynom4": polynom4,
        "polynom6": polynom6
    }

    # One result slot per (threshold | weight function, sample size).
    hom_thresh_runs = {bz: {ni: [] for ni in num_inds_vals} for bz in bzs}
    het_thresh_runs = {bz: {ni: [] for ni in num_inds_vals} for bz in bzs}
    hom_continuous_runs = {
        ws: {ni: [] for ni in num_inds_vals} for ws in weight_funcs
    }
    het_continuous_runs = {
        ws: {ni: [] for ni in num_inds_vals} for ws in weight_funcs
    }
    hom_contin_exp_runs = {
        ws: {ni: None for ni in num_inds_vals} for ws in weight_funcs
    }
    # The tables are filled in place, so this payload always reflects the
    # current state of the simulation.
    results = {
        "hom_thresh_runs": hom_thresh_runs,
        "het_thresh_runs": het_thresh_runs,
        "hom_continuous_runs": hom_continuous_runs,
        "het_continuous_runs": het_continuous_runs,
        "hom_contin_exp_runs": hom_contin_exp_runs
    }

    for num_inds in num_inds_vals:
        for ws, weight_func in weight_funcs.items():
            print("running exp hom score: " + ws)
            hom_contin_exp_runs[ws][num_inds] = get_exp_heterogeneity(
                num_inds, independent_snps, h, weight_func)

        for i in range(runs):
            print(str(num_inds) + "-" + str(i))
            independent_pop = generate_population(independent_snps,
                                                  num_inds=num_inds,
                                                  h=h)
            hetero_pop = generate_population(hetero_snps,
                                             num_inds=num_inds,
                                             h=h)

            # mean-center phenos
            independent_pop = (independent_pop[0],
                               np.array(independent_pop[1]) -
                               np.mean(independent_pop[1]))
            hetero_pop = (hetero_pop[0],
                          np.array(hetero_pop[1]) - np.mean(hetero_pop[1]))

            # Replace the second half of the hetero phenotypes with noise
            # that has the same mean and spread as the centered cohort.
            het_mean = np.mean(hetero_pop[1])
            het_std = np.std(hetero_pop[1])
            hetero_pop[1][int(num_inds / 2):] = np.random.normal(
                loc=het_mean, scale=het_std, size=int(num_inds / 2))

            for ws, weight_func in weight_funcs.items():
                print("num inds:", num_inds, "run:", i, "weight func:", ws)
                hom_continuous_runs[ws][num_inds].append(
                    run_cont_heterogeneity_on_pop(independent_pop,
                                                  independent_snps,
                                                  weight_func=weight_func))
                het_continuous_runs[ws][num_inds].append(
                    run_cont_heterogeneity_on_pop(hetero_pop,
                                                  hetero_snps,
                                                  weight_func=weight_func))

            for bz in bzs:
                print("num inds:", num_inds, "run:", i, "binary thresh:", bz)
                hom_thresh_runs[bz][num_inds].append(
                    run_heterogeneity_on_pop(independent_pop,
                                             independent_snps,
                                             z=bz))
                het_thresh_runs[bz][num_inds].append(
                    run_heterogeneity_on_pop(hetero_pop, hetero_snps, z=bz))

        # Rewrite the output after every sample size so that partial
        # results survive an interrupted run; the last write holds the
        # complete tables.
        with open(file_path, "wb") as f:
            pickle.dump(results, f)
def run(num_inds=50000,
        num_snps=10,
        h=0.1,
        deg=8,
        numtrain=5,
        symmetric=False,
        verbose=False):
    """Hill-climb polynomial weight coefficients over simulated cohorts.

    ``numtrain`` pairs of (independent, hetero) populations are simulated
    once up front. A random coefficient vector accepted by ``valid_poly``
    is then perturbed one coefficient at a time for 50 iterations; a
    candidate replaces the current vector when it increases the objective,
    which is the mean hetero score minus the mean independent score across
    all training pairs.

    Args:
        num_inds: Individuals per simulated population.
        num_snps: Number of SNPs passed to ``generate_pss_model_simple``.
        h: Forwarded to the model/population generators (presumably
            heritability -- confirm against those helpers).
        deg: Polynomial degree; ``deg + 1`` coefficients are learned.
        numtrain: Number of training population pairs.
        symmetric: If true, every second coefficient (indices 1, 3, ...)
            is held at zero and the plotting range is [-0.5, 0.5).
        verbose: If true, plot the polynomial initially and after every
            accepted update.

    Returns:
        Tuple ``(HetScorediff, coef_wts)``: the best objective value found
        and the coefficient vector that achieved it.
    """
    # Sample data
    training_data = []
    for _ in range(numtrain):
        independent_snps = generate_pss_model_simple(num_snps, h)
        hetero_snps = generate_pss_model_simple(num_snps, h)
        independent_pop = generate_population(independent_snps,
                                              num_inds=num_inds,
                                              h=h)
        hetero_pop = generate_population(hetero_snps,
                                         num_inds=num_inds,
                                         h=h)

        # mean-center phenos
        independent_pop = (independent_pop[0],
                           np.array(independent_pop[1]) -
                           np.mean(independent_pop[1]))
        hetero_pop = (hetero_pop[0],
                      np.array(hetero_pop[1]) - np.mean(hetero_pop[1]))

        # Replace the second half of the hetero phenotypes with noise that
        # has the same mean and spread as the centered cohort.
        het_mean = np.mean(hetero_pop[1])
        het_std = np.std(hetero_pop[1])
        hetero_pop[1][int(num_inds / 2):] = np.random.normal(
            loc=het_mean, scale=het_std, size=int(num_inds / 2))

        training_data.append({
            "independent_snps": independent_snps,
            "independent_pop": independent_pop,
            "hetero_snps": hetero_snps,
            "hetero_pop": hetero_pop
        })

    def _score_diff(coefs):
        """Return mean(hetero score) - mean(independent score) for coefs."""
        hom_scores = []
        het_scores = []
        for sample in training_data:
            hom_scores.append(
                run_cont_heterogeneity_on_pop(sample["independent_pop"],
                                              sample["independent_snps"],
                                              weight_func=None,
                                              coef_wts=coefs,
                                              symmetric=symmetric))
            het_scores.append(
                run_cont_heterogeneity_on_pop(sample["hetero_pop"],
                                              sample["hetero_snps"],
                                              weight_func=None,
                                              coef_wts=coefs,
                                              symmetric=symmetric))
        return np.mean(het_scores) - np.mean(hom_scores)

    def _pick_index():
        """Draw a coefficient index, redrawing odd ones when symmetric."""
        idx = np.random.randint(0, deg + 1)
        while symmetric and idx % 2 == 1:
            idx = np.random.randint(0, deg + 1)
        return idx

    def _propose(scale):
        """Return a copy of coef_wts with one coefficient perturbed."""
        idx = _pick_index()
        cand = np.copy(coef_wts)
        cand[idx] += np.random.normal(loc=0, scale=scale)
        return cand

    def _plot(coefs):
        """Scatter the polynomial given by coefs over pc_range."""
        polynom = np.poly1d(coefs)
        plt.scatter(pc_range, [polynom(x) for x in pc_range],
                    color=str(1.0 - plot_count / plot_cnt_tot))

    plt.ion()
    count = 0
    plot_count = 1
    plot_cnt_tot = 10

    ## Coefficients are learned directly rather than roots
    ## -- This allows coefficients for odd degrees to be set to 0
    ## -- Not all polynomials have maximum number of roots

    # initialization of weight distribution here: redraw until valid
    coef_wts = np.random.normal(loc=0, scale=500, size=deg + 1)
    if symmetric:
        coef_wts[1::2] = 0
    while not valid_poly(coef_wts,
                         symmetric=symmetric,
                         minval=-0.5,
                         maxval=0.5):
        coef_wts = np.random.normal(loc=0, scale=500, size=deg + 1)
        if symmetric:
            coef_wts[1::2] = 0

    HetScorediff = _score_diff(coef_wts)

    if symmetric:
        pc_range = np.linspace(-0.5, 0.5, 500, endpoint=False)
    else:
        pc_range = np.linspace(0, 1, 500, endpoint=False)

    if verbose:
        # coef_wts holds coefficients, not roots, so build the polynomial
        # the same way as for every later plot.
        _plot(coef_wts)
        plt.show(block=False)
        plt.pause(0.05)

    while count < 50:
        count += 1

        # NOTE(review): the first proposal uses scale=500 while retries
        # after an invalid draw use scale=10 -- kept as found; confirm
        # whether the difference is intended.
        coef_wts_cand = _propose(scale=500)
        # check that this is valid
        while not valid_poly(coef_wts_cand,
                             symmetric=symmetric,
                             minval=-0.5,
                             maxval=0.5):
            print(".")
            coef_wts_cand = _propose(scale=10)

        # Score the candidate once, averaged over all training pairs.
        HetScorediff_cand = _score_diff(coef_wts_cand)

        print("count: ", count)
        print("HetScorediff:", HetScorediff)
        print("HetScorediff_cand: ", HetScorediff_cand)
        print("coef_wts:", coef_wts)
        print("-" * 20)

        if HetScorediff_cand > HetScorediff:
            HetScorediff = HetScorediff_cand
            coef_wts = coef_wts_cand

            # plot updated value here
            if verbose:
                plt.figure()
                _plot(coef_wts)
                plt.xlabel("PRS percentile")
                plt.ylabel("individual weight phi")
                plt.show(block=False)
                plt.pause(0.05)
                plot_count = min(plot_count + 1, plot_cnt_tot)

    return HetScorediff, coef_wts
def learn_weight_func(num_snps=10, h=0.1, num_inds=5000):
    """Interactively hill-climb an individual weight function on one cohort pair.

    One independent and one hetero population are simulated. The weights
    are then perturbed at random, and a perturbation is kept whenever it
    increases ``HSC_het - HSC_hom`` as computed by
    ``run_cont_heterogeneity_on_pop``. Two parameterizations exist,
    selected by the local ``learn_coefs`` flag (currently hard-coded to
    ``False``):

    * block weights (default): one non-negative weight per bin, 40 bins,
      starting from a 0/1 step. Every 500 iterations the current weights
      and a degree-``deg`` polynomial fit to them are printed and plotted.
    * polynomial coefficients: ``deg + 1`` coefficients validated with
      ``valid_poly``; each accepted update is plotted.

    The search loop has no exit condition: the function runs until it is
    interrupted and never returns.

    Args:
        num_snps: Number of SNPs passed to ``generate_pss_model_simple``.
        h: Forwarded to the model/population generators (presumably
            heritability -- confirm against those helpers).
        num_inds: Individuals per simulated population.
    """
    independent_snps = generate_pss_model_simple(num_snps, h)
    hetero_snps = generate_pss_model_simple(num_snps, h)
    independent_pop = generate_population(independent_snps,
                                          num_inds=num_inds,
                                          h=h)
    hetero_pop = generate_population(hetero_snps, num_inds=num_inds, h=h)

    # mean-center phenos
    independent_pop = (independent_pop[0],
                       np.array(independent_pop[1]) -
                       np.mean(independent_pop[1]))
    hetero_pop = (hetero_pop[0],
                  np.array(hetero_pop[1]) - np.mean(hetero_pop[1]))

    # Replace the second half of the hetero phenotypes with noise that has
    # the same mean and spread as the centered cohort.
    het_mean = np.mean(hetero_pop[1])
    het_std = np.std(hetero_pop[1])
    hetero_pop[1][int(num_inds / 2):] = np.random.normal(
        loc=het_mean, scale=het_std, size=int(num_inds / 2))

    ###########################
    ### Test to find best w ###
    ###########################
    learn_coefs = False
    symmetric = False
    if not learn_coefs:
        # don't apply symmetry to the bin weights method
        symmetric = False

    plt.ion()
    count = 0
    plot_count = 1
    plot_cnt_tot = 50 if not learn_coefs else 10
    deg = 4

    ## when learn_coef=True, coefficients are learned directly rather than roots
    ## -- This allows coefficients for odd degrees to be set to 0
    ## -- Not all polynomials have maximum number of roots

    # initialization of weight distribution here
    if learn_coefs:
        coef_wts = np.random.normal(loc=0, scale=10, size=deg + 1)
        if symmetric:
            coef_wts[1::2] = 0
        while not valid_poly(coef_wts,
                             symmetric=symmetric,
                             minval=-0.5,
                             maxval=0.5):
            coef_wts = np.random.normal(loc=0, scale=1, size=deg + 1)
            if symmetric:
                coef_wts[1::2] = 0
        HSC_hom = run_cont_heterogeneity_on_pop(independent_pop,
                                                independent_snps,
                                                weight_func=None,
                                                coef_wts=coef_wts,
                                                symmetric=symmetric)
        HSC_het = run_cont_heterogeneity_on_pop(hetero_pop,
                                                hetero_snps,
                                                weight_func=None,
                                                coef_wts=coef_wts,
                                                symmetric=symmetric)
    else:
        # block coef: start from a step (lower half 0, upper half 1)
        numbins = 40
        block_wts = np.array([0.0] * int(numbins / 2) +
                             [1.0] * int(numbins / 2))
        # block_wts = np.random.normal(loc=1.0, scale=0.01, size=numbins)
        HSC_hom = run_cont_heterogeneity_on_pop(independent_pop,
                                                independent_snps,
                                                weight_func=None,
                                                block_wts=block_wts,
                                                numbins=numbins)
        HSC_het = run_cont_heterogeneity_on_pop(hetero_pop,
                                                hetero_snps,
                                                weight_func=None,
                                                block_wts=block_wts,
                                                numbins=numbins)

    # HSC_diff = HSC_hom
    HSC_diff = HSC_het - HSC_hom

    if symmetric:
        # pc_range = np.linspace(np.amin(independent_pop[1]), np.amax(independent_pop[1]), 500)
        pc_range = np.linspace(-0.5, 0.5, 500, endpoint=False)
    else:
        pc_range = np.linspace(0, 1, 500, endpoint=False)

    if learn_coefs:
        # coef_wts holds coefficients, not roots, so build the polynomial
        # the same way as for every later plot.
        polynom = np.poly1d(coef_wts)
        plt.scatter(pc_range, [polynom(x) for x in pc_range],
                    color=str(1.0 - plot_count / plot_cnt_tot))
    else:
        # Map every plotted position to the weight of the bin it falls in.
        plt.scatter(pc_range,
                    block_wts[[
                        int(x) for x in np.floor((pc_range) * numbins)
                    ]],
                    color=str(1.0 - plot_count / plot_cnt_tot))
    plt.show(block=False)
    plt.pause(0.05)
    if symmetric:
        plt.figure()

    while True:
        count += 1
        if learn_coefs:
            inc = np.random.randint(0, deg + 1)
            if symmetric and inc % 2 == 1:
                while inc % 2 == 1:
                    inc = np.random.randint(0, deg + 1)
            coef_wts_cand = np.copy(coef_wts)
            coef_wts_cand[inc] += np.random.normal(loc=0, scale=10)

            # check that this is valid
            while not valid_poly(coef_wts_cand,
                                 symmetric=symmetric,
                                 minval=-0.5,
                                 maxval=0.5):
                print(".")
                inc = np.random.randint(0, deg + 1)
                if symmetric and inc % 2 == 1:
                    while inc % 2 == 1:
                        inc = np.random.randint(0, deg + 1)
                coef_wts_cand = np.copy(coef_wts)
                coef_wts_cand[inc] += np.random.normal(loc=0, scale=10)

            HSC_hom_cand = run_cont_heterogeneity_on_pop(
                independent_pop,
                independent_snps,
                weight_func=None,
                coef_wts=coef_wts_cand,
                symmetric=symmetric)
            HSC_het_cand = run_cont_heterogeneity_on_pop(
                hetero_pop,
                hetero_snps,
                weight_func=None,
                coef_wts=coef_wts_cand,
                symmetric=symmetric)
        else:
            inc = np.random.randint(0, numbins)
            block_wts_cand = np.copy(block_wts)
            candidate = block_wts_cand[inc] + np.random.normal(loc=0,
                                                               scale=0.01)
            # Bin weights must stay non-negative; discard this proposal.
            if candidate < 0:
                continue
            block_wts_cand[inc] = candidate
            HSC_hom_cand = run_cont_heterogeneity_on_pop(
                independent_pop,
                independent_snps,
                weight_func=None,
                block_wts=block_wts_cand,
                numbins=numbins)
            HSC_het_cand = run_cont_heterogeneity_on_pop(
                hetero_pop,
                hetero_snps,
                weight_func=None,
                block_wts=block_wts_cand,
                numbins=numbins)

        # HSC_diff_cand = HSC_hom_cand
        HSC_diff_cand = HSC_het_cand - HSC_hom_cand

        if learn_coefs:
            print("count: ", count)
            print("HSC_diff:", HSC_diff)
            print("HSC_diff_cand: ", HSC_diff_cand)
            print("coef_wts:", coef_wts)
            print("-" * 20)

        # Greedy acceptance: keep the candidate only if it scores higher.
        if HSC_diff_cand > HSC_diff:
            HSC_diff = HSC_diff_cand
            if learn_coefs:
                coef_wts = coef_wts_cand

                # plot updated value here
                polynom = np.poly1d(coef_wts, r=False)
                plt.scatter(pc_range, [polynom(x) for x in pc_range],
                            color=str(1.0 - plot_count / plot_cnt_tot))
                plt.xlabel("PRS percentile")
                plt.ylabel("individual weight phi")
                plt.show(block=False)
                plt.pause(0.05)
                plot_count = min(plot_count + 1, plot_cnt_tot)
            else:
                block_wts = block_wts_cand

        # Periodic progress report for the block-weight search.
        if not learn_coefs and count > 500:
            count = 0
            if plot_count < plot_cnt_tot:
                plot_count += 1
            print("-" * 20)
            print("HSC diff:", HSC_diff)
            coefs = np.polyfit(np.linspace(0, 1, numbins),
                               block_wts,
                               deg=deg)
            print("poly coefs:", coefs)

            # score from the poly coefs directly
            try:
                poly_hom = run_cont_heterogeneity_on_pop(independent_pop,
                                                         independent_snps,
                                                         weight_func=None,
                                                         coef_wts=coefs)
                poly_het = run_cont_heterogeneity_on_pop(hetero_pop,
                                                         hetero_snps,
                                                         weight_func=None,
                                                         coef_wts=coefs)
            except Exception:
                # Best-effort diagnostic only. Not a bare ``except`` so
                # that Ctrl-C, the only way to stop this loop, still
                # propagates.
                print("poly coef HSC: invalid")
            else:
                print("poly coef HSC:", poly_het - poly_hom)

            plt.scatter(pc_range,
                        block_wts[[
                            int(x) for x in np.floor((pc_range) * numbins)
                        ]],
                        color=str(1.0 - plot_count / plot_cnt_tot))
            polynom = np.poly1d(coefs)
            plt.plot(np.linspace(0, 1, numbins),
                     polynom(np.linspace(0, 1, numbins)),
                     color=str(1.0 - plot_count / plot_cnt_tot))
            plt.xlabel("PRS percentile")
            plt.ylabel("individual weight phi")
            plt.show(block=False)
            plt.pause(0.05)
def test_funcs(filename,
               numtrain=20,
               num_snps=10,
               h=0.1,
               num_inds=100000,
               symmetric=False,
               verbose=True):
    """Evaluate every candidate polynomial listed in a file.

    ``numtrain`` pairs of (independent, hetero) populations are simulated
    once. Each non-blank line of ``filename`` must look like
    ``<label>:<c0>,<c1>,...``; the comma-separated floats after the first
    colon are used as ``coef_wts``. For every candidate the mean of
    (hetero score - independent score) over the training pairs and the
    standard deviation of the independent scores are printed and appended
    to ``evaluated_candidates.txt`` as
    ``<index>|<mean diff>|<std>|<filename>``.

    Args:
        filename: Text file with one candidate per line.
        numtrain: Number of training population pairs.
        num_snps: Number of SNPs passed to ``generate_pss_model_simple``.
        h: Forwarded to the model/population generators (presumably
            heritability -- confirm against those helpers).
        num_inds: Individuals per simulated population.
        symmetric: Forwarded to ``run_cont_heterogeneity_on_pop``.
        verbose: If true, show an error-bar plot of all candidates.
    """
    training_data = []
    for _ in range(numtrain):
        independent_snps = generate_pss_model_simple(num_snps, h)
        hetero_snps = generate_pss_model_simple(num_snps, h)
        independent_pop = generate_population(independent_snps,
                                              num_inds=num_inds,
                                              h=h)
        hetero_pop = generate_population(hetero_snps,
                                         num_inds=num_inds,
                                         h=h)

        # mean-center phenos
        independent_pop = (independent_pop[0],
                           np.array(independent_pop[1]) -
                           np.mean(independent_pop[1]))
        hetero_pop = (hetero_pop[0],
                      np.array(hetero_pop[1]) - np.mean(hetero_pop[1]))

        # Replace the second half of the hetero phenotypes with noise that
        # has the same mean and spread as the centered cohort.
        het_mean = np.mean(hetero_pop[1])
        het_std = np.std(hetero_pop[1])
        hetero_pop[1][int(num_inds / 2):] = np.random.normal(
            loc=het_mean, scale=het_std, size=int(num_inds / 2))

        training_data.append({
            "independent_snps": independent_snps,
            "independent_pop": independent_pop,
            "hetero_snps": hetero_snps,
            "hetero_pop": hetero_pop
        })

    # test all polynoms in file
    coef_cands = []
    HetScorecands = []
    HetScorecand_stds = []
    count = 1
    with open(filename) as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            coef_wts = [float(x) for x in line.split(":")[1].split(",")]

            scr_diffs = []
            scr_homs = []
            for sample in training_data:
                HetScorehom = run_cont_heterogeneity_on_pop(
                    sample["independent_pop"],
                    sample["independent_snps"],
                    weight_func=None,
                    coef_wts=coef_wts,
                    symmetric=symmetric)
                HetScorehet = run_cont_heterogeneity_on_pop(
                    sample["hetero_pop"],
                    sample["hetero_snps"],
                    weight_func=None,
                    coef_wts=coef_wts,
                    symmetric=symmetric)
                scr_diffs.append(HetScorehet - HetScorehom)
                scr_homs.append(HetScorehom)

            coef_cands.append(coef_wts)
            HetScorecands.append(np.mean(scr_diffs))
            # NOTE(review): the spread is taken over the independent
            # (hom) scores, not over the differences that are plotted --
            # confirm this is the intended error bar.
            HetScorecand_stds.append(np.std(scr_homs))
            print(count, HetScorecands[-1], HetScorecand_stds[-1])

            # Append per candidate so finished evaluations are on disk
            # even if a later one fails. A separate name keeps the input
            # handle ``f`` from being rebound while it is iterated.
            with open("evaluated_candidates.txt", "a") as out:
                out.write("%s|%s|%s|%s\n" %
                          (count, HetScorecands[-1], HetScorecand_stds[-1],
                           filename))
            count += 1

    if verbose:
        plt.errorbar(range(len(coef_cands)),
                     HetScorecands,
                     yerr=HetScorecand_stds)
        plt.show()