def _test_():
    """
    Compare permutation thresholds for rare and common simulated SNPs.

    Simulates 1000 singleton, 1000 doubleton and 1000 common SNPs for 500
    individuals, stacks them (common first), centres and scales every row of
    the stacked matrix, and simulates 30 traits from it.  For each trait the
    pseudo-heritability is printed, a phenotype histogram and a QQ-plot are
    written under ``$HOME/tmp/test_<i>...``, and ``lm.emmax_perm_test`` is run
    separately on each of the three (unscaled) SNP sets.  Finally the mean
    and standard deviation of the collected ``'threshold_05'`` values are
    printed for singletons, doubletons and common SNPs, in that order.
    """
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print snps
    # Standardise along the second axis (transpose, normalise columns,
    # transpose back).
    snps = snps.T
    snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
    snps = snps.T
    print snps, snps.shape

    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits_w_snps_to_hdf5(
        snps, hdf5_file_prefix=file_prefix, num_traits=30, p=0.1)

    # The kinship only depends on the genotypes, so compute it once instead
    # of once per simulated trait.
    K = kinship.scale_k(kinship.calc_ibd_kinship(snps))

    # Value printed next to every permutation threshold for comparison.
    bonferroni_thres = 1.0 / (20 * 1000.0)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    perm_tests = ((singleton_snps, singletons_thres),
                  (doubleton_snps, doubletons_thres),
                  (common_snps, common_thres))

    for i, y in enumerate(phen_list['phenotypes']):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)

        # Close the histogram figure explicitly; otherwise one figure per
        # trait stays alive until the interpreter exits.
        fig = plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        plt.close(fig)

        # The p-values follow the stacking order: common, singletons,
        # doubletons (1000 SNPs each).
        agr.plot_simple_qqplots_pvals(
            '%s_%d' % (file_prefix, i),
            [ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]],
            result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
            line_colors=['b', 'r', 'y'], num_dots=200, max_neg_log_val=3)

        # Cholesky permutations..
        for class_snps, thresholds in perm_tests:
            res = lm.emmax_perm_test(class_snps, y, K, num_perm=1000)
            print bonferroni_thres, res['threshold_05']
            thresholds.append(res['threshold_05'][0])

        # TODO: ATT permutations (Implement)
        # TODO: PC permutations (Implement)

    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
def _test_scz_():
    """
    Run the EMMAX / permutation-threshold comparison on simulated genotypes.

    NOTE(review): despite the function name, nothing in this body loads
    schizophrenia data; all genotypes and traits are simulated.

    Simulates 1000 singleton, 1000 doubleton and 1000 common SNPs for 500
    individuals, simulates 30 traits from the stacked genotypes, and for each
    trait prints the pseudo-heritability, writes a phenotype histogram and a
    QQ-plot under ``/home/bv25/tmp/``, and runs ``lm.emmax_perm_test`` on each
    SNP set.  The mean and standard deviation of the collected
    ``'threshold_05'`` values are printed at the end.
    """
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print snps

    phen_list = phenotypes.simulate_traits(
        snps, hdf5_file_prefix='/home/bv25/tmp/test', num_traits=30, p=1.0)

    # The kinship only depends on the genotypes, so compute it once instead
    # of once per simulated trait.
    K = kinship.scale_k(kinship.calc_ibd_kinship(snps))

    # Value printed next to every permutation threshold for comparison.
    bonferroni_thres = 1.0 / (20 * 1000.0)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    perm_tests = ((singleton_snps, singletons_thres),
                  (doubleton_snps, doubletons_thres),
                  (common_snps, common_thres))

    for i, y in enumerate(phen_list):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)

        # Close the histogram figure explicitly; otherwise one figure per
        # trait stays alive until the interpreter exits.
        fig = plt.figure()
        plt.hist(y, 50)
        plt.savefig('/home/bv25/tmp/test_%d_phen.png' % i)
        plt.close(fig)

        # The p-values follow the stacking order: common, singletons,
        # doubletons (1000 SNPs each).
        agr.plot_simple_qqplots_pvals(
            '/home/bv25/tmp/test_%d' % i,
            [ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]],
            result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
            line_colors=['b', 'r', 'y'], num_dots=200, max_neg_log_val=3)

        # Now permutations..
        for class_snps, thresholds in perm_tests:
            res = lm.emmax_perm_test(class_snps, y, K, num_perm=1000)
            print bonferroni_thres, res['threshold_05']
            thresholds.append(res['threshold_05'][0])

    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
def _test_scz_():
    """
    Run the EMMAX / permutation-threshold comparison on simulated genotypes.

    NOTE(review): despite the function name, nothing in this body loads
    schizophrenia data; all genotypes and traits are simulated.

    Three SNP sets (1000 singletons, 1000 doubletons, 1000 common SNPs, 500
    individuals each) are simulated and stacked with the common SNPs first.
    30 traits are simulated from the stacked matrix.  Per trait, the
    pseudo-heritability is printed, a histogram and a QQ-plot are saved under
    ``/home/bv25/tmp/``, and a permutation test is run on each SNP set.  The
    mean and standard deviation of the ``'threshold_05'`` values are printed
    last (singletons, doubletons, common).
    """
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print snps

    phen_list = phenotypes.simulate_traits(
        snps, hdf5_file_prefix='/home/bv25/tmp/test', num_traits=30, p=1.0)

    # Loop-invariant: the kinship matrix depends on the genotypes only.
    K = kinship.calc_ibd_kinship(snps)
    K = kinship.scale_k(K)

    # Reference value printed alongside each permutation threshold.
    bonferroni_thres = 1.0 / (20 * 1000.0)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []

    for i, y in enumerate(phen_list):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)

        # Release the histogram figure once it is on disk, so figures do not
        # pile up across the 30 traits.
        fig = plt.figure()
        plt.hist(y, 50)
        plt.savefig('/home/bv25/tmp/test_%d_phen.png' % i)
        plt.close(fig)

        pvals = ex_res['ps']
        agr.plot_simple_qqplots_pvals(
            '/home/bv25/tmp/test_%d' % i,
            [pvals[:1000], pvals[1000:2000], pvals[2000:]],
            result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
            line_colors=['b', 'r', 'y'], num_dots=200, max_neg_log_val=3)

        # Now permutations, one SNP class at a time.
        for class_snps, thresholds in ((singleton_snps, singletons_thres),
                                       (doubleton_snps, doubletons_thres),
                                       (common_snps, common_thres)):
            res = lm.emmax_perm_test(class_snps, y, K, num_perm=1000)
            print bonferroni_thres, res['threshold_05']
            thresholds.append(res['threshold_05'][0])

    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
def _test_GxE_mixed_model_gwas(
        num_indivs=1000,
        num_snps=10000,
        num_trait_pairs=10,
        plot_prefix='/Users/bjarnivilhjalmsson/tmp/test',
        num_causals=10):
    """
    Test for the multiple environment mixed model

    Simulates correlated trait pairs (per the original author's note, with
    exponentially distributed effects), stacks the two traits of each pair
    into one phenotype vector, duplicates the genotypes accordingly and runs
    ``lm.emmax_w_two_env`` with a 0/1 environment indicator.  Manhattan and
    QQ plots are written for the ``gt_res``, ``gt_g_res`` and ``g_res``
    results of every trait pair.

    The arguments are now honoured; previously ``num_indivs``, ``num_snps``
    and ``num_trait_pairs`` were overwritten inside the body (with 200, 10000
    and 10) so that passing them had no effect.

    Parameters:
        num_indivs: number of simulated individuals.
        num_snps: number of simulated SNPs.
        num_trait_pairs: number of simulated trait pairs to analyse.
        plot_prefix: path prefix for all generated PNG files.
        num_causals: number of causal SNPs per trait (in total there may be
            up to twice that, depending on genetic correlation).
    """
    import simulations
    import kinship
    import scipy as sp
    import linear_models as lm
    import gwaResults as gr

    # Simulating (unlinked) genotypes and phenotype pairs w. random positive
    # correlation
    d = simulations.get_simulated_data(num_indivs=num_indivs,
                                       num_snps=num_snps,
                                       num_trait_pairs=num_trait_pairs,
                                       num_causals=num_causals)

    # The genotypes, kinship and environment vector are identical for every
    # trait pair, so they are prepared once up front.
    # NOTE(review): assumes neither emmax_w_two_env nor the plotting code
    # modifies `sd`, `snps` or `K` in place -- confirm.
    sd = d['sd']
    snps = sd.get_snps()

    # Doubling the genotype data (one copy per environment).
    snps = sp.hstack([snps, snps])

    # Calculating the kinship using the duplicated genotypes
    K = kinship.calc_ibd_kinship(snps)
    print ''

    # Environment indicator: 0 for the first copy, 1 for the second.
    E = sp.zeros((2 * num_indivs, 1))
    E[num_indivs:, 0] = 1

    for i in range(num_trait_pairs):
        # The two different phenotypes.
        phen1 = d['trait_pairs'][i][0]
        phen2 = d['trait_pairs'][i][1]

        # Stacking up the two phenotypes into one vector.
        Y = sp.hstack([phen1, phen2])

        # The higher genetic correlation, the better the model fit (since we
        # assume genetic correlation is 1).
        print 'The genetic correlation between the two traits is %0.4f' % d['rho_est_list'][i][0, 1]

        print 'Here are the dimensions:'
        print 'Y.shape: ', Y.shape
        print 'snps.shape: ', snps.shape
        print 'E.shape: ', E.shape
        print 'K.shape: ', K.shape

        mm_results = lm.emmax_w_two_env(snps, Y, K, E)

        # Figuring out which loci are causal
        highlight_loci = sp.array(sd.get_chr_pos_list())[d['causal_indices_list'][i]]
        highlight_loci = highlight_loci.tolist()
        highlight_loci.sort()

        # Plotting stuff: one Manhattan plot and one QQ plot per result set.
        for label, res_key in (('gtres', 'gt_res'),
                               ('gtgres', 'gt_g_res'),
                               ('gres', 'g_res')):
            res = gr.Result(scores=mm_results[res_key]['ps'], snps_data=sd)
            res.plot_manhattan(png_file='%s_%d_%s_manhattan.png' % (plot_prefix, i, label),
                               percentile=50, highlight_loci=highlight_loci,
                               plot_bonferroni=True, neg_log_transform=True)
            res.plot_qq('%s_%d_%s_qq.png' % (plot_prefix, i, label))
def leave_k_out_blup(
        num_cvs=20,
        genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/',
        k_thres=0.5):
    """
    Cross-validated gBLUP prediction for the CEGS Drosophila phenotypes.

    For every phenotype/environment combination the coordinated data are
    loaded, an IBD kinship matrix is computed, and the individuals are split
    at random into folds.  For each fold a mixed model is fitted on the
    remaining individuals (REML), and the held-out values are predicted as
    ``K_cross * (h2 * K_train + (1 - h2) * I)^-1 * (Y_train - mean)``, where
    ``h2`` is the estimated pseudo-heritability.  The correlation and R2
    between observed and predicted values over all folds are printed and all
    results are written to an HDF5 file under
    ``/Users/bjarnivilhjalmsson/data/tmp/``.

    Parameters:
        num_cvs: target number of folds; the fold size is
            ``n // num_cvs + 1``, so fewer folds may actually be used.
        genotype_file: not used by this function (kept for compatibility).
        k_thres: kinship threshold forwarded to
            ``hdf5_data.coordinate_cegs_genotype_phenotype``.
    """
    import h5py
    import hdf5_data
    import kinship
    import linear_models as lm
    import scipy as sp

    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']

    res_dict = {}
    for phenotype in phenotypes:
        env_dict = {}
        for env in envs:
            print phenotype, env

            # Load data..
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env, k_thres=k_thres)
            Y_means = d['Y_means']
            snps = d['snps']
            # `sp.negative` on a boolean array raises TypeError on
            # NumPy >= 1.13, hence `not sp.any(...)`.
            assert not sp.any(sp.isnan(snps)), 'WTF?'
            K = kinship.calc_ibd_kinship(snps)
            print '\nKinship calculated'
            assert not sp.any(sp.isnan(K)), 'WTF?'
            n = len(Y_means)

            # Create the CV sets of prediction and validation data
            cv_chunk_size = n // num_cvs + 1
            ordering = sp.random.permutation(n)

            a = sp.arange(n)
            obs_ys = []
            pred_ys = []
            p_herits = []
            for i in range(0, n, cv_chunk_size):
                end_i = min(n, i + cv_chunk_size)
                validation_filter = sp.in1d(a, ordering[i:end_i])
                training_filter = ~validation_filter

                train_Y = Y_means[training_filter]
                val_Y = Y_means[validation_filter]

                # Kinship among training individuals, and between validation
                # (rows) and training (columns) individuals.
                K_train = K[training_filter, :][:, training_filter]
                K_cross = K[validation_filter, :][:, training_filter]

                # Do gBLUP
                lmm = lm.LinearMixedModel(train_Y)
                lmm.add_random_effect(K_train)
                r1 = lmm.get_REML()

                # Now the BLUP.
                y_mean = sp.mean(lmm.Y)
                Y = lmm.Y - y_mean
                p_herit = r1['pseudo_heritability']
                p_herits.append(p_herit)

                M = sp.mat(p_herit * sp.mat(K_train) +
                           (1 - p_herit) * sp.eye(K_train.shape[0]))
                u_mean_pred = sp.array(K_cross * (M.I * Y)).flatten()
                obs_ys.extend(val_Y)
                pred_ys.extend(u_mean_pred)

            corr = sp.corrcoef(obs_ys, pred_ys)[1, 0]
            print 'Correlation:', corr
            r2 = corr**2
            print 'R2:', r2
            mean_herit = sp.mean(p_herits)
            print 'Avg. heritability:', mean_herit
            env_dict[env] = {
                'R2': r2,
                'obs_y': obs_ys,
                'pred_y': pred_ys,
                'corr': corr,
                'avg_herit': mean_herit,
            }

        res_dict[phenotype] = env_dict

    res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_%d_BLUP_results_kthres_%0.1f.hdf5' % (
        num_cvs, k_thres)
    # Open explicitly in append mode (the historical h5py default; newer
    # versions default to read-only) and make sure the file is closed even
    # if writing fails.
    with h5py.File(res_hdf5_file, 'a') as h5f:
        for phenotype in phenotypes:
            phen_g = h5f.create_group(phenotype)
            for env in envs:
                d = res_dict[phenotype][env]
                env_g = phen_g.create_group(env)
                env_g.create_dataset('R2', data=[d['R2']])
                env_g.create_dataset('corr', data=[d['corr']])
                env_g.create_dataset('obs_y', data=d['obs_y'])
                env_g.create_dataset('pred_y', data=d['pred_y'])
                env_g.create_dataset('avg_herit', data=[d['avg_herit']])
def perform_cegs_gwas(kinship_type='ibd', phen_type='medians'):
    """
    Perform a simple MLM GWAS for the 8 traits

    The 8 traits are the combinations of the four phenotypes ('Protein',
    'Sugar', 'Triglyceride', 'weight') with the two environments ('mated',
    'virgin').  For each of them the data are loaded, a kinship matrix is
    computed, an EMMAX F-test is run, and a QQ plot and a Manhattan plot are
    written to ``/Users/bjarnivilhjalmsson/data/tmp/``.

    Parameters:
        kinship_type: 'ibs' or 'ibd'; selects the kinship function.
        phen_type: 'means' or 'medians'; selects which phenotype summary
            (``d['Y_means']`` or ``d['Y_medians']``) is analysed.

    Raises:
        NotImplementedError: for any other ``kinship_type`` or ``phen_type``.
    """
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr

    # Validate before doing any expensive work.  (The original code did
    # `raise NotImplemented`, which is not an exception class.)
    if kinship_type not in ('ibs', 'ibd'):
        raise NotImplementedError('Unsupported kinship_type: %r' % (kinship_type,))
    if phen_type not in ('means', 'medians'):
        raise NotImplementedError('Unsupported phen_type: %r' % (phen_type,))

    # Chromosome arms drawn in cyan; everything else is drawn in magenta.
    cyan_chroms = ('2L', '2LHet', '3L', '3LHet', '4', 'X', 'XHet')

    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    for phenotype in phenotypes:
        for env in envs:
            print phenotype, env
            s1 = time.time()
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env)
            print 'Calculating kinship'
            if kinship_type == 'ibs':
                K = kinship.calc_ibs_kinship(d['snps'])
            else:
                K = kinship.calc_ibd_kinship(d['snps'])

            if phen_type == 'means':
                lmm = lm.LinearMixedModel(d['Y_means'])
            else:
                lmm = lm.LinearMixedModel(d['Y_medians'])
            lmm.add_random_effect(K)

            print "Running EMMAX"
            res = lmm.emmax_f_test(d['snps'], emma_num=1000)
            print 'Mean p-value:', sp.mean(res['ps'])

            secs = time.time() - s1
            if secs > 60:
                mins = int(secs) // 60
                secs = secs - mins * 60
                print 'Took %d mins and %f seconds.' % (mins, secs)
            else:
                print 'Took %f seconds.' % (secs)

            # Now generating QQ-plots
            label_str = '%s_%s_%s_%s' % (kinship_type, phenotype, env, phen_type)
            agr.plot_simple_qqplots_pvals(
                '/Users/bjarnivilhjalmsson/data/tmp/cegs_qq_%s' % (label_str),
                [res['ps']],
                result_labels=[label_str],
                line_colors=['green'],
                num_dots=1000,
                title=None,
                max_neg_log_val=6)

            # Build the Manhattan plot coordinates: chromosomes are laid out
            # side by side, each shifted by the largest position seen so far.
            chromosomes = d['positions'][:, 0]
            positions = sp.array(d['positions'][:, 1], 'int32')
            x_positions = []
            y_log_pvals = []
            colors = []
            x_shift = 0
            for chrom in sp.unique(chromosomes):
                if chrom in cyan_chroms:
                    colors.append('c')
                else:
                    # chrom in ['2R', '2RHet', '3R', '3RHet', 'U', 'Uextra']
                    # Toss U and Hets
                    colors.append('m')
                chrom_filter = sp.in1d(chromosomes, chrom)
                positions_slice = positions[chrom_filter]
                x_positions.append(positions_slice + x_shift)
                x_shift += positions_slice.max()
                y_log_pvals.append(-sp.log10(res['ps'][chrom_filter]))

            m = len(positions)
            log_bonf = -sp.log10(1 / (20.0 * m))
            print m, log_bonf

            # Plot the Manhattan plot and release the figure afterwards, so
            # figures do not accumulate across the 8 traits.
            fig = plt.figure(figsize=(12, 4))
            plt.axes([0.03, 0.1, 0.95, 0.8])
            for xs, ys, color in zip(x_positions, y_log_pvals, colors):
                plt.plot(xs, ys, c=color, ls='', marker='.')
            xmin, xmax = plt.xlim()
            plt.hlines(log_bonf, xmin, xmax, colors='k', linestyles='--', alpha=0.5)
            plt.title('%s, %s' % (phenotype, env))
            plt.savefig(
                '/Users/bjarnivilhjalmsson/data/tmp/cegs_gwas_%s_%s_%s_%s.png' %
                (kinship_type, phenotype, env, phen_type))
            plt.close(fig)
def _test_GxE_mixed_model_gwas(num_indivs=1000, num_snps=10000, num_trait_pairs=10,
                               plot_prefix='/Users/bjarnivilhjalmsson/tmp/test',
                               num_causals=10):
    """
    Test for the multiple environment mixed model

    Simulates correlated trait pairs (per the original author's note, with
    exponentially distributed effects).  Each pair is stacked into a single
    phenotype vector, the genotypes are duplicated to match, and
    ``lm.emmax_w_two_env`` is run with a 0/1 environment indicator.  A
    Manhattan plot and a QQ plot are saved for each of the three result sets
    (``gt_res``, ``gt_g_res``, ``g_res``) of every pair.

    The size arguments are now respected.  Before this change the body
    reassigned ``num_trait_pairs = 10``, ``num_indivs = 200`` and
    ``num_snps = 10000``, so whatever the caller passed was ignored.

    Parameters:
        num_indivs: number of simulated individuals.
        num_snps: number of simulated SNPs.
        num_trait_pairs: number of trait pairs to simulate and analyse.
        plot_prefix: path prefix of the generated PNG files.
        num_causals: number of causal SNPs per trait (in total there may be
            up to twice that, depending on genetic correlation).
    """
    import simulations
    import kinship
    import scipy as sp
    import linear_models as lm
    import gwaResults as gr

    # Simulating (unlinked) genotypes and phenotype pairs w. random positive
    # correlation
    d = simulations.get_simulated_data(num_indivs=num_indivs, num_snps=num_snps,
                                       num_trait_pairs=num_trait_pairs,
                                       num_causals=num_causals)

    # Everything below up to the loop is the same for all trait pairs and is
    # therefore computed only once.
    # NOTE(review): assumes the model fit and plotting calls leave `sd`,
    # `snps` and `K` untouched -- confirm.
    sd = d['sd']

    # Doubling the genotype data (one copy per environment).
    single_env_snps = sd.get_snps()
    snps = sp.hstack([single_env_snps, single_env_snps])

    # Calculating the kinship using the duplicated genotypes
    K = kinship.calc_ibd_kinship(snps)
    print ''

    # Calculating the environment vector (0 = first copy, 1 = second copy)
    E = sp.zeros((2 * num_indivs, 1))
    E[num_indivs:, 0] = 1

    # (file-name label, key in the emmax_w_two_env result dictionary)
    result_sets = [('gtres', 'gt_res'), ('gtgres', 'gt_g_res'), ('gres', 'g_res')]

    for i in range(num_trait_pairs):
        # The two different phenotypes, stacked into one vector.
        phen1, phen2 = d['trait_pairs'][i][0], d['trait_pairs'][i][1]
        Y = sp.hstack([phen1, phen2])

        # The higher genetic correlation, the better the model fit (since we
        # assume genetic correlation is 1).
        print 'The genetic correlation between the two traits is %0.4f' % d['rho_est_list'][i][0, 1]

        print 'Here are the dimensions:'
        print 'Y.shape: ', Y.shape
        print 'snps.shape: ', snps.shape
        print 'E.shape: ', E.shape
        print 'K.shape: ', K.shape

        mm_results = lm.emmax_w_two_env(snps, Y, K, E)

        # Figuring out which loci are causal
        chr_pos = sp.array(sd.get_chr_pos_list())
        highlight_loci = chr_pos[d['causal_indices_list'][i]].tolist()
        highlight_loci.sort()

        # Plotting stuff
        for label, res_key in result_sets:
            res = gr.Result(scores=mm_results[res_key]['ps'], snps_data=sd)
            res.plot_manhattan(png_file='%s_%d_%s_manhattan.png' % (plot_prefix, i, label),
                               percentile=50, highlight_loci=highlight_loci,
                               plot_bonferroni=True, neg_log_transform=True)
            res.plot_qq('%s_%d_%s_qq.png' % (plot_prefix, i, label))
def leave_k_out_blup(num_repeats=20, num_cvs=5, genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/', k_thres=0.5):
    """
    Repeated cross-validated gBLUP prediction for the CEGS Drosophila
    phenotypes.

    The whole cross-validation is repeated ``num_repeats`` times.  In each
    repeat, and for every phenotype/environment combination, the coordinated
    data are loaded, an IBD kinship matrix is computed, and the individuals
    are split at random into folds.  For each fold a mixed model is fitted on
    the remaining individuals (REML) and the held-out values are predicted as
    ``K_cross * (h2 * K_train + (1 - h2) * I)^-1 * (Y_train - mean)``, with
    ``h2`` the estimated pseudo-heritability.  Correlation and R2 between
    observed and predicted values are printed, and all results are stored in
    an HDF5 file under ``/Users/bjarnivilhjalmsson/data/tmp/`` with one
    ``repl_<i>`` group per repeat.

    Parameters:
        num_repeats: number of times the cross-validation is repeated.
        num_cvs: target number of folds; the fold size is
            ``n // num_cvs + 1``, so fewer folds may actually be used.
        genotype_file: not used by this function (kept for compatibility).
        k_thres: kinship threshold forwarded to
            ``hdf5_data.coordinate_cegs_genotype_phenotype``.
    """
    import h5py
    import hdf5_data
    import kinship
    import linear_models as lm
    import scipy as sp

    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']

    rep_dict = {}
    for rep_i in range(num_repeats):
        res_dict = {}
        for phenotype in phenotypes:
            env_dict = {}
            for env in envs:
                print phenotype, env

                # Load data..
                d = hdf5_data.coordinate_cegs_genotype_phenotype(
                    phen_dict, phenotype, env, k_thres=k_thres)
                Y_means = d['Y_means']
                snps = d['snps']
                # `sp.negative` on a boolean array raises TypeError on
                # NumPy >= 1.13, hence `not sp.any(...)`.
                assert not sp.any(sp.isnan(snps)), 'WTF?'
                K = kinship.calc_ibd_kinship(snps)
                print '\nKinship calculated'
                assert not sp.any(sp.isnan(K)), 'WTF?'
                n = len(Y_means)

                # Create the CV sets of prediction and validation data
                cv_chunk_size = n // num_cvs + 1
                ordering = sp.random.permutation(n)

                a = sp.arange(n)
                obs_ys = []
                pred_ys = []
                p_herits = []
                for i in range(0, n, cv_chunk_size):
                    end_i = min(n, i + cv_chunk_size)
                    validation_filter = sp.in1d(a, ordering[i:end_i])
                    training_filter = ~validation_filter

                    train_Y = Y_means[training_filter]
                    val_Y = Y_means[validation_filter]

                    # Kinship among training individuals, and between
                    # validation (rows) and training (columns) individuals.
                    K_train = K[training_filter, :][:, training_filter]
                    K_cross = K[validation_filter, :][:, training_filter]

                    # Do gBLUP
                    lmm = lm.LinearMixedModel(train_Y)
                    lmm.add_random_effect(K_train)
                    r1 = lmm.get_REML()

                    # Now the BLUP.
                    y_mean = sp.mean(lmm.Y)
                    Y = lmm.Y - y_mean
                    p_herit = r1['pseudo_heritability']
                    p_herits.append(p_herit)

                    M = sp.mat(p_herit * sp.mat(K_train) +
                               (1 - p_herit) * sp.eye(K_train.shape[0]))
                    u_mean_pred = sp.array(K_cross * (M.I * Y)).flatten()
                    obs_ys.extend(val_Y)
                    pred_ys.extend(u_mean_pred)

                corr = sp.corrcoef(obs_ys, pred_ys)[1, 0]
                print 'Correlation:', corr
                r2 = corr**2
                print 'R2:', r2
                mean_herit = sp.mean(p_herits)
                print 'Avg. heritability:', mean_herit
                env_dict[env] = {'R2': r2, 'obs_y': obs_ys, 'pred_y': pred_ys,
                                 'corr': corr, 'avg_herit': mean_herit}

            res_dict[phenotype] = env_dict
        rep_dict[rep_i] = res_dict

    res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_%d_BLUP_results_kthres_%0.1f.hdf5' % (
        num_cvs, k_thres)
    # Open explicitly in append mode (the historical h5py default; newer
    # versions default to read-only) and make sure the file is closed even
    # if writing fails.
    with h5py.File(res_hdf5_file, 'a') as h5f:
        for rep_i in range(num_repeats):
            res_dict = rep_dict[rep_i]
            rep_g = h5f.create_group('repl_%d' % rep_i)
            for phenotype in phenotypes:
                phen_g = rep_g.create_group(phenotype)
                for env in envs:
                    d = res_dict[phenotype][env]
                    env_g = phen_g.create_group(env)
                    env_g.create_dataset('R2', data=[d['R2']])
                    env_g.create_dataset('corr', data=[d['corr']])
                    env_g.create_dataset('obs_y', data=d['obs_y'])
                    env_g.create_dataset('pred_y', data=d['pred_y'])
                    env_g.create_dataset('avg_herit', data=[d['avg_herit']])
def perform_cegs_gwas(kinship_type='ibd', phen_type='medians'):
    """
    Perform a simple MLM GWAS for the 8 traits

    The 8 traits are the four phenotypes ('Protein', 'Sugar', 'Triglyceride',
    'weight') in each of the two environments ('mated', 'virgin').  For every
    trait the coordinated data are loaded, a kinship matrix is computed, an
    EMMAX F-test is run, and a QQ plot plus a Manhattan plot are saved under
    ``/Users/bjarnivilhjalmsson/data/tmp/``.

    Parameters:
        kinship_type: 'ibs' or 'ibd'; selects the kinship function.
        phen_type: 'means' or 'medians'; selects ``d['Y_means']`` or
            ``d['Y_medians']`` as the phenotype.

    Raises:
        NotImplementedError: for any other ``kinship_type`` or ``phen_type``
            (now raised before any data are loaded).
    """
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr

    # Fail fast: an unsupported option used to be detected only after the
    # first data set had been loaded (and, for phen_type, after the kinship
    # had been computed).
    if kinship_type not in ('ibs', 'ibd'):
        raise NotImplementedError
    if phen_type not in ('means', 'medians'):
        raise NotImplementedError

    use_ibs = kinship_type == 'ibs'
    phen_key = 'Y_means' if phen_type == 'means' else 'Y_medians'

    # Chromosome arms drawn in cyan; all other chromosomes are magenta.
    cyan_chroms = ('2L', '2LHet', '3L', '3LHet', '4', 'X', 'XHet')

    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    for phenotype in phenotypes:
        for env in envs:
            print phenotype, env
            s1 = time.time()
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env)
            print 'Calculating kinship'
            if use_ibs:
                K = kinship.calc_ibs_kinship(d['snps'])
            else:
                K = kinship.calc_ibd_kinship(d['snps'])

            lmm = lm.LinearMixedModel(d[phen_key])
            lmm.add_random_effect(K)

            print "Running EMMAX"
            res = lmm.emmax_f_test(d['snps'], emma_num=1000)
            print 'Mean p-value:', sp.mean(res['ps'])

            secs = time.time() - s1
            if secs > 60:
                mins = int(secs) // 60
                secs = secs - mins * 60
                print 'Took %d mins and %f seconds.' % (mins, secs)
            else:
                print 'Took %f seconds.' % (secs)

            # Now generating QQ-plots
            label_str = '%s_%s_%s_%s' % (
                kinship_type, phenotype, env, phen_type)
            agr.plot_simple_qqplots_pvals('/Users/bjarnivilhjalmsson/data/tmp/cegs_qq_%s' % (label_str),
                                          [res['ps']], result_labels=[label_str],
                                          line_colors=['green'],
                                          num_dots=1000, title=None, max_neg_log_val=6)

            # Manhattan plot coordinates: chromosomes are placed one after
            # another, each shifted by the largest position seen so far.
            chromosomes = d['positions'][:, 0]
            positions = sp.array(d['positions'][:, 1], 'int32')
            chrom_tracks = []  # (x positions, -log10 p-values, colour)
            x_shift = 0
            for chrom in sp.unique(chromosomes):
                # Everything not listed in cyan_chroms, i.e.
                # ['2R', '2RHet', '3R', '3RHet', 'U', 'Uextra'], is magenta.
                # Toss U and Hets
                color = 'c' if chrom in cyan_chroms else 'm'
                chrom_filter = sp.in1d(chromosomes, chrom)
                positions_slice = positions[chrom_filter]
                log_ps_slice = -sp.log10(res['ps'][chrom_filter])
                chrom_tracks.append((positions_slice + x_shift, log_ps_slice, color))
                x_shift += positions_slice.max()

            m = len(positions)
            log_bonf = -sp.log10(1 / (20.0 * m))
            print m, log_bonf

            # Draw the Manhattan plot, then close the figure so that the
            # figures of the 8 traits do not all stay in memory.
            fig = plt.figure(figsize=(12, 4))
            plt.axes([0.03, 0.1, 0.95, 0.8])
            for xs, ys, color in chrom_tracks:
                plt.plot(xs, ys, c=color, ls='', marker='.')
            xmin, xmax = plt.xlim()
            plt.hlines(log_bonf, xmin, xmax, colors='k',
                       linestyles='--', alpha=0.5)
            plt.title('%s, %s' % (phenotype, env))
            plt.savefig('/Users/bjarnivilhjalmsson/data/tmp/cegs_gwas_%s_%s_%s_%s.png' %
                        (kinship_type, phenotype, env, phen_type))
            plt.close(fig)
def _test_():
    """
    Compare permutation thresholds for rare and common simulated SNPs.

    Three SNP sets are simulated for 500 individuals (1000 singletons, 1000
    doubletons, 1000 common SNPs) and stacked with the common SNPs first.
    Every row of the stacked matrix is centred and scaled, and 30 traits are
    simulated from it.  Per trait, the pseudo-heritability is printed, a
    phenotype histogram and a QQ-plot are saved under
    ``$HOME/tmp/test_<i>...``, and ``lm.emmax_perm_test`` is run on each of
    the three (unscaled) SNP sets.  At the end the mean and standard
    deviation of the ``'threshold_05'`` values are printed (singletons,
    doubletons, common).
    """
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print snps
    # Standardise along the second axis (transpose, normalise columns,
    # transpose back).
    snps = snps.T
    snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
    snps = snps.T
    print snps, snps.shape

    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits_w_snps_to_hdf5(
        snps, hdf5_file_prefix=file_prefix, num_traits=30, p=0.1)

    # Loop-invariant: the kinship matrix depends on the genotypes only.
    K = kinship.calc_ibd_kinship(snps)
    K = kinship.scale_k(K)

    # Reference value printed alongside each permutation threshold.
    bonferroni_thres = 1.0 / (20 * 1000.0)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []

    for i, y in enumerate(phen_list['phenotypes']):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)

        # Release the histogram figure once it is on disk, so figures do not
        # pile up across the 30 traits.
        fig = plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        plt.close(fig)

        pvals = ex_res['ps']
        agr.plot_simple_qqplots_pvals(
            '%s_%d' % (file_prefix, i),
            [pvals[:1000], pvals[1000:2000], pvals[2000:]],
            result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
            line_colors=['b', 'r', 'y'], num_dots=200, max_neg_log_val=3)

        # Cholesky permutations, one SNP class at a time.
        for class_snps, thresholds in ((singleton_snps, singletons_thres),
                                       (doubleton_snps, doubletons_thres),
                                       (common_snps, common_thres)):
            res = lm.emmax_perm_test(class_snps, y, K, num_perm=1000)
            print bonferroni_thres, res['threshold_05']
            thresholds.append(res['threshold_05'][0])

        # TODO: ATT permutations (Implement)
        # TODO: PC permutations (Implement)

    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
def coordinate_cegs_genotype_phenotype(phen_dict, phenotype='Protein',env='mated',k_thres=0.8, ind_missing_thres=0.5, snp_missing_thres=0.05, maf_thres=0.1,
                                       genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/CEGS.216.lines.NO_DPGP4.GATK.SNP.HETS.FILTERED.Filter_imputed.hdf5'):
    """
    Parse genotypes and coordinate with phenotype, and ready data for analysis.

    Filtering steps, in order:
      1. keep the individuals selected by ``phen_dict[phenotype][env]['ind_filter']``;
      2. drop individuals whose missing-genotype rate is >= ``ind_missing_thres``;
      3. drop a hard-coded list of "bad" genotype ids;
      4. drop SNPs whose missing rate is >= ``snp_missing_thres``;
      5. replace remaining missing genotypes with the SNP mean;
      6. drop SNPs with ``min(mean, 1 - mean) <= maf_thres``;
      7. drop every individual whose IBD kinship with any later individual
         exceeds ``k_thres`` and, if any were dropped, redo step 6.

    Parameters:
        phen_dict: nested dictionary ``phen_dict[phenotype][env]`` providing
            'ind_filter', 'Y_means', 'Y_medians' and 'rep_count'.
        phenotype, env: keys selecting the phenotype entry.
        k_thres: kinship threshold for step 7.
        ind_missing_thres, snp_missing_thres: missing-rate thresholds.
        maf_thres: threshold used in step 6.
            NOTE(review): this equals a minor allele frequency only if
            genotypes are coded in [0, 1] -- confirm.
        genotype_file: HDF5 file with the datasets 'gt', 'pos' and 'gt_ids'.

    Returns:
        dict with the keys 'Y_means', 'Y_medians', 'rep_count', 'gt_ids',
        'positions' and 'snps', all filtered consistently.
    """
    p_dict = phen_dict[phenotype][env]

    # Read everything needed from the file up front and close it right away;
    # the file is only read, so open it read-only.
    with h5py.File(genotype_file, 'r') as gh5f:
        print 'Loading SNPs'
        snps = sp.array(gh5f['gt'][...],dtype='single')
        positions = gh5f['pos'][...]
        gt_ids = gh5f['gt_ids'][p_dict['ind_filter']]

    snps = snps[:,p_dict['ind_filter']]
    m,n = snps.shape
    print 'Loaded %d SNPs for %d individuals'%(m,n)

    print 'Filtering individuals with missing rates >%0.2f'%ind_missing_thres
    missing_mat = sp.isnan(snps)
    ind_missing_rates = sp.sum(missing_mat,0)/float(m)
    ind_filter = ind_missing_rates<ind_missing_thres
    snps = snps[:,ind_filter]
    # `~mask` rather than `sp.negative(mask)`: the latter raises TypeError
    # for boolean arrays on NumPy >= 1.13.
    print 'Filtered %d individuals due to high missing rates'%sp.sum(~ind_filter)

    gt_ids = gt_ids[ind_filter]
    Y_means = p_dict['Y_means'][p_dict['ind_filter']]
    Y_means = Y_means[ind_filter]
    Y_medians = p_dict['Y_medians'][p_dict['ind_filter']]
    Y_medians = Y_medians[ind_filter]
    rep_count = p_dict['rep_count'][p_dict['ind_filter']]
    rep_count = rep_count[ind_filter]

    print 'Now removing "bad" genotypes.'
    bad_genotypes = ['Raleigh_272', 'Raleigh_378', 'Raleigh_554', 'Raleigh_591', 'Raleigh_398', 'Raleigh_138', 'Raleigh_208',
                     'Raleigh_336', 'Raleigh_370', 'Raleigh_373', 'Raleigh_374', 'Raleigh_799', 'Raleigh_821', 'Raleigh_822',
                     'Raleigh_884', 'Raleigh_335']
    ind_filter = ~sp.in1d(gt_ids,bad_genotypes)
    gt_ids = gt_ids[ind_filter]
    Y_means= Y_means[ind_filter]
    Y_medians= Y_medians[ind_filter]
    rep_count= rep_count[ind_filter]
    snps = snps[:,ind_filter]
    print 'Removed %d "bad" genotypes'%sp.sum(~ind_filter)

    # Number of remaining individuals (well defined even with zero SNPs).
    n = snps.shape[1]

    print 'Filtering SNPs with missing rate >%0.2f'%snp_missing_thres
    missing_mat = sp.isnan(snps)
    snp_missing_rates = sp.sum(missing_mat,1)/float(n)
    snps_filter = snp_missing_rates<snp_missing_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    print 'Filtered %d SNPs due to high missing rate'%sp.sum(~snps_filter)

    print 'Now imputing (w mean)'
    missing_mat = sp.isnan(snps)
    ok_counts = n-sp.sum(missing_mat,1)
    # Zero the missing entries so that the row sums only cover observed
    # genotypes, then fill them with the per-SNP mean.
    snps[missing_mat]=0
    snp_means = sp.sum(snps,1)/ok_counts
    for i, snp_mean in enumerate(snp_means):
        snps[i,missing_mat[i]]=snp_mean

    print 'And filtering SNPs with MAF<%0.2f'%maf_thres
    snp_means = sp.mean(snps,1)
    snp_mafs = sp.minimum(snp_means,1-snp_means)
    snps_filter = snp_mafs>maf_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    print 'Filtered %d SNPs with low MAFs'%sp.sum(~snps_filter)

    print 'Filtering based on kinship w threshold:',k_thres
    import kinship
    K = kinship.calc_ibd_kinship(snps)
    print '\nKinship calculated'
    # Individual i is kept unless it is too closely related to an individual
    # with a higher index (so one member of each related pair survives).
    K_ind_filter = []
    for i in range(n):
        K_ind_filter.append(not sp.any(K[i,i+1:n]>k_thres))
    num_kept = sum(K_ind_filter)
    if num_kept==n:
        print 'No individuals were filtered based on kinship..'
    else:
        print 'Filtering %d individuals based on kinship.'%(n-num_kept)
        K_ind_filter = sp.array(K_ind_filter)
        gt_ids = gt_ids[K_ind_filter]
        Y_means= Y_means[K_ind_filter]
        Y_medians= Y_medians[K_ind_filter]
        rep_count= rep_count[K_ind_filter]
        snps = snps[:,K_ind_filter]

        # Removing individuals changes the allele frequencies.
        print 'Again filtering SNPs with MAF<%0.2f'%maf_thres
        snp_means = sp.mean(snps,1)
        snp_mafs = sp.minimum(snp_means,1-snp_means)
        snps_filter = snp_mafs>maf_thres
        snps = snps[snps_filter]
        positions = positions[snps_filter]
        print 'Filtered %d additional SNPs with low MAFs'%sp.sum(~snps_filter)

    print 'All filtering done.'
    m,n = snps.shape
    print 'In all there are %d SNPs remaining, for %d individuals.'%(m,n)
    ret_dict = {'Y_means':Y_means, 'Y_medians':Y_medians, 'rep_count':rep_count, 'gt_ids':gt_ids,
                'positions':positions, 'snps':snps}
    return ret_dict
def get_ibd_kinship_matrix(self, debug_filter=1, dtype='single',chunk_size=None):
    """
    Return the IBD kinship matrix computed by ``kinship.calc_ibd_kinship``
    for this object.

    Only ``chunk_size`` is forwarded to the kinship routine;
    ``debug_filter`` and ``dtype`` are accepted but not used here.  The start
    and the end of the calculation are logged at debug level.
    """
    log.debug('Starting IBD calculation')
    kinship_matrix = kinship.calc_ibd_kinship(self, chunk_size=chunk_size)
    log.debug('Finished calculating IBD kinship matrix')
    return kinship_matrix
def get_ibd_kinship_matrix(self, debug_filter=1, dtype='single',chunk_size=None):
    """
    Return the IBD kinship matrix computed by ``kinship.calc_ibd_kinship``
    for this object.

    Only ``chunk_size`` is forwarded to the kinship routine;
    ``debug_filter`` and ``dtype`` are accepted but not used here.  The start
    and the end of the calculation are logged at debug level.
    """
    log.debug('Starting IBD calculation')
    # Keep the result in a local so the completion message is actually
    # logged; returning the call directly left the lines below unreachable
    # (and `cov_mat` undefined).
    cov_mat = kinship.calc_ibd_kinship(self, chunk_size=chunk_size)
    log.debug('Finished calculating IBD kinship matrix')
    return cov_mat
def coordinate_cegs_genotype_phenotype(
        phen_dict,
        phenotype='Protein',
        env='mated',
        k_thres=0.8,
        ind_missing_thres=0.5,
        snp_missing_thres=0.05,
        maf_thres=0.1,
        genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/CEGS.216.lines.NO_DPGP4.GATK.SNP.HETS.FILTERED.Filter_imputed.hdf5'
):
    """
    Parse genotypes and coordinate with phenotype, and ready data for analysis.

    Filtering steps, in order:
      1. keep the individuals selected by
         ``phen_dict[phenotype][env]['ind_filter']``;
      2. drop individuals whose missing-genotype rate is
         >= ``ind_missing_thres``;
      3. drop a hard-coded list of "bad" genotype ids;
      4. drop SNPs whose missing rate is >= ``snp_missing_thres``;
      5. replace remaining missing genotypes with the SNP mean;
      6. drop SNPs with ``min(mean, 1 - mean) <= maf_thres``;
      7. drop every individual whose IBD kinship with any later individual
         exceeds ``k_thres`` and, if any were dropped, redo step 6.

    Parameters:
        phen_dict: nested dictionary ``phen_dict[phenotype][env]`` providing
            'ind_filter', 'Y_means', 'Y_medians' and 'rep_count'.
        phenotype, env: keys selecting the phenotype entry.
        k_thres: kinship threshold for step 7.
        ind_missing_thres, snp_missing_thres: missing-rate thresholds.
        maf_thres: threshold used in step 6.
            NOTE(review): this equals a minor allele frequency only if
            genotypes are coded in [0, 1] -- confirm.
        genotype_file: HDF5 file with the datasets 'gt', 'pos' and 'gt_ids'.

    Returns:
        dict with the keys 'Y_means', 'Y_medians', 'rep_count', 'gt_ids',
        'positions' and 'snps', all filtered consistently.
    """
    p_dict = phen_dict[phenotype][env]
    pheno_ind_filter = p_dict['ind_filter']

    # The file is only read: open it read-only, pull out the three datasets
    # and close it again instead of leaving the handle open.
    with h5py.File(genotype_file, 'r') as gh5f:
        print 'Loading SNPs'
        snps = sp.array(gh5f['gt'][...], dtype='single')
        positions = gh5f['pos'][...]
        gt_ids = gh5f['gt_ids'][pheno_ind_filter]

    snps = snps[:, pheno_ind_filter]
    Y_means = p_dict['Y_means'][pheno_ind_filter]
    Y_medians = p_dict['Y_medians'][pheno_ind_filter]
    rep_count = p_dict['rep_count'][pheno_ind_filter]
    m, n = snps.shape
    print 'Loaded %d SNPs for %d individuals' % (m, n)

    print 'Filtering individuals with missing rates >%0.2f' % ind_missing_thres
    missing_mat = sp.isnan(snps)
    ind_missing_rates = sp.sum(missing_mat, 0) / float(m)
    ind_filter = ind_missing_rates < ind_missing_thres
    snps = snps[:, ind_filter]
    # `~mask` rather than `sp.negative(mask)`: the latter raises TypeError
    # for boolean arrays on NumPy >= 1.13.
    print 'Filtered %d individuals due to high missing rates' % sp.sum(~ind_filter)

    gt_ids = gt_ids[ind_filter]
    Y_means = Y_means[ind_filter]
    Y_medians = Y_medians[ind_filter]
    rep_count = rep_count[ind_filter]

    print 'Now removing "bad" genotypes.'
    bad_genotypes = [
        'Raleigh_272', 'Raleigh_378', 'Raleigh_554', 'Raleigh_591',
        'Raleigh_398', 'Raleigh_138', 'Raleigh_208', 'Raleigh_336',
        'Raleigh_370', 'Raleigh_373', 'Raleigh_374', 'Raleigh_799',
        'Raleigh_821', 'Raleigh_822', 'Raleigh_884', 'Raleigh_335'
    ]
    ind_filter = ~sp.in1d(gt_ids, bad_genotypes)
    gt_ids = gt_ids[ind_filter]
    Y_means = Y_means[ind_filter]
    Y_medians = Y_medians[ind_filter]
    rep_count = rep_count[ind_filter]
    snps = snps[:, ind_filter]
    print 'Removed %d "bad" genotypes' % sp.sum(~ind_filter)

    # Number of remaining individuals (well defined even with zero SNPs).
    n = snps.shape[1]

    print 'Filtering SNPs with missing rate >%0.2f' % snp_missing_thres
    missing_mat = sp.isnan(snps)
    snp_missing_rates = sp.sum(missing_mat, 1) / float(n)
    snps_filter = snp_missing_rates < snp_missing_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    print 'Filtered %d SNPs due to high missing rate' % sp.sum(~snps_filter)

    print 'Now imputing (w mean)'
    missing_mat = sp.isnan(snps)
    ok_counts = n - sp.sum(missing_mat, 1)
    # Zero the missing entries so that the row sums only cover observed
    # genotypes, then fill them with the per-SNP mean.
    snps[missing_mat] = 0
    snp_means = sp.sum(snps, 1) / ok_counts
    for i, snp_mean in enumerate(snp_means):
        snps[i, missing_mat[i]] = snp_mean

    print 'And filtering SNPs with MAF<%0.2f' % maf_thres
    snp_means = sp.mean(snps, 1)
    snp_mafs = sp.minimum(snp_means, 1 - snp_means)
    snps_filter = snp_mafs > maf_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    print 'Filtered %d SNPs with low MAFs' % sp.sum(~snps_filter)

    print 'Filtering based on kinship w threshold:', k_thres
    import kinship
    K = kinship.calc_ibd_kinship(snps)
    print '\nKinship calculated'
    # Individual i is kept unless it is too closely related to an individual
    # with a higher index (so one member of each related pair survives).
    K_ind_filter = [not sp.any(K[i, i + 1:n] > k_thres) for i in range(n)]
    num_kept = sum(K_ind_filter)
    if num_kept == n:
        print 'No individuals were filtered based on kinship..'
    else:
        print 'Filtering %d individuals based on kinship.' % (n - num_kept)
        K_ind_filter = sp.array(K_ind_filter)
        gt_ids = gt_ids[K_ind_filter]
        Y_means = Y_means[K_ind_filter]
        Y_medians = Y_medians[K_ind_filter]
        rep_count = rep_count[K_ind_filter]
        snps = snps[:, K_ind_filter]

        # Removing individuals changes the allele frequencies.
        print 'Again filtering SNPs with MAF<%0.2f' % maf_thres
        snp_means = sp.mean(snps, 1)
        snp_mafs = sp.minimum(snp_means, 1 - snp_means)
        snps_filter = snp_mafs > maf_thres
        snps = snps[snps_filter]
        positions = positions[snps_filter]
        print 'Filtered %d additional SNPs with low MAFs' % sp.sum(~snps_filter)

    print 'All filtering done.'
    m, n = snps.shape
    print 'In all there are %d SNPs remaining, for %d individuals.' % (m, n)
    ret_dict = {
        'Y_means': Y_means,
        'Y_medians': Y_medians,
        'rep_count': rep_count,
        'gt_ids': gt_ids,
        'positions': positions,
        'snps': snps
    }
    return ret_dict