def get_blup(self, pid, K):
    """
    Returns the REML estimate for the BLUP and the pseudo-heritability.
    """
    from scipy import stats
    import scipy as sp
    import linear_models as lm
    phen_vals = self.get_values(pid)
    lmm = lm.LinearMixedModel(phen_vals)
    if len(phen_vals) == len(set(phen_vals)):
        lmm.add_random_effect(K)
    else:
        # Replicates present: expand K with the incidence matrix.
        Z = self.get_incidence_matrix(pid)
        lmm.add_random_effect(Z * K * Z.T)
    r1 = lmm.get_REML()
    ll1 = r1['max_ll']
    rlm = lm.LinearModel(phen_vals)
    ll0 = rlm.get_ll()
    lrt_stat = 2 * (ll1 - ll0)
    # Note: under H0 the variance component lies on the boundary of the
    # parameter space, so this chi^2(1) p-value is conservative.
    pval = stats.chi2.sf(lrt_stat, 1)

    # Now the BLUP.  (This assumes one observation per line; with replicates
    # K would need expanding with Z as above.)
    y_mean = sp.mean(lmm.Y)
    Y = lmm.Y - y_mean
    p_herit = r1['pseudo_heritability']
    delta = (1 - p_herit) / p_herit
    # Equivalent formulation via the inverse kinship matrix:
    #   M = sp.eye(K.shape[0]) + delta * K.I
    #   u_blup = M.I * Y
    M = (K + delta * sp.eye(K.shape[0]))
    u_mean_pred = K * (M.I * Y)
    blup_residuals = Y - u_mean_pred
    return {'pseudo_heritability': r1['pseudo_heritability'], 'pval': pval,
            'u_blup': u_mean_pred, 'blup_residuals': blup_residuals}

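
# A minimal, self-contained sketch of the BLUP algebra used in get_blup()
# above, assuming the pseudo-heritability is already known (get_blup()
# estimates it via REML first).  The helper name is illustrative, not part
# of the module API.
import scipy as sp

def _blup_sketch_(y, K, p_herit):
    """Return E[u | y] = K (K + delta I)^{-1} (y - mean(y)), delta = (1-h2)/h2."""
    y = sp.mat(y).T
    y = y - sp.mean(y)
    delta = (1.0 - p_herit) / p_herit  # ratio of residual to genetic variance
    M = sp.mat(K) + delta * sp.eye(K.shape[0])
    return sp.array(sp.mat(K) * (M.I * y)).flatten()
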
def _get_estimates_(self):
    print "Initializing mixed model..."
    self.lmm = lm.LinearMixedModel(self.pvls)
    self.lmm.add_random_effect(self.k)
    eig_L = self.lmm._get_eigen_L_()
    print "Estimating variance components..."
    self.est = self.lmm.get_estimates(eig_L, self.k)

def _test_scz_():
    # Simulate genotype data for the schizophrenia test.
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)
    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    test_snps = sp.vstack([singleton_snps, doubleton_snps])
    print snps
    phen_list = phenotypes.simulate_traits(snps, hdf5_file_prefix='/home/bv25/tmp/test',
                                           num_traits=30, p=1.0)

    # The kinship does not depend on the phenotype, so compute it once.
    K = kinship.calc_ibd_kinship(snps)
    K = kinship.scale_k(K)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('/home/bv25/tmp/test_%d_phen.png' % i)
        plt.clf()
        agr.plot_simple_qqplots_pvals('/home/bv25/tmp/test_%d' % i,
                                      [ex_res['ps'][:1000], ex_res['ps'][1000:2000],
                                       ex_res['ps'][2000:]],
                                      result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200, max_neg_log_val=3)

        # Now the permutations..
        res = lm.emmax_perm_test(singleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        singletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(doubleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        doubletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(common_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        common_thres.append(res['threshold_05'][0])
    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)

def __init__(self, y, fixed_effects=None, K=None, Z=None):
    self.lmm = linear_models.LinearMixedModel(Y=y)
    if Z is not None:
        # Replicated design: expand the kinship (and any cofactors) with the
        # incidence matrix Z.
        self.lmm.add_random_effect(Z * K * Z.T)
        if fixed_effects is not None:
            for cofactor in fixed_effects:
                self.lmm.add_factor(Z * cofactor)
    else:
        self.lmm.add_random_effect(K)
        if fixed_effects:
            for cofactor in fixed_effects:
                self.lmm.add_factor(cofactor)

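
# A hedged sketch of what the incidence matrix Z above looks like: one row
# per phenotype observation, one column per genotyped line, with a 1 linking
# each replicate to its line.  Names here are illustrative, not part of the
# module API.
import scipy as sp

def _incidence_matrix_sketch_(genotype_indices, num_lines):
    """genotype_indices[i] is the line index of observation i."""
    Z = sp.zeros((len(genotype_indices), num_lines))
    for obs_i, line_i in enumerate(genotype_indices):
        Z[obs_i, line_i] = 1
    return sp.mat(Z)

# E.g. 5 observations of 3 lines; Z * K * Z.T is then the 5x5 kinship among
# observations:
#   Z = _incidence_matrix_sketch_([0, 0, 1, 2, 2], 3)
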
def get_pseudo_heritability(self, K):
    """
    Returns the REML estimate of the pseudo-heritability, along with a
    likelihood-ratio p-value for the variance component.
    """
    from scipy import stats
    import linear_models as lm
    phen_vals = self.values
    lmm = lm.LinearMixedModel(phen_vals)
    if len(phen_vals) == len(set(phen_vals)):
        lmm.add_random_effect(K)
    else:
        # Replicates present: expand K with the incidence matrix.
        Z = self.get_incidence_matrix()
        lmm.add_random_effect(Z * K * Z.T)
    r1 = lmm.get_REML()
    ll1 = r1['max_ll']
    rlm = lm.LinearModel(phen_vals)
    ll0 = rlm.get_ll()
    lrt_stat = 2 * (ll1 - ll0)
    pval = stats.chi2.sf(lrt_stat, 1)
    return {'pseudo_heritability': r1['pseudo_heritability'], 'pval': pval}

def leave_k_out_blup(num_cvs=20,
                     genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/',
                     k_thres=0.5):
    """
    Leave-k-out cross-validation of genomic BLUP predictions for the CEGS
    Drosophila phenotypes.
    """
    import h5py
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()
    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    res_dict = {}
    for phenotype in phenotypes:
        env_dict = {}
        for env in envs:
            print phenotype, env
            s1 = time.time()

            # Load data..
            d = hdf5_data.coordinate_cegs_genotype_phenotype(phen_dict, phenotype,
                                                             env, k_thres=k_thres)
            Y_means = d['Y_means']
            snps = d['snps']
            assert sp.all(sp.negative(sp.isnan(snps))), 'Genotypes contain NaNs?'
            K = kinship.calc_ibd_kinship(snps)
            print '\nKinship calculated'
            assert sp.all(sp.negative(sp.isnan(K))), 'Kinship contains NaNs?'
            n = len(Y_means)

            # Create num_cvs cross-validation sets of training and validation data.
            cv_chunk_size = int((n / num_cvs) + 1)
            ordering = sp.random.permutation(n)
            a = sp.arange(n)
            obs_ys = []
            pred_ys = []
            p_herits = []
            for cv_i, i in enumerate(range(0, n, cv_chunk_size)):
                # print 'Working on CV %d' % cv_i
                end_i = min(n, i + cv_chunk_size)
                validation_filter = sp.in1d(a, ordering[i:end_i])
                training_filter = sp.negative(validation_filter)
                train_Y = Y_means[training_filter]
                val_Y = Y_means[validation_filter]

                # Slice the kinship into training and cross terms.
                K_train = K[training_filter, :][:, training_filter]
                K_cross = K[validation_filter, :][:, training_filter]

                # Do gBLUP
                lmm = lm.LinearMixedModel(train_Y)
                lmm.add_random_effect(K_train)
                r1 = lmm.get_REML()

                # Now the BLUP.
                y_mean = sp.mean(lmm.Y)
                Y = lmm.Y - y_mean
                p_herit = r1['pseudo_heritability']
                p_herits.append(p_herit)
                # Note: relative to get_blup() this omits the leading p_herit
                # factor, so the predictions are scaled by 1/p_herit; the
                # correlation and R2 reported below are unaffected by scaling.
                M = sp.mat(p_herit * sp.mat(K_train) +
                           (1 - p_herit) * sp.eye(K_train.shape[0]))
                u_mean_pred = sp.array(K_cross * (M.I * Y)).flatten()
                obs_ys.extend(val_Y)
                pred_ys.extend(u_mean_pred)
            corr = sp.corrcoef(obs_ys, pred_ys)[1, 0]
            print 'Correlation:', corr
            r2 = corr ** 2
            print 'R2:', r2
            mean_herit = sp.mean(p_herits)
            print 'Avg. heritability:', mean_herit
            env_dict[env] = {'R2': r2, 'obs_y': obs_ys, 'pred_y': pred_ys,
                             'corr': corr, 'avg_herit': mean_herit}
        res_dict[phenotype] = env_dict

    res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_%d_BLUP_results_kthres_%0.1f.hdf5' % (num_cvs, k_thres)
    h5f = h5py.File(res_hdf5_file)
    for phenotype in phenotypes:
        phen_g = h5f.create_group(phenotype)
        for env in envs:
            d = res_dict[phenotype][env]
            env_g = phen_g.create_group(env)
            env_g.create_dataset('R2', data=[d['R2']])
            env_g.create_dataset('corr', data=[d['corr']])
            env_g.create_dataset('obs_y', data=d['obs_y'])
            env_g.create_dataset('pred_y', data=d['pred_y'])
            env_g.create_dataset('avg_herit', data=[d['avg_herit']])
    h5f.close()

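
# A minimal sketch of the kinship-slicing used above: given a full kinship K
# and a boolean validation mask, the BLUP for the held-out lines only needs
# the training block and the cross block of K.  Illustrative only; this
# version includes the leading p_herit factor for consistency with get_blup().
import scipy as sp

def _cv_blup_sketch_(K, y, val_mask, p_herit):
    train_mask = sp.negative(val_mask)
    K_train = K[train_mask, :][:, train_mask]
    K_cross = K[val_mask, :][:, train_mask]
    y_train = sp.mat(y[train_mask]).T
    y_train = y_train - sp.mean(y_train)
    M = sp.mat(p_herit * K_train + (1 - p_herit) * sp.eye(K_train.shape[0]))
    return sp.array(p_herit * sp.mat(K_cross) * (M.I * y_train)).flatten()
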
def perform_cegs_gwas(kinship_type='ibd', phen_type='medians'):
    """
    Perform a simple MLM GWAS for the 8 traits
    """
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr
    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()
    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    for phenotype in phenotypes:
        for env in envs:
            print phenotype, env
            s1 = time.time()
            d = hdf5_data.coordinate_cegs_genotype_phenotype(phen_dict, phenotype, env)
            print 'Calculating kinship'
            if kinship_type == 'ibs':
                K = kinship.calc_ibs_kinship(d['snps'])
            elif kinship_type == 'ibd':
                K = kinship.calc_ibd_kinship(d['snps'])
            else:
                raise NotImplementedError
            if phen_type == 'means':
                lmm = lm.LinearMixedModel(d['Y_means'])
            elif phen_type == 'medians':
                lmm = lm.LinearMixedModel(d['Y_medians'])
            else:
                raise NotImplementedError
            lmm.add_random_effect(K)
            print "Running EMMAX"
            res = lmm.emmax_f_test(d['snps'], emma_num=1000)
            print 'Mean p-value:', sp.mean(res['ps'])

            secs = time.time() - s1
            if secs > 60:
                mins = int(secs) / 60
                secs = secs - mins * 60
                print 'Took %d mins and %f seconds.' % (mins, secs)
            else:
                print 'Took %f seconds.' % (secs)

            # Now generating the QQ-plots
            label_str = '%s_%s_%s_%s' % (kinship_type, phenotype, env, phen_type)
            agr.plot_simple_qqplots_pvals('/Users/bjarnivilhjalmsson/data/tmp/cegs_qq_%s' % (label_str),
                                          [res['ps']], result_labels=[label_str],
                                          line_colors=['green'], num_dots=1000,
                                          title=None, max_neg_log_val=6)

            # Prepare the data for a Manhattan plot.
            chromosomes = d['positions'][:, 0]
            positions = sp.array(d['positions'][:, 1], 'int32')
            x_positions = []
            y_log_pvals = []
            colors = []
            x_shift = 0
            for i, chrom in enumerate(sp.unique(chromosomes)):
                if chrom in ['2L', '2LHet', '3L', '3LHet', '4', 'X', 'XHet']:
                    colors.append('c')
                else:  # chrom in ['2R', '2RHet', '3R', '3RHet', 'U', 'Uextra']
                    # Toss U and Hets
                    colors.append('m')
                chrom_filter = sp.in1d(chromosomes, chrom)
                positions_slice = positions[chrom_filter]
                x_positions.append(positions_slice + x_shift)
                x_shift += positions_slice.max()
                log_ps_slice = -sp.log10(res['ps'][chrom_filter])
                y_log_pvals.append(log_ps_slice)

            m = len(positions)
            log_bonf = -sp.log10(1 / (20.0 * m))
            print m, log_bonf

            # Draw the Manhattan plot.
            plt.figure(figsize=(12, 4))
            plt.axes([0.03, 0.1, 0.95, 0.8])
            for i, chrom in enumerate(sp.unique(chromosomes)):
                plt.plot(x_positions[i], y_log_pvals[i], c=colors[i], ls='', marker='.')
            xmin, xmax = plt.xlim()
            plt.hlines(log_bonf, xmin, xmax, colors='k', linestyles='--', alpha=0.5)
            plt.title('%s, %s' % (phenotype, env))
            plt.savefig('/Users/bjarnivilhjalmsson/data/tmp/cegs_gwas_%s_%s_%s_%s.png'
                        % (kinship_type, phenotype, env, phen_type))

def _emmax_permutations(self, snps, phenotypes, num_perm, K=None, Z=None, method='REML'):
    """
    EMMAX permutation test
    Single SNPs

    Returns the list of min p-values and max F-statistics.
    """
    lmm = lm.LinearMixedModel(phenotypes)
    lmm.add_random_effect(Z * K * Z.T)
    eig_L = lmm._get_eigen_L_()

    print 'Getting variance estimates'
    res = lmm.get_estimates(eig_L, method=method)
    q = 1  # Single SNP is being tested
    p = len(lmm.X.T) + q
    n = lmm.n
    n_p = n - p
    H_sqrt_inv = res['H_sqrt_inv']
    Y = H_sqrt_inv * lmm.Y  # The transformed outputs.
    h0_X = H_sqrt_inv * lmm.X
    (h0_betas, h0_rss, h0_rank, h0_s) = linalg.lstsq(h0_X, Y)
    Y = Y - h0_X * h0_betas
    num_snps = len(snps)
    chunk_size = len(Y)
    print "Working with chunk size: " + str(chunk_size)
    print "and " + str(num_snps) + " SNPs."

    # Shuffle the (transformed, residualized) phenotype once per permutation.
    Ys = sp.mat(sp.zeros((chunk_size, num_perm)))
    for perm_i in range(num_perm):
        # print 'Permutation nr. %d' % perm_i
        sp.random.shuffle(Y)
        Ys[:, perm_i] = Y

    # How often to print a progress dot (guard against a zero interval).
    dot_interval = max(1, num_snps / num_perm)
    min_rss_list = sp.repeat(h0_rss, num_perm)
    for i in range(0, num_snps, chunk_size):  # Do the dot-product in chunks!
        snps_chunk = sp.matrix(snps[i:(i + chunk_size)])
        snps_chunk = snps_chunk * Z.T
        Xs = snps_chunk * (H_sqrt_inv.T)
        Xs = Xs - sp.mat(sp.mean(Xs, axis=1))
        for j in range(len(Xs)):  # For each SNP
            (betas, rss_list, rank, s) = linalg.lstsq(Xs[j].T, Ys, overwrite_a=True)
            for k, rss in enumerate(rss_list):
                if not rss:
                    print 'No predictability in the marker, moving on...'
                    continue
                if min_rss_list[k] > rss:
                    min_rss_list[k] = rss
            if num_snps >= 10 and (i + j + 1) % dot_interval == 0:  # Print dots
                sys.stdout.write('.')
                sys.stdout.flush()

    if num_snps >= 10:
        sys.stdout.write('\n')

    max_f_stats = ((h0_rss / min_rss_list) - 1.0) * n_p / float(q)
    min_pvals = stats.f.sf(max_f_stats, q, n_p)

    res_d = {'min_ps': min_pvals, 'max_f_stats': max_f_stats}
    print "There are %d minimum p-values." % len(min_pvals)
    return res_d

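
# A hedged sketch of how the permutation output above yields a genome-wide
# significance threshold: sort the per-permutation minimum p-values and take
# the 5th percentile (cf. the five_perc_i logic in run_emmax_perm below).
# The helper name is illustrative.
import scipy as sp

def _perm_threshold_sketch_(min_ps, alpha=0.05):
    min_ps = sp.sort(min_ps)
    return min_ps[int(alpha * len(min_ps))]
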
def _test_():
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)
    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print snps
    # Standardize the SNPs (individuals are columns).
    snps = snps.T
    snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
    snps = snps.T
    print snps, snps.shape
    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits_w_snps_to_hdf5(snps, hdf5_file_prefix=file_prefix,
                                                          num_traits=30, p=0.1)

    # The kinship does not depend on the phenotype, so compute it once.
    K = kinship.calc_ibd_kinship(snps)
    K = kinship.scale_k(K)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list['phenotypes']):
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        plt.clf()
        agr.plot_simple_qqplots_pvals('%s_%d' % (file_prefix, i),
                                      [ex_res['ps'][:1000], ex_res['ps'][1000:2000],
                                       ex_res['ps'][2000:]],
                                      result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200, max_neg_log_val=3)

        # Cholesky permutations..
        res = lm.emmax_perm_test(singleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        singletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(doubleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        doubletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(common_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        common_thres.append(res['threshold_05'][0])

        # ATT permutations (Implement)
        # PC permutations (Implement)
    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)

def run_emmax(hdf5_filename='/home/bv25/data/Ls154/Ls154_12.hdf5',
              out_file='/home/bv25/data/Ls154/Ls154_results.hdf5',
              min_maf=0.1, recalculate_kinship=True, chunk_size=1000):
    """
    Apply the EMMAX algorithm to hdf5 formatted genotype/phenotype data
    """
    ih5f = h5py.File(hdf5_filename)
    gg = ih5f['genot_data']
    ig = ih5f['indiv_data']
    n_indivs = len(ig['indiv_ids'][...])

    if recalculate_kinship:
        print 'Calculating kinship.'
        k_mat = sp.zeros((n_indivs, n_indivs), dtype='single')

        chromosomes = gg.keys()
        n_snps = 0
        for chrom in chromosomes:
            print 'Working on Chromosome %s' % chrom
            cg = gg[chrom]
            freqs = cg['freqs'][...]
            mafs = sp.minimum(freqs, 1 - freqs)
            maf_filter = mafs > min_maf
            print 'Filtered out %d SNPs with MAF<%0.2f.' % (len(maf_filter) - sum(maf_filter), min_maf)
            snps = cg['raw_snps'][...]
            snps = snps[maf_filter]
            num_snps = len(snps)

            for chunk_i, i in enumerate(range(0, num_snps, chunk_size)):
                end_i = min(i + chunk_size, num_snps)
                x = snps[i:end_i]
                x = x.T
                x = (x - sp.mean(x, 0)) / sp.std(x, 0)
                x = x.T
                n_snps += len(x)
                k_mat += sp.dot(x.T, x)
                del x
                sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' %
                                 (100.0 * (min(1, ((chunk_i + 1.0) * chunk_size) / num_snps))))
                sys.stdout.flush()
            sys.stdout.write('\b\b\b\b\b\b\b100.00%\n')
        k_mat = k_mat / float(n_snps)

        # Normalize K so that the trace of the centered kinship equals n-1
        # (Gower normalization).
        c = sp.sum((sp.eye(len(k_mat)) - (1.0 / len(k_mat)) * sp.ones(k_mat.shape)) * sp.array(k_mat))
        scalar = (len(k_mat) - 1) / c
        print 'Kinship scaled by: %0.4f' % scalar
        k = scalar * k_mat
    else:
        assert 'kinship' in ih5f.keys(), 'Kinship is missing.  Please calculate that first!'
        k = ih5f['kinship']

    # Get the phenotypes
    phenotypes = ig['phenotypes'][...]

    # Initialize the mixed model
    lmm = lm.LinearMixedModel(phenotypes)
    lmm.add_random_effect(k)

    # Calculate pseudo-heritability, etc.
    print 'Calculating the eigenvalues of K'
    s0 = time.time()
    eig_L = lmm._get_eigen_L_()
    print 'Done.'
    print 'Took %0.2f seconds' % (time.time() - s0)

    print "Calculating the eigenvalues of S(K+I)S where S = I-X(X'X)^-1X'"
    s0 = time.time()
    eig_R = lmm._get_eigen_R_(X=lmm.X)
    print 'Done'
    print 'Took %0.2f seconds' % (time.time() - s0)

    print 'Getting variance estimates'
    s0 = time.time()
    res = lmm.get_estimates(eig_L, method='REML', eig_R=eig_R)  # Get the variance estimates..
    print 'Done.'
    print 'Took %0.2f seconds' % (time.time() - s0)
    print 'pseudo_heritability:', res['pseudo_heritability']

    # Initialize results file
    oh5f = h5py.File(out_file)

    # Store phenotype_data
    oh5f.create_dataset('pseudo_heritability', data=sp.array(res['pseudo_heritability']))
    oh5f.create_dataset('ve', data=sp.array(res['ve']))
    oh5f.create_dataset('vg', data=sp.array(res['vg']))
    oh5f.create_dataset('max_ll', data=sp.array(res['max_ll']))
    oh5f.create_dataset('num_snps', data=ih5f['num_snps'])

    # Construct results data containers
    chrom_res_group = oh5f.create_group('chrom_results')

    for chrom in gg.keys():
        crg = chrom_res_group.create_group(chrom)
        # Get the SNPs
        print 'Working on Chromosome: %s' % chrom
        freqs = gg[chrom]['freqs'][...]
        mafs = sp.minimum(freqs, 1 - freqs)
        maf_filter = mafs > min_maf
        print 'Filtered out %d SNPs with MAF<%0.2f.' % (len(maf_filter) - sum(maf_filter), min_maf)
        snps = gg[chrom]['raw_snps'][...]
        snps = snps[maf_filter]
        positions = gg[chrom]['positions'][...]
        positions = positions[maf_filter]

        # Now run EMMAX
        print "Running EMMAX"
        s1 = time.time()
        r = lmm._emmax_f_test_(snps, res['H_sqrt_inv'], with_betas=False, emma_num=0, eig_L=eig_L)
        secs = time.time() - s1
        if secs > 60:
            mins = int(secs) / 60
            secs = secs % 60
            print 'Took %d mins and %0.1f seconds.' % (mins, secs)
        else:
            print 'Took %0.1f seconds.' % (secs)
        crg.create_dataset('ps', data=r['ps'])
        crg.create_dataset('positions', data=positions)
        oh5f.flush()

    ih5f.close()
    oh5f.close()

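
# A compact sketch of the kinship normalization used above: with the centering
# matrix P = I - (1/n) * ones, the scale factor (n - 1) / tr(P K P) makes the
# centered kinship have trace n - 1 (Gower normalization).  Illustrative only;
# kinship.scale_k() in this codebase plays the same role.
import scipy as sp

def _scale_kinship_sketch_(K):
    n = len(K)
    P = sp.eye(n) - sp.ones((n, n)) / n
    c = sp.sum(P * sp.array(K))  # elementwise sum equals tr(P K) for symmetric K
    return ((n - 1) / c) * K
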
def run_emmax_perm(hdf5_filename='/home/bv25/data/Ls154/Ls154_12.hdf5',
                   out_file='/home/bv25/data/Ls154/Ls154_results_perm.hdf5',
                   min_maf=0.1, recalculate_kinship=True, chunk_size=1000,
                   num_perm=500):
    """
    Apply the EMMAX algorithm to hdf5 formatted genotype/phenotype data,
    followed by a permutation test for the genome-wide significance threshold.
    """
    ih5f = h5py.File(hdf5_filename)
    gg = ih5f['genot_data']
    ig = ih5f['indiv_data']
    n_indivs = len(ig['indiv_ids'][...])

    print 'Calculating kinship.'
    k_mat = sp.zeros((n_indivs, n_indivs), dtype='single')

    chromosomes = gg.keys()
    # chromosomes = chromosomes[-1:]
    n_snps = 0
    for chrom in chromosomes:
        print 'Working on Chromosome %s' % chrom
        cg = gg[chrom]
        freqs = cg['freqs'][...]
        mafs = sp.minimum(freqs, 1 - freqs)
        maf_filter = mafs > min_maf
        print 'Filtered out %d SNPs with MAF<%0.2f.' % (len(maf_filter) - sum(maf_filter), min_maf)
        snps = cg['raw_snps'][...]
        snps = snps[maf_filter]
        num_snps = len(snps)

        for chunk_i, i in enumerate(range(0, num_snps, chunk_size)):
            end_i = min(i + chunk_size, num_snps)
            x = snps[i:end_i]
            x = x.T
            x = (x - sp.mean(x, 0)) / sp.std(x, 0)
            x = x.T
            n_snps += len(x)
            k_mat += sp.dot(x.T, x)
            del x
            sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' %
                             (100.0 * (min(1, ((chunk_i + 1.0) * chunk_size) / num_snps))))
            sys.stdout.flush()
        sys.stdout.write('\b\b\b\b\b\b\b100.00%\n')
    k_mat = k_mat / float(n_snps)

    # Normalize K so that the trace of the centered kinship equals n-1
    # (Gower normalization).
    c = sp.sum((sp.eye(len(k_mat)) - (1.0 / len(k_mat)) * sp.ones(k_mat.shape)) * sp.array(k_mat))
    scalar = (len(k_mat) - 1) / c
    print 'Kinship scaled by: %0.4f' % scalar
    k = scalar * k_mat

    # Initialize the results file and store the kinship.
    oh5f = h5py.File(out_file)
    oh5f.create_dataset('kinship', data=k)
    oh5f.flush()

    # Count the MAF-filtered SNPs, overall and excluding the last chromosome.
    chromosomes = gg.keys()
    num_tot_snps = 0
    num_12_chr_snps = 0
    for chrom in chromosomes:
        cg = gg[chrom]
        freqs = cg['freqs'][...]
        mafs = sp.minimum(freqs, 1 - freqs)
        maf_filter = mafs > min_maf
        n_snps = sum(maf_filter)
        num_tot_snps += n_snps
        if chrom != chromosomes[-1]:
            num_12_chr_snps += n_snps

    # Get the phenotypes
    phenotypes = ig['phenotypes'][...]

    # Initialize the mixed model
    lmm = lm.LinearMixedModel(phenotypes)
    lmm.add_random_effect(k)

    # Calculate pseudo-heritability, etc.
    print 'Calculating the eigenvalues of K'
    s0 = time.time()
    eig_L = lmm._get_eigen_L_()
    print 'Done.'
    print 'Took %0.2f seconds' % (time.time() - s0)

    print "Calculating the eigenvalues of S(K+I)S where S = I-X(X'X)^-1X'"
    s0 = time.time()
    eig_R = lmm._get_eigen_R_(X=lmm.X)
    print 'Done'
    print 'Took %0.2f seconds' % (time.time() - s0)

    print 'Getting variance estimates'
    s0 = time.time()
    res = lmm.get_estimates(eig_L, method='REML', eig_R=eig_R)  # Get the variance estimates..
    print 'Done.'
    print 'Took %0.2f seconds' % (time.time() - s0)
    print 'pseudo_heritability:', res['pseudo_heritability']

    # Store phenotype_data
    oh5f.create_dataset('pseudo_heritability', data=sp.array(res['pseudo_heritability']))
    oh5f.create_dataset('ve', data=sp.array(res['ve']))
    oh5f.create_dataset('vg', data=sp.array(res['vg']))
    oh5f.create_dataset('max_ll', data=sp.array(res['max_ll']))
    # n_snps here would only hold the last chromosome's count; store the total.
    oh5f.create_dataset('num_snps', data=sp.array(num_tot_snps))

    # Construct results data containers
    chrom_res_group = oh5f.create_group('chrom_results')
    # all_snps = sp.empty((n_snps, n_indivs))
    chr12_snps = sp.empty((num_12_chr_snps, n_indivs))
    i = 0
    for chrom in gg.keys():
        crg = chrom_res_group.create_group(chrom)
        # Get the SNPs
        print 'Working on Chromosome: %s' % chrom
        freqs = gg[chrom]['freqs'][...]
        mafs = sp.minimum(freqs, 1 - freqs)
        maf_filter = mafs > min_maf
        print 'Filtered out %d SNPs with MAF<%0.2f.' % (len(maf_filter) - sum(maf_filter), min_maf)
        snps = gg[chrom]['raw_snps'][...]
        snps = snps[maf_filter]
        positions = gg[chrom]['positions'][...]
        positions = positions[maf_filter]
        n = len(snps)
        # all_snps[i:i + n] = snps
        if chrom != chromosomes[-1]:
            chr12_snps[i:i + n] = snps

        # Now run EMMAX
        print "Running EMMAX"
        s1 = time.time()
        r = lmm._emmax_f_test_(snps, res['H_sqrt_inv'], with_betas=False, emma_num=0, eig_L=eig_L)
        secs = time.time() - s1
        if secs > 60:
            mins = int(secs) / 60
            secs = secs % 60
            print 'Took %d mins and %0.1f seconds.' % (mins, secs)
        else:
            print 'Took %0.1f seconds.' % (secs)
        crg.create_dataset('ps', data=r['ps'])
        crg.create_dataset('positions', data=positions)
        oh5f.flush()
        i += n

    print 'Starting permutation test for detecting the genome-wide significance threshold'
    s1 = time.time()
    perm_res = lmm._emmax_permutations_(chr12_snps, k, res['H_sqrt_inv'], num_perm=num_perm)
    secs = time.time() - s1
    if secs > 60:
        mins = int(secs) / 60
        secs = secs % 60
        print 'Took %d mins and %0.1f seconds.' % (mins, secs)
    else:
        print 'Took %0.1f seconds.' % (secs)

    perm_res['min_ps'].sort()
    perm_res['max_f_stats'].sort()
    # Reverse so the largest F-statistics come first (the slicing alone does
    # not modify the array in place).
    perm_res['max_f_stats'] = perm_res['max_f_stats'][::-1]

    five_perc_i = int(num_perm / 20)
    print "The 0.05 genome-wide significance threshold is %0.4e, and the corresponding statistic is %0.4e." % (
        perm_res['min_ps'][five_perc_i], perm_res['max_f_stats'][five_perc_i])
    oh5f.create_dataset('perm_min_ps', data=perm_res['min_ps'])
    oh5f.create_dataset('perm_max_f_stats', data=perm_res['max_f_stats'])
    oh5f.create_dataset('five_perc_perm_min_ps', data=perm_res['min_ps'][five_perc_i])
    oh5f.create_dataset('five_perc_perm_max_f_stats', data=perm_res['max_f_stats'][five_perc_i])
    ih5f.close()
    oh5f.close()

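
# A hedged usage sketch: run the scan and then the permutation-based
# threshold on the same input file.  Paths are the module defaults above and
# will likely need adjusting; num_perm trades threshold accuracy for runtime.
if __name__ == '__main__':
    run_emmax(hdf5_filename='/home/bv25/data/Ls154/Ls154_12.hdf5',
              out_file='/home/bv25/data/Ls154/Ls154_results.hdf5',
              min_maf=0.1, recalculate_kinship=True)
    run_emmax_perm(num_perm=500)
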