def generate_test_data_w_sum_stats(h2=0.5, n=100000, n_sample=100, m=50000, model='gaussian',
                                   p=1.0, conseq_r2=0, m_ld_chunk_size=100):
    """
    Generate simulated summary statistics plus a matching test data set.

    The summary statistics come from simulate_beta_hats(); this function then
    simulates a separate set of test genotypes and builds test phenotypes
    from them using the simulated effects stored under 'betas'.

    Parameters h2, n, n_sample, m, model, p, conseq_r2 and m_ld_chunk_size
    are forwarded unchanged to simulate_beta_hats(); n_sample, m, conseq_r2
    and m_ld_chunk_size are also forwarded to
    genotypes.simulate_genotypes_w_ld(). h2 is additionally used here as the
    target variance of the genetic component of the test phenotype.

    Returns the object returned by simulate_beta_hats() with two extra
    entries: 'test_snps' and 'test_phen'.
    """
    # LD sample matrix, handed to simulate_beta_hats() as D_sample.
    # NOTE(review): the literal 200 is presumably the sample size used to
    # estimate LD -- confirm against genotypes.get_sample_D.
    ld_sample = genotypes.get_sample_D(200, conseq_r2=conseq_r2, m=m_ld_chunk_size)

    # Summary statistics (must contain a 'betas' entry, used below).
    sim = simulate_beta_hats(h2=h2, n=n, n_sample=n_sample, m=m, model=model, p=p,
                             conseq_r2=conseq_r2, m_ld_chunk_size=m_ld_chunk_size,
                             D_sample=ld_sample)

    # Test genotypes.
    held_out_snps = genotypes.simulate_genotypes_w_ld(n_sample=n_sample, m=m,
                                                      conseq_r2=conseq_r2,
                                                      m_ld_chunk_size=m_ld_chunk_size)
    sim['test_snps'] = held_out_snps

    # Environmental noise, rescaled so its sample variance is exactly 1 - h2.
    env_var = 1.0 - h2
    noise = stats.norm.rvs(0, sp.sqrt(env_var), size=n_sample)
    noise = sp.sqrt(env_var / sp.var(noise)) * noise

    # Genetic component, rescaled so its sample variance is exactly h2.
    # The transpose implies SNPs are indexed along the first axis of
    # held_out_snps (presumably shape (m, n_sample) -- verify).
    genetic = sp.dot(held_out_snps.T, sim['betas'])
    genetic = sp.sqrt(h2 / sp.var(genetic)) * genetic

    sim['test_phen'] = genetic + noise
    return sim
def generate_test_data_w_sum_stats(h2=0.5, n=100000, n_sample=100, m=50000, model='gaussian',
                                   p=1.0, conseq_r2=0, m_ld_chunk_size=100):
    """
    Generate simulated summary statistics plus a matching test data set.

    The summary statistics come from simulate_beta_hats(); this function then
    simulates a separate set of test genotypes and builds test phenotypes
    from them using the simulated effects stored under 'betas'.

    Parameters h2, n, n_sample, m, model, p, conseq_r2 and m_ld_chunk_size
    are forwarded unchanged to simulate_beta_hats(); n_sample, m, conseq_r2
    and m_ld_chunk_size are also forwarded to
    genotypes.simulate_genotypes_w_ld(). h2 is additionally used here as the
    target variance of the genetic component of the test phenotype.

    Returns the object returned by simulate_beta_hats() with two extra
    entries: 'test_snps' and 'test_phen'.
    """
    # LD sample matrix, handed to simulate_beta_hats() as D_sample.
    # NOTE(review): the literal 200 is presumably the sample size used to
    # estimate LD -- confirm against genotypes.get_sample_D.
    ld_sample = genotypes.get_sample_D(200, conseq_r2=conseq_r2, m=m_ld_chunk_size)

    # Summary statistics (must contain a 'betas' entry, used below).
    sim = simulate_beta_hats(h2=h2, n=n, n_sample=n_sample, m=m, model=model, p=p,
                             conseq_r2=conseq_r2, m_ld_chunk_size=m_ld_chunk_size,
                             D_sample=ld_sample)

    # Test genotypes.
    held_out_snps = genotypes.simulate_genotypes_w_ld(n_sample=n_sample, m=m,
                                                      conseq_r2=conseq_r2,
                                                      m_ld_chunk_size=m_ld_chunk_size)
    sim['test_snps'] = held_out_snps

    # Environmental noise, rescaled so its sample variance is exactly 1 - h2.
    env_var = 1.0 - h2
    noise = stats.norm.rvs(0, sp.sqrt(env_var), size=n_sample)
    noise = sp.sqrt(env_var / sp.var(noise)) * noise

    # Genetic component, rescaled so its sample variance is exactly h2.
    # The transpose implies SNPs are indexed along the first axis of
    # held_out_snps (presumably shape (m, n_sample) -- verify).
    genetic = sp.dot(held_out_snps.T, sim['betas'])
    genetic = sp.sqrt(h2 / sp.var(genetic)) * genetic

    sim['test_phen'] = genetic + noise
    return sim
def simulate_traits(n=1000, m=100, hdf5_file_prefix=None, hdf5_group=None, num_traits=1000,
                    h2=0.5, effect_prior='gaussian', p=1.0, conseq_ld=0, overwrite_hdf5=False,
                    test_n=1000, simulate_validation_traits=True):
    """
    Simulate traits: first simulate SNPs, then simulate the traits.

    Parameters:
        n: number of individuals (forwarded to the genotype simulator; also
           the length of each phenotype noise vector).
        m: number of SNPs.
        hdf5_file_prefix: if not None, results are written to
           '<prefix>_p_<p>.hdf5'.
        hdf5_group: used only when hdf5_file_prefix is None; an object with a
           create_dataset() method that receives the results.
        num_traits: number of traits to simulate.
        h2: target variance of the genetic component (noise gets 1 - h2).
        effect_prior: 'gaussian' or 'laplace'.
        p: fraction of SNPs with non-zero effect; 1.0 means all SNPs.
        conseq_ld: forwarded to the genotype simulator as `ld`.
        overwrite_hdf5: delete an existing output file before writing.
        test_n, simulate_validation_traits: accepted for interface
           compatibility; not used by this function.

    Returns:
        The list of simulated phenotype vectors (one per trait).

    Raises:
        ValueError: if effect_prior is not 'gaussian' or 'laplace'.
    """
    print("Using %d SNPs to simulate traits for %d individuals." % (m, n))
    genotype_dict = genotypes.simulate_genotypes_w_ld(n=n, m=m, ld=conseq_ld, return_ne=False,
                                                      ld_window_size=0)
    # sp.dot(snps, betas) below implies individuals along the first axis
    # (presumably shape (n, m) -- verify against the genotype simulator).
    snps = genotype_dict['X']

    betas_list = []
    betas_marg_list = []
    phen_list = []
    for i in range(num_traits):
        # Draw effect sizes; when p < 1 only the first M SNPs are non-zero.
        if effect_prior == 'gaussian':
            if p == 1.0:
                betas = stats.norm.rvs(0, sp.sqrt(h2 / m), size=m)
            else:
                M = int(round(m * p))
                betas = sp.concatenate((stats.norm.rvs(0, sp.sqrt(h2 / M), size=M),
                                        sp.zeros(m - M, dtype=float)))
        elif effect_prior == 'laplace':
            if p == 1.0:
                betas = stats.laplace.rvs(scale=sp.sqrt(h2 / (2 * m)), size=m)
            else:
                M = int(round(m * p))
                betas = sp.concatenate((stats.laplace.rvs(scale=sp.sqrt(h2 / (2 * M)), size=M),
                                        sp.zeros(m - M, dtype=float)))
        else:
            # Previously this fell through and failed with a NameError on
            # `betas` (or silently reused the previous trait's effects).
            raise ValueError("Unknown effect_prior %r (expected 'gaussian' or 'laplace')"
                             % (effect_prior,))

        # Rescale so that the sample variance of the effects is exactly h2 / m.
        betas_var = sp.var(betas)
        beta_scalar = sp.sqrt(h2 / (m * betas_var))
        betas = betas * beta_scalar
        betas_list.append(betas)

        # Noise and genetic parts are each rescaled to the exact target
        # sample variance (1 - h2 and h2 respectively).
        phen_noise = stats.norm.rvs(0, sp.sqrt(1.0 - h2), size=n)
        phen_noise = sp.sqrt((1.0 - h2) / sp.var(phen_noise)) * phen_noise
        genetic_part = sp.dot(snps, betas)
        genetic_part = sp.sqrt(h2 / sp.var(genetic_part)) * genetic_part
        train_phen = genetic_part + phen_noise
        print('Herit: %s' % (sp.var(genetic_part) / sp.var(train_phen),))
        phen_list.append(train_phen)

        # Marginal effect estimates: phenotype-genotype products averaged over n.
        betas_marg = (1. / n) * sp.dot(train_phen, snps)
        betas_marg_list.append(betas_marg)

        # In-place progress indicator.
        sys.stdout.write('\b\b\b\b\b\b\b%0.1f%%' % (100.0 * (float(i) / num_traits)))
        sys.stdout.flush()

    if hdf5_file_prefix is not None:
        hdf5_file = '%s_p_%0.4f.hdf5' % (hdf5_file_prefix, p)
        if os.path.isfile(hdf5_file):
            print('File already exists.')
            if overwrite_hdf5:
                print('Overwriting %s' % hdf5_file)
                os.remove(hdf5_file)
            else:
                print('Attempting to continue.')

        # Explicit append mode (the historical h5py default; newer h5py
        # defaults to read-only) and a context manager so the file is
        # always closed, even if create_dataset raises.
        with h5py.File(hdf5_file, 'a') as h5f:
            h5f.create_dataset('phenotypes', data=phen_list, compression='gzip')
            h5f.create_dataset('betas', data=betas_list, compression='gzip')
            h5f.create_dataset('betas_marg', data=betas_marg_list, compression='gzip')
    elif hdf5_group is not None:
        hdf5_group.create_dataset('phenotypes', data=phen_list, compression='gzip')
        hdf5_group.create_dataset('betas', data=betas_list, compression='gzip')
        hdf5_group.create_dataset('betas_marg', data=betas_marg_list, compression='gzip')
    else:
        print('Warning: No storage file given!')

    print('.')
    print("Done simulating data.")
    return phen_list
def simulate_plink_train_test_datasets(num_traits=1, n_sample=1000, p=0.001, m=10000, h2=0.1,
                                       adj_r2=0.9, m_ld_chunk_size=100, effect_prior='gaussian',
                                       out_prefix='/Users/bjarnivilhjalmsson/data/tmp/LDpred_data'):
    """
    Simulate genotypes and traits, split them into training and test sets,
    write both as fake Plink files, and write per-trait GWAS summary
    statistics computed on the training set.

    Parameters:
        num_traits: number of traits to simulate and write out.
        n_sample: number of individuals; the first fifth becomes the test
            set and the rest the training set.
        p, h2, effect_prior: forwarded to pt.simulate_traits_w_snps().
        m: number of SNPs.
        adj_r2: forwarded to the genotype simulator as conseq_r2.
        m_ld_chunk_size: forwarded to the genotype simulator.
        out_prefix: prefix of every output file.

    Output files (per trait index t):
        '<out_prefix>_p<p>_train_<t>', '<out_prefix>_p<p>_test_<t>' (prefixes
        passed to write_fake_plink_file) and '<out_prefix>_p<p>_ss_<t>.txt'.
    """
    # First simulate SNPs (w LD)
    snps = gt.simulate_genotypes_w_ld(n_sample=n_sample, m=m, conseq_r2=adj_r2,
                                      m_ld_chunk_size=m_ld_chunk_size, diploid=True, verbose=True)
    positions = list(range(m))

    # Debug peek at a few SNP rows.  Indices beyond the available rows are
    # skipped so that small simulations (<= 200 SNPs) no longer raise
    # IndexError here.
    print(' '.join(str(snps[j]) for j in (0, 100, 200) if j < len(snps)))

    # Simulate traits
    phen_dict = pt.simulate_traits_w_snps(snps, num_traits=num_traits, p=p, m=m, h2=h2,
                                          effect_prior=effect_prior, verbose=True,
                                          liability_thres=None)

    # Partition into training and test data (individuals are on axis 1).
    part_i = int(n_sample / 5.0)
    train_snps = snps[:, part_i:]
    test_snps = snps[:, :part_i]

    # Write out Plink files
    for t_i in range(num_traits):
        train_plink_prefix = '%s_p%0.3f_train_%d' % (out_prefix, p, t_i)
        test_plink_prefix = '%s_p%0.3f_test_%d' % (out_prefix, p, t_i)
        train_phens = phen_dict['phenotypes'][t_i][part_i:]
        test_phens = phen_dict['phenotypes'][t_i][:part_i]
        write_fake_plink_file(train_snps, train_plink_prefix, positions, phenotypes=train_phens)
        print('Done w Training file')
        write_fake_plink_file(test_snps, test_plink_prefix, positions, phenotypes=test_phens)
        print('Done w Testing file')

    # Conduct GWAS, and write out results.
    print('Normalizing genotypes')
    # Column vectors so they broadcast across individuals.
    snps_stds = sp.std(train_snps, axis=1).reshape(-1, 1)
    snps_means = sp.mean(train_snps, axis=1).reshape(-1, 1)
    snps = (train_snps - snps_means) / snps_stds

    for t_i in range(num_traits):
        ss_filename = '%s_p%0.3f_ss_%d.txt' % (out_prefix, p, t_i)
        train_phens = phen_dict['phenotypes'][t_i][part_i:]

        # NOTE(review): the phenotypes are used as-is; the F-statistic below
        # treats beta_hats as correlations, which presumably relies on the
        # phenotypes already being standardised -- confirm.
        n_training = len(train_phens)
        beta_hats = sp.dot(snps, train_phens) / n_training
        b2s = beta_hats ** 2
        f_stats = (n_training - 2) * b2s / (1 - b2s)
        pvals = stats.f.sf(f_stats, 1, n_training - 2)
        print('Median p-value is %0.3f, and mean p-value is %0.3f'
              % (sp.median(pvals), sp.mean(pvals)))

        # Output format example:
        #   chr     pos     ref     alt     reffrq  info    rs       pval    effalt
        #   chr1    1020428 C       T       0.85083 0.98732 rs6687776 0.0587 -0.0100048507289348
        #   chr1    1020496 G       A       0.85073 0.98751 rs6678318 0.1287 -0.00826075392985992
        with open(ss_filename, 'w') as f:
            f.write('chr     pos     ref     alt     reffrq  info    rs       pval    effalt\n')
            # The SNP index doubles as position and as the numeric part of the SNP id.
            for i, (eff, pval) in enumerate(zip(beta_hats, pvals)):
                f.write('chr1    %d    A    G    0.5    1    sid_%d    %0.6e    %0.6e\n'
                        % (i, i, pval, eff))
def simulate_traits(n=1000, m=100, hdf5_file_prefix=None, hdf5_group=None, num_traits=1000,
                    h2=0.5, effect_prior='gaussian', p=1.0, conseq_ld=0, overwrite_hdf5=False,
                    test_n=1000, simulate_validation_traits=True):
    """
    Simulate traits: first simulate SNPs, then simulate the traits.

    Parameters:
        n: number of individuals (forwarded to the genotype simulator; also
           the length of each phenotype noise vector).
        m: number of SNPs.
        hdf5_file_prefix: if not None, results are written to
           '<prefix>_p_<p>.hdf5'.
        hdf5_group: used only when hdf5_file_prefix is None; an object with a
           create_dataset() method that receives the results.
        num_traits: number of traits to simulate.
        h2: target variance of the genetic component (noise gets 1 - h2).
        effect_prior: 'gaussian' or 'laplace'.
        p: fraction of SNPs with non-zero effect; 1.0 means all SNPs.
        conseq_ld: forwarded to the genotype simulator as `ld`.
        overwrite_hdf5: delete an existing output file before writing.
        test_n, simulate_validation_traits: accepted for interface
           compatibility; not used by this function.

    Returns:
        The list of simulated phenotype vectors (one per trait).

    Raises:
        ValueError: if effect_prior is not 'gaussian' or 'laplace'.
    """
    print("Using %d SNPs to simulate traits for %d individuals." % (m, n))
    genotype_dict = genotypes.simulate_genotypes_w_ld(n=n, m=m, ld=conseq_ld, return_ne=False,
                                                      ld_window_size=0)
    # sp.dot(snps, betas) below implies individuals along the first axis
    # (presumably shape (n, m) -- verify against the genotype simulator).
    snps = genotype_dict['X']

    betas_list = []
    betas_marg_list = []
    phen_list = []
    for i in range(num_traits):
        # Draw effect sizes; when p < 1 only the first M SNPs are non-zero.
        if effect_prior == 'gaussian':
            if p == 1.0:
                betas = stats.norm.rvs(0, sp.sqrt(h2 / m), size=m)
            else:
                M = int(round(m * p))
                betas = sp.concatenate((stats.norm.rvs(0, sp.sqrt(h2 / M), size=M),
                                        sp.zeros(m - M, dtype=float)))
        elif effect_prior == 'laplace':
            if p == 1.0:
                betas = stats.laplace.rvs(scale=sp.sqrt(h2 / (2 * m)), size=m)
            else:
                M = int(round(m * p))
                betas = sp.concatenate((stats.laplace.rvs(scale=sp.sqrt(h2 / (2 * M)), size=M),
                                        sp.zeros(m - M, dtype=float)))
        else:
            # Previously this fell through and failed with a NameError on
            # `betas` (or silently reused the previous trait's effects).
            raise ValueError("Unknown effect_prior %r (expected 'gaussian' or 'laplace')"
                             % (effect_prior,))

        # Rescale so that the sample variance of the effects is exactly h2 / m.
        betas_var = sp.var(betas)
        beta_scalar = sp.sqrt(h2 / (m * betas_var))
        betas = betas * beta_scalar
        betas_list.append(betas)

        # Noise and genetic parts are each rescaled to the exact target
        # sample variance (1 - h2 and h2 respectively).
        phen_noise = stats.norm.rvs(0, sp.sqrt(1.0 - h2), size=n)
        phen_noise = sp.sqrt((1.0 - h2) / sp.var(phen_noise)) * phen_noise
        genetic_part = sp.dot(snps, betas)
        genetic_part = sp.sqrt(h2 / sp.var(genetic_part)) * genetic_part
        train_phen = genetic_part + phen_noise
        print('Herit: %s' % (sp.var(genetic_part) / sp.var(train_phen),))
        phen_list.append(train_phen)

        # Marginal effect estimates: phenotype-genotype products averaged over n.
        betas_marg = (1. / n) * sp.dot(train_phen, snps)
        betas_marg_list.append(betas_marg)

        # In-place progress indicator.
        sys.stdout.write('\b\b\b\b\b\b\b%0.1f%%' % (100.0 * (float(i) / num_traits)))
        sys.stdout.flush()

    if hdf5_file_prefix is not None:
        hdf5_file = '%s_p_%0.4f.hdf5' % (hdf5_file_prefix, p)
        if os.path.isfile(hdf5_file):
            print('File already exists.')
            if overwrite_hdf5:
                print('Overwriting %s' % hdf5_file)
                os.remove(hdf5_file)
            else:
                print('Attempting to continue.')

        # Explicit append mode (the historical h5py default; newer h5py
        # defaults to read-only) and a context manager so the file is
        # always closed, even if create_dataset raises.
        with h5py.File(hdf5_file, 'a') as h5f:
            h5f.create_dataset('phenotypes', data=phen_list, compression='gzip')
            h5f.create_dataset('betas', data=betas_list, compression='gzip')
            h5f.create_dataset('betas_marg', data=betas_marg_list, compression='gzip')
    elif hdf5_group is not None:
        hdf5_group.create_dataset('phenotypes', data=phen_list, compression='gzip')
        hdf5_group.create_dataset('betas', data=betas_list, compression='gzip')
        hdf5_group.create_dataset('betas_marg', data=betas_marg_list, compression='gzip')
    else:
        print('Warning: No storage file given!')

    print('.')
    print("Done simulating data.")
    return phen_list