def calculate_ld_tables(input_genotype_file, chrom_i, local_ld_hdf5_file, ld_radius,
                        min_r2=0.2, maf_thres=0.01, indiv_filter=None, snp_filter=None,
                        return_void=True, verbose=True):
    """
    Calculate the LD tables for the given radius, and store in the given file.

    If ``local_ld_hdf5_file`` does not exist yet, the genotypes of chromosome
    ``chrom_i`` are loaded from ``input_genotype_file``, the LD table is
    computed with ``get_ld_table`` and the result (plus the mean of its
    ``'ld_scores'`` entry, stored under ``'avg_ld_score'``) is written to
    ``local_ld_hdf5_file``.  If the file already exists it is used as a cache
    and loaded instead of being recomputed.

    Parameters:
        input_genotype_file: Path of the genotype HDF5 file (opened with h5py).
        chrom_i: Chromosome number, forwarded to ``kgenome.get_genotype_data``.
        local_ld_hdf5_file: Path of the HDF5 file the LD information is
            written to / read from.
        ld_radius: LD radius forwarded to ``get_ld_table``.
        min_r2: Forwarded to ``get_ld_table``.
        maf_thres: Forwarded to ``kgenome.get_genotype_data``.
        indiv_filter: Forwarded to ``kgenome.get_genotype_data``.
        snp_filter: Forwarded to ``kgenome.get_genotype_data``.
        return_void: If True (default) nothing is returned; if False the LD
            dictionary is returned.
        verbose: Forwarded to ``get_ld_table``.

    Returns:
        The LD dictionary when ``return_void`` is False, otherwise None.
    """
    # NOTE: every print below takes exactly one argument, so it produces the
    # same output as a Python 2 print statement and as the Python 3 function.
    if not os.path.isfile(local_ld_hdf5_file):
        h5f = h5py.File(input_genotype_file)
        try:
            print('Calculating LD information for chromosome %d w. radius %d' % (chrom_i, ld_radius))
            g_dict = kgenome.get_genotype_data(h5f, chrom_i, maf_thres,
                                               indiv_filter=indiv_filter,
                                               snp_filter=snp_filter,
                                               randomize_sign=False, snps_signs=None)
            ld_dict = get_ld_table(g_dict['norm_snps'], ld_radius=ld_radius,
                                   min_r2=min_r2, verbose=verbose)
            ld_dict['avg_ld_score'] = sp.mean(ld_dict['ld_scores'])
            print('Done calculating the LD table and LD score, writing to file: %s' % local_ld_hdf5_file)
            oh5f = h5py.File(local_ld_hdf5_file, 'w')
            try:
                hu.dict_to_hdf5(ld_dict, oh5f)
            finally:
                # Close the output even if serialisation fails.
                oh5f.close()
            print('LD information is now stored.')
        finally:
            # The genotype file used to be left open; release it only after
            # everything derived from it has been computed and written.
            h5f.close()
    else:
        print('Loading LD information from file: %s' % local_ld_hdf5_file)
        oh5f = h5py.File(local_ld_hdf5_file, 'r')
        try:
            ld_dict = hu.hdf5_to_dict(oh5f)
        finally:
            oh5f.close()

    if not return_void:
        return ld_dict
def calc_pc_snp_weights(input_file='/project/PCMA/faststorage/1_DATA/1k_genomes/1K_genomes_phase3_EUR_unrelated.hdf5',
                        pc_file='/project/PCMA/faststorage/1_DATA/1k_genomes/1kgenomes_kinship_pca_f0.95.hdf5',
                        out_file='/project/PCMA/faststorage/1_DATA/1k_genomes/pc_snp_weights_top20.hdf5',
                        snp_filter_frac=1, maf_thres=0.01, num_pcs=20):
    """
    Compute per-chromosome SNP weights on the top principal components.

    For each chromosome 1-22 the normalized genotypes are loaded from
    ``input_file``, the leave-one-out eigenvectors/eigenvalues are read from
    ``pc_file``, the ``num_pcs`` eigenvectors with the largest eigenvalues are
    centered and scaled to unit standard deviation, and the product
    ``dot(norm_snps, norm_pcs) / num_indivs`` is written to ``out_file``.

    Each chromosome group ``'chr<N>'`` in ``out_file`` receives the datasets
    ``'snp_pc_weights'``, ``'pcs_var_expl'`` and ``'snp_ids'``.

    Parameters:
        input_file: Path of the genotype HDF5 file.
        pc_file: Path of the HDF5 file holding ``'evecs_leave_one_out'`` and
            ``'evals_leave_one_out'`` for every chromosome group.
        out_file: Path of the HDF5 file to create (opened with mode ``'w'``,
            so an existing file is overwritten).
        snp_filter_frac: If below 1, a random boolean SNP mask is drawn in
            which each SNP is kept with this probability.
        maf_thres: Forwarded to ``kgenome.get_genotype_data``.
        num_pcs: Number of leading principal components to use.

    Returns:
        None.  Results are written to ``out_file``.
    """
    # Track every file we manage to open so that all of them are closed, even
    # when a later open or any computation raises.
    opened = []
    try:
        pcs_h5f = h5py.File(pc_file)
        opened.append(pcs_h5f)
        # The original message ended after "from " without naming the file.
        print('Loading Genotype from %s' % input_file)
        in_h5f = h5py.File(input_file)
        opened.append(in_h5f)
        out_h5f = h5py.File(out_file, 'w')
        opened.append(out_h5f)

        indiv_ids = in_h5f['indiv_ids'][...]
        indiv_filter = None  # no individual filtering is applied
        assert len(sp.unique(indiv_ids)) == len(indiv_ids), 'Individual IDs are not unique'
        num_indivs = len(indiv_ids)

        for chrom in range(1, 23):
            print('Working on Chromosome %d' % chrom)
            chrom_str = 'chr%d' % chrom

            snp_filter = None
            if snp_filter_frac < 1:
                snp_filter = sp.random.random(len(in_h5f[chrom_str]['snps'])) < snp_filter_frac

            g_dict = kgenome.get_genotype_data(in_h5f, chrom, maf_thres,
                                               indiv_filter=indiv_filter,
                                               snp_filter=snp_filter,
                                               randomize_sign=True, snps_signs=None,
                                               return_snps_info=True)
            norm_snps = g_dict['norm_snps']
            snp_ids = g_dict['snp_ids']

            evecs = pcs_h5f[chrom_str]['evecs_leave_one_out'][...]
            evals = pcs_h5f[chrom_str]['evals_leave_one_out'][...]

            # Order components by decreasing eigenvalue.
            sort_indices = sp.argsort(evals)[::-1]
            ordered_evals = evals[sort_indices]
            # Variance explained is reported for ALL components, not only the
            # top ``num_pcs`` ones.
            pcs_var_expl = sp.array(ordered_evals / sp.sum(ordered_evals), dtype='double')
            pcs = evecs[:, sort_indices][:, :num_pcs]

            # Center and scale every retained component.
            norm_pcs = pcs - sp.mean(pcs, axis=0)
            pcs_std = sp.std(norm_pcs, axis=0)
            norm_pcs = norm_pcs / pcs_std

            cg = out_h5f.create_group(chrom_str)
            # NOTE(review): assumes norm_snps is (num_snps x num_indivs) and the
            # eigenvector rows are individuals in the same order - confirm.
            cg.create_dataset('snp_pc_weights', data=sp.dot(norm_snps, norm_pcs) / num_indivs)
            cg.create_dataset('pcs_var_expl', data=pcs_var_expl)
            cg.create_dataset('snp_ids', data=snp_ids)
            out_h5f.flush()
    finally:
        for h5_file in reversed(opened):
            h5_file.close()
def calculate_ld_tables(input_genotype_file, chrom_i, local_ld_hdf5_file, ld_radius,
                        min_r2=0.2, maf_thres=0.01, indiv_filter=None, snp_filter=None,
                        return_void=True, verbose=True):
    """
    Calculate the LD tables for the given radius, and store in the given file.

    If ``local_ld_hdf5_file`` does not exist yet, the genotypes of chromosome
    ``chrom_i`` are loaded from ``input_genotype_file``, the LD table is
    computed with ``get_ld_table`` and the result (plus the mean of its
    ``'ld_scores'`` entry, stored under ``'avg_ld_score'``) is written to
    ``local_ld_hdf5_file``.  If the file already exists it is used as a cache
    and loaded instead of being recomputed.

    Parameters:
        input_genotype_file: Path of the genotype HDF5 file (opened with h5py).
        chrom_i: Chromosome number, forwarded to ``kgenome.get_genotype_data``.
        local_ld_hdf5_file: Path of the HDF5 file the LD information is
            written to / read from.
        ld_radius: LD radius forwarded to ``get_ld_table``.
        min_r2: Forwarded to ``get_ld_table``.
        maf_thres: Forwarded to ``kgenome.get_genotype_data``.
        indiv_filter: Forwarded to ``kgenome.get_genotype_data``.
        snp_filter: Forwarded to ``kgenome.get_genotype_data``.
        return_void: If True (default) nothing is returned; if False the LD
            dictionary is returned.
        verbose: Forwarded to ``get_ld_table``.

    Returns:
        The LD dictionary when ``return_void`` is False, otherwise None.
    """
    # NOTE: every print below takes exactly one argument, so it produces the
    # same output as a Python 2 print statement and as the Python 3 function.
    if not os.path.isfile(local_ld_hdf5_file):
        h5f = h5py.File(input_genotype_file)
        try:
            print('Calculating LD information for chromosome %d w. radius %d' % (chrom_i, ld_radius))
            g_dict = kgenome.get_genotype_data(h5f, chrom_i, maf_thres,
                                               indiv_filter=indiv_filter,
                                               snp_filter=snp_filter,
                                               randomize_sign=False, snps_signs=None)
            ld_dict = get_ld_table(g_dict['norm_snps'], ld_radius=ld_radius,
                                   min_r2=min_r2, verbose=verbose)
            ld_dict['avg_ld_score'] = sp.mean(ld_dict['ld_scores'])
            print('Done calculating the LD table and LD score, writing to file: %s' % local_ld_hdf5_file)
            oh5f = h5py.File(local_ld_hdf5_file, 'w')
            try:
                hu.dict_to_hdf5(ld_dict, oh5f)
            finally:
                # Close the output even if serialisation fails.
                oh5f.close()
            print('LD information is now stored.')
        finally:
            # The genotype file used to be left open; release it only after
            # everything derived from it has been computed and written.
            h5f.close()
    else:
        print('Loading LD information from file: %s' % local_ld_hdf5_file)
        oh5f = h5py.File(local_ld_hdf5_file, 'r')
        try:
            ld_dict = hu.hdf5_to_dict(oh5f)
        finally:
            oh5f.close()

    if not return_void:
        return ld_dict
def calc_pc_snp_weights(
    input_file="/project/PCMA/faststorage/1_DATA/1k_genomes/1K_genomes_phase3_EUR_unrelated.hdf5",
    pc_file="/project/PCMA/faststorage/1_DATA/1k_genomes/1kgenomes_kinship_pca_f0.95.hdf5",
    out_file="/project/PCMA/faststorage/1_DATA/1k_genomes/pc_snp_weights_top20.hdf5",
    snp_filter_frac=1,
    maf_thres=0.01,
    num_pcs=20,
):
    """
    Compute per-chromosome SNP weights on the top principal components.

    For each chromosome 1-22 the normalized genotypes are loaded from
    ``input_file``, the leave-one-out eigenvectors/eigenvalues are read from
    ``pc_file``, the ``num_pcs`` eigenvectors with the largest eigenvalues are
    centered and scaled to unit standard deviation, and the product
    ``dot(norm_snps, norm_pcs) / num_indivs`` is written to ``out_file``.

    Each chromosome group ``"chr<N>"`` in ``out_file`` receives the datasets
    ``"snp_pc_weights"``, ``"pcs_var_expl"`` and ``"snp_ids"``.

    Parameters:
        input_file: Path of the genotype HDF5 file.
        pc_file: Path of the HDF5 file holding ``"evecs_leave_one_out"`` and
            ``"evals_leave_one_out"`` for every chromosome group.
        out_file: Path of the HDF5 file to create (opened with mode ``"w"``,
            so an existing file is overwritten).
        snp_filter_frac: If below 1, a random boolean SNP mask is drawn in
            which each SNP is kept with this probability.
        maf_thres: Forwarded to ``kgenome.get_genotype_data``.
        num_pcs: Number of leading principal components to use.

    Returns:
        None.  Results are written to ``out_file``.
    """
    # Track every file we manage to open so that all of them are closed, even
    # when a later open or any computation raises.
    opened = []
    try:
        pcs_h5f = h5py.File(pc_file)
        opened.append(pcs_h5f)
        # The original message ended after "from " without naming the file.
        print("Loading Genotype from %s" % input_file)
        in_h5f = h5py.File(input_file)
        opened.append(in_h5f)
        out_h5f = h5py.File(out_file, "w")
        opened.append(out_h5f)

        indiv_ids = in_h5f["indiv_ids"][...]
        indiv_filter = None  # no individual filtering is applied
        assert len(sp.unique(indiv_ids)) == len(indiv_ids), "Individual IDs are not unique"
        num_indivs = len(indiv_ids)

        for chrom in range(1, 23):
            print("Working on Chromosome %d" % chrom)
            chrom_str = "chr%d" % chrom

            snp_filter = None
            if snp_filter_frac < 1:
                snp_filter = sp.random.random(len(in_h5f[chrom_str]["snps"])) < snp_filter_frac

            g_dict = kgenome.get_genotype_data(
                in_h5f,
                chrom,
                maf_thres,
                indiv_filter=indiv_filter,
                snp_filter=snp_filter,
                randomize_sign=True,
                snps_signs=None,
                return_snps_info=True,
            )
            norm_snps = g_dict["norm_snps"]
            snp_ids = g_dict["snp_ids"]

            evecs = pcs_h5f[chrom_str]["evecs_leave_one_out"][...]
            evals = pcs_h5f[chrom_str]["evals_leave_one_out"][...]

            # Order components by decreasing eigenvalue.
            sort_indices = sp.argsort(evals)[::-1]
            ordered_evals = evals[sort_indices]
            # Variance explained is reported for ALL components, not only the
            # top ``num_pcs`` ones.
            pcs_var_expl = sp.array(ordered_evals / sp.sum(ordered_evals), dtype="double")
            pcs = evecs[:, sort_indices][:, :num_pcs]

            # Center and scale every retained component.
            norm_pcs = pcs - sp.mean(pcs, axis=0)
            pcs_std = sp.std(norm_pcs, axis=0)
            norm_pcs = norm_pcs / pcs_std

            cg = out_h5f.create_group(chrom_str)
            # NOTE(review): assumes norm_snps is (num_snps x num_indivs) and the
            # eigenvector rows are individuals in the same order - confirm.
            cg.create_dataset("snp_pc_weights", data=sp.dot(norm_snps, norm_pcs) / num_indivs)
            cg.create_dataset("pcs_var_expl", data=pcs_var_expl)
            cg.create_dataset("snp_ids", data=snp_ids)
            out_h5f.flush()
    finally:
        for h5_file in reversed(opened):
            h5_file.close()
def generate_1k_LD_scores(input_genotype_file, chrom_snp_trans_mats, maf_thres=0.01,
                          ld_radius=200, debug_filter_frac=0.01, indiv_filter=None,
                          snp_filter=None):
    """
    Generates 1k genomes LD scores and returns them in a dictionary.

    For each chromosome 1-22 the normalized genotypes are loaded and LD scores
    are computed with ``ld.get_ld_scores``.  If ``chrom_snp_trans_mats`` is
    given, the genotypes are additionally multiplied by the transpose of the
    chromosome's transformation matrix, re-normalized per SNP, and a second
    ("population-structure adjusted") set of LD scores is computed.

    Parameters:
        input_genotype_file: Path of the genotype HDF5 file.
        chrom_snp_trans_mats: Mapping from ``'chr<N>'`` to a transformation
            matrix, or None to skip the structure adjustment.
        maf_thres: Forwarded to ``kgenome.get_genotype_data``.
        ld_radius: Forwarded to ``ld.get_ld_scores``.
        debug_filter_frac: Forwarded to ``kgenome.get_genotype_data``.
        indiv_filter: Forwarded to ``kgenome.get_genotype_data``.
        snp_filter: Forwarded to ``kgenome.get_genotype_data``.

    Returns:
        A dict with the keys ``'avg_gw_ld_score'``,
        ``'avg_gw_struct_adj_ld_score'`` and ``'chrom_dict'``.  ``'chrom_dict'``
        maps ``'chr<N>'`` to the genotype dictionary of that chromosome (minus
        ``'norm_snps'``, ``'snp_means'`` and ``'snp_stds'``) extended with
        ``'ld_scores'``/``'avg_ld_score'`` and, when adjusted,
        ``'struct_adj_ld_scores'``/``'avg_struct_adj_ld_score'``.  When
        ``chrom_snp_trans_mats`` is None no adjustment is applied and
        ``'avg_gw_struct_adj_ld_score'`` equals ``'avg_gw_ld_score'``.
    """
    chrom_ld_scores_dict = {}
    ld_score_sum = 0
    struct_adj_ld_score_sum = 0
    num_snps = 0
    print('Calculating LD information w. radius %d' % ld_radius)
    in_h5f = h5py.File(input_genotype_file)
    try:
        print('Calculating local LD')
        for chrom in range(1, 23):
            print('Working on Chromosome %d' % chrom)
            chrom_str = 'chr%d' % chrom

            print('Loading SNPs')
            g_dict = kgenome.get_genotype_data(in_h5f, chrom, maf_thres,
                                               indiv_filter=indiv_filter,
                                               snp_filter=snp_filter,
                                               randomize_sign=False, snps_signs=None,
                                               return_snps_info=True,
                                               debug_filter_frac=debug_filter_frac)
            norm_snps = g_dict['norm_snps']

            ret_dict = ld.get_ld_scores(norm_snps, ld_radius=ld_radius)
            avg_ld_score = sp.mean(ret_dict['ld_scores'])
            g_dict['ld_scores'] = ret_dict['ld_scores']
            g_dict['avg_ld_score'] = avg_ld_score
            ld_score_sum += sp.sum(ret_dict['ld_scores'])
            print('Un-adjusted average LD score was: %0.3f' % avg_ld_score)

            if chrom_snp_trans_mats is not None:
                snp_trans_mat = chrom_snp_trans_mats[chrom_str]
                norm_snps = sp.dot(norm_snps, snp_trans_mat.T)

                # Re-normalize every SNP (row) after the transformation.
                snp_means = sp.mean(norm_snps, 1)
                snp_means.shape = (len(snp_means), 1)
                snp_stds = sp.std(norm_snps, 1)
                snp_stds.shape = (len(snp_stds), 1)
                norm_snps = sp.array((norm_snps - snp_means) / snp_stds)

                ret_dict = ld.get_ld_scores(norm_snps, ld_radius=ld_radius)
                avg_ld_score = sp.mean(ret_dict['ld_scores'])
                print('Pop-structure adjusted average LD score was: %0.3f' % avg_ld_score)
                g_dict['struct_adj_ld_scores'] = ret_dict['ld_scores']
                g_dict['avg_struct_adj_ld_score'] = avg_ld_score
                struct_adj_ld_score_sum += sp.sum(ret_dict['ld_scores'])

            # Drop the large arrays before keeping the per-chromosome results.
            del g_dict['norm_snps']
            del g_dict['snp_means']
            del g_dict['snp_stds']
            chrom_ld_scores_dict[chrom_str] = g_dict
            num_snps += len(norm_snps)
    finally:
        # NOTE(review): assumes kgenome.get_genotype_data returns in-memory
        # arrays (not lazy HDF5 datasets), so the returned dictionaries stay
        # valid after the file is closed - confirm.
        in_h5f.close()

    avg_gw_ld_score = ld_score_sum / float(num_snps)
    if chrom_snp_trans_mats is not None:
        # Previously the un-adjusted sum was used here by mistake.
        avg_gw_struct_adj_ld_score = struct_adj_ld_score_sum / float(num_snps)
    else:
        # Without a transformation the adjusted score is the un-adjusted one.
        avg_gw_struct_adj_ld_score = avg_gw_ld_score

    ld_scores_dict = {
        'avg_gw_ld_score': avg_gw_ld_score,
        'avg_gw_struct_adj_ld_score': avg_gw_struct_adj_ld_score,
        'chrom_dict': chrom_ld_scores_dict,
    }

    print('Done calculating the LD table and LD scores.')
    return ld_scores_dict
def generate_1k_LD_scores(
    input_genotype_file,
    chrom_snp_trans_mats,
    maf_thres=0.01,
    ld_radius=200,
    debug_filter_frac=0.01,
    indiv_filter=None,
    snp_filter=None,
):
    """
    Generates 1k genomes LD scores and returns them in a dictionary.

    For each chromosome 1-22 the normalized genotypes are loaded and LD scores
    are computed with ``ld.get_ld_scores``.  If ``chrom_snp_trans_mats`` is
    given, the genotypes are additionally multiplied by the transpose of the
    chromosome's transformation matrix, re-normalized per SNP, and a second
    ("population-structure adjusted") set of LD scores is computed.

    Parameters:
        input_genotype_file: Path of the genotype HDF5 file.
        chrom_snp_trans_mats: Mapping from ``"chr<N>"`` to a transformation
            matrix, or None to skip the structure adjustment.
        maf_thres: Forwarded to ``kgenome.get_genotype_data``.
        ld_radius: Forwarded to ``ld.get_ld_scores``.
        debug_filter_frac: Forwarded to ``kgenome.get_genotype_data``.
        indiv_filter: Forwarded to ``kgenome.get_genotype_data``.
        snp_filter: Forwarded to ``kgenome.get_genotype_data``.

    Returns:
        A dict with the keys ``"avg_gw_ld_score"``,
        ``"avg_gw_struct_adj_ld_score"`` and ``"chrom_dict"``.  ``"chrom_dict"``
        maps ``"chr<N>"`` to the genotype dictionary of that chromosome (minus
        ``"norm_snps"``, ``"snp_means"`` and ``"snp_stds"``) extended with
        ``"ld_scores"``/``"avg_ld_score"`` and, when adjusted,
        ``"struct_adj_ld_scores"``/``"avg_struct_adj_ld_score"``.  When
        ``chrom_snp_trans_mats`` is None no adjustment is applied and
        ``"avg_gw_struct_adj_ld_score"`` equals ``"avg_gw_ld_score"``.
    """
    chrom_ld_scores_dict = {}
    ld_score_sum = 0
    struct_adj_ld_score_sum = 0
    num_snps = 0
    print("Calculating LD information w. radius %d" % ld_radius)
    in_h5f = h5py.File(input_genotype_file)
    try:
        print("Calculating local LD")
        for chrom in range(1, 23):
            print("Working on Chromosome %d" % chrom)
            chrom_str = "chr%d" % chrom

            print("Loading SNPs")
            g_dict = kgenome.get_genotype_data(
                in_h5f,
                chrom,
                maf_thres,
                indiv_filter=indiv_filter,
                snp_filter=snp_filter,
                randomize_sign=False,
                snps_signs=None,
                return_snps_info=True,
                debug_filter_frac=debug_filter_frac,
            )
            norm_snps = g_dict["norm_snps"]

            ret_dict = ld.get_ld_scores(norm_snps, ld_radius=ld_radius)
            avg_ld_score = sp.mean(ret_dict["ld_scores"])
            g_dict["ld_scores"] = ret_dict["ld_scores"]
            g_dict["avg_ld_score"] = avg_ld_score
            ld_score_sum += sp.sum(ret_dict["ld_scores"])
            print("Un-adjusted average LD score was: %0.3f" % avg_ld_score)

            if chrom_snp_trans_mats is not None:
                snp_trans_mat = chrom_snp_trans_mats[chrom_str]
                norm_snps = sp.dot(norm_snps, snp_trans_mat.T)

                # Re-normalize every SNP (row) after the transformation.
                snp_means = sp.mean(norm_snps, 1)
                snp_means.shape = (len(snp_means), 1)
                snp_stds = sp.std(norm_snps, 1)
                snp_stds.shape = (len(snp_stds), 1)
                norm_snps = sp.array((norm_snps - snp_means) / snp_stds)

                ret_dict = ld.get_ld_scores(norm_snps, ld_radius=ld_radius)
                avg_ld_score = sp.mean(ret_dict["ld_scores"])
                print("Pop-structure adjusted average LD score was: %0.3f" % avg_ld_score)
                g_dict["struct_adj_ld_scores"] = ret_dict["ld_scores"]
                g_dict["avg_struct_adj_ld_score"] = avg_ld_score
                struct_adj_ld_score_sum += sp.sum(ret_dict["ld_scores"])

            # Drop the large arrays before keeping the per-chromosome results.
            del g_dict["norm_snps"]
            del g_dict["snp_means"]
            del g_dict["snp_stds"]
            chrom_ld_scores_dict[chrom_str] = g_dict
            num_snps += len(norm_snps)
    finally:
        # NOTE(review): assumes kgenome.get_genotype_data returns in-memory
        # arrays (not lazy HDF5 datasets), so the returned dictionaries stay
        # valid after the file is closed - confirm.
        in_h5f.close()

    avg_gw_ld_score = ld_score_sum / float(num_snps)
    if chrom_snp_trans_mats is not None:
        # Previously the un-adjusted sum was used here by mistake.
        avg_gw_struct_adj_ld_score = struct_adj_ld_score_sum / float(num_snps)
    else:
        # Without a transformation the adjusted score is the un-adjusted one.
        avg_gw_struct_adj_ld_score = avg_gw_ld_score

    ld_scores_dict = {
        "avg_gw_ld_score": avg_gw_ld_score,
        "avg_gw_struct_adj_ld_score": avg_gw_struct_adj_ld_score,
        "chrom_dict": chrom_ld_scores_dict,
    }

    print("Done calculating the LD table and LD scores.")
    return ld_scores_dict