def calculate_ld_tables(input_genotype_file, chrom_i, local_ld_hdf5_file, ld_radius, min_r2=0.2, maf_thres=0.01, indiv_filter=None, snp_filter=None, return_void=True, verbose=True): """ Calculate the LD tables for the given radius, and store in the given file. """ if not os.path.isfile(local_ld_hdf5_file): h5f = h5py.File(input_genotype_file) print 'Calculating LD information for chromosome %d w. radius %d' % (chrom_i, ld_radius) g_dict = kgenome.get_genotype_data(h5f, chrom_i, maf_thres, indiv_filter=indiv_filter, snp_filter=snp_filter, randomize_sign=False, snps_signs=None) ld_dict = get_ld_table(g_dict['norm_snps'], ld_radius=ld_radius, min_r2=min_r2, verbose=verbose) ld_dict['avg_ld_score'] = sp.mean(ld_dict['ld_scores']) print 'Done calculating the LD table and LD score, writing to file:', local_ld_hdf5_file oh5f = h5py.File(local_ld_hdf5_file, 'w') hu.dict_to_hdf5(ld_dict, oh5f) oh5f.close() print 'LD information is now stored.' else: print 'Loading LD information from file: %s' % local_ld_hdf5_file oh5f = h5py.File(local_ld_hdf5_file, 'r') ld_dict = hu.hdf5_to_dict(oh5f) oh5f.close() if not return_void: return ld_dict
def calculate_ld_tables(input_genotype_file, chrom_i, local_ld_hdf5_file, ld_radius, min_r2=0.2, maf_thres=0.01, indiv_filter=None, snp_filter=None, return_void=True, verbose=True): """ Calculate the LD tables for the given radius, and store in the given file. """ if not os.path.isfile(local_ld_hdf5_file): h5f = h5py.File(input_genotype_file) print 'Calculating LD information for chromosome %d w. radius %d' % ( chrom_i, ld_radius) g_dict = kgenome.get_genotype_data(h5f, chrom_i, maf_thres, indiv_filter=indiv_filter, snp_filter=snp_filter, randomize_sign=False, snps_signs=None) ld_dict = get_ld_table(g_dict['norm_snps'], ld_radius=ld_radius, min_r2=min_r2, verbose=verbose) ld_dict['avg_ld_score'] = sp.mean(ld_dict['ld_scores']) print 'Done calculating the LD table and LD score, writing to file:', local_ld_hdf5_file oh5f = h5py.File(local_ld_hdf5_file, 'w') hu.dict_to_hdf5(ld_dict, oh5f) oh5f.close() print 'LD information is now stored.' else: print 'Loading LD information from file: %s' % local_ld_hdf5_file oh5f = h5py.File(local_ld_hdf5_file, 'r') ld_dict = hu.hdf5_to_dict(oh5f) oh5f.close() if not return_void: return ld_dict
def calculate( input_genotype_file, input_ld_pruned_genotype_file, ld_score_file, kinship_pca_file, ld_radius=200, maf_thres=0.01, snp_filter_frac=0.05, debug_filter_frac=1, ): """ Generates population structure adjusted 1k genomes LD scores and stores in the given file. """ # Kinship kinship_pca_dict = kgenome.get_kinship_pca_dict( input_ld_pruned_genotype_file, kinship_pca_file, maf_thres=maf_thres, snp_filter_frac=snp_filter_frac ) chrom_snp_trans_mats = {} for chrom in range(1, 23): print "Working on Chromosome %d" % chrom chrom_str = "chr%d" % chrom chrom_snp_trans_mats[chrom_str] = kinship_pca_dict[chrom_str]["cholesky_decomp_inv_snp_cov"] # bla lds_dict = generate_1k_LD_scores( input_genotype_file, chrom_snp_trans_mats, maf_thres=maf_thres, ld_radius=ld_radius, debug_filter_frac=debug_filter_frac, ) # Store LD scores lds_h5f = h5py.File(ld_score_file) hu.dict_to_hdf5(lds_dict, lds_h5f) lds_h5f.close()
def calculate(input_genotype_file, input_ld_pruned_genotype_file, ld_score_file, kinship_pca_file, ld_radius=200, maf_thres=0.01, snp_filter_frac=0.05, debug_filter_frac=1): """ Generates population structure adjusted 1k genomes LD scores and stores in the given file. """ # Kinship kinship_pca_dict = kgenome.get_kinship_pca_dict( input_ld_pruned_genotype_file, kinship_pca_file, maf_thres=maf_thres, snp_filter_frac=snp_filter_frac) chrom_snp_trans_mats = {} for chrom in range(1, 23): print 'Working on Chromosome %d' % chrom chrom_str = 'chr%d' % chrom chrom_snp_trans_mats[chrom_str] = kinship_pca_dict[chrom_str][ 'cholesky_decomp_inv_snp_cov'] # bla lds_dict = generate_1k_LD_scores(input_genotype_file, chrom_snp_trans_mats, maf_thres=maf_thres, ld_radius=ld_radius, debug_filter_frac=debug_filter_frac) # Store LD scores lds_h5f = h5py.File(ld_score_file) hu.dict_to_hdf5(lds_dict, lds_h5f) lds_h5f.close()
def calc_kinship(input_file='Data/1Kgenomes/1K_genomes_v3.hdf5', out_file='Data/1Kgenomes/kinship.hdf5', maf_thres=0.01, figure_dir='', figure_fn='', snp_filter_frac=1, indiv_filter_frac=1, chrom_ok_snp_dict=None): import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt print 'Loading Genotype from ' in_h5f = h5py.File(input_file) # eur_filter = in_h5f['indivs']['continent'][...] == 'EUR' # num_indivs = sp.sum(eur_filter) indiv_ids = in_h5f['indiv_ids'][...] indiv_filter = None if indiv_filter_frac < 1: indiv_filter = sp.array( sp.random.random(len(indiv_ids)) < indiv_filter_frac, dtype='bool8') indiv_ids = indiv_ids[indiv_filter] assert len(sp.unique(indiv_ids)) == len(indiv_ids) num_indivs = len(indiv_ids) ok_chromosome_dict = {} not_done = set(range(1, 23)) while len(not_done) > 0: chromosome_dict = {} K_all_snps = sp.zeros((num_indivs, num_indivs), dtype='float32') num_all_snps = 0 sum_indiv_genotypes_all_chrom = sp.zeros(num_indivs, dtype='float32') # snp_cov_all_snps = sp.zeros((num_indivs, num_indivs), dtype='float64') print 'Calculating kinship' for chrom in range(1, 23): print 'Working on Chromosome %d' % chrom chrom_str = 'chr%d' % chrom snp_filter = None if snp_filter_frac < 1: snp_filter = sp.random.random(len( in_h5f[chrom_str]['snps'])) < snp_filter_frac g_dict = get_genotype_data(in_h5f, chrom, maf_thres, indiv_filter=indiv_filter, snp_filter=snp_filter, randomize_sign=True, snps_signs=None, chrom_ok_snp_dict=chrom_ok_snp_dict) norm_snps = g_dict['norm_snps'] sum_indiv_genotypes = sp.sum(g_dict['norm_snps'], 0) sum_indiv_genotypes_all_chrom += sum_indiv_genotypes print 'Calculating chromosome kinship' K_unscaled = sp.array(sp.dot(norm_snps.T, norm_snps), dtype='float32') assert sp.isclose( sp.sum(sp.diag(K_unscaled)) / (len(norm_snps) * num_indivs), 1.0), '..bug' K_all_snps += K_unscaled num_all_snps += len(norm_snps) print 'SNP-cov normalisation' sum_indiv_genotypes = sp.sum(norm_snps, 0) sum_indiv_genotypes_all_chrom += sum_indiv_genotypes mean_indiv_genotypes = sum_indiv_genotypes / len(norm_snps) norm_snps = norm_snps - mean_indiv_genotypes print 'Calculating SNP covariance unscaled' snp_cov_unscaled = sp.array(sp.dot(norm_snps.T, norm_snps), dtype='float32') # snp_cov_all_snps += snp_cov_unscaled print 'Storing and updating things' chromosome_dict[chrom_str] = { 'K_unscaled': K_unscaled, 'num_snps': len(norm_snps), 'sum_indiv_genotypes': sum_indiv_genotypes, 'snp_cov_unscaled': snp_cov_unscaled, 'snps_signs': g_dict['snps_signs'] } if snp_filter_frac < 1: chromosome_dict[chrom_str]['snp_filter'] = snp_filter # snp_cov_all_snps = snp_cov_all_snps / float(num_all_snps) # K_all_snps = K_all_snps / float(num_all_snps) # print 'K_all_snps.shape: %s' % str(K_all_snps.shape) # print 'snp_cov_all_snps.shape: %s' % str(snp_cov_all_snps.shape) # print 'sp.diag(snp_cov_all_snps): %s' % str(sp.diag(snp_cov_all_snps)) # print 'sp.mean(sp.diag(snp_cov_all_snps)_: %s' % str(sp.mean(sp.diag(snp_cov_all_snps))) # print 'Full kinship and snp-covariance calculation done using %d SNPs\n' % num_all_snps mean_indiv_genotypes_all_chrom = sum_indiv_genotypes_all_chrom / num_all_snps print 'Individual gentoype mean found:' print mean_indiv_genotypes_all_chrom print 'Calculating chromosome-wise SNP-covariance and kinship matrices' for chrom in range(1, 23): if chrom in not_done: print 'Working on Chromosome %d' % chrom chrom_str = 'chr%d' % chrom snp_cov_leave_one_out = sp.zeros((num_indivs, num_indivs), dtype='float32') K_leave_one_out = sp.zeros((num_indivs, num_indivs), dtype='float32') num_snps_used = 0 sum_indiv_genotypes = sp.zeros(num_indivs, dtype='float32') for chrom2 in range(1, 23): chrom2_str = 'chr%d' % chrom2 if chrom2 != chrom: sum_indiv_genotypes += chromosome_dict[chrom2_str][ 'sum_indiv_genotypes'] K_leave_one_out += chromosome_dict[chrom2_str][ 'K_unscaled'] num_snps_used += chromosome_dict[chrom2_str][ 'num_snps'] assert sp.isclose( sp.sum(sp.diag(K_leave_one_out)) / (num_snps_used * num_indivs), 1.0), '..bug' mean_indiv_genotypes = sum_indiv_genotypes / num_snps_used for chrom2 in range(1, 23): chrom2_str = 'chr%d' % chrom2 if chrom2 != chrom: print 'Loading SNPs' snps_signs = chromosome_dict[chrom2_str]['snps_signs'] snp_filter = chromosome_dict[chrom2_str]['snp_filter'] g_dict = get_genotype_data( in_h5f, chrom2, maf_thres, indiv_filter=indiv_filter, snp_filter=snp_filter, randomize_sign=True, snps_signs=snps_signs, chrom_ok_snp_dict=chrom_ok_snp_dict) norm_snps = g_dict['norm_snps'] print 'SNP-cov normalisation' norm_snps = norm_snps - mean_indiv_genotypes print 'Calculating SNP covariance unscaled' snp_cov_unscaled = sp.dot(norm_snps.T, norm_snps) snp_cov_leave_one_out += snp_cov_unscaled snp_cov_leave_one_out = snp_cov_leave_one_out / num_snps_used K_leave_one_out = K_leave_one_out / num_snps_used assert (K_leave_one_out - sp.diag(K_leave_one_out)).max() < 0.1, '..bug' try: cholesky_decomp_inv_snp_cov = linalg.cholesky( linalg.pinv( sp.array(snp_cov_leave_one_out, dtype='float64'))) evals, evecs = linalg.eig( sp.array(K_leave_one_out, dtype='float64')) except: try: cholesky_decomp_inv_snp_cov = linalg.cholesky( linalg.pinv( sp.array(snp_cov_leave_one_out, dtype='float32'))) evals, evecs = linalg.eig( sp.array(K_leave_one_out, dtype='float32')) except: print 'Failed when obtaining the Cholesky decomposotion or eigen decomposition' print 'Moving on, trying again later.' continue sort_indices = sp.argsort(evals, ) ordered_evals = evals[sort_indices] print ordered_evals[-10:] / sp.sum(ordered_evals) ordered_evecs = evecs[:, sort_indices] d = {} d['evecs_leave_one_out'] = ordered_evecs d['evals_leave_one_out'] = ordered_evals d['cholesky_decomp_inv_snp_cov'] = cholesky_decomp_inv_snp_cov d['K_leave_one_out'] = K_leave_one_out d['K_unscaled'] = chromosome_dict[chrom_str]['K_unscaled'] d['num_snps'] = chromosome_dict[chrom_str]['num_snps'] d['snp_cov_leave_one_out'] = snp_cov_leave_one_out ok_chromosome_dict[chrom_str] = d not_done.remove(chrom) # While loop ends here. K_all_snps = K_all_snps / float(num_all_snps) in_h5f.close() ok_chromosome_dict['K_all_snps'] = K_all_snps ok_chromosome_dict['num_all_snps'] = num_all_snps assert sp.sum((ok_chromosome_dict['chr1']['K_leave_one_out'] - ok_chromosome_dict['chr2']['K_leave_one_out'])** 2) != 0, 'Kinships are probably too similar.' print 'Calculating PCAs' evals, evecs = linalg.eigh(sp.array( K_all_snps, dtype='float64')) # PCA via eigen decomp evals[evals < 0] = 0 sort_indices = sp.argsort(evals, )[::-1] ordered_evals = evals[sort_indices] print ordered_evals[:10] / sp.sum(ordered_evals) pcs = evecs[:, sort_indices] tot = sum(evals) var_exp = [(i / tot) * 100 for i in sorted(evals, reverse=True)] print 'Total variance explained:', sp.sum(var_exp) ok_chromosome_dict['pcs'] = pcs ok_chromosome_dict['pcs_var_exp'] = var_exp if figure_dir is not None: plt.clf() plt.plot(pcs[:, 0], pcs[:, 1], 'k.') plt.title("Overall PCA") plt.xlabel('PC1') plt.xlabel('PC2') plt.tight_layout() plt.savefig(figure_dir + '/' + figure_fn, format='pdf') plt.clf() out_h5f = h5py.File(out_file) hu.dict_to_hdf5(ok_chromosome_dict, out_h5f) out_h5f.close() return ok_chromosome_dict
def calc_kinship(input_file='Data/1Kgenomes/1K_genomes_v3.hdf5' , out_file='Data/1Kgenomes/kinship.hdf5', maf_thres=0.01, figure_dir='', figure_fn='', snp_filter_frac=1, indiv_filter_frac=1, chrom_ok_snp_dict=None): import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt print 'Loading Genotype from ' in_h5f = h5py.File(input_file) # eur_filter = in_h5f['indivs']['continent'][...] == 'EUR' # num_indivs = sp.sum(eur_filter) indiv_ids = in_h5f['indiv_ids'][...] indiv_filter = None if indiv_filter_frac < 1: indiv_filter = sp.array(sp.random.random(len(indiv_ids)) < indiv_filter_frac, dtype='bool8') indiv_ids = indiv_ids[indiv_filter] assert len(sp.unique(indiv_ids)) == len(indiv_ids) num_indivs = len(indiv_ids) ok_chromosome_dict = {} not_done = set(range(1, 23)) while len(not_done) > 0: chromosome_dict = {} K_all_snps = sp.zeros((num_indivs, num_indivs), dtype='float32') num_all_snps = 0 sum_indiv_genotypes_all_chrom = sp.zeros(num_indivs, dtype='float32') # snp_cov_all_snps = sp.zeros((num_indivs, num_indivs), dtype='float64') print 'Calculating kinship' for chrom in range(1, 23): print 'Working on Chromosome %d' % chrom chrom_str = 'chr%d' % chrom snp_filter = None if snp_filter_frac < 1: snp_filter = sp.random.random(len(in_h5f[chrom_str]['snps'])) < snp_filter_frac g_dict = get_genotype_data(in_h5f, chrom, maf_thres, indiv_filter=indiv_filter, snp_filter=snp_filter, randomize_sign=True, snps_signs=None, chrom_ok_snp_dict=chrom_ok_snp_dict) norm_snps = g_dict['norm_snps'] sum_indiv_genotypes = sp.sum(g_dict['norm_snps'], 0) sum_indiv_genotypes_all_chrom += sum_indiv_genotypes print 'Calculating chromosome kinship' K_unscaled = sp.array(sp.dot(norm_snps.T, norm_snps), dtype='float32') assert sp.isclose(sp.sum(sp.diag(K_unscaled)) / (len(norm_snps) * num_indivs), 1.0), '..bug' K_all_snps += K_unscaled num_all_snps += len(norm_snps) print 'SNP-cov normalisation' sum_indiv_genotypes = sp.sum(norm_snps, 0) sum_indiv_genotypes_all_chrom += sum_indiv_genotypes mean_indiv_genotypes = sum_indiv_genotypes / len(norm_snps) norm_snps = norm_snps - mean_indiv_genotypes print 'Calculating SNP covariance unscaled' snp_cov_unscaled = sp.array(sp.dot(norm_snps.T, norm_snps), dtype='float32') # snp_cov_all_snps += snp_cov_unscaled print 'Storing and updating things' chromosome_dict[chrom_str] = {'K_unscaled':K_unscaled, 'num_snps':len(norm_snps), 'sum_indiv_genotypes':sum_indiv_genotypes, 'snp_cov_unscaled':snp_cov_unscaled, 'snps_signs':g_dict['snps_signs']} if snp_filter_frac < 1: chromosome_dict[chrom_str]['snp_filter'] = snp_filter # snp_cov_all_snps = snp_cov_all_snps / float(num_all_snps) # K_all_snps = K_all_snps / float(num_all_snps) # print 'K_all_snps.shape: %s' % str(K_all_snps.shape) # print 'snp_cov_all_snps.shape: %s' % str(snp_cov_all_snps.shape) # print 'sp.diag(snp_cov_all_snps): %s' % str(sp.diag(snp_cov_all_snps)) # print 'sp.mean(sp.diag(snp_cov_all_snps)_: %s' % str(sp.mean(sp.diag(snp_cov_all_snps))) # print 'Full kinship and snp-covariance calculation done using %d SNPs\n' % num_all_snps mean_indiv_genotypes_all_chrom = sum_indiv_genotypes_all_chrom / num_all_snps print 'Individual gentoype mean found:' print mean_indiv_genotypes_all_chrom print 'Calculating chromosome-wise SNP-covariance and kinship matrices' for chrom in range(1, 23): if chrom in not_done: print 'Working on Chromosome %d' % chrom chrom_str = 'chr%d' % chrom snp_cov_leave_one_out = sp.zeros((num_indivs, num_indivs), dtype='float32') K_leave_one_out = sp.zeros((num_indivs, num_indivs), dtype='float32') num_snps_used = 0 sum_indiv_genotypes = sp.zeros(num_indivs, dtype='float32') for chrom2 in range(1, 23): chrom2_str = 'chr%d' % chrom2 if chrom2 != chrom: sum_indiv_genotypes += chromosome_dict[chrom2_str]['sum_indiv_genotypes'] K_leave_one_out += chromosome_dict[chrom2_str]['K_unscaled'] num_snps_used += chromosome_dict[chrom2_str]['num_snps'] assert sp.isclose(sp.sum(sp.diag(K_leave_one_out)) / (num_snps_used * num_indivs), 1.0), '..bug' mean_indiv_genotypes = sum_indiv_genotypes / num_snps_used for chrom2 in range(1, 23): chrom2_str = 'chr%d' % chrom2 if chrom2 != chrom: print 'Loading SNPs' snps_signs = chromosome_dict[chrom2_str]['snps_signs'] snp_filter = chromosome_dict[chrom2_str]['snp_filter'] g_dict = get_genotype_data(in_h5f, chrom2, maf_thres, indiv_filter=indiv_filter, snp_filter=snp_filter, randomize_sign=True, snps_signs=snps_signs, chrom_ok_snp_dict=chrom_ok_snp_dict) norm_snps = g_dict['norm_snps'] print 'SNP-cov normalisation' norm_snps = norm_snps - mean_indiv_genotypes print 'Calculating SNP covariance unscaled' snp_cov_unscaled = sp.dot(norm_snps.T, norm_snps) snp_cov_leave_one_out += snp_cov_unscaled snp_cov_leave_one_out = snp_cov_leave_one_out / num_snps_used K_leave_one_out = K_leave_one_out / num_snps_used assert (K_leave_one_out - sp.diag(K_leave_one_out)).max() < 0.1, '..bug' try: cholesky_decomp_inv_snp_cov = linalg.cholesky(linalg.pinv(sp.array(snp_cov_leave_one_out, dtype='float64'))) evals, evecs = linalg.eig(sp.array(K_leave_one_out, dtype='float64')) except: try: cholesky_decomp_inv_snp_cov = linalg.cholesky(linalg.pinv(sp.array(snp_cov_leave_one_out, dtype='float32'))) evals, evecs = linalg.eig(sp.array(K_leave_one_out, dtype='float32')) except: print 'Failed when obtaining the Cholesky decomposotion or eigen decomposition' print 'Moving on, trying again later.' continue sort_indices = sp.argsort(evals,) ordered_evals = evals[sort_indices] print ordered_evals[-10:] / sp.sum(ordered_evals) ordered_evecs = evecs[:, sort_indices] d = {} d['evecs_leave_one_out'] = ordered_evecs d['evals_leave_one_out'] = ordered_evals d['cholesky_decomp_inv_snp_cov'] = cholesky_decomp_inv_snp_cov d['K_leave_one_out'] = K_leave_one_out d['K_unscaled'] = chromosome_dict[chrom_str]['K_unscaled'] d['num_snps'] = chromosome_dict[chrom_str]['num_snps'] d['snp_cov_leave_one_out'] = snp_cov_leave_one_out ok_chromosome_dict[chrom_str] = d not_done.remove(chrom) # While loop ends here. K_all_snps = K_all_snps / float(num_all_snps) in_h5f.close() ok_chromosome_dict['K_all_snps'] = K_all_snps ok_chromosome_dict['num_all_snps'] = num_all_snps assert sp.sum((ok_chromosome_dict['chr1']['K_leave_one_out'] - ok_chromosome_dict['chr2']['K_leave_one_out']) ** 2) != 0 , 'Kinships are probably too similar.' print 'Calculating PCAs' evals, evecs = linalg.eigh(sp.array(K_all_snps, dtype='float64')) # PCA via eigen decomp evals[evals < 0] = 0 sort_indices = sp.argsort(evals,)[::-1] ordered_evals = evals[sort_indices] print ordered_evals[:10] / sp.sum(ordered_evals) pcs = evecs[:, sort_indices] tot = sum(evals) var_exp = [(i / tot) * 100 for i in sorted(evals, reverse=True)] print 'Total variance explained:', sp.sum(var_exp) ok_chromosome_dict['pcs'] = pcs ok_chromosome_dict['pcs_var_exp'] = var_exp if figure_dir is not None: plt.clf() plt.plot(pcs[:, 0], pcs[:, 1], 'k.') plt.title("Overall PCA") plt.xlabel('PC1') plt.xlabel('PC2') plt.tight_layout() plt.savefig(figure_dir + '/' + figure_fn, format='pdf') plt.clf() out_h5f = h5py.File(out_file) hu.dict_to_hdf5(ok_chromosome_dict, out_h5f) out_h5f.close() return ok_chromosome_dict