def calculate_ld_tables(input_genotype_file, chrom_i, local_ld_hdf5_file, ld_radius, min_r2=0.2,
                        maf_thres=0.01, indiv_filter=None, snp_filter=None, return_void=True,
                        verbose=True):
    """
    Calculate the LD tables for the given radius, and store in the given file.

    If ``local_ld_hdf5_file`` does not exist, genotype data for ``chrom_i`` is
    fetched with ``kgenome.get_genotype_data``, the LD table is computed by
    ``get_ld_table`` from its ``'norm_snps'`` entry, the mean of
    ``'ld_scores'`` is added under ``'avg_ld_score'``, and the resulting dict
    is written to ``local_ld_hdf5_file``.  If the file already exists, the
    dict is loaded from it and nothing is recomputed; every argument other
    than ``local_ld_hdf5_file`` and ``return_void`` is then unused.

    :param input_genotype_file: Path of the HDF5 genotype file to read.
    :param chrom_i: Chromosome number (formatted with ``%d``).
    :param local_ld_hdf5_file: Path of the HDF5 file used to store/load the LD dict.
    :param ld_radius: LD radius forwarded to ``get_ld_table``.
    :param min_r2: Forwarded to ``get_ld_table``.
    :param maf_thres: Forwarded to ``kgenome.get_genotype_data``.
    :param indiv_filter: Forwarded to ``kgenome.get_genotype_data``.
    :param snp_filter: Forwarded to ``kgenome.get_genotype_data``.
    :param return_void: If True (default) nothing is returned.
    :param verbose: Forwarded to ``get_ld_table``.
    :return: The LD dict when ``return_void`` is False, otherwise None.
    """
    # NOTE(review): this source defines `calculate_ld_tables` more than once;
    # the last definition is the one in effect after import.
    if not os.path.isfile(local_ld_hdf5_file):
        # The genotype file is only an input here, so open it read-only and
        # let the context manager release the handle on every exit path.
        # NOTE(review): assumes kgenome.get_genotype_data does not write to it.
        with h5py.File(input_genotype_file, 'r') as h5f:
            print('Calculating LD information for chromosome %d w. radius %d' % (chrom_i, ld_radius))
            g_dict = kgenome.get_genotype_data(h5f, chrom_i, maf_thres, indiv_filter=indiv_filter,
                                               snp_filter=snp_filter, randomize_sign=False,
                                               snps_signs=None)
            # Computed while the input is still open, as in the original flow.
            ld_dict = get_ld_table(g_dict['norm_snps'], ld_radius=ld_radius, min_r2=min_r2,
                                   verbose=verbose)
        ld_dict['avg_ld_score'] = sp.mean(ld_dict['ld_scores'])

        # Single-argument print: same text under Python 2 and Python 3.
        print('%s %s' % ('Done calculating the LD table and LD score, writing to file:',
                         local_ld_hdf5_file))
        with h5py.File(local_ld_hdf5_file, 'w') as oh5f:
            hu.dict_to_hdf5(ld_dict, oh5f)
        print('LD information is now stored.')
    else:
        print('Loading LD information from file: %s' % local_ld_hdf5_file)
        with h5py.File(local_ld_hdf5_file, 'r') as oh5f:
            ld_dict = hu.hdf5_to_dict(oh5f)

    if not return_void:
        return ld_dict
def get_kinship_pca_dict(input_genotype_file, kinship_pca_file, maf_thres, snp_filter_frac,
                         chrom_ok_snp_dict=None):
    """
    Return the kinship/PCA dict, loading it from file when available.

    If ``kinship_pca_file`` exists it is read with ``hu.hdf5_to_dict``;
    otherwise ``calc_kinship`` is called with ``kinship_pca_file`` as its
    ``out_file`` and its result is returned.

    :param input_genotype_file: Passed to ``calc_kinship`` as ``input_file``.
    :param kinship_pca_file: Path of the HDF5 file holding the stored result.
    :param maf_thres: Forwarded to ``calc_kinship``.
    :param snp_filter_frac: Forwarded to ``calc_kinship``.
    :param chrom_ok_snp_dict: Forwarded to ``calc_kinship``.
    :return: The kinship/PCA dict.
    """
    # NOTE(review): this source defines `get_kinship_pca_dict` more than once;
    # the last definition is the one in effect after import.
    if os.path.isfile(kinship_pca_file):
        print(':Loading Kinship and PCA information from %s' % kinship_pca_file)
        # Read-only, and closed as soon as the dict has been read, instead of
        # leaving the handle open for the life of the process.
        # NOTE(review): assumes hu.hdf5_to_dict copies the data into memory,
        # as calculate_ld_tables in this file already relies on.
        with h5py.File(kinship_pca_file, 'r') as k_h5f:
            kinship_pca_dict = hu.hdf5_to_dict(k_h5f)
    else:
        kinship_pca_dict = calc_kinship(input_file=input_genotype_file, out_file=kinship_pca_file,
                                        maf_thres=maf_thres, figure_dir=None,
                                        snp_filter_frac=snp_filter_frac,
                                        chrom_ok_snp_dict=chrom_ok_snp_dict)
    return kinship_pca_dict
def ld_prune_1k_genotypes(in_hdf5_file, out_hdf5_file, local_ld_file_prefix, ld_radius, max_r2=0.2,
                          maf_thres=0.01, chrom_ok_snp_dict=None):
    """
    Write an LD-pruned copy of the genotypes for chromosomes 1-22.

    For each chromosome, genotype data is read from ``in_hdf5_file`` with
    ``get_genotype_data``, the LD dict is loaded from
    ``'<local_ld_file_prefix>_chrom<N>_ldradius<ld_radius>.hdf5'``, and
    ``ld.ld_pruning`` yields a filter that is applied to the SNP arrays.
    The filtered arrays are stored in a ``'chr<N>'`` group of
    ``out_hdf5_file``; ``'indiv_ids'`` is copied over once at the end.

    :param in_hdf5_file: Path of the input HDF5 genotype file.
    :param out_hdf5_file: Path of the output HDF5 file.
    :param local_ld_file_prefix: Prefix of the per-chromosome LD files, which must already exist.
    :param ld_radius: LD radius; only used to build the LD file names.
    :param max_r2: Forwarded to ``ld.ld_pruning``.
    :param maf_thres: Forwarded to ``get_genotype_data``.
    :param chrom_ok_snp_dict: Forwarded to ``get_genotype_data``.
    :raises AssertionError: If the LD dict's ``'num_snps'`` disagrees with the
        number of SNPs read or with the length of the pruning filter.
    """
    # NOTE(review): this source defines `ld_prune_1k_genotypes` more than once;
    # the last definition is the one in effect after import.

    # Fields filtered and copied unchanged, in the order they are written.
    info_keys = ('snp_means', 'snp_stds', 'snp_ids', 'positions', 'nts')

    # Explicit modes: input read-only; output in append mode, which is what
    # older h5py used when no mode was given (newer h5py defaults to
    # read-only, which would make create_group fail).  The context managers
    # close both files even if a chromosome fails half-way.
    with h5py.File(in_hdf5_file, 'r') as ih5f, h5py.File(out_hdf5_file, 'a') as oh5f:
        for chrom_i in range(1, 23):
            print('Working on Chromosome %d' % chrom_i)
            chrom_str = 'chr%d' % chrom_i
            g_dict = get_genotype_data(ih5f, chrom_i, maf_thres=maf_thres, randomize_sign=False,
                                       snps_signs=None, return_raw_snps=True,
                                       return_snps_info=True, return_normalized_snps=False,
                                       chrom_ok_snp_dict=chrom_ok_snp_dict)
            snps = g_dict['snps']
            info = dict((key, g_dict[key]) for key in info_keys)

            local_ld_hdf5_file = '%s_chrom%d_ldradius%d.hdf5' % (local_ld_file_prefix, chrom_i,
                                                                 ld_radius)
            print('Loading LD information from file: %s' % local_ld_hdf5_file)
            with h5py.File(local_ld_hdf5_file, 'r') as ldh5f:
                ld_dict = hu.hdf5_to_dict(ldh5f)

            ld_snp_filter = ld.ld_pruning(ld_dict, max_r2=max_r2, verbose=True)
            print(ld_snp_filter)

            # The LD file must describe exactly the SNPs that were just read.
            num_snps = ld_dict['num_snps']
            assert num_snps == len(snps)
            assert num_snps == len(ld_snp_filter)

            # Pruning SNPs in LD
            snps = snps[ld_snp_filter]
            print('Out of %d SNPs %d were retained' % (num_snps, len(snps)))

            cg = oh5f.create_group(chrom_str)
            cg.create_dataset('snps', data=sp.array(snps, dtype='int8'))
            for key in info_keys:
                cg.create_dataset(key, data=info[key][ld_snp_filter])

        # Individual IDs are shared by all chromosomes, so copy them once.
        indiv_ids = ih5f['indiv_ids'][...]
        oh5f.create_dataset('indiv_ids', data=indiv_ids)
def get_kinship_pca_dict(input_genotype_file, kinship_pca_file, maf_thres, snp_filter_frac,
                         chrom_ok_snp_dict=None):
    """
    Return the kinship/PCA dict, loading it from file when available.

    If ``kinship_pca_file`` exists it is read with ``hu.hdf5_to_dict``;
    otherwise ``calc_kinship`` is called with ``kinship_pca_file`` as its
    ``out_file`` and its result is returned.

    :param input_genotype_file: Passed to ``calc_kinship`` as ``input_file``.
    :param kinship_pca_file: Path of the HDF5 file holding the stored result.
    :param maf_thres: Forwarded to ``calc_kinship``.
    :param snp_filter_frac: Forwarded to ``calc_kinship``.
    :param chrom_ok_snp_dict: Forwarded to ``calc_kinship``.
    :return: The kinship/PCA dict.
    """
    # NOTE(review): an earlier definition of `get_kinship_pca_dict` exists in
    # this source; being later, this one replaces it at import time.
    if not os.path.isfile(kinship_pca_file):
        # Nothing stored yet: compute it (calc_kinship receives the output path).
        return calc_kinship(input_file=input_genotype_file, out_file=kinship_pca_file,
                            maf_thres=maf_thres, figure_dir=None,
                            snp_filter_frac=snp_filter_frac,
                            chrom_ok_snp_dict=chrom_ok_snp_dict)

    print(':Loading Kinship and PCA information from %s' % kinship_pca_file)
    # Read-only, and closed as soon as the dict has been read, instead of
    # leaving the handle open for the life of the process.
    # NOTE(review): assumes hu.hdf5_to_dict copies the data into memory,
    # as calculate_ld_tables in this file already relies on.
    with h5py.File(kinship_pca_file, 'r') as k_h5f:
        kinship_pca_dict = hu.hdf5_to_dict(k_h5f)
    return kinship_pca_dict
def calculate_ld_tables(input_genotype_file, chrom_i, local_ld_hdf5_file, ld_radius, min_r2=0.2,
                        maf_thres=0.01, indiv_filter=None, snp_filter=None, return_void=True,
                        verbose=True):
    """
    Calculate the LD tables for the given radius, and store in the given file.

    If ``local_ld_hdf5_file`` does not exist, genotype data for ``chrom_i`` is
    fetched with ``kgenome.get_genotype_data``, the LD table is computed by
    ``get_ld_table`` from its ``'norm_snps'`` entry, the mean of
    ``'ld_scores'`` is added under ``'avg_ld_score'``, and the resulting dict
    is written to ``local_ld_hdf5_file``.  If the file already exists, the
    dict is loaded from it and nothing is recomputed; every argument other
    than ``local_ld_hdf5_file`` and ``return_void`` is then unused.

    :param input_genotype_file: Path of the HDF5 genotype file to read.
    :param chrom_i: Chromosome number (formatted with ``%d``).
    :param local_ld_hdf5_file: Path of the HDF5 file used to store/load the LD dict.
    :param ld_radius: LD radius forwarded to ``get_ld_table``.
    :param min_r2: Forwarded to ``get_ld_table``.
    :param maf_thres: Forwarded to ``kgenome.get_genotype_data``.
    :param indiv_filter: Forwarded to ``kgenome.get_genotype_data``.
    :param snp_filter: Forwarded to ``kgenome.get_genotype_data``.
    :param return_void: If True (default) nothing is returned.
    :param verbose: Forwarded to ``get_ld_table``.
    :return: The LD dict when ``return_void`` is False, otherwise None.
    """
    # NOTE(review): an earlier definition of `calculate_ld_tables` exists in
    # this source; being later, this one replaces it at import time.
    if os.path.isfile(local_ld_hdf5_file):
        print('Loading LD information from file: %s' % local_ld_hdf5_file)
        with h5py.File(local_ld_hdf5_file, 'r') as oh5f:
            ld_dict = hu.hdf5_to_dict(oh5f)
    else:
        # The genotype file is only an input here, so open it read-only and
        # let the context manager release the handle on every exit path.
        # NOTE(review): assumes kgenome.get_genotype_data does not write to it.
        with h5py.File(input_genotype_file, 'r') as h5f:
            print('Calculating LD information for chromosome %d w. radius %d' % (chrom_i, ld_radius))
            g_dict = kgenome.get_genotype_data(h5f, chrom_i, maf_thres, indiv_filter=indiv_filter,
                                               snp_filter=snp_filter, randomize_sign=False,
                                               snps_signs=None)
            # Computed while the input is still open, as in the original flow.
            ld_dict = get_ld_table(g_dict['norm_snps'], ld_radius=ld_radius, min_r2=min_r2,
                                   verbose=verbose)
        ld_dict['avg_ld_score'] = sp.mean(ld_dict['ld_scores'])

        # Single-argument print: same text under Python 2 and Python 3.
        print('%s %s' % ('Done calculating the LD table and LD score, writing to file:',
                         local_ld_hdf5_file))
        with h5py.File(local_ld_hdf5_file, 'w') as oh5f:
            hu.dict_to_hdf5(ld_dict, oh5f)
        print('LD information is now stored.')

    if not return_void:
        return ld_dict
def ld_prune_1k_genotypes(in_hdf5_file, out_hdf5_file, local_ld_file_prefix, ld_radius, max_r2=0.2,
                          maf_thres=0.01, chrom_ok_snp_dict=None):
    """
    Write an LD-pruned copy of the genotypes for chromosomes 1-22.

    For each chromosome, genotype data is read from ``in_hdf5_file`` with
    ``get_genotype_data``, the LD dict is loaded from
    ``'<local_ld_file_prefix>_chrom<N>_ldradius<ld_radius>.hdf5'``, and
    ``ld.ld_pruning`` yields a filter that is applied to the SNP arrays.
    The filtered arrays are stored in a ``'chr<N>'`` group of
    ``out_hdf5_file``; ``'indiv_ids'`` is copied over once at the end.

    :param in_hdf5_file: Path of the input HDF5 genotype file.
    :param out_hdf5_file: Path of the output HDF5 file.
    :param local_ld_file_prefix: Prefix of the per-chromosome LD files, which must already exist.
    :param ld_radius: LD radius; only used to build the LD file names.
    :param max_r2: Forwarded to ``ld.ld_pruning``.
    :param maf_thres: Forwarded to ``get_genotype_data``.
    :param chrom_ok_snp_dict: Forwarded to ``get_genotype_data``.
    :raises AssertionError: If the LD dict's ``'num_snps'`` disagrees with the
        number of SNPs read or with the length of the pruning filter.
    """
    # NOTE(review): an earlier definition of `ld_prune_1k_genotypes` exists in
    # this source; being later, this one replaces it at import time.

    # Fields filtered and copied unchanged, in the order they are written.
    info_keys = ('snp_means', 'snp_stds', 'snp_ids', 'positions', 'nts')

    # Explicit modes: input read-only; output in append mode, which is what
    # older h5py used when no mode was given (newer h5py defaults to
    # read-only, which would make create_group fail).  The context managers
    # close both files even if a chromosome fails half-way.
    with h5py.File(in_hdf5_file, 'r') as ih5f, h5py.File(out_hdf5_file, 'a') as oh5f:
        for chrom_i in range(1, 23):
            print('Working on Chromosome %d' % chrom_i)
            chrom_str = 'chr%d' % chrom_i
            g_dict = get_genotype_data(ih5f, chrom_i, maf_thres=maf_thres, randomize_sign=False,
                                       snps_signs=None, return_raw_snps=True,
                                       return_snps_info=True, return_normalized_snps=False,
                                       chrom_ok_snp_dict=chrom_ok_snp_dict)
            snps = g_dict['snps']
            info = dict((key, g_dict[key]) for key in info_keys)

            local_ld_hdf5_file = '%s_chrom%d_ldradius%d.hdf5' % (local_ld_file_prefix, chrom_i,
                                                                 ld_radius)
            print('Loading LD information from file: %s' % local_ld_hdf5_file)
            with h5py.File(local_ld_hdf5_file, 'r') as ldh5f:
                ld_dict = hu.hdf5_to_dict(ldh5f)

            ld_snp_filter = ld.ld_pruning(ld_dict, max_r2=max_r2, verbose=True)
            print(ld_snp_filter)

            # The LD file must describe exactly the SNPs that were just read.
            num_snps = ld_dict['num_snps']
            assert num_snps == len(snps)
            assert num_snps == len(ld_snp_filter)

            # Pruning SNPs in LD
            snps = snps[ld_snp_filter]
            print('Out of %d SNPs %d were retained' % (num_snps, len(snps)))

            cg = oh5f.create_group(chrom_str)
            cg.create_dataset('snps', data=sp.array(snps, dtype='int8'))
            for key in info_keys:
                cg.create_dataset(key, data=info[key][ld_snp_filter])

        # Individual IDs are shared by all chromosomes, so copy them once.
        indiv_ids = ih5f['indiv_ids'][...]
        oh5f.create_dataset('indiv_ids', data=indiv_ids)