Beispiel #1
0
def calculate_ld_tables(input_genotype_file, chrom_i, local_ld_hdf5_file, ld_radius,
                        min_r2=0.2, maf_thres=0.01, indiv_filter=None,
                        snp_filter=None, return_void=True, verbose=True):
    """
    Calculate the LD tables for the given radius, and store in the given file.

    If ``local_ld_hdf5_file`` does not exist yet, genotype data for chromosome
    ``chrom_i`` is read from ``input_genotype_file``, the LD table is computed
    with ``get_ld_table``, the mean of its ``'ld_scores'`` entry is stored under
    ``'avg_ld_score'``, and the resulting dictionary is written to
    ``local_ld_hdf5_file``.  If the file already exists it is simply loaded; the
    remaining arguments are not checked against its contents.

    Parameters:
        input_genotype_file: path of the HDF5 genotype file to read.
        chrom_i: chromosome number (formatted with ``%d``).
        local_ld_hdf5_file: path of the HDF5 file used to cache the LD table.
        ld_radius: forwarded to ``get_ld_table`` as ``ld_radius``.
        min_r2: forwarded to ``get_ld_table`` as ``min_r2``.
        maf_thres: forwarded to ``kgenome.get_genotype_data``.
        indiv_filter, snp_filter: forwarded to ``kgenome.get_genotype_data``.
        return_void: if True (default) nothing is returned.
        verbose: forwarded to ``get_ld_table``; the progress messages printed
            by this function itself are emitted regardless.

    Returns:
        The LD dictionary when ``return_void`` is False, otherwise None.
    """
    if not os.path.isfile(local_ld_hdf5_file):
        # The genotype file is only read here, so open it read-only and let the
        # context manager close it even if the LD computation raises.
        with h5py.File(input_genotype_file, 'r') as h5f:
            print('Calculating LD information for chromosome %d w. radius %d' % (chrom_i, ld_radius))

            g_dict = kgenome.get_genotype_data(h5f, chrom_i, maf_thres, indiv_filter=indiv_filter,
                                               snp_filter=snp_filter, randomize_sign=False, snps_signs=None)

            # Computed while the input file is still open, in case the genotype
            # dictionary still references data in it.
            ld_dict = get_ld_table(g_dict['norm_snps'], ld_radius=ld_radius, min_r2=min_r2, verbose=verbose)
        ld_dict['avg_ld_score'] = sp.mean(ld_dict['ld_scores'])

        print('Done calculating the LD table and LD score, writing to file: %s' % local_ld_hdf5_file)
        with h5py.File(local_ld_hdf5_file, 'w') as oh5f:
            hu.dict_to_hdf5(ld_dict, oh5f)
        print('LD information is now stored.')
    else:
        print('Loading LD information from file: %s' % local_ld_hdf5_file)
        with h5py.File(local_ld_hdf5_file, 'r') as oh5f:
            ld_dict = hu.hdf5_to_dict(oh5f)

    if not return_void:
        return ld_dict
Beispiel #2
0
def get_kinship_pca_dict(input_genotype_file, kinship_pca_file, maf_thres, snp_filter_frac, chrom_ok_snp_dict=None):
    """
    Return the kinship/PCA dictionary, loading it from file when available.

    If ``kinship_pca_file`` exists it is read with ``hu.hdf5_to_dict``;
    otherwise ``calc_kinship`` is called with ``input_genotype_file`` as input
    and ``kinship_pca_file`` as output file, and its result is returned.

    Parameters:
        input_genotype_file: genotype file passed to ``calc_kinship``.
        kinship_pca_file: path of the HDF5 file holding (or to receive) the
            kinship and PCA information.
        maf_thres, snp_filter_frac, chrom_ok_snp_dict: forwarded unchanged to
            ``calc_kinship``; ignored when the file already exists.
    """
    if os.path.isfile(kinship_pca_file):
        print(':Loading Kinship and PCA information from %s' % kinship_pca_file)
        # Read-only access is enough, and the context manager makes sure the
        # handle is released.  NOTE(review): assumes hu.hdf5_to_dict copies the
        # data into memory rather than returning lazy dataset handles.
        with h5py.File(kinship_pca_file, 'r') as k_h5f:
            kinship_pca_dict = hu.hdf5_to_dict(k_h5f)
    else:
        kinship_pca_dict = calc_kinship(input_file=input_genotype_file, out_file=kinship_pca_file,
                                        maf_thres=maf_thres, figure_dir=None, snp_filter_frac=snp_filter_frac,
                                        chrom_ok_snp_dict=chrom_ok_snp_dict)
    return kinship_pca_dict
Beispiel #3
0
def ld_prune_1k_genotypes(in_hdf5_file, out_hdf5_file, local_ld_file_prefix, ld_radius, max_r2=0.2, maf_thres=0.01, chrom_ok_snp_dict=None):
    """
    Write an LD-pruned copy of the genotypes in ``in_hdf5_file`` to ``out_hdf5_file``.

    For each chromosome 1..22 the genotype data is loaded with
    ``get_genotype_data``, the LD information is read from
    ``'<local_ld_file_prefix>_chrom<i>_ldradius<ld_radius>.hdf5'`` (which must
    already exist), ``ld.ld_pruning`` produces a SNP filter, and the retained
    SNPs are stored in group ``'chr<i>'`` of the output file together with
    their means, standard deviations, ids, positions and nucleotides.  Finally
    the ``'indiv_ids'`` dataset is copied over.

    Parameters:
        in_hdf5_file: path of the input genotype HDF5 file.
        out_hdf5_file: path of the output HDF5 file.
        local_ld_file_prefix: prefix of the per-chromosome LD files.
        ld_radius: LD radius used in the LD file names.
        max_r2: forwarded to ``ld.ld_pruning``.
        maf_thres: forwarded to ``get_genotype_data``.
        chrom_ok_snp_dict: forwarded to ``get_genotype_data``.
    """
    # Input is only read.  The output uses 'a' (open or create, no truncation),
    # which was the implicit h5py 2.x default this code relied on; h5py 3
    # defaults to read-only.  Both handles are closed even on error.
    with h5py.File(in_hdf5_file, 'r') as ih5f:
        with h5py.File(out_hdf5_file, 'a') as oh5f:
            for chrom_i in range(1, 23):
                print('Working on Chromosome %d' % chrom_i)
                chrom_str = 'chr%d' % chrom_i

                g_dict = get_genotype_data(ih5f, chrom_i, maf_thres=maf_thres, randomize_sign=False, snps_signs=None,
                                           return_raw_snps=True, return_snps_info=True, return_normalized_snps=False,
                                           chrom_ok_snp_dict=chrom_ok_snp_dict)
                snps = g_dict['snps']
                snp_means = g_dict['snp_means']
                snp_stds = g_dict['snp_stds']
                snp_ids = g_dict['snp_ids']
                positions = g_dict['positions']
                nts = g_dict['nts']

                local_ld_hdf5_file = '%s_chrom%d_ldradius%d.hdf5' % (local_ld_file_prefix, chrom_i, ld_radius)
                print('Loading LD information from file: %s' % local_ld_hdf5_file)
                with h5py.File(local_ld_hdf5_file, 'r') as ldh5f:
                    ld_dict = hu.hdf5_to_dict(ldh5f)

                ld_snp_filter = ld.ld_pruning(ld_dict, max_r2=max_r2, verbose=True)
                print(ld_snp_filter)

                # The LD table must describe exactly the SNPs loaded above,
                # otherwise the filter would select the wrong rows.
                assert ld_dict['num_snps'] == len(snps), \
                    'LD table and genotype data disagree on the number of SNPs'
                assert ld_dict['num_snps'] == len(ld_snp_filter), \
                    'LD pruning filter length does not match the number of SNPs'

                # Pruning SNPs in LD
                snps = snps[ld_snp_filter]
                snp_means = snp_means[ld_snp_filter]
                snp_stds = snp_stds[ld_snp_filter]
                snp_ids = snp_ids[ld_snp_filter]
                positions = positions[ld_snp_filter]
                nts = nts[ld_snp_filter]

                print('Out of %d SNPs %d were retained' % (ld_dict['num_snps'], len(snps)))

                cg = oh5f.create_group(chrom_str)
                cg.create_dataset('snps', data=sp.array(snps, dtype='int8'))
                cg.create_dataset('snp_means', data=snp_means)
                cg.create_dataset('snp_stds', data=snp_stds)
                cg.create_dataset('snp_ids', data=snp_ids)
                cg.create_dataset('positions', data=positions)
                cg.create_dataset('nts', data=nts)

            indiv_ids = ih5f['indiv_ids'][...]
            oh5f.create_dataset('indiv_ids', data=indiv_ids)
Beispiel #4
0
def get_kinship_pca_dict(input_genotype_file,
                         kinship_pca_file,
                         maf_thres,
                         snp_filter_frac,
                         chrom_ok_snp_dict=None):
    """
    Return the kinship/PCA dictionary, loading it from file when available.

    If ``kinship_pca_file`` exists it is read with ``hu.hdf5_to_dict``;
    otherwise ``calc_kinship`` is called with ``input_genotype_file`` as input
    and ``kinship_pca_file`` as output file, and its result is returned.

    Parameters:
        input_genotype_file: genotype file passed to ``calc_kinship``.
        kinship_pca_file: path of the HDF5 file holding (or to receive) the
            kinship and PCA information.
        maf_thres, snp_filter_frac, chrom_ok_snp_dict: forwarded unchanged to
            ``calc_kinship``; ignored when the file already exists.
    """
    if os.path.isfile(kinship_pca_file):
        print(':Loading Kinship and PCA information from %s' % kinship_pca_file)
        # Read-only access is enough, and the context manager makes sure the
        # handle is released.  NOTE(review): assumes hu.hdf5_to_dict copies the
        # data into memory rather than returning lazy dataset handles.
        with h5py.File(kinship_pca_file, 'r') as k_h5f:
            kinship_pca_dict = hu.hdf5_to_dict(k_h5f)
    else:
        kinship_pca_dict = calc_kinship(input_file=input_genotype_file,
                                        out_file=kinship_pca_file,
                                        maf_thres=maf_thres,
                                        figure_dir=None,
                                        snp_filter_frac=snp_filter_frac,
                                        chrom_ok_snp_dict=chrom_ok_snp_dict)
    return kinship_pca_dict
Beispiel #5
0
def calculate_ld_tables(input_genotype_file,
                        chrom_i,
                        local_ld_hdf5_file,
                        ld_radius,
                        min_r2=0.2,
                        maf_thres=0.01,
                        indiv_filter=None,
                        snp_filter=None,
                        return_void=True,
                        verbose=True):
    """
    Calculate the LD tables for the given radius, and store in the given file.

    If ``local_ld_hdf5_file`` does not exist yet, genotype data for chromosome
    ``chrom_i`` is read from ``input_genotype_file``, the LD table is computed
    with ``get_ld_table``, the mean of its ``'ld_scores'`` entry is stored under
    ``'avg_ld_score'``, and the resulting dictionary is written to
    ``local_ld_hdf5_file``.  If the file already exists it is simply loaded; the
    remaining arguments are not checked against its contents.

    Parameters:
        input_genotype_file: path of the HDF5 genotype file to read.
        chrom_i: chromosome number (formatted with ``%d``).
        local_ld_hdf5_file: path of the HDF5 file used to cache the LD table.
        ld_radius: forwarded to ``get_ld_table`` as ``ld_radius``.
        min_r2: forwarded to ``get_ld_table`` as ``min_r2``.
        maf_thres: forwarded to ``kgenome.get_genotype_data``.
        indiv_filter, snp_filter: forwarded to ``kgenome.get_genotype_data``.
        return_void: if True (default) nothing is returned.
        verbose: forwarded to ``get_ld_table``; the progress messages printed
            by this function itself are emitted regardless.

    Returns:
        The LD dictionary when ``return_void`` is False, otherwise None.
    """
    if not os.path.isfile(local_ld_hdf5_file):
        # The genotype file is only read here, so open it read-only and let the
        # context manager close it even if the LD computation raises.
        with h5py.File(input_genotype_file, 'r') as h5f:
            print('Calculating LD information for chromosome %d w. radius %d' % (
                chrom_i, ld_radius))

            g_dict = kgenome.get_genotype_data(h5f,
                                               chrom_i,
                                               maf_thres,
                                               indiv_filter=indiv_filter,
                                               snp_filter=snp_filter,
                                               randomize_sign=False,
                                               snps_signs=None)

            # Computed while the input file is still open, in case the genotype
            # dictionary still references data in it.
            ld_dict = get_ld_table(g_dict['norm_snps'],
                                   ld_radius=ld_radius,
                                   min_r2=min_r2,
                                   verbose=verbose)
        ld_dict['avg_ld_score'] = sp.mean(ld_dict['ld_scores'])

        print('Done calculating the LD table and LD score, writing to file: %s' % local_ld_hdf5_file)
        with h5py.File(local_ld_hdf5_file, 'w') as oh5f:
            hu.dict_to_hdf5(ld_dict, oh5f)
        print('LD information is now stored.')
    else:
        print('Loading LD information from file: %s' % local_ld_hdf5_file)
        with h5py.File(local_ld_hdf5_file, 'r') as oh5f:
            ld_dict = hu.hdf5_to_dict(oh5f)

    if not return_void:
        return ld_dict
Beispiel #6
0
def ld_prune_1k_genotypes(in_hdf5_file,
                          out_hdf5_file,
                          local_ld_file_prefix,
                          ld_radius,
                          max_r2=0.2,
                          maf_thres=0.01,
                          chrom_ok_snp_dict=None):
    """
    Write an LD-pruned copy of the genotypes in ``in_hdf5_file`` to ``out_hdf5_file``.

    For each chromosome 1..22 the genotype data is loaded with
    ``get_genotype_data``, the LD information is read from
    ``'<local_ld_file_prefix>_chrom<i>_ldradius<ld_radius>.hdf5'`` (which must
    already exist), ``ld.ld_pruning`` produces a SNP filter, and the retained
    SNPs are stored in group ``'chr<i>'`` of the output file together with
    their means, standard deviations, ids, positions and nucleotides.  Finally
    the ``'indiv_ids'`` dataset is copied over.

    Parameters:
        in_hdf5_file: path of the input genotype HDF5 file.
        out_hdf5_file: path of the output HDF5 file.
        local_ld_file_prefix: prefix of the per-chromosome LD files.
        ld_radius: LD radius used in the LD file names.
        max_r2: forwarded to ``ld.ld_pruning``.
        maf_thres: forwarded to ``get_genotype_data``.
        chrom_ok_snp_dict: forwarded to ``get_genotype_data``.
    """
    # Input is only read.  The output uses 'a' (open or create, no truncation),
    # which was the implicit h5py 2.x default this code relied on; h5py 3
    # defaults to read-only.  Both handles are closed even on error.
    with h5py.File(in_hdf5_file, 'r') as ih5f:
        with h5py.File(out_hdf5_file, 'a') as oh5f:
            for chrom_i in range(1, 23):
                print('Working on Chromosome %d' % chrom_i)
                chrom_str = 'chr%d' % chrom_i

                g_dict = get_genotype_data(ih5f,
                                           chrom_i,
                                           maf_thres=maf_thres,
                                           randomize_sign=False,
                                           snps_signs=None,
                                           return_raw_snps=True,
                                           return_snps_info=True,
                                           return_normalized_snps=False,
                                           chrom_ok_snp_dict=chrom_ok_snp_dict)
                snps = g_dict['snps']
                snp_means = g_dict['snp_means']
                snp_stds = g_dict['snp_stds']
                snp_ids = g_dict['snp_ids']
                positions = g_dict['positions']
                nts = g_dict['nts']

                local_ld_hdf5_file = '%s_chrom%d_ldradius%d.hdf5' % (
                    local_ld_file_prefix, chrom_i, ld_radius)
                print('Loading LD information from file: %s' % local_ld_hdf5_file)
                with h5py.File(local_ld_hdf5_file, 'r') as ldh5f:
                    ld_dict = hu.hdf5_to_dict(ldh5f)

                ld_snp_filter = ld.ld_pruning(ld_dict, max_r2=max_r2, verbose=True)
                print(ld_snp_filter)

                # The LD table must describe exactly the SNPs loaded above,
                # otherwise the filter would select the wrong rows.
                assert ld_dict['num_snps'] == len(snps), \
                    'LD table and genotype data disagree on the number of SNPs'
                assert ld_dict['num_snps'] == len(ld_snp_filter), \
                    'LD pruning filter length does not match the number of SNPs'

                # Pruning SNPs in LD
                snps = snps[ld_snp_filter]
                snp_means = snp_means[ld_snp_filter]
                snp_stds = snp_stds[ld_snp_filter]
                snp_ids = snp_ids[ld_snp_filter]
                positions = positions[ld_snp_filter]
                nts = nts[ld_snp_filter]

                print('Out of %d SNPs %d were retained' % (ld_dict['num_snps'],
                                                           len(snps)))

                cg = oh5f.create_group(chrom_str)
                cg.create_dataset('snps', data=sp.array(snps, dtype='int8'))
                cg.create_dataset('snp_means', data=snp_means)
                cg.create_dataset('snp_stds', data=snp_stds)
                cg.create_dataset('snp_ids', data=snp_ids)
                cg.create_dataset('positions', data=positions)
                cg.create_dataset('nts', data=nts)

            indiv_ids = ih5f['indiv_ids'][...]
            oh5f.create_dataset('indiv_ids', data=indiv_ids)