Exemple #1
0
def calculate_ld_tables(input_genotype_file, chrom_i, local_ld_hdf5_file, ld_radius,
                        min_r2=0.2, maf_thres=0.01, indiv_filter=None,
                        snp_filter=None, return_void=True, verbose=True):
    """
    Calculate the LD tables for the given radius, and store in the given file.

    If local_ld_hdf5_file already exists, nothing is recalculated; the LD
    information is loaded from that file instead.

    Parameters:
        input_genotype_file: path of the HDF5 genotype file (opened read-only).
        chrom_i: chromosome number, passed on to kgenome.get_genotype_data.
        local_ld_hdf5_file: path of the HDF5 file used to store/cache the LD
            dictionary.
        ld_radius: LD radius, passed on to get_ld_table.
        min_r2: minimum r2 threshold, passed on to get_ld_table.
        maf_thres: MAF threshold, passed on to kgenome.get_genotype_data.
        indiv_filter: optional individual filter, passed on unchanged.
        snp_filter: optional SNP filter, passed on unchanged.
        return_void: if False the LD dictionary is returned, otherwise the
            function returns None.
        verbose: passed on to get_ld_table (the progress messages printed by
            this function itself are always shown).

    Returns:
        The LD dictionary when return_void is False, otherwise None.  A freshly
        calculated dictionary additionally holds 'avg_ld_score', the mean of
        its 'ld_scores' entry.
    """
    if not os.path.isfile(local_ld_hdf5_file):
        print('Calculating LD information for chromosome %d w. radius %d' % (chrom_i, ld_radius))

        # Open the genotype file read-only and keep it open until the LD table
        # has been derived, in case the genotype data is backed by the file.
        # The context manager guarantees that the handle is released again.
        with h5py.File(input_genotype_file, 'r') as h5f:
            g_dict = kgenome.get_genotype_data(h5f, chrom_i, maf_thres, indiv_filter=indiv_filter,
                                               snp_filter=snp_filter, randomize_sign=False,
                                               snps_signs=None)
            ld_dict = get_ld_table(g_dict['norm_snps'], ld_radius=ld_radius, min_r2=min_r2,
                                   verbose=verbose)
        ld_dict['avg_ld_score'] = sp.mean(ld_dict['ld_scores'])

        print('Done calculating the LD table and LD score, writing to file: %s' % local_ld_hdf5_file)
        with h5py.File(local_ld_hdf5_file, 'w') as oh5f:
            hu.dict_to_hdf5(ld_dict, oh5f)
        print('LD information is now stored.')
    else:
        print('Loading LD information from file: %s' % local_ld_hdf5_file)
        with h5py.File(local_ld_hdf5_file, 'r') as oh5f:
            ld_dict = hu.hdf5_to_dict(oh5f)

    if not return_void:
        return ld_dict
Exemple #2
0
def calculate_ld_tables(input_genotype_file,
                        chrom_i,
                        local_ld_hdf5_file,
                        ld_radius,
                        min_r2=0.2,
                        maf_thres=0.01,
                        indiv_filter=None,
                        snp_filter=None,
                        return_void=True,
                        verbose=True):
    """
    Calculate the LD tables for the given radius, and store in the given file.

    If local_ld_hdf5_file already exists, nothing is recalculated; the LD
    information is loaded from that file instead.

    Parameters:
        input_genotype_file: path of the HDF5 genotype file (opened read-only).
        chrom_i: chromosome number, passed on to kgenome.get_genotype_data.
        local_ld_hdf5_file: path of the HDF5 file used to store/cache the LD
            dictionary.
        ld_radius: LD radius, passed on to get_ld_table.
        min_r2: minimum r2 threshold, passed on to get_ld_table.
        maf_thres: MAF threshold, passed on to kgenome.get_genotype_data.
        indiv_filter: optional individual filter, passed on unchanged.
        snp_filter: optional SNP filter, passed on unchanged.
        return_void: if False the LD dictionary is returned, otherwise the
            function returns None.
        verbose: passed on to get_ld_table (the progress messages printed by
            this function itself are always shown).

    Returns:
        The LD dictionary when return_void is False, otherwise None.  A freshly
        calculated dictionary additionally holds 'avg_ld_score', the mean of
        its 'ld_scores' entry.
    """
    if not os.path.isfile(local_ld_hdf5_file):
        print('Calculating LD information for chromosome %d w. radius %d' % (
            chrom_i, ld_radius))

        # Open the genotype file read-only and keep it open until the LD table
        # has been derived, in case the genotype data is backed by the file.
        # The context manager guarantees that the handle is released again.
        with h5py.File(input_genotype_file, 'r') as h5f:
            g_dict = kgenome.get_genotype_data(h5f,
                                               chrom_i,
                                               maf_thres,
                                               indiv_filter=indiv_filter,
                                               snp_filter=snp_filter,
                                               randomize_sign=False,
                                               snps_signs=None)

            ld_dict = get_ld_table(g_dict['norm_snps'],
                                   ld_radius=ld_radius,
                                   min_r2=min_r2,
                                   verbose=verbose)
        ld_dict['avg_ld_score'] = sp.mean(ld_dict['ld_scores'])

        print('Done calculating the LD table and LD score, writing to file: %s'
              % local_ld_hdf5_file)
        with h5py.File(local_ld_hdf5_file, 'w') as oh5f:
            hu.dict_to_hdf5(ld_dict, oh5f)
        print('LD information is now stored.')
    else:
        print('Loading LD information from file: %s' % local_ld_hdf5_file)
        with h5py.File(local_ld_hdf5_file, 'r') as oh5f:
            ld_dict = hu.hdf5_to_dict(oh5f)

    if not return_void:
        return ld_dict
Exemple #3
0
def calculate(
    input_genotype_file,
    input_ld_pruned_genotype_file,
    ld_score_file,
    kinship_pca_file,
    ld_radius=200,
    maf_thres=0.01,
    snp_filter_frac=0.05,
    debug_filter_frac=1,
):
    """
    Generates population structure adjusted 1k genomes LD scores and stores in the given file.

    Parameters:
        input_genotype_file: genotype file handed to generate_1k_LD_scores.
        input_ld_pruned_genotype_file: LD-pruned genotype file handed to
            kgenome.get_kinship_pca_dict.
        ld_score_file: path of the HDF5 file the LD score dictionary is
            written to (opened in append mode, created if missing).
        kinship_pca_file: kinship/PCA file handed to
            kgenome.get_kinship_pca_dict.
        ld_radius: LD radius, passed on to generate_1k_LD_scores.
        maf_thres: MAF threshold, passed on to both helpers.
        snp_filter_frac: passed on to kgenome.get_kinship_pca_dict.
        debug_filter_frac: passed on to generate_1k_LD_scores.

    Returns:
        None; the result is only written to ld_score_file.
    """

    # Kinship: collect the per-chromosome transformation matrix (stored under
    # 'cholesky_decomp_inv_snp_cov') for chromosomes 1-22.
    kinship_pca_dict = kgenome.get_kinship_pca_dict(
        input_ld_pruned_genotype_file, kinship_pca_file, maf_thres=maf_thres, snp_filter_frac=snp_filter_frac
    )
    chrom_snp_trans_mats = {}
    for chrom in range(1, 23):
        print("Working on Chromosome %d" % chrom)
        chrom_str = "chr%d" % chrom
        chrom_snp_trans_mats[chrom_str] = kinship_pca_dict[chrom_str]["cholesky_decomp_inv_snp_cov"]

    # Calculate the LD scores using the transformation matrices.
    lds_dict = generate_1k_LD_scores(
        input_genotype_file,
        chrom_snp_trans_mats,
        maf_thres=maf_thres,
        ld_radius=ld_radius,
        debug_filter_frac=debug_filter_frac,
    )

    # Store LD scores.  The mode is given explicitly ('a' was the implicit
    # default of older h5py releases; newer releases default to read-only,
    # which would make the write fail).  The context manager closes the file
    # even if writing raises.
    with h5py.File(ld_score_file, "a") as lds_h5f:
        hu.dict_to_hdf5(lds_dict, lds_h5f)
Exemple #4
0
def calculate(input_genotype_file,
              input_ld_pruned_genotype_file,
              ld_score_file,
              kinship_pca_file,
              ld_radius=200,
              maf_thres=0.01,
              snp_filter_frac=0.05,
              debug_filter_frac=1):
    """
    Generates population structure adjusted 1k genomes LD scores and stores in the given file.

    Parameters:
        input_genotype_file: genotype file handed to generate_1k_LD_scores.
        input_ld_pruned_genotype_file: LD-pruned genotype file handed to
            kgenome.get_kinship_pca_dict.
        ld_score_file: path of the HDF5 file the LD score dictionary is
            written to (opened in append mode, created if missing).
        kinship_pca_file: kinship/PCA file handed to
            kgenome.get_kinship_pca_dict.
        ld_radius: LD radius, passed on to generate_1k_LD_scores.
        maf_thres: MAF threshold, passed on to both helpers.
        snp_filter_frac: passed on to kgenome.get_kinship_pca_dict.
        debug_filter_frac: passed on to generate_1k_LD_scores.

    Returns:
        None; the result is only written to ld_score_file.
    """

    # Kinship: collect the per-chromosome transformation matrix (stored under
    # 'cholesky_decomp_inv_snp_cov') for chromosomes 1-22.
    kinship_pca_dict = kgenome.get_kinship_pca_dict(
        input_ld_pruned_genotype_file,
        kinship_pca_file,
        maf_thres=maf_thres,
        snp_filter_frac=snp_filter_frac)
    chrom_snp_trans_mats = {}
    for chrom in range(1, 23):
        print('Working on Chromosome %d' % chrom)
        chrom_str = 'chr%d' % chrom
        chrom_snp_trans_mats[chrom_str] = kinship_pca_dict[chrom_str][
            'cholesky_decomp_inv_snp_cov']

    # Calculate the LD scores using the transformation matrices.
    lds_dict = generate_1k_LD_scores(input_genotype_file,
                                     chrom_snp_trans_mats,
                                     maf_thres=maf_thres,
                                     ld_radius=ld_radius,
                                     debug_filter_frac=debug_filter_frac)

    # Store LD scores.  The mode is given explicitly ('a' was the implicit
    # default of older h5py releases; newer releases default to read-only,
    # which would make the write fail).  The context manager closes the file
    # even if writing raises.
    with h5py.File(ld_score_file, 'a') as lds_h5f:
        hu.dict_to_hdf5(lds_dict, lds_h5f)
Exemple #5
0
def calc_kinship(input_file='Data/1Kgenomes/1K_genomes_v3.hdf5',
                 out_file='Data/1Kgenomes/kinship.hdf5',
                 maf_thres=0.01,
                 figure_dir='',
                 figure_fn='',
                 snp_filter_frac=1,
                 indiv_filter_frac=1,
                 chrom_ok_snp_dict=None):
    """
    Calculate kinship matrices, leave-one-chromosome-out SNP covariances and
    genome-wide principal components, and store them in an HDF5 file.

    For each of chromosomes 1-22 the kinship and SNP covariance are accumulated
    over the other 21 chromosomes.  If the Cholesky or eigen decomposition
    fails for a chromosome, the whole calculation is repeated (drawing new
    random SNP filters when snp_filter_frac < 1) until every chromosome has
    succeeded.

    Parameters:
        input_file: path of the HDF5 genotype file.  It must hold an
            'indiv_ids' dataset and, when SNP filtering is used, a
            'chr<N>/snps' dataset per chromosome; everything else is read
            through get_genotype_data.
        out_file: path of the HDF5 file the result dictionary is written to
            (opened in append mode, created if missing).
        maf_thres: MAF threshold, passed on to get_genotype_data.
        figure_dir: directory for the PCA plot; None disables plotting.
        figure_fn: file name of the PCA plot (PDF).  If empty no plot is
            written, since there is no file to write to.
        snp_filter_frac: if < 1, each SNP is kept with this probability.
        indiv_filter_frac: if < 1, each individual is kept with this
            probability.
        chrom_ok_snp_dict: passed on unchanged to get_genotype_data.

    Returns:
        A dictionary with one entry per chromosome ('chr1' .. 'chr22'), each a
        dictionary with the keys 'evecs_leave_one_out', 'evals_leave_one_out',
        'cholesky_decomp_inv_snp_cov', 'K_leave_one_out', 'K_unscaled',
        'num_snps' and 'snp_cov_leave_one_out', plus the genome-wide entries
        'K_all_snps', 'num_all_snps', 'pcs' and 'pcs_var_exp'.

    Side effects:
        Selects the non-interactive matplotlib 'Agg' backend, writes out_file
        and (optionally) the PCA figure.
    """
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    print('Loading Genotype from %s' % input_file)
    # NOTE(review): opened read-only; this assumes get_genotype_data only reads
    # from the handle it is given.
    in_h5f = h5py.File(input_file, 'r')
    indiv_ids = in_h5f['indiv_ids'][...]
    indiv_filter = None
    if indiv_filter_frac < 1:
        indiv_filter = sp.array(
            sp.random.random(len(indiv_ids)) < indiv_filter_frac,
            dtype='bool')
        indiv_ids = indiv_ids[indiv_filter]
    assert len(sp.unique(indiv_ids)) == len(indiv_ids)
    num_indivs = len(indiv_ids)

    ok_chromosome_dict = {}

    # Chromosomes for which the decompositions have not succeeded yet.
    not_done = set(range(1, 23))
    while len(not_done) > 0:
        chromosome_dict = {}

        K_all_snps = sp.zeros((num_indivs, num_indivs), dtype='float32')
        num_all_snps = 0

        sum_indiv_genotypes_all_chrom = sp.zeros(num_indivs, dtype='float32')

        print('Calculating kinship')

        # First pass: per-chromosome unscaled kinship and genotype sums.
        for chrom in range(1, 23):
            print('Working on Chromosome %d' % chrom)
            chrom_str = 'chr%d' % chrom

            snp_filter = None
            if snp_filter_frac < 1:
                snp_filter = sp.random.random(len(
                    in_h5f[chrom_str]['snps'])) < snp_filter_frac

            g_dict = get_genotype_data(in_h5f,
                                       chrom,
                                       maf_thres,
                                       indiv_filter=indiv_filter,
                                       snp_filter=snp_filter,
                                       randomize_sign=True,
                                       snps_signs=None,
                                       chrom_ok_snp_dict=chrom_ok_snp_dict)

            norm_snps = g_dict['norm_snps']

            print('Calculating chromosome kinship')
            K_unscaled = sp.array(sp.dot(norm_snps.T, norm_snps),
                                  dtype='float32')
            assert sp.isclose(
                sp.sum(sp.diag(K_unscaled)) / (len(norm_snps) * num_indivs),
                1.0), '..bug'
            K_all_snps += K_unscaled
            num_all_snps += len(norm_snps)

            print('SNP-cov normalisation')
            # Accumulate the per-individual genotype sum exactly once per
            # chromosome, so that the genome-wide mean below is not inflated.
            sum_indiv_genotypes = sp.sum(norm_snps, 0)
            sum_indiv_genotypes_all_chrom += sum_indiv_genotypes
            mean_indiv_genotypes = sum_indiv_genotypes / len(norm_snps)
            norm_snps = norm_snps - mean_indiv_genotypes

            print('Calculating SNP covariance unscaled')

            snp_cov_unscaled = sp.array(sp.dot(norm_snps.T, norm_snps),
                                        dtype='float32')

            print('Storing and updating things')
            chromosome_dict[chrom_str] = {
                'K_unscaled': K_unscaled,
                'num_snps': len(norm_snps),
                'sum_indiv_genotypes': sum_indiv_genotypes,
                'snp_cov_unscaled': snp_cov_unscaled,
                'snps_signs': g_dict['snps_signs']
            }

            if snp_filter_frac < 1:
                chromosome_dict[chrom_str]['snp_filter'] = snp_filter

        mean_indiv_genotypes_all_chrom = sum_indiv_genotypes_all_chrom / num_all_snps
        print('Individual gentoype mean found:')
        print(mean_indiv_genotypes_all_chrom)

        print('Calculating chromosome-wise SNP-covariance and kinship matrices')
        # Second pass: leave-one-chromosome-out statistics for every chromosome
        # that is still pending.
        for chrom in range(1, 23):
            if chrom in not_done:
                print('Working on Chromosome %d' % chrom)
                chrom_str = 'chr%d' % chrom

                snp_cov_leave_one_out = sp.zeros((num_indivs, num_indivs),
                                                 dtype='float32')
                K_leave_one_out = sp.zeros((num_indivs, num_indivs),
                                           dtype='float32')
                num_snps_used = 0

                sum_indiv_genotypes = sp.zeros(num_indivs, dtype='float32')

                for chrom2 in range(1, 23):
                    chrom2_str = 'chr%d' % chrom2
                    if chrom2 != chrom:
                        sum_indiv_genotypes += chromosome_dict[chrom2_str][
                            'sum_indiv_genotypes']
                        K_leave_one_out += chromosome_dict[chrom2_str][
                            'K_unscaled']
                        num_snps_used += chromosome_dict[chrom2_str][
                            'num_snps']
                        assert sp.isclose(
                            sp.sum(sp.diag(K_leave_one_out)) /
                            (num_snps_used * num_indivs), 1.0), '..bug'

                mean_indiv_genotypes = sum_indiv_genotypes / num_snps_used

                for chrom2 in range(1, 23):
                    chrom2_str = 'chr%d' % chrom2
                    if chrom2 != chrom:
                        print('Loading SNPs')
                        # Reload the SNPs with the signs (and filter) that were
                        # used in the first pass.
                        snps_signs = chromosome_dict[chrom2_str]['snps_signs']
                        # 'snp_filter' is only stored when SNP filtering is
                        # active; fall back to None (no filter) otherwise.
                        snp_filter = chromosome_dict[chrom2_str].get(
                            'snp_filter')
                        g_dict = get_genotype_data(
                            in_h5f,
                            chrom2,
                            maf_thres,
                            indiv_filter=indiv_filter,
                            snp_filter=snp_filter,
                            randomize_sign=True,
                            snps_signs=snps_signs,
                            chrom_ok_snp_dict=chrom_ok_snp_dict)
                        norm_snps = g_dict['norm_snps']
                        print('SNP-cov normalisation')
                        norm_snps = norm_snps - mean_indiv_genotypes

                        print('Calculating SNP covariance unscaled')
                        snp_cov_unscaled = sp.dot(norm_snps.T, norm_snps)
                        snp_cov_leave_one_out += snp_cov_unscaled

                snp_cov_leave_one_out = snp_cov_leave_one_out / num_snps_used

                K_leave_one_out = K_leave_one_out / num_snps_used
                # NOTE(review): sp.diag returns the diagonal as a vector here,
                # so it is broadcast across the rows of the matrix; confirm
                # that this is the intended sanity check.
                assert (K_leave_one_out -
                        sp.diag(K_leave_one_out)).max() < 0.1, '..bug'

                # Try the decompositions in double precision first, then in
                # single precision.  If both fail the chromosome stays in
                # not_done and is retried in the next round.
                try:
                    cholesky_decomp_inv_snp_cov = linalg.cholesky(
                        linalg.pinv(
                            sp.array(snp_cov_leave_one_out, dtype='float64')))
                    evals, evecs = linalg.eig(
                        sp.array(K_leave_one_out, dtype='float64'))
                except Exception:
                    try:
                        cholesky_decomp_inv_snp_cov = linalg.cholesky(
                            linalg.pinv(
                                sp.array(snp_cov_leave_one_out,
                                         dtype='float32')))
                        evals, evecs = linalg.eig(
                            sp.array(K_leave_one_out, dtype='float32'))
                    except Exception:
                        print('Failed when obtaining the Cholesky decomposotion or eigen decomposition')
                        print('Moving on, trying again later.')
                        continue

                # Eigenvalues (and matching eigenvectors) in ascending order.
                sort_indices = sp.argsort(evals, )
                ordered_evals = evals[sort_indices]
                print(ordered_evals[-10:] / sp.sum(ordered_evals))
                ordered_evecs = evecs[:, sort_indices]
                d = {}
                d['evecs_leave_one_out'] = ordered_evecs
                d['evals_leave_one_out'] = ordered_evals
                d['cholesky_decomp_inv_snp_cov'] = cholesky_decomp_inv_snp_cov
                d['K_leave_one_out'] = K_leave_one_out
                d['K_unscaled'] = chromosome_dict[chrom_str]['K_unscaled']
                d['num_snps'] = chromosome_dict[chrom_str]['num_snps']
                d['snp_cov_leave_one_out'] = snp_cov_leave_one_out
                ok_chromosome_dict[chrom_str] = d
                not_done.remove(chrom)

    # While loop ends here.
    K_all_snps = K_all_snps / float(num_all_snps)
    in_h5f.close()
    ok_chromosome_dict['K_all_snps'] = K_all_snps
    ok_chromosome_dict['num_all_snps'] = num_all_snps

    assert sp.sum((ok_chromosome_dict['chr1']['K_leave_one_out'] -
                   ok_chromosome_dict['chr2']['K_leave_one_out'])**
                  2) != 0, 'Kinships are probably too similar.'

    print('Calculating PCAs')
    evals, evecs = linalg.eigh(sp.array(
        K_all_snps, dtype='float64'))  # PCA via eigen decomp
    evals[evals < 0] = 0
    # Principal components in order of decreasing eigenvalue.
    sort_indices = sp.argsort(evals, )[::-1]
    ordered_evals = evals[sort_indices]
    print(ordered_evals[:10] / sp.sum(ordered_evals))
    pcs = evecs[:, sort_indices]

    tot = sum(evals)
    var_exp = [(i / tot) * 100 for i in sorted(evals, reverse=True)]
    print('Total variance explained: %s' % sp.sum(var_exp))

    ok_chromosome_dict['pcs'] = pcs
    ok_chromosome_dict['pcs_var_exp'] = var_exp

    # Only plot when there is a file to write to; with an empty file name the
    # target path would be a directory and saving would fail.
    if figure_dir is not None and figure_fn:
        plt.clf()
        plt.plot(pcs[:, 0], pcs[:, 1], 'k.')
        plt.title("Overall PCA")
        plt.xlabel('PC1')
        plt.ylabel('PC2')
        plt.tight_layout()
        plt.savefig(figure_dir + '/' + figure_fn, format='pdf')
        plt.clf()

    # Explicit append mode ('a' was the implicit default of older h5py
    # releases; newer releases default to read-only).
    with h5py.File(out_file, 'a') as out_h5f:
        hu.dict_to_hdf5(ok_chromosome_dict, out_h5f)

    return ok_chromosome_dict
Exemple #6
0
def calc_kinship(input_file='Data/1Kgenomes/1K_genomes_v3.hdf5' , out_file='Data/1Kgenomes/kinship.hdf5',
                  maf_thres=0.01, figure_dir='', figure_fn='', snp_filter_frac=1, indiv_filter_frac=1,
                  chrom_ok_snp_dict=None):
    """
    Calculate kinship matrices, leave-one-chromosome-out SNP covariances and
    genome-wide principal components, and store them in an HDF5 file.

    For each of chromosomes 1-22 the kinship and SNP covariance are accumulated
    over the other 21 chromosomes.  If the Cholesky or eigen decomposition
    fails for a chromosome, the whole calculation is repeated (drawing new
    random SNP filters when snp_filter_frac < 1) until every chromosome has
    succeeded.

    Parameters:
        input_file: path of the HDF5 genotype file.  It must hold an
            'indiv_ids' dataset and, when SNP filtering is used, a
            'chr<N>/snps' dataset per chromosome; everything else is read
            through get_genotype_data.
        out_file: path of the HDF5 file the result dictionary is written to
            (opened in append mode, created if missing).
        maf_thres: MAF threshold, passed on to get_genotype_data.
        figure_dir: directory for the PCA plot; None disables plotting.
        figure_fn: file name of the PCA plot (PDF).  If empty no plot is
            written, since there is no file to write to.
        snp_filter_frac: if < 1, each SNP is kept with this probability.
        indiv_filter_frac: if < 1, each individual is kept with this
            probability.
        chrom_ok_snp_dict: passed on unchanged to get_genotype_data.

    Returns:
        A dictionary with one entry per chromosome ('chr1' .. 'chr22'), each a
        dictionary with the keys 'evecs_leave_one_out', 'evals_leave_one_out',
        'cholesky_decomp_inv_snp_cov', 'K_leave_one_out', 'K_unscaled',
        'num_snps' and 'snp_cov_leave_one_out', plus the genome-wide entries
        'K_all_snps', 'num_all_snps', 'pcs' and 'pcs_var_exp'.

    Side effects:
        Selects the non-interactive matplotlib 'Agg' backend, writes out_file
        and (optionally) the PCA figure.
    """
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    print('Loading Genotype from %s' % input_file)
    # NOTE(review): opened read-only; this assumes get_genotype_data only reads
    # from the handle it is given.
    in_h5f = h5py.File(input_file, 'r')
    indiv_ids = in_h5f['indiv_ids'][...]
    indiv_filter = None
    if indiv_filter_frac < 1:
        indiv_filter = sp.array(sp.random.random(len(indiv_ids)) < indiv_filter_frac, dtype='bool')
        indiv_ids = indiv_ids[indiv_filter]
    assert len(sp.unique(indiv_ids)) == len(indiv_ids)
    num_indivs = len(indiv_ids)

    ok_chromosome_dict = {}

    # Chromosomes for which the decompositions have not succeeded yet.
    not_done = set(range(1, 23))
    while len(not_done) > 0:
        chromosome_dict = {}

        K_all_snps = sp.zeros((num_indivs, num_indivs), dtype='float32')
        num_all_snps = 0

        sum_indiv_genotypes_all_chrom = sp.zeros(num_indivs, dtype='float32')

        print('Calculating kinship')

        # First pass: per-chromosome unscaled kinship and genotype sums.
        for chrom in range(1, 23):
            print('Working on Chromosome %d' % chrom)
            chrom_str = 'chr%d' % chrom

            snp_filter = None
            if snp_filter_frac < 1:
                snp_filter = sp.random.random(len(in_h5f[chrom_str]['snps'])) < snp_filter_frac

            g_dict = get_genotype_data(in_h5f, chrom, maf_thres, indiv_filter=indiv_filter,
                        snp_filter=snp_filter, randomize_sign=True, snps_signs=None, chrom_ok_snp_dict=chrom_ok_snp_dict)

            norm_snps = g_dict['norm_snps']

            print('Calculating chromosome kinship')
            K_unscaled = sp.array(sp.dot(norm_snps.T, norm_snps), dtype='float32')
            assert sp.isclose(sp.sum(sp.diag(K_unscaled)) / (len(norm_snps) * num_indivs), 1.0), '..bug'
            K_all_snps += K_unscaled
            num_all_snps += len(norm_snps)

            print('SNP-cov normalisation')
            # Accumulate the per-individual genotype sum exactly once per
            # chromosome, so that the genome-wide mean below is not inflated.
            sum_indiv_genotypes = sp.sum(norm_snps, 0)
            sum_indiv_genotypes_all_chrom += sum_indiv_genotypes
            mean_indiv_genotypes = sum_indiv_genotypes / len(norm_snps)
            norm_snps = norm_snps - mean_indiv_genotypes

            print('Calculating SNP covariance unscaled')

            snp_cov_unscaled = sp.array(sp.dot(norm_snps.T, norm_snps), dtype='float32')

            print('Storing and updating things')
            chromosome_dict[chrom_str] = {'K_unscaled':K_unscaled, 'num_snps':len(norm_snps),
                                          'sum_indiv_genotypes':sum_indiv_genotypes,
                                          'snp_cov_unscaled':snp_cov_unscaled,
                                          'snps_signs':g_dict['snps_signs']}

            if snp_filter_frac < 1:
                chromosome_dict[chrom_str]['snp_filter'] = snp_filter

        mean_indiv_genotypes_all_chrom = sum_indiv_genotypes_all_chrom / num_all_snps
        print('Individual gentoype mean found:')
        print(mean_indiv_genotypes_all_chrom)

        print('Calculating chromosome-wise SNP-covariance and kinship matrices')
        # Second pass: leave-one-chromosome-out statistics for every chromosome
        # that is still pending.
        for chrom in range(1, 23):
            if chrom in not_done:
                print('Working on Chromosome %d' % chrom)
                chrom_str = 'chr%d' % chrom

                snp_cov_leave_one_out = sp.zeros((num_indivs, num_indivs), dtype='float32')
                K_leave_one_out = sp.zeros((num_indivs, num_indivs), dtype='float32')
                num_snps_used = 0

                sum_indiv_genotypes = sp.zeros(num_indivs, dtype='float32')

                for chrom2 in range(1, 23):
                    chrom2_str = 'chr%d' % chrom2
                    if chrom2 != chrom:
                        sum_indiv_genotypes += chromosome_dict[chrom2_str]['sum_indiv_genotypes']
                        K_leave_one_out += chromosome_dict[chrom2_str]['K_unscaled']
                        num_snps_used += chromosome_dict[chrom2_str]['num_snps']
                        assert sp.isclose(sp.sum(sp.diag(K_leave_one_out)) / (num_snps_used * num_indivs), 1.0), '..bug'

                mean_indiv_genotypes = sum_indiv_genotypes / num_snps_used

                for chrom2 in range(1, 23):
                    chrom2_str = 'chr%d' % chrom2
                    if chrom2 != chrom:
                        print('Loading SNPs')
                        # Reload the SNPs with the signs (and filter) that were
                        # used in the first pass.
                        snps_signs = chromosome_dict[chrom2_str]['snps_signs']
                        # 'snp_filter' is only stored when SNP filtering is
                        # active; fall back to None (no filter) otherwise.
                        snp_filter = chromosome_dict[chrom2_str].get('snp_filter')
                        g_dict = get_genotype_data(in_h5f, chrom2, maf_thres, indiv_filter=indiv_filter,
                                                   snp_filter=snp_filter, randomize_sign=True,
                                                   snps_signs=snps_signs, chrom_ok_snp_dict=chrom_ok_snp_dict)
                        norm_snps = g_dict['norm_snps']
                        print('SNP-cov normalisation')
                        norm_snps = norm_snps - mean_indiv_genotypes

                        print('Calculating SNP covariance unscaled')
                        snp_cov_unscaled = sp.dot(norm_snps.T, norm_snps)
                        snp_cov_leave_one_out += snp_cov_unscaled

                snp_cov_leave_one_out = snp_cov_leave_one_out / num_snps_used

                K_leave_one_out = K_leave_one_out / num_snps_used
                # NOTE(review): sp.diag returns the diagonal as a vector here,
                # so it is broadcast across the rows of the matrix; confirm
                # that this is the intended sanity check.
                assert (K_leave_one_out - sp.diag(K_leave_one_out)).max() < 0.1, '..bug'

                # Try the decompositions in double precision first, then in
                # single precision.  If both fail the chromosome stays in
                # not_done and is retried in the next round.
                try:
                    cholesky_decomp_inv_snp_cov = linalg.cholesky(linalg.pinv(sp.array(snp_cov_leave_one_out, dtype='float64')))
                    evals, evecs = linalg.eig(sp.array(K_leave_one_out, dtype='float64'))
                except Exception:
                    try:
                        cholesky_decomp_inv_snp_cov = linalg.cholesky(linalg.pinv(sp.array(snp_cov_leave_one_out, dtype='float32')))
                        evals, evecs = linalg.eig(sp.array(K_leave_one_out, dtype='float32'))
                    except Exception:
                        print('Failed when obtaining the Cholesky decomposotion or eigen decomposition')
                        print('Moving on, trying again later.')
                        continue

                # Eigenvalues (and matching eigenvectors) in ascending order.
                sort_indices = sp.argsort(evals,)
                ordered_evals = evals[sort_indices]
                print(ordered_evals[-10:] / sp.sum(ordered_evals))
                ordered_evecs = evecs[:, sort_indices]
                d = {}
                d['evecs_leave_one_out'] = ordered_evecs
                d['evals_leave_one_out'] = ordered_evals
                d['cholesky_decomp_inv_snp_cov'] = cholesky_decomp_inv_snp_cov
                d['K_leave_one_out'] = K_leave_one_out
                d['K_unscaled'] = chromosome_dict[chrom_str]['K_unscaled']
                d['num_snps'] = chromosome_dict[chrom_str]['num_snps']
                d['snp_cov_leave_one_out'] = snp_cov_leave_one_out
                ok_chromosome_dict[chrom_str] = d
                not_done.remove(chrom)

    # While loop ends here.
    K_all_snps = K_all_snps / float(num_all_snps)
    in_h5f.close()
    ok_chromosome_dict['K_all_snps'] = K_all_snps
    ok_chromosome_dict['num_all_snps'] = num_all_snps

    assert sp.sum((ok_chromosome_dict['chr1']['K_leave_one_out'] - ok_chromosome_dict['chr2']['K_leave_one_out']) ** 2) != 0 , 'Kinships are probably too similar.'

    print('Calculating PCAs')
    evals, evecs = linalg.eigh(sp.array(K_all_snps, dtype='float64'))  # PCA via eigen decomp
    evals[evals < 0] = 0
    # Principal components in order of decreasing eigenvalue.
    sort_indices = sp.argsort(evals,)[::-1]
    ordered_evals = evals[sort_indices]
    print(ordered_evals[:10] / sp.sum(ordered_evals))
    pcs = evecs[:, sort_indices]

    tot = sum(evals)
    var_exp = [(i / tot) * 100 for i in sorted(evals, reverse=True)]
    print('Total variance explained: %s' % sp.sum(var_exp))

    ok_chromosome_dict['pcs'] = pcs
    ok_chromosome_dict['pcs_var_exp'] = var_exp

    # Only plot when there is a file to write to; with an empty file name the
    # target path would be a directory and saving would fail.
    if figure_dir is not None and figure_fn:
        plt.clf()
        plt.plot(pcs[:, 0], pcs[:, 1], 'k.')
        plt.title("Overall PCA")
        plt.xlabel('PC1')
        plt.ylabel('PC2')
        plt.tight_layout()
        plt.savefig(figure_dir + '/' + figure_fn, format='pdf')
        plt.clf()

    # Explicit append mode ('a' was the implicit default of older h5py
    # releases; newer releases default to read-only).
    with h5py.File(out_file, 'a') as out_h5f:
        hu.dict_to_hdf5(ok_chromosome_dict, out_h5f)

    return ok_chromosome_dict