Example #1
def _test_():
    # Imports assumed from the surrounding codebase (cf. the explicit
    # imports in Examples #4-#9).
    import os
    import scipy as sp
    from matplotlib import pyplot as plt
    import genotypes
    import phenotypes
    import kinship
    import linear_models as lm
    import analyze_gwas_results as agr
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000) 
    
    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print snps
    snps = snps.T
    snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
    snps = snps.T
    print snps, snps.shape
    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits_w_snps_to_hdf5(snps, hdf5_file_prefix=file_prefix,
                                           num_traits=30, p=0.1)
    
    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list['phenotypes']):
        
        K = kinship.calc_ibd_kinship(snps)
        K = kinship.scale_k(K)
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        plt.clf()
        
        
        agr.plot_simple_qqplots_pvals('%s_%d' % (file_prefix, i),
                                      [ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]],
                                      result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200, max_neg_log_val=3)
        
        # Cholesky permutations..
        res = lm.emmax_perm_test(singleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        singletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(doubleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        doubletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(common_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        common_thres.append(res['threshold_05'][0])
        
        #ATT permutations (Implement)
        
        #PC permutations (Implement)
        

    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
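For orientation, the kinship matrix used throughout this example is, in effect, a scaled genotype cross-product. The sketch below is an assumption about what kinship.calc_ibd_kinship and kinship.scale_k compute, not the library's actual code:

import numpy as np

def ibd_kinship_sketch(snps):
    # snps: m x n (SNPs x individuals); rows standardized as in _test_() above.
    m, n = snps.shape
    return np.dot(snps.T, snps) / m

def scale_k_sketch(K):
    # Normalize so the mean diagonal element is 1 (assumed behaviour of scale_k).
    return K / np.mean(np.diag(K))

# Toy usage: 1000 standardized SNPs for 50 individuals.
X = np.random.randn(1000, 50)
K = scale_k_sketch(ibd_kinship_sketch(X))
print(np.mean(np.diag(K)))  # ~1.0 by construction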
Example #2
def _test_scz_():
    # Load Schizophrenia data
    # Imports assumed from the surrounding codebase (cf. the explicit
    # imports in Examples #4-#9).
    import scipy as sp
    from matplotlib import pyplot as plt
    import genotypes
    import phenotypes
    import kinship
    import linear_models as lm
    import analyze_gwas_results as agr

    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    test_snps = sp.vstack([singleton_snps, doubleton_snps])
    print snps
    phen_list = phenotypes.simulate_traits(
        snps, hdf5_file_prefix='/home/bv25/tmp/test', num_traits=30, p=1.0)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list):

        K = kinship.calc_ibd_kinship(snps)
        K = kinship.scale_k(K)
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('/home/bv25/tmp/test_%d_phen.png' % i)
        plt.clf()
        agr.plot_simple_qqplots_pvals('/home/bv25/tmp/test_%d' % i, [
            ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]
        ],
                                      result_labels=[
                                          'Common SNPs', 'Singletons',
                                          'Doubletons'
                                      ],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200,
                                      max_neg_log_val=3)

        # Now permutations..
        res = lm.emmax_perm_test(singleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        singletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(doubleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        doubletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(common_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        common_thres.append(res['threshold_05'][0])
    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
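The threshold_05 values collected above are genome-wide 5% significance thresholds from permutations. A minimal sketch of that logic, with pval_fn standing in (hypothetically) for a single association scan:

import numpy as np

def perm_threshold_sketch(pval_fn, snps, y, num_perm=1000, alpha=0.05):
    # For each permutation of y, record the smallest p-value in the scan;
    # the empirical alpha-quantile of these minima is the threshold.
    min_ps = np.empty(num_perm)
    for j in range(num_perm):
        min_ps[j] = np.min(pval_fn(snps, np.random.permutation(y)))
    return np.percentile(min_ps, 100 * alpha)

# Toy usage with a null scan of 1000 uniform "p-values"; the result is
# close to 1.0 / (20 * 1000.0), the reference value printed above.
print(perm_threshold_sketch(lambda s, yy: np.random.uniform(size=1000),
                            None, np.arange(500.0)))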
Example #3
def _test_scz_():
    # Load Schizophrenia data
    # Imports assumed from the surrounding codebase (cf. the explicit
    # imports in Examples #4-#9).
    import scipy as sp
    from matplotlib import pyplot as plt
    import genotypes
    import phenotypes
    import kinship
    import linear_models as lm
    import analyze_gwas_results as agr
    
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000) 
    
    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    test_snps = sp.vstack([singleton_snps, doubleton_snps])
    print snps
    phen_list = phenotypes.simulate_traits(snps, hdf5_file_prefix='/home/bv25/tmp/test', num_traits=30, p=1.0)
    
    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list):
        
        K = kinship.calc_ibd_kinship(snps)
        K = kinship.scale_k(K)
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('/home/bv25/tmp/test_%d_phen.png' % i)
        plt.clf()
        agr.plot_simple_qqplots_pvals('/home/bv25/tmp/test_%d' % i,
                                      [ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]],
                                      result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200, max_neg_log_val=3)
        
        # Now permutations..
        res = lm.emmax_perm_test(singleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        singletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(doubleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        doubletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(common_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        common_thres.append(res['threshold_05'][0])
    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
Example #4
def _test_GxE_mixed_model_gwas(
        num_indivs=1000,
        num_snps=10000,
        num_trait_pairs=10,
        plot_prefix='/Users/bjarnivilhjalmsson/tmp/test'):
    """
    Test for the multiple environment mixed model
    
    Simulates correlated trait pairs with exponentially distributed effects. 
    """

    import simulations
    import kinship
    import scipy as sp
    import linear_models as lm
    import gwaResults as gr
    # NOTE: these assignments override the keyword arguments above.
    num_trait_pairs = 10
    num_indivs = 200
    num_snps = 10000
    num_causals = 10  # Number of causal SNPs per trait (in total there may be up to twice that, depending on genetic correlation)

    # Simulating (unlinked) genotypes and phenotype pairs w. random positive correlation
    d = simulations.get_simulated_data(num_indivs=num_indivs,
                                       num_snps=num_snps,
                                       num_trait_pairs=num_trait_pairs,
                                       num_causals=num_causals)

    for i in range(num_trait_pairs):
        # The two different phenotypes.
        phen1 = d['trait_pairs'][i][0]
        phen2 = d['trait_pairs'][i][1]
        # Stacking up the two phenotypes into one vector.
        Y = sp.hstack([phen1, phen2])

        # The higher genetic correlation, the better the model fit (since we assume genetic correlation is 1).
        print 'The genetic correlation between the two traits is %0.4f' % d[
            'rho_est_list'][i][0, 1]

        # The genotypes
        sd = d['sd']
        snps = sd.get_snps()
        # Doubling the genotype data as well.
        snps = sp.hstack([snps, snps])

        # Calculating the kinship using the duplicated genotypes
        K = kinship.calc_ibd_kinship(snps)
        print ''

        # Calculating the environment vector
        E = sp.zeros((2 * num_indivs, 1))
        E[num_indivs:, 0] = 1

        print 'Here are the dimensions:'
        print 'Y.shape: ', Y.shape
        print 'snps.shape: ', snps.shape
        print 'E.shape: ', E.shape
        print 'K.shape: ', K.shape

        mm_results = lm.emmax_w_two_env(snps, Y, K, E)
        gtres = mm_results["gt_res"]
        gtgres = mm_results["gt_g_res"]
        gres = mm_results["g_res"]

        # Figuring out which loci are causal
        highlight_loci = sp.array(
            sd.get_chr_pos_list())[d['causal_indices_list'][i]]
        highlight_loci = highlight_loci.tolist()
        highlight_loci.sort()

        # Plotting stuff
        res = gr.Result(scores=gtres['ps'], snps_data=sd)
        res.plot_manhattan(png_file='%s_%d_gtres_manhattan.png' %
                           (plot_prefix, i),
                           percentile=50,
                           highlight_loci=highlight_loci,
                           plot_bonferroni=True,
                           neg_log_transform=True)
        res.plot_qq('%s_%d_gtres_qq.png' % (plot_prefix, i))
        res = gr.Result(scores=gtgres['ps'], snps_data=sd)
        res.plot_manhattan(png_file='%s_%d_gtgres_manhattan.png' %
                           (plot_prefix, i),
                           percentile=50,
                           highlight_loci=highlight_loci,
                           plot_bonferroni=True,
                           neg_log_transform=True)
        res.plot_qq('%s_%d_gtgres_qq.png' % (plot_prefix, i))
        res = gr.Result(scores=gres['ps'], snps_data=sd)
        res.plot_manhattan(png_file='%s_%d_gres_manhattan.png' %
                           (plot_prefix, i),
                           percentile=50,
                           highlight_loci=highlight_loci,
                           plot_bonferroni=True,
                           neg_log_transform=True)
        res.plot_qq('%s_%d_gres_qq.png' % (plot_prefix, i))
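To make the stacked design concrete: with the phenotypes concatenated, the genotypes duplicated, and E the environment indicator, a genotype-by-environment effect can be tested by comparing nested linear models. The sketch below uses plain OLS and ignores the random effect, so it illustrates the design only, not emmax_w_two_env itself:

import numpy as np
from scipy import stats

def gxe_f_test_sketch(g, y, E):
    # g, y, E: length-2n vectors (duplicated genotype, stacked phenotypes,
    # 0/1 environment indicator, as constructed above).
    n = len(y)

    def rss(X):
        beta = np.linalg.pinv(X).dot(y)
        r = y - X.dot(beta)
        return r.dot(r)

    X0 = np.column_stack([np.ones(n), E, g])   # genotype + environment
    X1 = np.column_stack([X0, g * E])          # ... + GxE interaction
    f = (rss(X0) - rss(X1)) / (rss(X1) / (n - X1.shape[1]))
    return stats.f.sf(f, 1, n - X1.shape[1])

# Toy usage:
n = 200
g = np.tile(np.random.binomial(1, 0.3, n), 2).astype(float)
E = np.repeat([0., 1.], n)
y = 0.5 * g * E + np.random.randn(2 * n)
print(gxe_f_test_sketch(g, y, E))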
Example #5
def leave_k_out_blup(
        num_cvs=20,
        genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/',
        k_thres=0.5):
    """

    """
    import h5py
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr
    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    res_dict = {}
    for phenotype in phenotypes:
        env_dict = {}
        for env in envs:
            print phenotype, env
            s1 = time.time()
            #Load data..
            d = hdf5_data.coordinate_cegs_genotype_phenotype(phen_dict,
                                                             phenotype,
                                                             env,
                                                             k_thres=k_thres)
            Y_means = d['Y_means']
            snps = d['snps']
            assert sp.all(sp.negative(sp.isnan(snps))), 'WTF?'
            K = kinship.calc_ibd_kinship(snps)
            print '\nKinship calculated'
            assert sp.all(sp.negative(sp.isnan(K))), 'WTF?'
            n = len(Y_means)
            #partition genotypes in k parts.
            gt_ids = d['gt_ids']
            num_ids = len(gt_ids)
            chunk_size = num_ids / num_cvs

            #Create k CV sets of prediction and validation data

            cv_chunk_size = int((n / num_cvs) + 1)
            ordering = sp.random.permutation(n)

            a = sp.arange(n)
            osb_ys = []
            pred_ys = []
            p_herits = []
            for cv_i, i in enumerate(range(0, n, cv_chunk_size)):
                cv_str = 'cv_%d' % cv_i
                #print 'Working on CV %d' % cv_i
                end_i = min(n, i + cv_chunk_size)
                validation_filter = sp.in1d(a, ordering[i:end_i])
                training_filter = sp.negative(validation_filter)

                train_snps = snps[:, training_filter]
                val_snps = snps[:, validation_filter]

                train_Y = Y_means[training_filter]
                val_Y = Y_means[validation_filter]

                #Calc. kinship
                K_train = K[training_filter, :][:, training_filter]
                K_cross = K[validation_filter, :][:, training_filter]
                #Do gBLUP
                lmm = lm.LinearMixedModel(train_Y)
                lmm.add_random_effect(K_train)
                r1 = lmm.get_REML()

                #Now the BLUP.
                y_mean = sp.mean(lmm.Y)
                Y = lmm.Y - y_mean
                p_herit = r1['pseudo_heritability']
                p_herits.append(p_herit)
                # delta = (1 - p_herit) / p_herit
                # if K_inverse == None:
                #     K_inverse = K.I
                # M = (sp.eye(K.shape[0]) + delta * K_inverse)
                # u_blup = M.I * Y
                M = sp.mat(p_herit * sp.mat(K_train) +
                           (1 - p_herit) * sp.eye(K_train.shape[0]))
                u_mean_pred = sp.array(K_cross * (M.I * Y)).flatten()
                osb_ys.extend(val_Y)
                pred_ys.extend(u_mean_pred)
            corr = sp.corrcoef(osb_ys, pred_ys)[1, 0]
            print 'Correlation:', corr
            r2 = corr**2
            print 'R2:', r2
            mean_herit = sp.mean(p_herits)
            print 'Avg. heritability:', mean_herit
            env_dict[env] = {
                'R2': r2,
                'obs_y': osb_ys,
                'pred_y': pred_ys,
                'corr': corr,
                'avg_herit': mean_herit
            }

        res_dict[phenotype] = env_dict

    res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_%d_BLUP_results_kthres_%0.1f.hdf5' % (
        num_cvs, k_thres)
    h5f = h5py.File(res_hdf5_file)
    for phenotype in phenotypes:
        phen_g = h5f.create_group(phenotype)
        for env in envs:
            d = res_dict[phenotype][env]
            env_g = phen_g.create_group(env)
            env_g.create_dataset('R2', data=[d['R2']])
            env_g.create_dataset('corr', data=[d['corr']])
            env_g.create_dataset('obs_y', data=d['obs_y'])
            env_g.create_dataset('pred_y', data=d['pred_y'])
            env_g.create_dataset('avg_herit', data=[d['avg_herit']])
    h5f.close()
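The BLUP step above compresses to a few lines: with pseudo-heritability h2, the phenotypic covariance on the training set is V = h2*K_train + (1-h2)*I, and the prediction for held-out lines is K_cross V^{-1} (y - mean(y)). A standalone restatement of that step:

import numpy as np

def gblup_predict_sketch(K_train, K_cross, y_train, h2):
    # Mirrors the M / u_mean_pred computation in leave_k_out_blup above,
    # using a linear solve instead of an explicit matrix inverse.
    V = h2 * K_train + (1.0 - h2) * np.eye(K_train.shape[0])
    return K_cross.dot(np.linalg.solve(V, y_train - np.mean(y_train)))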
Example #6
def perform_cegs_gwas(kinship_type='ibd', phen_type='medians'):
    """
    Perform a simple MLM GWAS for the 8 traits
    """
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr
    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    for phenotype in phenotypes:
        for env in envs:
            print phenotype, env
            s1 = time.time()
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env)
            print 'Calculating kinship'
            if kinship_type == 'ibs':
                K = kinship.calc_ibs_kinship(d['snps'])
            elif kinship_type == 'ibd':
                K = kinship.calc_ibd_kinship(d['snps'])
            else:
                raise NotImplementedError

            if phen_type == 'means':
                lmm = lm.LinearMixedModel(d['Y_means'])
            elif phen_type == 'medians':
                lmm = lm.LinearMixedModel(d['Y_medians'])
            else:
                raise NotImplementedError
            lmm.add_random_effect(K)

            print "Running EMMAX"
            res = lmm.emmax_f_test(d['snps'], emma_num=1000)
            print 'Mean p-value:', sp.mean(res['ps'])

            secs = time.time() - s1
            if secs > 60:
                mins = int(secs) / 60
                secs = secs - mins * 60
                print 'Took %d mins and %f seconds.' % (mins, secs)
            else:
                print 'Took %f seconds.' % (secs)

            #Now generating QQ-plots
            label_str = '%s_%s_%s_%s' % (kinship_type, phenotype, env,
                                         phen_type)
            agr.plot_simple_qqplots_pvals(
                '/Users/bjarnivilhjalmsson/data/tmp/cegs_qq_%s' % (label_str),
                [res['ps']],
                result_labels=[label_str],
                line_colors=['green'],
                num_dots=1000,
                title=None,
                max_neg_log_val=6)

            # Perform multiple loci mixed model GWAS
            chromosomes = d['positions'][:, 0]
            positions = sp.array(d['positions'][:, 1], 'int32')
            x_positions = []
            y_log_pvals = []
            colors = []
            x_shift = 0
            for i, chrom in enumerate(sp.unique(chromosomes)):
                if chrom in ['2L', '2LHet', '3L', '3LHet', '4', 'X', 'XHet']:
                    colors.append('c')
                else:  # chrom in ['2R', '2RHet', '3R', '3RHet', 'U', 'Uextra']
                    #Toss U and Hets
                    colors.append('m')
                chrom_filter = sp.in1d(chromosomes, chrom)
                positions_slice = positions[chrom_filter]
                x_positions.append(positions_slice + x_shift)
                x_shift += positions_slice.max()
                log_ps_slice = -sp.log10(res['ps'][chrom_filter])
                y_log_pvals.append(log_ps_slice)

            m = len(positions)
            log_bonf = -sp.log10(1 / (20.0 * m))
            print m, log_bonf

            # Plot manhattan plots?
            plt.figure(figsize=(12, 4))
            plt.axes([0.03, 0.1, 0.95, 0.8])
            for i, chrom in enumerate(sp.unique(chromosomes)):
                plt.plot(x_positions[i],
                         y_log_pvals[i],
                         c=colors[i],
                         ls='',
                         marker='.')
            xmin, xmax = plt.xlim()
            plt.hlines(log_bonf,
                       xmin,
                       xmax,
                       colors='k',
                       linestyles='--',
                       alpha=0.5)
            plt.title('%s, %s' % (phenotype, env))
            plt.savefig(
                '/Users/bjarnivilhjalmsson/data/tmp/cegs_gwas_%s_%s_%s_%s.png'
                % (kinship_type, phenotype, env, phen_type))
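On the kinship_type switch: IBS kinship scores raw genotype sharing, while the IBD-style estimate used elsewhere in these examples is a standardized cross-product. Toy versions of both, offered as assumptions about what calc_ibs_kinship and calc_ibd_kinship do:

import numpy as np

def ibs_kinship_sketch(snps):
    # Fraction of SNPs at which two individuals carry the same genotype.
    m, n = snps.shape
    K = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            K[i, j] = K[j, i] = np.mean(snps[:, i] == snps[:, j])
    return K

def ibd_kinship_sketch(snps):
    # Cross-product of row-standardized genotypes (assumes polymorphic SNPs).
    X = (snps - snps.mean(1, keepdims=True)) / snps.std(1, keepdims=True)
    return X.T.dot(X) / X.shape[0]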
Example #7
def _test_GxE_mixed_model_gwas(num_indivs=1000, num_snps=10000, num_trait_pairs=10,
                               plot_prefix='/Users/bjarnivilhjalmsson/tmp/test'):
    """
    Test for the multiple environment mixed model

    Simulates correlated trait pairs with exponentially distributed effects. 
    """

    import simulations
    import kinship
    import scipy as sp
    import linear_models as lm
    import gwaResults as gr
    # NOTE: these assignments override the keyword arguments above.
    num_trait_pairs = 10
    num_indivs = 200
    num_snps = 10000
    # Number of causal SNPs per trait (in total there may be up to twice that,
    # depending on genetic correlation)
    num_causals = 10

    # Simulating (unlinked) genotypes and phenotype pairs w. random positive
    # correlation
    d = simulations.get_simulated_data(num_indivs=num_indivs, num_snps=num_snps,
                                       num_trait_pairs=num_trait_pairs, num_causals=num_causals)

    for i in range(num_trait_pairs):
        # The two different phenotypes.
        phen1 = d['trait_pairs'][i][0]
        phen2 = d['trait_pairs'][i][1]
        # Stacking up the two phenotypes into one vector.
        Y = sp.hstack([phen1, phen2])

        # The higher genetic correlation, the better the model fit (since we
        # assume genetic correlation is 1).
        print 'The genetic correlation between the two traits is %0.4f' % d['rho_est_list'][i][0, 1]

        # The genotypes
        sd = d['sd']
        snps = sd.get_snps()
        # Doubling the genotype data as well.
        snps = sp.hstack([snps, snps])

        # Calculating the kinship using the duplicated genotypes
        K = kinship.calc_ibd_kinship(snps)
        print ''

        # Calculating the environment vector
        E = sp.zeros((2 * num_indivs, 1))
        E[num_indivs:, 0] = 1

        print 'Here are the dimensions:'
        print 'Y.shape: ', Y.shape
        print 'snps.shape: ', snps.shape
        print 'E.shape: ', E.shape
        print 'K.shape: ', K.shape

        mm_results = lm.emmax_w_two_env(snps, Y, K, E)
        gtres = mm_results["gt_res"]
        gtgres = mm_results["gt_g_res"]
        gres = mm_results["g_res"]

        # Figuring out which loci are causal
        highlight_loci = sp.array(sd.get_chr_pos_list())[
            d['causal_indices_list'][i]]
        highlight_loci = highlight_loci.tolist()
        highlight_loci.sort()

        # Plotting stuff
        res = gr.Result(scores=gtres['ps'], snps_data=sd)
        res.plot_manhattan(png_file='%s_%d_gtres_manhattan.png' % (plot_prefix, i),
                           percentile=50, highlight_loci=highlight_loci,
                           plot_bonferroni=True,
                           neg_log_transform=True)
        res.plot_qq('%s_%d_gtres_qq.png' % (plot_prefix, i))
        res = gr.Result(scores=gtgres['ps'], snps_data=sd)
        res.plot_manhattan(png_file='%s_%d_gtgres_manhattan.png' % (plot_prefix, i),
                           percentile=50, highlight_loci=highlight_loci,
                           plot_bonferroni=True,
                           neg_log_transform=True)
        res.plot_qq('%s_%d_gtgres_qq.png' % (plot_prefix, i))
        res = gr.Result(scores=gres['ps'], snps_data=sd)
        res.plot_manhattan(png_file='%s_%d_gres_manhattan.png' % (plot_prefix, i),
                           percentile=50, highlight_loci=highlight_loci,
                           plot_bonferroni=True,
                           neg_log_transform=True)
        res.plot_qq('%s_%d_gres_qq.png' % (plot_prefix, i))
Example #8
def leave_k_out_blup(num_repeats=20, num_cvs=5, genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/', k_thres=0.5):
    """

    """
    import h5py
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr
    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    rep_dict = {}
    for rep_i in range(num_repeats):
        res_dict = {}
        for phenotype in phenotypes:
            env_dict = {}
            for env in envs:
                print phenotype, env
                s1 = time.time()
                # Load data..
                d = hdf5_data.coordinate_cegs_genotype_phenotype(
                    phen_dict, phenotype, env, k_thres=k_thres)
                Y_means = d['Y_means']
                snps = d['snps']
                assert sp.all(sp.negative(sp.isnan(snps))), 'WTF?'
                K = kinship.calc_ibd_kinship(snps)
                print '\nKinship calculated'
                assert sp.all(sp.negative(sp.isnan(K))), 'WTF?'
                n = len(Y_means)
                # partition genotypes in k parts.
                gt_ids = d['gt_ids']
                num_ids = len(gt_ids)
                chunk_size = num_ids / num_cvs

                # Create k CV sets of prediction and validation data

                cv_chunk_size = int((n / num_cvs) + 1)
                ordering = sp.random.permutation(n)

                a = sp.arange(n)
                osb_ys = []
                pred_ys = []
                p_herits = []
                for cv_i, i in enumerate(range(0, n, cv_chunk_size)):
                    cv_str = 'cv_%d' % cv_i
                    # print 'Working on CV %d' % cv_i
                    end_i = min(n, i + cv_chunk_size)
                    validation_filter = sp.in1d(a, ordering[i:end_i])
                    training_filter = sp.negative(validation_filter)

                    train_snps = snps[:, training_filter]
                    val_snps = snps[:, validation_filter]

                    train_Y = Y_means[training_filter]
                    val_Y = Y_means[validation_filter]

                    #Calc. kinship
                    K_train = K[training_filter, :][:, training_filter]
                    K_cross = K[validation_filter, :][:, training_filter]
                    # Do gBLUP
                    lmm = lm.LinearMixedModel(train_Y)
                    lmm.add_random_effect(K_train)
                    r1 = lmm.get_REML()

                    # Now the BLUP.
                    y_mean = sp.mean(lmm.Y)
                    Y = lmm.Y - y_mean
                    p_herit = r1['pseudo_heritability']
                    p_herits.append(p_herit)
                    # delta = (1 - p_herit) / p_herit
                    # if K_inverse == None:
                    #     K_inverse = K.I
                    #     M = (sp.eye(K.shape[0]) + delta * K_inverse)
                    # u_blup = M.I * Y
                    M = sp.mat(p_herit * sp.mat(K_train) +
                               (1 - p_herit) * sp.eye(K_train.shape[0]))
                    u_mean_pred = sp.array(K_cross * (M.I * Y)).flatten()
                    osb_ys.extend(val_Y)
                    pred_ys.extend(u_mean_pred)
                corr = sp.corrcoef(osb_ys, pred_ys)[1, 0]
                print 'Correlation:', corr
                r2 = corr**2
                print 'R2:', r2
                mean_herit = sp.mean(p_herits)
                print 'Avg. heritability:', mean_herit
                env_dict[env] = {'R2': r2, 'obs_y': osb_ys,
                                 'pred_y': pred_ys, 'corr': corr, 'avg_herit': mean_herit}

            res_dict[phenotype] = env_dict
        rep_dict[rep_i] = res_dict
    res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_%d_BLUP_results_kthres_%0.1f.hdf5' % (
        num_cvs, k_thres)
    h5f = h5py.File(res_hdf5_file)
    for rep_i in range(num_repeats):
        res_dict = rep_dict[rep_i]
        rep_g = h5f.create_group('repl_%d' % rep_i)
        for phenotype in phenotypes:
            phen_g = rep_g.create_group(phenotype)
            for env in envs:
                d = res_dict[phenotype][env]
                env_g = phen_g.create_group(env)
                env_g.create_dataset('R2',  data=[d['R2']])
                env_g.create_dataset('corr',  data=[d['corr']])
                env_g.create_dataset('obs_y',  data=d['obs_y'])
                env_g.create_dataset('pred_y',  data=d['pred_y'])
                env_g.create_dataset('avg_herit',  data=[d['avg_herit']])
    h5f.close()
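The cross-validation split in both BLUP examples is just a random permutation cut into contiguous chunks; isolated, the partition logic looks like this:

import numpy as np

def cv_masks_sketch(n, num_cvs):
    # Yield one boolean validation mask per fold; together the masks
    # cover all n samples exactly once, as in the loop above.
    ordering = np.random.permutation(n)
    chunk = n // num_cvs + 1
    a = np.arange(n)
    for i in range(0, n, chunk):
        yield np.in1d(a, ordering[i:i + chunk])

# Toy usage:
masks = list(cv_masks_sketch(103, num_cvs=5))
print(sum(m.sum() for m in masks))  # 103: every sample in exactly one fold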
Example #9
def perform_cegs_gwas(kinship_type='ibd', phen_type='medians'):
    """
    Perform a simple MLM GWAS for the 8 traits
    """
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr
    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    for phenotype in phenotypes:
        for env in envs:
            print phenotype, env
            s1 = time.time()
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env)
            print 'Calculating kinship'
            if kinship_type == 'ibs':
                K = kinship.calc_ibs_kinship(d['snps'])
            elif kinship_type == 'ibd':
                K = kinship.calc_ibd_kinship(d['snps'])
            else:
                raise NotImplementedError

            if phen_type == 'means':
                lmm = lm.LinearMixedModel(d['Y_means'])
            elif phen_type == 'medians':
                lmm = lm.LinearMixedModel(d['Y_medians'])
            else:
                raise NotImplementedError
            lmm.add_random_effect(K)

            print "Running EMMAX"
            res = lmm.emmax_f_test(d['snps'], emma_num=1000)
            print 'Mean p-value:', sp.mean(res['ps'])

            secs = time.time() - s1
            if secs > 60:
                mins = int(secs) / 60
                secs = secs - mins * 60
                print 'Took %d mins and %f seconds.' % (mins, secs)
            else:
                print 'Took %f seconds.' % (secs)

            # Now generating QQ-plots
            label_str = '%s_%s_%s_%s' % (
                kinship_type, phenotype, env, phen_type)
            agr.plot_simple_qqplots_pvals('/Users/bjarnivilhjalmsson/data/tmp/cegs_qq_%s' % (label_str),
                                          [res['ps']], result_labels=[
                                              label_str], line_colors=['green'],
                                          num_dots=1000, title=None, max_neg_log_val=6)

            # Perform multiple loci mixed model GWAS
            chromosomes = d['positions'][:, 0]
            positions = sp.array(d['positions'][:, 1], 'int32')
            x_positions = []
            y_log_pvals = []
            colors = []
            x_shift = 0
            for i, chrom in enumerate(sp.unique(chromosomes)):
                if chrom in ['2L', '2LHet', '3L', '3LHet', '4', 'X', 'XHet']:
                    colors.append('c')
                else:  # chrom in ['2R', '2RHet', '3R', '3RHet', 'U', 'Uextra']
                    # Toss U and Hets
                    colors.append('m')
                chrom_filter = sp.in1d(chromosomes, chrom)
                positions_slice = positions[chrom_filter]
                x_positions.append(positions_slice + x_shift)
                x_shift += positions_slice.max()
                log_ps_slice = -sp.log10(res['ps'][chrom_filter])
                y_log_pvals.append(log_ps_slice)

            m = len(positions)
            log_bonf = -sp.log10(1 / (20.0 * m))
            print m, log_bonf

            # Plot manhattan plots?
            plt.figure(figsize=(12, 4))
            plt.axes([0.03, 0.1, 0.95, 0.8])
            for i, chrom in enumerate(sp.unique(chromosomes)):
                plt.plot(x_positions[i], y_log_pvals[i],
                         c=colors[i], ls='', marker='.')
            xmin, xmax = plt.xlim()
            plt.hlines(log_bonf, xmin, xmax, colors='k',
                       linestyles='--', alpha=0.5)
            plt.title('%s, %s' % (phenotype, env))
            plt.savefig('/Users/bjarnivilhjalmsson/data/tmp/cegs_gwas_%s_%s_%s_%s.png' %
                        (kinship_type, phenotype, env, phen_type))
Example #10
def _test_():
    # Imports assumed from the surrounding codebase (cf. the explicit
    # imports in Examples #4-#9).
    import os
    import scipy as sp
    from matplotlib import pyplot as plt
    import genotypes
    import phenotypes
    import kinship
    import linear_models as lm
    import analyze_gwas_results as agr
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print snps
    snps = snps.T
    snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
    snps = snps.T
    print snps, snps.shape
    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits_w_snps_to_hdf5(
        snps, hdf5_file_prefix=file_prefix, num_traits=30, p=0.1)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list['phenotypes']):

        K = kinship.calc_ibd_kinship(snps)
        K = kinship.scale_k(K)
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        plt.clf()

        agr.plot_simple_qqplots_pvals('%s_%d' % (file_prefix, i), [
            ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]
        ],
                                      result_labels=[
                                          'Common SNPs', 'Singletons',
                                          'Doubletons'
                                      ],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200,
                                      max_neg_log_val=3)

        # Cholesky permutations..
        res = lm.emmax_perm_test(singleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        singletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(doubleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        doubletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(common_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        common_thres.append(res['threshold_05'][0])

        #ATT permutations (Implement)

        #PC permutations (Implement)

    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
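The '# Cholesky permutations..' comment above presumably refers to permuting the phenotype after decorrelating it under the fitted covariance, so that permuted traits keep the right dependence structure. A hedged sketch of that idea (not necessarily what lm.emmax_perm_test implements):

import numpy as np

def cholesky_permute_sketch(y, K, h2):
    # Decorrelate y under V = h2*K + (1-h2)*I, permute the (approximately
    # exchangeable) transformed values, then map back to the original scale.
    V = h2 * K + (1.0 - h2) * np.eye(len(y))
    L = np.linalg.cholesky(V)
    z = np.linalg.solve(L, y - np.mean(y))
    return L.dot(np.random.permutation(z)) + np.mean(y)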
Example #11
def coordinate_cegs_genotype_phenotype(phen_dict, phenotype='Protein', env='mated', k_thres=0.8,
                                       ind_missing_thres=0.5, snp_missing_thres=0.05, maf_thres=0.1,
                                       genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/CEGS.216.lines.NO_DPGP4.GATK.SNP.HETS.FILTERED.Filter_imputed.hdf5'):
    """
    Parse genotypes, coordinate them with the phenotype, and prepare the data for analysis.
    """
    import h5py
    import scipy as sp
    gh5f = h5py.File(genotype_file)
    p_dict = phen_dict[phenotype][env]
    print 'Loading SNPs'
    snps = sp.array(gh5f['gt'][...],dtype='single')
    snps = snps[:,p_dict['ind_filter']]
    positions = gh5f['pos'][...]
    m,n = snps.shape
    print 'Loaded %d SNPs for %d individuals'%(m,n)
    print 'Filtering individuals with missing rates >%0.2f'%ind_missing_thres
    missing_mat = sp.isnan(snps)
    ind_missing_rates = sp.sum(missing_mat,0)/float(m)
    ind_filter = ind_missing_rates<ind_missing_thres
    snps = snps[:,ind_filter]
    n = sp.sum(ind_filter)  
    print 'Filtered %d individuals due to high missing rates'%sp.sum(sp.negative(ind_filter))
    gt_ids = gh5f['gt_ids'][p_dict['ind_filter']]
    gt_ids = gt_ids[ind_filter]
    Y_means = p_dict['Y_means'][p_dict['ind_filter']]
    Y_means = Y_means[ind_filter]
    Y_medians = p_dict['Y_medians'][p_dict['ind_filter']]
    Y_medians = Y_medians[ind_filter]
    rep_count =  p_dict['rep_count'][p_dict['ind_filter']]
    rep_count = rep_count[ind_filter]
    
    print 'Now removing "bad" genotypes.'
    bad_genotypes = ['Raleigh_272', 'Raleigh_378', 'Raleigh_554', 'Raleigh_591', 'Raleigh_398', 'Raleigh_138', 'Raleigh_208', 
                     'Raleigh_336', 'Raleigh_370', 'Raleigh_373', 'Raleigh_374', 'Raleigh_799', 'Raleigh_821', 'Raleigh_822',
                     'Raleigh_884', 'Raleigh_335']
    ind_filter = sp.negative(sp.in1d(gt_ids,bad_genotypes))
    gt_ids = gt_ids[ind_filter]
    Y_means= Y_means[ind_filter]
    Y_medians= Y_medians[ind_filter]
    rep_count= rep_count[ind_filter]    
    snps = snps[:,ind_filter]
    print 'Removed %d "bad" genotypes'%sp.sum(sp.negative(ind_filter))
    
    n = len(snps[0])
    print 'Filtering SNPs with missing rate >%0.2f'%snp_missing_thres
    missing_mat = sp.isnan(snps)
    snp_missing_rates = sp.sum(missing_mat,1)/float(n)
    snps_filter = snp_missing_rates<snp_missing_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    m = sp.sum(snps_filter)
    print 'Filtered %d SNPs due to high missing rate'%sp.sum(sp.negative(snps_filter))
    
    print 'Now imputing (w mean)'
    missing_mat = sp.isnan(snps)
    ok_counts = n-sp.sum(missing_mat,1)
    snps[missing_mat]=0
    snp_means = sp.sum(snps,1)/ok_counts
    for i in range(len(snps)):
        snps[i,missing_mat[i]]=snp_means[i]

    print 'And filtering SNPs with MAF<%0.2f'%maf_thres
    snp_means = sp.mean(snps,1)
    snp_mafs = sp.minimum(snp_means,1-snp_means)
    snps_filter = snp_mafs>maf_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    print 'Filtered %d SNPs with low MAFs'%sp.sum(sp.negative(snps_filter))
    

    print 'Filtering based on kinship w threshold:',k_thres
    import kinship
    K = kinship.calc_ibd_kinship(snps)
    print '\nKinship calculated'
    K_ind_filter = []
    for i in range(n):
        K_ind_filter.append(not sp.any(K[i,i+1:n]>k_thres))
    if sum(K_ind_filter)==n:
        print 'No individuals were filtered based on kinship..'
    else:
        print 'Filtering %d individuals based on kinship.'%(n-sum(K_ind_filter))
        K_ind_filter = sp.array(K_ind_filter)
        gt_ids = gt_ids[K_ind_filter]
        Y_means= Y_means[K_ind_filter]
        Y_medians= Y_medians[K_ind_filter]
        rep_count= rep_count[K_ind_filter]    
        snps = snps[:,K_ind_filter]
        
        print 'Again filtering SNPs with MAF<%0.2f'%maf_thres
        snp_means = sp.mean(snps,1)
        snp_mafs = sp.minimum(snp_means,1-snp_means)
        snps_filter = snp_mafs>maf_thres
        snps = snps[snps_filter]
        positions = positions[snps_filter]
        print 'Filtered %d additional SNPs with low MAFs'%sp.sum(sp.negative(snps_filter))


    print 'All filtering done.'
    
    m,n = snps.shape
    print 'In all there are %d SNPs remaining, for %d individuals.'%(m,n)
    
    ret_dict = {'Y_means':Y_means, 'Y_medians':Y_medians, 'rep_count':rep_count, 'gt_ids':gt_ids, 
                'positions':positions, 'snps':snps}

    return ret_dict
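Two details in the imputation block above are worth isolating: the per-SNP mean must be computed over the observed calls only, and the MAF filter assumes genotypes coded in [0, 1]. A compact numpy restatement of both steps:

import numpy as np

def impute_and_maf_filter_sketch(snps, maf_thres=0.1):
    # snps: m x n with np.nan for missing calls, genotypes coded in [0, 1].
    missing = np.isnan(snps)
    ok_counts = snps.shape[1] - missing.sum(1)
    filled = np.where(missing, 0.0, snps)
    snp_means = filled.sum(1) / ok_counts            # mean over observed calls
    filled = np.where(missing, snp_means[:, None], filled)
    mafs = np.minimum(snp_means, 1.0 - snp_means)
    keep = mafs > maf_thres
    return filled[keep], keep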
Example #12
 def get_ibd_kinship_matrix(self, debug_filter=1, dtype='single', chunk_size=None):
     log.debug('Starting IBD calculation')
     cov_mat = kinship.calc_ibd_kinship(self, chunk_size=chunk_size)
     log.debug('Finished calculating IBD kinship matrix')
     return cov_mat
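The chunk_size argument above plausibly exists so the SNP matrix never has to be held and multiplied in one piece; a sketch of chunked accumulation (an assumption about the parameter's purpose, not the library's code):

import numpy as np

def chunked_ibd_kinship_sketch(snps, chunk_size=1000):
    # Accumulate X_chunk^T X_chunk over SNP chunks to bound peak memory.
    m, n = snps.shape
    K = np.zeros((n, n))
    for i in range(0, m, chunk_size):
        X = snps[i:i + chunk_size].astype(float)
        X = (X - X.mean(1, keepdims=True)) / X.std(1, keepdims=True)
        K += X.T.dot(X)
    return K / m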
Example #13
 def get_ibd_kinship_matrix(self, debug_filter=1, dtype='single', chunk_size=None):
     log.debug('Starting IBD calculation')
     cov_mat = kinship.calc_ibd_kinship(self, chunk_size=chunk_size)
     log.debug('Finished calculating IBD kinship matrix')
     return cov_mat
Example #14
def coordinate_cegs_genotype_phenotype(
    phen_dict,
    phenotype='Protein',
    env='mated',
    k_thres=0.8,
    ind_missing_thres=0.5,
    snp_missing_thres=0.05,
    maf_thres=0.1,
    genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/CEGS.216.lines.NO_DPGP4.GATK.SNP.HETS.FILTERED.Filter_imputed.hdf5'
):
    """
    Parse genotypes, coordinate them with the phenotype, and prepare the data for analysis.
    """
    import h5py
    import scipy as sp
    gh5f = h5py.File(genotype_file)
    p_dict = phen_dict[phenotype][env]
    print 'Loading SNPs'
    snps = sp.array(gh5f['gt'][...], dtype='single')
    snps = snps[:, p_dict['ind_filter']]
    positions = gh5f['pos'][...]
    m, n = snps.shape
    print 'Loaded %d SNPs for %d individuals' % (m, n)
    print 'Filtering individuals with missing rates >%0.2f' % ind_missing_thres
    missing_mat = sp.isnan(snps)
    ind_missing_rates = sp.sum(missing_mat, 0) / float(m)
    ind_filter = ind_missing_rates < ind_missing_thres
    snps = snps[:, ind_filter]
    n = sp.sum(ind_filter)
    print 'Filtered %d individuals due to high missing rates' % sp.sum(
        sp.negative(ind_filter))
    gt_ids = gh5f['gt_ids'][p_dict['ind_filter']]
    gt_ids = gt_ids[ind_filter]
    Y_means = p_dict['Y_means'][p_dict['ind_filter']]
    Y_means = Y_means[ind_filter]
    Y_medians = p_dict['Y_medians'][p_dict['ind_filter']]
    Y_medians = Y_medians[ind_filter]
    rep_count = p_dict['rep_count'][p_dict['ind_filter']]
    rep_count = rep_count[ind_filter]

    print 'Now removing "bad" genotypes.'
    bad_genotypes = [
        'Raleigh_272', 'Raleigh_378', 'Raleigh_554', 'Raleigh_591',
        'Raleigh_398', 'Raleigh_138', 'Raleigh_208', 'Raleigh_336',
        'Raleigh_370', 'Raleigh_373', 'Raleigh_374', 'Raleigh_799',
        'Raleigh_821', 'Raleigh_822', 'Raleigh_884', 'Raleigh_335'
    ]
    ind_filter = sp.negative(sp.in1d(gt_ids, bad_genotypes))
    gt_ids = gt_ids[ind_filter]
    Y_means = Y_means[ind_filter]
    Y_medians = Y_medians[ind_filter]
    rep_count = rep_count[ind_filter]
    snps = snps[:, ind_filter]
    print 'Removed %d "bad" genotypes' % sp.sum(sp.negative(ind_filter))

    n = len(snps[0])
    print 'Filtering SNPs with missing rate >%0.2f' % snp_missing_thres
    missing_mat = sp.isnan(snps)
    snp_missing_rates = sp.sum(missing_mat, 1) / float(n)
    snps_filter = snp_missing_rates < snp_missing_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    m = sp.sum(snps_filter)
    print 'Filtered %d SNPs due to high missing rate' % sp.sum(
        sp.negative(snps_filter))

    print 'Now imputing (w mean)'
    missing_mat = sp.isnan(snps)
    ok_counts = n - sp.sum(missing_mat, 1)
    snps[missing_mat] = 0
    snp_means = sp.sum(snps, 1) / ok_counts
    for i in range(len(snps)):
        snps[i, missing_mat[i]] = snp_means[i]

    print 'And filtering SNPs with MAF<%0.2f' % maf_thres
    snp_means = sp.mean(snps, 1)
    snp_mafs = sp.minimum(snp_means, 1 - snp_means)
    snps_filter = snp_mafs > maf_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    print 'Filtered %d SNPs with low MAFs' % sp.sum(sp.negative(snps_filter))

    print 'Filtering based on kinship w threshold:', k_thres
    import kinship
    K = kinship.calc_ibd_kinship(snps)
    print '\nKinship calculated'
    K_ind_filter = []
    for i in range(n):
        K_ind_filter.append(not sp.any(K[i, i + 1:n] > k_thres))
    if sum(K_ind_filter) == n:
        print 'No individuals were filtered based on kinship..'
    else:
        print 'Filtering %d individuals based on kinship.' % (
            n - sum(K_ind_filter))
        K_ind_filter = sp.array(K_ind_filter)
        gt_ids = gt_ids[K_ind_filter]
        Y_means = Y_means[K_ind_filter]
        Y_medians = Y_medians[K_ind_filter]
        rep_count = rep_count[K_ind_filter]
        snps = snps[:, K_ind_filter]

        print 'Again filtering SNPs with MAF<%0.2f' % maf_thres
        snp_means = sp.mean(snps, 1)
        snp_mafs = sp.minimum(snp_means, 1 - snp_means)
        snps_filter = snp_mafs > maf_thres
        snps = snps[snps_filter]
        positions = positions[snps_filter]
        print 'Filtered %d additional SNPs with low MAFs' % sp.sum(
            sp.negative(snps_filter))

    print 'All filtering done.'

    m, n = snps.shape
    print 'In all there are %d SNPs remaining, for %d individuals.' % (m, n)

    ret_dict = {
        'Y_means': Y_means,
        'Y_medians': Y_medians,
        'rep_count': rep_count,
        'gt_ids': gt_ids,
        'positions': positions,
        'snps': snps
    }

    return ret_dict
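Finally, the K_ind_filter loop above keeps an individual only if none of its kinship values to later individuals exceed k_thres; the same filter in vectorized form:

import numpy as np

def kinship_filter_sketch(K, k_thres=0.8):
    # Strict upper triangle only; the zeros np.triu leaves elsewhere
    # never exceed a positive k_thres, so they cannot trigger the filter.
    upper = np.triu(K, k=1)
    return ~np.any(upper > k_thres, axis=1)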