Example #1
0
def _test_():
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000) 
    
    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print snps
    snps = snps.T
    snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
    snps = snps.T
    print snps, snps.shape
    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits_w_snps_to_hdf5(snps, hdf5_file_prefix=file_prefix,
                                           num_traits=30, p=0.1)
    
    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list['phenotypes']):
        
        K = kinship.calc_ibd_kinship(snps)
        K = kinship.scale_k(K)
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        plt.clf()
        
        
        agr.plot_simple_qqplots_pvals('%s_%d' % (file_prefix, i),
                                      [ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]],
                                      result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200, max_neg_log_val=3)
        
        # Cholesky permutations..
        res = lm.emmax_perm_test(singleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        singletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(doubleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        doubletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(common_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        common_thres.append(res['threshold_05'][0])
        
        #ATT permutations (Implement)
        
        #PC permutations (Implement)
        

    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
Example #2
0
def _test_scz_():
    # Load Schizophrenia data

    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    test_snps = sp.vstack([singleton_snps, doubleton_snps])
    print snps
    phen_list = phenotypes.simulate_traits(
        snps, hdf5_file_prefix='/home/bv25/tmp/test', num_traits=30, p=1.0)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list):

        K = kinship.calc_ibd_kinship(snps)
        K = kinship.scale_k(K)
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('/home/bv25/tmp/test_%d_phen.png' % i)
        plt.clf()
        agr.plot_simple_qqplots_pvals('/home/bv25/tmp/test_%d' % i, [
            ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]
        ],
                                      result_labels=[
                                          'Common SNPs', 'Singletons',
                                          'Doubletons'
                                      ],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200,
                                      max_neg_log_val=3)

        # Now permutations..
        res = lm.emmax_perm_test(singleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        singletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(doubleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        doubletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(common_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        common_thres.append(res['threshold_05'][0])
    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
Example #3
0
def _test_scz_():
    # Load Schizophrenia data
    
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000) 
    
    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    test_snps = sp.vstack([singleton_snps, doubleton_snps])
    print snps
    phen_list = phenotypes.simulate_traits(snps, hdf5_file_prefix='/home/bv25/tmp/test', num_traits=30, p=1.0)
    
    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list):
        
        K = kinship.calc_ibd_kinship(snps)
        K = kinship.scale_k(K)
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('/home/bv25/tmp/test_%d_phen.png' % i)
        plt.clf()
        agr.plot_simple_qqplots_pvals('/home/bv25/tmp/test_%d' % i,
                                      [ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]],
                                      result_labels=['Common SNPs', 'Singletons', 'Doubletons'],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200, max_neg_log_val=3)
        
        # Now permutations..
        res = lm.emmax_perm_test(singleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        singletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(doubleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        doubletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(common_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        common_thres.append(res['threshold_05'][0])
    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
Example #4
0
def mixed_model_gwas(phenotype_id=5,
                     pvalue_file='mm_results.pvals',
                     manhattan_plot_file='mm_manhattan.png',
                     qq_plot_file_prefix='mm_qq'):
    """
    Perform mixed model (EMMAX) GWAS for flowering time (phenotype_id=5 in the phenotype file) 
    in plants grown under 10C conditions. 
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr
    # Load genotypes
    sd = load_a_thaliana_genotypes()

    # Load phenotypes
    phend = load_a_thaliana_phenotypes()

    # Coordinate phenotype of interest and genotypes.  This filters the genotypes and
    # phenotypes, leaving only accessions (individuals) which overlap between both,
    # and SNPs that are polymorphic in the resulting subset.
    sd.coordinate_w_phenotype_data(phend, phenotype_id)

    # Calculate kinship (IBS)
    K = kinship.calc_ibs_kinship(sd.get_snps())

    # Perform mixed model GWAS
    mm_results = lm.emmax(sd.get_snps(), phend.get_values(phenotype_id), K)

    # Construct a results object
    res = gr.Result(scores=mm_results['ps'], snps_data=sd)

    # Save p-values to file
    res.write_to_file(pvalue_file)

    # Plot Manhattan plot
    res.plot_manhattan(png_file=manhattan_plot_file,
                       percentile=90,
                       plot_bonferroni=True,
                       neg_log_transform=True)
    # Plot a QQ-plot
    res.plot_qq(qq_plot_file_prefix)
Example #5
0
def lotus_mixed_model_gwas(phenotype_id=4, phen_file = '/home/bjarni/LotusGenome/cks/Lotus31012019/20181113_136LjAccessionData.csv', 
                           gt_file = '/home/bjarni/LotusGenome/cks/Lotus31012019/all_chromosomes_binary.csv', 
                           pvalue_file='mm_results.pvals', manhattan_plot_file='mm_manhattan.png', qq_plot_file_prefix='mm_qq'):
    """
    Perform mixed model (EMMAX) GWAS for Lotus data
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr
    import dataParsers as dp
    # Load genotypes
    sd = dp.parse_snp_data(gt_file)

    # Load phenotypes
    import phenotypeData as pd
    phend = pd.parse_phenotype_file(phen_file, with_db_ids=False)
    
    # Coordinate phenotype of interest and genotypes.  This filters the genotypes and 
    # phenotypes, leaving only accessions (individuals) which overlap between both, 
    # and SNPs that are polymorphic in the resulting subset.
    sd.coordinate_w_phenotype_data(phend, phenotype_id)

    # Calculate kinship (IBS)
    K = kinship.calc_ibs_kinship(sd.get_snps())

    # Perform mixed model GWAS
    mm_results = lm.emmax(sd.get_snps(), phend.get_values(phenotype_id), K)

    # Construct a results object
    res = gr.Result(scores=mm_results['ps'], snps_data=sd)

    # Save p-values to file
    res.write_to_file(pvalue_file)

    # Plot Manhattan plot
    res.plot_manhattan(png_file=manhattan_plot_file, percentile=90, plot_bonferroni=True,
                       neg_log_transform=True)
    # Plot a QQ-plot
    res.plot_qq(qq_plot_file_prefix)
Example #6
0
def mixed_model_gwas(phenotype_id=5, pvalue_file='mm_results.pvals',
                     manhattan_plot_file='mm_manhattan.png',
                     qq_plot_file_prefix='mm_qq'):
    """
    Perform mixed model (EMMAX) GWAS for flowering time (phenotype_id=5 in the phenotype file) 
    in plants grown under 10C conditions. 
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr
    # Load genotypes
    sd = load_a_thaliana_genotypes()

    # Load phenotypes
    phend = load_a_thaliana_phenotypes()

    # Coordinate phenotype of interest and genotypes.  This filters the genotypes and
    # phenotypes, leaving only accessions (individuals) which overlap between both,
    # and SNPs that are polymorphic in the resulting subset.
    sd.coordinate_w_phenotype_data(phend, phenotype_id)

    # Calculate kinship (IBS)
    K = kinship.calc_ibs_kinship(sd.get_snps())

    # Perform mixed model GWAS
    mm_results = lm.emmax(sd.get_snps(), phend.get_values(phenotype_id), K)

    # Construct a results object
    res = gr.Result(scores=mm_results['ps'], snps_data=sd)

    # Save p-values to file
    res.write_to_file(pvalue_file)

    # Plot Manhattan plot
    res.plot_manhattan(png_file=manhattan_plot_file, percentile=90, plot_bonferroni=True,
                       neg_log_transform=True)
    # Plot a QQ-plot
    res.plot_qq(qq_plot_file_prefix)
Example #7
0
def map_phenotype(p_i, phed, mapping_method, trans_method, p_dict):
    import copy
    phed = copy.deepcopy(phed)
    phenotype_name = phed.get_name(p_i)
    phen_is_binary = phed.is_binary(p_i)
    if trans_method == 'most_normal':
        trans_method, shapiro_pval = phed.most_normal_transformation(p_i, perform_trans=False)
    file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phed.get_name(p_i),
                mapping_method, trans_method, p_dict['remove_outliers'], p_dict['with_replicates'],
                p_dict['call_method_id'])
    result_name = "%s_%s_%s" % (phenotype_name, mapping_method, trans_method)
    emmax_perm_threshold = None
    k = None

    res = None
    #Check whether result already exists.
    if p_dict['use_existing_results']:
        if p_dict['region_plots']:
            sd = _get_genotype_data_(p_dict)
            num_outliers = prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'], p_dict['with_replicates'])
            if p_dict['remove_outliers']:
                assert num_outliers != 0, "No outliers were removed, so it makes no sense to go on and perform GWA."

            snps = sd.getSnps()
        else:
            snps = None

        print "\nChecking for existing results."
        result_file = file_prefix + ".pvals"
        if os.path.isfile(result_file):
            res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
            pvals = True
        else:
            result_file = file_prefix + ".scores"
            if os.path.isfile(result_file):
                res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
                pvals = False
        if res:
            print "Found existing results.. (%s)" % (result_file)

        sys.stdout.flush()


    #Loading candidate genes
    cand_genes = None
    if p_dict['cand_genes_file']:
        cand_genes, tair_ids = gwaResults.load_cand_genes_file(p_dict['cand_genes_file'])
    else:
        cand_genes = None
        tair_ids = None

    if not res: #If results weren't found in a file... then do GWA.
        #Loading data
        sd = _get_genotype_data_(p_dict)
        num_outliers, n_filtered_snps = prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'],
                                                     p_dict['with_replicates'])

        #Do we need to calculate the K-matrix?
        if mapping_method in ['emma', 'emmax', 'emmax_anova', 'emmax_step', 'loc_glob_mm']:
            #Load genotype file (in binary format)
            sys.stdout.write("Retrieving the Kinship matrix K.\n")
            sys.stdout.flush()
            if p_dict['kinship_file']:   #Kinship file was supplied..
                print 'Loading supplied kinship file: %s' % p_dict['kinship_file']
                k = kinship.load_kinship_from_file(p_dict['kinship_file'], sd.accessions)
            else:
                print 'Loading kinship file.'
                if p_dict['data_file'] != None:
                    if p_dict['kinship_type'] == 'ibs':
                        k = sd.get_ibs_kinship_matrix()
                    elif p_dict['kinship_type'] == 'ibd':
                        k = sd.get_ibd_kinship_matrix()
                else:
                    k = kinship.get_kinship(call_method_id=p_dict['call_method_id'], data_format=p_dict['data_format'],
                                            method=p_dict['kinship_type'], n_removed_snps=n_filtered_snps,
                                            remain_accessions=sd.accessions)
            sys.stdout.flush()
            sys.stdout.write("Done!\n")

        if p_dict['remove_outliers']:
            if num_outliers == 0: print "No outliers were removed!"

        phen_vals = phed.get_values(p_i)

        if p_dict['local_gwas']: #Filter SNPs, etc..
            sd = snpsdata.SNPsDataSet([sd.get_region_snpsd(*p_dict['local_gwas'])],
                        [p_dict['local_gwas'][0]], data_format=sd.data_format)
        snps = sd.getSnps()


        sys.stdout.write("Finished loading and handling data!\n")

        print "Plotting a histogram"
        p_her = None
        hist_file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phenotype_name, trans_method,
                        p_dict['remove_outliers'], p_dict['with_replicates'],
                        p_dict['call_method_id'])
        hist_png_file = hist_file_prefix + "_hist.png"
        if k is not None:
            p_her = phed.get_pseudo_heritability(p_i, k)['pseudo_heritability']
            p_her_pval = phed.get_pseudo_heritability(p_i, k)['pval']
            phed.plot_histogram(p_i, png_file=hist_png_file, p_her=p_her, p_her_pval=p_her_pval)
        else:
            phed.plot_histogram(p_i, png_file=hist_png_file)


        print "Applying %s to data." % (mapping_method)
        sys.stdout.flush()
        kwargs = {}
        additional_columns = []
        if "kw" == mapping_method:

            if phen_is_binary:
                warnings.warn("Warning, applying KW to a binary phenotype")

            kw_res = util.kruskal_wallis(snps, phen_vals)
            pvals = kw_res['ps']
            kwargs['statistics'] = kw_res['ds']
            additional_columns.append('statistics')


        elif "ft" == mapping_method:
            raise NotImplementedError
#            pvals, or_est = run_fet(snps, phen_vals)
#            kwargs['odds_ratio_est'] = or_est
#            additional_columns.append('odds_ratio_est')

        else:  #Parametric tests below:        

            if mapping_method in ['emma', 'emmax', 'emmax_perm', 'emmax_step', 'emmax_anova', 'loc_glob_mm']:
                r = lm.mm_lrt_test(phen_vals, k)
                if r['pval'] > 0.05:
                    print "Performing EMMA, even though a mixed model does not fit the data significantly better"
                    print 'p-value: %0.3f' % r['pval']
                else:
                    print 'The mixed model fits the data significantly better than the simple linear model.'
                    print 'p-value: %f' % r['pval']

            if mapping_method in ['loc_glob_mm']:
                res_dict = lm.local_vs_global_mm_scan(phen_vals, sd, file_prefix=file_prefix,
                            global_k=k, window_size=p_dict['loc_glob_ws'],
                            jump_size=p_dict['loc_glob_ws'] / 2,
                            kinship_method=p_dict['kinship_type'])
                res_file_name = file_prefix + '.csv'
                _write_res_dict_to_file_(res_file_name, res_dict)
                return
            elif mapping_method in ['emma']:
                res = lm.emma(snps, phen_vals, k)
            elif mapping_method in ['emmax']:
                if p_dict['emmax_perm']:
                    perm_sd = _get_genotype_data_(p_dict)
                    num_outliers = prepare_data(perm_sd, phed, p_i, 'none', 0, p_dict['with_replicates'])
                    perm_sd.filter_mac_snps(p_dict['mac_threshold'])
                    t_snps = perm_sd.getSnps()
                    t_phen_vals = phed.get_values(p_i)
                    res = lm.emmax_perm_test(t_snps, t_phen_vals, k, p_dict['emmax_perm'])
                    emmax_perm_threshold = res['threshold_05'][0]
                    import pylab
                    hist_res = pylab.hist(-sp.log10(res['min_ps']), alpha=0.6)
                    threshold = -sp.log10(emmax_perm_threshold)
                    b_threshold = -sp.log10(1.0 / (len(t_snps) * 20.0))
                    pylab.vlines(threshold, 0, max(hist_res[0]), color='g')
                    pylab.vlines(b_threshold, 0, max(hist_res[0]), color='r')
                    pylab.savefig(file_prefix + 'perm_%d_min_pval_hist.png' % (p_dict['emmax_perm']),
                        format='png')
                if p_dict['with_replicates']:
                    #Get values, with ecotypes, construct Z and do GWAM
                    phen_vals = phed.get_values(p_i)
                    Z = phed.get_incidence_matrix(p_i)
                    res = lm.emmax(snps, phen_vals, k, Z=Z, with_betas=p_dict['with_betas'],
                            emma_num=p_dict['emmax_emma_num'])
                else:
                    res = lm.emmax(snps, phen_vals, k, with_betas=p_dict['with_betas'],
                            emma_num=p_dict['emmax_emma_num'])

            elif mapping_method in ['emmax_step']:
                sd.filter_mac_snps(p_dict['mac_threshold'])
                local = False
                if p_dict['local_gwas']:
                    local = True
                    file_prefix += '_' + '_'.join(map(str, p_dict['local_gwas']))
                res = lm.emmax_step_wise(phen_vals, k, sd=sd, num_steps=p_dict['num_steps'],
                            file_prefix=file_prefix, local=local, cand_gene_list=cand_genes,
                            save_pvals=p_dict['save_stepw_pvals'],
                            emma_num=p_dict['emmax_emma_num'])
                print 'Step-wise EMMAX finished!'
                return
            elif mapping_method in ['lm_step']:
                sd.filter_mac_snps(p_dict['mac_threshold'])
                local = False
                if p_dict['local_gwas']:
                    local = True
                    file_prefix += '_' + '_'.join(map(str, p_dict['local_gwas']))
                res = lm.lm_step_wise(phen_vals, sd=sd, num_steps=p_dict['num_steps'],
                            file_prefix=file_prefix, local=local, cand_gene_list=cand_genes,
                            save_pvals=p_dict['save_stepw_pvals'])
                print 'Step-wise LM finished!'
                return
            elif mapping_method in ['lm']:
                res = lm.linear_model(snps, phen_vals)
            elif mapping_method in ['emmax_anova']:
                res = lm.emmax_anova(snps, phen_vals, k)
            elif mapping_method in ['lm_anova']:
                res = lm.anova(snps, phen_vals)
            else:
                print "Mapping method", mapping_method, 'was not found.'
                return

            if mapping_method in ['lm', 'emma', 'emmax']:
                kwargs['genotype_var_perc'] = res['var_perc']
                additional_columns.append('genotype_var_perc')
                if p_dict['with_betas'] or mapping_method in ['emma' ]:
                    betas = map(list, zip(*res['betas']))
                    kwargs['beta0'] = betas[0]
                    additional_columns.append('beta0')
                    if len(betas) > 1:
                        kwargs['beta1'] = betas[1]
                        additional_columns.append('beta1')
                pvals = res['ps']
                sys.stdout.write("Done!\n")
                sys.stdout.flush()

            if mapping_method in ['lm_anova', 'emmax_anova']:
                kwargs['genotype_var_perc'] = res['var_perc']
                pvals = res['ps']
                sys.stdout.write("Done!\n")
                sys.stdout.flush()


#        print 'Calculating SNP-phenotype correlations.'
#        kwargs['correlations'] = calc_correlations(snps, phen_vals)
#        additional_columns.append('correlations')
        print 'Writing result to file.'
        res = gwaResults.Result(scores=pvals.tolist(), snps_data=sd, name=result_name, **kwargs)
        if mapping_method in ["kw", "ft", "emma", 'lm', "emmax", 'emmax_anova', 'lm_anova']:
            result_file = file_prefix + ".pvals"
        else:
            result_file = file_prefix + ".scores"
        res.write_to_file(result_file, additional_columns, max_fraction=p_dict['pvalue_filter'])

    #add results to DB..

    if p_dict['add_to_db']:
        print 'Adding results to DB.'
        if p_dict['with_db_ids']:
            db_pid = p_i
        else:
            db_pid = phed.get_db_pid(p_i)

        import results_2_db as rdb

        short_name = 'cm%d_pid%d_%s_%s_%s_%d_%s' % (p_dict['call_method_id'], db_pid, phenotype_name,
                            mapping_method, trans_method, p_dict['remove_outliers'],
                            str(p_dict['with_replicates']))
        tm_id = transformation_method_dict[trans_method]
        try:
            rdb.add_results_to_db(result_file, short_name, p_dict['call_method_id'], db_pid,
                        analysis_methods_dict[mapping_method],
                        tm_id, remove_outliers=p_dict['remove_outliers'])
        except Exception, err_str:
            print 'Failed inserting results into DB!'
            print err_str
Example #8
0
def _test_():
    singleton_snps = genotypes.simulate_k_tons(n=500, m=1000)
    doubleton_snps = genotypes.simulate_k_tons(k=2, n=500, m=1000)
    common_snps = genotypes.simulate_common_genotypes(500, 1000)

    snps = sp.vstack([common_snps, singleton_snps, doubleton_snps])
    print snps
    snps = snps.T
    snps = (snps - sp.mean(snps, 0)) / sp.std(snps, 0)
    snps = snps.T
    print snps, snps.shape
    file_prefix = os.environ['HOME'] + '/tmp/test'
    phen_list = phenotypes.simulate_traits_w_snps_to_hdf5(
        snps, hdf5_file_prefix=file_prefix, num_traits=30, p=0.1)

    singletons_thres = []
    doubletons_thres = []
    common_thres = []
    for i, y in enumerate(phen_list['phenotypes']):

        K = kinship.calc_ibd_kinship(snps)
        K = kinship.scale_k(K)
        lmm = lm.LinearMixedModel(y)
        lmm.add_random_effect(K)
        r1 = lmm.get_REML()
        print 'pseudo_heritability:', r1['pseudo_heritability']

        ex_res = lm.emmax(snps, y, K)
        plt.figure()
        plt.hist(y, 50)
        plt.savefig('%s_%d_phen.png' % (file_prefix, i))
        plt.clf()

        agr.plot_simple_qqplots_pvals('%s_%d' % (file_prefix, i), [
            ex_res['ps'][:1000], ex_res['ps'][1000:2000], ex_res['ps'][2000:]
        ],
                                      result_labels=[
                                          'Common SNPs', 'Singletons',
                                          'Doubletons'
                                      ],
                                      line_colors=['b', 'r', 'y'],
                                      num_dots=200,
                                      max_neg_log_val=3)

        # Cholesky permutations..
        res = lm.emmax_perm_test(singleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        singletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(doubleton_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        doubletons_thres.append(res['threshold_05'][0])
        res = lm.emmax_perm_test(common_snps, y, K, num_perm=1000)
        print 1.0 / (20 * 1000.0), res['threshold_05']
        common_thres.append(res['threshold_05'][0])

        #ATT permutations (Implement)

        #PC permutations (Implement)

    print sp.mean(singletons_thres), sp.std(singletons_thres)
    print sp.mean(doubletons_thres), sp.std(doubletons_thres)
    print sp.mean(common_thres), sp.std(common_thres)
def map_phenotype(p_i, phed, snps_data_file, mapping_method, trans_method, p_dict):
	phenotype_name = phed.getPhenotypeName(p_i)
	phen_is_binary = phed.isBinary(p_i)
	file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phed.getPhenotypeName(p_i),
				mapping_method, trans_method, p_dict['remove_outliers'])
	result_name = "%s_%s_%s" % (phenotype_name, mapping_method, trans_method)

	res = None
	sd = dataParsers.parse_snp_data(snps_data_file , format=p_dict['data_format'], filter=p_dict['debug_filter'])
	num_outliers = gwa.prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'])
	if p_dict['remove_outliers']:
		assert num_outliers != 0, "No outliers were removed, so it makes no sense to go on and perform GWA."

	phen_vals = phed.getPhenVals(p_i)
	snps = sd.getSnps()
	if mapping_method in ['emmax']:
		#Load genotype file (in binary format)
		sys.stdout.write("Retrieving the Kinship matrix K.\n")
		sys.stdout.flush()
		k_file = env['data_dir'] + "kinship_matrix_cm" + str(p_dict['call_method_id']) + ".pickled"
		kinship_file = p_dict['kinship_file']
		if not kinship_file and os.path.isfile(k_file): #Check if corresponding call_method_file is available
			kinship_file = k_file
		if kinship_file:   #Kinship file was somehow supplied..
			print 'Loading supplied kinship'
			k = lm.load_kinship_from_file(kinship_file, sd.accessions)
		else:
			print "No kinship file was found.  Generating kinship file:", k_file
			sd = dataParsers.parse_snp_data(snps_data_file , format=p_dict['data_format'])
			snps = sd.getSnps()
			k_accessions = sd.accessions[:]
			if p_dict['debug_filter']:
				import random
				snps = random.sample(snps, int(p_dict['debug_filter'] * len(snps)))
			k = lm.calc_kinship(snps)
			f = open(k_file, 'w')
			cPickle.dump([k, sd.accessions], f)
			f.close()
			num_outliers = gwa.prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'])
			k = lm.filter_k_for_accessions(k, k_accessions, sd.accessions)
		sys.stdout.flush()
		sys.stdout.write("Done!\n")

	if p_dict['remove_outliers']:
		assert num_outliers != 0, "No outliers were removed, so it makes no sense to go on and perform GWA."


	#Check whether result already exists.
	if p_dict['use_existing_results']:
		print "\nChecking for existing results."
		result_file = file_prefix + ".pvals"
		if os.path.isfile(result_file):
			res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
			pvals = True
		else:
			result_file = file_prefix + ".scores"
			if os.path.isfile(result_file):
				res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
				pvals = False
		if res:
			print "Found existing results.. (%s)" % (result_file)
		sys.stdout.flush()


	if not res: #If results weren't found in a file... then do GWA.

		sys.stdout.write("Finished loading and handling data!\n")

		print "FIRST STEP: Applying %s to data. " % (mapping_method)
		sys.stdout.flush()
		kwargs = {}
		additional_columns = []
		if mapping_method in ['emmax']:
			res = lm.emmax(snps, phen_vals, k)
		elif mapping_method in ['lm']:
			res = lm.linear_model(snps, phen_vals)
		else:
			print "Mapping method", mapping_method, 'was not found.'
			sys.exit(2)

		if mapping_method in ['lm', 'emmax']:
			kwargs['genotype_var_perc'] = res['var_perc']
			betas = map(list, zip(*res['betas']))
			kwargs['beta0'] = betas[0]
			kwargs['beta1'] = betas[1]
			additional_columns.append('genotype_var_perc')
			additional_columns.append('beta0')
			additional_columns.append('beta1')
			pvals = res['ps']
			sys.stdout.write("Done!\n")
			sys.stdout.flush()



		kwargs['correlations'] = calc_correlations(snps, phen_vals)
		additional_columns.append('correlations')

		res = gwaResults.Result(scores=pvals, snps_data=sd, name=result_name, **kwargs)

		if mapping_method in ["emmax", 'lm']:
		 	result_file = file_prefix + ".pvals"
		else:
		 	result_file = file_prefix + ".scores"
		res.write_to_file(result_file, additional_columns)

		print "Generating a GW plot."
		sys.stdout.flush()
		png_file = file_prefix + "_gwa_plot.png"
		#png_file_max30 = file_prefix+"_gwa_plot_max30.png"
		if mapping_method in ['lm', "emmax"]:
			res.neg_log_trans()
			if mapping_method in ["kw", "ft"]:# or p_dict['data_format'] != 'binary':
				#res.plot_manhattan(png_file=png_file_max30,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", 
				#	       plot_bonferroni=True,max_score=30)
				res.plot_manhattan(png_file=png_file, percentile=90, type="pvals", ylab="$-$log$_{10}(p)$",
					       plot_bonferroni=True)
			else:
				if res.filter_attr("mafs", p_dict['mac_threshold']) > 0:
					#res.plot_manhattan(png_file=png_file_max30,percentile=90,type="pvals",ylab="$-$log$_{10}(p)$", 
					#	       plot_bonferroni=True,max_score=30)				
					res.plot_manhattan(png_file=png_file, percentile=90, type="pvals", ylab="$-$log$_{10}(p)$",
						       plot_bonferroni=True)
		else:
			pass

		print "plotting histogram"
		hist_file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phenotype_name, trans_method, p_dict['remove_outliers'])
		hist_png_file = hist_file_prefix + "_hist.png"
		phed.plot_histogram(p_i, pngFile=hist_png_file)
	else:
		res.neg_log_trans()
		assert res.filter_attr("mafs", p_dict['mac_threshold']), 'All SNPs have MAC smaller than threshold'


	print "SECOND STEP:"
	res.filter_top_snps(p_dict['second_step_number'])
	snps = res.snps
	positions = res.positions
	chromosomes = res.chromosomes
	#Checking res_file exists
	file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phed.getPhenotypeName(p_i),
				mapping_method, trans_method, p_dict['remove_outliers'], p_dict['second_step_number'])
	res_file = file_prefix + '_res.cpickled'
	if p_dict['use_existing_results'] and os.path.isfile(res_file):
			print 'Found existing results for the second step... loading.'
			f = open(res_file, 'rb')
			second_res = cPickle.load(f)
			f.close()
	else:
		if mapping_method == 'lm':
			second_res = lm.linear_model_two_snps(snps, phen_vals)
		if mapping_method == 'emmax':
			second_res = lm.emmax_two_snps(snps, phen_vals, k)

		#Pickling results..
		print 'Saving results as pickled file:', res_file
		f = open(res_file, 'wb')
		cPickle.dump(second_res, f, protocol=2)
		f.close()



	#Plotting second step plots:
	score_array = -sp.log10(second_res['ps'])
	p3_score_array = -sp.log10(second_res['p3_ps'])
	p4_score_array = -sp.log10(second_res['p4_ps'])
	import plotResults as pr
	pr.plot_snp_pair_result(chromosomes, positions, score_array, file_prefix + '_scatter')
	pr.plot_snp_pair_result(chromosomes, positions, p3_score_array, file_prefix + '_p3_scatter')
	pr.plot_snp_pair_result(chromosomes, positions, p4_score_array, file_prefix + '_p4_scatter')



	if p_dict['region_plots']:
		import regionPlotter as rp
		regions_results = res.get_top_region_results(p_dict['region_plots'])
		plotter = rp.RegionPlotter()
		print "Starting region plots..."
		for reg_res in regions_results:
			chromosome = reg_res.chromosomes[0]
			caption = phenotype_name + "_c" + str(chromosome) + "_" + mapping_method
			png_file = file_prefix + "_reg_plot_c" + str(chromosome) + "_s" + str(reg_res.positions[0]) \
				+ "_e" + str(reg_res.positions[-1]) + ".png"
			tair_file = file_prefix + "_reg_plot_c" + str(chromosome) + "_s" + str(reg_res.positions[0]) \
				+ "_e" + str(reg_res.positions[-1]) + "_tair_info.txt"
			plotter.plot_small_result([reg_res], png_file=png_file, highlight_gene_ids=tair_ids,
						  caption=caption, tair_file=tair_file)

			#Plot Box-plot
			png_file = file_prefix + "_reg_plot_c" + str(chromosome) + "_s" + str(reg_res.positions[0]) \
				+ "_e" + str(reg_res.positions[-1]) + "_box_plot.png"
			(marker, score, chromosome, pos) = reg_res.get_max_snp()
			marker_accessions = sd.accessions
			phed.plot_marker_box_plot(p_i, marker=marker, marker_accessions=marker_accessions, \
						png_file=png_file, title="c" + str(chromosome) + "_p" + str(pos), \
						marker_score=score, marker_missing_val=sd.missing_val)
Example #10
0
            (SNP_names[:, 0].astype("int"), SNP_names[:, 1].astype("int"),
             np.array(test_H), np.array(test_p), MAF))
        test = np.transpose(test)
        header = 'Chromosome,Position,H,p-val,MAF'
        filename = "GWAS_KW_for_" + PHENO_file + "_" + str(
            sys.argv[2]).split("_")[2].replace(".npy", "") + ".csv"
        np.savetxt(filename, test, delimiter=",", header=header, fmt="%s")
        END_KW = datetime.now()
        print('\rKW completed in ' + str(END_KW - START))

    if EMMAX:
        START = datetime.now()
        print('_____________________________________________________________')
        print('Starting the computation of EMMAX using mixmogam')
        mm_results = lm.emmax(np.transpose(SNP_data),
                              Pheno_data,
                              K_data,
                              with_betas=True)
        betas = mm_results['betas']
        betas = [[0, 0] if i is None else i for i in betas]
        betas = [[0, 0] if len(i) != 2 else i for i in betas]
        betas = np.hstack(betas).reshape(len(betas), 2)
        test = np.vstack(
            (SNP_names[:, 0].astype("int"), SNP_names[:, 1].astype("int"),
             betas[:, 1], mm_results['ps'], MAF))
        test = np.transpose(test)
        header = 'Chromosome,Position,beta,p-val,MAF'
        filename = "GWAS_EMMAX_for_" + PHENO_file + "_" + str(
            sys.argv[2]).split("_")[2].replace(".npy", "") + ".csv"
        np.savetxt(filename,
                   test,
                   delimiter=",",