コード例 #1
0
ファイル: gwas.py プロジェクト: juliuspm/PyGWAS
def linear_model(snps, phenotypes, cofactors=None):
    lm = LinearModel(phenotypes)
    if cofactors:
        for cofactor in cofactors:
            lm.add_factor(cofactor)
    log.info("Running a standard linear model")
    t = Timer()
    res = lm.fast_f_test(snps)
    log.info('Took: %s' % t.stop(True))
    return res
コード例 #2
0
ファイル: gwas.py プロジェクト: juliuspm/PyGWAS
def anova(snps, phenotypes):
    """
    Run EMMAX
    """
    lmm = LinearModel(phenotypes)

    log.info("Running ANOVA")
    t = Timer()
    res = lmm.anova_f_test(snps)
    log.info('Took: %s' % t.stop(True))
    return res
コード例 #3
0
ファイル: gwas.py プロジェクト: juliuspm/PyGWAS
def mm_lrt_test(y, K):
    """
    Likelihood ratio test for whether the data (y) fits a mixed model with 
    two random terms fits significantly better.
    """
    lm = LinearModel(y)
    lmm = LinearMixedModel(y)
    lmm.add_random_effect(K)
    lmm_res = lmm.get_ML()
    ll0 = lm.get_ll()
    ll1 = lmm_res['max_ll']
    D = 2 * (ll1 - ll0)
    pval = sp.chi2.sf(D, 1)
    return {'pval':pval, 'lrt_stat':D}
コード例 #4
0
ファイル: gwas.py プロジェクト: juliuspm/PyGWAS
def lm_step_wise(phenotypes, sd=None, num_steps=10, file_prefix=None, forward_backwards=True, local=False,
        cand_gene_list=None, plot_xaxis=True, with_qq_plots=True, sign_threshold=None, log_qq_max_val=5,
        highlight_loci=None, save_pvals=False, markersize=3, chrom_col_map=None, **kwargs):
    """
    Run simple step-wise linear model forward-backward.
    """
    # import plotResults as pr
    if local:
        with_qq_plots = False

    if sd:
        kwargs['snps'] = sd.getSnps()
        kwargs['positions'] = sd.getPositions()
        kwargs['chromosomes'] = sd.get_chr_list()
        d = sd.get_mafs()
        kwargs['macs'] = d['mafs']
        kwargs['mafs'] = d['marfs']

    snps = kwargs['snps'][:]
    positions = kwargs['positions'][:]
    chromosomes = kwargs['chromosomes'][:]
    mafs = kwargs['mafs'][:]
    macs = kwargs['macs'][:]

    chr_pos_list = zip(chromosomes, positions)
    lm = LinearModel(phenotypes)
    num_snps = len(snps)

    if not sign_threshold:  # Then use Bonferroni threshold
        sign_threshold = 1.0 / (num_snps * 20.0)

    print "Running step-wise LM"
    s1 = time.time()
    step_info_list = []
    cofactors = []  # A list of the loci found, together with their statistics.
    cofactor_snps = []
    step_i = 0
    num_par = 2  # mean and variance scalar

    rss = lm.get_rss()
    ll = lm.get_ll(rss)
    criterias = {'ebics':[], 'mbics':[], 'bonf':[], 'mbonf':[]}
    (bic, extended_bic, modified_bic) = _calc_bic_(ll, num_snps, num_par, lm.n)  # Calculate the BICs
    criterias['ebics'].append(extended_bic)
    criterias['mbics'].append(modified_bic)
    max_cofactor_pval = 0
    criterias['mbonf'].append(max_cofactor_pval)
    criterias['bonf'].append(0)
    action = 'None'
    print '\nStep %d: action=%s, num_par=%d, ll=%0.2f, rss=%0.2f, bic=%0.2f, extended_bic=%0.2f, modified_bic=%0.2f' % \
        (step_i, action, num_par, ll, rss, bic, extended_bic, modified_bic)
    print 'Cofactors:', _cofactors_to_string_(cofactors)
    quantiles_dict = {'log':[], 'norm':[], 'labels':[]}

    for step_i in range(1, num_steps + 1):
        lm_res = lm.fast_f_test(snps)
        if step_i == 1:
            first_lm_res = lm_res
        min_pval_i = sp.argmin(lm_res['ps'])
        min_pval = lm_res['ps'][min_pval_i]
        min_pval_chr_pos = chr_pos_list[min_pval_i]
        print 'Min p-value:', min_pval
        criterias['bonf'].append(min_pval)
        step_info = {'rss':rss, 'll':ll, 'bic':bic, 'e_bic':extended_bic, 'm_bic':modified_bic,
                'mbonf':max_cofactor_pval, 'cofactors':map(tuple, cofactors[:]),
                'cofactor_snps':cofactor_snps[:], 'min_pval':min_pval,
                'min_pval_chr_pos': min_pval_chr_pos}

        lm_pvals = lm_res['ps'].tolist()
        # Plot gwas results per step 
        if file_prefix:
            _plot_manhattan_and_qq_(file_prefix, step_i - 1, lm_pvals, quantiles_dict, positions=positions,
                chromosomes=chromosomes, mafs=mafs, macs=macs, plot_bonferroni=True, highlight_markers=cofactors,
                cand_genes=cand_gene_list, plot_xaxis=plot_xaxis, log_qq_max_val=log_qq_max_val,
                with_qq_plots=with_qq_plots, highlight_loci=highlight_loci, write_pvals=save_pvals,
                markersize=markersize, chrom_col_map=chrom_col_map)
        if save_pvals:
            step_info['ps'] = lm_pvals


        if cand_gene_list:
            # Calculate candidate gene enrichments.
            pass
        step_info['kolmogorov_smirnov'] = agr.calc_ks_stats(lm_pvals)
        step_info['pval_median'] = agr.calc_median(lm_pvals)
        print step_info['kolmogorov_smirnov'], step_info['pval_median']
        step_info_list.append(step_info)


        # Adding the new SNP as a cofactor
        lm.add_factor(snps[min_pval_i])
        cofactor_snps.append(snps[min_pval_i])
        rss = lm.get_rss()
        ll = lm.get_ll(rss)
        num_par += 1
        action = '+'

        cofactors.append([min_pval_chr_pos[0], min_pval_chr_pos[1], min_pval])


        # Re-estimate the p-value of the cofactors... with the smallest in the list.
        cofactor_pvals = []
        for i, snp in enumerate(cofactor_snps):
            t_cofactors = cofactor_snps[:]
            del t_cofactors[i]
            lm.set_factors(t_cofactors)
            pval = lm.fast_f_test([snp])['ps'][0]
            cofactor_pvals.append(pval)
            cofactors[i][2] = -math.log10(pval)
        lm.set_factors(cofactor_snps)
        max_cofactor_pval = max(cofactor_pvals)
        criterias['mbonf'].append(max_cofactor_pval)

        # Remove the found SNP from considered SNPs
        del snps[min_pval_i]
        del positions[min_pval_i]
        del chromosomes[min_pval_i]
        del chr_pos_list[min_pval_i]
        del mafs[min_pval_i]
        del macs[min_pval_i]
        num_snps -= 1


        (bic, extended_bic, modified_bic) = _calc_bic_(ll, num_snps, num_par, lm.n)  # Calculate the BICs
        criterias['ebics'].append(extended_bic)
        criterias['mbics'].append(modified_bic)

        print '\nStep %d: action=%s, num_par=%d, ll=%0.2f, rss=%0.2f, bic=%0.2f, extended_bic=%0.2f, modified_bic=%0.2f' % \
            (step_i, action, num_par, ll, rss, bic, extended_bic, modified_bic)
        print 'Cofactors:', _cofactors_to_string_(cofactors)

    lm_res = lm.fast_f_test(snps)
    min_pval_i = sp.argmin(lm_res['ps'])
    min_pval = lm_res['ps'][min_pval_i]
    min_pval_chr_pos = chr_pos_list[min_pval_i]
    print 'Min p-value:', min_pval
    step_info = {'rss':rss, 'll':ll, 'bic':bic, 'e_bic':extended_bic, 'm_bic':modified_bic,
        'mbonf':max_cofactor_pval, 'cofactors':map(tuple, cofactors[:]),
        'cofactor_snps':cofactor_snps[:], 'min_pval':min_pval, 'min_pval_chr_pos': min_pval_chr_pos}
    lm_pvals = lm_res['ps'].tolist()
    if save_pvals:
        step_info['ps'] = lm_pvals

    # Now plotting!
    print "Generating plots"
    if file_prefix:
        _plot_manhattan_and_qq_(file_prefix, step_i, lm_pvals, quantiles_dict, positions=positions,
                    chromosomes=chromosomes, mafs=mafs, macs=macs, plot_bonferroni=True, highlight_markers=cofactors,
                    cand_genes=cand_gene_list, plot_xaxis=plot_xaxis, log_qq_max_val=log_qq_max_val,
                    with_qq_plots=with_qq_plots, highlight_loci=highlight_loci, write_pvals=save_pvals,
                    markersize=markersize, chrom_col_map=chrom_col_map)

    max_num_cofactors = len(cofactors)
    step_info['kolmogorov_smirnov'] = agr.calc_ks_stats(lm_pvals)
    step_info['pval_median'] = agr.calc_median(lm_pvals)
    print step_info['kolmogorov_smirnov'], step_info['pval_median']
    step_info_list.append(step_info)

    # Now backward stepwise.
    if forward_backwards:
        print 'Starting backwards..'
        while len(cofactor_snps) > 1:
            step_i += 1
            f_stats = sp.zeros(len(cofactor_snps))
            for i, snp in enumerate(cofactor_snps):
                t_cofactors = cofactor_snps[:]
                del t_cofactors[i]
                lm.set_factors(t_cofactors)
                res = lm.fast_f_test([snp])
                cofactors[i][2] = -math.log10(res['ps'][0])
                f_stats[i] = res['f_stats'][0]
            i_to_remove = f_stats.argmin()
            del cofactor_snps[i_to_remove]
            del cofactors[i_to_remove]
            lm.set_factors(cofactor_snps)
            num_snps += 1

            # Re-estimating the REML and ML.
            rss = lm.get_rss()
            ll = lm.get_ll(rss)
            num_par -= 1
            action = '-'

            # Update the p-values
            cofactor_pvals = []
            for i, snp in enumerate(cofactor_snps):
                t_cofactors = cofactor_snps[:]
                del t_cofactors[i]
                lm.set_factors(t_cofactors)
                res = lm.fast_f_test([snp])
                pval = res['ps'][0]
                cofactor_pvals.append(pval)
                cofactors[i][2] = -math.log10(pval)
            max_cofactor_pval = max(cofactor_pvals)
            criterias['mbonf'].append(max_cofactor_pval)

            # Calculate the BICs
            (bic, extended_bic, modified_bic) = _calc_bic_(ll, num_snps, num_par, lm.n)
            criterias['ebics'].append(extended_bic)
            criterias['mbics'].append(modified_bic)
            print '\nStep %d: action=%s, num_par=%d, ll=%0.2f, rss=%0.2f, bic=%0.2f, extended_bic=%0.2f, modified_bic=%0.2f' % \
                (step_i, action, num_par, ll, rss, bic, extended_bic, modified_bic)
            print 'Cofactors:', _cofactors_to_string_(cofactors)

            step_info = {'rss':rss, 'll':ll, 'bic':bic, 'e_bic':extended_bic, 'm_bic':modified_bic,
                'mbonf':max_cofactor_pval, 'cofactors':map(tuple, cofactors[:]),
                'cofactor_snps':cofactor_snps[:], 'min_pval':None, 'min_pval_chr_pos':None,
                'kolmogorov_smirnov':None, 'pval_median':None}
            step_info_list.append(step_info)
            print cofactors

    opt_dict, opt_indices = _analyze_opt_criterias_(criterias, sign_threshold, max_num_cofactors, file_prefix,
                        with_qq_plots, lm, step_info_list, quantiles_dict,
                        plot_bonferroni=True, cand_genes=cand_gene_list, plot_xaxis=plot_xaxis,
                        log_qq_max_val=log_qq_max_val, type='lm', highlight_loci=highlight_loci,
                        write_pvals=save_pvals, markersize=markersize,
                        chrom_col_map=chrom_col_map, **kwargs)


    for step_i in opt_indices:
        for h in ['min_pval', 'min_pval_chr_pos', 'kolmogorov_smirnov', 'pval_median']:
            step_info_list[step_i][h] = opt_indices[step_i][h]

    if file_prefix:
        _plot_stepwise_stats_(file_prefix, step_info_list, sign_threshold, type == 'lm')

    res_dict = {'step_info_list':step_info_list, 'first_lm_res':first_lm_res, 'opt_dict':opt_dict}

    secs = time.time() - s1
    if secs > 60:
        mins = int(secs) / 60
        secs = secs - mins * 60
        print 'Took %d mins and %f seconds.' % (mins, secs)
    else:
        print 'Took %f seconds.' % (secs)
    return res_dict
コード例 #5
0
ファイル: gwas.py プロジェクト: juliuspm/PyGWAS
def lin_reg_step(phen_vals, sd, cof_chr_pos_list, progress_file_writer=None, plot_prefix=None):
    """
    Standard linear regression single SNPs..
    
    Returns various stats useful for stepwise regression.
    """
    import bisect
    t = Timer()
    lm = LinearModel(phen_vals)

    h0_rss = lm.get_rss()
    step_dict = {}
    step_dict['h0_rss'] = h0_rss

    
    log.info('Looking up cofactors')
    cof_indices,cof_snps  = sd.get_snps_from_pos(cof_chr_pos_list)
    lm.set_factors(cof_snps)

    if not progress_file_writer == None:
        progress_file_writer.update_progress_bar(progress=0.20, task_status='Performing linear regression')
        progress_file_writer.set_step(0.05)


    log.info('Performing linear regression.')
    r = lm.fast_f_test(sd, progress_file_writer=progress_file_writer)
    min_pval_i = sp.argmin(r['ps'])
    step_dict['min_pval_i'] = min_pval_i
    step_dict['min_pval'] = r['ps'][min_pval_i]
    step_dict['mahalanobis_rss'] = r['rss'][min_pval_i]
    step_dict['min_pval_chr_pos'] = sd.get_chr_pos_from_index(min_pval_i)

    num_snps = sd.num_snps
    num_par = lm.X.shape[1] + 1
    step_dict['num_snps'] = num_snps
    step_dict['num_par'] = num_par
    rss = lm.get_rss()
    ll = lm.get_ll(rss)
    (bic, extended_bic, modified_bic) = _calc_bic_(ll, num_snps, num_par, lm.n)  # Calculate the BICs
    step_dict['ebic'] = extended_bic
    step_dict['mbic'] = modified_bic
    step_dict['bic'] = bic

    step_dict['rss'] = rss
    perc_var_expl = 1.0 - (rss / h0_rss)
    step_dict['perc_var_expl'] = perc_var_expl

    # Calculate maximum cofactor p-value\
    log.info('Updating the cofactor p-values')
    cof_pvals = []
    cof_chrom_pos_pval_list = []
    for i, snp in enumerate(cof_snps):
        t_cofactors = cof_snps[:]
        del t_cofactors[i]
        lm.set_factors(t_cofactors)
        res = lm.fast_f_test([snp])
        cof_pval = res['ps'][0]
        cof_pvals.append(cof_pval)
        cof_chrom_pos_pval_list.append((cof_chr_pos_list[i][0], cof_chr_pos_list[i][1], -math.log10(cof_pval)))
    for i, pval in zip(cof_indices, cof_pvals):
        r['ps'][i] = pval
    if len(cof_pvals):
        step_dict['max_cof_pval'] = max(cof_pvals)
    else:
        step_dict['max_cof_pval'] = 0.0
    log.info('Took:%s' % t.stop(True))

    return {'stats':step_dict, 'res':r,'cof_chrom_pos_pval_list':cof_chrom_pos_pval_list}