Ejemplo n.º 1
0
def test_lrt():
    """Regression test pinning the outputs of ``LRTnoK`` on seeded synthetic data.

    A phenotype is simulated from random covariates, two causal variants and
    Gaussian noise.  The null-model and alternative-model negative
    log-likelihoods, the LRT statistic and the simulated p-value are then
    compared against previously recorded reference values.
    """
    import numpy as np
    from numpy import isclose

    from seak.lrt import LRTnoK

    n_samples, n_features = 1000, 10

    # NOTE: the reference values below depend on the exact order of the
    # np.random calls, so the draws must not be reordered.
    np.random.seed(1)

    # random covariates
    covariates = np.random.randn(n_samples, n_features)

    # random genotypes (each entry is 1 with probability 0.01)
    genotypes = np.random.binomial(1, 0.01, size=(n_samples, n_features))

    # phenotype component explained by the covariates
    covariate_effects = np.random.randn(n_features, 1) * 0.05
    phenotype = covariates.dot(covariate_effects)

    # phenotype component explained by the genotypes: only the columns with
    # index > 7 (i.e. 2 of the 10 variants) carry a non-zero effect of 0.5
    variant_effects = (np.arange(n_features) > 7).astype(float)[:, np.newaxis] * 0.5
    phenotype += genotypes.dot(variant_effects)

    # phenotype component explained by random noise
    phenotype += np.random.randn(n_samples, 1)

    lrt = LRTnoK(covariates, phenotype)

    # null model
    null_nll = lrt.model0['nLL']
    null_msg = 'Null model nLL changed. should be ~1385.7447556588409, is {}. Check LRTnoK.__init__'
    assert isclose(null_nll, 1385.7447556588409), null_msg.format(null_nll)

    # alternative model
    altmodel = lrt.altmodel(genotypes)

    alt_nll = altmodel['nLL']
    alt_nll_msg = 'Alt model nLL changed. should be ~1385.7118679498765, is {}. Check LRTnoK.altmodel()'
    assert isclose(alt_nll, 1385.7118679498765), alt_nll_msg.format(alt_nll)

    alt_stat = altmodel['stat']
    alt_stat_msg = 'Alt model LRT test statistic changed. should be ~0.06577541792876218, is {}. Check LRTnoK.altmodel()'
    assert isclose(alt_stat, 0.06577541792876218), alt_stat_msg.format(alt_stat)

    # simulated p-value
    sims = lrt.pv_sim(nsim=1000, seed=21)
    simulated_pv = sims['pv']
    pv_msg = 'pv_sim() output changed. should be 0.353, is {}. Check LRTnoK.pv_sim()'
    assert simulated_pv == 0.353, pv_msg.format(simulated_pv)
class GotNone(Exception):
    """Marker exception carrying no extra state.

    NOTE(review): presumably raised when a value is unexpectedly ``None``;
    the raising and handling code is not visible here -- confirm.
    """

# Set up the covariates loader. The phenotype name, the covariate column names
# and both tab-separated input files are taken from the Snakemake rule.

covariatesloader = CovariatesLoaderCSV(snakemake.params.phenotype,
                                       snakemake.input.covariates_tsv,
                                       snakemake.params.covariate_column_names,
                                       sep='\t',
                                       path_to_phenotypes=snakemake.input.phenotypes_tsv)

# Initialize the null models once at module level.
# The loader returns the phenotype first and the covariates second.
Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('noK')

# NOTE: the two constructors are called with opposite argument order:
# ScoretestNoK(Y, X) versus LRTnoK(X, Y).
null_model_score = ScoretestNoK(Y, X)
null_model_lrt = LRTnoK(X, Y)


# set up function to filter variants:
def maf_filter(mac_report):
    """Select the variant ids of a MAC report that pass the variant filters.

    A variant is kept when its ``MAF`` is below ``snakemake.params.max_maf``,
    its ``Minor`` count is greater than zero and ``alt_greater_ref`` is false.
    When ``snakemake.params.filter_highconfidence`` is set, ``hiconf_reg``
    must additionally be true.

    Parameters
    ----------
    mac_report
        Tab-separated MAC report, passed directly to ``pandas.read_csv``.
        It must contain the columns ``SNP``, ``MAF``, ``Minor`` and
        ``alt_greater_ref``, plus ``hiconf_reg`` when high-confidence
        filtering is enabled.

    NOTE(review): no ``return`` statement is visible in this excerpt; the
    selected ids end up in the local ``vids``.
    """

    # load the MAC report, keep only observed variants with MAF below threshold.
    # 'hiconf_reg' has to be loaded as well when it is used for filtering;
    # otherwise the attribute access below fails on the restricted frame.
    usecols = ['SNP', 'MAF', 'Minor', 'alt_greater_ref']
    if snakemake.params.filter_highconfidence:
        usecols.append('hiconf_reg')

    mac_report = pd.read_csv(mac_report, sep='\t', usecols=usecols)

    if snakemake.params.filter_highconfidence:
        vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & ~(mac_report.alt_greater_ref.astype(bool)) & (mac_report.hiconf_reg.astype(bool))]
    else:
        vids = mac_report.SNP[(mac_report.MAF < snakemake.params.max_maf) & (mac_report.Minor > 0) & ~(mac_report.alt_greater_ref.astype(bool))]

    # this has already been done in filter_variants.py:
    # load the variant annotation, keep only variants in high-confidence regions
        def test_gene(interval, seed):
            """Run the conditional score test and LRT on the RBP variants of one gene.

            Parameters
            ----------
            interval
                Object providing ``to_dict()``; the resulting dict is read for
                the keys ``'name'``, ``'n_snp'`` and ``'cumMAC'`` and is passed
                to ``get_conditional`` and ``get_rbp``.
                (presumably one row of a gene/interval table -- confirm)
            seed
                Forwarded as ``seed`` to ``LRTnoK.pv_sim_chi2``.

            Returns
            -------
            dict
                ``'gene'``, ``'cond_snps'`` and, for every test ``<name>`` that
                was run: ``'pv_score_<name>'``, ``'pv_lrt_<name>'``,
                ``'lrtstat_<name>'``, optionally ``'h2_<name>'`` and, when the
                LRT p-value is not exactly 1, ``'scale_<name>'``,
                ``'dof_<name>'``, ``'mixture_<name>'`` and ``'imax_<name>'``.

            Raises
            ------
            AssertionError
                If the number of loaded variants or their cumulative MAC does
                not match the values stored in ``interval``.

            Side effects: creates ``<out_dir_stats>/<gene>`` and may write one
            ``<name>.pkl.gz`` file per test into it.
            """

            interval = interval.to_dict()

            pval_dict = {}
            pval_dict['gene'] = interval['name']

            out_dir = os.path.join(snakemake.params.out_dir_stats, interval['name'])
            os.makedirs(out_dir, exist_ok=True)

            # conditional analysis:
            # get the snps to condition on, and include them in the null model for the LRT
            # (this local null model shadows the module-level one of the same name)
            cond_snps, cond_snps_vid = get_conditional(interval)
            null_model_lrt = LRTnoK(np.concatenate([X, cond_snps], axis=1), Y)

            # conditional analysis:
            # the score-test takes a second argument (G2) that allows conditioning on a second set of variants...
            def pv_score(GV, G2=cond_snps):
                # wraps score-test; a negative p-value triggers a second attempt
                # with method='saddle' (presumably negative signals a numerical
                # failure of the default method -- confirm)
                pv = null_model_score.pv_alt_model(GV, G2)
                if pv < 0.:
                    pv = null_model_score.pv_alt_model(GV, G2, method='saddle')
                return pv

            def call_test(GV, name):
                # runs the score test and the LRT on GV and stores the results
                # in pval_dict under keys suffixed with `name`
                pval_dict['pv_score_' + name] = pv_score(GV)
                altmodel = null_model_lrt.altmodel(GV)
                res = null_model_lrt.pv_sim_chi2(250000, simzero=False, seed=seed)
                pval_dict['pv_lrt_' + name] = res['pv']
                pval_dict['lrtstat_' + name] = altmodel['stat']
                if 'h2' in altmodel:
                    pval_dict['h2_' + name] = altmodel['h2']

                if res['pv'] != 1.:
                    for stat in ['scale', 'dof', 'mixture', 'imax']:
                        pval_dict[stat + '_' + name] = res[stat]
                    # only dump the simulated values if there are any
                    # (fixed: the comparison used to sit inside the len() call)
                    if len(res['res']) > 0:
                        pd.DataFrame({interval['name']: res['res']}).to_pickle(out_dir + '/{}.pkl.gz'.format(name))

            # load rbp variants
            G, vids, weights, S, ncarrier, cummac, pos, V = get_rbp(interval)

            # sanity checks, done before any test is run so that inconsistent
            # input fails without first writing results
            assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {}  got: {}'.format(interval['n_snp'], len(vids))
            assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum())

            # keep marks the variants for which is_plof() is false
            keep = ~is_plof(vids)

            # cholesky (identity when there is at most one variant)
            if G.shape[1] > 1:
                L, _ = get_cholesky(S)
            else:
                L = np.eye(G.shape[1])

            # test on the weighted genotypes multiplied by the cholesky factor
            GWL = G.dot(np.diag(weights, k=0)).dot(L)
            call_test(GWL, 'linwcholesky')

            if np.any(keep):

                if keep.sum() == 1:
                    # only single SNP is not LOF
                    GWL = G[:, keep].dot(np.diag(weights[keep], k=0)) # actually just the linear weighted kernel
                else:
                    L, _ = get_cholesky(S[np.ix_(keep, keep)])
                    GWL = G[:, keep].dot(np.diag(weights[keep], k=0)).dot(L)

                call_test(GWL, 'linwcholesky_notLOF')

            # conditional analysis: keep names of SNPs that we condition on
            pval_dict['cond_snps'] = ','.join(cond_snps_vid)

            return pval_dict
Ejemplo n.º 4
0
    def test_gene(interval, seed):
        """Run the conditional score test and LRT on the splice variants of one gene.

        Parameters
        ----------
        interval
            Mapping-like object read for the keys ``'name'``, ``'n_snp'`` and
            ``'cumMAC'``; also passed to ``get_conditional``, ``get_splice``
            and ``get_plof``.
        seed
            Forwarded as ``seed`` to ``LRTnoK.pv_sim_chi2``.

        Returns
        -------
        dict
            ``'gene'``, ``'cond_snps'`` and, for every test ``<name>`` that was
            run (``linwb``, ``linw`` and, if a pLOF burden is available and not
            all variants are pLOF, ``linwb_mrgLOF`` and ``linw_cLOF``):
            ``'pv_score_<name>'``, ``'pv_lrt_<name>'``, ``'lrtstat_<name>'``,
            optionally ``'h2_<name>'`` and, when the LRT p-value is not exactly
            1, ``'scale_<name>'``, ``'dof_<name>'``, ``'mixture_<name>'`` and
            ``'imax_<name>'``.

        Raises
        ------
        AssertionError
            If the sanity checks on the loaded variants fail, or if the first
            burden test raises one (in that case the inputs are dumped to
            ``.npy`` files in the working directory before re-raising).

        Side effects: creates ``<out_dir_stats>/<gene>`` and may write one
        ``<name>.pkl.gz`` file per test into it.
        """

        pval_dict = {}
        pval_dict['gene'] = interval['name']

        out_dir = os.path.join(snakemake.params.out_dir_stats, interval['name'])
        os.makedirs(out_dir, exist_ok=True)

        # conditional analysis:
        # get the snps to condition on, and include them in the null model for the LRT
        # (this local null model shadows the module-level one of the same name)
        cond_snps, cond_snps_vid = get_conditional(interval)
        null_model_lrt = LRTnoK(np.concatenate([X, cond_snps], axis=1), Y)

        # conditional analysis:
        # the score-test takes a second argument (G2) that allows conditioning on a second set of variants...
        def pv_score(GV, G2=cond_snps):
            # wraps score-test; a negative p-value triggers a second attempt
            # with method='saddle' (presumably negative signals a numerical
            # failure of the default method -- confirm)
            pv = null_model_score.pv_alt_model(GV, G2)
            if pv < 0.:
                pv = null_model_score.pv_alt_model(GV, G2, method='saddle')
            return pv

        def call_test(GV, name):
            # runs the score test and the LRT on GV and stores the results
            # in pval_dict under keys suffixed with `name`
            pval_dict['pv_score_' + name] = pv_score(GV)

            altmodel = null_model_lrt.altmodel(GV)

            res = null_model_lrt.pv_sim_chi2(250000, simzero=False, seed=seed)
            pval_dict['pv_lrt_' + name] = res['pv']
            pval_dict['lrtstat_' + name] = altmodel['stat']
            if 'h2' in altmodel:
                pval_dict['h2_' + name] = altmodel['h2']
            if res['pv'] != 1.:
                for stat in ['scale', 'dof', 'mixture', 'imax']:
                    pval_dict[stat + '_' + name] = res[stat]
                # only dump the simulated values if there are any
                # (fixed: the comparison used to sit inside the len() call)
                if len(res['res']) > 0:
                    pd.DataFrame({interval['name']: res['res']}).to_pickle(out_dir + '/{}.pkl.gz'.format(name))

        # load splice variants
        G1, vids, weights, ncarrier, cummac, is_plof, splice_preds_all = get_splice(interval)
        # keep indicates which variants are NOT "protein LOF" variants, i.e. variants already identified by the ensembl VEP
        keep = ~is_plof

        # sanity checks
        assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {}  got: {}'.format(interval['n_snp'], len(vids))
        assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum())

        # do a score burden test (max weighted), this is different than the baseline!
        # per row: the largest sqrt(weight) among the variants with G1 > 0.5, or 0 if there is none
        G1_burden = np.max(np.where(G1 > 0.5, np.sqrt(weights), 0.), axis=1, keepdims=True)

        try:
            call_test(G1_burden, 'linwb')
        except AssertionError:
            # dump everything needed to reproduce the failure, then re-raise
            out_dump_np = '{}_{}_{{}}_covar.npy'.format(interval['name'], snakemake.params.phenotype)
            np.save(out_dump_np.format('G1burden'), G1_burden)
            np.save(out_dump_np.format('G1'), G1)
            np.save(out_dump_np.format('Covar'), np.concatenate([X, cond_snps], axis=1))
            np.save(out_dump_np.format('Y'), Y)
            logging.error('AssertionError encountered when testing gene {}, conditioning on {}. dumping Y, covariates and G1 to {}'.format(interval['name'], ','.join(cond_snps_vid), out_dump_np.format('*')))
            raise

        # linear weighted kernel
        G1 = G1.dot(np.diag(np.sqrt(weights), k=0))

        # do a score test (linear weighted)
        call_test(G1, 'linw')

        # load plof burden
        G2 = get_plof(interval)

        if G2 is not None:

            if np.any(keep):

                # merged (single variable)
                G1_burden_mrg = np.maximum(G2, G1_burden)
                call_test(G1_burden_mrg, 'linwb_mrgLOF')

                # concatenated ( >= 2 variables)
                # we separate out the ones that are already part of the protein LOF variants!
                # (G1 already carries the sqrt(weights) scaling at this point)
                G1 = np.concatenate([G1[:, keep], G2], axis=1)
                call_test(G1, 'linw_cLOF')
            else:
                logging.info('All Splice-AI variants for gene {} where already identified by the Ensembl variant effect predictor'.format(interval['name']))

        # conditional analysis: keep names of SNPs that we condition on
        pval_dict['cond_snps'] = ','.join(cond_snps_vid)

        return pval_dict
    def test_gene(interval, seed):
        """Run the conditional score test and LRT on the missense variants of one gene.

        Parameters
        ----------
        interval
            Mapping-like object read for the keys ``'name'``, ``'n_snp'`` and
            ``'cumMAC'``; also passed to ``get_conditional``, ``get_missense``
            and ``get_plof``.
        seed
            Forwarded as ``seed`` to ``LRTnoK.pv_sim_chi2``.

        Returns
        -------
        dict
            ``'gene'``, ``'cond_snps'`` and, for every test ``<name>`` that was
            run (``linwb``, ``linwcollapsed`` and, if a pLOF burden is
            available, ``linwb_mrgLOF`` and ``linwcollapsed_cLOF``):
            ``'pv_score_<name>'``, ``'pv_lrt_<name>'``, ``'lrtstat_<name>'``,
            optionally ``'h2_<name>'`` and, when the LRT p-value is not exactly
            1, ``'scale_<name>'``, ``'dof_<name>'``, ``'mixture_<name>'`` and
            ``'imax_<name>'``.

        Raises
        ------
        AssertionError
            If the number of loaded variants or their cumulative MAC does not
            match the values stored in ``interval``.

        Side effects: creates ``<out_dir_stats>/<gene>`` and may write one
        ``<name>.pkl.gz`` file per test into it.
        """

        pval_dict = {}
        pval_dict['gene'] = interval['name']

        out_dir = os.path.join(snakemake.params.out_dir_stats, interval['name'])
        os.makedirs(out_dir, exist_ok=True)

        # conditional analysis:
        # get the snps to condition on, and include them in the null model for the LRT
        # (this local null model shadows the module-level one of the same name)
        cond_snps, cond_snps_vid = get_conditional(interval)
        null_model_lrt = LRTnoK(np.concatenate([X, cond_snps], axis=1), Y)

        # conditional analysis:
        # the score-test takes a second argument (G2) that allows conditioning on a second set of variants...
        def pv_score(GV, G2=cond_snps):
            # wraps score-test; a negative p-value triggers a second attempt
            # with method='saddle' (presumably negative signals a numerical
            # failure of the default method -- confirm)
            pv = null_model_score.pv_alt_model(GV, G2)
            if pv < 0.:
                pv = null_model_score.pv_alt_model(GV, G2, method='saddle')
            return pv

        def call_test(GV, name):
            # runs the score test and the LRT on GV and stores the results
            # in pval_dict under keys suffixed with `name`
            pval_dict['pv_score_' + name] = pv_score(GV)
            altmodel = null_model_lrt.altmodel(GV)
            res = null_model_lrt.pv_sim_chi2(250000, simzero=False, seed=seed)
            pval_dict['pv_lrt_' + name] = res['pv']
            pval_dict['lrtstat_' + name] = altmodel['stat']
            if 'h2' in altmodel:
                pval_dict['h2_' + name] = altmodel['h2']
            if res['pv'] != 1.:
                for stat in ['scale', 'dof', 'mixture', 'imax']:
                    pval_dict[stat + '_' + name] = res[stat]
                # only dump the simulated values if there are any
                # (fixed: the comparison used to sit inside the len() call)
                if len(res['res']) > 0:
                    pd.DataFrame({interval['name']: res['res']}).to_pickle(out_dir + '/{}.pkl.gz'.format(name))

        # load missense variants
        G1, vids, weights, ncarrier, cummac, pos, ref, alt, cosine_similarity = get_missense(interval)

        # sanity checks
        assert len(vids) == interval['n_snp'], 'Error: number of variants does not match! expected: {}  got: {}'.format(interval['n_snp'], len(vids))
        assert cummac.sum() == interval['cumMAC'], 'Error: cumMAC does not match! expeced: {}, got: {}'.format(interval['cumMAC'], cummac.sum())

        # perform test using gene-specific distribution, gbvc
        # per row: the largest sqrt(weight) among the variants with G1 > 0.5, or 0 if there is none
        G1_burden = np.max(np.where(G1 > 0.5, np.sqrt(weights), 0.), axis=1, keepdims=True)
        call_test(G1_burden, 'linwb')

        # perform local collapsing with weights
        # (the second value returned by collapse() is not used here)
        if G1.shape[1] > 1:
            G1, _ = collapser.collapse(G1, pos, np.sqrt(weights))
        else:
            G1 = G1.dot(np.diag(np.sqrt(weights), k=0))

        # perform test using gene-specific distribution, kernel-based
        call_test(G1, 'linwcollapsed')

        # load plof burden
        G2 = get_plof(interval)

        if G2 is not None:

            # merged (single variable)
            G1_burden_mrg = np.maximum(G2, G1_burden)
            call_test(G1_burden_mrg, 'linwb_mrgLOF')

            # concatenated
            call_test(np.concatenate([G1, G2], axis=1), 'linwcollapsed_cLOF')

        # conditional analysis: keep names of SNPs that we condition on
        pval_dict['cond_snps'] = ','.join(cond_snps_vid)

        return pval_dict