Ejemplo n.º 1
0
    def test_linreg(self):
        """Run the first 10 SNPs through ``single_snp`` and ``single_snp_linreg``.

        ``single_snp`` is called with an identity ``G0`` and ``mixing=0``; six
        selected columns of its result are passed to ``self.compare_files``
        under the name "linreg".  The full ``single_snp_linreg`` result is
        passed to ``self.compare_files`` under the same name.
        """
        logging.info("TestSingleSnp test_linreg")
        snp_reader = Bed(self.bedbase)
        pheno_fn = self.phen_fn
        covar_fn = self.cov_fn
        out_path = self.file_name("linreg")

        # Mixed-model entry point, configured with an identity kernel.
        lmm_frame = single_snp(
            test_snps=snp_reader[:, :10],
            pheno=pheno_fn,
            mixing=0,
            leave_out_one_chrom=False,
            G0=KernelIdentity(iid=snp_reader.iid),
            covar=covar_fn,
            output_file_name=out_path,
        )
        kept_columns = [
            'sid_index', 'SNP', 'Chr', 'GenDist', 'ChrPos', 'PValue'
        ]
        self.compare_files(lmm_frame[kept_columns], "linreg")

        # Dedicated linear-regression entry point, same inputs and output file.
        linreg_frame = single_snp_linreg(
            test_snps=snp_reader[:, :10],
            pheno=pheno_fn,
            covar=covar_fn,
            output_file_name=out_path,
        )
        self.compare_files(linreg_frame, "linreg")
Ejemplo n.º 2
0
    def test_G0_has_reader(self):
        """Call ``single_snp`` twice with the SNP reader in different kernel slots.

        First the reader is given as ``G0`` with ``mixing=0``; then an identity
        kernel is given as ``G0``, the reader as ``G1``, with ``mixing=1``.
        Each result is passed to ``self.compare_files`` under the name "one".
        """
        logging.info("TestSingleSnp test_G0_has_reader")
        snp_reader = Bed(self.bedbase)
        pheno_fn = self.phen_fn
        covar_fn = self.cov_fn
        out_path = self.file_name("G0_has_reader")

        # Reader supplied as G0, all weight on G0.
        reader_as_g0 = single_snp(
            test_snps=snp_reader[:, :10],
            pheno=pheno_fn,
            G0=snp_reader,
            leave_out_one_chrom=False,
            covar=covar_fn,
            mixing=0,
            output_file_name=out_path,
        )
        self.compare_files(reader_as_g0, "one")

        # Reader supplied as G1, all weight on G1; G0 is an identity kernel.
        reader_as_g1 = single_snp(
            test_snps=snp_reader[:, :10],
            pheno=pheno_fn,
            G0=KernelIdentity(snp_reader.iid),
            G1=snp_reader,
            leave_out_one_chrom=False,
            covar=covar_fn,
            mixing=1,
            output_file_name=out_path,
        )
        self.compare_files(reader_as_g1, "one")
Ejemplo n.º 3
0
def _internal_determine_block_size(K0, K1, mixing, force_full_rank,
                                   force_low_rank):
    """Return a size derived from the two kernels, the mixing and the rank flags.

    A ``SnpKernel`` whose reader has zero SNPs is first replaced by a
    ``KernelIdentity`` over the same iids.  The returned value is then:

    * ``K0.iid_count`` when both kernels are identity kernels;
    * the single-kernel size of ``K0`` when ``mixing == 0.0`` or ``K1`` is an
      identity kernel;
    * the single-kernel size of ``K1`` when ``mixing == 1.0`` or ``K0`` is an
      identity kernel;
    * the combined SNP count of both kernels when both are ``SnpKernel``,
      ``force_full_rank`` is false, and either ``force_low_rank`` is true or
      that combined count is below ``K0.iid_count``;
    * ``K0.iid_count`` otherwise.

    The "single-kernel size" is the kernel's SNP count when it is a
    ``SnpKernel``, ``force_full_rank`` is false, and either ``force_low_rank``
    is true or the SNP count is below its iid count; otherwise its iid count.

    ``force_full_rank`` and ``force_low_rank`` must not both be true.
    """
    assert not (force_full_rank and force_low_rank), "real assert"

    def _identity_if_empty(kernel):
        # A SnpKernel without any SNPs is stood in for by an identity kernel.
        if isinstance(kernel, SnpKernel) and kernel.snpreader.sid_count == 0:
            return KernelIdentity(kernel.iid)
        return kernel

    def _size_of_single(kernel):
        # Size when only this one kernel matters.
        if not isinstance(kernel, SnpKernel) or force_full_rank:
            return kernel.iid_count
        if force_low_rank or kernel.snpreader.sid_count < kernel.iid_count:
            return kernel.snpreader.sid_count
        return kernel.iid_count

    K0 = _identity_if_empty(K0)
    K1 = _identity_if_empty(K1)
    k0_is_identity = isinstance(K0, KernelIdentity)
    k1_is_identity = isinstance(K1, KernelIdentity)

    # Both kernels are the identity: just use the first one.
    if k0_is_identity and k1_is_identity:
        return K0.iid_count

    # Only one kernel is in play, by mixing or because the other is identity.
    if mixing == 0.0 or k1_is_identity:
        return _size_of_single(K0)
    if mixing == 1.0 or k0_is_identity:
        return _size_of_single(K1)

    # Both kernels are collections of SNPs and may be treated as low rank.
    both_are_snps = isinstance(K0, SnpKernel) and isinstance(K1, SnpKernel)
    if both_are_snps and not force_full_rank:
        if (force_low_rank or
                K0.snpreader.sid_count + K1.snpreader.sid_count < K0.iid_count):
            return K0.snpreader.sid_count + K1.snpreader.sid_count

    # Most general case: treat the kernels as full-rank kernels.
    return K0.iid_count
Ejemplo n.º 4
0
def _K_per_chrom(K, chrom, iid, count_A1=None):
    """Return a kernel built from ``K`` that excludes the SNPs matching ``chrom``.

    When ``K`` is None, a ``KernelIdentity`` over ``iid`` is returned.
    Otherwise ``K`` is passed through ``_kernel_fixup`` with a ``Unit``
    standardizer.  If the result is a ``SnpKernel``, a new ``SnpKernel`` is
    returned over the SNPs whose ``pos[:, 0]`` value differs from ``chrom``,
    reusing the fixed-up kernel's standardizer.

    Raises:
        Exception: if the fixed-up kernel is not a ``SnpKernel``.
    """
    if K is None:
        return KernelIdentity(iid)

    fixed_kernel = _kernel_fixup(K,
                                 iid_if_none=iid,
                                 standardizer=Unit(),
                                 count_A1=count_A1)
    if not isinstance(fixed_kernel, SnpKernel):
        raise Exception(
            "Don't know how to make '{0}' work per chrom".format(fixed_kernel))

    reader = fixed_kernel.snpreader
    keep_mask = fixed_kernel.pos[:, 0] != chrom
    return SnpKernel(reader[:, keep_mask], fixed_kernel.standardizer)
Ejemplo n.º 5
0
    def test_none(self):
        """Run ``single_snp`` on the first 10 SNPs with an identity ``K0``.

        ``mixing`` is 0 and ``leave_out_one_chrom`` is off.  The result is
        passed to ``self.compare_files`` under the name "none".
        """
        logging.info("TestSingleSnp test_none")
        snp_reader = Bed(self.bedbase)
        pheno_fn = self.phen_fn
        covar_fn = self.cov_fn
        out_path = self.file_name("none")

        result = single_snp(
            test_snps=snp_reader[:, :10],
            pheno=pheno_fn,
            mixing=0,
            leave_out_one_chrom=False,
            K0=KernelIdentity(snp_reader.iid),
            covar=covar_fn,
            output_file_name=out_path,
        )
        self.compare_files(result, "none")
Ejemplo n.º 6
0
    def test_mixid(self):
        """Run ``single_snp`` mixing a SNP-based ``G0`` with an identity ``K1``.

        SNPs 10..99 form ``G0``, ``K1`` is an identity kernel, and ``mixing``
        is .25.  The result is passed to ``self.compare_files`` under the name
        "mixid".
        """
        logging.info("TestSingleSnp test_mixid")
        snp_reader = Bed(self.bedbase)
        pheno_fn = self.phen_fn
        covar_fn = self.cov_fn
        out_path = self.file_name("mixid")

        result = single_snp(
            test_snps=snp_reader[:, :10],
            pheno=pheno_fn,
            G0=snp_reader[:, 10:100],
            leave_out_one_chrom=False,
            covar=covar_fn,
            K1=KernelIdentity(snp_reader.iid),
            mixing=.25,
            output_file_name=out_path,
        )
        self.compare_files(result, "mixid")
Ejemplo n.º 7
0
 def k_index_to_nLL_mapper(k):
     """Return the negative log likelihood summed over folds for a given ``k``.

     For every fold produced by ``_kfold``, a ``FastLMM`` model is fit on the
     training rows using the first ``k`` entries of
     ``fold_index_to_top_snps[fold_index]`` as its SNPs (or no kernel when
     ``k`` is 0), then scored on the test rows.  The per-fold nLL values are
     summed and returned; the average MSE is only logged.

     NOTE(review): ``test_snps``, ``G``, ``pheno``, ``covar``, ``count_A1``,
     ``n_folds``, ``seed``, ``fold_index_to_top_snps``, ``force_full_rank``,
     ``force_low_rank``, ``GB_goal`` and ``h2`` are read from an enclosing
     scope that is not visible here.
     """
     # Only the fixed-up G, pheno and covar are used; the first result is discarded.
     _, G_in, pheno_in, covar_in = _fixup(test_snps,
                                          G,
                                          pheno,
                                          covar,
                                          count_A1=count_A1)
     nll_sum = 0
     mse_sum = 0
     n_folds_in = 0
     # NOTE(review): folds are generated from G (not G_in) but applied to
     # G_in, pheno_in and covar_in -- confirm the two share the same iid order.
     for fold_index, (train_idx,
                      test_idx) in _kfold(G.iid_count,
                                          n_folds,
                                          seed,
                                          end_with_all=False,
                                          iid_to_index=G.iid_to_index):
         assert set(train_idx).isdisjoint(set(test_idx)), "real assert"
         # The first k SNPs listed for this fold, translated to column indexes of G_in.
         top_snps_in_fold = fold_index_to_top_snps[fold_index][:k]
         sid_idx_in_fold = G_in.sid_to_index(top_snps_in_fold)
         G_train = G_in[train_idx, sid_idx_in_fold] if k > 0 else None
         fastlmm = FastLMM(force_full_rank=force_full_rank,
                           force_low_rank=force_low_rank,
                           GB_goal=GB_goal)
         fastlmm.fit(
             K0_train=G_train,
             X=covar_in[train_idx, :],
             y=pheno_in[train_idx, :],
             h2raw=h2
         )  # iid intersection means we can give the whole covariate and pheno
         # With k == 0 there are no SNPs, so an identity kernel over the test iids is used.
         G_test = G_in[
             test_idx, sid_idx_in_fold] if k > 0 else KernelIdentity(
                 G_in.iid, G_in.iid[test_idx]
             )  #!!! instead of this, which blows up when # of iids is large, should switch to linear regression model with k is 0
         nll, mse = fastlmm.score(
             K0_whole_test=G_test,
             X=covar_in[test_idx, :],
             y=pheno_in[test_idx, :],
             return_mse_too=True
         )  # iid intersection means we can give the whole covariate and pheno
         nll_sum += nll
         mse_sum += mse
         n_folds_in += 1
     # The MSE is reported as an average over folds; the nLL is reported as a sum.
     logging.info("k={0},nLL={1},average mse={2}".format(
         k, nll_sum, mse_sum / n_folds_in))
     return nll_sum
Ejemplo n.º 8
0
def _kernel_fixup(input,
                  iid_if_none,
                  standardizer,
                  test=None,
                  test_iid_if_none=None,
                  block_size=None,
                  train_snps=None,
                  count_A1=None):
    """Turn the various accepted kernel specifications into a kernel object.

    Handling, in order:

    * If ``input`` is None but ``test`` is given, ``test`` takes the place of
      ``input`` and ``test`` becomes None.
    * A string ``input`` ending in ".npz" returns ``KernelNpz(input)``.
    * Any other string ``input`` (and any string ``test``) is opened with
      ``Bed(..., count_A1=count_A1)`` and processing continues.
    * A ``SnpReader`` ``input`` returns a ``SnpKernel`` when ``test`` is None,
      otherwise a ``_SnpWholeTest`` built from ``train_snps`` and ``test``.
    * A None ``input`` returns ``KernelIdentity(iid=iid_if_none,
      test=test_iid_if_none)``.
    * Anything else is returned unchanged.
    """
    # A lone test specification is promoted to be the input.
    if input is None and test is not None:
        input, test = test, None

    if isinstance(input, str):
        if input.endswith(".npz"):
            return KernelNpz(input)
        # No return here: the Bed reader falls through to the SnpReader branch.
        input = Bed(input, count_A1=count_A1)
    if isinstance(test, str):
        # Likewise no return: the reader is used below.
        test = Bed(test, count_A1=count_A1)

    if isinstance(input, SnpReader):
        if test is None:
            return SnpKernel(input,
                             standardizer=standardizer,
                             block_size=block_size)
        return _SnpWholeTest(train=train_snps,
                             test=test,
                             standardizer=standardizer,
                             block_size=block_size)

    if input is not None:
        return input
    return KernelIdentity(iid=iid_if_none, test=test_iid_if_none)
Ejemplo n.º 9
0
def _kernel_fixup(input,
                  iid_if_none,
                  standardizer,
                  test=None,
                  test_iid_if_none=None):
    """Turn the various accepted kernel specifications into a kernel object.

    Handling, in order:

    * If ``input`` is None but ``test`` is given, ``test`` takes the place of
      ``input`` and ``test`` becomes None.
    * A string ``input`` ending in ".npz" returns ``KernelNpz(input)``.
    * Any other string ``input`` (and any string ``test``) is opened with
      ``Bed`` and processing continues.
    * A ``SnpReader`` ``input`` returns
      ``SnpKernel(input, standardizer=standardizer, test=test)``.
    * A None ``input`` returns ``KernelIdentity(iid=iid_if_none,
      test=test_iid_if_none)``.
    * Anything else is returned unchanged.
    """
    # A lone test specification is promoted to be the input.
    if input is None and test is not None:
        input, test = test, None

    if isinstance(input, str):
        if input.endswith(".npz"):
            return KernelNpz(input)
        # No return here: the Bed reader falls through to the SnpReader branch.
        input = Bed(input)
    if isinstance(test, str):
        test = Bed(test)

    if isinstance(input, SnpReader):
        return SnpKernel(input, standardizer=standardizer, test=test)

    if input is not None:
        return input
    return KernelIdentity(iid=iid_if_none, test=test_iid_if_none)
Ejemplo n.º 10
0
    def test_match_cpp(self):
        r'''
        Check single_snp p-values against a reference file for the command below.

        match
            FaSTLMM.207\Data\DemoData>..\.cd.\bin\windows\cpp_mkl\fastlmmc -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.singlesnp.txt -logDelta 0 -verbose 100

        The similarity SNPs are supplied once as G0 (with an identity G1) and
        once as G1 (with an identity G0).  For each arrangement single_snp is
        run with h2=.5 and again with log_delta=0, and every p-value must be
        within a relative difference of .035 of the reference value.

        The docstring is a raw string because the Windows path above contains
        backslashes that would otherwise be read as escape sequences.
        '''
        logging.info("TestSingleSnp test_match_cpp")
        snps = Bed(
            os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"))
        pheno = os.path.join(self.pythonpath,
                             "tests/datasets/selecttest/pheno.txt")
        covar = os.path.join(self.pythonpath,
                             "tests/datasets/selecttest/covariate.txt")
        # SNPs used to build the similarity kernel.
        sim_sid = [
            "snp26250_m0_.19m1_.19", "snp82500_m0_.28m1_.28",
            "snp63751_m0_.23m1_.23", "snp48753_m0_.4m1_.4",
            "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
            "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07",
            "snp11253_m0_.2m1_.2", "snp86250_m0_.33m1_.33",
            "snp3753_m0_.23m1_.23", "snp75003_m0_.32m1_.32",
            "snp30002_m0_.25m1_.25", "snp26252_m0_.19m1_.19",
            "snp67501_m0_.15m1_.15", "snp63750_m0_.28m1_.28",
            "snp30001_m0_.28m1_.28", "snp52502_m0_.35m1_.35",
            "snp33752_m0_.31m1_.31", "snp37503_m0_.37m1_.37",
            "snp15002_m0_.11m1_.11", "snp3751_m0_.34m1_.34",
            "snp7502_m0_.18m1_.18", "snp52503_m0_.3m1_.3",
            "snp30000_m0_.39m1_.39", "isnp4457_m0_.11m1_.11",
            "isnp23145_m0_.2m1_.2", "snp60001_m0_.39m1_.39",
            "snp33753_m0_.16m1_.16", "isnp60813_m0_.2m1_.2",
            "snp82502_m0_.34m1_.34", "snp11252_m0_.13m1_.13"
        ]
        sim_idx = snps.sid_to_index(sim_sid)
        # SNPs whose association is tested.
        test_sid = [
            "snp26250_m0_.19m1_.19", "snp63751_m0_.23m1_.23",
            "snp82500_m0_.28m1_.28", "snp48753_m0_.4m1_.4",
            "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
            "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07",
            "snp86250_m0_.33m1_.33", "snp15002_m0_.11m1_.11",
            "snp33752_m0_.31m1_.31", "snp26252_m0_.19m1_.19",
            "snp30001_m0_.28m1_.28", "snp11253_m0_.2m1_.2",
            "snp67501_m0_.15m1_.15", "snp3753_m0_.23m1_.23",
            "snp52502_m0_.35m1_.35", "snp30000_m0_.39m1_.39",
            "snp30002_m0_.25m1_.25"
        ]
        test_idx = snps.sid_to_index(test_sid)

        # The reference table is the same for every run, so it is read once.
        referenceOutfile = TestFeatureSelection.reference_file(
            "single_snp/topsnps.single.txt")
        reference = pd.read_table(
            referenceOutfile, sep="\t"
        )  # We've manually removed all comments and blank lines from this file

        for G0, G1 in [(snps[:, sim_idx], KernelIdentity(snps.iid)),
                       (KernelIdentity(snps.iid), snps[:, sim_idx])]:
            frame_h2 = single_snp(test_snps=snps[:, test_idx],
                                  pheno=pheno,
                                  G0=G0,
                                  G1=G1,
                                  covar=covar,
                                  h2=.5,
                                  leave_out_one_chrom=False)
            frame_log_delta = single_snp(test_snps=snps[:, test_idx],
                                         pheno=pheno,
                                         G0=G0,
                                         G1=G1,
                                         covar=covar,
                                         log_delta=0,
                                         leave_out_one_chrom=False)
            for frame in [frame_h2, frame_log_delta]:
                assert len(frame) == len(reference)
                for _, row in reference.iterrows():
                    sid = row.SNP
                    # First result row for this SNP.
                    pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
                    reldiff = abs(row.Pvalue - pvalue) / row.Pvalue
                    assert reldiff < .035, "'{0}' pvalue_list differ too much {4} -- {2} vs {3}".format(
                        sid, None, row.Pvalue, pvalue, reldiff)
Ejemplo n.º 11
0
    def test_lr_real(self):
        """Fit ``LinearRegression`` on synthetic data and check its predictions.

        The covariate is replaced by the numbers 0, 1, ... (one per iid) and
        the phenotype by ``2*covar + 100 + normal(0,1)*10`` (seed 0).  The
        first 10 iids are the test set and the remaining iids the training
        set.  A model is fit on the training set, round-tripped through
        ``joblib.dump``/``joblib.load``, and used to predict on both the
        training and the test set.  The predicted phenotypes and their
        covariances are written with ``Dat.write`` and passed to
        ``self.compare_files`` under the names "lr2a_*" and "lr2b_*".

        A least-squares reference solution is also computed with NumPy; it is
        only used for the optional plots (set ``do_plot`` to True).
        """
        do_plot = False
        if do_plot:
            # Imported lazily so the plotting library is only needed when plotting.
            import pylab

        logging.info("TestLinRegTrain test_lr_real")

        train_idx = np.r_[10:self.snpreader_whole.iid_count]  # iids 10 and on
        test_idx = np.r_[0:10]  # the first 10 iids

        #make covar just numbers 0,1,...
        covar = self.covariate_whole.read()
        # One column of floats 0.0, 1.0, ...; np.arange works on Python 2 and 3.
        covar.val = np.arange(covar.iid_count, dtype=float).reshape(-1, 1)
        covariate_train = covar[train_idx, :].read()
        covariate_test = covar[test_idx, :].read()
        # NOTE(review): K0_test_test is not used below.
        K0_test_test = KernelIdentity(covariate_test.iid)

        #make pheno  # pheno = 2*covar+100+normal(0,1)*10
        pheno = self.pheno_whole.read()
        np.random.seed(0)
        pheno.val = covar.val * 2.0 + 100 + np.random.normal(
            size=covar.val.shape) * 10

        pheno_train = pheno[train_idx, :].read()
        pheno_test = pheno[test_idx, :].read()

        if do_plot:
            #Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val, ".",
                       covariate_test.val, pheno_test.val, ".")
            pylab.suptitle("Plot training x and y, testing x and y")
            pylab.show()

        # Reference least-squares fit: design matrix is [covariate, 1].
        Xtrain = np.c_[covariate_train.val,
                       np.ones((covariate_train.iid_count, 1))]
        Xtest = np.c_[covariate_test.val,
                      np.ones((covariate_test.iid_count, 1))]
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:, 0], rcond=-1)
        bs = lsqSol[0]  #weights
        r2 = lsqSol[1]  #sum of squared residuals, a shape-(1,) array for 1-D y
        D = lsqSol[2]  #rank of design matrix
        N = pheno_train.iid_count
        REML = False
        # r2[0] is used because float() on a non-0-d array is deprecated in NumPy.
        if not REML:
            sigma2 = float(r2[0] / N)
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + N * 0.5
        else:
            sigma2 = float(r2[0] / (N - D))
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + 0.5 / sigma2 * r2
            nLL -= 0.5 * D * np.log(2 * np.pi * sigma2)
            #REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val, "g.",
                       covariate_test.val, predicted, "r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val,
                           predicted,
                           yerr,
                           linestyle='None')
            pylab.suptitle("real linear regression: actual to prediction")
            pylab.show()

        #These should all give the same result
        first_name = None
        for name, K0_train, K0_whole_test in [("Identity Kernel", None, None)]:

            # Reference files are named after the first configuration only.
            first_name = first_name or name
            #Learn model, save, load
            modelx = LinearRegression().fit(K0_train=K0_train,
                                            X=covariate_train,
                                            y=pheno_train)

            filename = self.tempout_dir + "/model_lr_real.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(modelx, filename)
            model = joblib.load(filename)

            do_test_on_train = True
            if do_test_on_train:
                #Predict with model (test on train)
                predicted_pheno, covariance = model.predict(
                    K0_whole_test=K0_train, X=covariate_train)  #test on train
                output_file = self.file_name("lr_reala_" + name)
                Dat.write(output_file, predicted_pheno)
                covar2 = SnpData(
                    iid=covariance.row,
                    sid=covariance.col[:, 1],
                    val=covariance.val)  #kludge to write kernel to text format
                output_file = self.file_name("lr_reala.cov_" + name)
                Dat.write(output_file, covar2)

                # Error bars come from the diagonal of the prediction covariance.
                yerr = np.sqrt(np.diag(covariance.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val, "g.",
                               covariate_train.val, predicted, "r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val,
                                   predicted,
                                   yerr,
                                   linestyle='None')
                    pylab.suptitle(
                        name +
                        ": test on train: train X to true target (green) and prediction (red)"
                    )
                    pylab.show()

                self.compare_files(predicted_pheno, "lr2a_" + first_name)
                self.compare_files(covar2, "lr2a.cov_" + first_name)

            #Predict with model (test on test)
            predicted_pheno, covariance = model.predict(
                K0_whole_test=K0_whole_test, X=covariate_test)  #test on test
            output_file = self.file_name("lr_realb_" + name)
            Dat.write(output_file, predicted_pheno)
            covar2 = SnpData(
                iid=covariance.row,
                sid=covariance.col[:, 1],
                val=covariance.val)  #kludge to write kernel to text format
            output_file = self.file_name("lr_realb.cov_" + name)
            Dat.write(output_file, covar2)

            yerr = np.sqrt(np.diag(covariance.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val, "g.",
                           covariate_test.val, predicted, "r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val,
                               predicted,
                               yerr,
                               linestyle='None')
                pylab.suptitle(
                    name +
                    ": test on test: test X to true target (green) and prediction (red)"
                )
                pylab.show()

            self.compare_files(predicted_pheno, "lr2b_" + first_name)
            self.compare_files(covar2, "lr2b.cov_" + first_name)