Beispiel #1
0
def test_coef_predict_same_as_predict_XGB():
    """Check that evaluating predicted coefficients reproduces ``predict``.

    An XGBoost-backed FlexCode model is fitted and tuned on synthetic data
    drawn from p(z | x) = N(x, 1).  The densities returned by
    ``model.predict`` must agree, to within 1e-4 in absolute value, with the
    densities obtained by evaluating ``model.predict_coefs`` on the same grid.
    """
    def generate_data(n_draws):
        # x ~ N(0, 1), then z | x ~ N(x, 1).
        features = np.random.normal(0, 1, n_draws)
        responses = np.random.normal(features, 1, n_draws)
        return features, responses

    n_samples = 5000
    x_train, z_train = generate_data(n_samples)
    x_validation, z_validation = generate_data(n_samples)
    # The test responses are drawn (to keep the sampling sequence) but unused.
    x_test, _ = generate_data(n_samples)

    # Model configuration: cosine basis with up to 31 functions, and a small
    # hyper-parameter grid handed to the XGBoost regression.
    xgb_param_grid = {"max_depth": [3, 5, 8], "eta": [0.1, 0.2, 0.5]}
    model = flexcode.FlexCodeModel(
        XGBoost,
        max_basis=31,
        basis_system="cosine",
        regression_params=xgb_param_grid,
    )

    # Fit on the training split, tune on the validation split.
    model.fit(x_train, z_train)
    model.tune(
        x_validation,
        z_validation,
        bump_threshold_grid=np.linspace(0, 0.2, 3),
        sharpen_grid=np.linspace(0.5, 1.5, 3),
    )

    # Densities straight from predict(), plus the grid they live on.
    cdes_predict, z_grid = model.predict(x_test, n_grid=200)

    # Densities rebuilt from the coefficient object on that same grid.
    cdes_coefs = model.predict_coefs(x_test).evaluate(z_grid)

    largest_gap = np.max(np.abs(cdes_predict - cdes_coefs))
    assert largest_gap <= 1e-4
Beispiel #2
0
def test_example():
    """Smoke-test the fit / tune / estimate_error / predict workflow.

    Uses the nearest-neighbour (``NN``) regression on synthetic data drawn
    from p(z | x) = N(x, 1), with inputs shaped as column vectors.  The test
    only verifies that the whole pipeline runs without raising.
    """
    def generate_data(n_draws):
        # x ~ N(0, 1), z | x ~ N(x, 1); both returned as (n_draws, 1) columns.
        features = np.random.normal(0, 1, n_draws)
        responses = np.random.normal(features, 1, n_draws)
        return features.reshape(-1, 1), responses.reshape(-1, 1)

    n_samples = 10000
    x_train, z_train = generate_data(n_samples)
    x_validation, z_validation = generate_data(n_samples)
    x_test, z_test = generate_data(n_samples)

    # Model configuration: cosine basis, up to 31 functions, k = 20 neighbours.
    model = flexcode.FlexCodeModel(
        NN,
        max_basis=31,
        basis_system="cosine",
        regression_params={"k": 20},
    )

    # Fit on the training split, tune on the validation split.
    model.fit(x_train, z_train)
    model.tune(
        x_validation,
        z_validation,
        bump_threshold_grid=np.linspace(0, 0.2, 3),
        sharpen_grid=np.linspace(0.5, 1.5, 3),
    )

    # Estimate the CDE loss on the held-out split (return value not checked).
    model.estimate_error(x_test, z_test)

    # Exercise prediction; the outputs themselves are not inspected.
    _cdes, _z_grid = model.predict(x_test, n_grid=200)

    assert True
Beispiel #3
0
 def inform(self, training_data):
     """
     Train a FlexCode/XGBoost model, tune it, and store it on ``self.model``.

     Steps:
       1. Read ``training_data['redshift']`` and build the colour features
          with ``make_color_data(training_data)``.
       2. Split into training / validation parts with ``self.split_data``
          using ``self.trainfrac``.
       3. Fit a ``flexcode.FlexCodeModel`` configured from ``self.max_basis``,
          ``self.basis_system``, ``self.zmin``, ``self.zmax`` and
          ``self.regress_params``.
       4. Pick the bump threshold, then the sharpening parameter, that give
          the lowest ``cde_loss`` on the validation part.
       5. If ``self.inform_options['save_train']`` is truthy, pickle the model
          to ``self.inform_options['modelfile']``.

     Parameters
     ----------
     training_data
         Mapping-like object with a ``'redshift'`` entry; it is also passed
         unchanged to ``make_color_data``.

     Raises
     ------
     ValueError
         If no candidate in the bump-threshold grid or in the sharpen grid
         yields a loss that can be ranked (empty grid or NaN losses).
     """
     speczs = training_data['redshift']
     print("stacking some data...")
     color_data = make_color_data(training_data)
     train_dat, val_dat, train_sz, val_sz = self.split_data(
         color_data, speczs, self.trainfrac)
     print("read in training data")
     model = flexcode.FlexCodeModel(XGBoost,
                                    max_basis=self.max_basis,
                                    basis_system=self.basis_system,
                                    z_min=self.zmin,
                                    z_max=self.zmax,
                                    regression_params=self.regress_params)
     print("fit the model...")
     model.fit(train_dat, train_sz)

     # --- bump threshold search -------------------------------------------
     bump_grid = np.linspace(self.bumpmin, self.bumpmax, self.nbump)
     print("finding best bump thresh...")
     # Start from +inf (not a magic 9999) so that any real loss can win, and
     # track the winner with None so "nothing won" is detectable.
     bestloss = float("inf")
     bestbump = None
     for bumpt in bump_grid:
         model.bump_threshold = bumpt
         model.tune(val_dat, val_sz)
         tmpcdes, z_grid = model.predict(val_dat, n_grid=self.nzbins)
         tmploss = cde_loss(tmpcdes, z_grid, val_sz)
         if tmploss < bestloss:
             bestloss = tmploss
             bestbump = bumpt
     if bestbump is None:
         # Previously this case surfaced as an UnboundLocalError on bestbump.
         raise ValueError(
             "no bump threshold candidate produced a rankable CDE loss "
             "(empty grid or NaN losses)")
     model.bump_threshold = bestbump

     # --- sharpen parameter search ----------------------------------------
     print("finding best sharpen parameter...")
     sharpen_grid = np.linspace(self.sharpmin, self.sharpmax, self.nsharp)
     bestloss = float("inf")
     bestsharp = None
     for sharp in sharpen_grid:
         model.sharpen_alpha = sharp
         # NOTE(review): this search uses a fixed n_grid=301 while the bump
         # search uses self.nzbins -- confirm whether that is intentional.
         tmpcdes, z_grid = model.predict(val_dat, n_grid=301)
         tmploss = cde_loss(tmpcdes, z_grid, val_sz)
         if tmploss < bestloss:
             bestloss = tmploss
             bestsharp = sharp
     if bestsharp is None:
         # Previously the sentinel 9999 was silently written into the model.
         raise ValueError(
             "no sharpen candidate produced a rankable CDE loss "
             "(empty grid or NaN losses)")
     model.sharpen_alpha = bestsharp

     self.model = model
     if self.inform_options['save_train']:
         with open(self.inform_options['modelfile'], 'wb') as f:
             pickle.dump(file=f,
                         obj=model,
                         protocol=pickle.HIGHEST_PROTOCOL)
Beispiel #4
0
 def inform(self):
     """
     Train a FlexCode/XGBoost model, tune it, and store it on ``self.model``.

     Steps:
       1. Read ``self.training_data['redshift']`` and build the colour
          features with ``make_color_data(self.training_data)``.
       2. Split into training / validation parts with ``self.partition_data``
          using ``self.trainfrac``.
       3. Fit a ``flexcode.FlexCodeModel`` configured from ``self.max_basis``,
          ``self.basis_system``, ``self.zmin``, ``self.zmax`` and
          ``self.regression_params``.
       4. Pick the bump threshold, then the sharpening parameter, that give
          the lowest ``cde_loss`` on the validation part.

     Raises
     ------
     ValueError
         If no candidate in the bump-threshold grid or in the sharpen grid
         yields a loss that can be ranked (empty grid or NaN losses).
     """
     speczs = self.training_data['redshift']
     print("stacking some data...")
     color_data = make_color_data(self.training_data)
     train_data, val_data, train_sz, val_sz = self.partition_data(
         color_data, speczs, self.trainfrac)
     print("read in training data")
     model = flexcode.FlexCodeModel(
         XGBoost,
         max_basis=self.max_basis,
         basis_system=self.basis_system,
         z_min=self.zmin,
         z_max=self.zmax,
         regression_params=self.regression_params)
     print("fit the model...")
     model.fit(train_data, train_sz)

     # --- bump threshold search -------------------------------------------
     bump_grid = np.linspace(self.bumpmin, self.bumpmax, self.nbump)
     print("finding best bump thresh...")
     # Start from +inf (not a magic 9999) so that any real loss can win, and
     # track the winner with None so "nothing won" is detectable.
     bestloss = float("inf")
     bestbump = None
     for bumpt in bump_grid:
         model.bump_threshold = bumpt
         model.tune(val_data, val_sz)
         tmpcdes, z_grid = model.predict(val_data, n_grid=self.nzbins)
         tmploss = cde_loss(tmpcdes, z_grid, val_sz)
         if tmploss < bestloss:
             bestloss = tmploss
             bestbump = bumpt
     if bestbump is None:
         # Previously this case surfaced as an UnboundLocalError on bestbump.
         raise ValueError(
             "no bump threshold candidate produced a rankable CDE loss "
             "(empty grid or NaN losses)")
     model.bump_threshold = bestbump

     # --- sharpen parameter search ----------------------------------------
     print("finding best sharpen parameter...")
     sharpen_grid = np.linspace(self.sharpmin, self.sharpmax, self.nsharp)
     bestloss = float("inf")
     bestsharp = None
     for sharp in sharpen_grid:
         model.sharpen_alpha = sharp
         # NOTE(review): this search uses a fixed n_grid=301 while the bump
         # search uses self.nzbins -- confirm whether that is intentional.
         tmpcdes, z_grid = model.predict(val_data, n_grid=301)
         tmploss = cde_loss(tmpcdes, z_grid, val_sz)
         if tmploss < bestloss:
             bestloss = tmploss
             bestsharp = sharp
     if bestsharp is None:
         # Previously the sentinel 9999 was silently written into the model.
         raise ValueError(
             "no sharpen candidate produced a rankable CDE loss "
             "(empty grid or NaN losses)")
     model.sharpen_alpha = bestsharp
     self.model = model
Beispiel #5
0
def main(argv):
    """Train, tune and pickle a FlexCode/XGBoost model from a YAML config.

    Parameters
    ----------
    argv
        Command-line style argument list of length 2; ``argv[1]`` is the path
        of the YAML configuration file.  With any other length a usage line
        is printed and the process exits.

    The configuration must provide the keys ``output_file``,
    ``sharpen_bumpthresh_outputfile``, ``training_file``,
    ``max_basis_functions``, ``basis_system``, ``z_min``, ``z_max``,
    ``regression_params``, ``bump_thresh_grid_min``, ``bump_thresh_grid_max``,
    ``bump_thresh_grid_delta``, ``training_fraction``, ``sharpen_min``,
    ``sharpen_max`` and ``sharpen_delta``.

    Side effects: writes the per-candidate CDE losses to the file named by
    ``sharpen_bumpthresh_outputfile`` and pickles the tuned model to
    ``output_file``; progress is printed to stdout.

    Raises
    ------
    ValueError
        If no candidate in the bump-threshold grid or in the sharpen grid
        yields a loss that can be ranked (empty grid or NaN losses).
    """
    import sys  # function-scope import; only needed for the usage exit

    if len(argv) != 2:
        print("usage: train_FlexZBoost.py [yamlfile]")
        # sys.exit rather than the `exit` helper, which the site module
        # installs for interactive use and is not guaranteed to exist.
        sys.exit()
    infile = argv[1]
    with open(infile, "r") as infp:
        # yaml.load() without an explicit Loader is rejected by PyYAML >= 6
        # and can construct arbitrary objects on older versions; the config
        # only holds plain scalars and mappings, so safe_load is sufficient.
        ymldata = yaml.safe_load(infp)

    output_file = ymldata['output_file']
    validationfile = ymldata['sharpen_bumpthresh_outputfile']
    trainfile = ymldata['training_file']
    max_basis = ymldata['max_basis_functions']
    basis_system = ymldata['basis_system']
    z_min = float(ymldata['z_min'])
    z_max = float(ymldata['z_max'])
    regression_params = ymldata['regression_params']
    bumpmin = float(ymldata['bump_thresh_grid_min'])
    bumpmax = float(ymldata['bump_thresh_grid_max'])
    bumpdelta = float(ymldata['bump_thresh_grid_delta'])
    train_frac = float(ymldata['training_fraction'])
    sharpmin = float(ymldata['sharpen_min'])
    sharpmax = float(ymldata['sharpen_max'])
    sharpdelta = float(ymldata['sharpen_delta'])

    # np.arange excludes the stop value, so bumpmax itself is never tried and
    # min == max gives an empty grid (handled explicitly below).
    bump_grid = np.arange(bumpmin, bumpmax, bumpdelta)
    print("read in training data")
    fz_data, sz_data = read_in_data(trainfile)
    print("partition into train and validate")
    fz_train, fz_val, sz_train, sz_val = partition_data(
        fz_data, sz_data, train_frac)
    print(fz_train.shape[0])
    print("train the model")
    model = flexcode.FlexCodeModel(XGBoost,
                                   max_basis=max_basis,
                                   basis_system=basis_system,
                                   z_min=z_min,
                                   z_max=z_max,
                                   regression_params=regression_params)
    model.fit(fz_train, sz_train)
    print("tune model, including bump trimming")
    # Passing bump_threshold_grid to model.tune() was not behaving as
    # expected, so the grid is searched with an explicit loop instead.  Every
    # candidate's CDE loss is also logged, because sample runs kept choosing
    # the lowest threshold and the log lets that be checked by hand.
    with open(validationfile, "w") as outfp:
        outfp.write("CDE Loss values for bump thresh and sharpen grids\n")

        # Start from +inf (not a magic 9999) so any real loss can win, and
        # track the winner with None so "nothing won" is detectable.
        bestloss = float("inf")
        bestbump = None
        for bumpt in bump_grid:
            model.bump_threshold = bumpt
            model.tune(fz_val, sz_val)
            tmpcdes, z_grid = model.predict(fz_val, n_grid=300)
            tmploss = cde_loss(tmpcdes, z_grid, sz_val)
            if tmploss < bestloss:
                bestloss = tmploss
                bestbump = bumpt
            print(f"\n\n\nbumptrim val: {bumpt} cde loss: {tmploss}")
            outfp.write(f"bumptrim val: {bumpt} cde loss: {tmploss}\n")
        if bestbump is None:
            # Previously this surfaced as an UnboundLocalError on bestbump.
            raise ValueError(
                "no bump threshold candidate produced a rankable CDE loss "
                "(empty grid or NaN losses)")
        print(f"\n\n\nbest bump threshold: {bestbump} setting in model\n\n\n")
        model.bump_threshold = bestbump

        # Same brute-force search for the sharpening parameter.
        # NOTE(review): this loop predicts with n_grid=301 while the bump
        # loop uses n_grid=300 -- confirm whether that is intentional.
        sharpen_grid = np.arange(sharpmin, sharpmax, sharpdelta)
        bestloss = float("inf")
        bestsharp = None
        for sharp in sharpen_grid:
            model.sharpen_alpha = sharp
            tmpcdes, z_grid = model.predict(fz_val, n_grid=301)
            tmploss = cde_loss(tmpcdes, z_grid, sz_val)
            if tmploss < bestloss:
                bestloss = tmploss
                bestsharp = sharp
            print(f"\n\n\nsharpparam: {sharp} cdeloss: {tmploss}")
            outfp.write(f"sharpparam: {sharp} cdeloss: {tmploss}\n")
        if bestsharp is None:
            # Previously the sentinel 9999 was silently written into the model.
            raise ValueError(
                "no sharpen candidate produced a rankable CDE loss "
                "(empty grid or NaN losses)")
        print(f"best sharpen param: {bestsharp}")
        model.sharpen_alpha = bestsharp

    # Save the tuned model; the context manager guarantees the handle is
    # flushed and closed even if pickling fails.
    with open(output_file, 'wb') as modelfp:
        pickle.dump(file=modelfp,
                    obj=model,
                    protocol=pickle.HIGHEST_PROTOCOL)
    print(f"wrote out model file file to {output_file}")
    print(model.__dict__)

    print("finished")