Example #1
    def test_kernel_cosine_float(self):
        ker = PairwiseKernel(metric='cosine')

        # X, X
        onx = convert_kernel(ker, 'X', output_names=['Y'], dtype=np.float32,
                             op_version=_TARGET_OPSET_)
        model_onnx = onx.to_onnx(
            inputs=[('X', FloatTensorType([None, None]))],
            target_opset=TARGET_OPSET)

        x = np.random.randn(4, 3)
        x[0, 0] = x[1, 1] = x[2, 2] = 10.
        x[3, 2] = 5.

        sess = InferenceSession(model_onnx.SerializeToString())
        res = sess.run(None, {'X': x.astype(np.float32)})[0]
        m1 = res
        m2 = ker(x)
        assert_almost_equal(m1, m2, decimal=5)

        # X, x
        onx = convert_kernel(ker, 'X', x_train=x,
                             output_names=['Y'], dtype=np.float32,
                             op_version=_TARGET_OPSET_)
        model_onnx = onx.to_onnx(
            inputs=[('X', FloatTensorType([None, None]))],
            target_opset=TARGET_OPSET)

        sess = InferenceSession(model_onnx.SerializeToString())
        res = sess.run(None, {'X': x.astype(np.float32)})[0]
        m1 = res
        m2 = ker(x)
        assert_almost_equal(m1, m2, decimal=5)
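For reference, a minimal sketch (not part of the original test) of what the cosine PairwiseKernel computes in plain scikit-learn; the ONNX graph built above is expected to reproduce this same matrix:

# Minimal sketch: PairwiseKernel(metric='cosine') delegates to sklearn's
# pairwise cosine similarity, which is what the converted ONNX model mirrors.
import numpy as np
from sklearn.gaussian_process.kernels import PairwiseKernel
from sklearn.metrics.pairwise import cosine_similarity

X = np.random.randn(4, 3)
K = PairwiseKernel(metric='cosine')(X)
assert np.allclose(K, cosine_similarity(X))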
Example #2
    def __init__(self,
                 rep_npy,
                 top_model_type,
                 rep_name_to_include_fpbase=None):
        self.seq2rep = build_seq2rep_dict(rep_npy)

        if rep_name_to_include_fpbase is not None:
            assert rep_name_to_include_fpbase in ['unirep', 'evotuned_unirep']

            fpbase_rep_map_file = os.path.join(data_io_utils.S3_DATA_ROOT,
                                               'datasets/for_acquisition',
                                               'fpbase_repname2seq2repvec.p')

            with open(fpbase_rep_map_file, 'rb') as f:
                repname2seq2repvec = pickle.load(f)

            seq2repvec = repname2seq2repvec[rep_name_to_include_fpbase]
            seq2repvec.update(self.seq2rep)

            self.seq2rep = seq2repvec

        self.top_model_type = top_model_type
        self.sparse_refit = False
        self.model = None
        self.alpha_in_interior = None

        self.kernel = PairwiseKernel(metric="cosine")
Example #3
def fit_GPR(X_train, y_train, testData):
    gp_kernel = PairwiseKernel(metric='rbf')

    clf = GaussianProcessRegressor(kernel=gp_kernel)
    clf.fit(X_train, y_train)
    testPred = clf.predict(testData)
    return testPred
Example #4
    def fit_gp(X, y, p):
        # PairwiseKernel forwards its gamma hyperparameter to the callable
        # metric, so the callable must accept gamma even though it is unused here.
        cur_metric = lambda x1, x2, gamma: metric(x1, x2, p)

        kernel = PairwiseKernel(metric=cur_metric)
        gp = GaussianProcessRegressor(kernel=kernel,
                                      alpha=y[:, 1]**2,
                                      normalize_y=False)
        gp.fit(X, y[:, 0])
        return gp
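A side note on the callable-metric form used above, with a self-contained sketch (an assumed example, not taken from the original code): PairwiseKernel passes its gamma hyperparameter to the callable, so the callable must accept a gamma argument even when it does not use it, exactly like the lambda in fit_gp.

import numpy as np
from sklearn.gaussian_process.kernels import PairwiseKernel

def manhattan(x1, x2, gamma):
    # gamma is received from PairwiseKernel but intentionally ignored here
    return float(np.abs(np.asarray(x1) - np.asarray(x2)).sum())

K = PairwiseKernel(metric=manhattan)(np.random.randn(5, 2))
print(K.shape)  # (5, 5)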
Example #5
    def __init__(self, params=None, limit=None, model=None):
        """ Init """

        if model is None:
            kernel = PairwiseKernel(
                metric='laplacian') * DotProduct() + WhiteKernel(
                    noise_level=5.0)
            self.model = GaussianProcessClassifier(kernel=kernel, n_jobs=-1)
        else:
            self.fitted = True
            self.model = model

        if limit is not None:
            self.limit = limit
Example #6
    def __init__(self, params=None, model=None, limit=.5, noise_level=5):
        """ Init """
        logging.info('Using scikit GPClassifier')

        if model is None:
            kernel = PairwiseKernel(
                metric='laplacian') * DotProduct() + WhiteKernel(
                    noise_level=noise_level)
            self.model = GaussianProcessClassifier(kernel=kernel, n_jobs=-1)
        else:
            self.fitted = True
            self.model = model

        if limit is not None:
            self.limit = limit
Example #7
    def test_gpr_cosine_fitted_true_double(self):
        gp = GaussianProcessRegressor(alpha=1e-5,
                                      n_restarts_optimizer=25,
                                      normalize_y=False,
                                      kernel=PairwiseKernel(metric='cosine'))
        gp, X = fit_regression_model(
            gp, n_features=2, n_samples=20, factor=0.01)

        # return_cov=False, return_std=False
        model_onnx = to_onnx(
            gp, initial_types=[('X', DoubleTensorType([None, None]))],
            target_opset=TARGET_OPSET)
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(X.astype(np.float64), gp, model_onnx,
                            verbose=False,
                            basename="SklearnGaussianProcessCosineDouble")
Example #8
def KT_LUPI(X_train, y_train, y_train_label, X_test):
    gp_kernel = PairwiseKernel(metric='rbf')

    gpr = GaussianProcessRegressor(kernel=gp_kernel)
    gpr.fit(X_train, y_train)
    y_transform = gpr.predict(X_train)
    y_test_transform = gpr.predict(X_test)
    X = np.column_stack((X_train, y_transform))
    test_data = np.column_stack((X_test, y_test_transform))

    grid_param = [{
        'kernel': ['rbf'],
        'gamma': [.1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
        'C': [1, 10, 100, 1000]
    }]

    clf = GridSearchCV(SVC(), grid_param, cv=5)
    clf.fit(X, y_train_label)
    testPred = clf.predict(test_data)
    return testPred
Example #9
def get_gp_prediction(x: np.ndarray,
                      y: np.ndarray,
                      n_samples: int,
                      n_points: int = 100):
    """Fit a GP to observed P( Y | X ) and sample some functions from it.

  Args:
    x: x-values (features)
    y: y-values (targets/labels)
    n_samples: The number of GP samples to use as basis functions.
    n_points: The number of points to subsample from x and y to fit each GP.

  Returns:
    a function that takes as input an array either of shape (n,) or (k, n)
    and outputs:
      if input is 1D -> output: (n, n_samples)
      if input is 2D -> output: (k, n, n_samples)
  """
    kernel = PairwiseKernel(metric='poly') + RBF()
    gp = GaussianProcessRegressor(kernel=kernel,
                                  alpha=0.4,
                                  n_restarts_optimizer=0,
                                  normalize_y=True)
    xmin = np.min(x)
    xmax = np.max(x)
    xx = np.linspace(xmin, xmax, n_points)
    y_samples = []
    rng = onp.random.RandomState(0)
    for i in range(n_samples):
        logging.info("Subsample 200 points and fit GP to P(Y|X)...")
        idx = rng.choice(len(x), 200, replace=False)
        gp.fit(x[idx, np.newaxis], y[idx])
        logging.info(f"Get a sample functions from the GP")
        y_samples.append(gp.sample_y(xx[:, np.newaxis], 1))
    y_samples = np.array(y_samples).squeeze()
    logging.info(f"Shape of samples: {y_samples.shape}")

    def predict(inputs: np.ndarray) -> np.ndarray:
        return interp1d(inputs, xmin, xmax, y_samples).T

    return jit(vmap(predict))
Example #10
    def test_kernel_cosine_double(self):
        ker = PairwiseKernel(metric='cosine')
        onx = convert_kernel(ker, 'X', output_names=['Y'], dtype=np.float64,
                             op_version=_TARGET_OPSET_)
        model_onnx = onx.to_onnx(
            inputs=[('X', DoubleTensorType([None, None]))],
            target_opset=TARGET_OPSET)

        x = np.random.randn(4, 3)
        x[0, 0] = x[1, 1] = x[2, 2] = 10.
        x[3, 2] = 5.

        try:
            sess = InferenceSession(model_onnx.SerializeToString())
        except NotImplementedError:
            # Failed to find kernel for FusedMatMul(1).
            return
        res = sess.run(None, {'X': x.astype(np.float64)})[0]
        m1 = res
        m2 = ker(x)
        assert_almost_equal(m1, m2, decimal=5)
Example #11
while (n_out > 0 or final_fit == 0) and num_finite >= 2:

    beta1, beta0, incert_slope, _, _ = ft.wls_matrix(time_vals[good_vals], data_vals[good_vals],
                                                     1. / err_vals[good_vals], conf_slope=0.99)

    # standardized dispersion from linearity
    res_stdized = np.sqrt(np.mean(
        (data_vals[good_vals] - (beta0 + beta1 * time_vals[good_vals])) ** 2 / err_vals[good_vals]))
    res = np.sqrt(np.mean((data_vals[good_vals] - (beta0 + beta1 * time_vals[good_vals])) ** 2))
    if perc_nonlin[min(niter, len(perc_nonlin) - 1)] == 0:
        opt_var = 0
    else:
        opt_var = (res / res_stdized ** 2) ** 2 * 100. / (5 * perc_nonlin[min(niter, len(perc_nonlin) - 1)])

    k1 = PairwiseKernel(1, metric='linear') + PairwiseKernel(1, metric='linear') * C(opt_var) * RQ(10, 3)  # linear kernel
    k2 = C(30) * ESS(length_scale=1, periodicity=1)  # periodic kernel
    k3 = C(50) * RBF(0.75)
    kernel = k1 + k2 + k3

    mu_x = np.nanmean(time_vals[good_vals])
    detr_t_pred = t_pred - mu_x
    detr_time_vals = time_vals - mu_x
    mu_y = np.nanmean(data_vals)
    detr_data_vals = data_vals - mu_y

    # if we remove a linear trend, normalize_y should be false...
    gp = GaussianProcessRegressor(kernel=kernel, optimizer=optimizer, n_restarts_optimizer=n_restarts_optimizer,
                                  alpha=err_vals[good_vals], normalize_y=False)
    gp.fit(detr_time_vals[good_vals].reshape(-1, 1), detr_data_vals[good_vals].reshape(-1, 1))
    y_pred, sigma = gp.predict(detr_t_pred.reshape(-1, 1), return_std=True)
Example #12
    2.0 * Matern(length_scale=1.5, nu=1.5),
    2.0 * Matern(length_scale=2.5, nu=2.5),
    2.0 * Matern(length_scale=[0.5, 2.0], nu=0.5),
    3.0 * Matern(length_scale=[2.0, 0.5], nu=1.5),
    4.0 * Matern(length_scale=[0.5, 0.5], nu=2.5),
    RationalQuadratic(length_scale=0.5, alpha=1.5),
    ExpSineSquared(length_scale=0.5, periodicity=1.5),
    DotProduct(sigma_0=2.0),
    DotProduct(sigma_0=2.0)**2,
    RBF(length_scale=[2.0]),
    Matern(length_scale=[2.0])
]
for metric in PAIRWISE_KERNEL_FUNCTIONS:
    if metric in ["additive_chi2", "chi2"]:
        continue
    kernels.append(PairwiseKernel(gamma=1.0, metric=metric))


@pytest.mark.parametrize('kernel', kernels)
def test_kernel_gradient(kernel):
    # Compare analytic and numeric gradient of kernels.
    K, K_gradient = kernel(X, eval_gradient=True)

    assert_equal(K_gradient.shape[0], X.shape[0])
    assert_equal(K_gradient.shape[1], X.shape[0])
    assert_equal(K_gradient.shape[2], kernel.theta.shape[0])

    def eval_kernel_for_theta(theta):
        kernel_clone = kernel.clone_with_theta(theta)
        K = kernel_clone(X, eval_gradient=False)
        return K
Example #13
def fit_GPR(X_train, x_star):
    gp_kernel = PairwiseKernel(metric='rbf')
    model = GaussianProcessRegressor(kernel=gp_kernel)
    model.fit(X_train, x_star)
    return model
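A minimal usage sketch for the fitted regressor returned above (hypothetical data, not from the original source); return_std=True is the standard scikit-learn way to obtain the predictive standard deviation alongside the mean:

import numpy as np

# Hypothetical training and query arrays, only to illustrate the call signature.
model = fit_GPR(np.random.randn(20, 3), np.random.randn(20))
mean, std = model.predict(np.random.randn(5, 3), return_std=True)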
Example #14
def main():
    """
    Get data from db and save it as csv
    """

    bq = BQHandler()
    io = IO(gs_bucket=options.gs_bucket)
    viz = Viz(io=io)

    location = 'all'
    if options.locations is not None:
        location = options.locations[0]

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    if options.model == 'rf':
        model = RandomForestRegressor(
            n_estimators=options.n_estimators,
            n_jobs=-1,
            min_samples_leaf=options.min_samples_leaf,
            min_samples_split=options.min_samples_split,
            max_features=options.max_features,
            max_depth=options.max_depth,
            bootstrap=options.bootstrap)
    elif options.model == 'gbdt':
        model = GradientBoostingRegressor(
            subsample=options.subsample,
            n_estimators=options.n_estimators,
            min_samples_split=options.min_samples_split,
            max_features=options.max_features,
            max_depth=options.max_depth,
            loss=options.loss,
            learning_rate=options.gbdt_learning_rate,
            ccp_alpha=options.ccp_alpha)
    elif options.model == 'lr':
        model = SGDRegressor(warm_start=True,
                             max_iter=options.n_loops,
                             shuffle=options.shuffle,
                             power_t=options.power_t,
                             penalty=options.regularizer,
                             learning_rate=options.learning_rate,
                             eta0=options.eta0,
                             alpha=options.alpha,
                             tol=0.0001)
    elif options.model == 'svr':
        model = SVR()
    elif options.model == 'ard':
        model = ARDRegression(n_iter=options.n_loops,
                              alpha_1=options.alpha_1,
                              alpha_2=options.alpha_2,
                              lambda_1=options.lambda_1,
                              lambda_2=options.lambda_2,
                              threshold_lambda=options.threshold_lambda,
                              fit_intercept=options.fit_intercept,
                              copy_X=options.copy_X)
    elif options.model == 'gp':
        kernel = PairwiseKernel(metric='laplacian') * DotProduct()
        model = GaussianProcessRegressor(
            kernel=kernel,
            alpha=options.noise_level)  # alpha corresponds to the white kernel
    elif options.model == 'llasso':
        model = LocalizedLasso(num_iter=options.n_loops,
                               batch_size=options.batch_size)
    elif options.model == 'nlasso':
        model = NetworkLasso(num_iter=options.n_loops,
                             batch_size=options.batch_size)

        graph_data = pd.read_csv(options.graph_data,
                                 names=[
                                     'date', 'start_hour', 'src', 'dst',
                                     'type', 'sum_delay', 'sum_ahead',
                                     'add_delay', 'add_ahead', 'train_count'
                                 ])

        #stations_to_pick = options.stations_to_pick.split(',')
        #graph = model.fetch_connections(graph_data, stations_to_pick)
        model.fetch_connections(graph_data)

    if options.pca:
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

    rmses, maes, r2s, skills, start_times, end_times, end_times_obj = [], [], [], [], [], [], []
    X_complete = []  # Used for feature selection

    start = starttime
    end = start + timedelta(days=int(options.day_step),
                            hours=int(options.hour_step))
    if end > endtime: end = endtime

    while end <= endtime and start < end:
        logging.info('Processing time range {} - {}'.format(
            start.strftime('%Y-%m-%d %H:%M'), end.strftime('%Y-%m-%d %H:%M')))

        # Load data ############################################################
        try:
            logging.info('Reading data...')
            data = bq.get_rows(
                start,
                end,
                loc_col='trainstation',
                project=options.project,
                dataset=options.feature_dataset,
                table=options.feature_table,
                locations=options.locations,
                parameters=all_param_names,
                only_winters=options.only_winters,
                reason_code_table=options.reason_code_table,
                reason_codes_exclude=options.reason_codes_exclude,
                reason_codes_include=options.reason_codes_include)
            data = io.filter_train_type(labels_df=data,
                                        train_types=options.train_types,
                                        sum_types=True,
                                        train_type_column='train_type',
                                        location_column='trainstation',
                                        time_column='time',
                                        sum_columns=['train_count', 'delay'],
                                        aggs=aggs)

            # Filter only timesteps with large distribution in the whole network
            if options.filter_delay_limit is not None:
                data = io.filter_delay_with_limit(data,
                                                  options.filter_delay_limit)

            if options.n_samples is not None and options.n_samples < data.shape[
                    0]:
                logging.info('Sampling {} values from data...'.format(
                    options.n_samples))
                data = data.sample(options.n_samples)

            if options.y_avg_hours is not None:
                data = io.calc_running_delay_avg(data, options.y_avg_hours)

            if options.y_avg:
                data = io.calc_delay_avg(data)

            data.sort_values(by=['time', 'trainstation'], inplace=True)

            if options.month:
                logging.info('Adding month to the dataset...')
                data['month'] = data['time'].map(lambda x: x.month)
                if 'month' not in options.feature_params:
                    options.feature_params.append('month')

            l_data = data.loc[:, options.label_params]
            f_data = data.loc[:, options.feature_params]

        except ValueError as e:
            f_data, l_data = [], []

        if len(f_data) < 2 or len(l_data) < 2:
            start = end
            end = start + timedelta(days=int(options.day_step),
                                    hours=int(options.hour_step))
            continue

        logging.info('Processing {} rows...'.format(len(f_data)))

        train, test = train_test_split(data, test_size=0.1)
        X_train = train.loc[:,
                            options.feature_params].astype(np.float32).values
        y_train = train.loc[:, options.label_params].astype(
            np.float32).values.ravel()
        X_test = test.loc[:, options.feature_params].astype(np.float32).values
        y_test = test.loc[:, options.label_params].astype(
            np.float32).values.ravel()

        logging.debug('Features shape: {}'.format(X_train.shape))

        if options.normalize:
            logging.info('Normalizing data...')
            xscaler, yscaler = StandardScaler(), StandardScaler()

            X_train = xscaler.fit_transform(X_train)
            X_test = xscaler.transform(X_test)

            if len(options.label_params) == 1:
                y_train = yscaler.fit_transform(y_train.reshape(-1, 1)).ravel()
            else:
                y_train = yscaler.fit_transform(y_train)

        if options.pca:
            logging.info('Doing PCA analysis for the data...')
            X_train = ipca.fit_transform(X_train)
            fname = options.output_path + '/ipca_explained_variance.png'
            viz.explained_variance(ipca, fname)
            #io._upload_to_bucket(filename=fname, ext_filename=fname)
            X_test = ipca.transform(X_test)  # reuse the PCA fitted on the training data

        if options.model == 'llasso':
            graph_data = pd.read_csv(options.graph_data,
                                     names=[
                                         'date', 'start_hour', 'src', 'dst',
                                         'type', 'sum_delay', 'sum_ahead',
                                         'add_delay', 'add_ahead',
                                         'train_count'
                                     ])
            graph = model.fetch_connections(graph_data)

        logging.debug('Features shape after pre-processing: {}'.format(
            X_train.shape))

        # FIT ##################################################################

        if options.cv:
            logging.info('Doing random search for hyperparameters...')

            if options.model == 'rf':
                param_grid = {
                    "n_estimators": [5, 10, 50, 100],
                    "max_depth": [3, 20, None],
                    "max_features": ["auto", "sqrt", "log2", None],
                    "min_samples_split": [2, 5, 10],
                    "min_samples_leaf": [1, 2, 4, 10],
                    "bootstrap": [True, False]
                }
            elif options.model == 'gbdt':
                param_grid = {
                    "loss": ['ls', 'lad', 'huber'],
                    'learning_rate': [.0001, .001, .01, .1],
                    "n_estimators": [10, 50, 100, 200],
                    'subsample': [.1, .25, .5, 1],
                    "min_samples_split": [2, 5, 10],
                    "max_depth": [3, 10, 20, None],
                    "max_features": ["auto", "sqrt", "log2", None],
                    'ccp_alpha': [0, .001, 0.1]
                }
            elif options.model == 'lr':
                param_grid = {
                    "penalty": [None, 'l2', 'l1'],
                    "alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1],
                    "l1_ratio": [0.1, 0.15, 0.2, 0.5],
                    "shuffle": [True, False],
                    "learning_rate": ['constant', 'optimal', 'invscaling'],
                    "eta0": [0.001, 0.01, 0.1],
                    "power_t": [0.1, 0.25, 0.5]
                }
            elif options.model == 'svr':
                param_grid = {
                    "C": [0.001, 0.01, 0.1, 1, 10],
                    "epsilon": [0.01, 0.1, 0.5],
                    "kernel":
                    ['rbf', 'linear', 'poly', 'sigmoid', 'precomputed'],
                    "degree": [2, 3, 4],
                    "shrinking": [True, False],
                    "gamma": [0.001, 0.01, 0.1],
                    "coef0": [0, 0.1, 1]
                }
            elif options.model == 'gp':
                param_grid = {'alpha': [0.1, 0.5, 1, 2, 3, 4, 5, 10]}
            else:
                raise ("No param_grid set for given model ({})".format(
                    options.model))

            random_search = RandomizedSearchCV(
                model,
                param_distributions=param_grid,
                n_iter=int(options.n_iter_search),
                scoring='neg_root_mean_squared_error',
                n_jobs=-1,
                refit=True,
                return_train_score=True,
                cv=TimeSeriesSplit(n_splits=5),
                verbose=1)

            random_search.fit(X_train, y_train)
            logging.info("RandomizedSearchCV done.")
            fname = options.output_path + '/random_search_cv_results.txt'
            io.report_cv_results(random_search.cv_results_, fname, fname)
            model = random_search.best_estimator_
        else:
            logging.info('Training...')
            if options.model in ['rf', 'svr', 'ard', 'gp', 'gbdt']:
                model.fit(X_train, y_train)
                if options.feature_selection:
                    X_complete = X_train
                    y_complete = y_train
                    meta_complete = data.loc[:, options.meta_params]
            elif options.model in ['llasso']:
                model.fit(X_train,
                          y_train,
                          stations=train.loc[:, 'trainstation'].values)
            elif options.model in ['nlasso']:
                model.partial_fit(X_train,
                                  y_train,
                                  stations=train.loc[:, 'trainstation'].values)
            else:
                model.partial_fit(X_train, y_train)
                if options.feature_selection:
                    try:
                        X_complete = np.append(X_complete, X_train)
                        y_complete = np.append(y_complete, y_train)
                        meta_complete = meta_complete.append(
                            data.loc[:, options.meta_params])
                    except (ValueError, NameError):
                        X_complete = X_train
                        y_complete = y_train
                        meta_complete = data.loc[:, options.meta_params]

        # EVALUATE #############################################################

        # Mean delay over the whole dataset (both train and validation),
        # used to calculate Brier Skill
        mean_delay = options.mean_delay
        if mean_delay is None:
            mean_delay = 3.375953418071136 if options.y_avg else 6.011229358531166

        # Check training score to estimate amount of overfitting
        # Here we assume that we have a datetime index (from time columns)
        y_pred_train = model.predict(X_train)
        if options.normalize:
            y_pred_train = yscaler.inverse_transform(y_pred_train)

        rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
        mae_train = mean_absolute_error(y_train, y_pred_train)
        rmse_stat_train = math.sqrt(
            mean_squared_error(y_train, np.full_like(y_train, mean_delay)))
        skill_train = 1 - rmse_train / rmse_stat_train

        logging.info(
            'Training data metrics:\nRMSE: {}\nMAE: {}\nR2: {}\nBSS: {}'.
            format(rmse_train, mae_train, model.score(X_train, y_train),
                   skill_train))

        try:
            train.sort_values(by='time', inplace=True)
            train.set_index('time', inplace=True)

            range = ('2011-03-01', '2011-03-31')
            X_train_sample = train.loc[range[0]:range[1],
                                       options.feature_params].astype(
                                           np.float32).values
            y_train_sample = train.loc[range[0]:range[1],
                                       options.label_params].astype(
                                           np.float32).values.ravel()
            times = train.loc[range[0]:range[1], :].index.values.ravel()

            y_pred_sample = model.predict(X_train_sample)
            if options.normalize:
                y_pred_sample = yscaler.inverse_transform(y_pred_sample)

            df = pd.DataFrame(y_pred_sample, index=times)

            # Draw visualisation
            fname = '{}/timeseries_training_data_{}_{}.png'.format(
                options.output_path, range[0], range[1])
            viz.plot_delay(times, y_train_sample, y_pred_sample,
                           'Train dataset delay', fname)

            fname = 'scatter_training_data_{}_{}.png'.format(
                range[0], range[1])
            viz.scatter_predictions(times,
                                    y_train_sample,
                                    y_pred_sample,
                                    savepath=options.output_path,
                                    filename=fname)
        except Exception:
            # the training-period visualisation is optional; skip it on failure
            pass

        if options.model == 'llasso':
            print('X_test shape: {}'.format(X_test.shape))
            y_pred, weights = model.predict(X_test,
                                            test.loc[:, 'trainstation'].values)
        else:
            y_pred = model.predict(X_test)

        if options.normalize:
            y_pred = yscaler.inverse_transform(y_pred)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        rmse_stat = math.sqrt(
            mean_squared_error(y_test, np.full_like(y_test, mean_delay)))
        skill = 1 - rmse / rmse_stat

        rmses.append(rmse)
        maes.append(mae)
        r2s.append(r2)
        skills.append(skill)
        start_times.append(start.strftime('%Y-%m-%dT%H:%M:%S'))
        end_times.append(end.strftime('%Y-%m-%dT%H:%M:%S'))
        end_times_obj.append(end)

        logging.info('RMSE: {}'.format(rmse))
        logging.info('MAE: {}'.format(mae))
        logging.info('R2 score: {}'.format(r2))
        logging.info('Brier Skill Score score: {}'.format(skill))

        start = end
        end = start + timedelta(days=int(options.day_step),
                                hours=int(options.hour_step))
        if end > endtime: end = endtime

    # SAVE #####################################################################
    io.save_scikit_model(model,
                         filename=options.save_file,
                         ext_filename=options.save_file)
    if options.normalize:
        fname = options.save_path + '/xscaler.pkl'
        io.save_scikit_model(xscaler, filename=fname, ext_filename=fname)
        fname = options.save_path + '/yscaler.pkl'
        io.save_scikit_model(yscaler, filename=fname, ext_filename=fname)

    if options.model == 'rf':
        fname = options.output_path + '/rfc_feature_importance.png'
        viz.rfc_feature_importance(model.feature_importances_,
                                   fname,
                                   feature_names=options.feature_params)
        #io._upload_to_bucket(filename=fname, ext_filename=fname)

    try:
        fname = options.output_path + '/learning_over_time.png'
        viz.plot_learning_over_time(end_times_obj,
                                    rmses,
                                    maes,
                                    r2s,
                                    filename=fname)
        #io._upload_to_bucket(filename=fname, ext_filename=fname)
    except Exception as e:
        logging.error(e)

    error_data = {
        'start_times': start_times,
        'end_times': end_times,
        'rmse': rmses,
        'mae': maes,
        'r2': r2s,
        'skill': skills
    }
    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    # FEATURE SELECTION ########################################################
    if options.feature_selection:
        logging.info('Doing feature selection...')
        selector = SelectFromModel(model, prefit=True)
        print(pd.DataFrame(data=X_complete))
        X_selected = selector.transform(X_complete)

        selected_columns = f_data.columns.values[selector.get_support()]
        logging.info(
            'Selected following parameters: {}'.format(selected_columns))
        data_sel = meta_complete.join(
            pd.DataFrame(data=y_complete, columns=options.label_params)).join(
                pd.DataFrame(data=X_selected, columns=selected_columns))

        print(pd.DataFrame(data=X_selected, columns=selected_columns))
        print(data_sel)
Example #15
    def set_params(self,
                   n_small_stop=None,
                   n_large_stop=None,
                   kernel=None,
                   length=1.0,
                   length_bounds=(1e-5, 1e5),
                   sigma_0=1.0,
                   periodicity=1.0,
                   gamma=1.0,
                   alpha_rquad=1.0,
                   nu=1.0,
                   **kwargs):

        # Define the interpolation region in N
        self.n_small_stop = n_small_stop
        self.n_large_stop = n_large_stop
        self.kernel_name = kernel

        # Kernels taken from tuned_parameters list
        if kernel == "dotprod":
            new_kernel = C(length, (1e-3, 1e3)) * DotProduct(
                sigma_0=sigma_0, sigma_0_bounds=(1e-5, 1e5))
        elif kernel == "expsin":
            new_kernel = C(1.0, (1e-3, 1e3)) * ExpSineSquared(
                length_scale=length,
                periodicity=periodicity,
                length_scale_bounds=length_bounds)
        elif kernel == "exp":
            new_kernel = C(1.0, (1e-3, 1e3)) * Exponentiation(
                RBF(length_scale=length, length_scale_bounds=length_bounds), 2)
        elif kernel == "matern":
            new_kernel = C(1.0, (1e-3, 1e3)) * Matern(
                length_scale=length, length_scale_bounds=length_bounds, nu=nu)
        elif kernel == "pairwise":
            new_kernel = C(1.0, (1e-3, 1e3)) * PairwiseKernel(
                gamma=gamma, gamma_bounds=(1e-5, 1e5))
        elif kernel == "rbf":
            new_kernel = C(1.0, (1e-3, 1e3)) * RBF(
                length_scale=length, length_scale_bounds=length_bounds)
        elif kernel == "rquad":
            new_kernel = C(1.0, (1e-3, 1e3)) * RationalQuadratic(
                length_scale=length,
                alpha=alpha_rquad,
                length_scale_bounds=length_bounds,
                alpha_bounds=(1e-5, 1e5))
        # Combinations of basic kernels
        elif kernel == "prod":
            new_kernel = C(1.0, (1e-3, 1e3)) * Product(
                RBF(length, length_bounds), Matern(
                    length, length_bounds, nu=nu))
        elif kernel == "sum":
            new_kernel = C(1.0, (1e-3, 1e3)) * Sum(
                RBF(length, length_bounds), Matern(
                    length, length_bounds, nu=nu))
        elif kernel == "prod2":
            new_kernel = C(1.0, (1e-3, 1e3)) * Product(
                RationalQuadratic(length_scale=length,
                                  alpha=alpha_rquad,
                                  length_scale_bounds=length_bounds,
                                  alpha_bounds=(1e-5, 1e5)),
                Matern(length, length_bounds, nu=nu))
        elif kernel == "sum2":
            new_kernel = C(1.0, (1e-3, 1e3)) * Sum(
                RationalQuadratic(length_scale=length,
                                  alpha=alpha_rquad,
                                  length_scale_bounds=length_bounds,
                                  alpha_bounds=(1e-5, 1e5)),
                Matern(length, length_bounds, nu=nu))
        else:
            raise NotImplementedError(f"Kernel not implemented {kernel}")

        # Call the super() class (GaussianProcessRegressor)
        # and manually pass the parameters taken from the tuned_parameters list
        result = super().set_params(kernel=new_kernel, **kwargs)

        return result
Example #16
    def fit_tile(tile, tmp_ref_dir, base_dir, out_dir):

        method = 'gpr'
        subspat = None
        ref_dem_date = np.datetime64('2013-01-01')
        gla_mask = '/calcul/santo/hugonnet/outlines/rgi60_merge.shp'
        inc_mask = '/calcul/santo/hugonnet/outlines/rgi60_buff_10.shp'
        write_filt = True
        clobber = True
        tstep = 1. / 12.
        time_filt_thresh = [-50, 50]
        opt_gpr = False
        filt_ref = 'both'
        filt_ls = False
        conf_filt_ls = 0.99
        # specify the exact temporal extent needed to be able to merge neighbouring stacks properly
        tlim = [np.datetime64('2000-01-01'), np.datetime64('2020-01-01')]

        #for sensitivity test: force final fit only, and change kernel parameters in entry of script
        force_final_fit = True
        k1 = PairwiseKernel(1, metric='linear')  # linear kernel
        k2 = C(period_var) * ESS(length_scale=1,
                                 periodicity=1)  # periodic kernel
        k3 = C(base_var * 0.6) * RBF(base_length * 0.75) + C(
            base_var * 0.3) * RBF(base_length * 1.5) + C(base_var * 0.1) * RBF(
                base_length * 3)
        k4 = PairwiseKernel(1, metric='linear') * C(nonlin_var) * RQ(
            nonlin_length, nonlin_alpha)
        kernel = k1 + k2 + k3 + k4

        lat, lon = SRTMGL1_naming_to_latlon(tile)
        epsg, utm = latlon_to_UTM(lat, lon)
        print('Fitting tile: ' + tile + ' in UTM zone ' + utm)

        # reference DEM
        ref_utm_dir = os.path.join(tmp_ref_dir, utm)
        ref_vrt = os.path.join(ref_utm_dir, 'tmp_' + utm + '.vrt')
        infile = os.path.join(base_dir, utm, tile + '.nc')
        outfile = os.path.join(out_dir, utm, tile + '_final.nc')

        fn_filt = os.path.join(base_dir, utm, tile + '_filtered.nc')

        if True:  #not os.path.exists(outfile):
            ft.fit_stack(infile,
                         fn_filt=fn_filt,
                         fit_extent=subspat,
                         fn_ref_dem=ref_vrt,
                         ref_dem_date=ref_dem_date,
                         exc_mask=gla_mask,
                         tstep=tstep,
                         tlim=tlim,
                         inc_mask=inc_mask,
                         filt_ref=filt_ref,
                         time_filt_thresh=time_filt_thresh,
                         write_filt=True,
                         outfile=outfile,
                         method=method,
                         filt_ls=filt_ls,
                         conf_filt_ls=conf_filt_ls,
                         nproc=nproc,
                         clobber=True,
                         kernel=kernel,
                         force_final_fit=force_final_fit)

            # write dh/dts for visualisation
            ds = xr.open_dataset(outfile)

            t0 = np.datetime64('2000-01-01')
            t1 = np.datetime64('2020-01-01')

            ft.get_full_dh(ds,
                           os.path.join(
                               os.path.dirname(outfile),
                               os.path.splitext(os.path.basename(outfile))[0]),
                           t0=t0,
                           t1=t1)

        else:
            print('Tile already processed.')
Example #17

mnist8m = np.load('/dev/shm/mnist8m.npz')
X_all = mnist8m['X']
X_all /= np.linalg.norm(X_all, axis=1).max()

n_list = np.geomspace(100000, 999999, 10)

urandom_seed = SystemRandom().randrange(99999)
r = np.random.RandomState(urandom_seed)

sigma = np.sqrt(3 * X_all.shape[1])
dot_func = FastMixinKernel(
    DotProduct(sigma_0=0),
    PairwiseKernel(gamma=1 / np.square(sigma),
                   metric='linear',
                   pairwise_kernels_kwargs={'n_jobs': 1}))

res = np.zeros((len(n_list), 4))

desired_k = 20

for (i_cand, n_cand) in enumerate(n_list):
    result = []
    print(n_cand)
    I_train = sample_without_replacement(n_population=X_all.shape[0],
                                         n_samples=int(n_cand),
                                         random_state=r)

    X_train = X_all[I_train, :]
    n = X_train.shape[0]
Example #18
        return self.gp_kernel.diag(X)

    def is_stationary(self):
        return self.gp_kernel.is_stationary()

mnist = fetch_openml('mnist_784', version=1, cache=True, data_home='~/python/datasets/')

n_list = np.linspace(1000, 70000, 15)

urandom_seed = SystemRandom().randrange(99999)
r = np.random.RandomState(urandom_seed)

sigma = np.sqrt(3 * mnist.data.shape[1])
dot_func = FastMixinKernel(
    RBF(sigma),
    PairwiseKernel(gamma=1 / np.square(sigma), metric='rbf',
                   pairwise_kernels_kwargs={'n_jobs': -2})
)

res = np.zeros((len(n_list), 4))

desired_k = 10


for (i_cand, n_cand) in enumerate(n_list):
    result = []
    print(n_cand)
    I_train = sample_without_replacement(n_population=mnist.data.shape[0],
                                         n_samples=int(n_cand),
                                         random_state=r)

    X_train = mnist.data[I_train, :]/255.
Example #19
            if diff - diff_std > 0:
                base_var = 50 + (diff - diff_std) ** 2 / 2
            else:
                base_var = 50.
        else:
            base_var = 50.
    else:
        nonlin_var = (res / res_stdized) ** 2
        # nonlin_var = 1
        period_nonlinear = 100. / res_stdized ** 2
        # period_nonlinear = 10.
        base_var = 50

    print(base_var)

    k1 = PairwiseKernel(1, metric='linear') + PairwiseKernel(1, metric='linear') * C(nonlin_var) * RQ(10, period_nonlinear)  # linear kernel
    k2 = C(30) * ESS(length_scale=1, periodicity=1)  # periodic kernel
    k3 = C(base_var) * RBF(1.5)
    kernel = k1 + k2 + k3

    mu_x = np.nanmean(time_vals[good_vals])
    detr_t_pred = t_pred - mu_x
    detr_time_vals = time_vals - mu_x
    mu_y = np.nanmean(data_vals)
    detr_data_vals = data_vals - mu_y

    # if we remove a linear trend, normalize_y should be false...
    gp = GaussianProcessRegressor(kernel=kernel, optimizer=optimizer, n_restarts_optimizer=n_restarts_optimizer,
                                  alpha=err_vals[good_vals], normalize_y=False)
    gp.fit(detr_time_vals[good_vals].reshape(-1, 1), detr_data_vals[good_vals].reshape(-1, 1))
    y_pred, sigma = gp.predict(detr_t_pred.reshape(-1, 1), return_std=True)
Example #20
def main():

    if hasattr(options, 'dask'):
        client = Client('{}:8786'.format(options.dask))
    else:
        client = Client()

    logging.info(client)

    if hasattr(options, 's3_bucket'):
        fh = FileHandler(s3_bucket=options.s3_bucket)
        viz = Viz(io=fh)
    else:
        fh = FileHandler()
        viz = Viz()

    datasets = fh.read_data([options.train_data, options.test_data], options)

    X_train = datasets[0][0]
    y_train = datasets[0][1]
    X_test = datasets[1][0]
    y_test = datasets[1][1]

    # Train
    if options.model == 'svct':
        model = SVCT(verbose=True)
    elif options.model == 'gp':
        kernel = PairwiseKernel(metric='laplacian') * DotProduct()
        model = GaussianProcessClassifier(kernel=kernel, n_jobs=-1)
    elif options.model == 'rfc':
        # param_grid_rfc = {
        # "n_estimators": [10, 100, 150, 200, 250, 500],
        # "max_depth": [20, 50, 100, None],
        # "max_features": ["auto", "log2", None],
        # "min_samples_split": [2,5,10],
        # "min_samples_leaf": [1, 2, 4],
        # "bootstrap": [False]
        # }

        # Fetched using 5-fold cv with random search from params above
        if "national" in options.dataset:
            params = {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False, 'n_jobs': -1}
        else:
            params = {'n_estimators': 250, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': None, 'max_depth': 20, 'bootstrap': False, 'n_jobs': -1}

        model = RandomForestClassifier(**params)

    elif options.model == 'gnb':
        model = GaussianNB()
    else:
        raise Exception('Model not defined')

    logging.info('Training...')
    if options.model == 'gnb':
        priors = []
        for i in np.arange(0, 1, .05):
            for j in np.arange(0, 1 - i, .05):
                k = 1 - i - j
                priors.append([i, j, k])

        param_grid_gnb = {
            'priors': priors + [None],
            'var_smoothing': expon(scale=.01)
        }
        model, cv_results = cv(model, param_grid_gnb, X_train, y_train, n_iter=500)
    else:
        with joblib.parallel_backend('dask'):
            model.fit(X_train, y_train)

    # Evaluate
    y_pred_train = model.predict(X_train)
    logging.info('Training report:\n{}'.format(classification_report(y_train, y_pred_train)))

    y_pred = model.predict(X_test)
    logging.info('Validation report:\n{}'.format(classification_report(y_test, y_pred)))

    fname = '{}/confusion_matrix_testset.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test, y_pred, np.arange(3), filename=fname)

    fname = '{}/confusion_matrix_testset_normalised.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test, y_pred, np.arange(3), True, filename=fname)

    if options.model == 'rfc':
        # Sort feature importances in descending order and rearrange feature names accordingly
        indices = np.argsort(model.feature_importances_)[::-1]
        names = [options.feature_params[i] for i in indices]
        importances = model.feature_importances_[indices]

        fname = '{}/feature_importances.png'.format(options.output_path)

        viz.rfc_feature_importance(importances, fname, names)

    if options.model == 'svct':
        fh.save_svct(model, options.save_path)
    else:
        fh.save_model(model, options.save_path)
Example #21
# print("pt: {}".format(pt.shape))
# print("sigma: {}".format(sigma.shape))
# print("pt mesh: {}".format(pt_exact.shape))
# print("sigma exact: {}".format(sigma_exact.shape))

print("\nPreparing for exhaustive GridSearch():")

# Set the parameters to hyperoptimize
kernel1 = DotProduct(sigma_0=1.0, sigma_0_bounds=(1e-5, 1e5))
kernel2 = ExpSineSquared(length_scale=1.0,
                         periodicity=1.0,
                         length_scale_bounds=(1e-5, 1e5))
kernel3 = Exponentiation(
    RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)), 2)
kernel4 = Matern(length_scale=1.0, length_scale_bounds=(1e-5, 1e5), nu=1.5)
kernel5 = PairwiseKernel(gamma=1.0, gamma_bounds=(1e-5, 1e5))
kernel6 = Product(RBF(1.0, (1e-5, 1e5)), Matern(1.0, (1e-5, 1e5), nu=1.5))
kernel7 = RBF(length_scale=1.0, length_scale_bounds=(1e-5, 1e5))
kernel8 = RationalQuadratic(length_scale=1.0,
                            alpha=1.0,
                            length_scale_bounds=(1e-5, 1e5),
                            alpha_bounds=(1e-5, 1e5))
kernel9 = Sum(RBF(1.0, (1e-2, 1e2)), Matern(10, (1e-2, 1e2), nu=1.5))
# List of hyperparameters given to the GridSearchCV()
tuned_parameters = [{
    "kernel": [
        kernel1, kernel2, kernel3, kernel4, kernel5, kernel6, kernel7, kernel8,
        kernel9
    ]
}]
kernel_names = ["DP", "ES", "Exp", "Mat", "PW", "Prod", "RBF", "RQ", "Sum"]
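A sketch of how the tuned_parameters grid defined above would typically be passed to GridSearchCV with a GaussianProcessRegressor (an assumed continuation, not the original script; pt and sigma are the arrays referenced in the commented prints and are not defined in this snippet):

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import GridSearchCV

# Each candidate kernel from tuned_parameters is cross-validated in turn.
search = GridSearchCV(GaussianProcessRegressor(), tuned_parameters,
                      scoring='neg_mean_squared_error', cv=5)
# search.fit(pt, sigma)  # pt / sigma come from the surrounding script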
Example #22
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import numpy as np
from bkb_lib import BKB
from sklearn.gaussian_process.kernels import PairwiseKernel
import matplotlib.pyplot as plt

r_state = np.random.RandomState(seed=42)

d = 3
k = 1000
T = 2000
dot_func = PairwiseKernel(metric='linear')
w_star = r_state.randn(d).reshape(1, -1)
arms = r_state.randn(k, d)
arms_score = dot_func(arms, w_star.reshape(1, -1))
best_arm = np.argmax(arms_score)

noise_ratio = 0.01
noise_std = np.sqrt((arms_score.max() - arms_score.min()) * noise_ratio)
f = lambda x: dot_func(x, w_star) + r_state.randn() * noise_std

bkb_alg = BKB(lam=noise_std**2.,
              dot=dot_func,
              noise_variance=noise_std**2.,
              fnorm=1.0,
              delta=0.5,
              qbar=1,
Example #23
x_test = scaler.transform(x_test)

#First try with standard model

pca = PCA(n_components=100)

x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

kmean = KMeans(n_clusters=10)
kmean.fit(x_train, y_train)

cent = kmean.predict(kmean.cluster_centers_)
cent = np_utils.to_categorical(cent, 10)

gp_kernel = PairwiseKernel(gamma=1, metric='linear')
gp = GaussianProcessRegressor(kernel=gp_kernel)

gp.fit(kmean.cluster_centers_, cent)

t = gp.predict(x_train)
t2 = gp.predict(x_test)

model = Sequential()
model.add(Dense(20, input_dim=10, activation='relu'))
model.add(Dense(10, activation='sigmoid'))
model.compile(loss='mean_squared_error',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(t,