Example #1
    def test_multioutput(self):

        # http://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py

        from sklearn.multioutput import MultiOutputRegressor
        from sklearn.ensemble import RandomForestRegressor

        # Create a random dataset
        rng = np.random.RandomState(1)
        X = np.sort(200 * rng.rand(600, 1) - 100, axis=0)
        y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T
        y += (0.5 - rng.rand(*y.shape))

        df = pdml.ModelFrame(X, target=y)

        max_depth = 30

        rf1 = df.ensemble.RandomForestRegressor(max_depth=max_depth,
                                                random_state=self.random_state)
        reg1 = df.multioutput.MultiOutputRegressor(rf1)

        rf2 = RandomForestRegressor(max_depth=max_depth,
                                    random_state=self.random_state)
        reg2 = MultiOutputRegressor(rf2)

        df.fit(reg1)
        reg2.fit(X, y)

        result = df.predict(reg2)
        expected = pd.DataFrame(reg2.predict(X))
        tm.assert_frame_equal(result, expected)
def test_multi_target_sample_weights_api():
    X = [[1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [2.718, 3.141]]
    w = [0.8, 0.6]

    rgr = MultiOutputRegressor(Lasso())
    assert_raises_regex(ValueError, "does not support sample weights", rgr.fit, X, y, w)

    # no exception should be raised if the base estimator supports weights
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X, y, w)
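The ValueError in the first half of this test comes from MultiOutputRegressor checking the base estimator's fit signature before forwarding weights. A minimal sketch of the same check, using scikit-learn's public has_fit_parameter helper (which estimators accept sample_weight varies by scikit-learn version; Lasso gained support in releases later than the one this test targets):

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import has_fit_parameter

# MultiOutputRegressor refuses weights whenever this check fails
# for the wrapped estimator.
print(has_fit_parameter(KNeighborsRegressor(), "sample_weight"))        # False
print(has_fit_parameter(GradientBoostingRegressor(), "sample_weight"))  # True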
Example #3
def test_acquisition_per_second_gradient(acq_func):
    rng = np.random.RandomState(0)
    X = rng.randn(20, 10)
    # Make the second component large, so that mean_grad and std_grad
    # do not become zero.
    y = np.vstack((X[:, 0], np.abs(X[:, 0])**3)).T

    for X_new in [rng.randn(10), rng.randn(10)]:
        gpr = cook_estimator("GP", Space(((-5.0, 5.0),)), random_state=0)
        mor = MultiOutputRegressor(gpr)
        mor.fit(X, y)
        check_gradient_correctness(X_new, mor, acq_func, 1.5)
def test_multi_target_sparse_regression():
    X, y = datasets.make_regression(n_targets=3)
    X_train, y_train = X[:50], y[:50]
    X_test = X[50:]

    for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix, sp.lil_matrix]:
        rgr = MultiOutputRegressor(Lasso(random_state=0))
        rgr_sparse = MultiOutputRegressor(Lasso(random_state=0))

        rgr.fit(X_train, y_train)
        rgr_sparse.fit(sparse(X_train), y_train)

        assert_almost_equal(rgr.predict(X_test), rgr_sparse.predict(sparse(X_test)))
Example #5
def test_multi_target_sample_weight_partial_fit():
    # weighted regressor
    X = [[1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [2.718, 3.141]]
    w = [2., 1.]
    rgr_w = MultiOutputRegressor(SGDRegressor(random_state=0))
    rgr_w.partial_fit(X, y, w)

    # weighted with different weights
    w = [2., 2.]
    rgr = MultiOutputRegressor(SGDRegressor(random_state=0))
    rgr.partial_fit(X, y, w)

    assert_not_equal(rgr.predict(X)[0][0], rgr_w.predict(X)[0][0])
Example #6
def test_multi_target_sample_weights():
    # weighted regressor
    Xw = [[1, 2, 3], [4, 5, 6]]
    yw = [[3.141, 2.718], [2.718, 3.141]]
    w = [2., 1.]
    rgr_w = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [3.141, 2.718], [2.718, 3.141]]
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X, y)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(rgr.predict(X_test), rgr_w.predict(X_test))
Example #7
def run_one_configuration(
    full_train_covariate_matrix,
    complete_target,
    new_valid_covariate_data_frames,
    new_valid_target_data_frame,
    std_data_frame,
    target_clusters,
    featurizer,
    model_name,
    parameters,
    log_file,
):
    model_baseline = dict()
    model_baseline["type"] = model_name
    model_baseline["target_clusters"] = target_clusters

    if model_name == "multi_task_lasso":
        model = MultiTaskLasso(max_iter=5000, **parameters)
    elif model_name == "xgboost":
        model = MultiOutputRegressor(
            XGBRegressor(n_jobs=10,
                         objective="reg:squarederror",
                         verbosity=0,
                         **parameters))

    model.fit(featurizer(full_train_covariate_matrix),
              complete_target.to_numpy(copy=True))
    model_baseline["model"] = lambda x: model.predict(featurizer(x))

    skill, _, _, _ = location_wise_metric(
        new_valid_target_data_frame,
        new_valid_covariate_data_frames,
        std_data_frame,
        model_baseline,
        "skill",
    )
    cos_sim, _, _, _ = location_wise_metric(
        new_valid_target_data_frame,
        new_valid_covariate_data_frames,
        std_data_frame,
        model_baseline,
        "cosine-sim",
    )
    with open(log_file, "a") as f:
        f.write(f"{len(target_clusters)} {parameters} {skill} {cos_sim}\n")
Example #8
def first_stage():
    return GridSearchCVList([
        LinearRegression(),
        WeightedMultiTaskLasso(
            alpha=0.05, fit_intercept=True, tol=1e-6, random_state=123),
        RandomForestRegressor(n_estimators=100,
                              max_depth=3,
                              min_samples_leaf=10,
                              random_state=123),
        MultiOutputRegressor(
            GradientBoostingRegressor(n_estimators=20,
                                      max_depth=3,
                                      min_samples_leaf=10,
                                      random_state=123))
    ],
        param_grid_list=[{}, {}, {}, {}],
        cv=3,
        iid=True)
def load_SVM():
    '''
    Loads Support Vector Machine and gives a name for the output files.
    
    Parameters : None
    
    Returns    : model_name : (str) Name of the model for output file.
                 clf        : (Classifier) Building and Floor Classifier
                 regr       : (Regressor) Longitude and Latitude Regressor
    '''
    model_name = "Support Vector Machine"
    clf = SVC(C=100, kernel="linear", max_iter=1000)
    clf = MultiOutputClassifier(clf)

    regr = SVR(C=100, kernel="linear", max_iter=1000)
    regr = MultiOutputRegressor(regr)

    return model_name, clf, regr
Example #10
def randomSearch(base_model, random_grid):
    random = RandomizedSearchCV(MultiOutputRegressor(base_model),
                                param_distributions=random_grid,
                                n_iter=100,
                                cv=3,
                                verbose=2,
                                random_state=42,
                                n_jobs=-1)

    random.fit(train_X, train_y)
    print(random.best_params_)
    best_random = random.best_estimator_
    pred_y_train = best_random.predict(train_X)
    print_scores(train_y_array, pred_y_train)
    pred_y_test = best_random.predict(test_X)
    print_scores(test_y_array, pred_y_test)
    pred_y_dev = best_random.predict(dev_X)
    print_scores(dev_y_array, pred_y_dev)
Example #11
def crossValidationMLPR(X, Y):
    """Fonction qui essaie plusieurs possibilités"""

    # Split the data into a training set and a validation set
    print("***Splitting off the validation set***")
    x_train, x_validation, y_train, y_validation_txt = train_test_split(
        X, Y, stratify=Y, test_size=0.2, shuffle=True)
    y_train, y_validation = transformerGranuArgi(
        y_train), transformerGranuArgi(y_validation_txt)

    print('***Defining the parameters to test***')
    param = {
        'hidden_layer_sizes': [
            tuple(np.random.randint(20, 35, np.random.randint(3, 5, 1)))
            for _ in range(5)
        ]
    }

    print('***Defining the models to train***')
    mlpr = [
        MLPRegressor(solver='adam',
                     max_iter=1000,
                     alpha=1e-5,
                     activation='tanh',
                     hidden_layer_sizes=param['hidden_layer_sizes'][i])
        for i in range(len(param['hidden_layer_sizes']))
    ]
    multioutput_rna = [MultiOutputRegressor(modele) for modele in mlpr]

    # Scores of correct predictions on the validation set
    resultat_sur_validation = [
        0 for _ in range(len(param['hidden_layer_sizes']))
    ]

    for i, modele in enumerate(multioutput_rna):
        print(
            f"[Training model {i}] Hidden layer sizes: {param['hidden_layer_sizes'][i]}"
        )
        modele.fit(x_train, y_train)
        print(modele.score(x_validation, y_validation))
        y_res = modele.predict(x_validation)
        y_res = conversionPredictionSol(y_res)
        print(scorePrediction(y_res, np.array(y_validation_txt)))
        print('\n')
Example #12
    def create_model(self, C=-1, gamma=-1, epsilon=-1):
        # These checks mean: by default, use the default values I chose
        # (initialized in the constructor); otherwise use the values passed as parameters
        if (C == -1):
            C = self.C
        if (gamma == -1):
            gamma = self.gamma
        if (epsilon == -1):
            epsilon = self.epsilon

        self.model = SVR(C=C, gamma=gamma, epsilon=epsilon)
        if self.output_multi:
            # when using multiple y targets, wrap the SVR so it can handle them
            multi_output_model = MultiOutputRegressor(estimator=self.model)
            self.model = multi_output_model

        print(self.model)
        return self.model
Example #13
def gbr_model(yvar, n_estimators, max_depth, min_samples_leaf,
              min_samples_split, max_features, loss):
    if max_features != 'auto':
        max_features = int(max_features)
    n_estimators, min_samples_leaf, min_samples_split, max_depth = \
        int(n_estimators), int(min_samples_leaf), int(min_samples_split), int(max_depth)

    reg = GradientBoostingRegressor(random_state=42,
                                    max_depth=max_depth,
                                    n_estimators=n_estimators,
                                    max_features=max_features,
                                    min_samples_leaf=min_samples_leaf,
                                    loss=loss,
                                    min_samples_split=min_samples_split)
    if yvar.shape[1] == 1:
        reg_trans = reg
    else:
        reg_trans = MultiOutputRegressor(reg, n_jobs=-1)
    return reg_trans
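The wrapper is needed only in the multi-target branch because GradientBoostingRegressor accepts a single 1-D y. A minimal sketch with hypothetical toy data:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor

X = np.random.rand(20, 3)
Y = np.random.rand(20, 2)  # two target columns

# GradientBoostingRegressor().fit(X, Y) raises a ValueError (y must be 1-D);
# the wrapper instead fits one boosted model per target column.
MultiOutputRegressor(GradientBoostingRegressor()).fit(X, Y)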
Example #14
def train_consumer():
    cdf = pd.read_csv(CONSUMER_TRAINING)

    xs = ['risk', 'delta_risk', 'grat_payoff', 'delta_grat_payoff',\
        'inv_payoff', 'delta_inv_payoff', 'surface_area_risk_factor',\
        'delta_surface_area_risk_factor']

    ys = ['GREED', 'FOCUS', 'SPEND', 'INVEST']

    cx, cy = cdf[xs], cdf[ys]
    # will use a multi-output regressor
    model = MultiOutputRegressor(
        GradientBoostingRegressor(random_state=0)).fit(cx, cy)

    # overwrite CMODEL_FILE with the trained model ('wb' truncates the file)
    with open(CMODEL_FILE, 'wb') as f:
        pickle.dump(model, f)
Example #15
    def _check_arguments(self, base_estimator, n_initial_points, acq_optimizer,
                         dimensions):
        """Check arguments for sanity."""

        if isinstance(base_estimator, str):
            base_estimator = cook_estimator(base_estimator,
                                            space=dimensions,
                                            random_state=self.rng.randint(
                                                0,
                                                np.iinfo(np.int32).max))

        if not is_regressor(base_estimator) and base_estimator is not None:
            raise ValueError("%s has to be a regressor." % base_estimator)

        is_multi_regressor = isinstance(base_estimator, MultiOutputRegressor)
        if "ps" in self.acq_func and not is_multi_regressor:
            self.base_estimator_ = MultiOutputRegressor(base_estimator)
        else:
            self.base_estimator_ = base_estimator

        if n_initial_points < 0:
            raise ValueError("Expected `n_initial_points` >= 0, got %d" %
                             n_initial_points)
        self._n_initial_points = n_initial_points
        self.n_initial_points_ = n_initial_points

        if acq_optimizer == "auto":
            if has_gradients(self.base_estimator_):
                acq_optimizer = "lbfgs"
            else:
                acq_optimizer = "sampling"

        if acq_optimizer not in ["lbfgs", "sampling"]:
            raise ValueError("Expected acq_optimizer to be 'lbfgs' or "
                             "'sampling', got {0}".format(acq_optimizer))

        if (not has_gradients(self.base_estimator_)
                and acq_optimizer != "sampling"):
            raise ValueError("The regressor {0} should run with "
                             "acq_optimizer"
                             "='sampling'.".format(type(base_estimator)))

        self.acq_optimizer = acq_optimizer
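In scikit-optimize, the acquisition functions ending in "ps" model evaluation time alongside the objective value, which is why a plain regressor is wrapped into a two-output MultiOutputRegressor above. A minimal sketch of that two-output contract, with hypothetical data in place of the optimizer's telemetry:

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.multioutput import MultiOutputRegressor

X = np.random.rand(30, 2)
y = np.column_stack([
    np.sin(X).sum(axis=1),  # objective value
    np.log1p(X[:, 0]),      # log of evaluation time (illustrative)
])
model = MultiOutputRegressor(GaussianProcessRegressor()).fit(X, y)
assert model.predict(X[:5]).shape == (5, 2)  # one column per output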
Example #16
    def simulate(self, x_train, y_train, regression=True):
        writer = csv.writer(
            open("CSVResult/CSVResultDuringSimulation.csv", 'w'))
        writer.writerow(self.title)
        for simulation in self.HyperParameterArray:
            start_time = time.time()
            if regression:
                svr = svm.SVR(kernel=simulation.kernel,
                              gamma=simulation.gamma,
                              coef0=simulation.coef,
                              degree=simulation.degree,
                              C=simulation.C,
                              epsilon=simulation.epsilon)
                SVRegressor = MultiOutputRegressor(svr, n_jobs=8)
            else:
                SVRegressor = svm.SVC(kernel=simulation.kernel,
                                      gamma=simulation.gamma,
                                      coef0=simulation.coef,
                                      degree=simulation.degree,
                                      C=simulation.C)

            # I can evaluate the model also with cross Validation
            # CrossValidationScores = cross_val_score(SVRegressor, x_train, y_train, cv=5)
            # I can evaluate the model with kfold validation
            valScore, TrainingScore = validation.kFoldCross(
                SVRegressor.fit,
                SVRegressor.predict,
                x_train,
                y_train,
                n_splits=self.kfoldDim)
            valScore = np.array(valScore)
            TrainingScore = np.array(TrainingScore)
            timeSimulation = abs(time.time() - start_time)
            simulation.SaveResult(valScore, TrainingScore, timeSimulation)
            print("\n")
            print("Validation error: %0.2f (+/- %0.2f)" %
                  (valScore.mean(), valScore.std() * 2))
            print("Training Error: %0.2f (+/- %0.2f)" %
                  (TrainingScore.mean(), TrainingScore.std() * 2))
            print("time = %0.2f" % timeSimulation)
            param = simulation.getValue()
            writer.writerow(param)
Example #17
    def test_sklearn_multioutput_regressor(self):
        for n_targets in [2, 3, 4]:
            for model_class in [DecisionTreeRegressor, ExtraTreesRegressor, RandomForestRegressor, LinearRegression]:
                seed = random.randint(0, 2**32 - 1)
                if model_class != LinearRegression:
                    model = MultiOutputRegressor(model_class(random_state=seed))
                else:
                    model = MultiOutputRegressor(model_class())
                X, y = datasets.make_regression(
                    n_samples=50, n_features=10, n_informative=5, n_targets=n_targets, random_state=seed
                )
                X = X.astype("float32")
                y = y.astype("float32")
                model.fit(X, y)

                torch_model = hummingbird.ml.convert(model, "torch", extra_config={constants.TREE_OP_PRECISION_DTYPE: "float64"})
                self.assertTrue(torch_model is not None)
                np.testing.assert_allclose(model.predict(X), torch_model.predict(X), rtol=1e-5, atol=1e-4, err_msg="{}/{}/{}".format(n_targets, model_class, seed))
Example #18
def create_model(model_type, shape=None, config=None):
    if model_type == 'Multi-Output Gradient Boosting Regressor':
        random_state = 0
        if config:
            data = json.loads(config)
            random_state = data.get('random_state') if data.get(
                'random_state') else 0
        model = MultiOutputRegressor(
            GradientBoostingRegressor(random_state=random_state))
        return model
    if model_type == 'Gradient Boosting Regressor':
        random_state = 0
        if config:
            data = json.loads(config)
            random_state = data.get('random_state') if data.get(
                'random_state') else 0
        model = GradientBoostingRegressor(random_state=random_state)
        return model

    if model_type == "SVM Classification":
        '''
        Pros:
         It works really well with a clear margin of separation
         It is effective in high-dimensional spaces
         It is effective in cases where the number of dimensions is greater than the number of samples
         It uses a subset of training points in the decision function (called support vectors), so it is also memory efficient
        Cons:
         It doesn't perform well when we have a large data set, because the required training time is higher
         It also doesn't perform very well when the data set has more noise, i.e. target classes are overlapping
         SVM doesn't directly provide probability estimates; these are calculated using an expensive five-fold cross validation
         It corresponds to the SVC method of the Python scikit-learn library
        '''
        return __create_svm_classifier__(config)

    if model_type == "SVM Regression":
        return __create_svm_regressor__(config)

    if model_type == "KNN Classifier":
        return __create_KNN_classifier(config)

    if model_type == 'Keras Sequential Model':
        return __create_sequential_model(config)
def generate_joint_model(single_model):
    model = MultiOutputRegressor(single_model)
    model.fit(X_train, Y_train)
    
    score_train = model.score(X_train, Y_train)
    print('Score of train', round(score_train * 100, 1), "%")
    
    score = model.score(X_test, Y_test)
    print('Score of test', round(score * 100, 1), "%")
    
    model_path = model_folder + r"/" +  \
                    str(round(score, 3)).replace('.', '_') + r"_" +  \
                    str(model.get_params()['estimator']).split('(')[0] + \
                    '.joblib'
    joblib.dump(model, model_path)
    print("Save model file", model_path)
    
    return model, model_path
Example #20
    def base_estimator(self, value):
        # Build `base_estimator` if string given
        if isinstance(value, str):
            value = cook_estimator(value,
                                   space=self.space,
                                   random_state=self.rng.randint(
                                       0,
                                       np.iinfo(np.int32).max))

        # Check if regressor
        if not is_regressor(value) and value is not None:
            raise ValueError(
                f"`base_estimator` must be a regressor. Got {value}")

        # Treat per second acquisition function specially
        is_multi_regressor = isinstance(value, MultiOutputRegressor)
        if self.acq_func.endswith("ps") and not is_multi_regressor:
            value = MultiOutputRegressor(value)

        self._base_estimator = value
Example #21
def test_multi_target_regression_partial_fit():
    X, y = datasets.make_regression(n_targets=3)
    X_train, y_train = X[:50], y[:50]
    X_test, y_test = X[50:], y[50:]

    references = np.zeros_like(y_test)
    half_index = 25
    for n in range(3):
        sgr = SGDRegressor(random_state=0)
        sgr.partial_fit(X_train[:half_index], y_train[:half_index, n])
        sgr.partial_fit(X_train[half_index:], y_train[half_index:, n])
        references[:, n] = sgr.predict(X_test)

    sgr = MultiOutputRegressor(SGDRegressor(random_state=0))

    sgr.partial_fit(X_train[:half_index], y_train[:half_index])
    sgr.partial_fit(X_train[half_index:], y_train[half_index:])

    y_pred = sgr.predict(X_test)
    assert_almost_equal(references, y_pred)
Example #22
def test_diff_detector_threshold(n_features_y: int, n_features_x: int):
    """
    Basic construction logic of thresholds_ attribute in the
    DiffBasedAnomalyDetector
    """
    X = np.random.random((100, n_features_x))
    y = np.random.random((100, n_features_y))

    model = DiffBasedAnomalyDetector(base_estimator=MultiOutputRegressor(
        estimator=LinearRegression()))

    # Model has own implementation of cross_validate
    assert hasattr(model, "cross_validate")

    # When initialized it should not have a threshold calculated.
    assert not hasattr(model, "feature_thresholds_")
    assert not hasattr(model, "aggregate_threshold_")
    assert not hasattr(model, "feature_thresholds_per_fold_")
    assert not hasattr(model, "aggregate_thresholds_per_fold_")

    model.fit(X, y)

    # Until it has done cross validation, it has no threshold.
    assert not hasattr(model, "feature_thresholds_")
    assert not hasattr(model, "aggregate_threshold_")
    assert not hasattr(model, "feature_thresholds_per_fold_")
    assert not hasattr(model, "aggregate_thresholds_per_fold_")

    # Calling cross validate should set the threshold for it.
    model.cross_validate(X=X, y=y)

    # Now we have calculated thresholds based on cross validation folds
    assert hasattr(model, "feature_thresholds_")
    assert hasattr(model, "aggregate_threshold_")
    assert hasattr(model, "feature_thresholds_per_fold_")
    assert hasattr(model, "aggregate_thresholds_per_fold_")
    assert isinstance(model.feature_thresholds_, pd.Series)
    assert len(model.feature_thresholds_) == y.shape[1]
    assert all(model.feature_thresholds_.notna())
    assert isinstance(model.feature_thresholds_per_fold_, pd.DataFrame)
    assert isinstance(model.aggregate_thresholds_per_fold_, dict)
Example #23
def grid_search_gbr(xvar, yvar, n_estimators, max_depth, min_samples_leaf,
                    min_samples_split, cv, n_iter):
    n_estimators = list(map(int, n_estimators.split(',')))
    max_depth = list(map(int, max_depth.split(',')))
    min_samples_leaf = list(map(int, min_samples_leaf.split(',')))
    min_samples_split = list(map(int, min_samples_split.split(',')))
    n_iter = int(n_iter)
    cv = int(cv)

    if yvar.shape[1] == 1:
        yvar_ravel = yvar.values.ravel()
        parameters = {
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf,
            'min_samples_split': min_samples_split
        }
        gbr = GradientBoostingRegressor(random_state=42)

    else:
        yvar_ravel = yvar
        parameters = {
            'estimator__n_estimators': n_estimators,
            'estimator__max_depth': max_depth,
            'estimator__min_samples_leaf': min_samples_leaf,
            'estimator__min_samples_split': min_samples_split
        }
        gbr = MultiOutputRegressor(GradientBoostingRegressor(random_state=42),
                                   n_jobs=-1)

    ss = ShuffleSplit(n_splits=cv, test_size=0.25, random_state=42)
    random_cv = RandomizedSearchCV(estimator=gbr,
                                   param_distributions=parameters,
                                   cv=ss,
                                   n_iter=n_iter,
                                   scoring='neg_mean_squared_error',
                                   n_jobs=-1,
                                   random_state=42)
    random_cv.fit(xvar, yvar_ravel)

    return list(random_cv.best_params_.items())
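The estimator__ prefixes used above follow scikit-learn's nested-parameter convention: once the GradientBoostingRegressor sits inside a MultiOutputRegressor, its parameters are addressed through the wrapper. The valid names can be listed with the standard get_params() API (printed names are illustrative):

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor

gbr = MultiOutputRegressor(GradientBoostingRegressor(random_state=42))
nested = [name for name in gbr.get_params() if name.startswith('estimator__')]
print(nested[:4])  # e.g. ['estimator__alpha', 'estimator__ccp_alpha', ...]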
def make_bayesian_pred(df, next_week, debug=0):
    """
    This method creates predictions using bayesian regression.
    """
    space = {
        'estimator__alpha_1': [1e-10, 1e-5, 1],
        'estimator__alpha_2': [1e-10, 1e-5, 1],
        'estimator__lambda_1': [1e-10, 1e-5, 1],
        'estimator__lambda_2': [1e-10, 1e-5, 1],
        'estimator__n_iter': [10, 300, 1000],
        'estimator__normalize': [True, False],
        'estimator__fit_intercept': [True, False]
    }
    params = {
        'estimator__alpha_1': [1e-10, 1e-5, 1, 5],
        'estimator__alpha_2': [1e-10, 1e-5, 1, 5],
        'estimator__lambda_1': [1e-10, 1e-5, 1, 5],
        'estimator__lambda_2': [1e-10, 1e-5, 1, 5],
        'estimator__n_iter': [10, 300, 1000],
        'estimator__normalize': [True, False],
        'estimator__n_jobs': -1,
        'n_jobs': -1,
        'estimator__fit_intercept': [True, False]
    }
    X_train, X_test, Y_train, Y_test = process_data(df, next_week)
    multi_bay = MultiOutputRegressor(BayesianRidge())
    #multi_bay.set_params(**params)
    #best_random = grid_search(multi_bay, space, next_week, 3, X_train, Y_train)
    multi_bay.fit(X_train, Y_train)
    next_week[Y_train.columns] = multi_bay.predict(next_week[X_train.columns])
    if debug:
        y_pred_untrain = multi_bay.predict(X_train)
        print(next_week)
        print("Score: ", multi_bay.score(X_train, Y_train) * 100)
        print("MSE: ", metrics.mean_squared_error(Y_train, y_pred_untrain))
        print(
            "CV: ",
            ms.cross_val_score(multi_bay,
                               X_train,
                               Y_train,
                               cv=10,
                               scoring='neg_mean_squared_error'))
    return next_week
Example #25
    def test_multiple_treatments(self):
        np.random.seed(123)
        # Only applicable to continuous treatments
        # Generate data for 2 treatments
        TE = np.array([[TestOrthoForest._exp_te(x), TestOrthoForest._const_te(x)] for x in TestOrthoForest.X])
        coefs_T = uniform(0, 1, size=(TestOrthoForest.support_size, 2))
        T = np.matmul(TestOrthoForest.W[:, TestOrthoForest.support], coefs_T) + \
            uniform(-1, 1, size=(TestOrthoForest.n, 2))
        delta_Y = np.array([np.dot(TE[i], T[i]) for i in range(TestOrthoForest.n)])
        Y = delta_Y + np.dot(TestOrthoForest.W[:, TestOrthoForest.support], TestOrthoForest.coefs_Y) + \
            TestOrthoForest.epsilon_sample(TestOrthoForest.n)
        # Test multiple treatments with controls
        est = ContinuousTreatmentOrthoForest(n_trees=50, min_leaf_size=10,
                                             max_depth=50, subsample_ratio=0.30, bootstrap=False, n_jobs=4,
                                             model_T=MultiOutputRegressor(Lasso(alpha=0.024)),
                                             model_Y=Lasso(alpha=0.024),
                                             model_T_final=WeightedLassoCVWrapper(),
                                             model_Y_final=WeightedLassoCVWrapper())
        est.fit(Y, T, TestOrthoForest.X, TestOrthoForest.W)
        expected_te = np.array([TestOrthoForest.expected_exp_te, TestOrthoForest.expected_const_te]).T
        self._test_te(est, expected_te, tol=0.5, treatment_type='multi')
Example #26
def fit(
    experiment,
    x_train,
    y_train,
    parameters,
    alpha: float = 0.5,
    delta_e_loss: bool = True,
):
    regressor = MultiOutputRegressor(
        LGBMRegressor(objective='quantile', alpha=alpha, **parameters))

    if delta_e_loss:
        cv = cross_val_score(regressor,
                             x_train,
                             y_train,
                             n_jobs=-1,
                             scoring=scorer)
    else:
        cv = cross_val_score(regressor, x_train, y_train, n_jobs=-1)

    return np.abs(cv.mean()), cv.std()
Example #27
    def _approximate(self, X, y):
        """Fit a (multi-output) ridge regression approximating y from X."""

        if self.reg is not None:
            regressor = Ridge(alpha=self.reg,
                              solver='auto',
                              normalize=True,
                              tol=1e-10)
        else:
            raise ValueError('`reg` must be set to build the Ridge regressor')

        if self.target == 'multi':
            targets = MultiOutputRegressor(regressor).fit(X, y)
        elif self.target == 'variate':
            targets = regressor.fit(X, y)
        else:
            raise ValueError("`target` must be 'multi' or 'variate'")

        return targets
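For Ridge specifically, the two branches coincide: ridge regression is separable per target, so MultiOutputRegressor fits one model per column yet predicts the same values as a single multi-output Ridge. A small sketch with hypothetical data (plain Ridge defaults, without the normalize option used above):

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor

X, Y = np.random.rand(50, 4), np.random.rand(50, 3)

joint = Ridge(alpha=1.0).fit(X, Y)                          # one shared solve
per_col = MultiOutputRegressor(Ridge(alpha=1.0)).fit(X, Y)  # one Ridge per column

print(np.allclose(joint.predict(X), per_col.predict(X)))    # True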
Example #28
def grant_predictor(onu_id, onu_df, window, predict, features, model, metric):
    index = 0      # window start
    index_max = 0  # prediction end

    # list with metrics of each prediction in different observation windows
    metric_list = []
    reg = MultiOutputRegressor(model)  # instantiate the model wrapper

    while index + window < len(onu_df):
        interval = index + window  # window end position

        df_tmp = onu_df.iloc[index:interval]  # training dataset
        if interval + predict < len(onu_df):  # check that the prediction doesn't overflow the input data
            index_max = interval+predict
        else:
            index_max = len(onu_df)-1

        # check whether the feature set is just the counter, or counter + timestamp
        if len(features) == 1:
            X_pred = np.array(onu_df[features].iloc[interval:index_max]).reshape(-1,1)
            if len(X_pred) == 0:
                break
            # fitting the model
            reg.fit(np.array( df_tmp[features] ).reshape(-1,1) , df_tmp[['start','end']])
        else:
            X_pred = onu_df[features].iloc[interval:index_max]
            if len(X_pred) == 0:
                break
            # fitting the model
            reg.fit(df_tmp[features] , df_tmp[['start','end']])

        # make prediction
        pred = reg.predict(X_pred)
        # real values to compare with prediction
        Y_true = onu_df[['start','end']].iloc[interval:index_max]
        # metric calculation
        metric_list.append(metric(Y_true, pred, multioutput='uniform_average'))

        # shift the observation window forward by `predict` positions
        index += predict

    return metric_list
Example #29
def train(alpha=0.5, delta_e_loss=True):
    # Config is a variable that holds and saves hyperparameters and inputs

    configs = {
        'n_estimators': 100,
        'max_depth': 10,
        'num_leaves': 50,
        'reg_alpha': 0.00001,
        'reg_lambda': 0.00001,
        'subsample': 0.2,
        'colsample_bytree': 0.2,
        'min_child_weight': 0.001,
    }

    # Initialize a new wandb run
    wandb.init(project='colorml', config=configs)

    config = wandb.config

    regressor = MultiOutputRegressor(
        LGBMRegressor(objective='quantile', alpha=alpha, **config))

    if delta_e_loss:
        cv = cross_val_score(regressor,
                             X_train,
                             y_train,
                             n_jobs=5,
                             scoring=scorer,
                             cv=5)
    else:
        cv = cross_val_score(regressor, X_train, y_train, n_jobs=2, cv=5)

    mean = np.abs(cv.mean())
    std = np.abs(cv.std())
    wandb.log({'cv_mean': mean})
    wandb.log({'cv_std': std})

    wandb.run.summary['cv_mean'] = mean
    wandb.run.summary['cv_std'] = std
Example #30
def multioutput_regressor(input, target, input_test, target_test, output):
    # dataset
    X = input
    y = target
    X_test = input_test
    y_test = target_test

    estimator = LinearRegression()
    model = MultiOutputRegressor(estimator)

    # Perform 5-fold cross validation
    #scores = cross_val_score(model, X, y, cv=5)
    #print("Cross-validated scores: ")
    #print(scores)

    # Make cross-validated predictions; keep the estimator fitted on the second fold
    scores = cross_validate(model, X, y, cv=5, return_estimator=True)
    model2 = scores['estimator'][1]

    predictions = model2.predict(X_test)

    # Remove extreme values
    mask = predictions[:, 1] <= 1
    y_test = y_test[mask]
    predictions = predictions[mask]

    accuracy = metrics.r2_score(y_test, predictions)
    print("Cross-Predicted Accuracy: {}".format(accuracy))

    # The line / model
    fig, ax = plt.subplots()
    ax.scatter(y_test[:, 0], y_test[:, 1], color='red', alpha=0.5)
    ax.scatter(predictions[:, 0], predictions[:, 1], color='blue', alpha=0.5)
    ax.set_xlabel('P')
    ax.set_ylabel('Q')
    plt.show()

    np.savetxt(output, predictions)
Example #31
def runBaseLineRegression(model_params, data, estimator):

	#regr = MultiOutputRegressor(sklearn.linear_model.LinearRegression())
	regr = MultiOutputRegressor(estimator)
	#regr = MultiOutputRegressor(sklearn.linear_model.BayesianRidge())
	#regr = MultiOutputRegressor(sklearn.linear_model.Lasso())

	#data
	AP_train,TRP_train = data[0]
	AP_dev,TRP_dev = data[1]

	if model_params["DirectionForward"]:
		X_train,Y_train,X_dev,Y_dev = TRP_train,AP_train,TRP_dev,AP_dev
	else:
		X_train,Y_train,X_dev,Y_dev = AP_train,TRP_train,AP_dev,TRP_dev
		model_params["OutputNames"],model_params["InputNames"] = model_params["InputNames"],model_params["OutputNames"]

	regr.fit(X_train,Y_train)
	Y_dev_pred = regr.predict(X_dev)
	Y_train_pred = regr.predict(X_train)

	if model_params["DirectionForward"]:
		#train
		mse_total_train = customUtils.mse_p(ix=(3, 6), Y_pred=Y_train_pred, Y_true=Y_train)
		#dev
		mse_total_dev = customUtils.mse_p(ix=(3, 6), Y_pred=Y_dev_pred, Y_true=Y_dev)

	else:
		mse_total_train = mse(Y_train, Y_train_pred, multioutput='raw_values')
		mse_total_dev = mse(Y_dev, Y_dev_pred, multioutput='raw_values')

	
	model_location = os.path.join('models',model_params["model_name"] +  '.json')


	with open(os.path.join('model_params',model_params["model_name"] +  '.json'), 'w') as fp:
		json.dump(model_params, fp, sort_keys=True)

	_ = run_eval_base(model_location,dataset = "train",email = model_params["email"])
	_ = run_eval_base(model_location,dataset = "test",email = model_params["email"])
	mse_total = run_eval_base(model_location,dataset = "dev",email = model_params["email"])

	
	return (mse_total_train.tolist(), mse_total_dev.tolist(), mse_total_train.sum(), mse_total_dev.sum())
Example #32
    def _init_gbd(self):
        cv_params = {
            'estimator__n_estimators': [500, 800, 1000, 1600, 2400],
            'estimator__max_depth': [3, 6, 8, 10]
        }
        other_params = {
            'learning_rate': self.learning_rate,
            'n_estimators': 500,
            'max_depth': 5,
            'min_child_weight': 1,
            'seed': 0,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'gamma': 0,
            'reg_alpha': 0,
            'reg_lambda': 1
        }
        self.model = xgb.XGBRegressor(**other_params)
        self.model = MultiOutputRegressor(self.model)
        self.best_model = GridSearchCV(estimator=self.model,
                                       param_grid=cv_params,
                                       scoring='r2',
                                       cv=5,
                                       verbose=2)
Example #33
def test_diff_detector_require_thresholds(require_threshold: bool):
    """
    Should fail if requiring thresholds, but not calling cross_validate
    """
    X = pd.DataFrame(np.random.random((100, 5)))
    y = pd.DataFrame(np.random.random((100, 2)))

    model = DiffBasedAnomalyDetector(
        base_estimator=MultiOutputRegressor(LinearRegression()),
        require_thresholds=require_threshold,
    )

    model.fit(X, y)

    if require_threshold:
        # FAIL: Forgot to call .cross_validate to calculate thresholds.
        with pytest.raises(AttributeError):
            model.anomaly(X, y)

        model.cross_validate(X=X, y=y)
        model.anomaly(X, y)
    else:
        # thresholds not required
        model.anomaly(X, y)
Example #34
def neural_network(num_of_layers, is_multi=False):
    print("getting training data")
    X, usa_gross, _ = get_set("training")
    linear = get_linear_fit(X, usa_gross, [default_alpha_linear])[0]
    training_res = []
    test_res = []
    for _ in range(5):
        X, usa_gross, rating = get_set("training")
        X_picked = pick_needed_features(linear, X)
        # X_picked = X
        net = MLPRegressor(hidden_layer_sizes=(100, )*num_of_layers)
        net = MultiOutputRegressor(net)
        if is_multi:
            net.fit(X_picked, list(zip(usa_gross, rating)))
        else:
            net.fit(X_picked, list(zip(usa_gross)))
        predicts = net.predict(X_picked)
        training_res.append(mean_squared_error(predicts[:, 0], usa_gross))
        X, usa_gross, rating = get_set("validation")
        X_picked = pick_needed_features(linear, X)
        # X_picked = X
        predicts = net.predict(X_picked)
        test_res.append(mean_squared_error(predicts[:, 0], usa_gross))
    return np.mean(training_res), np.std(training_res), np.mean(test_res), np.std(test_res)
def get_model() -> AdaBoostRegressor:
    """
    Full pipeline for getting trained AdaBoostRegressor model
    Returns:
        clf:AdaBoostRegressor
    """
    # Read dataframes and drop excess columns and bad images
    # Special dataframe of our handmarked labels
    drop_columns = ["Unnamed: 0", "Unnamed: 0.1", "Unnamed: 0.1.1"]
    drop_images = [104, 908, 906, 907, 905, 904] + list(range(905, 1000))


    df_augmented = read_data_frame(HAND_MARKED_LABELS, drop_columns, drop_images)
    # precompute stats and brute force (l,r) boundaries
    images_aug_, labels_aug_, imageids_aug_ = precompute_stats(df_augmented)
    imageids_aug_ = df_augmented.ImageId.to_numpy()
    labels_aug_ = np.array(labels_aug_)
    bounds_aug_, train_x_aug_, train_y_aug_ = brute_force_bounds(
        images_aug_, labels_aug_, imageids_aug_
    )
    # images on which we tested model
    test_ids = [776, 675, 42, 3, 714, 312, 127, 653, 592, 205, 179, 191]
    test_indices = np.in1d(df_augmented.ImageId.to_numpy(), test_ids).nonzero()[0]
    # delete test images from train
    deleted_test_x, deleted_test_y = (
        np.delete(train_x_aug_, test_indices, axis=0),
        np.delete(train_y_aug_, test_indices, axis=0),
    )

    x_train_aug = deleted_test_x[:]
    y_train_aug = deleted_test_y[:]
    # train
    clf = MultiOutputRegressor(AdaBoostRegressor(random_state=10, n_estimators=5)).fit(
        x_train_aug, y_train_aug
    )
    return clf
Example #36
File: svr.py Project: a-jd/npsn
    def train_model(self, params):
        '''
        Input a dict, params, containing:
            nu: Float, fraction of support vectors (0,1]
            C: Float, penalty parameter of error (~1.0)
            kernel: String, 'linear', 'poly', 'rbf', 'sigmoid'
            degree: Int, degree of polynomial for poly
            gamma: String, 'scale'/'auto' for 'rbf', 'poly', 'sigmoid'
        Returns:
            Dict containing info on combination
        '''
        kernel = params['kernel']
        nu = params['nu']
        C = params['C']

        # Instantiate SVR
        if kernel in ['linear']:
            model = MOR(NuSVR(C=C, nu=nu, kernel=kernel))
        elif kernel in ['rbf', 'sigmoid']:
            gamma = params['gamma']
            model = MOR(NuSVR(C=C, nu=nu, kernel=kernel, gamma=gamma))
        elif kernel in ['poly']:
            gamma = params['gamma']
            degree = params['degree']
            model = MOR(
                NuSVR(C=C, nu=nu, kernel=kernel, degree=degree, gamma=gamma))

        # Print current combination
        print('Current SVR combination: {}'.format(params))

        # Flat versions of y (power/flux distribution)
        y_tr_fl, y_te_fl = self.flat_y()

        # Fit
        model.fit(self.x_train, y_tr_fl)

        # Hyperopt loss for each combination
        y_predict = model.predict(self.x_test)
        hyp_loss = sklmse(y_te_fl, y_predict)
        self.tr_hist.update_history(params, hyp_loss, model)

        return {'loss': hyp_loss, 'status': STATUS_OK}
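Because train_model returns a hyperopt-style dict ({'loss': ..., 'status': STATUS_OK}), it is meant to be driven by hyperopt's fmin. A hypothetical driver with a stand-in objective of the same contract (the search space below is an assumption mirroring the documented params):

from hyperopt import STATUS_OK, fmin, hp, tpe

def objective(params):
    # stand-in for train_model: build, fit, and score the NuSVR model here
    return {'loss': abs(params['nu'] - 0.5), 'status': STATUS_OK}

space = {
    'kernel': hp.choice('kernel', ['linear', 'rbf', 'sigmoid']),
    'nu': hp.uniform('nu', 0.05, 1.0),
    'C': hp.loguniform('C', -2, 2),
    'gamma': hp.choice('gamma', ['scale', 'auto']),
}
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=25)
print(best)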
Example #37
# Create a random dataset
rng = np.random.RandomState(1)
X = np.sort(200 * rng.rand(600, 1) - 100, axis=0)
y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T
y += (0.5 - rng.rand(*y.shape))

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=400,
                                                    random_state=4)

max_depth = 30
regr_multirf = MultiOutputRegressor(RandomForestRegressor(max_depth=max_depth,
                                                          random_state=0))
regr_multirf.fit(X_train, y_train)

regr_rf = RandomForestRegressor(max_depth=max_depth, random_state=2)
regr_rf.fit(X_train, y_train)

# Predict on new data
y_multirf = regr_multirf.predict(X_test)
y_rf = regr_rf.predict(X_test)

# Plot the results
plt.figure()
s = 50
a = 0.4
plt.scatter(y_test[:, 0], y_test[:, 1], edgecolor='k',
            c="navy", s=s, marker="s", alpha=a, label="Data")
Example #38
feature = "Diabetes"
# get X and y data
train = pd.read_csv("train.csv", delimiter=",")
train = train.drop_duplicates() # ensure no duplicates
y_train = train[feature].to_frame()
names = y_train[feature].unique()
X_train = train.drop(feature, axis=1)
X_names = list(X_train)

# Get test data
test = pd.read_csv("test.csv", delimiter=",")
X_test = test

max_depth = 3
regr_multirf = MultiOutputRegressor(RandomForestRegressor(max_depth=max_depth))
regr_multirf.fit(X_train, y_train)

regr_rf = RandomForestRegressor(n_estimators=20, max_depth=max_depth)
regr_rf.fit(X_train, y_train)

# Predict on new data
y_multirf = regr_multirf.predict(X_test)
y_rf = regr_rf.predict(X_test)

# put predictions into csv
IDs = pd.DataFrame(X_test["ID"])
y_pred = pd.DataFrame(y_multirf)
pred_data = IDs.join(y_pred)
pred_data.columns = ['ID', 'Prediction']
pred_data.to_csv(path_or_buf="prediction_multirf.csv", index=False)