コード例 #1
0
ファイル: test_dummy.py プロジェクト: Aerlinger/scikit-learn
def test_quantile_strategy_multioutput_regressor():
    """Quantile strategy on 2d targets: q=0.5 gives per-column medians,
    q=0.8 gives per-column 80th percentiles."""
    rng = np.random.RandomState(seed=1)

    X_learn = rng.randn(10, 10)
    y_learn = rng.randn(10, 5)

    X_test = rng.randn(20, 10)
    y_test = rng.randn(20, 5)

    # Expected values per quantile, shaped (1, n_outputs).
    expectations = [
        (0.5, np.median(y_learn, axis=0).reshape((1, -1))),
        (0.8, np.percentile(y_learn, axis=0, q=80).reshape((1, -1))),
    ]

    for quantile, expected in expectations:
        # Correctness oracle
        reg = DummyRegressor(strategy="quantile", quantile=quantile)
        reg.fit(X_learn, y_learn)
        _check_equality_regressor(
            expected, y_learn, reg.predict(X_learn),
            y_test, reg.predict(X_test))
        _check_behavior_2d(reg)
コード例 #2
0
ファイル: test_dummy.py プロジェクト: aniryou/scikit-learn
def test_regressor_prediction_independent_of_X(strategy):
    """DummyRegressor ignores X: two different feature matrices trained
    on the same y must yield identical predictions."""
    y = [0, 2, 1, 1]

    results = []
    for feature_value in (0, 1):
        X = [[feature_value]] * 4
        reg = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
        reg.fit(X, y)
        results.append(reg.predict(X))

    assert_array_equal(results[0], results[1])
コード例 #3
0
ファイル: test_dummy.py プロジェクト: Aerlinger/scikit-learn
def test_constant_strategy_regressor():
    """Constant strategy predicts the supplied constant, whether it is
    given as a one-element list or a bare scalar."""
    rng = np.random.RandomState(seed=1)

    X = [[0]] * 5  # features are ignored by DummyRegressor
    y = rng.randn(5)

    for constant in ([43], 43):
        reg = DummyRegressor(strategy="constant", constant=constant)
        reg.fit(X, y)
        assert_array_equal(reg.predict(X), [43] * len(X))
コード例 #4
0
ファイル: test_dummy.py プロジェクト: RONNCC/scikit-learn
def test_regressor():
    """Default strategy (mean) predicts the training-target average."""
    X = [[0]] * 4  # features are ignored
    y = [1, 2, 1, 1]

    reg = DummyRegressor()
    reg.fit(X, y)
    expected = [sum(y) / len(y)] * len(X)  # 5/4 for every sample
    assert_array_equal(reg.predict(X), expected)
コード例 #5
0
def train_classifier():
    """Fit a median DummyRegressor baseline on vectorized video captions.

    Relies on module-level globals: `tfv` (fitted vectorizer),
    `video_captions_train`, `video_captions_test`, `Y_train`.
    """
    features_train = tfv.transform(video_captions_train)
    features_test = tfv.transform(video_captions_test)

    baseline = DummyRegressor(strategy="median")
    baseline.fit(features_train, Y_train)
    Y_pred_med = baseline.predict(features_test)
コード例 #6
0
ファイル: test_dummy.py プロジェクト: aniryou/scikit-learn
def test_dummy_regressor_on_3D_array():
    """DummyRegressor must accept a 3D, even non-numeric, X."""
    X = np.array([[['foo']], [['bar']], [['baz']]])
    y = np.array([2, 2, 2])

    est = DummyRegressor()
    est.fit(X, y)
    assert_array_equal(est.predict(X), np.array([2, 2, 2]))
コード例 #7
0
ファイル: test_dummy.py プロジェクト: NelleV/scikit-learn
def test_dummy_regressor_on_nan_value():
    """DummyRegressor must tolerate NaN feature values (X is ignored)."""
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0.
    X = [[np.nan]]
    y = [1]
    y_expected = [1]
    clf = DummyRegressor()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert_array_equal(y_pred, y_expected)
コード例 #8
0
ファイル: regressor.py プロジェクト: pombredanne/ramp-1
class Regressor(BaseEstimator):
    """Minimal estimator wrapper delegating everything to DummyRegressor."""

    def __init__(self):
        # The wrapped baseline does all the actual work.
        self.clf = DummyRegressor()

    def fit(self, X, y):
        """Train the wrapped dummy regressor."""
        self.clf.fit(X, y)

    def predict(self, X):
        """Return the wrapped dummy regressor's predictions."""
        return self.clf.predict(X)
コード例 #9
0
def test_multioutput_regressor():
    """Mean strategy tiles the per-column training mean over every row."""
    X_learn = np.random.randn(10, 10)
    y_learn = np.random.randn(10, 5)

    column_means = np.mean(y_learn, axis=0).reshape((1, -1))

    X_test = np.random.randn(20, 10)
    y_test = np.random.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor()
    est.fit(X_learn, y_learn)

    for X, y in ((X_learn, y_learn), (X_test, y_test)):
        assert_array_equal(np.tile(column_means, (y.shape[0], 1)),
                           est.predict(X))
    _check_behavior_2d(est)
コード例 #10
0
ファイル: dairyml.py プロジェクト: g-simmons/dairyML
class PerfectClassifierMeanRegressor():
    """Hybrid baseline: an *oracle* zero/non-zero classifier combined with
    a mean-predicting regressor for the non-zero part.

    fit() only stores the data; the DummyRegressor is (re)fitted per fold
    inside cross_val().
    """

    def fit(self,X: pd.DataFrame, y: pd.Series):
        """Store the data and create the mean regressor (no fitting here)."""
        self.X = X
        self.y = y
        self.regressor = DummyRegressor(strategy='mean')

    def cross_val(self,scoring,k=10):
        """Run k-fold cross-validation.

        scoring: dict mapping metric name -> scorer(y_true, y_pred).
        Returns a dict of per-fold score lists keyed 'train_<name>' /
        'test_<name>'.
        """
        self.scores = {}

        # Initialise one empty score list per (split, metric) pair.
        for name, scorer in scoring.items():
            for split in ['train','test']:
                self.scores[split+'_'+name] = []

        splitter = KFold(n_splits=k,shuffle=True,random_state=7)   
        for train_index, test_index in splitter.split(self.X,self.y):
            
            X_train = self.X.values[train_index]
            y_train = self.y.values[train_index]
            
            X_test = self.X.values[test_index]
            y_test = self.y.values[test_index]
            
            # get test y class labels for perfect classification
            y_test_binary = (y_test != 0)
            y_train_binary = (y_train != 0)

            # Refit the mean regressor on this fold's training targets.
            self.regressor.fit(X_train,y_train.reshape(-1,1))
            
            reg_pred_test = self.regressor.predict(X_test).flatten()
            reg_pred_train = self.regressor.predict(X_train).flatten()

            # Oracle classifier: zero out predictions wherever the true
            # label is zero (element-wise multiply with the 0/1 mask).
            y_pred_test = np.multiply(y_test_binary,reg_pred_test)
            y_pred_train = np.multiply(y_train_binary,reg_pred_train)

            for name, scorer in scoring.items():
                self.scores['test_'+name].append(scorer(y_test,y_pred_test))
                self.scores['train_'+name].append(scorer(y_train,y_pred_train))

                
        return self.scores
    
    def get_params(self):
        """Expose the underlying DummyRegressor's parameters."""
        return(self.regressor.get_params())
コード例 #11
0
def test_mean_strategy_multioutput_regressor():
    """Default (mean) strategy matches per-column training means."""
    rng = np.random.RandomState(seed=1)

    X_learn = rng.randn(10, 10)
    y_learn = rng.randn(10, 5)
    expected_mean = np.mean(y_learn, axis=0).reshape((1, -1))

    X_test = rng.randn(20, 10)
    y_test = rng.randn(20, 5)

    # Correctness oracle
    reg = DummyRegressor()
    reg.fit(X_learn, y_learn)
    _check_equality_regressor(
        expected_mean, y_learn, reg.predict(X_learn),
        y_test, reg.predict(X_test))
    _check_behavior_2d(reg)
コード例 #12
0
ファイル: test_dummy.py プロジェクト: RONNCC/scikit-learn
def test_multioutput_regressor():
    """Mean baseline reproduces per-column training means on 2d targets."""
    X_learn = np.random.randn(10, 10)
    y_learn = np.random.randn(10, 5)

    col_mean = np.mean(y_learn, axis=0).reshape((1, -1))

    X_test = np.random.randn(20, 10)
    y_test = np.random.randn(20, 5)

    # Correctness oracle
    model = DummyRegressor()
    model.fit(X_learn, y_learn)
    learn_pred = model.predict(X_learn)
    test_pred = model.predict(X_test)

    assert_array_equal(np.tile(col_mean, (len(y_learn), 1)), learn_pred)
    assert_array_equal(np.tile(col_mean, (len(y_test), 1)), test_pred)
    _check_behavior_2d(model)
コード例 #13
0
ファイル: test_dummy.py プロジェクト: Aerlinger/scikit-learn
def test_median_strategy_regressor():
    """Median strategy predicts np.median(y) for every row of X."""
    rng = np.random.RandomState(seed=1)

    X = [[0]] * 5  # features are ignored
    y = rng.randn(5)

    reg = DummyRegressor(strategy="median")
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), np.repeat(np.median(y), len(X)))
コード例 #14
0
ファイル: baselines.py プロジェクト: ardunn/latmats
class DummyEstimator(BaseTesterEstimator):
    """Mean-predicting baseline conforming to the BaseTesterEstimator API."""

    def __init__(self):
        # Delegate all behavior to scikit-learn's dummy model.
        self.regressor = DummyRegressor()

    def fit(self, x, y):
        """Fit the wrapped dummy regressor."""
        self.regressor.fit(x, y)

    def predict(self, x):
        """Predict using the wrapped dummy regressor."""
        return self.regressor.predict(x)
コード例 #15
0
ファイル: test_dummy.py プロジェクト: rpuegue/CoursA61
def test_mean_strategy_regressor():
    """Default strategy predicts the (seeded) training mean everywhere."""
    rng = np.random.RandomState(seed=1)

    X = [[0]] * 4  # features are ignored
    y = rng.randn(4)

    reg = DummyRegressor()
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), np.full(len(X), np.mean(y)))
コード例 #16
0
ファイル: test_dummy.py プロジェクト: aniryou/scikit-learn
def test_dummy_regressor_return_std():
    """predict(return_std=True) yields a (mean, std) pair; constant
    targets give zero std."""
    X = [[0]] * 3  # features are ignored
    y = np.array([2, 2, 2])

    est = DummyRegressor()
    est.fit(X, y)
    result = est.predict(X, return_std=True)
    # Two elements come back when return_std is True.
    assert_equal(len(result), 2)
    # Constant targets imply zero predictive std.
    assert_array_equal(result[1], np.array([0, 0, 0]))
コード例 #17
0
ファイル: test_dummy.py プロジェクト: Asgardian8740/Django
def test_dummy_regressor_return_std():
    """predict(return_std=True) returns a pair whose std part is all zero
    when the training targets are constant."""
    X = [[0]] * 3  # features are ignored
    y = np.array([2, 2, 2])

    est = DummyRegressor()
    est.fit(X, y)
    returned = est.predict(X, return_std=True)
    # there should be two elements when return_std is True
    assert len(returned) == 2
    # the second element should be all zeros
    assert_array_equal(returned[1], np.zeros(3))
コード例 #18
0
ファイル: test_dummy.py プロジェクト: Aerlinger/scikit-learn
def test_mean_strategy_multioutput_regressor():
    """Mean strategy on multioutput targets: per-column training means."""
    rng = np.random.RandomState(seed=1)

    X_train, y_train = rng.randn(10, 10), rng.randn(10, 5)
    train_mean = np.mean(y_train, axis=0).reshape((1, -1))
    X_test, y_test = rng.randn(20, 10), rng.randn(20, 5)

    # Correctness oracle
    model = DummyRegressor()
    model.fit(X_train, y_train)
    _check_equality_regressor(
        train_mean, y_train, model.predict(X_train),
        y_test, model.predict(X_test))
    _check_behavior_2d(model)
コード例 #19
0
def test_quantile_strategy_regressor():
    """Quantile strategy: 0.5->median, 0->min, 1->max, 0.3->30th pct."""
    rng = np.random.RandomState(seed=1)

    X = [[0]] * 5  # features are ignored
    y = rng.randn(5)

    cases = [
        (0.5, np.median(y)),
        (0, np.min(y)),
        (1, np.max(y)),
        (0.3, np.percentile(y, q=30)),
    ]
    for quantile, expected in cases:
        reg = DummyRegressor(strategy="quantile", quantile=quantile)
        reg.fit(X, y)
        assert_array_equal(reg.predict(X), [expected] * len(X))
コード例 #20
0
def test_constant_strategy_multioutput_regressor():
    """Constant strategy with a per-output constant vector (2d targets)."""
    rng = np.random.RandomState(seed=1)

    X_learn = rng.randn(10, 10)
    y_learn = rng.randn(10, 5)

    # test with 2d array: one constant per output column
    constants = rng.randn(5)

    X_test = rng.randn(20, 10)
    y_test = rng.randn(20, 5)

    # Correctness oracle
    reg = DummyRegressor(strategy="constant", constant=constants)
    reg.fit(X_learn, y_learn)
    _check_equality_regressor(
        constants, y_learn, reg.predict(X_learn),
        y_test, reg.predict(X_test))
    _check_behavior_2d_for_constant(reg)
コード例 #21
0
def test_constant_strategy_multioutput_regressor():
    """Constant strategy with a vector constant on multioutput targets."""
    rng = np.random.RandomState(seed=1)

    X_fit, y_fit = rng.randn(10, 10), rng.randn(10, 5)
    # test with 2d array: one constant per output
    const_vector = rng.randn(5)
    X_eval, y_eval = rng.randn(20, 10), rng.randn(20, 5)

    # Correctness oracle
    model = DummyRegressor(strategy="constant", constant=const_vector)
    model.fit(X_fit, y_fit)
    _check_equality_regressor(
        const_vector, y_fit, model.predict(X_fit),
        y_eval, model.predict(X_eval))
    _check_behavior_2d_for_constant(model)
コード例 #22
0
ファイル: test_dummy.py プロジェクト: Aerlinger/scikit-learn
def test_quantile_strategy_regressor():
    """quantile=0.5/0/1/0.3 reproduce median/min/max/30th percentile."""
    rng = np.random.RandomState(seed=1)

    X = [[0]] * 5  # features are ignored
    y = rng.randn(5)

    def fit_predict(q):
        # Helper: fit a fresh quantile regressor and predict on X.
        model = DummyRegressor(strategy="quantile", quantile=q)
        model.fit(X, y)
        return model.predict(X)

    assert_array_equal(fit_predict(0.5), [np.median(y)] * len(X))
    assert_array_equal(fit_predict(0), [np.min(y)] * len(X))
    assert_array_equal(fit_predict(1), [np.max(y)] * len(X))
    assert_array_equal(fit_predict(0.3), [np.percentile(y, q=30)] * len(X))
コード例 #23
0
class Mean:
    """Finds the mean of all the days in the history and uses them as the prediction"""

    def __init__(self, batch_size=1):
        self.batch_size = batch_size
        self.regressor_ = DummyRegressor(strategy="mean")

    def fit(self, X, y=None):
        """Fit the underlying mean regressor."""
        self.regressor_.fit(X, y)

    def predict(self, X):
        """Predict the mean, snapped to the nearest batch_size multiple,
        with a trailing axis added to make the output column-shaped."""
        raw = self.regressor_.predict(X)
        snapped = np.round(raw / self.batch_size) * self.batch_size
        return np.expand_dims(snapped, -1)
コード例 #24
0
def get_mean_reg(trait_name, metric_name, x, y):
    """Fit and evaluate a mean-predicting baseline on a 70/30 split."""
    from sklearn.dummy import DummyRegressor

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=0)

    # Baseline regressor: always predicts the training mean.
    baseline = DummyRegressor(strategy='mean')
    baseline.fit(x_train, y_train)

    evaluate(y_test, baseline.predict(x_test), "Baseline Regressor")
    return baseline
コード例 #25
0
def naive_model_compare_r2(X_tr, y_tr, X_te, y_te, y_pr):
    """Print the model's r2 next to random-noise and dummy baselines."""
    # Model
    print('--- model: {:.3}'.format(metrics.r2_score(y_te, y_pr)))
    # normal random distribution
    noise = np.random.normal(0, 1, y_pr.shape)
    print('--- normal random distribution: {:.3}'\
          .format(metrics.r2_score(y_te, noise)))
    # dummy regressors
    for strategy in ('mean', 'median'):
        baseline = DummyRegressor(strategy=strategy).fit(X_tr, y_tr)
        print('--- dummy regressor ('+ strategy +') : r2_score={:.3}'\
              .format(metrics.r2_score(y_te, baseline.predict(X_te))))
コード例 #26
0
class SVMTotal:
    """Wraps one of: an SVR model, a mean/median DummyRegressor, or a
    per-raisha group-mean baseline, selected by `model_name`."""

    def __init__(self, features, model_name, kernel: str=None, degree: int=None):
        # Select the underlying model by (case-insensitive) name match.
        if 'svm' in str.lower(model_name):
            self.model = SVR(gamma='scale', kernel=kernel, degree=degree)
        elif 'average' in str.lower(model_name):
            self.model = DummyRegressor(strategy='mean')
        elif 'median' in str.lower(model_name):
            self.model = DummyRegressor(strategy='median')
        elif 'per_raisha_baseline' in str.lower(model_name):
            # Baseline predicting the mean label within each raisha group;
            # the lookup table is built in fit().
            self.per_raisha = None
        else:
            logging.error('Model name not in: svm, average, median')
            print('Model name not in: svm, average, median')
            raise Exception('Model name not in: svm, average, median')
        self.features = features
        self.model_name = model_name

    def fit(self, train_x: pd.DataFrame, train_y: pd.Series):
        """Fit the wrapped model, or precompute per-raisha label means."""
        if 'per_raisha_baseline' in str.lower(self.model_name):
            train_y.name = 'labels'
            # Join labels onto features so we can group by the raisha column.
            train_x = train_x.merge(train_y, right_index=True, left_index=True)
            self.per_raisha = pd.DataFrame(train_x.groupby(by='raisha').labels.mean())
            self.per_raisha.columns = ['predictions']
        else:
            train_x = train_x[self.features]
            self.model = self.model.fit(train_x, train_y)

    def predict(self, validation_x: pd.DataFrame, validation_y: pd.Series):
        """Predict and return a DataFrame joining predictions, labels and
        binned versions of both (bins support F-score reporting)."""
        if 'per_raisha_baseline' in str.lower(self.model_name):
            # Look up each row's raisha group mean from the fit-time table.
            validation_x = validation_x.merge(self.per_raisha, left_on='raisha', right_index=True)
            validation_x.index = validation_x.sample_id
            predictions = validation_x.predictions
        else:
            validation_x = validation_x[self.features]
            predictions = self.model.predict(validation_x)
        validation_y.name = 'labels'
        predictions = pd.Series(predictions, index=validation_y.index, name='predictions')
        if predictions.dtype == float:  # regression- create bins to measure the F-score
            bin_prediction, bin_test_y = utils.create_bin_columns(predictions, validation_y)
            four_bin_prediction, four_bin_test_y = utils.create_4_bin_columns(predictions, validation_y)
        else:
            # Non-float predictions: emit empty bin columns with the
            # expected names so the joins below still work.
            bin_prediction, bin_test_y = pd.Series(name='bin_prediction'), pd.Series(name='bin_label')
            four_bin_prediction, four_bin_test_y =\
                pd.Series(name='four_bin_prediction'), pd.Series(name='four_bin_label')

        predictions = pd.DataFrame(predictions).join(validation_y).join(bin_test_y).join(bin_prediction)
        predictions = predictions.join(four_bin_test_y).join(four_bin_prediction)

        return predictions
コード例 #27
0
def run_dummy_regressor(train_embeds,
                        train_targets,
                        test_embeds,
                        test_targets,
                        scaler=None):
    """Fit a median-strategy DummyRegressor baseline and print its RMSE,
    MAPE and R2 on the test set.

    If `scaler` is given, the RMSE is mapped back to the original target
    scale via scaler.inverse_transform.
    """
    dummy = DummyRegressor(strategy="median")
    dummy.fit(train_embeds, train_targets)

    # Predict once and reuse for all three metrics (the original called
    # predict three times and duplicated the RMSE code in both branches).
    preds = dummy.predict(test_embeds)

    rmse = mean_squared_error(test_targets, preds, squared=False)
    if scaler is not None:  # `is None`, not `== None`, for identity checks
        rmse = scaler.inverse_transform(np.array(rmse).reshape(1, -1))[0][0]
    print('Dummy regressor RMSE:', rmse)
    print(
        'Dummy regressor MAPE:',
        mean_absolute_percentage_error(test_targets, preds))
    print('Dummy regressor R2:',
          r2_score(test_targets, preds))
コード例 #28
0
def dummy_regression(x_train, y_train, x_test, y_test, strategy):
    """Fit a DummyRegressor baseline and return (predictions, y_test).

    Regression strategies :

    case 1 : y_pred_random = np.random.randint(np.min(y), np.max(y), y_test.shape)
    case 2 : strategy is 'mean' or 'median'
    
    cf :
    - https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html
    
    """
    # Pass strategy by keyword: scikit-learn estimator constructors take
    # keyword-only parameters, so the original positional call fails on
    # modern versions.
    dum = DummyRegressor(strategy=strategy)
    dum.fit(x_train, y_train)
    y_predicted_dum = dum.predict(x_test)
    # np.sqrt(metrics.mean_squared_error(y_test, y_predicted_dum)) RMSE
    return y_predicted_dum, y_test
コード例 #29
0
class r07546035_DummyRegression(regression):
    """DummyRegressor wrapper for the course `regression` framework."""

    def trainAlgo(self):
        """Build the DummyRegressor from self.param and fit it on the
        stored input/output data."""
        self.model = DummyRegressor(strategy=self.param['strategy'],
                                    quantile=self.param['quantile'])

        self.model.fit(self.inputData['X'],
                       self.outputData['y'],
                       sample_weight=None)

    def predictAlgo(self):
        """Store predictions for the training inputs in self.result."""
        self.result['y'] = self.model.predict(self.inputData['X'])

    def get_paramsAlgo(self):
        """Return the fitted model's parameters.

        Bug fix: the original called self.model.get_params(self), which
        passed the wrapper instance as get_params' `deep` argument and
        discarded the result entirely.
        """
        return self.model.get_params()
コード例 #30
0
ファイル: dnn.py プロジェクト: DLR-SC/haicu-ansim
    def dummy_train_test(self, strategy='mean'):
        """Train a DummyRegressor baseline and return test-set MSE/MAE."""
        clf = DummyRegressor(strategy=strategy)
        '''
            “mean”: always predicts the mean of the training set
        
            “median”: always predicts the median of the training set
        
            “quantile”: always predicts a specified quantile of the training set, provided with the quantile parameter.
        
            “constant”: always predicts a constant value that is provided by the user.
        '''

        clf.fit(self.X_train, self.y_train)
        predictions = clf.predict(self.X_test)

        # Report both metrics rounded to 4 decimal places.
        return {
            'mse': round(mean_squared_error(y_pred=predictions, y_true=self.y_test), 4),
            'mae': round(mean_absolute_error(y_pred=predictions, y_true=self.y_test), 4),
        }
コード例 #31
0
ファイル: simbo_general.py プロジェクト: diogo149/simbo
def _minimize_simbo_general(fun,
                            x0,  # only used to get number of features
                            args=(),
                            callback=None,
                            batch_size=100,
                            population_size=10000,
                            maxiter=10000,
                            scorer=None, # if no scorer given, scores are constant
                            selector=None, # only relevant is sampler is given
                            sampler=None):
    """Surrogate-assisted optimization loop.

    Each iteration: sample `population_size` candidates from `sampler`,
    rank them with the `scorer` surrogate, evaluate `fun` on the
    `batch_size` lowest-scored candidates, then refit `scorer` on the
    results and `sampler` on the selected subset.
    """
    # Spread the evaluation budget over fixed-size batches.
    n_iter = int(maxiter / batch_size)
    assert n_iter > 0

    dummy_generator = generative_models.DummyGenerator(len(x0))

    # Fall back to constant scores / blind sampling when not provided.
    if scorer is None:
        scorer = DummyRegressor()
    if sampler is None:
        sampler = dummy_generator

    # A float in (0, 1) is shorthand for a percentile-based selector.
    if isinstance(selector, float) and 0 < selector < 1:
        selector = percentile_selector(selector)

    for i in range(n_iter):
        if i == 0:
            # First batch: nothing to score yet, sample blindly.
            batch = dummy_generator.sample(batch_size)
        else:
            # Keep the batch_size candidates with the lowest predicted score.
            population = sampler.sample(population_size)
            scores = scorer.predict(population)
            batch_w_score = heapq.nsmallest(batch_size, zip(scores, population),
                                            key=lambda x: x[0])
            batch = [v for score, v in batch_w_score]
        results = optimize_utils.score_multi(fun, batch, args, callback)
        selected = selector(results, batch) if selector is not None else batch
        # Refit surrogate on all evaluated points; sampler on the selection.
        scorer.fit(batch, results)
        sampler.fit(selected)

    # NOTE(review): the best point is taken over the *last* batch only and
    # by max, despite the function's "minimize" name — confirm intended.
    best_fval, best_x = max(zip(results, batch), key=lambda x: x[0])
    nfev = batch_size * n_iter
    return optimize_utils.to_result(x=best_x, fun=best_fval,
                                    niter=n_iter, nfev=nfev)
コード例 #32
0
def testLinearRegression(x, y):
    """Compare linear-regression MSE against k-fold, dummy-baseline and
    80:20-split variants, then print all four."""
    # Mean for Linear Regression model using all of the data for training.
    full_model = LinearRegression().fit(x, y)
    lgError = mean_squared_error(y, full_model.predict(x))

    # Average mean when using k-folds for training and testing.
    kFoldError = kFoldLinearRegression(x, y)

    # Mean when using dummy model with all of the data used for training.
    baseline = DummyRegressor(strategy="mean").fit(X=x, y=y)
    dummyError = mean_squared_error(y, baseline.predict(x))

    # Mean when using an 80:20 train:test split for Linear Regression.
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    split_model = LinearRegression().fit(X_train, y_train)
    splitError = mean_squared_error(y_test, split_model.predict(X_test))

    print("LG k-fold mean squared error: %f, baseline square error: %f, LG not-folded error: %f, LG 80:20 error: %f"
    %(kFoldError, dummyError, lgError, splitError))
コード例 #33
0
def do_polynomial_reg(type_id, model_class, degree, alpha=None):
    """Cross-validate `model_class` on polynomial features of an item's
    market matrix and compare its MSE against a mean-predicting dummy.

    Parameters
    ----------
    type_id : item type id used to fetch the price matrix
    model_class : sklearn linear-model class (e.g. Ridge, Lasso)
    degree : polynomial feature degree
    alpha : optional regularization strength forwarded to the model
    """
    matrix = access.item_matrix(type_id)  # item id of abyssal magstab
    data = np.array(matrix)
    x = data[:, 1:]
    y = data[:, 0]

    kf = KFold(n_splits=5)
    mean_error = []
    dummy_mses = []

    Xpoly = PolynomialFeatures(degree=int(degree)).fit_transform(x)
    # NOTE(review): `normalize=` was deprecated and later removed from
    # scikit-learn linear models — confirm the pinned sklearn version
    # still accepts it before upgrading.
    if alpha is not None:
        model = model_class(normalize=True, alpha=alpha)
    else:
        model = model_class(normalize=True)

    for train, test in kf.split(Xpoly):
        model.fit(Xpoly[train], y[train])
        ypred = model.predict(Xpoly[test])

        dummy_model = DummyRegressor(strategy='mean')
        dummy_model.fit(Xpoly[train], y[train])
        dummy_pred = dummy_model.predict(Xpoly[test])

        # Compute each fold's MSE once. (The original also filled an
        # unused `temp` list with a duplicate of the same computation.)
        mean_error.append(mean_squared_error(y[test], ypred))
        dummy_mses.append(mean_squared_error(y[test], dummy_pred))

    mse = np.array(mean_error).mean()
    dummy_mse = np.array(dummy_mses).mean()

    print(
        f'{model_class.__name__} w/ Polynomial Features Mean Squared Error: {mse}'
    )
    print(f'Dummy Classifier (Mean) Mean Squared Error: {dummy_mse}')
    print(f'Percentage Difference: {((mse - dummy_mse)/mse) * 100}%')
コード例 #34
0
def main():
    """End-to-end pipeline: parse Yelp reviews (2008+), build TF-IDF text
    features, derive a year-normalized vote-based quality label, then train
    and compare several regressors against a mean dummy baseline."""

    # read review data
    print('parsing review data...')
    reviews = parse_json('./yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json')

    # use only reviews posted after 2008
    valid_reviews = []
    for review in reviews:
        review_date = datetime.datetime.strptime(review['date'], '%Y-%m-%d')
        if review_date.year < 2008: 
            continue
        valid_reviews.append(review)
    reviews = valid_reviews

    # sample the data
    # sample_num = len(reviews)
    # print('sampling...', sample_num, 'out of', len(reviews))
    # reviews = sample(reviews, sample_num)

    # tokenize text for all reviews
    print('tokenizing text for all reviews...')
    texts = [review['text'] for review in reviews]
    count_vect = CountVectorizer(max_features = 100)
    X = count_vect.fit_transform(texts)

    # transform from occurrence to frequency
    print('converting occurrence to frequency...')
    tfidf_transformer = TfidfTransformer()
    X = tfidf_transformer.fit_transform(X)

    # load the linear model for normalization
    clf = joblib.load('./normalization/linear_model_for_normalization.pkl')

    # get labels
    print('calculating labels...')
    y = []
    for review in reviews:
        review_date = datetime.datetime.strptime(review['date'], '%Y-%m-%d')
        # normalize: divide total votes by the year-predicted baseline so
        # older reviews (with more time to accrue votes) are comparable
        normalizor = clf.predict(np.array([[review_date.year]]))[0][0]
        review_quality = sum(review['votes'].values()) / normalizor
        y.append(review_quality)

    # splitting into train and test set (60/40, no shuffling)
    print('splitting into train and test set...')
    train_len = int(X.shape[0] * 0.6)
    X_train = X[:train_len, :]
    y_train = y[:train_len]
    X_test = X[train_len:, :]
    y_test = y[train_len:]
    print('train size:', X_train.shape)
    print('test size:', X_test.shape)

    # convert to polynomial features
    # print('converting to polynomial features...')
    # poly = PolynomialFeatures(2)
    # X_train = poly.fit_transform(X_train.toarray())
    # X_test = poly.fit_transform(X_test.toarray())
    # print('train set: ', X_train.shape)
    # print('test set: ', X_test.shape)

    # scale the attributes to [0, 1]
    print('standardizing the features...')
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # training classifiers
    print('training, predicting and evaluating...')

    # Dummy Regression (baseline model)
    print('\nDummy Regression:')
    model = DummyRegressor(strategy='mean')
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Linear Regression
    print('\nLinear_regression: ')
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Ridge
    print('\nRidge: ')
    model = Ridge()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # passive aggresive
    print('\nPoly: ')
    model = PassiveAggressiveRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # AdaBoost
    print('\nAdaBoost: ')
    model = AdaBoostRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Random Forest
    print('\nRandom Forest:')
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))
コード例 #35
0
# Predict the math grade (NU_NOTA_MT) from the other four exam grades.
grades_in = ['NU_NOTA_CH','NU_NOTA_LC', 'NU_NOTA_CN','NU_NOTA_REDACAO']
grade_out = 'NU_NOTA_MT'

# Drop rows with missing grades before splitting.
data_noNA = data_no0[tests].dropna()
x = data_noNA[grades_in]
y = data_noNA[grade_out]

seed = 4321
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=seed)

# Three models: linear SVR, kernel SVR, and a mean-predicting dummy baseline.
model_linear = LinearSVR(random_state=seed)
model_linear.fit(x_train, y_train)
predictions_linear = model_linear.predict(x_test)

model_svr = SVR()
model_svr.fit(x_train, y_train)
predictions_svr = model_svr.predict(x_test)

model_dummy = DummyRegressor()
model_dummy.fit(x_train, y_train)
predictions_dummy = model_dummy.predict(x_test)

# RMSE (square root of MSE) for each model.
linear = mean_squared_error(y_test, predictions_linear)**(1/2)
svr = mean_squared_error(y_test, predictions_svr)**(1/2)
dummy = mean_squared_error(y_test, predictions_dummy)**(1/2)
print(f'-=- Mean error of models -=-\nlinear: {linear}, svr: {svr}, dummy: {dummy}')

# R2 scores, reusing the same predictions (variables are rebound).
linear = r2_score(y_test, predictions_linear)
svr = r2_score(y_test, predictions_svr)
dummy = r2_score(y_test, predictions_dummy)
print(f'-=- R2 of models -=-\nlinear: {linear}, svr: {svr}, dummy: {dummy}')
コード例 #36
0
    pls = PLSRegression(n_components=ncomp)
    dumb = DummyRegressor(strategy='mean')

    mae = 0
    dumb_mae = 0
    for oidx, (train, test) in enumerate(cv):
        X_fmri_train = X_fmri[train]
        X_fmri_test = X_fmri[test]
        X_meg_train = X_meg[train]
        X_meg_test = X_meg[test]
        
        pls.fit(X_fmri_train, X_meg_train)
        pred = pls.predict(X_fmri_test)

        mae += mean_absolute_error(X_meg_test, pred)

        dumb.fit(X_fmri_train, X_meg_train)
        dumb_pred = dumb.predict(X_fmri_test)
        dumb_mae += mean_absolute_error(X_meg_test,dumb_pred)

    comp_scores.append(mae/nfolds)
    dumb_scores.append(dumb_mae/nfolds)

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
plt.plot(max_comps,comp_scores,max_comps,dumb_scores)
t_str = seed + str(band)
plt.title(t_str)
plt.savefig(home+'/tmp/meg_fmri_%s_%s.png'%(seed,band[0]))
コード例 #37
0
        X_fmri_test = X_fmri[test]
        X_meg_train = X_meg[train]
        X_meg_test = X_meg[test]
        y_train = y[train]
        y_test = y[test]

        X_train = np.hstack([X_fmri_train,X_meg_train])
        X_test = np.hstack([X_fmri_test,X_meg_test])
        
        pls.fit(X_train, y_train)
        pred = pls.predict(X_test)

        mae += mean_absolute_error(y_test, pred)

        dumb.fit(X_train, y_train)
        dumb_pred = dumb.predict(X_test)
        dumb_mae += mean_absolute_error(y_test,dumb_pred)

        if within:
            pls.fit(X_fmri_train, y_train)
            pred = pls.predict(X_fmri_test)
            fmri_mae += mean_absolute_error(y_test, pred)

            pls.fit(X_meg_train, y_train)
            pred = pls.predict(X_meg_test)
            meg_mae += mean_absolute_error(y_test, pred)

    comp_scores.append(mae/nfolds)
    dumb_scores.append(dumb_mae/nfolds)
    fmri_scores.append(fmri_mae/nfolds)
    meg_scores.append(meg_mae/nfolds)
コード例 #38
0
def _evaluate(header, model, X_train, y_train, X_test, y_test):
    """Fit `model` on the train split, predict the test split, and print
    its mean absolute error and R^2 score under the given header."""
    print(header)
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))


def main():
    """Train and compare several regressors against a mean-predicting baseline.

    Loads the train/test CSVs with `parse`, min-max scales the features to
    [0, 1], then fits each candidate model and prints MAE and R^2 on the
    test split.
    """
    # load training and testing data set
    print('parsing training set...')
    X_train, y_train = parse('./data_set/train_set.csv')
    print('parsing testing set...')
    X_test, y_test = parse('./data_set/test_set.csv')
    print('train set: ', X_train.shape)
    print('test set: ', X_test.shape)

    # NOTE: polynomial features (PolynomialFeatures(2)) were tried and gave
    # worse results, so they are intentionally not used here.

    # scale the attributes to [0, 1]; fit the scaler on the training data
    # only, so no test-set statistics leak into the transform
    print('standardizing the features...')
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # training classifiers
    print('training, predicting and evaluating...')

    # One fit/predict/report cycle per model (replaces five copy-pasted
    # blocks with identical structure).
    _evaluate('\nDummy Regression: (baseline)',
              DummyRegressor(strategy='mean'),
              X_train, y_train, X_test, y_test)
    _evaluate('\nLinear_regression: ', LinearRegression(),
              X_train, y_train, X_test, y_test)
    # NOTE: KNeighborsRegressor and BernoulliRBM were tried earlier and
    # excluded from the comparison.
    _evaluate('\nAdaBoost: ', AdaBoostRegressor(),
              X_train, y_train, X_test, y_test)
    _evaluate('\nRandom Forest:', RandomForestRegressor(),
              X_train, y_train, X_test, y_test)
コード例 #39
0
import csv
import pickle

from sklearn.dummy import DummyRegressor
import numpy as np

# Normalisation constant for the age feature and a numeric gender encoding.
AGE_RANGE = 80
GENDER_CODE = {'male': 0, 'other': 0.5, 'female': 1}

# Two training samples: (normalised age, encoded gender) -> target score.
features = np.array([
    [20 / AGE_RANGE, GENDER_CODE['male']],
    [56 / AGE_RANGE, GENDER_CODE['other']],
])
targets = np.array([[.2], [.7]])

# DummyRegressor defaults to the 'mean' strategy, so every prediction is
# simply the mean of the training targets.
model = DummyRegressor()
model.fit(features, targets)

print(targets)
print(model.predict([[0.2, 1]]))

# Persist the fitted baseline for later reuse.
with open('model.pk', 'wb') as outfile:
    pickle.dump(model, outfile)
コード例 #40
0
# NOTE(review): notebook-style fragment — `train_season`, `X_train_s`,
# `y_train_s` and the imports (mean_squared_error, DummyRegressor, sm,
# RandomForestRegressor) come from earlier cells outside this chunk.
X_test_s = test_season.drop('GAME_TOTAL', axis = 1).to_numpy()
y_test_s = test_season['GAME_TOTAL'].to_numpy()
Test_Vegas = test_season['TOTAL_CLOSE'].to_numpy()
Train_Vegas = train_season['TOTAL_CLOSE'].to_numpy()

#Vegas BASELINE = 17.650007402704748 
# RMSE (squared=False) of the Vegas closing totals against the actual totals.
mean_squared_error(np.append(y_train_s,y_test_s), np.append(Train_Vegas,Test_Vegas), squared = False)

#DUMMY REGRESSOR:

# Mean-predicting baseline. score() below is R^2; the mean_squared_error
# call after it is the RMSE. Inline comments record values seen in runs.
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train_s, y_train_s)
#-0.7833193001644205
dummy_regr.score(X_test_s, y_test_s)
#27.845427872989156
mean_squared_error(y_test_s, dummy_regr.predict(X_test_s), squared = False)

#OLS
# NOTE(review): sm.OLS does not add an intercept automatically — confirm a
# constant column is present in X_train_s if one is intended.
regressor = sm.OLS(y_train_s, X_train_s)
regressor = regressor.fit()
#evidently this returned a 0.991 R**2
#second run gave us 0.993
regressor.summary()
preds = regressor.predict(X_test_s)
#18.5802074596655
mean_squared_error(y_test_s, preds, squared = False)

#RANDOM FOREST
# oob_score=True records an out-of-bag R^2 estimate on the fitted model.
rf = RandomForestRegressor(oob_score=True)
rf.fit(X_train_s,y_train_s)
#0.23057109964613554
from pathlib import Path
# NOTE(review): fragment — `sys`, `pd`, `r2_score`, `mean_squared_error`,
# `joblib` and `DummyRegressor` are imported in lines above this chunk.
sys.path.append('/home/jiajunb/prosocial-conversations')
from models import XGBOOST_FEATURES, EIGENMETRICS

ROOT_DIR = Path('/shared/0/projects/prosocial/data/finalized/')
train_df = pd.read_csv(ROOT_DIR / 'data_cache/lr_or_xgboost/train.tsv',
                       sep='\t',
                       usecols=XGBOOST_FEATURES + EIGENMETRICS)

# Mean-predicting baseline. reshape(-1) flattens the target to 1-D, which
# assumes EIGENMETRICS selects a single column — TODO confirm.
train_X = train_df[XGBOOST_FEATURES].values
train_y = train_df[EIGENMETRICS].values.reshape(-1)
dummy_clf = DummyRegressor(strategy="mean")
dummy_clf.fit(train_X, train_y)

# on training set
train_preds = dummy_clf.predict(train_X)
print(f'R^2 on training set: {r2_score(train_y, train_preds)}')
print(f'MSELoss on training set: {mean_squared_error(train_preds, train_y)}')

# Persist the fitted baseline next to the other model checkpoints.
output_path = ROOT_DIR / 'model_checkpoints/dummy'
output_path.mkdir(exist_ok=True, parents=True)
joblib.dump(dummy_clf, output_path / 'dummy.model.buffer')

test_df = pd.read_csv(ROOT_DIR / 'data_cache/lr_or_xgboost/test.tsv',
                      sep='\t',
                      usecols=XGBOOST_FEATURES + EIGENMETRICS)
test_X = test_df[XGBOOST_FEATURES].values
test_y = test_df[EIGENMETRICS].values  # NOTE(review): not reshaped like train_y — confirm intended

# on test set
test_preds = dummy_clf.predict(test_X)
コード例 #42
0
O MSE, sigla em inglês para essa métrica, é uma medida em que, quanto mais perto de zero, melhor. Veja o resultado quando calculamos o MSE de dois vetores iguais:
"""

mean_squared_error(y_teste, y_teste)

"""Nosso resultado é zero! 
Você deve estar se perguntando: meu modelo não está nem perto de zero, será que ele é tão ruim assim?

Nós ainda não temos como te dar essa resposta, precisamos de um critério comparativo, pois assim conseguimos dizer como nosso modelo está indo. Por exemplo, que tal classificar os nossos dados de uma maneira "bobinha"? Para isso temos os chamados métodos **Dummy**.
"""

from sklearn.dummy import DummyRegressor

# Baseline model: DummyRegressor defaults to strategy='mean', i.e. it always
# predicts the mean of y_treino. x_treino / y_treino / x_teste / y_teste and
# mean_squared_error come from earlier cells outside this chunk.
modelo_dummy = DummyRegressor()
modelo_dummy.fit(x_treino, y_treino)
dummy_predicoes = modelo_dummy.predict(x_teste)

# Reference MSE of the naive baseline, to compare against the real model.
mean_squared_error(y_teste, dummy_predicoes)

"""Finalmente conseguimos responder se nosso modelo é tão ruim assim! Na realidade nosso modelo não é um dos melhores, temos muito o que melhorar, mas já somos melhores que uma classificação ingênua. 

Com isso, encerramos nossa última aula. Espero que vocês tenham gostado! 

Participem também do nosso **desafio final, valendo um Nintendo Switch**.

Bons estudos e boa sorte!

Forte abraço!

## Desafio 1 da [Allan Spadini](https://twitter.com/allanspadini)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor

# Single-feature diabetes regression compared against a mean baseline.
# `datasets`, `train_test_split` and `plt` come from earlier imports.
diabetes = datasets.load_diabetes()
X = diabetes.data[:, None, 6]  # keep 2-D shape: a single feature column
y = diabetes.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit the linear model and the mean-predicting baseline separately.
linreg = LinearRegression()
linreg.fit(X_train, y_train)
baseline = DummyRegressor(strategy='mean')
baseline.fit(X_train, y_train)

pred_linear = linreg.predict(X_test)
pred_baseline = baseline.predict(X_test)

# Report coefficients plus MSE / R^2 for both models.
print('Linear model, coefficients: ', linreg.coef_)
print("Mean squared error (dummy): {:.2f}".format(
    mean_squared_error(y_test, pred_baseline)))
print("Mean squared error (linear model): {:.2f}".format(mean_squared_error(y_test, pred_linear)))
print("r2_score (dummy): {:.2f}".format(r2_score(y_test, pred_baseline)))
print("r2_score (linear model): {:.2f}".format(r2_score(y_test, pred_linear)))

# Plot the test data, the linear fit, and the dashed baseline.
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, pred_linear, color='green', linewidth=2)
plt.plot(X_test, pred_baseline, color='red', linestyle='dashed',
         linewidth=2, label='dummy')

plt.show()
コード例 #44
0
ファイル: 10_yelp_reviews.py プロジェクト: AntHar/DAT7
# add new features to the model
feature_cols = ['cool', 'useful', 'funny', 'length', 'love', 'hate']
X = yelp[feature_cols]
train_test_rmse(X, y)


# TASK 8 (BONUS): compare your best RMSE with RMSE for the null model

# split the data (outside of the function)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# use scikit-learn's built-in dummy regressor
from sklearn.dummy import DummyRegressor
dumb = DummyRegressor(strategy='mean')
dumb.fit(X_train, y_train)
y_dumb = dumb.predict(X_test)
# print() calls (not Python-2 print statements) so the snippet is valid on
# both Python 2 and Python 3; the originals were a SyntaxError on Python 3.
print(np.sqrt(metrics.mean_squared_error(y_test, y_dumb)))

# or, create a NumPy array with the right length, and fill it with the mean of y_train
y_null = np.zeros_like(y_test, dtype=float)
y_null.fill(y_train.mean())
print(np.sqrt(metrics.mean_squared_error(y_test, y_null)))


# TASK 9 (BONUS): treat this as a classification problem, try KNN, maximize your accuracy

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=150)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_class))
コード例 #45
0
def main():
    """Compare several regressors against a mean-predicting baseline.

    Reads the train/test CSVs with `parse`, min-max scales the features,
    then fits each candidate model in turn, printing its mean absolute
    error and R^2 score on the test split.
    """
    # load training and testing data set
    print('parsing training set...')
    X_train, y_train = parse('./data_set/train_set.csv')
    print('parsing testing set...')
    X_test, y_test = parse('./data_set/test_set.csv')
    print('train set: ', X_train.shape)
    print('test set: ', X_test.shape)

    # NOTE: polynomial features (PolynomialFeatures(2)) were tried earlier
    # and gave worse results, so they are not used here.

    # scale the attributes to [0, 1], fitting the scaler on train only
    print('standardizing the features...')
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    print('training, predicting and evaluating...')

    # Candidate models paired with their report headers, evaluated in order.
    # (KNeighborsRegressor and BernoulliRBM were tried and excluded.)
    candidates = [
        ('\nDummy Regression: (baseline)', DummyRegressor(strategy='mean')),
        ('\nLinear_regression: ', LinearRegression()),
        ('\nAdaBoost: ', AdaBoostRegressor()),
        ('\nRandom Forest:', RandomForestRegressor()),
    ]
    for header, model in candidates:
        print(header)
        model.fit(X_train, y_train)
        y_pre = model.predict(X_test)
        print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
        print('r2_score: ', r2_score(y_test, y_pre))
コード例 #46
0
# Fit the linear model and report train/test MSE next to a dummy baseline.
model = LinearRegression()
model.fit(X_train, y_train)
# check how well the model learned the training data
pred_train = model.predict(X_train)
train_score = mean_squared_error(y_train, pred_train)
print('Train error', train_score)
# evaluate the model on the held-out test data
pred_test = model.predict(X_test)
test_score = mean_squared_error(y_test, pred_test)
print('Test error', test_score)

# performance of the dummy (baseline) model; defaults to strategy='mean'
dummy = DummyRegressor()
dummy.fit(X_train, y_train)
# dummy on the training set
pred_train_dummy = dummy.predict(X_train)
dummy_train_score = mean_squared_error(y_train, pred_train_dummy)
print('Dummy train error', dummy_train_score)
# dummy on the test set
pred_test_dummy  = dummy.predict(X_test)
dummy_test_score = mean_squared_error(y_test, pred_test_dummy)
print('Dummy test error', dummy_test_score)

# assemble the scores for the comparison chart
report = {
    'train': [dummy_train_score, train_score],
    'test' : [dummy_test_score, test_score],
    'model': ['dummy', 'regression']
}
report_df = pd.DataFrame(report)
report_df = report_df.set_index(report_df['model'])
コード例 #47
0
ファイル: ch4_EDA.py プロジェクト: minjaelee0522/hydropower
# In[35]:

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# # Baseline Model

# In[36]:

from sklearn.dummy import DummyRegressor

# Mean-predicting baseline; score() returns R^2.
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)
dummy_regr.predict(X_train)  # NOTE(review): result unused — confirm intended
baseline = dummy_regr.score(X_train, y_train)
print("Baseline R^2: %f" % baseline)

# # Multiple Linear Regression

# In[37]:

# OLS fit; `linear_model` is imported in an earlier cell outside this chunk.
ols = linear_model.LinearRegression()
ols.fit(X_train, y_train)
print("Coefficients: %s" % ols.coef_)
print("Intercept: %f" % ols.intercept_)
y_test_prediction = ols.predict(X_test)
ols.score(X_train, y_train)  # NOTE(review): training R^2, value unused here

# In[40]:
コード例 #48
0
ファイル: DummyRegressor.py プロジェクト: lm2612/Ridge_3
def DummyPrediction(X_train, y_train, X_test, y_test):
    """Fit a default (mean-strategy) DummyRegressor on the training data
    and return its predictions for X_test.

    Note: y_test is accepted for signature compatibility but is not used.
    """
    baseline = DummyRegressor()
    baseline.fit(X_train, y_train)
    return baseline.predict(X_test)
コード例 #49
0
import pandas as pd
from sklearn.dummy import DummyRegressor

# Loading in the data
canucks = pd.read_csv('data/canucks_subbed.csv')

# Define X and y
X = canucks.loc[:, ['No.', 'Age', 'Height', 'Weight', 'Experience']]
y = canucks['Salary']

# Create a model; the 'mean' strategy always predicts the mean salary
model = DummyRegressor(strategy="mean")

# Fit your data
model.fit(X, y)

# Predict the labels of X
model.predict(X)

# NOTE(review): DummyRegressor.score returns R^2 (coefficient of
# determination), not classification accuracy — the variable name is
# misleading. For a mean model scored on its own training data, R^2 is 0.0.
accuracy = round(model.score(X, y), 2)

accuracy
コード例 #50
0
def test_regressor_scatter():
    """Tests regressor scatter.

    Fits a DummyRegressor on a stock regression dataset and renders the
    scatter plot to 'regressor_scatter.pdf'.
    """
    # load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
    # so this test crashed on modern scikit-learn; use the diabetes dataset,
    # which has the same (X, y) regression shape, instead.
    from sklearn.datasets import load_diabetes
    X, y = load_diabetes(return_X_y=True)
    estimator = DummyRegressor()
    estimator.fit(X, y)
    regressor_scatter(X, y, estimator.predict(X), 'regressor_scatter.pdf')