def test_linear_regression_multiple_outcome(random_state=0):
    # Test multiple-outcome linear regressions
    X, y = make_regression(random_state=random_state)

    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    clf = LinearRegression(fit_intercept=True)
    clf.fit(X, Y)
    assert_equal(clf.coef_.shape, (2, n_features))
    Y_pred = clf.predict(X)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
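The test above exercises multi-output fitting: when Y has shape (n_samples, n_targets), LinearRegression fits each target column independently and coef_ comes back with shape (n_targets, n_features). A minimal self-contained sketch of the same behaviour (the data here is illustrative, not from the test suite):

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = rng.randn(20, 3)
y = X @ np.array([1.0, 2.0, 3.0])
Y = np.vstack((y, y)).T                  # shape (20, 2): two identical targets

model = LinearRegression().fit(X, Y)
print(model.coef_.shape)                 # (2, 3): one coefficient row per target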
Example #3
def test_linear_regression_multiple_outcome(random_state=0):
    # Test multiple-outcome linear regressions
    X, y = make_regression(random_state=random_state)

    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    reg = LinearRegression()
    reg.fit(X, Y)
    assert reg.coef_.shape == (2, n_features)
    Y_pred = reg.predict(X)
    reg.fit(X, y)
    y_pred = reg.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
def test_linear_regression_sparse_multiple_outcome(random_state=0):
    "Test multiple-outcome linear regressions with sparse data"
    random_state = check_random_state(random_state)
    X, y = make_sparse_uncorrelated(random_state=random_state)
    X = sparse.coo_matrix(X)
    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    ols = LinearRegression()
    ols.fit(X, Y)
    assert_equal(ols.coef_.shape, (2, n_features))
    Y_pred = ols.predict(X)
    ols.fit(X, y.ravel())
    y_pred = ols.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
Example #6
def MethodSelect(Xt, XT, Yt, YT):
    # Fit on the training split and score RMSE on the test split.
    reg = LinearRegression()
    reg.fit(Xt, Yt)
    predict = reg.predict(XT)
    err = dp.rmsErr(predict, YT)
    if err > 100:
        # Debug dump when the error is suspiciously large: predict the first
        # test row and print its features next to the first training row.
        a = XT[0]
        b = Xt[0]
        c = [a]
        pre = reg.predict(c)
        print(pre[0])

        for i in range(0, len(a)):
            print(a[i], b[i])
        print('\n\n\n')
    return err
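dp.rmsErr is a helper local to this project; assuming it computes the root-mean-square error between predictions and targets, a plain-NumPy stand-in would look like this (the name rms_err is illustrative):

import numpy as np

def rms_err(predict, actual):
    # Root-mean-square error between two equal-length vectors.
    predict = np.asarray(predict, dtype=float)
    actual = np.asarray(actual, dtype=float)
    return np.sqrt(np.mean((predict - actual) ** 2))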
class PredictLoss(BaseLR):
    def __init__(self, hist=30, posmax=15, lr=0.2):
        from sklearn.linear_model.base import LinearRegression
        from collections import deque
        self.hist = hist
        self.track = deque(maxlen=self.hist)
        self.regr = LinearRegression()
        self.poscases = 0
        self.posmax = posmax
        self.lr = lr

    def __call__(self, env):
        if len(self.track) > 5:
            y = np.array(self.track)
            # Time indices 0..len(y)-1 for the recorded losses.
            x = np.arange(len(y)).reshape(-1, 1)
            self.regr.fit(x, y)
            coef_ = self.regr.coef_[0]
            preds = self.regr.predict(x)
            fst = preds[0]
            lst = preds[-1]
            e = np.sqrt(((y - preds)**2).mean())
            if coef_ > 0:
                self.poscases += 1
                if self.poscases >= self.posmax:
                    raise EarlyStopException
            else:
                self.poscases -= 1
                if self.poscases < 0:
                    self.poscases = 0
            diff = np.abs(fst - lst)
            coef = np.clip(diff/e, 1e-6, 1)
            lr = self.lr*coef
            print(lr, e, diff, coef_, coef, file=open('log.txt', 'a'))
            env.model.set_param("learning_rate", lr)
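The callback above fits a straight line to the recent loss history each round: a positive slope (coef_[0] > 0) means the loss is trending upward, and repeatedly positive slopes trigger early stopping, while the spread between the line's endpoints scales the learning rate down. The same slope test in isolation, with made-up numbers:

import numpy as np
from sklearn.linear_model import LinearRegression

losses = np.array([0.90, 0.85, 0.83, 0.84, 0.86, 0.88])   # recent losses
x = np.arange(len(losses)).reshape(-1, 1)
trend = LinearRegression().fit(x, losses)
print("rising" if trend.coef_[0] > 0 else "falling", trend.coef_[0])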
    def linearRegression_sales(self):  # linear regression
        path = u'4.Advertising.csv'
        data = self.readFile(path)
        #         x=data[['TV', 'Radio', 'Newspaper']]
        x = data[['TV', 'Radio']]
        y = data['Sales']
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            random_state=1)
        # print x_train, y_train
        linreg = LinearRegression()
        model = linreg.fit(x_train, y_train)
        print(model)
        print(linreg.coef_)
        print(linreg.intercept_)
        y_hat = linreg.predict(np.array(x_test))
        mse = np.average((y_hat - y_test)**2)
        rmse = np.sqrt(mse)
        print(mse, rmse)

        t = np.arange(len(x_test))
        plt.plot(t, y_test, 'r-', linewidth=2, label='Test')
        plt.plot(t, y_hat, 'g-', linewidth=2, label='Predict')
        plt.grid()
        plt.legend(loc='upper right')
        plt.show()
Example #9
 def test_predict_hdf_dataframe(self):
     # create some data
     x = np.array(list(range(0, 10)))
     y = x * 2
     df = pd.DataFrame({'x': x,
                        'y': y})
     X = df['x']
     Y = df['y']
     # put into Omega -- assume a client with pandas, scikit learn
     os.environ['DJANGO_SETTINGS_MODULE'] = ''
     om = Omega()
     om.runtime.pure_python = True
     om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
     om.datasets.put(X, 'datax', as_hdf=True)
     om.datasets.put(Y, 'datay', as_hdf=True)
     # have Omega fit the model then predict
     lr = LinearRegression()
     lr.fit(reshaped(X), reshaped(Y))
     pred = lr.predict(reshaped(X))
     om.models.put(lr, 'mymodel2')
     # -- using data already in Omega
     result = om.runtime.model('mymodel2').predict('datax')
     pred2 = result.get()
     self.assertTrue(
         (pred == pred2).all(), "runtimes prediction is different")
def compare_panorama_cubic(greenery_measure="vegetation", **kwargs):
    """ Compare/plot the segmentation results of panoramic and cubic
        images to each other. Also use linear regression to determine
        how they relate to each other.
    """

    green_kwargs = select_green_model(greenery_measure)

    panorama_tiler = TileManager(cubic_pictures=False, **kwargs, **green_kwargs)
    cubic_tiler = TileManager(cubic_pictures=True, **kwargs, **green_kwargs)

    panorama_green = panorama_tiler.green_direct()
    cubic_green = cubic_tiler.green_direct()

    _remove_missing(panorama_green, cubic_green)
    x = np.arange(0, 0.8, 0.01)

    x_pano = np.array(panorama_green["green"]).reshape(-1, 1)
    y_cubic = np.array(cubic_green["green"])
    reg = LinearRegression().fit(x_pano, y_cubic)
    print(reg.score(x_pano, y_cubic))
    print(reg.coef_[0], reg.intercept_)
    plt.figure()
    plt.scatter(panorama_green["green"], cubic_green["green"])
    plt.plot(x, reg.predict(x.reshape(-1, 1)))
    plt.xlabel("panoramas")
    plt.ylabel("cubic")
    plt.xlim(0, max(0.001, max(panorama_green["green"])*1.1))
    plt.ylim(0, max(0.001, max(cubic_green["green"])*1.1))

    plot_greenery(panorama_green, show=False, title="panorama")
    plot_greenery(cubic_green, show=False, title="cubic")
    plt.show()
    def get_scikit_prediction(x=np.array([1, 2, 3]), y=np.array([1, 2, 3])):

        from sklearn.linear_model.base import LinearRegression as ScikitLinearRegression

        # scikit-learn expects a 2-D feature matrix, so lift 1-D input
        # to a single-column matrix before fitting.
        x = np.asarray(x).reshape(-1, 1)

        regression = ScikitLinearRegression()
        regression.fit(x, y)

        return regression.predict(x)
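The reshape matters because scikit-learn estimators expect X as a 2-D array of shape (n_samples, n_features); a quick check with illustrative values:

import numpy as np

x = np.array([1, 2, 3])
print(x.shape)                 # (3,)   -- 1-D, rejected by fit
print(x.reshape(-1, 1).shape)  # (3, 1) -- 3 samples of 1 feature each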
Example #12
def train():
    X = np.array([[1, 2, 3, 4], [4, 5, 6, 7], [7, 8, 9, 10]])
    y = np.array([10, 20, 30])
    X_test = np.array([[10, 20, 30, 40], [40, 50, 60, 70], [70, 80, 90, 100]])
    reg = LinearRegression()
    reg.fit(X, y)
    print('coef_:', reg.coef_)
    print('intercept_:', reg.intercept_)
    print('predict:', reg.predict(X_test))
Example #13
 def test_fit(self):
     # create some data
     x = np.array(list(range(0, 10)))
     y = x * 2
     df = pd.DataFrame({'x': x,
                        'y': y})
     X = df[['x']]
     Y = df[['y']]
     # put into Omega
     os.environ['DJANGO_SETTINGS_MODULE'] = ''
     om = Omega()
     om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
     om.datasets.put(X, 'datax')
     om.datasets.put(Y, 'datay')
     om.datasets.get('datax')
     om.datasets.get('datay')
     # create a model locally, store (unfitted) in Omega
     lr = LinearRegression()
     om.models.put(lr, 'mymodel2')
     self.assertIn('mymodel2', om.models.list('*'))
     # predict locally for comparison
     lr.fit(X, Y)
     pred = lr.predict(X)
     # try predicting without fitting
     with self.assertRaises(NotFittedError):
         result = om.runtime.model('mymodel2').predict('datax')
         result.get()
     # have Omega fit the model then predict
     result = om.runtime.model('mymodel2').fit('datax', 'datay')
     result.get()
     # check the new model version metadata includes the datax/y references
     meta = om.models.metadata('mymodel2')
     self.assertIn('metaX', meta.attributes)
     self.assertIn('metaY', meta.attributes)
     # -- using data already in Omega
     result = om.runtime.model('mymodel2').predict('datax')
     pred1 = result.get()
     # -- using data provided locally
     #    note this is the same as
     #        om.datasets.put(X, 'foo')
     #        om.runtimes.model('mymodel2').predict('foo')
     result = om.runtime.model('mymodel2').fit(X, Y)
     result = om.runtime.model('mymodel2').predict(X)
     pred2 = result.get()
     # -- check the local data provided to fit was stored as intended
     meta = om.models.metadata('mymodel2')
     self.assertIn('metaX', meta.attributes)
     self.assertIn('metaY', meta.attributes)
     self.assertIn('_fitX', meta.attributes.get('metaX').get('collection'))
     self.assertIn('_fitY', meta.attributes.get('metaY').get('collection'))
     self.assertTrue(
         (pred == pred1).all(), "runtimes prediction is different(1)")
     self.assertTrue(
         (pred == pred2).all(), "runtimes prediction is different(2)")
def test_linear_regression():
    # Test LinearRegression on a simple dataset.
    # a simple dataset
    X = [[1], [2]]
    Y = [1, 2]

    clf = LinearRegression()
    clf.fit(X, Y)

    assert_array_almost_equal(clf.coef_, [1])
    assert_array_almost_equal(clf.intercept_, [0])
    assert_array_almost_equal(clf.predict(X), [1, 2])

    # test it also for degenerate input
    X = [[1]]
    Y = [0]

    clf = LinearRegression()
    clf.fit(X, Y)
    assert_array_almost_equal(clf.coef_, [0])
    assert_array_almost_equal(clf.intercept_, [0])
    assert_array_almost_equal(clf.predict(X), [0])
def test_linear_regression_sparse(random_state=0):
    # Test that linear regression also works with sparse data
    random_state = check_random_state(random_state)
    for i in range(10):
        n = 100
        X = sparse.eye(n, n)
        beta = random_state.rand(n)
        y = X * beta[:, np.newaxis]

        ols = LinearRegression()
        ols.fit(X, y.ravel())
        assert_array_almost_equal(beta, ols.coef_ + ols.intercept_)

        assert_array_almost_equal(ols.predict(X) - y.ravel(), 0)
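The test relies on LinearRegression accepting SciPy sparse matrices directly. A minimal sketch with a random CSR matrix and known coefficients (values are illustrative):

import numpy as np
from scipy import sparse
from sklearn.linear_model import LinearRegression

X = sparse.random(100, 5, density=0.3, format='csr', random_state=0)
y = X @ np.arange(1.0, 6.0)            # exact linear target, coefficients 1..5
ols = LinearRegression().fit(X, y)
print(np.round(ols.coef_, 6))          # recovers [1. 2. 3. 4. 5.]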
Example #18
def eval_linear(data_set, test_size=0.4):
    # load training data from feature matrix
    x, y = data_set.load_training_data()

    # cross validation evaluation
    model = LinearRegression(normalize=True)
    #model = RFE(model, 10)
    score = cross_val_score(model, x, y, scoring='neg_mean_squared_error')
    print('MSE per CV fold: {}'.format(-score))

    # to visualize:
    # split data into train and test set
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=test_size,
                                                        shuffle=True,
                                                        random_state=0)

    # train model on train set
    model = LinearRegression(normalize=True)
    model = model.fit(x_train, y_train)
    print(model.coef_)

    pprint(model)

    # plot train performance
    predict_train = model.predict(x_train)
    plt.figure()
    plt.title('train')
    plt.scatter(y_train, predict_train)

    # plot test performance
    predict = model.predict(x_test)
    plt.figure()
    plt.title('test')
    plt.scatter(y_test, predict)
    plt.show()
class LinearRegressionImpl():

    def __init__(self, fit_intercept=True, normalize=False, copy_X=True, n_jobs=None):
        self._hyperparams = {
            'fit_intercept': fit_intercept,
            'normalize': normalize,
            'copy_X': copy_X,
            'n_jobs': n_jobs}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)
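A hedged usage sketch for the wrapper above, assuming SKLModel is bound to scikit-learn's LinearRegression as the hyperparameter names suggest (the data is illustrative):

import numpy as np

impl = LinearRegressionImpl(fit_intercept=True)
X = np.arange(10, dtype=float).reshape(-1, 1)
impl.fit(X, 2 * X.ravel())               # learn y = 2x
print(impl.predict(np.array([[12.0]])))  # approximately [24.]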
Example #20
 def test_predict(self):
     # create some data
     x = np.array(list(range(0, 10)))
     y = x * 2
     df = pd.DataFrame({'x': x,
                        'y': y})
     X = df[['x']]
     Y = df[['y']]
     # put into Omega
     os.environ['DJANGO_SETTINGS_MODULE'] = ''
     om = Omega()
     om.runtime.celeryapp.conf.CELERY_ALWAYS_EAGER = True
     om.datasets.put(X, 'datax')
     om.datasets.put(Y, 'datay')
     om.datasets.get('datax')
     om.datasets.get('datay')
     # create a model locally, fit it, store in Omega
     lr = LinearRegression()
     lr.fit(X, Y)
     pred = lr.predict(X)
     om.models.put(lr, 'mymodel')
     self.assertIn('mymodel', om.models.list('*'))
     # have Omega predict it
     # -- using data already in Omega
     result = om.runtime.model('mymodel').predict('datax')
     pred1 = result.get()
     # -- using data provided locally
     #    note this is the same as
     #        om.datasets.put(X, 'foo')
     #        om.runtimes.model('mymodel').predict('foo')
     result = om.runtime.model('mymodel').predict(X)
     pred2 = result.get()
     self.assertTrue(
         (pred == pred1).all(), "runtimes prediction is different(1)")
     self.assertTrue(
         (pred == pred2).all(), "runtimes prediction is different(2)")
class StackedRegression(LinearModel, RegressorMixin):
    def __init__(self, weights=None, cv_train_size=None):
        estimators = []
        estimators.append(KNeighborsRegressor(n_neighbors=3))
        estimators.append(DecisionTreeRegressor())
        estimators.append(BayesianRidge())
        # estimators.append(BayesianRidge())
        self.estimators = estimators
        self.stacker = LinearRegression()
        self.weights = weights if weights is not None else {}
        self.cv_train_size = cv_train_size if cv_train_size is not None else 0.7
        self._is_fitted = False

    def fit_stack(self, X, y):
        print('fitting')
        print(X.shape)
        n_train = int(X.shape[0] * self.cv_train_size)
        for estimator in self.estimators:
            estimator.fit(X[:n_train, :], y[:n_train])
        predictions = np.concatenate([np.matrix(estimator.predict(X[n_train:, :])).transpose()
                                      for estimator in self.estimators], axis=1)
        self.stacker.fit(predictions, y[n_train:])
        self._is_fitted = True
        print('fitted')
        print(self.stacker.residues_)

    def fit(self, X, y):
        if not self._is_fitted:
            raise NotFittedError('StackedRegression must call fit_stack before fit.')
        for estimator in self.estimators:
            estimator.fit(X, y)

    def predict(self, X):
        predictions = np.concatenate([np.matrix(estimator.predict(X)).transpose()
                                      for estimator in self.estimators], axis=1)
        return self.stacker.predict(predictions)
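A hedged usage sketch for StackedRegression with synthetic data (fit_stack must run before fit, as the NotFittedError guard enforces; note that stacker.residues_, printed inside fit_stack, only exists in older scikit-learn releases):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(200, 4)
y = X @ np.array([1.0, -2.0, 0.5, 3.0]) + 0.1 * rng.randn(200)

model = StackedRegression()
model.fit_stack(X, y)    # base estimators on the first 70%, stacker on the rest
model.fit(X, y)          # refit base estimators on all the data
print(model.predict(X[:5]))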
def test_linear_regression_sample_weights():
    rng = np.random.RandomState(0)

    for n_samples, n_features in ((6, 5), (5, 10)):
        y = rng.randn(n_samples)
        X = rng.randn(n_samples, n_features)
        sample_weight = 1.0 + rng.rand(n_samples)

        clf = LinearRegression()
        clf.fit(X, y, sample_weight)
        coefs1 = clf.coef_

        assert_equal(clf.coef_.shape, (X.shape[1], ))
        assert_greater(clf.score(X, y), 0.9)
        assert_array_almost_equal(clf.predict(X), y)

        # Sample weight can be implemented via a simple rescaling
        # for the square loss.
        scaled_y = y * np.sqrt(sample_weight)
        scaled_X = X * np.sqrt(sample_weight)[:, np.newaxis]
        clf.fit(scaled_X, scaled_y)
        coefs2 = clf.coef_

        assert_array_almost_equal(coefs1, coefs2)
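The comment's claim that sample weights reduce to rescaling rows by sqrt(w) under the square loss is easy to verify in isolation; with fit_intercept=False the equivalence is exact, since no unweighted intercept column is involved:

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X, y = rng.randn(50, 3), rng.randn(50)
w = 1.0 + rng.rand(50)

a = LinearRegression(fit_intercept=False).fit(X, y, sample_weight=w)
b = LinearRegression(fit_intercept=False).fit(
    X * np.sqrt(w)[:, np.newaxis], y * np.sqrt(w))
print(np.allclose(a.coef_, b.coef_))   # True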
from sklearn.linear_model.base import LinearRegression

reg = LinearRegression()
reg.fit(ages_train, net_worths_train)
print("Slope %s" % reg.coef_)
print("Intercept %s" % reg.intercept_)

print("Score = ", reg.score(ages_test, net_worths_test))

try:
    plt.plot(ages, reg.predict(ages), color="blue")
except NameError:
    pass
plt.scatter(ages, net_worths)
plt.show()


### identify and remove the most outlier-y points
cleaned_data = []
try:
    predictions = reg.predict(ages_train)
    cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train )
except NameError:
    print "your regression object doesn't exist, or isn't name reg"
    print "can't make predictions to use in identifying outliers"

for feature, target in zip(feature_test, target_test):
    plt.scatter( feature, target, color=test_color )
for feature, target in zip(feature_train, target_train):
    plt.scatter( feature, target, color=train_color ) 

### labels for the legend
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")


from sklearn.linear_model.base import LinearRegression

reg = LinearRegression()
reg.fit(feature_train, target_train)
print("Slope %s" % reg.coef_)
print("Intercept %s" % reg.intercept_)

print("Score = ", reg.score(feature_test, target_test))
### draw the regression line, once it's coded
try:
    plt.plot( feature_test, reg.predict(feature_test) )
except NameError:
    pass
reg.fit(feature_test, target_test)
plt.plot(feature_train, reg.predict(feature_train), color="b")
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()
print("Slope2 %s" % reg.coef_)
print("Intercept2 %s" % reg.intercept_)
def get_cv_error(x_train, x_test, y_train, y_test):
    model = LinearRegression(normalize=True)
    model.fit(x_train, y_train)
    predict = model.predict(x_test)
    return np.average(np.abs(y_test - predict))
def get_error(x, y):
    model = LinearRegression(normalize=True)
    model.fit(x, y)
    predict = model.predict(x)
    return np.average(np.abs(y - predict))
def draw_data_size_vs_performance_chart():
    ''' Create figure for paper '''
    paths = glob('output/data-sizes/*.results/*/results.json') + \
            glob('output/model-h2048p512-mfs-true.results/*/results.json')
    df = read_json_files(paths)

    def parse_path(val):
        if 'model-h2048p512' in val:
            return 100
        else:
            return float(re.search(r'(\d+)\.results', val).group(1))

    df['data-pct'] = df['path'].apply(parse_path)
    df['words'] = 1.8e9 * df['data-pct'] / 100
    df = df.append([{
        'words': 1e11,
        'model': 'Yuan et al. (T: SemCor)',
        "competition": "SemEval13",
        'F1': 0.670
    }, {
        'words': 1e11,
        'model': 'Yuan et al. (T: OMSTI)',
        "competition": "SemEval13",
        'F1': 0.673
    }, {
        'words': 1e11,
        'model': 'Yuan et al. (T: SemCor)',
        "competition": "Senseval2",
        'F1': 0.736
    }, {
        'words': 1e11,
        'model': 'Yuan et al. (T: OMSTI)',
        "competition": "SemEval13",
        'F1': 0.673
    }, {
        'words': 1e11,
        'model': 'Yuan et al. (T: SemCor)',
        "competition": "Senseval2",
        'F1': 0.736
    }, {
        'words': 1e11,
        'model': 'Yuan et al. (T: OMSTI)',
        "competition": "Senseval2",
        'F1': 0.724
    }])
    print(df)

    def get_xy(competition, model):
        sub_df = df[df['model'].str.contains(model, regex=False)]
        sub_df = sub_df.query('competition == "%s"' %
                              competition).sort_values('words')
        return sub_df['words'], sub_df['F1']

    with PdfPages('output/data_size_vs_performance.pdf') as pdf:
        se13_semcor_handle, = plt.plot(*get_xy('SemEval13', '(T: SemCor)'),
                                       '-o',
                                       label='SemEval13 (T: SemCor)')
        se13_mun_handle, = plt.plot(*get_xy('SemEval13', '(T: SemCor+OMSTI)'),
                                    '--o',
                                    label='SemEval13 (T: OMSTI)')
        se2_semcor_handle, = plt.plot(*get_xy('Senseval2', '(T: SemCor)'),
                                      ':o',
                                      label='Senseval2 (T: SemCor)')
        se2_mun_handle, = plt.plot(*get_xy('Senseval2', '(T: SemCor+OMSTI)'),
                                   '-.o',
                                   label='Senseval2 (T: OMSTI)')
        plt.legend(handles=[
            se13_semcor_handle, se13_mun_handle, se2_semcor_handle,
            se2_mun_handle
        ],
                   loc='lower right')
        plt.axis([1.5e7, 1.1e11, 0, 1])
        plt.ylabel('F1')
        plt.xlabel('Tokens')
        plt.xscale('log')
        pdf.savefig()
        plt.show()
        plt.close()
    # extrapolate from data
    lr = LinearRegression()
    words, f1s = get_xy('SemEval13', 'Our LSTM (T: SemCor)')
    lr.fit(f1s.values.reshape([-1, 1]),
           np.log10(words.values.reshape([-1, 1])))
    print('Extrapolated data size (words):')
    print(lr.predict([[0.75], [0.8]]))
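Since lr was fitted to predict log10(token count) from F1, the printed values are base-10 exponents; converting them back to token counts is one line (a hedged follow-up, not part of the original script):

print(10 ** lr.predict([[0.75], [0.8]]))   # extrapolated token counts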
Example #29
#print(boston.target)
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.2, random_state=2) 

# Simple linear regression
model1 = LinearRegression(normalize=True)
model1.fit(X_train, y_train)
# Goodness of fit of the model
simpleScore=model1.score(X_test, y_test)
print(simpleScore)
## Regression coefficients
#print(model1.coef_)
# Intercept term
#print(model1.intercept_)
#print(simpleScore)

# Test the model and evaluate the predictions with mean squared error (MSE)
# Fitted values of the model
y_pred=model1.predict(X_test)
print("MSE:",metrics.mean_squared_error(y_test, y_pred))

# Cross-validation
predicted=cross_val_predict(model1, boston.data, boston.target, cv=10)
print ("MSE:", metrics.mean_squared_error(boston.target, predicted))

# Plot
import matplotlib.pyplot as plt
plt.scatter(boston.target, predicted, color="y", marker="o")
plt.scatter(boston.target, boston.target, color="g", marker="+")
plt.show()
This next section shuffles the rows but keeps the relationship between X and y,
so that we can train the LinearRegression model on one slice of the data and
then test it on a different slice, proving that it can get the answers right!
"""
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2)

# Create and train a regressor
clf = LinearRegression(n_jobs=-1)
# training data
clf.fit(X_train, y_train)
# test the data
accuracy = clf.score(X_test, y_test)

# predict future <forecast_col> values
forecast_set = clf.predict(X_lately)

df['Forecast'] = np.nan

# set up dates to use on the graph
last_date = df.iloc[-1].name
last_unix = last_date.timestamp()
one_day = 86400
next_unix = last_unix + one_day

for i in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += one_day
    df.loc[next_date] = [np.nan for _ in range(len(df.columns) - 1)] + [i]

# plot it
Example #31
# Splitting data into test_random_forest and train
# train_set, test_set = train_test_split(data_df, test_size=0.01, random_state=np.random.randint(1, 1000))
# Removing all unused variable for memory management

# Separate output from inputs
y_train = data_df['time_to_failure']
x_train_seg = data_df['segment_id']
x_train = data_df.drop(['time_to_failure','segment_id'], axis=1)

# y_test = test_set['time_to_failure']
# x_test_seg = test_set['segment_id']
# x_test = test_set.drop(['time_to_failure'], axis=1)
# x_test = x_test.drop(['segment_id'], axis=1)

model = LinearRegression(n_jobs=4) 
model.fit(x_train, y_train)

mh = ModelHolder(model, most_dependent_columns)
mh.save(model_name)
model = None
mh_new = load_model(model_name)
model, most_dependent_columns = mh_new.get()

print('Evaluating on the training data ... ')
print('Calculating score and error ... ')
y_pred = model.predict(x_train)
print('Score', model.score(x_train, y_train))

mas = mean_absolute_error(y_train, y_pred)
print('Mean Absolute Error', mas)
Example #32
data_set.loc[data_set[EMBARKED] == 'Q', EMBARKED] = 2
# print(data_set.describe())

algorithm = LinearRegression()

kf = KFold(data_set.shape[0], n_folds=3, random_state=2)

predictors = [GENDER, 'Pclass']
# predictors = [GENDER, 'Age']

predictions = []

for train, test in kf:
    train_predictors = (data_set[predictors].iloc[train, :])

    train_target = data_set['Survived'].iloc[train]
    algorithm.fit(train_predictors, train_target)

    test_prediction = algorithm.predict(data_set[predictors].iloc[test, :])
    predictions.append(test_prediction)

predictions = np.concatenate(predictions, axis=0)

predictions = predictions > 0.5

accuracy = np.sum(data_set['Survived'] == predictions)/data_set.shape[0]

print(accuracy)


Example #33
from sklearn.datasets.base import load_boston
from sklearn.linear_model.base import LinearRegression

boston_data = load_boston()
x = boston_data['data']
y = boston_data['target']

model = LinearRegression()
model.fit(x, y)
sample_house = [[
    2.29690000e-01, 0.00000000e+00, 1.05900000e+01, 0.00000000e+00,
    4.89000000e-01, 6.32600000e+00, 5.25000000e+01, 4.35490000e+00,
    4.00000000e+00, 2.77000000e+02, 1.86000000e+01, 3.94870000e+02,
    1.09700000e+01
]]

prediction = model.predict(sample_house)

print(prediction)
Example #34
class inp_reader(object):
    # Raw training features/prices and the query features to predict.
    inp_features = list()
    inp_prices = list()
    features = list()

    def get_inp_features(self):
        return self.inp_features
    def get_inp_prices(self):
        return self.inp_prices
    def get_features(self):
        return self.features

    def read(self):
        F, N = map(int, input().split(' '))
        for _ in range(N):
            inp_f = list(map(float, input().strip().split()))
            self.inp_features.append(inp_f[:F:])
            self.inp_prices.append(inp_f[F::])
        questions = int(input())
        for _ in range(questions):
            self.features.append(list(map(float, input().split())))
reader = inp_reader()
reader.read()
inp_features = reader.get_inp_features()
inp_prices = reader.get_inp_prices()
features = reader.get_features()
 
model = LinearRegression()

model.fit(inp_features, inp_prices)
prices = model.predict(features)
for el in prices:
    print (el[0])
Example #35
X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                    boston.target,
                                                    test_size=0.2,
                                                    random_state=2)

# Adding polynomial features lets the linear regression model fit the data better.
# Raising the degree keeps improving the training-set fit, but easily causes overfitting.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
# Polynomial linear regression
model2 = LinearRegression(normalize=True)
model2.fit(X_train_poly, y_train)
multiScore = model2.score(X_test_poly, y_test)
print(multiScore)

# Test the model and evaluate the predictions with mean squared error (MSE)
# Fitted values of the model
y_pred = model2.predict(X_test_poly)
print("MSE:", metrics.mean_squared_error(y_test, y_pred))

# Cross-validation
predicted = cross_val_predict(model2, boston.data, boston.target, cv=10)
print("MSE:", metrics.mean_squared_error(boston.target, predicted))

# Plot
import matplotlib.pyplot as plt
plt.scatter(boston.target, predicted, color="y", marker="o")
plt.scatter(boston.target, boston.target, color="g", marker="+")
plt.show()
Example #36
def main():
    df = load_train_data()
    logger.info('column hash = %d', utils.column_hash(df))
    df = preprocess.drop_column(df, 'fullVisitorId')
    df = preprocess.drop_column(df, 'sessionId')
    #    debug_info(df)

    y = df['totals_transactionRevenue']
    X = preprocess.drop_column(df, 'totals_transactionRevenue')

    #    X, _, y, _ = utils.split_data(X, y, ratio=0.9, seed=42)

    #    n_classes = 10
    n_models = 100

    y_max = y.max()

    for i in range(n_models):

        X_train, X_test, y_train, y_test = utils.split_data(X, y)

        logger.info('training')

        #         y_train, quants = preprocess.make_class_target(y_train, n_classes)
        #         logger.info('y_train.unique() = %s', y_train.unique())
        #         logger.info('quants = %s', quants)

        #        y_train = preprocess.make_class_target2(y_train, y_max, n_classes)

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)

        logger.info('X_train.shape = %s', X_train.shape)

        #         cumulative = np.cumsum(pca.explained_variance_ratio_)
        #         pylab.plot(cumulative, 'r-')
        #         pylab.show()

        #        model = build_classifier(X_train.shape[1], n_classes)
        model = build_regressor(X_train.shape[1])
        EPOCHS = 100
        early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=5)
        history = model.fit(X_train,
                            y_train,
                            epochs=EPOCHS,
                            validation_split=0.1,
                            verbose=0,
                            callbacks=[early_stop,
                                       utils.EpochCallback()])

        linear_model = LinearRegression()
        linear_model.fit(X_train, y_train)

        logger.info('predicting')
        logger.info('X_test.shape = %s', X_test.shape)

        X_test = scaler.transform(X_test)

        #        y_classes = model.predict(X_test)
        #        y_pred = postprocess.make_real_predictions(y_classes, quants)
        #        y_pred = postprocess.make_real_predictions2(y_classes, y_max)

        y_pred = model.predict(X_test).flatten()
        y_linear_pred = linear_model.predict(X_test)

        rms = np.sqrt(mean_squared_error(y_test, y_pred))
        linear_rms = np.sqrt(mean_squared_error(y_test, y_linear_pred))
        logger.info('rms = %s', rms)
        logger.info('linear_rms = %s', linear_rms)

        #        save_model(model, i, quants, scaler)
        save_model2(model, linear_model, i, y_max, scaler)


#    plot_history_classifier(history)
    plot_history_regressor(history)

    pylab.figure()
    pylab.scatter(y_pred, y_test, alpha=0.5)
    pylab.xlabel("pred")
    pylab.ylabel("test")

    hist_revenue(y_linear_pred, 'y_linear_pred')
    hist_revenue(y_pred, 'y_pred')
    hist_revenue(y_test, 'y_test')

    pylab.show()
Example #37
import pandas as pd
from matplotlib.pyplot import scatter, plot, show
from sklearn.linear_model.base import LinearRegression

bmi_life_data = pd.read_csv('bmi_to_life_expect.csv')
x_data = bmi_life_data[['BMI']]
y_data = bmi_life_data[['Life expectancy']]
# print x_data, y_data
bmi_life_model = LinearRegression()
bmi_life_model.fit(x_data, y_data)

laos_life_exp = bmi_life_model.predict([[50.00]])
print(laos_life_exp)
scatter(x_data, y_data)
plot(x_data, bmi_life_model.predict(x_data))
show()
def moudle_select(X, test_A, y, moudelselect, threshold=False, Rate=False):
    '''
    Function: model selection
    X : training data
    test_A : data to predict on
    y : target labels
    predict_A : predictions collected for test_A
    moudelselect : which model do you select?
    threshold : False
    Rate : False

    moudelselect values:
    1, XGBRegressor
    2, ensemble.RandomForestRegressor
    3, linear_model.Lasso
    4, LinearRegression
    5, linear_model.BayesianRidge
    6, DecisionTreeRegressor
    7, ensemble.RandomForestRegressor
    8, ensemble.GradientBoostingRegressor
    9, ensemble.AdaBoostRegressor
    10, BaggingRegressor
    11, ExtraTreeRegressor
    12, SVR
    13, MLPRegressor
    other: MLPRegressor
    '''

    mse = []
    sum_mse = 0.0
    predict_A = pd.DataFrame(np.zeros((100, 10)))

    for index in range(5):
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        if (moudelselect == 1):
            model = xgb.XGBRegressor(max_depth=17,
                                     min_child_weight=5,
                                     eta=0.025,
                                     gamma=0.06,
                                     subsample=1,
                                     learning_rate=0.1,
                                     n_estimators=100,
                                     silent=0,
                                     n_jobs=-1,
                                     objective='reg:linear')

        elif (moudelselect == 2):
            model = ensemble.RandomForestRegressor(
                n_estimators=25,
                criterion='mse',
                max_depth=14,
                min_samples_split=0.1,
                min_samples_leaf=2,
                min_weight_fraction_leaf=0.0,
                max_features=0.95,
                max_leaf_nodes=None,
                min_impurity_split=1e-07,
                bootstrap=True,
                oob_score=False,
                n_jobs=-1,
                random_state=None,
                verbose=0,
                warm_start=False)
        elif (moudelselect == 3):
            model = linear_model.Lasso(alpha=0.1,
                                       max_iter=1000,
                                       normalize=False)

        elif (moudelselect == 4):
            model = LinearRegression(fit_intercept=False,
                                     n_jobs=1,
                                     normalize=False)

        elif (moudelselect == 5):
            model = linear_model.BayesianRidge(alpha_1=1e-06,
                                               alpha_2=1e-06,
                                               compute_score=False,
                                               copy_X=True,
                                               fit_intercept=True,
                                               lambda_1=1e-06,
                                               lambda_2=1e-06,
                                               n_iter=500,
                                               normalize=False,
                                               tol=10,
                                               verbose=False)

        elif (moudelselect == 6):
            model = DecisionTreeRegressor(criterion='mse',
                                          splitter='best',
                                          max_depth=3,
                                          min_samples_split=0.1,
                                          min_samples_leaf=0.1,
                                          min_weight_fraction_leaf=0.1,
                                          max_features=None,
                                          random_state=None,
                                          max_leaf_nodes=None,
                                          presort=False)

        elif (moudelselect == 7):
            model = ensemble.RandomForestRegressor(
                n_estimators=1000,
                criterion='mse',
                max_depth=14,
                min_samples_split=0.1,
                min_samples_leaf=2,
                min_weight_fraction_leaf=0.0,
                max_features='auto',
                max_leaf_nodes=None,
                min_impurity_split=1e-07,
                bootstrap=True,
                oob_score=False,
                n_jobs=-1,
                random_state=None,
                verbose=0,
                warm_start=False)
        elif (moudelselect == 8):
            model = ensemble.GradientBoostingRegressor(n_estimators=800,
                                                       learning_rate=0.1,
                                                       max_depth=4,
                                                       random_state=0,
                                                       loss='ls')

        elif (moudelselect == 9):
            model = ensemble.AdaBoostRegressor(base_estimator=None,
                                               n_estimators=120,
                                               learning_rate=1,
                                               loss='linear',
                                               random_state=None)

        elif (moudelselect == 10):
            model = BaggingRegressor(base_estimator=None,
                                     n_estimators=500,
                                     max_samples=1.0,
                                     max_features=1.0,
                                     bootstrap=True)
        elif (moudelselect == 11):
            model = ExtraTreeRegressor(criterion='mse',
                                       splitter='random',
                                       max_depth=3,
                                       min_samples_split=0.1,
                                       min_samples_leaf=1,
                                       min_weight_fraction_leaf=0.01,
                                       max_features='auto',
                                       random_state=None,
                                       max_leaf_nodes=None,
                                       min_impurity_split=1e-07)

        elif (moudelselect == 12):
            model = SVR(kernel='rbf',
                        degree=3,
                        gamma='auto',
                        coef0=0.1,
                        tol=0.001,
                        C=1,
                        epsilon=0.1,
                        shrinking=True,
                        cache_size=200,
                        verbose=False,
                        max_iter=-1)

        elif (moudelselect == 13):
            model = MLPRegressor(hidden_layer_sizes=(100, ),
                                 activation='relu',
                                 solver='adam',
                                 alpha=0.0001,
                                 batch_size='auto',
                                 learning_rate='constant',
                                 learning_rate_init=0.001,
                                 power_t=0.5,
                                 max_iter=200,
                                 shuffle=True,
                                 random_state=None,
                                 tol=0.0001,
                                 verbose=False,
                                 warm_start=False,
                                 momentum=0.9,
                                 nesterovs_momentum=True,
                                 early_stopping=False,
                                 validation_fraction=0.1,
                                 beta_1=0.9,
                                 beta_2=0.999,
                                 epsilon=1e-08)
        else:
            model = MLPRegressor(activation='relu',
                                 alpha=0.001,
                                 solver='lbfgs',
                                 max_iter=90,
                                 hidden_layer_sizes=(11, 11, 11),
                                 random_state=1)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print("index: ", index, mean_squared_error(y_test, y_pred))
        sum_mse += mean_squared_error(y_test, y_pred)
        #
        #
        if (threshold == False):
            y_predict = model.predict(test_A)
            predict_A.ix[:, index] = y_predict
            mse.append(mean_squared_error(y_test, y_pred))
        else:
            if (mean_squared_error(y_test, y_pred) <= 0.03000):
                y_predict = model.predict(test_A)
                predict_A.ix[:, index] = y_predict
                mse.append(mean_squared_error(y_test, y_pred))


#        if(Rate==False):
#            mse_rate = mse / np.sum(mse)
#            #predict_A = predict_A.ix[:,~(data==0).all()]
#            for index in range(len(mse_rate)):
#                y+=predict_A.ix[:,index]*mse_rate[index]
#
    y = 0.0
    mse = mse / np.sum(mse)
    mse = pd.Series(mse)
    mse_rate_asc = mse.sort_values(ascending=False)
    mse_rate_asc = mse_rate_asc.reset_index(drop=True)
    mse_rate_desc = mse.sort_values(ascending=True)
    indexs = list(mse_rate_desc.index)
    for index in range(len(mse)):
        y += mse_rate_asc.ix[index] * predict_A.ix[:, indexs[index]]

    print("y_predict_mean: ", y.mean())
    print("y_predict_var: ", y.var())
    y = pd.DataFrame(y)
    y.to_excel("H:/java/python/src/machinelearning/test/predict.xlsx",
               index=False)
    predict_A.to_excel(
        "H:/java/python/src/machinelearning/test/predict_testA.xlsx",
        index=False)
    print("Averge mse:", sum_mse / len(mse))