Example #1
def test_load_boston():
    res = load_boston()
    assert_equal(res.data.shape, (506, 13))
    assert_equal(res.target.size, 506)
    assert_equal(res.feature_names.size, 13)
    assert_true(res.DESCR)

    # test return_X_y option
    X_y_tuple = load_boston(return_X_y=True)
    bunch = load_boston()
    assert_true(isinstance(X_y_tuple, tuple))
    assert_array_equal(X_y_tuple[0], bunch.data)
    assert_array_equal(X_y_tuple[1], bunch.target)
Example #2
def main():
    boston = datasets.load_boston()
    y = boston.target       # House prices
    mean = np.mean(y)
    y = y > mean            # y now means is_above_average_house_price

    fns = boston.feature_names
    predictors = np.array([
        'NOX',              # Nitric oxides concentration
        'CRIM',             # Per-capita crime rate
        ])
    X_idx = np.in1d(fns, predictors)
    X = boston.data[:, X_idx]
    X_names = fns[X_idx]    # columns follow feature_names order, not `predictors` order

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=33)

    for p, x in zip(X_names, np.rollaxis(X, 1)):
        print('%s vs House price - srcc: %f, p_value: %f' % (
            (p, ) + stats.spearmanr(x, y)))

    model = GaussianNB()
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    matches = y_hat == y_test
    print('Success rate: %i / %i = %f' % (
        matches.sum(), matches.size, float(matches.sum()) / matches.size))
Example #3
def boston():
    dataset = load_boston()
    X, y = dataset.data, dataset.target
    # X, y = make_regression(n_samples=100000, n_features=13)
    X = StandardScaler().fit_transform(X).astype(np.float32)
    y = y.reshape(-1, 1).astype(np.float32)
    return shuffle(X, y, random_state=42)
Example #4
 def test_continue_train(self):
     X, y = load_boston(True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {
         'objective': 'regression',
         'metric': 'l1',
         'verbose': -1
     }
     lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
     lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)
     init_gbm = lgb.train(params, lgb_train, num_boost_round=20)
     model_name = 'model.txt'
     init_gbm.save_model(model_name)
     evals_result = {}
     gbm = lgb.train(params, lgb_train,
                     num_boost_round=30,
                     valid_sets=lgb_eval,
                     verbose_eval=False,
                     # test custom eval metrics
                     feval=(lambda p, d: ('mae', mean_absolute_error(p, d.get_label()), False)),
                     evals_result=evals_result,
                     init_model='model.txt')
     ret = mean_absolute_error(y_test, gbm.predict(X_test))
     self.assertLess(ret, 3.5)
     self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5)
     for l1, mae in zip(evals_result['valid_0']['l1'], evals_result['valid_0']['mae']):
         self.assertAlmostEqual(l1, mae, places=5)
     os.remove(model_name)
Example #5
def get_cmap_scatter_plot():
    boston = datasets.load_boston()
    prices = boston['target']
    lower_status = boston['data'][:,-1]
    nox = boston['data'][:,4]

    x, y = get_data_sources(x=lower_status, y=prices)
    x_mapper, y_mapper = get_mappers(x, y)

    color_source = ArrayDataSource(nox)
    color_mapper = dc.reverse(dc.RdYlGn)(
        DataRange1D(low=nox.min(), high=nox.max())
    )

    scatter_plot = ColormappedScatterPlot(
        index=x, value=y,
        index_mapper=x_mapper, value_mapper=y_mapper,
        color_data=color_source,
        color_mapper=color_mapper,
        marker='circle',
        title='Color represents nitric oxides concentration',
        render_method='bruteforce',
        **PLOT_DEFAULTS
    )

    add_axes(scatter_plot, x_label='Percent lower status in the population',
             y_label='Median house prices')

    return scatter_plot
Example #6
def test_regressors_int():
    # test if regressors can cope with integer labels (by converting them to
    # float)
    regressors = all_estimators(type_filter='regressor')
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    X = StandardScaler().fit_transform(X)
    y = np.random.randint(2, size=X.shape[0])
    for name, Reg in regressors:
        if Reg in dont_test or Reg in (CCA,):
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            # separate estimators to control random seeds
            reg1 = Reg()
            reg2 = Reg()
        set_random_state(reg1)
        set_random_state(reg2)

        if Reg in (_PLS, PLSCanonical, PLSRegression):
            y_ = np.vstack([y, 2 * y + np.random.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        # fit
        reg1.fit(X, y_)
        pred1 = reg1.predict(X)
        reg2.fit(X, y_.astype(float))  # np.float is deprecated
        pred2 = reg2.predict(X)
        assert_array_almost_equal(pred1, pred2, 2, name)
Example #7
def test_regression_with_custom_objective():
    tm._skip_if_no_sklearn()
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.cross_validation import KFold

    def objective_ls(y_true, y_pred):
        grad = (y_pred - y_true)
        hess = np.ones(len(y_true))
        return grad, hess

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf:
        xgb_model = xgb.XGBRegressor(objective=objective_ls).fit(
            X[train_index], y[train_index]
        )
        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
    assert mean_squared_error(preds, labels) < 25

    # Test that the custom objective function is actually used
    class XGBCustomObjectiveException(Exception):
        pass

    def dummy_objective(y_true, y_pred):
        raise XGBCustomObjectiveException()

    xgb_model = xgb.XGBRegressor(objective=dummy_objective)
    np.testing.assert_raises(XGBCustomObjectiveException, xgb_model.fit, X, y)
Example #8
def load_data():
    """Load the Boston dataset."""

    boston = datasets.load_boston()
    return boston
Example #9
def get_messy_data(df):
  r"""Function for testing... just messes up the input data with Nans and Infs.

  Parameters
  ----------
  df : a pandas data frame
    
  Returns
  -------
  df
    a messy pandas data frame.

  """
  # Put one NaN in a fraction of the rows, then one Inf in another fraction
  # (the fraction is missing_rate below).
  df = df.to_numpy()
  from sklearn import datasets
  rng = np.random.RandomState(2) 
  dataset = datasets.load_boston()
  n_samples = df.shape[0]
  n_features = df.shape[1]
  missing_rate = 0.05
  n_missing_samples = int(np.floor(n_samples * missing_rate))
  missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples),
     np.ones(n_missing_samples)))
  rng.shuffle(missing_samples)
  missing_features = rng.randint(0, n_features, n_missing_samples)
  X_missing = df.copy()
  X_missing[np.where(missing_samples)[0], missing_features] = np.nan
  rng.shuffle(missing_samples)
  X_missing[np.where(missing_samples)[0], missing_features] = np.inf
  return X_missing
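A minimal usage sketch for get_messy_data: hand it the Boston data as a DataFrame and count the corrupted cells (the column names are only for readability).

import numpy as np
import pandas as pd
from sklearn import datasets

boston = datasets.load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)

messy = get_messy_data(df)
print('NaN cells:', int(np.isnan(messy).sum()))
print('Inf cells:', int(np.isinf(messy).sum()))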
Example #10
def test_regressors_train():
    estimators = all_estimators()
    regressors = [(name, E) for name, E in estimators if issubclass(E,
        RegressorMixin)]
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    # TODO: test with intercept
    # TODO: test with multiple responses
    X = Scaler().fit_transform(X)
    y = Scaler().fit_transform(y)
    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            reg = Reg()
        if hasattr(reg, 'alpha'):
            reg.set_params(alpha=0.01)

        # raises error on malformed input for fit
        assert_raises(ValueError, reg.fit, X, y[:-1])
        # fit
        reg.fit(X, y)
        reg.predict(X)
        assert_greater(reg.score(X, y), 0.5)
Example #11
def bokeh_plot(crime,zn,inidus,optradio,nox,rm,age,dis,rad,tax,ptratio,Bk,lstat):
    from django.conf import settings
    import os
    from sklearn.externals import joblib
    from sklearn import datasets
    from bokeh.plotting import figure, show, output_file
    from bokeh.resources import CDN
    from bokeh.embed import components

    clf = joblib.load(os.path.join(settings.PROJECT_ROOT, 'app', 'machine_SVR.pkl'))
    boston = datasets.load_boston()
    y = boston.target
    Y = SVR_fitting(crime, zn, inidus, optradio, nox, rm, age, dis, rad, tax, ptratio, Bk, lstat)

    predicted = clf.predict(boston.data)
    predict_y = Y
    p = figure(title = "Boston dataset")
    p.xaxis.axis_label = 'Measured'
    p.yaxis.axis_label = 'Predicted'

    p.scatter(y,predicted)
    p.asterisk(x=predict_y, y=predict_y, size=20, color="#F0027F")
    script, div = components(p, CDN)
    return script, div
Example #12
def main():

    # load the Boston dataset
    boston = datasets.load_boston()

    # number of rooms
    rooms = boston.data[:, 5]

    # house prices
    house_prices = boston.target

    # plot number of rooms against house price
    plt.scatter(rooms, house_prices, color='r')

    # least squares: add a bias column to the design matrix
    x = np.array([[v, 1] for v in rooms])
    y = house_prices

    # fit the line that minimises the squared error
    (slope, bias), total_error, _, _ = np.linalg.lstsq(x, y)

    # plot the fitted line
    plt.plot(x[:, 0], slope * x[:, 0] + bias)
    plt.grid()
    plt.xlabel('rooms')
    plt.ylabel('price')
    plt.show()
Example #13
def test_boston_housing_regression_with_sample_weights():
    tm._skip_if_no_sklearn()
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.cross_validation import KFold

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    sample_weight = np.ones_like(y, 'float')
    kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)

    for train_index, test_index in kf:
        xgb_model = xgb.XGBRegressor().fit(
            X[train_index], y[train_index],
            sample_weight=sample_weight[train_index]
        )

        preds = xgb_model.predict(X[test_index])
        # test other params in XGBRegressor().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
        labels = y[test_index]

        assert mean_squared_error(preds, labels) < 25
        assert mean_squared_error(preds2, labels) < 370
        assert mean_squared_error(preds3, labels) < 25
        assert mean_squared_error(preds4, labels) < 370
Example #14
def demo(X=None, y=None, test_size=0.1):

    if X is None:
        boston = load_boston()
        X = pd.DataFrame(boston.data)
        y = pd.DataFrame(boston.target)

    base_estimator = DecisionTreeRegressor(max_depth=5)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print(X_train.shape)

    # If you want to compare with BaggingRegressor:
    # bench = BaggingRegressor(base_estimator=base_estimator, n_estimators=10,
    #                          max_samples=1, oob_score=True).fit(X_train, y_train)
    # print(bench.score(X_test, y_test))
    # print(mean_squared_error(bench.predict(X_test), y_test))

    clf = BasicSegmenterEG_FEMPO(ngen=30, init_sample_percentage=1, n_votes=10,
                                 n=10, base_estimator=base_estimator,
                                 unseen_x=X_test, unseen_y=y_test)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    y = clf.predict(X_test)
    print(mean_squared_error(y, y_test))
    print(y.shape)

    return clf, X_test, y_test
Example #15
def test_RFECV():
    from sklearn.datasets import load_boston
    from sklearn.datasets import load_breast_cancer
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import RFECV

    # Regression
    X, y = load_boston(return_X_y=True)
    bst = xgb.XGBRegressor(booster='gblinear', learning_rate=0.1,
                            n_estimators=10, n_jobs=1,
                            objective='reg:squarederror',
                            random_state=0, verbosity=0)
    rfecv = RFECV(
        estimator=bst, step=1, cv=3, scoring='neg_mean_squared_error')
    rfecv.fit(X, y)

    # Binary classification
    X, y = load_breast_cancer(return_X_y=True)
    bst = xgb.XGBClassifier(booster='gblinear', learning_rate=0.1,
                            n_estimators=10, n_jobs=1,
                            objective='binary:logistic',
                            random_state=0, verbosity=0)
    rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='roc_auc')
    rfecv.fit(X, y)

    # Multi-class classification
    X, y = load_iris(return_X_y=True)
    bst = xgb.XGBClassifier(base_score=0.4, booster='gblinear',
                            learning_rate=0.1,
                            n_estimators=10, n_jobs=1,
                            objective='multi:softprob',
                            random_state=0, reg_alpha=0.001, reg_lambda=0.01,
                            scale_pos_weight=0.5, verbosity=0)
    rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='neg_log_loss')
    rfecv.fit(X, y)
Example #16
def load_extended_boston():
    boston = load_boston()

    X = MinMaxScaler().fit_transform(boston.data)
    X = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)
    return X, boston.target
Example #17
def load_boston_df(include_tgt=True, tgt_name="target", shuffle=False):
    """Loads the boston housing dataset into a dataframe with the
    target set as the "target" feature or whatever name
    is specified in ``tgt_name``.

    Parameters
    ----------

    include_tgt : bool, optional (default=True)
        Whether to include the target

    tgt_name : str, optional (default="target")
        The name of the target feature

    shuffle : bool, optional (default=False)
        Whether to shuffle the rows


    Returns
    -------

    X : Pandas ``DataFrame`` or ``H2OFrame``, shape=(n_samples, n_features)
        The loaded dataset
    """
    bo = load_boston()
    X = pd.DataFrame.from_records(data=bo.data, columns=bo.feature_names)

    if include_tgt:
        X[tgt_name] = bo.target

    return X if not shuffle else shuffle_dataframe(X)
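A quick usage sketch for the helper above (shapes assume the stock 506-row, 13-feature Boston data):

df = load_boston_df()                       # 13 features plus a 'target' column
print(df.shape)                             # (506, 14)

X_only = load_boston_df(include_tgt=False)  # features only
print(X_only.shape)                         # (506, 13)

df_medv = load_boston_df(tgt_name='MEDV')   # custom target column name
print('MEDV' in df_medv.columns)            # True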
Example #18
def test_boston_housing_regression():
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])

        preds = xgb_model.predict(X[test_index])
        # test other params in XGBRegressor().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True,
                                   ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True,
                                   ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False,
                                   ntree_limit=3)
        labels = y[test_index]

        assert mean_squared_error(preds, labels) < 25
        assert mean_squared_error(preds2, labels) < 350
        assert mean_squared_error(preds3, labels) < 25
        assert mean_squared_error(preds4, labels) < 350
Example #19
def overview():
    boston = load_boston()
    features = [
        [0, 'CRIM', "per capita crime rate by town"],
        [1, 'ZN', "proportion of residential land zoned for lots over 25,000 sq.ft."],
        [2, 'INDUS', "proportion of non-retail business acres per town"],
        [3, 'CHAS', "Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)"],
        [4, 'NOX', "nitric oxides concentration (parts per 10 million)"],
        [5, 'RM', "average number of rooms per dwelling"],
        [6, 'AGE', "proportion of owner-occupied units built prior to 1940"],
        [7, 'DIS', "weighted distances to five Boston employment centres"],
        [8, 'RAD', "index of accessibility to radial highways"],
        [9, 'TAX', "full-value property-tax rate per $10,000"],
        [10, 'PTRATIO', "pupil-teacher ratio by town"],
        [11, 'B', "1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town"],
        [12, 'LSTAT', "% lower status of the population"],
    ]
    # MEDV (median value of owner-occupied homes in $1000's) is the target, not a data column

    plot_row = 4
    plot_col = 4
    plt.figure(figsize=(10, 10))
    for f in features:
        print('{}:\t{}'.format(f[1], f[2]))

    for feature in features:
        # plt.subplot(n_rows, n_cols, plot_index)
        plt.subplot(plot_row, plot_col, feature[0] + 1)
        plt.scatter(boston.data[:, feature[0]], boston.target)
        plt.xlabel(feature[1])
    plt.tight_layout()
Example #20
def base_stats():
    boston = load_boston()
    # print boston.feature_names
    # print boston.DESCR

    x = boston.data
    y = boston.target
    lr = LinearRegression()
    lr.fit(x, y)
    # lr.residues_ was removed in newer scikit-learn; compute the RMSE directly
    rmse = np.sqrt(np.mean((lr.predict(x) - y) ** 2))
    print('RMSE: {}'.format(rmse))

    # plt.subplot(n_rows, n_cols, plot_index)
    plt.subplot(2, 1, 1)
    plt.scatter(lr.predict(x), boston.target)
    plt.plot([0, 50], [0, 50], '-', color=(.9, .3, .3), lw=4)
    plt.xlabel('predicted')
    plt.ylabel('real')

    x = np.array([np.concatenate((v, [1])) for v in boston.data])
    y = boston.target

    s, total_error, _, _ = np.linalg.lstsq(x, y)
    rmse = np.sqrt(total_error[0] / len(x))
    print('Residual: {}'.format(rmse))

    plt.subplot(2, 1, 2)
    plt.plot(np.dot(x, s), boston.target, 'ro')
    plt.plot([0, 50], [0, 50], 'g-')
    plt.ylabel('real')
Example #21
def main():
    # load the Boston dataset
    boston = datasets.load_boston()
    # number of rooms
    rooms = boston.data[:, 5]
    # house prices
    house_prices = boston.target

    plt.scatter(rooms, house_prices, color="r")

    # least squares: add a bias column to the design matrix
    x = np.array([[v, 1] for v in rooms])
    y = house_prices
    (slope, bias), total_error, _, _ = np.linalg.lstsq(x, y)

    # plot the fitted line
    plt.plot(x[:, 0], slope * x[:, 0] + bias)

    # RMSE on the training data
    rmse = np.sqrt(total_error[0] / len(x))
    msg = "RMSE (training): {0}".format(rmse)
    print(msg)

    # label the axes, save the figure, then show it
    # (calling savefig() after show() would write an empty file)
    plt.xlabel("Number of rooms")
    plt.ylabel("Price of house ($1,000)")
    plt.grid()
    plt.savefig("image.png")
    plt.show()
Example #22
def main(unused_argv):
  # Load dataset
  boston = datasets.load_boston()
  x, y = boston.data, boston.target

  # Split dataset into train / test
  x_train, x_test, y_train, y_test = model_selection.train_test_split(
      x, y, test_size=0.2, random_state=42)

  # Scale data (training set) to 0 mean and unit standard deviation.
  scaler = preprocessing.StandardScaler()
  x_train = scaler.fit_transform(x_train)

  # Build 2 layer fully connected DNN with 10, 10 units respectively.
  feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(
      x_train)
  regressor = tf.contrib.learn.DNNRegressor(
      feature_columns=feature_columns, hidden_units=[10, 10])

  # Fit
  regressor.fit(x_train, y_train, steps=5000, batch_size=1)
  
  # Transform
  x_transformed = scaler.transform(x_test)
  
  # Predict and score
  y_predicted = list(regressor.predict(x_transformed, as_iterable=True))
  score = metrics.mean_squared_error(y_predicted, y_test)

  print('MSE: {0:f}'.format(score))
Example #23
 def test_template(params={'objective': 'regression', 'metric': 'l2'},
                   X_y=load_boston(True), feval=mean_squared_error,
                   num_round=100, init_model=None, custom_eval=None,
                   early_stopping_rounds=10,
                   return_data=False, return_model=False):
     params['verbose'], params['seed'] = -1, 42
     X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
     lgb_train = lgb.Dataset(X_train, y_train, params=params)
     lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
     if return_data:
         return lgb_train, lgb_eval
     evals_result = {}
     gbm = lgb.train(params, lgb_train,
                     num_boost_round=num_round,
                     valid_sets=lgb_eval,
                     valid_names='eval',
                     verbose_eval=False,
                     feval=custom_eval,
                     evals_result=evals_result,
                     early_stopping_rounds=early_stopping_rounds,
                     init_model=init_model)
     if return_model:
         return gbm
     else:
         return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))
Example #24
    def generate_data(case, sparse=False):
        # Generate regression / classification data. 
        bunch = None 
        if case == 'regression':
            bunch = datasets.load_boston()
        elif case == 'classification': 
            bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
        X, y = shuffle(bunch.data, bunch.target)
        offset = int(X.shape[0] * 0.8) 
        X_train, y_train = X[:offset], y[:offset]
        X_test, y_test = X[offset:], y[offset:] 
        if sparse:
            X_train = csr_matrix(X_train)
            X_test = csr_matrix(X_test)
        else:
            X_train = np.array(X_train)
            X_test = np.array(X_test)
        y_test = np.array(y_test)
        y_train = np.array(y_train)
        data = {
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test,
        }

        return data 
Example #25
    def test_rrf_vs_sklearn_reg(self):
        """Test R vs. sklearn on boston housing dataset. """
        from sklearn.datasets import load_boston
        from sklearn.cross_validation import train_test_split
        from sklearn.metrics import mean_squared_error
        from sklearn.ensemble import RandomForestRegressor

        boston = load_boston()
        X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target,
                                                            test_size=0.2, random_state=13)

        n_samples, n_features = X_train.shape
        mtry = int(np.floor(0.3 * n_features))
        # do 100 trees
        r_rf = RRFEstimatorR(**{'ntree': 100, 'nodesize': 1, 'replace': 0,
                                'mtry': mtry, 'corr.bias': False,
                                'sampsize': n_samples, 'random_state': 1234})
        r_rf.fit(X_train, y_train)
        y_pred = r_rf.predict(X_test)
        r_mse = mean_squared_error(y_test, y_pred)

        p_rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=1, bootstrap=False,
                                     max_features=mtry, random_state=1)
        p_rf.fit(X_train, y_train)
        y_pred = p_rf.predict(X_test)
        p_mse = mean_squared_error(y_test, y_pred)
        print('%.4f vs %.4f' % (r_mse, p_mse))
        # should be roughly the same (7.6 vs. 7.2)
        np.testing.assert_almost_equal(r_mse, p_mse, decimal=0)
Example #26
def get_bar_plot():
    boston = datasets.load_boston()
    prices = boston['target']

    ys, bin_edges = np.histogram(prices, bins=10)
    ys = ys.astype('d') / ys.sum()
    xs = (bin_edges[:-1] + bin_edges[1:]) / 2.0

    x, y = get_data_sources(x=xs, y=ys)
    x_mapper, y_mapper = get_mappers(x, y)

    # we need to make the range of the x coordinate a bit larger, otherwise
    # half of the first and last bar are cut
    delta = bin_edges[1] - bin_edges[0]
    x_mapper.range.low = xs[0] - delta / 2.
    x_mapper.range.high = xs[-1] + delta / 2.

    y_mapper.range.high += 0.02

    bar_plot = BarPlot(
        index = x,
        value = y,
        index_mapper = x_mapper,
        value_mapper = y_mapper,
        fill_color = 'blue',
        bar_width = 3.0,
        **PLOT_DEFAULTS
    )

    add_axes(bar_plot, x_label='Median house prices', y_label='Frequency')

    return bar_plot
Example #27
def get_data():
    data = load_boston()

    clf = LinearRegression()
    clf.fit(data.data, data.target)
    predicted = clf.predict(data.data)

    plt.figure(num=None, figsize=(14, 6), dpi=80, facecolor='w', edgecolor='k')
    plt.scatter(data.target, predicted)
    plt.plot([0, 50], [0, 50], '--k')
    plt.axis('tight')
    plt.xlabel('True price of houses ($1000s)')
    plt.ylabel('Predicted price of houses ($1000s)')

    # write the figure to an in-memory buffer; on Python 3 this must be
    # io.BytesIO (the old StringIO.StringIO no longer exists)
    img = io.BytesIO()
    plt.savefig(img, bbox_inches='tight')
    img.seek(0)
    plt.close()

    return img
Example #28
def get_variable_size_scatter_plot():
    boston = datasets.load_boston()
    prices = boston['target']
    lower_status = boston['data'][:,-1]
    tax = boston['data'][:,9]

    x, y = get_data_sources(x=lower_status, y=prices)
    x_mapper, y_mapper = get_mappers(x, y)

    # normalize between 0 and 10
    marker_size = tax / tax.max() * 10.

    scatter_plot = ScatterPlot(
        index=x, value=y,
        index_mapper=x_mapper, value_mapper=y_mapper,
        marker='circle',
        marker_size=marker_size,
        title='Size represents property-tax rate',
        **PLOT_DEFAULTS
    )
    scatter_plot.color = (0.0, 1.0, 0.3, 0.4)

    add_axes(scatter_plot, x_label='Percent lower status in the population',
             y_label='Median house prices')

    return scatter_plot
Example #29
def get_jitter_plot():
    boston = datasets.load_boston()
    prices = boston['target']

    x, y = get_data_sources(y=prices)
    x_mapper, y_mapper = get_mappers(x, y)

    jitter_plot = JitterPlot(
        index=y,
        mapper=y_mapper,
        marker='circle',
        jitter_width=100,
        **PLOT_DEFAULTS
    )
    jitter_plot.line_width = 1.

    x_axis = PlotAxis(orientation='bottom',
                      title='Median house prices',
                      mapper=jitter_plot.mapper,
                      component=jitter_plot,
                      **AXIS_DEFAULTS)

    jitter_plot.underlays.append(x_axis)

    return jitter_plot
Example #30
    def load_boston():
        from sklearn.datasets import load_boston
        boston = load_boston()
        # print(boston.DESCR)

        # print(boston.feature_names)
        # CRIM      : per-capita crime rate
        # ZN        : proportion of residential land zoned for lots over 25,000 sq. ft.
        # INDUS     : proportion of non-retail business acres
        # CHAS      : Charles River dummy variable (1: tract bounds the river, 0: otherwise)
        # NOX       : NOx concentration
        # RM        : average number of rooms per dwelling
        # AGE       : proportion of units built before 1940
        # DIS       : weighted distances to five Boston employment centres
        # RAD       : index of accessibility to radial highways
        # TAX       : full-value property-tax rate per $10,000
        # PTRATIO   : pupil-teacher ratio by town
        # B         : 1000(Bk - 0.63)^2, where Bk is the proportion of Black residents by town
        # LSTAT     : % of the population in lower-status occupations

        # pp.pprint(boston.data)
        # print(np.array(boston.data).shape)
        # pp.pprint(boston.target) # house prices
        X = boston.data
        y = boston.target
        return SklearnDataGenerator.shuffle(X, y)
Example #31
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

plt.style.use(style='ggplot')
plt.rcParams['figure.figsize'] = (10, 6)

train = load_boston(return_X_y=False)

data = pd.DataFrame(data=np.c_[train['data'], train['target']])
data = data.select_dtypes(include=[np.number]).interpolate().dropna()
data.info()  # info() prints its report itself; print(data.info()) adds a stray "None"

numeric_features = data.select_dtypes(include=[np.number])

corr = numeric_features.corr()

print(corr)
data = data.drop([2],axis=1)
X = data.drop([13], axis=1)
Y = data[13]
#print(X)
#print(Y)

X_train, X_test,y_train, y_test = train_test_split(
                                    X, Y, random_state=42, test_size=.33)

from sklearn import linear_model
Example #32
from sklearn.datasets import load_boston
import numpy as np
import wandb

wandb.init()

# Save hyperparameters
wandb.config.lr = 0.000001
wandb.config.epochs = 1

# Load Dataset
data, target = load_boston(return_X_y=True)

# Initialize model
weights = np.zeros(data.shape[1])
bias = 0

# Train Model
for _ in range(wandb.config.epochs):
    # shuffle features and targets together, so each x stays paired with its y
    perm = np.random.permutation(data.shape[0])
    data, target = data[perm], target[perm]
    for i in range(data.shape[0]):
        x = data[i, :]
        y = target[i]

        err = y - np.dot(weights, x)
        if (err < 0):
            weights -= wandb.config.lr * x
            bias -= wandb.config.lr
        else:
            weights += wandb.config.lr * x
            bias += wandb.config.lr
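The run above saves hyperparameters but never reports a metric back to wandb. A minimal sketch, assuming the training loop stays as written (wandb.log is the standard reporting call; the metric name epoch_mae is our own choice), to be placed at the end of each epoch:

    # inside `for _ in range(wandb.config.epochs):`, after the inner loop
    preds = data.dot(weights) + bias
    wandb.log({"epoch_mae": float(np.mean(np.abs(target - preds)))})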
Example #33
# coding: utf-8

# Ali Nehrani

import numpy as np
import pandas as pd
from sklearn import (datasets, metrics, cluster, feature_selection, manifold,
                     decomposition, preprocessing, mixture)
from matplotlib import pyplot as plt
from IPython.core.debugger import Tracer
from sklearn.metrics import normalized_mutual_info_score

import pdb

# Load the boston dataset to variable
boston = datasets.load_boston()

resultDataFrame = pd.DataFrame(columns=['Cluster', 'NMIS Accuracy'])

df = pd.DataFrame(boston.data)
df.columns = boston.feature_names
df['target'] = boston.target

# boston data
X = boston.data
#print('boston data shape:', boston.data.shape)
print('boston data:', X)
# boston feature names
print('boston features:', boston.feature_names)
#print('boston target head', boston.data.head)
# boston target
Example #34
import sys,os
sys.path.append(os.path.join(os.path.dirname(sys.executable),'share','pydaal_examples','examples','python','source'))
from DecisionForest import Regression
from utils import printNumericTable
from daal.data_management import HomogenNumericTable
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np

data = load_boston()
x = data.data
y = data.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.40, random_state=42)

trainData = HomogenNumericTable(x_train)
testData=HomogenNumericTable(x_test)
nD_y_train= y_train[:,np.newaxis]
trainDependentVariables= HomogenNumericTable(nD_y_train)
nD_y_test = y_test[:,np.newaxis]
testGroundTruth = HomogenNumericTable(nD_y_test)
'''
Instantiate a Decision Forest object:
Regression(nTrees=100, observationsPerTreeFraction=1, featuresPerNode=0,
           maxTreeDepth=0, minObservationsInLeafNodes=5, impurityThreshold=0,
           varImportance=None, resultsToCompute=0)
'''
#Instantiate Decision Forest object
daal_DF = Regression(nTrees=100,maxTreeDepth=15,resultsToCompute=3)
#Training
trainingResult = daal_DF.training(trainData,trainDependentVariables)
#Prediction
pred_nT = daal_DF.predict(trainingResult,trainData)
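printNumericTable is imported but never called; assuming it keeps the signature of Intel's pydaal example utilities (table, caption, number of rows to print), the predictions could be inspected like this:

printNumericTable(pred_nT, "Decision forest predictions (first 10 rows):", 10)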
#Serialize the training object
Example #35
# -*- coding: utf-8 -*-

#%%
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import pandas as pd
from tensorflow import keras

#%%
from sklearn.datasets import load_boston
housing = load_boston()

#%%
from sklearn.model_selection import train_test_split

x_train_all, x_test, y_train_all, y_test = train_test_split(
        housing.data, housing.target, random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(
        x_train_all, y_train_all, random_state=11)

#%%
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

#%%
# subclass API
class WideDeepModel(keras.models.Model):
    def __init__(self):
        """Define the model layers"""
Example #36
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.utils import resample

# Input, Linear, Sigmoid, MSE, topological_sort, forward_and_backward and
# sgd_update are assumed to come from the accompanying miniflow-style module.

def main():
    data = load_boston()
    X_ = data['data']
    y_ = data['target']

    X_ = (X_ - np.mean(X_, axis=0)) / np.std(X_, axis=0)

    n_features = X_.shape[1]
    print "nunber of features", n_features

    n_hidden = 15

    W1_ = np.random.randn(n_features, n_hidden)
    b1_ = np.random.randn(n_hidden)
    #b1_ = np.zeros(n_hidden)
    W2_ = np.random.randn(n_hidden, 1)
    b2_ = np.random.randn(1)
    #b2_ = np.zeros(1)

    X, y = Input(), Input()
    W1, b1 = Input(), Input()
    W2, b2 = Input(), Input()

    l1 = Linear(X, W1, b1)
    s1 = Sigmoid(l1)
    l2 = Linear(s1, W2, b2)
    cost = MSE(y, l2)

    feed_dict = {
        X: X_,
        y: y_,
        W1: W1_,
        b1: b1_,
        W2: W2_,
        b2: b2_
    }

    epochs = 1000
    m = X_.shape[0]
    batch_size = 11
    steps_per_epoch = m // batch_size

    graph = topological_sort(feed_dict)
    trainables = [W1, b1, W2, b2]

    print("Total number of examples = {}".format(m))
    loss_list = []

    for i in range(epochs):
        loss = 0
        for j in range(steps_per_epoch):
            # Step 1
            # Randomly sample a batch of examples
            X_batch, y_batch = resample(X_, y_, n_samples=batch_size)

            # Reset value of X and y Inputs
            X.value = X_batch
            y.value = y_batch

            # Step 2
            forward_and_backward(graph)

            # Step 3
            sgd_update(trainables)

            loss += graph[-1].value

        print("Epoch: {}, Loss: {:.3f}".format(i+1, loss/steps_per_epoch))
        loss_list.append(loss/steps_per_epoch)

    plt.figure()
    plt.plot(loss_list)
    plt.show()
Example #37
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

#read data
dataset = load_boston()
X = dataset.data
y = dataset.target

print('number of data features =', X.shape[1])
print("----------------------------------------------------------------------")

#select best features
#model = LinearRegression(copy_X=True)
#FeatureSelectionMethod = SelectFromModel(estimator=model)
#X = FeatureSelectionMethod.fit_transform(X,y)
#
#print('new number of data features =', X.shape[1])
#print("----------------------------------------------------------------------")

#normalize data
normalizer = StandardScaler(copy=True, with_mean=True, with_std=True)
X = normalizer.fit_transform(X)

#split data into train, validation and test sets
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
                               X, y, random_state=0)
Example #38
def get_results(dataset):
    # ... (X_missing, y_missing and the full-, zero- and mean-imputation
    # scores are computed in the earlier part of this function)
    imputer = IterativeImputer(random_state=0, n_nearest_features=5)
    iterative_impute_scores = get_scores_for_imputer(imputer, X_missing,
                                                     y_missing)

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (iterative_impute_scores.mean(), iterative_impute_scores.std()))


results_diabetes = np.array(get_results(load_diabetes()))
mses_diabetes = results_diabetes[:, 0] * -1
stds_diabetes = results_diabetes[:, 1]

results_boston = np.array(get_results(load_boston()))
mses_boston = results_boston[:, 0] * -1
stds_boston = results_boston[:, 1]

n_bars = len(mses_diabetes)
xval = np.arange(n_bars)

x_labels = [
    'Full data', 'Zero imputation', 'Mean Imputation',
    'Multivariate Imputation'
]
colors = ['r', 'g', 'b', 'orange']

# plot diabetes results
plt.figure(figsize=(12, 6))
ax1 = plt.subplot(121)
for j in xval:
    ax1.barh(j, mses_diabetes[j], xerr=stds_diabetes[j],
             color=colors[j], alpha=0.6, align='center')
ax1.set_title('Imputation Techniques with Diabetes Data')
ax1.set_yticks(xval)
ax1.set_yticklabels(x_labels)
Example #39
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 15 09:12:13 2017

@author: arellave
"""

#from sklearn import datasets
import sklearn.datasets as d
import numpy as np

# datasets.load_*?   (IPython introspection)

#Loading a dataset

boston = d.load_boston()
print (boston.DESCR)

housing = d.fetch_california_housing()
print (housing.DESCR)

X, y = boston.data, boston.target

# datasets.make_*?   (IPython introspection)

#Creating a Dataset
reg_data = d.make_regression()
complex_reg_data = d.make_regression(1000,10,5,2,1.0)
print (complex_reg_data[0].shape)

classification_set = d.make_classification(weights=[0.1])
Example #40
def boston_df():
    boston_data = load_boston()
    return pd.DataFrame(
        data=boston_data.data,
        columns=boston_data.feature_names).assign(MEDV=boston_data.target)
Example #41
from sklearn import datasets
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

loaded_data = datasets.load_boston()
data_X = loaded_data.data  # no parentheses: data is an attribute, not a method
data_y = loaded_data.target

model = LinearRegression()  # the default parameters are a good starting point
model.fit(data_X, data_y)

print(model.predict(data_X[:4, :]))
print(data_y[:4])
# Above: train a model on the loaded dataset and sanity-check its predictions
X, y = datasets.make_regression(n_samples=100, n_features=1, n_targets=1, noise=1)
plt.scatter(X, y)
plt.show()
Example #42
class StochasticGradientBoostingDemo:
    def make_prediction(self, data, dataset):
        """Split the (X, y) tuple, fit the model chosen by get_model,
        and evaluate it on the held-out test set."""
        X, y = data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=RANDOM_SEED)
        model = self.get_model(dataset)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        # Evaluate the model performance.
        if dataset == 'flowers':
            accuracy = accuracy_score(y_test, predictions)
            print('Classifier accuracy {:.2f}'.format(accuracy))
        else:
            rmse = sqrt(mean_squared_error(y_test, predictions))
            print('Regression root mean squared error {:.2f}'.format(rmse))

    @staticmethod
    def get_model(dataset):
        """Train a classifier or a regression model with a sklearn algorithm.
        Note that there are MANY hyperparameters you pass into these models.
        Refer to the online sklearn docs for more information."""
        if dataset == 'flowers':
            return GradientBoostingClassifier(random_state=RANDOM_SEED)
        else:
            return GradientBoostingRegressor(random_state=RANDOM_SEED)


if __name__ == "__main__":
    # Get some sample data from sklearn datasets. Setting return_X_y to True will
    # constrain the output to be a tuple containing only the data and the targets.
    flower_data = datasets.load_iris(return_X_y=True)
    housing_data = datasets.load_boston(return_X_y=True)

    # Predict with the two models and the two datasets.
    predictor = StochasticGradientBoostingDemo()
    predictor.make_prediction(flower_data, 'flowers')
    predictor.make_prediction(housing_data, 'housing')
Example #43
def X_boston():
    return datasets.load_boston().data[rows, :2]
Example #44
def y_boston():
    return pd.Series(datasets.load_boston().target[rows], name='target')
Example #45
#!/usr/bin/env python3
# _*_ coding:utf-8 _*_

import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import graphviz

from xgboost import plot_tree
from sklearn.datasets import load_boston
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


boston = load_boston()
data = boston['data']
label = boston['target']

data = np.delete(data, [1, 2, 3, 8, 9], axis=1)


x_train, x_test, y_train, y_test = \
    train_test_split(data, label, test_size=0.1)


dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)

############################### origin API ##########################
param = {
    # General Parameters
    'booster': 'gbtree',  # booster
Example #46
def load_data():
    """Load the Boston dataset."""

    boston = datasets.load_boston()
    return boston
Example #47
#[2] folder 03_dlfs

from Classes.NeuralNetwork import NeuralNetwork
from Classes.SGD import SGD
from Classes.Dense import Dense
from Classes.Sigmoid import Sigmoid
from Classes.Linear import Linear
from Classes.MeanSquaredError import MeanSquaredError
from Classes.Trainer import Trainer
import numpy as np
from numpy import ndarray

### test data ###
from sklearn.datasets import load_boston

boston = load_boston()
data = boston.data
target = boston.target

# Scaling the data
from sklearn.preprocessing import StandardScaler
s = StandardScaler()
data = s.fit_transform(data)


def to_2d_np(a: ndarray, type: str = "col") -> ndarray:

    assert a.ndim == 1, \
    "Input tensors must be 1 dimensional"

    if type == "col":
Example #48
from tinyml.linear_model.SGDRegressor import SGDRegressor as tinymlSGDRegressor
from tinyml.ensemble.GradientBoostingRegressor import GradientBoostingRegressor as tinymlGradientBoostingRegressor
from tinyml.ensemble.RandomForestRegressor import RandomForestRegressor as tinymlRandomForestRegressor
from tinyml.ensemble.XGBRegressor import XGBRegressor as tinymlXGBRegressor
from tinyml.tree.DecisionTreeRegressor import DecisionTreeRegressor as tinymlDecisionTreeRegressor

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression as sklearnLinearRegression
from sklearn.linear_model import SGDRegressor as sklearnSGDRegressor
from sklearn.tree import DecisionTreeRegressor as sklearnDecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor as sklearnRandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor as sklearnGradientBoostingRegressor
from xgboost import XGBRegressor

# tinymlLinearRegression and the train_and_eval helper are defined elsewhere
# in the project this snippet comes from.

if __name__ == '__main__':

    boston_X, boston_y = load_boston(return_X_y=True)

    boston_train_X, boston_test_X, boston_train_y, boston_test_y = train_test_split(
        boston_X, boston_y, test_size=0.3, random_state=0)

    data = boston_train_X, boston_train_y, boston_test_X, boston_test_y

    rmse_tinyml_linear_regression = train_and_eval(data,
                                                   tinymlLinearRegression())
    print('tinyml LinearRegression:', rmse_tinyml_linear_regression)
    rmse_sklearn_linear_regression = train_and_eval(data,
                                                    sklearnLinearRegression())
    print('sklearn LinearRegression:', rmse_sklearn_linear_regression)
    print('\n')
    std_scaler = StandardScaler()
    std_scaler.fit(boston_train_X)
Example #49
def test_regression_boston():
    boston = load_boston()
    data = data_df_from_bunch(boston)
    er = SimpleRegressor()
    er.fit(data, target_col='target')
Example #50
from xgboost import XGBClassifier, XGBRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score,accuracy_score
# ------------------------------------------------------ #

# x, y = load_boston(return_X_y=True)
datasets = load_boston()
x = datasets.data
y = datasets['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, shuffle=True, random_state=77)

#2. model
model = XGBRegressor(n_estimators=1000, learning_rate=0.01, n_jobs=8)

#3. training

model.fit(x_train, y_train, verbose=1, eval_metric=['rmse'],
         eval_set=[(x_train,y_train), (x_test, y_test)],
         early_stopping_rounds=20)

aaa = model.score(x_test, y_test)
print('score : ' , aaa)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print('r2 : ', r2)
Example #51
 def test_shap_interactions(self, client: "Client") -> None:
     from sklearn.datasets import load_boston
     X, y = load_boston(return_X_y=True)
     params = {'objective': 'reg:squarederror'}
     self.run_shap_interactions(X, y, params, client)
Example #52
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import datasets
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn import cross_validation
from sklearn.utils import shuffle
# Obtain each feature's relative (percentage) importance in the dataset;
# useful when distributing and comparing the weights of different indicators.
# Load housing data
housing_data = datasets.load_boston()

# Shuffle the data
X, y = shuffle(housing_data.data, housing_data.target, random_state=7)

# Split data into training and testing datasets
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=7)

# AdaBoost Regressor model
regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                              n_estimators=400,
                              random_state=7)
regressor.fit(X_train, y_train)

# Evaluate performance of AdaBoost regressor
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)
print('\nADABOOST REGRESSOR')
print('Mean squared error =', round(mse, 2))
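The comment above the data-loading step promises per-feature importance percentages, but the snippet stops at the error metrics. A short follow-up sketch using feature_importances_, a standard attribute of fitted AdaBoostRegressor models (the percentage normalisation is our own choice):

# Relative importance of each input feature, as percentages summing to 100
importances = 100.0 * regressor.feature_importances_ / regressor.feature_importances_.sum()
for name, importance in zip(housing_data.feature_names, importances):
    print('{}: {:.2f}%'.format(name, importance))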
Example #53
def data_loader() -> "tuple[np.ndarray, np.ndarray]":
    dataset = load_boston()
    x = dataset.data
    y = dataset.target[:, np.newaxis]
    print("Total samples in our dataset is: {}".format(x.shape[0]))
    return x, y
Example #54
from sklearn.datasets import fetch_20newsgroups, load_boston

#news = fetch_20newsgroups(subset='all')

#print(news.data)
#print(news.target)
lb = load_boston()

print("获取特征值")
print(lb.data)
print("目标值")
print(lb.target)
print(lb.DESCR)
Example #55
from sklearn.datasets import load_iris
from sklearn.datasets import load_boston
from sklearn import tree
import pandas as pd
import tree_extract_rule

boston = load_boston()  # Load Dataset

boston_target = pd.Series(boston.target, name='target')  # target_data

boston_class = pd.Series()  # create a Series for the class labels
for idx, i in enumerate(boston_target):
    if i <= 25:
        boston_class = boston_class.append(pd.Series(['low'], index=[idx]))
    if i > 25:
        boston_class = boston_class.append(pd.Series(['high'], index=[idx]))

boston_class.name = 'target'  # Series of class labels
boston_data = pd.DataFrame(
    boston.data,
    columns=boston.feature_names)  # create DataFrame out of Dataset

liste = boston_data.columns  # read the column names for rule_extraction
blf = tree.DecisionTreeClassifier()  # create the tree (optionally pass class_weight='balanced')
blf = blf.fit(boston_data, boston_class)  # train the tree

rules = tree_extract_rule.extract_rules(
    blf, liste, boston_data,
    boston_class)  # extract rules (optionally pass target_class='high')

r = pd.DataFrame.from_dict(rules)
Example #56
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 29 11:58:46 2020

@author: utkuk
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

X, y = load_boston(return_X_y=True)
boston = load_boston()
X_egitim, X_test, y_egitim, y_test = train_test_split(X,
                                                      y,
                                                      test_size=0.3,
                                                      random_state=3)
# Linear model
lineerModel = LinearRegression()  # create the model object
lineerModel.fit(X_egitim,
                y_egitim)  # fit the model on the training data
lineer_egitim_r2 = lineerModel.score(
    X_egitim, y_egitim)  # the model's R^2 on the training data
lineer_test_r2 = lineerModel.score(
    X_test, y_test)  # the model's R^2 on the test data
print('Linear: training R2 =', lineer_egitim_r2)
print('Linear: test R2 =', lineer_test_r2)
Example #57
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold

from photonai.base import Hyperpipe, PipelineElement, OutputSettings

X, y = load_boston(True)

my_pipe = Hyperpipe(
    name="default_pipe",
    metrics=[
        "mean_absolute_error",
        "mean_squared_error",
        "pearson_correlation",
    ],  # the performance metrics of interest
    best_config_metric="mean_absolute_error",
    eval_final_performance=False,
    inner_cv=KFold(n_splits=10, shuffle=True, random_state=42),
    verbosity=2,
    output_settings=OutputSettings(plots=False, project_folder="./tmp/"),
)

# ADD ELEMENTS TO YOUR PIPELINE
my_pipe += PipelineElement("SimpleImputer",
                           missing_values=np.nan,
                           strategy="median")
my_pipe += PipelineElement("StandardScaler")
my_pipe += PipelineElement("GaussianProcessRegressor")

# NOW TRAIN YOUR PIPELINE
my_pipe.fit(X, y)
Example #58
def load_data():
    boston = datasets.load_boston()
    X = boston.data
    y = boston.target
    features = boston.feature_names
    return X, y, features
Example #59
class RegModelTest(unittest.TestCase):
    # Setup testing data
    x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])
    y = np.array([-1, 0.2, 0.9, 2.1, 3.4, 4.2, 5.6, 6.5, 7.3])
    X_train, X_test, y_train, y_test = train_test_split(x.reshape(-1, 1),
                                                        y,
                                                        random_state=1512)
    boston = load_boston()
    boston_x_train, boston_x_test, boston_y_train, boston_y_test = train_test_split(
        boston.data, boston.target, random_state=1512)
    # Setup Linear data
    regression_single = RegressionModel(X_train, y_train)
    regression_single.ls_fit()

    regression_boston = RegressionModel(boston_x_train, boston_y_train)
    regression_boston.ls_fit()

    # Setup Ridge data
    ridge_single = RegressionModel(X_train, y_train)
    ridge_single.ridge_fit(alpha=0.6)

    ridge_boston = RegressionModel(boston_x_train, boston_y_train)
    ridge_boston.ridge_fit(alpha=0.6)

    # Setup Lasso Data
    lasso_single = RegressionModel(X_train, y_train)
    lasso_single.lasso_fit(alpha=0.6)

    lasso_boston = RegressionModel(boston_x_train, boston_y_train)
    lasso_boston.lasso_fit(alpha=0.6)

    def test_all_fit(self):
        self.assertTrue(len(self.regression_single.coeffs))
        self.assertTrue(len(self.regression_boston.coeffs))

        self.assertTrue(len(self.ridge_single.coeffs))
        self.assertTrue(len(self.ridge_boston.coeffs))

        self.assertTrue(len(self.lasso_single.coeffs))
        self.assertTrue(len(self.lasso_boston.coeffs))

    def test_predict(self):
        """
        Only one model (standard or ridge or lasso) is enough as the previous test checks if the coeffs array
        is not empty
        """
        self.regression_single.predict(self.X_test)
        self.assertTrue(len(self.regression_single.y_pred))
        self.regression_boston.predict(self.boston_x_test)
        self.assertTrue(len(self.regression_boston.y_pred))

    def test_score(self):
        """
        Similarly, score calculation is the same across all models, so one is enough.
        Make a NumPy copy of prediction data (as y_test is a NumPy array) and check if their shapes are equal
        (otherwise score function will not work)
        """
        pred_copy_simple = np.copy(self.regression_single.y_pred)
        pred_copy_boston = np.copy(self.regression_boston.y_pred)

        self.assertEqual(pred_copy_simple.shape, self.y_test.shape)
        self.assertEqual(pred_copy_boston.shape, self.boston_y_test.shape)
Example #60
# Regression

from xgboost import XGBClassifier, XGBRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_selection import SelectFromModel  # selects feature columns by importance
from sklearn.metrics import r2_score, accuracy_score

x, y = load_boston(return_X_y=True)  # sklearn hands back x and y directly

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    shuffle=True,
                                                    random_state=66)

model = XGBRegressor(n_jobs=8)

model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print('r2: ', score)

thresholds = np.sort(
    model.feature_importances_)  # feature importances, sorted ascending
print(thresholds)  # the 13 values sum to 1 (one per column)
# r2:  0.9221188601856797
# [0.00134153 0.00363372 0.01203115 0.01220458 0.01447935 0.01479119
#  0.0175432  0.03041655 0.04246345 0.0518254  0.06949984 0.30128643
#  0.42848358]
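SelectFromModel is imported above but never used. The usual follow-up, sketched here under the same train/test split, loops over the sorted importances, keeps only the features at or above each threshold (prefit=True reuses the already fitted model), and compares R2 scores:

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)

    selection_model = XGBRegressor(n_jobs=8)
    selection_model.fit(select_x_train, y_train)

    y_pred = selection_model.predict(select_x_test)
    score = r2_score(y_test, y_pred)
    print('Thresh=%.3f, n=%d, R2: %.2f%%'
          % (thresh, select_x_train.shape[1], score * 100.0))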