Code example #1
# ML
# Imports assumed by this excerpt (df comes from the project's data loader)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

from modeling import score_util

areas = df.pop('Area')
y = df['Dry_Yield']
X = df.drop(['Dry_Yield'], axis=1)
X_train, X_validation, y_train, y_validation = \
    train_test_split(X, y, test_size=.2, random_state=7)

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

nn = MLPRegressor(random_state=7, verbose=99, max_iter=5000)
nn.fit(X_train_scaled, y_train)

scr = score_util.score(nn, scaler, X_validation, y_validation)
print(scr)

# grid = GridSearchCV(
#     estimator=nn,
#     param_grid={
#         'activation': ['relu', 'tanh', 'logistic'],
#         'solver': ['adam'],
#         'alpha': [.0000001, .000001, .00001, .0001],
#         'learning_rate_init': [.0000001, .000001, .00001, .0001],
#         'hidden_layer_sizes': [
#             (2048, 1024, 512),
#             (512, 256, 128, 32, 16),
#             (256, 128, 32, 16, 8, 4),
#         ]},
#     verbose=100)
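
Every example on this page calls score_util.score(model, transform, X, y), but the helper itself is not shown. A minimal sketch of what it plausibly does, assuming it applies the optional scaler/pipeline before predicting and returns standard regression metrics (the exact metrics are an assumption, not taken from the project):

# Hypothetical stand-in for modeling/score_util.py -- not the project's actual code
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

def score(model, transform, X, y):
    # Apply the fitted scaler/pipeline when one is supplied; call sites pass
    # None when the model is itself a pipeline that already scales.
    X_t = transform.transform(X) if transform is not None else X
    predictions = model.predict(X_t)
    rmse = float(np.sqrt(mean_squared_error(y, predictions)))
    return {'rmse': rmse, 'r2': r2_score(y, predictions)}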
Code example #2
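This excerpt starts inside a Keras model factory whose upper half is cut off. A minimal sketch of the missing imports and function head, assuming the 2017-era Keras 2.x API suggested by the result paths below (the layer sizes are illustrative, not from the original):

import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold


def baseline_model():
    # Hypothetical head for the truncated factory: a small dense regression net
    model = Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(1))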
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
# evaluate model
estimator = KerasRegressor(build_fn=baseline_model,
                           epochs=10000,
                           batch_size=17000,
                           verbose=1)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
estimator.fit(
    scaler.transform(X_train),
    y_train.values,
    callbacks=[
        ModelCheckpoint(
            './results/20170824_tf_dnn_reg/keras/val_acc_best.chkpnt',
            monitor="loss",
            save_best_only=True,
            save_weights_only=False,
            verbose=5),
        EarlyStopping(monitor='loss', min_delta=.0001, patience=25, verbose=5)
    ])

estimator.model.load_weights(
    './results/20170824_tf_dnn_reg/keras/val_acc_best.chkpnt')
scr = score_util.score(estimator, scaler, X_validation, y_validation)
print(scr)
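
Note that kfold above is defined but never used in this excerpt; presumably it was intended for cross-validating the estimator, along these lines (hypothetical, not in the original):

from sklearn.model_selection import cross_val_score

# Clone-and-fit the wrapped Keras model on each fold of the scaled training data
results = cross_val_score(estimator, scaler.transform(X_train), y_train.values,
                          cv=kfold, scoring='neg_mean_squared_error')
print(f'CV MSE: {-results.mean():.2f} (+/- {results.std():.2f})')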
Code example #3
File: main.py  Project: dfaivre/python-ml-poc-2018
    X_train_split, y_train_split = X.iloc[train_split_idx], y.iloc[
        train_split_idx]
    X_test_split, y_test_split = X.iloc[test_split_idx], y.iloc[test_split_idx]

    log.info("fitting transforms")
    transform_pipe.fit(X_train_split)
    log.info("transforming")
    X_train_transformed = transform_pipe.transform(X_train_split)

    logging.info("Running on input data shape: %s", X_train_transformed.shape)
    # model = ExtraTreesRegressor(verbose=99, n_jobs=4)
    # model = MLPRegressor(verbose=99, max_iter=150, tol=.01, learning_rate='constant', alpha=.1)
    model = GradientBoostingRegressor(verbose=99,
                                      n_estimators=100,
                                      max_depth=8)
    model.fit(X_train_transformed, y_train_split)
    logging.info("Scoring")
    scr = score_util.score(model, transform_pipe, X_test_split, y_test_split)

    joblib.dump(scr, os.path.join(env.result_path, f"score_{i}.pickle"))
    joblib.dump(model, os.path.join(env.result_path, f"model_{i}.pickle"))

    scores.append(scr)

# for score in scores:
#     logging.info(score)

combined_score = score_util.combine(scores)
logging.info("kfold scores combined: %s", combined_score)
joblib.dump(combined_score,
            os.path.join(env.result_path, "combined_score.pickle"))
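
score_util.combine is likewise project-specific; a plausible minimal sketch, assuming each fold's score is a dict of metrics like the score() sketch above returns:

# Hypothetical companion to score(): average each metric across the folds
def combine(scores):
    return {key: sum(s[key] for s in scores) / len(scores)
            for key in scores[0]}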
Code example #4
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from data_scripts import pcs_data_loader as dl
from modeling import score_util

result_path = './results/20170828_et_optimizer'

df = dl.shape_pps_data(dl.load_corn_rows_mssql())

# ML
areas = df.pop('Area')
y = df['Dry_Yield'].values
X = df.drop(['Dry_Yield'], axis=1).values

kcv = KFold(n_splits=5, shuffle=True, random_state=971)
pipeline = make_pipeline(
    StandardScaler(), ExtraTreesRegressor(n_jobs=2,
                                          verbose=99,
                                          n_estimators=10))
scores = []
for train_split_idx, test_split_idx in kcv.split(X):
    X_train_split, y_train_split = X[train_split_idx], y[train_split_idx]
    X_test_split, y_test_split = X[test_split_idx], y[test_split_idx]

    pipeline.fit(X_train_split, y_train_split)
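    # The pipeline already scales, so no separate transform is passed to score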
    scr = score_util.score(pipeline, None, X_test_split, y_test_split)
    scores.append(scr)
    print(scr)
Code example #5
# Imports assumed by this excerpt (mirroring the import block in example #4)
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

from data_scripts import pcs_data_loader as dl
from modeling import score_util

df = dl.load_corn_data_frame()

# ML
areas = df.pop('Area')
y = df['Dry_Yield']
X = df.drop(['Dry_Yield'], axis=1)
X_train, X_validation, y_train, y_validation = \
    train_test_split(X, y, test_size=.2, random_state=7)

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

extra_trees = ExtraTreesRegressor(n_jobs=-1, verbose=True)
extra_trees.fit(X_train_scaled, y_train)
scr = score_util.score(extra_trees, scaler, X_validation, y_validation)

grid_search_cv = GridSearchCV(
    estimator=extra_trees,
    param_grid={"n_estimators": [5, 10, 15, 20, 25, 30, 35, 40]},
    error_score=0,
    n_jobs=2,
    verbose=99)

grid_search_cv.fit(X_train_scaled, y_train)
print(grid_search_cv.best_params_)

scr = score_util.score(grid_search_cv.best_estimator_, scaler, X_validation,
                       y_validation)
print(scr)
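
Since GridSearchCV refits the best parameter combination on the full training set by default (refit=True), best_estimator_ can be scored directly as above.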
Code example #6
File: scratch.py  Project: dfaivre/python-ml-poc-2018
# ML
areas = df.pop('Area')
y = df['Dry_Yield']
X = df.drop(['Dry_Yield'], axis=1)
X_train, X_validation, y_train, y_validation = \
    train_test_split(X, y, test_size=.3, random_state=7)

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

logger.info("Fitting extra trees")
extra_trees = ExtraTreesRegressor(n_jobs=-1, verbose=True)
extra_trees.fit(X_train_scaled, y_train)
scr = score_util.score(extra_trees, scaler, X_validation, y_validation)

# explore
# seed = 11
# num_folds = 3
# model = Pipeline([('Scaler', StandardScaler()), ('ET', ExtraTreesRegressor())])
# kfold = KFold(n_splits=num_folds, random_state=seed)
#
# scoring = 'neg_mean_squared_error'
# cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=-1)


def _score(m):
    # Use the scaler fitted on the training data; refitting it on the
    # validation set would leak validation statistics into the score.
    predictions = m.predict(scaler.transform(X_validation))
    mean_sq = mean_squared_error(y_validation, predictions)
    return mean_sq
Code example #7
with open(model_path_, 'wb') as f:
    pickle.dump(model, f)
    print(f'model saved: {model_path_}')

scaler_path_ = f'{result_base_path}/et_scaler_{run_id}.pickle'
with open(scaler_path_, 'wb') as f:
    pickle.dump(scaler, f)
    print(f'scaler saved: {scaler_path_}')

results = []
for idx, elb_data in enumerate(
        sql_to_scikit_converter.load_cached_elbs(df.columns)):
    year_id, elb_X, elb_y, extra_cols = elb_data
    print(f'comparing elb year id: {year_id}, index: {idx}')

    elb_score = score_util.score(model, scaler, elb_X, elb_y)
    print(elb_score)

    results.append((year_id, elb_score, extra_cols))

rdf: pandas.DataFrame = pandas.concat([
    pandas.DataFrame([_id for (_id, _, _) in results], columns=['year_id']),
    score_util.create_data_frame([scr for (_, scr, _) in results]),
    pandas.DataFrame(
        pandas.Series([c for (_, _, c) in results], name='extra_cols')),
], axis=1)

os.makedirs(result_base_path, exist_ok=True)

rdf.to_csv(f'{result_base_path}/elb_harvest_predictions_results_{run_id}.csv')
Code example #8
    ], axis=1)

    # remove any extra enum dummy columns in elb (that training isn't aware of)
    elb_extra_cols = set(pps_elb_cells.columns) - train_cols
    if any(elb_extra_cols):
        print(
            f"WARNING: ELB has unknown training enum (dummy) cols: {','.join(elb_extra_cols)}"
        )
        pps_elb_cells.drop(elb_extra_cols, axis=1, inplace=True)

    elb_y = pps_elb_cells['Dry_Yield']
    elb_X = pps_elb_cells.drop(['Dry_Yield', 'Area'], axis=1)
    # order columns to match training
    elb_X = elb_X[X.columns]
    elb_score = score_util.score(extra_trees, scaler, elb_X, elb_y)
    print(elb_score)

    results.append((year_id, elb_score, elb_extra_cols))

rdf: pandas.DataFrame = pandas.concat([
    pandas.DataFrame([_id for (_id, _, _) in results], columns=['year_id']),
    score_util.create_data_frame([scr for (_, scr, _) in results]),
    pandas.DataFrame(
        pandas.Series([c for (_, _, c) in results], name='extra_cols')),
], axis=1)

rdf.to_csv(
    './results/20170823_elb_predictions/elb_harvest_predictions_results_{:%Y%m%d%H%M}.csv'
    .format(datetime.now()))
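
One caveat with the column alignment above: elb_X = elb_X[X.columns] raises a KeyError if the ELB data is missing any training column. A reindex handles both extra and missing dummy columns in one step (a suggested alternative, not in the original):

# Drop the target/area columns, then align to the training columns; dummy
# columns absent from the ELB data are filled with zeros rather than raising
elb_X = pps_elb_cells.drop(['Dry_Yield', 'Area'], axis=1).reindex(
    columns=X.columns, fill_value=0)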