Example #1
async def preprocess(ctx):
    # pre() is the project's preprocessing entry point (import not shown here).
    await ctx.send("Running preprocessing...")
    try:
        pre()
    except Exception as e:
        await ctx.send('ERROR:\n' + str(e))
        return  # stop here so a failure is not also reported as a success
    await ctx.send("Completed running preprocessing.")
Example #2
import time

import pandas as pd

# pre (a preprocessing class) and tp (a train/predict class) are project
# helpers; their imports are not shown in this excerpt.


def learning_process_final(train_path, test_path, sample_path, clf_code,
                           n_estimators, num_leaves, learning_rate, top_k):
    """Returns a pd.DataFrame for test prediction.

    Parameters
    ----------
    train_path : str
        The path of the train dataset.
    test_path : str
        The path of the test dataset.
    sample_path : str
        The path of the sample csv.
    clf_code : str
        The classifier type for the model.
    n_estimators : int
        The number of estimators for the model.
    num_leaves : int
        The number of leaf nodes for the model.
    learning_rate : float
        The learning rate of the model.
    top_k : int
        The number of candidate estimators used for the final prediction.
    """

    start = time.time()

    train_cls = pre(train_path, True)
    test_cls = pre(test_path, False)

    train_df = train_cls.main()
    test_df = test_cls.main()

    sample_df = pd.read_csv(sample_path)

    print('data_loading_time : %.2f' % (time.time() - start))

    start = time.time()
    main_cls = tp(train_df=train_df,
                  test_df=test_df,
                  sample_df=sample_df,
                  clf_code=clf_code,
                  n_estimators=(n_estimators, n_estimators),
                  num_leaves=(num_leaves, num_leaves),
                  lr=learning_rate,
                  top_k=top_k)

    result_sample = main_cls.main_experience()

    print('learning_time : %.2f' % (time.time() - start))

    return result_sample
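A hypothetical invocation; every path and hyperparameter below is chosen purely for illustration and depends on what the project's tp helper accepts:

# Illustrative values only; adjust to the actual dataset and classifier code.
result = learning_process_final(train_path='data/train.csv',
                                test_path='data/test.csv',
                                sample_path='data/sample_submission.csv',
                                clf_code='lgb',
                                n_estimators=500,
                                num_leaves=31,
                                learning_rate=0.05,
                                top_k=5)
result.to_csv('submission.csv', index=False)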
Example #3
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV as gs
from sklearn.neural_network import MLPRegressor as mlp

# pre() is the project's preprocessing helper; its import is not shown in
# this excerpt, and the mlp/gs aliases above are inferred from usage.


def MLP(data_directory, model_dir, features):
    X_train, X_test, y_train, y_test, predict_X, features = pre(
        data_directory, features)
    os.chdir(model_dir)
    model = mlp(random_state=1, max_iter=10000)
    grid = gs(estimator=model,
              param_grid={
                  'hidden_layer_sizes': [(500, 500)],
                  'activation': ['logistic', 'tanh', 'relu'],
                  # ≈ 1e-8 … 1e-1: exp(2.303k) ≈ 10^k since ln(10) ≈ 2.3026
                  'alpha': np.exp(2.303 * np.arange(-8, 0)),
                  'learning_rate': ['constant']
              },
              cv=5,
              n_jobs=6)
    grid.fit(X_train, y_train)
    print(grid.best_params_)
    test_score = grid.best_estimator_.score(X_test, y_test)
    print(test_score)

    # Persist the best model, encoding feature count and test score in the name.
    joblib.dump(grid.best_estimator_,
                'mlp_%d_%.4f.m' % (len(features), test_score))

    df = pd.DataFrame(columns=['ml_bandgap', 'pbe_bandgap'])
    df['pbe_bandgap'] = y_test
    df['ml_bandgap'] = grid.best_estimator_.predict(X_test)
    print(df)
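The alpha grid above, np.exp(2.303 * np.arange(-8, 0)), approximates the decades 1e-8 through 1e-1 because ln(10) ≈ 2.3026. np.logspace states the same intent directly:

import numpy as np

# Eight regularization strengths, one per decade from 1e-8 to 1e-1.
alphas = np.logspace(-8, -1, num=8)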
Example #4
import os

import joblib
import pandas as pd
from sklearn.model_selection import GridSearchCV as gs
from sklearn.tree import DecisionTreeRegressor as dt

# pre() is the project's preprocessing helper; its import is not shown in
# this excerpt, and the dt/gs aliases above are inferred from usage.


def DecisionTree(data_directory, model_dir, features):
    X_train, X_test, y_train, y_test, predict_X, features = pre(
        data_directory, features)
    os.chdir(model_dir)
    model = dt(random_state=1)
    grid = gs(estimator=model,
              param_grid={
                  # legacy criterion names; sklearn >= 1.0 spells these
                  # 'squared_error', 'friedman_mse', 'absolute_error'
                  'criterion': ['mse', 'friedman_mse', 'mae'],
                  'splitter': ['best', 'random'],
                  'max_features': ['auto', 'sqrt', 'log2']
              },
              cv=5)
    grid.fit(X_train, y_train)
    print(grid.best_params_)
    test_score = grid.best_estimator_.score(X_test, y_test)
    print(test_score)

    # Persist the best model, encoding feature count and test score in the name.
    joblib.dump(grid.best_estimator_,
                'dtr_%d_%.4f.m' % (len(features), test_score))

    df = pd.DataFrame(columns=['ml_bandgap', 'pbe_bandgap'])
    df['pbe_bandgap'] = y_test
    df['ml_bandgap'] = grid.best_estimator_.predict(X_test)
    print(df)
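Both builders share the same call shape. A hypothetical invocation (directories and feature names are placeholders; pre() defines what is actually accepted):

# Each builder os.chdir()s into model_dir, so absolute paths are safest
# when calling both in a row.
features = ['feature_a', 'feature_b']
MLP('/data/bandgap', '/models/mlp', features)
DecisionTree('/data/bandgap', '/models/dtr', features)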
Example #5
import time

# pre (a preprocessing class) and vp (a validation-process class) are
# project helpers; their imports are not shown in this excerpt.


def learning_process_vali(train_path, clf_code, n_estimators, num_leaves,
                          learning_rate, top_k):
    """Returns a pd.DataFrame for validation results.

    Parameters
    ----------
    train_path : str
        The path of the train dataset.
    clf_code : str
        The classifier type for the model.
    n_estimators : int
        The number of estimators for the model.
    num_leaves : int
        The number of leaf nodes for the model.
    learning_rate : float
        The learning rate of the model.
    top_k : int
        The number of candidate estimators used for the final prediction.
    """

    start = time.time()

    train_cls = pre(train_path, True)
    train_df = train_cls.main()

    print('data_loading_time : %.2f' % (time.time() - start))

    #####
    start = time.time()

    main_cls = vp(base_df=train_df,
                  clf_code=clf_code,
                  n_estimators=n_estimators,
                  num_leaves=num_leaves,
                  lr=learning_rate,
                  top_k=top_k)

    result_sample = main_cls.process_learning()

    print('learning_time : %.2f' % (time.time() - start))

    return result_sample
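A hypothetical validation run; the values below are illustrative only and clf_code depends on the project's vp helper:

vali_df = learning_process_vali(train_path='data/train.csv',
                                clf_code='lgb',
                                n_estimators=500,
                                num_leaves=31,
                                learning_rate=0.05,
                                top_k=5)
print(vali_df.head())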
Example #6
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd

from preprocessing import preprocessing as pre
from to_csv import to_csv

# Load the preprocessed data
train_x, val_x, test_x, train_y, val_y = pre()

# Standardize: fit the scaler on the training set only, then apply everywhere
ss = StandardScaler()
train_x = ss.fit_transform(train_x)
val_x = ss.transform(val_x)
test_x = ss.transform(test_x)

# 10-fold cross-validation; per-fold predictions are collected in the dicts below.
kf = KFold(n_splits=10, shuffle=False)
fold = 0
train_array = []
val_dic = {}
test_dic = {}
val_dic['id'] = pd.read_csv(r'../ntut-ml-2020-regression/valid-v3.csv').id
test_dic['id'] = pd.read_csv(r'../ntut-ml-2020-regression/test-v3.csv').id

for train_index, test_index in kf.split(train_x):
    # print('train_index:%s , test_index: %s ' % (train_index, test_index))
    trainfold_x = train_x[train_index]
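    # --- Assumed continuation (not in the original excerpt): a loop like this
    # typically fits one XGBRegressor per fold and stores per-fold predictions
    # for later averaging. Hyperparameters below are illustrative only. ---
    trainfold_y = np.asarray(train_y)[train_index]  # assumes positional labels
    foldval_x = train_x[test_index]
    foldval_y = np.asarray(train_y)[test_index]

    model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
    model.fit(trainfold_x, trainfold_y)
    print('fold %d r2: %.4f' % (fold, r2_score(foldval_y, model.predict(foldval_x))))

    # One prediction column per fold; averaging across folds gives the ensemble.
    val_dic['fold_%d' % fold] = model.predict(val_x)
    test_dic['fold_%d' % fold] = model.predict(test_x)
    fold += 1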