# Imports used by the snippets below; project-specific helpers and filename
# constants (data_cleaner_train, report, modelfit, model_filename,
# scaler_filename, pca_file_name, to_keep_file_name, is_int_fname) are assumed
# to be defined elsewhere in the repository.
import os
import pickle

import joblib
import pandas as pd
from scipy.stats import randint as sp_randint
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier


def train_run():
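    # XGBoost variant: clean the raw AMEX training data, build a class-balanced
    # training set by pairing all positives with an equal-sized sample of
    # negatives (three passes), fit the classifier, print the training ROC AUC,
    # and persist the fitted PCA, scaler and model with joblib.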
    l_reg = XGBClassifier(seed=27, silent=False, verbose_eval=True, subsample=0.8,
                          reg_lambda=5.0, n_estimators=800, min_child_weight=1,
                          max_depth=15, learning_rate=0.03, gamma=0.5,
                          colsample_bytree=0.9, colsample_bylevel=0.7)

    file_path = os.path.join(os.getcwd(), 'input_data/train_amex/train.csv')
    data = pd.read_csv(file_path)
    data, y_hat, scaler, pca = data_cleaner_train(data)
    data.loc[:, 'yhat'] = y_hat
    d1 = data.loc[data.yhat == 1, :]
    d2 = data.loc[data.yhat == 0, :]
    data_op = pd.DataFrame()

    for i in range(3):
        print(i)
        d1_sample = d1.sample(frac=1)
        d2_sample = d2.sample(n=len(d1_sample), replace=False)
        data_tmp = pd.concat([d1_sample, d2_sample])
        data_op = pd.concat([data_op, data_tmp], ignore_index=True)

    data_op = data_op.sample(frac=1)
    yhat_op = data_op.loc[:, 'yhat']

    data_op.drop('yhat', axis=1, inplace=True)
    data.drop('yhat', axis=1, inplace=True)

    # # specify parameters and distributions to sample from
    # one_to_left = st.beta(10, 1)
    # from_zero_positive = st.expon(0, 50)
    #
    # param_dist = {
    #     'silent': [False],
    #     'max_depth': [15, 10,15],
    #     'learning_rate': [0.03, 0.05, 0.1],
    #     'subsample': [0.7, 0.8, 0.9, 1.0],
    #     'colsample_bytree': [0.7, 0.8, 0.9],
    #     'colsample_bylevel': [0.7, 0.8, 0.9],
    #     'min_child_weight': [0.5, 1.0, 3.0, 7.0, 10.0],
    #     'gamma': [0.25, 0.5, 1.0],
    #     'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0],
    #     'n_estimators': [200, 500, 800]}
    #
    # # run randomized search
    # n_iter_search = 10
    # random_search = RandomizedSearchCV(l_reg, param_distributions=param_dist,
    #                                    n_iter=n_iter_search, cv=3, scoring='roc_auc', n_jobs=3)
    #
    # model = random_search.fit(data_op.values, yhat_op.values)
    # report(random_search.cv_results_)

    model = l_reg.fit(data_op.values, yhat_op.values)

    output = model.predict(data.values)
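    # Note: predict() above returns hard 0/1 class labels, so the ROC AUC below
    # is computed on labels; predict_proba(...)[:, 1] would give a
    # probability-based (ranking) AUC instead.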

    roc = roc_auc_score(y_hat.values, output)
    print('#############################################################')
    print(roc)
    joblib.dump(pca, pca_file_name)
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)
def train_run():
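    # Random-forest variant: same balanced-undersampling pipeline (five passes),
    # but the hyper-parameters are tuned with RandomizedSearchCV (5-fold CV,
    # ROC AUC scoring) before the training-set evaluation.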
    l_reg = RandomForestClassifier(random_state=0, n_jobs=3, verbose=3)
    file_path = os.path.join(os.getcwd(), 'input_data/train_amex/train.csv')

    data = pd.read_csv(file_path)

    data, y_hat, scaler = data_cleaner_train(data)

    data.loc[:, 'yhat'] = y_hat
    d1 = data.loc[data.yhat == 1, :]
    d2 = data.loc[data.yhat == 0, :]

    data_op = pd.DataFrame()

    for i in range(5):
        print(i)
        d1_sample = d1.sample(frac=1, replace=False)
        d2_sample = d2.sample(n=len(d1_sample), replace=False)
        data_tmp = pd.concat([d1_sample, d2_sample])
        data_op = pd.concat([data_op, data_tmp], ignore_index=True)

    data_op = data_op.sample(frac=1)
    yhat_op = data_op.loc[:, 'yhat']

    # predictors = list(data_op.drop('yhat', axis=1).columns)
    # modelfit(l_reg, data_op, predictors, performCV=True, printFeatureImportance=True, cv_folds=5)

    data_op.drop('yhat', axis=1, inplace=True)
    data.drop('yhat', axis=1, inplace=True)

    # specify parameters and distributions to sample from
    param_dist = {
        "max_depth": [3, 10, 20, 30, 40, 50],
        "max_features": sp_randint(5, 50),
        "min_samples_split": sp_randint(2, 20),
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"]
    }

    # run randomized search
    n_iter_search = 10
    random_search = RandomizedSearchCV(l_reg,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=5,
                                       scoring='roc_auc',
                                       n_jobs=1)

    model = random_search.fit(data_op.values, yhat_op.values)
    report(random_search.cv_results_)

    # model = l_reg.fit(data_op.values, yhat_op.values)
    output = model.predict(data.values)
    roc = roc_auc_score(y_hat.values, output)
    print(roc)
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)
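# Hedged sketch: `report` is called by the randomized-search variant above but
# is not defined in this snippet. A minimal version, modelled on the utility
# from the scikit-learn RandomizedSearchCV example (name, signature and ranking
# logic are assumptions), could look like this:
import numpy as np


def report(results, n_top=3):
    # Print mean/std validation score and parameters for the top-ranked candidates.
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print('Model with rank: {0}'.format(i))
            print('Mean validation score: {0:.3f} (std: {1:.3f})'.format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print('Parameters: {0}'.format(results['params'][candidate]))
            print('')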
def train_run():
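    # Gradient-boosting variant: a single balanced sample of positives and
    # negatives, fixed hyper-parameters, a training ROC AUC report, and the
    # same joblib persistence of PCA, scaler and model.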
    l_reg = GradientBoostingClassifier(max_features=None,
                                       subsample=0.80,
                                       random_state=0,
                                       verbose=True,
                                       learning_rate=0.05,
                                       n_estimators=200,
                                       min_samples_split=500,
                                       min_samples_leaf=500,
                                       max_depth=15)
    file_path = os.path.join(os.getcwd(), 'input_data/train_amex/train.csv')
    data = pd.read_csv(file_path)
    data, y_hat, scaler, pca = data_cleaner_train(data)
    data.loc[:, 'yhat'] = y_hat
    d1 = data.loc[data.yhat == 1, :]
    d2 = data.loc[data.yhat == 0, :]
    data_op = pd.DataFrame()

    for i in range(1):
        print(i)
        d1_sample = d1.sample(frac=1)
        d2_sample = d2.sample(n=len(d1_sample), replace=False)
        data_tmp = pd.concat([d1_sample, d2_sample])
        data_op = pd.concat([data_op, data_tmp], ignore_index=True)

    data_op = data_op.sample(frac=1)
    yhat_op = data_op.loc[:, 'yhat']

    data_op.drop('yhat', axis=1, inplace=True)
    data.drop('yhat', axis=1, inplace=True)

    # # specify parameters and distributions to sample from
    # param_dist = {'max_depth': range(10, 30, 5),
    #               'min_samples_split': range(100, 1000, 200),
    #               'n_estimators': range(100, 1000, 300),
    #               'min_samples_leaf': range(100, 1000, 200)}
    #
    # # run randomized search
    # n_iter_search = 10
    # random_search = RandomizedSearchCV(l_reg, param_distributions=param_dist,
    #                                    n_iter=n_iter_search, cv=3, scoring='roc_auc', n_jobs=3)
    #
    # model = random_search.fit(data_op.values, yhat_op.values)
    # report(random_search.cv_results_)

    model = l_reg.fit(data_op.values, yhat_op.values)

    output = model.predict(data.values)

    roc = roc_auc_score(y_hat.values, output)
    print('#############################################################')
    print(roc)
    joblib.dump(pca, pca_file_name)
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)
def train_run():
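    # Random-forest variant with historical user logs: the cleaner also returns
    # feature-selection artifacts that are pickled, the balanced training set is
    # built by resampling with replacement (50 passes of 10k rows per class),
    # and the model is evaluated on the shuffled full training set.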
    l_reg = RandomForestClassifier(n_estimators=500,
                                   max_depth=30,
                                   random_state=0,
                                   n_jobs=7,
                                   verbose=2)
    file_path = os.path.join(os.getcwd(), 'input_data/train_amex/train.csv')
    userlogs_path = os.path.join(
        os.getcwd(), 'input_data/train_amex/historical_user_logs.csv')
    data = pd.read_csv(file_path)
    userlogs = pd.read_csv(userlogs_path)

    data, y_hat, scaler, to_keep_prod_2, is_interesting = data_cleaner_train(
        data, userlogs)
    with open(to_keep_file_name, 'wb') as f:
        pickle.dump(to_keep_prod_2, f)
    is_interesting.to_pickle(is_int_fname)

    data.loc[:, 'yhat'] = y_hat
    d1 = data.loc[data.yhat == 1, :]
    d2 = data.loc[data.yhat == 0, :]

    data_op = pd.DataFrame()
    yhat_op = pd.DataFrame()

    for i in range(50):
        print(i)
        d1_sample = d1.sample(n=10000, replace=True)
        d2_sample = d2.sample(n=len(d1_sample), replace=True)
        data_tmp = pd.concat([d1_sample, d2_sample])
        data_op = pd.concat([data_op, data_tmp], ignore_index=True)
    yhat_op = data_op.loc[:, 'yhat']
    data_op.drop('yhat', axis=1, inplace=True)
    data.drop('yhat', axis=1, inplace=True)
    data = data.sample(frac=1)
    # Keep the labels aligned with the shuffled rows so the ROC AUC below is
    # computed against the right targets (assumes y_hat is an index-aligned Series).
    y_hat = y_hat.loc[data.index]
    model = l_reg.fit(data_op.values, yhat_op.values)
    output = model.predict(data.values)
    # output[output > 0.5] = 1
    # output[output < 0.5] = 0
    roc = roc_auc_score(y_hat.values, output)
    print(roc)
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)
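# Hedged usage note: the persisted artifacts can be reloaded later for inference
# with joblib, e.g. model = joblib.load(model_filename) and
# scaler = joblib.load(scaler_filename), assuming the same module-level filename
# constants and that prediction-time data goes through the matching cleaning step.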