Example #1
def ModelPredict(para):

    # para=[7.16, 0.4, 0.31,0.9,2.7,1.48, 0.78, 0.86]
    data = pd.DataFrame(columns=('R', 'angle', 'occusion', 'score'))
    print("预测开始:")

    for i in range(int(len(para) / 4)):
        print(i * 4)
        data.loc[i] = para[i * 4:i * 4 + 4]
    print(data)
    y_test = data.pop('score')
    x_test = data
    print(x_test)
    print(y_test)
    cab, lgb, xgb, gbdt, stack_lr = LoadModel()

    print("加载完毕:")
    y_pred_cab_test = cab.predict(x_test)
    y_pred_lgb_test = lgb.predict(x_test)
    y_pred_xgb_test = xgb.predict(x_test)
    y_pred_gbdt_test = gbdt.predict(x_test)

    print("stack")
    stack_x_test = pd.DataFrame()
    stack_x_test['Method_1'] = y_pred_cab_test
    stack_x_test['Method_2'] = y_pred_lgb_test
    stack_x_test['Method_3'] = y_pred_xgb_test
    stack_x_test['Method_4'] = y_pred_gbdt_test
    stack_pred = stack_lr.predict(stack_x_test)
    print("stack_mae:",
          mean_absolute_error(y_test, stack_pred))  #mae:2.1501818709279975
    print(stack_pred.tolist())
    return stack_pred.tolist()
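For context, LoadModel() is not shown in this example. A minimal sketch of what it might look like, assuming the five fitted models were pickled to disk (the file names here are hypothetical):

import pickle

def LoadModel():
    # hypothetical paths; the original example does not show where the models live
    names = ['cab.pkl', 'lgb.pkl', 'xgb.pkl', 'gbdt.pkl', 'stack_lr.pkl']
    models = []
    for name in names:
        with open(name, 'rb') as f:
            models.append(pickle.load(f))
    return tuple(models)  # cab, lgb, xgb, gbdt, stack_lr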
Example #2
def main_gbmclassifier(datastruct, experiment_id=None):
    print("Light GBM model")
    mlflow.set_experiment("Light GBM Experiments")
    df, train_x, train_y, test_x, test_y = datastruct

    train_data = lightgbm.Dataset(train_x, label=train_y)
    test_data = lightgbm.Dataset(test_x, label=test_y)

    metrics = {}

    with mlflow.start_run():
        print("Training model")
        start_timer = time.time()

        parameters = {
            'application': 'binary',
            'objective': 'binary',
            'metric': 'auc',
            'is_unbalance': 'true'
        }

        model = lightgbm.train(parameters, train_data, valid_sets=test_data)
        pred_y = model.predict(test_x)  # train() returns a Booster; the module itself has no predict()

        # train 200 small models
        # models = []
        # for var in train_x.columns:
        #     sys.stdout.write('\r')
        #     #base_estimator = DecisionTreeClassifier(min_samples_leaf=base_min_samples_leaf, random_state=0)
        #     model = lightgbm.train(parameters, train_data, valid_sets=test_data)
        #     models.append(model)
        #     sys.stdout.write('> {} / 200'.format(len(models)))
        #     sys.stdout.flush()

        stop_timer = time.time()
        print("Model trained")

        # predictions = [m.predict_proba(x.reshape(-1,1))[:,1] for (m, x) in zip(models, test_x.values.T)]

        # pred_y = np.array(predictions).T.mean(axis=1)
        # pred_y_logit = logit(np.array(predictions).T).sum(axis=1)

        metrics['roc_auc'] = roc_auc_score(test_y, pred_y)
        # metrics['roc_auc_logit'] = roc_auc_score(test_y, pred_y_logit)  # only defined by the commented-out per-feature models above
        metrics['elapsed_time'] = (stop_timer - start_timer)

        #mlflow logging
        mlflow.log_param('model_type', "LightGBM binary classifier")
        mlflow.log_param('features', train_x.columns)
        mlflow.log_param('sample_size', df.shape)
        # mlflow.log_param('min_samples_leaf', base_min_samples_leaf)  # undefined here; leftover from the per-feature variant
        # mlflow.log_param('n_estimators', n_estimators)
        mlflow.log_metrics(metrics)

        print("Completed")
Example #3
def test_model(lgbm_model, data_dir):
    """
    Test the LightGBM model on the EMBER dataset, using its vectorized features
    """
    # Read data
    X_test, y_test = read_vectorized_features(data_dir, subset="test")

    # Filter unlabeled data
    test_rows = (y_test != -1)

    test_features = X_test[test_rows]
    test_labels = y_test[test_rows]

    test_predictions = lgbm_model.predict(test_features)

    return test_predictions
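Typical usage, assuming the EMBER data directory and a LightGBM model file saved with Booster.save_model (both paths hypothetical):

import lightgbm as lgb

booster = lgb.Booster(model_file='ember_model.txt')
preds = test_model(booster, data_dir='./ember2018/')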
Example #4
def style_predict(palette):
    def sortByLight2(elem):
        hls = colorsys.rgb_to_hls(*elem)
        return hls[1]  # sort key: lightness
    # build a color palette, brightest colors first
    palette.sort(key=sortByLight2, reverse=True)
    palette1 = [*palette[0], *palette[1], *palette[2], *palette[3], *palette[4]]
    # Booster.predict expects 2-D input, so wrap the flattened palette as a single row
    x = pd.DataFrame([palette1], dtype='float64')
    y_pred = gbm.predict(x, num_iteration=gbm.best_iteration)
    print("Probability:", y_pred)
    y_pred = y_pred.tolist()[0]
    style = ['cute', 'fresh', 'technology']
    if max(y_pred) < EPS:
        return -1
    pred_Y = y_pred.index(max(y_pred))
    return style[pred_Y]
Example #5
def evaluate_cb(**params):
    print('=' * 100)
    warnings.simplefilter('ignore')
    params['max_depth'] = int(params['max_depth'])
    params['max_bin'] = int(params['max_bin'])
    params['min_data_in_leaf'] = int(params['min_data_in_leaf'])
    params['bagging_freq'] = int(params['bagging_freq'])
    params['num_leaves'] = int(params['num_leaves'])

    start_params.update(params)
    print('Training with params: {}'.format(params))
    model = lgb.train(start_params,
                      trn_data,
                      20000,
                      valid_sets=[val_data],
                      early_stopping_rounds=300,
                      verbose_eval=False)
    # Booster.predict needs the raw validation features, not the lgb.Dataset;
    # x_val is assumed to hold the matrix behind val_data
    val_pred = model.predict(x_val, num_iteration=model.best_iteration)
    val_score = roc_auc_score(y_cv, val_pred)
    print("Val score: {:<8.5f}".format(val_score))
    return val_score
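evaluate_cb closes over several globals (start_params, trn_data, val_data, x_val, y_cv). A sketch of the setup it assumes, with a hypothetical train/validation split:

import lightgbm as lgb
from sklearn.model_selection import train_test_split

x_trn, x_val, y_trn, y_cv = train_test_split(X, y, test_size=0.2, random_state=42)  # X, y assumed
trn_data = lgb.Dataset(x_trn, label=y_trn)
val_data = lgb.Dataset(x_val, label=y_cv, reference=trn_data)
start_params = {'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.01}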
Example #6
 def predict(self):
     # lightgbm (the module) has no predict(); assume the trained booster is stored on self
     return self.model.predict(self.test_df.values)
Example #7
def m5_predict():
    # directory = "/Users/apple/automl/auto-hpo/input/data/PredictFutureSales/"
    #
    # data = pd.read_pickle('/Users/apple/automl/auto-hpo/examples/predictfuturesales/cfp_data.pkl')
    data_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
    df = pd.read_pickle(data_dir + '/m5_data_FIRST_DAY_1.pkl')
    df = create_dt(False)  # note: this rebuilds df, discarding the pickle just read
    df.to_pickle('m5_test_data.pkl')
    useless_cols = ["id", "date", "sales", "d", "wm_yr_wk", "weekday"]
    train_cols = df.columns[~df.columns.isin(useless_cols)]
    #lgb = pickle.load(open('/Users/apple/automl/auto-hpo/output/xgbmodel/18_model_train.pkl', 'rb'))
    lgb = pickle.load(
        open('/pfs/auto-hpo/auto-hpo/output/model/gbm/18_model_train.pkl',
             'rb'))

    alphas = [1.035, 1.03, 1.025, 1.02]
    weights = [1 / len(alphas)] * len(alphas)
    sub = 0.

    for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

        te = create_dt(False)
        cols = [f"F{i}" for i in range(1, 29)]

        for tdelta in range(0, 28):
            day = fday + timedelta(days=tdelta)
            print(icount, day)
            tst = te[(te.date >= day - timedelta(days=max_lags))
                     & (te.date <= day)].copy()
            create_fea(tst)
            tst = tst.loc[tst.date == day, train_cols]
            te.loc[te.date == day, "sales"] = alpha * lgb.predict(
                tst)  # magic multiplier by kyakovlev

        te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
        #     te_sub.loc[te.date >= fday+ timedelta(days=h), "id"] = te_sub.loc[te.date >= fday+timedelta(days=h),
        #                                                                           "id"].str.replace("validation$", "evaluation")
        te_sub["F"] = [
            f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount() + 1
        ]
        te_sub = te_sub.set_index(["id", "F"]).unstack()["sales"][cols].reset_index()
        te_sub.fillna(0., inplace=True)
        te_sub.sort_values("id", inplace=True)
        te_sub.reset_index(drop=True, inplace=True)
        te_sub.to_csv(f"submission_m5_{icount}.csv", index=False)
        if icount == 0:
            sub = te_sub
            sub[cols] *= weight
        else:
            sub[cols] += te_sub[cols] * weight
        print(icount, alpha, weight)

    sub2 = sub.copy()
    sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
    sub = pd.concat([sub, sub2], axis=0, sort=False)
    sub.to_csv("m5_18_submission.csv", index=False)

    print(sub.head(10))
    print(sub.id.nunique(), sub["id"].str.contains("validation$").sum())
    print(sub.shape)
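The function also leans on module-level globals from the public M5 starter kernel (create_dt, create_fea, fday, max_lags). Plausible values for the two constants, treated here as assumptions rather than the original settings:

from datetime import datetime

fday = datetime(2016, 4, 25)  # first forecast day of the M5 horizon
max_lags = 57                 # longest lag window consumed by create_fea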
Example #8
# X_score = np.delete(X_score, 0, 1)
M = X_score.shape[0]
scores_fin = 1 + np.zeros(M)
for m in models:
    # each entry of models is a 6-tuple of fitted estimators
    ger, las, gbr, Enet, lgb, las2 = m
    ger_predict = ger.predict(X_score)
    las_predict = las.predict(X_score)
    gbr_predict = gbr.predict(X_score)
    Enet_predict = Enet.predict(X_score)
    lgb_predict = lgb.predict(X_score)
    X_stack = pd.DataFrame({"A": []})
    X_stack = pd.concat([X_stack,
                         pd.DataFrame(ger_predict),
                         pd.DataFrame(las_predict),
                         pd.DataFrame(gbr_predict),
                         pd.DataFrame(Enet_predict),
                         pd.DataFrame(lgb_predict)], axis=1)
    X_stack = np.array(X_stack)
    X_stack = np.delete(X_stack, 0, 1)  # drop the placeholder "A" column
    scores_fin = scores_fin * las2.predict(X_stack)
scores_fin = scores_fin ** (1 / nF)  # geometric mean over nF folds
# """
# #########################################################建立模型#######################################################
# """
# x_train = all_data[:ntrain]
# x_test = all_data[ntrain:]
# n_folds = 5
#
# def rmsle_cv(model):
#     kf = KFold(n_folds, shuffle=True, random_state=42)
print(grid.best_params_)
print(grid.best_score_)

lgb_params['reg_alpha'] = grid.best_params_['reg_alpha']
lgb_params['reg_lambda'] = grid.best_params_['reg_lambda']
lgb_params['colsample_bytree'] = grid.best_params_['colsample_bytree']
lgb_params['n_estimators'] = grid.best_params_['n_estimators']
lgb.set_params(**lgb_params)

X = train.drop(['target'], axis=1)
test = test.drop(['target'], axis=1)
Y = train['target'].values



lgb.fit(X, Y, verbose=False)
pred = lgb.predict(test)
print(len(pred))
submission = pd.DataFrame({'ID': range(0, len(pred)), 'item_cnt_month': pred})
submission.to_csv(SUBMISSION_FILE, index=False)
print('Process Complete {:.4f}'.format((time.time() - start_time) / 60))
Example #10
def main():
    """
    load data
    """
    train_set = pd.read_csv('../data/train.csv')
    test_set = pd.read_csv('../data/test.csv')

    #Without outlier remover, with basic nanRemover 0.12416413124809748
    """
    Remove Outliers
    """
    outliers = train_set[train_set['GrLivArea'] > 4500].index
    print(outliers)

    # indices hard-coded from a previous run of the lookup above
    outliers = [197, 523, 691, 854, 1182, 1298]

    train_set.drop(outliers, inplace=True)

    #With outlier remover 0.10970218665126451
    """
    fix salePrice skewness
    """
    train_set["SalePrice"] = np.log1p(train_set["SalePrice"])
    y_train_values = train_set["SalePrice"].values
    """
    prepare combined data.
    """
    train_set_id = train_set['Id']
    test_set_id = test_set['Id']

    train_set_rows = train_set.shape[0]
    test_set_rows = test_set.shape[0]

    train_set.drop('Id', axis=1, inplace=True)
    test_set.drop('Id', axis=1, inplace=True)
    train_set.drop('SalePrice', axis=1, inplace=True)

    combined_data = pd.concat((train_set, test_set))
    """
    create data transform pipeline
    """
    transform_pipeline = Pipeline(steps=[
        ('OutlierRemover', OutlierRemover()),
        ('NaNImputer', NaNImputer()),
        ('NaNRemover', NaNRemover()),
        ('AdditionalFeatureGenerator', AdditionalFeatureGenerator()),
        ('TypeTransformer', TypeTransformer()),
        ('ErrorImputer', ErrorImputer()),
        ('SkewFixer', SkewFixer()),
        ('Scaler', Scaler()),
        ('FeatureDropper', FeatureDropper()),
        ('Dummyfier', Dummyfier()),
    ])

    transformed_data = transform_pipeline.transform(combined_data)
    train_data = transformed_data[:train_set_rows]
    predict_data = transformed_data[train_set_rows:]
    """
    try various regressors
    """

    rf_param = {
        # 'bootstrap': [True],
        'max_depth': [3, 4, 5],
        'min_samples_leaf': [3, 4, 5],
        'n_estimators': [5, 7, 10]
    }
    ls_param = {
        'alpha': [0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008],
        'max_iter': [10000],
        "normalize": [False]
    }

    elnet_param = {
        'alpha': [0.0003, 0.0004, 0.0005],
        'l1_ratio': [0.9, 0.95, 0.99, 1],
        'max_iter': [10000]
    }

    ridge_param = {'alpha': [10, 10.1, 10.2, 10.3, 10.4, 10.5]}

    svr_param = {
        'gamma': [1e-08, 1e-09],
        'C': [100000, 110000],
        'epsilon': [1, 0.1, 0.01]
    }
    gbm_param = {
        "n_estimators": [1000],
        'min_child_weight': [1, 5],
        'gamma': [0.1, 0.2],
        'subsample': [0.6],
        'colsample_bytree': [0.6],
        'max_depth': [3, 4],
        'eta': [0.01],
        'eval_metric': ['mae']
    }

    lgb_params = {
        'objective': ['regression'],
        'num_leaves': [255],
        'max_depth': [8],
        'bagging_seed': [3],
        'boosting_type': ['gbdt'],
        'min_sum_hessian_in_leaf': [100],
        'learning_rate': np.linspace(0.05, 0.1, 2),
        'bagging_fraction': np.linspace(0.7, 0.9, 2),
        'bagging_freq': np.linspace(30, 50, 3, dtype='int'),
        'max_bin': [15, 63],
    }

    rf = get_best_estimator(train_data,
                            y_train_values,
                            estimator=RandomForestRegressor(),
                            params=rf_param,
                            n_jobs=4)
    elnet = get_best_estimator(train_data,
                               y_train_values,
                               estimator=ElasticNet(),
                               params=elnet_param,
                               n_jobs=4)
    lso = get_best_estimator(train_data,
                             y_train_values,
                             estimator=Lasso(),
                             params=ls_param,
                             n_jobs=4)
    rdg = get_best_estimator(train_data,
                             y_train_values,
                             estimator=Ridge(),
                             params=ridge_param,
                             n_jobs=4)
    svr = get_best_estimator(train_data,
                             y_train_values,
                             estimator=SVR(),
                             params=svr_param,
                             n_jobs=4)

    gbm = get_best_estimator(train_data,
                             y_train_values,
                             estimator=xgb.XGBRegressor(),
                             params=gbm_param,
                             n_jobs=4)
    lbm = get_best_estimator(train_data,
                             y_train_values,
                             estimator=lgb.LGBMRegressor(),
                             params=lgb_params,
                             n_jobs=4)

    def cv_rmse(model):
        kfolds = KFold(n_splits=5, shuffle=True, random_state=42)
        rmse = np.sqrt(-cross_val_score(model,
                                        train_data,
                                        y_train_values,
                                        scoring="neg_mean_squared_error",
                                        cv=kfolds))
        return rmse

    # print("Randomforest  model rmse : ", cv_rmse(rf).mean())
    # print("elastic model rmse : ", cv_rmse(elnet).mean())
    # print("lasso model rmse : ", cv_rmse(lso).mean())
    # print("ridge model rmse : ", cv_rmse(rdg).mean())
    # print("svr model rmse : ", cv_rmse(svr).mean())
    # print("xgboost model rmse : ", cv_rmse(gbm).mean())
    # print("lightgbm model rmse : ", cv_rmse(lbm).mean())

    submission = pd.DataFrame({
        "Id": test_set_id,
        "SalePrice": np.expm1(rf.predict(predict_data))
    })
    submission.to_csv('submission_rf.csv', index=False)

    submission = pd.DataFrame({
        "Id": test_set_id,
        "SalePrice": np.expm1(elnet.predict(predict_data))
    })
    submission.to_csv('submission_elnet.csv', index=False)

    submission = pd.DataFrame({
        "Id": test_set_id,
        "SalePrice": np.expm1(lso.predict(predict_data))
    })
    submission.to_csv('submission_lso.csv', index=False)

    submission = pd.DataFrame({
        "Id": test_set_id,
        "SalePrice": np.expm1(rdg.predict(predict_data))
    })
    submission.to_csv('submission_rdg.csv', index=False)

    submission = pd.DataFrame({
        "Id": test_set_id,
        "SalePrice": np.expm1(svr.predict(predict_data))
    })
    submission.to_csv('submission_svr.csv', index=False)

    submission = pd.DataFrame({
        "Id": test_set_id,
        "SalePrice": np.expm1(gbm.predict(predict_data))
    })
    submission.to_csv('submission_gbm.csv', index=False)

    submission = pd.DataFrame({
        "Id": test_set_id,
        "SalePrice": np.expm1(lbm.predict(predict_data))
    })
    submission.to_csv('submission_lbm.csv', index=False)

    model = StackingRegressor(regressors=[rf, elnet, lso, rdg, svr],
                              meta_regressor=Lasso(alpha=0.0005))

    # Fit the model on our data
    model.fit(train_data, y_train_values)
    print("StackingRegressor model rmse : ", cv_rmse(model).mean())

    # y_pred = model.predict(train_data)
    # print(sqrt(mean_squared_error(y_train_values, y_pred)))

    # Predict test set
    ensembled = np.expm1(model.predict(predict_data))
    """
    export submission data
    """
    submission = pd.DataFrame({"Id": test_set_id, "SalePrice": ensembled})
    submission.to_csv('submission_stacking.csv', index=False)
    """" Ensemble Weights """
    from scipy.optimize import minimize
    regressors = [rf, elnet, lso, rdg, svr, gbm, lbm]

    predictions = []
    for clf in regressors:
        predictions.append(
            clf.predict(train_data))  # listing all our predictions

    def mse_func(weights):
        # scipy minimize will pass the weights as a numpy array
        final_prediction = 0
        for weight, prediction in zip(weights, predictions):
            final_prediction += weight * prediction
        return mean_squared_error(y_train_values, final_prediction)

    starting_values = [0.5] * len(
        predictions)  # minimize need a starting value
    bounds = [(0, 1)] * len(predictions)  # weights are bound between 0 and 1
    res = minimize(mse_func, starting_values, bounds=bounds, method='SLSQP')
    print('Result Assessment: {message_algo}'.format(
        message_algo=res['message']))
    print('Ensemble Score: {best_score}'.format(best_score=res['fun']))
    print('Best Weights: {weights}'.format(weights=res['x']))

    ##  All
    sale_price_ensemble = (
        np.expm1(rf.predict(predict_data)) * res['x'][0] +
        np.expm1(elnet.predict(predict_data)) * res['x'][1] +
        np.expm1(lso.predict(predict_data)) * res['x'][2] +
        np.expm1(rdg.predict(predict_data)) * res['x'][3] +
        np.expm1(svr.predict(predict_data)) * res['x'][4] +
        np.expm1(gbm.predict(predict_data)) * res['x'][5] +
        np.expm1(lbm.predict(predict_data)) * res['x'][6])  # lbm is the fitted LightGBM; lgb is the module

    submission = pd.DataFrame({
        "Id": test_set_id,
        "SalePrice": sale_price_ensemble
    })
    submission.to_csv('submission_average.csv', index=False)
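get_best_estimator is not shown in this example. A minimal sketch under the assumption that it wraps sklearn's GridSearchCV and returns the refitted best estimator:

from sklearn.model_selection import GridSearchCV

def get_best_estimator(X, y, estimator, params, n_jobs=1):
    # hypothetical implementation: exhaustive grid search with 5-fold CV
    grid = GridSearchCV(estimator, params, cv=5,
                        scoring='neg_mean_squared_error', n_jobs=n_jobs)
    grid.fit(X, y)
    return grid.best_estimator_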
Example #11
def lgb(df_train, df_test):
    df_train['bodyType'] = df_train['bodyType'].replace(np.nan, -1).astype(int)
    df_train['model'] = df_train['model'].replace(np.nan, -1).astype(int)
    df_train['fuelType'] = df_train['fuelType'].replace(np.nan, -1).astype(int)
    df_train['gearbox'] = df_train['gearbox'].replace(np.nan, -1).astype(int)
    df_train['notRepairedDamage'] = df_train['notRepairedDamage'].replace(
        '-', -1)

    df_train['name_count'] = df_train.groupby('name')['SaleID'].transform('count')
    df_train['creatDate'] = df_train['creatDate'].astype(str).str[0:4]
    df_train['regDate'] = df_train['regDate'].astype(str).str[0:4]
    df_train['used_year'] = df_train['creatDate'].astype(
        int) - df_train['regDate'].astype(int)
    df_train['power'] = df_train['power'].map(lambda x: 600 if x > 600 else x)

    # one-hot indicator columns, including the -1 bucket used for missing values
    for v in list(range(8)) + [-1]:
        df_train['bodyType_' + str(v)] = df_train['bodyType'].apply(
            lambda x, v=v: 1 if x == v else 0)
    for v in list(range(7)) + [-1]:
        df_train['fuelType_' + str(v)] = df_train['fuelType'].apply(
            lambda x, v=v: 1 if x == v else 0)

    feature_choose0 = [
        'SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
        'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
        'seller', 'offerType', 'creatDate', 'price', 'bodyType_0',
        'bodyType_1', 'bodyType_2', 'bodyType_3', 'bodyType_4', 'bodyType_5',
        'bodyType_6', 'bodyType_7', 'bodyType_-1', 'fuelType_0', 'fuelType_1',
        'fuelType_2', 'fuelType_3', 'fuelType_4', 'fuelType_5', 'fuelType_6',
        'fuelType_-1'
    ]

    feature_choose0_test = [
        'SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
        'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
        'seller', 'offerType', 'creatDate', 'bodyType_0', 'bodyType_1',
        'bodyType_2', 'bodyType_3', 'bodyType_4', 'bodyType_5', 'bodyType_6',
        'bodyType_7', 'bodyType_-1', 'fuelType_0', 'fuelType_1', 'fuelType_2',
        'fuelType_3', 'fuelType_4', 'fuelType_5', 'fuelType_6', 'fuelType_-1'
    ]

    feature_choose1 = [
        'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9',
        'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'used_year', 'name_count'
    ]

    feature_choose2 = ['price']

    X_scaler = StandardScaler()
    Y_scaler = StandardScaler()
    df_scaler_X = X_scaler.fit_transform(df_train[feature_choose1])
    df_scaler_X1 = pd.DataFrame(df_scaler_X, columns=feature_choose1)
    df_train = pd.concat([df_train[feature_choose0], df_scaler_X1], axis=1)

    df_scaler_Y = Y_scaler.fit_transform(df_train[feature_choose2])
    df_scaler_Y1 = pd.DataFrame(df_scaler_Y, columns=['price'])

    kk = ['kilometer', 'power']
    t1 = df_train.groupby(kk[0], as_index=False)[kk[1]].agg({
        kk[0] + '_' + kk[1] + '_count': 'count',
        kk[0] + '_' + kk[1] + '_max': 'max',
        kk[0] + '_' + kk[1] + '_median': 'median',
        kk[0] + '_' + kk[1] + '_min': 'min',
        kk[0] + '_' + kk[1] + '_sum': 'sum',
        kk[0] + '_' + kk[1] + '_std': 'std',
        kk[0] + '_' + kk[1] + '_mean': 'mean'
    })
    df_train = pd.merge(df_train, t1, on=kk[0], how='left')

    train_X = df_train.drop(labels=[
        'SaleID', 'price', 'regDate', 'creatDate', 'regionCode', 'name',
        'offerType', 'seller'
    ],
                            axis=1).values
    train_Y = df_scaler_Y1.values
    x_train, x_test, y_train, y_test = train_test_split(train_X,
                                                        train_Y,
                                                        test_size=0.2)

    import lightgbm as lgbm
    model_lgbm = lgbm.LGBMRegressor(n_estimators=10000,
                                    learning_rate=0.02,
                                    boosting_type='gbdt',
                                    objective='regression_l1',
                                    max_depth=-1,
                                    num_leaves=31,
                                    min_child_samples=20,
                                    feature_fraction=0.8,
                                    bagging_freq=1,
                                    bagging_fraction=0.8,
                                    lambda_l2=2,
                                    random_state=2020,
                                    metric='mae')

    # avoid rebinding the lgbm module alias; ravel() gives LightGBM a 1-D target
    fitted_lgbm = model_lgbm.fit(x_train, y_train.ravel())

    df_out = pd.DataFrame(data=None)
    df_out['SaleID'] = df_test['SaleID']
    df_test['bodyType'] = df_test['bodyType'].replace(np.nan, -1)
    df_test['fuelType'] = df_test['fuelType'].replace(np.nan, -1)
    df_test['gearbox'] = df_test['gearbox'].replace(np.nan, -1)
    df_test['notRepairedDamage'] = df_test['notRepairedDamage'].replace(
        '-', -1)
    df_test['name_count'] = df_test.groupby('name')['SaleID'].transform('count')
    df_test['creatDate'] = df_test['creatDate'].astype(str).str[0:4]
    df_test['regDate'] = df_test['regDate'].astype(str).str[0:4]
    df_test['used_year'] = df_test['creatDate'].astype(
        int) - df_test['regDate'].astype(int)
    df_test['power'] = df_test['power'].map(lambda x: 600 if x > 600 else x)

    for v in list(range(8)) + [-1]:
        df_test['bodyType_' + str(v)] = df_test['bodyType'].apply(
            lambda x, v=v: 1 if x == v else 0)
    for v in list(range(7)) + [-1]:
        df_test['fuelType_' + str(v)] = df_test['fuelType'].apply(
            lambda x, v=v: 1 if x == v else 0)

    # use transform, not fit_transform: the scaler must stay fitted on the training data
    df_scaler_test_X = X_scaler.transform(df_test[feature_choose1])
    df_scaler_test_X1 = pd.DataFrame(df_scaler_test_X, columns=feature_choose1)
    df_test = pd.concat([df_test[feature_choose0_test], df_scaler_test_X1],
                        axis=1)

    kk = ['kilometer', 'power']
    t1 = df_test.groupby(kk[0], as_index=False)[kk[1]].agg({
        kk[0] + '_' + kk[1] + '_count': 'count',
        kk[0] + '_' + kk[1] + '_max': 'max',
        kk[0] + '_' + kk[1] + '_median': 'median',
        kk[0] + '_' + kk[1] + '_min': 'min',
        kk[0] + '_' + kk[1] + '_sum': 'sum',
        kk[0] + '_' + kk[1] + '_std': 'std',
        kk[0] + '_' + kk[1] + '_mean': 'mean'
    })
    df_test = pd.merge(df_test, t1, on=kk[0], how='left')

    df_test = df_test.drop(labels=[
        'SaleID', 'regDate', 'creatDate', 'regionCode', 'name', 'offerType',
        'seller'
    ],
                           axis=1).values
    test_X = df_test

    df_out['price1'] = Y_scaler.inverse_transform(
        fitted_lgbm.predict(test_X).reshape(-1, 1)).ravel()  # inverse_transform expects 2-D
    df_out = df_out[['SaleID', 'price1']]
    return df_out
Example #12
	def testLightGBM(self):
		# lgb is assumed to be a trained Booster available in the enclosing scope
		self.predicted_labels = lgb.predict(self.val_data)
		print("LightGBM score " + str(rmse(self.predicted_labels, self.val_labels)))
Example #13
param = {  # opening of the parameter dict; the snippet begins mid-definition
    'max_depth': 7,
    'learning_rate': 0.05,
    'max_bin': 200
}
param['metric'] = ['auc', 'binary_logloss']

num_round = 50
from datetime import datetime
start = datetime.now()
model = lgb.train(param, train_dataset, num_round)  # keep the lgb module alias intact
stop = datetime.now()

execution_time_lgb = stop - start
print('--' * 20, execution_time_lgb, '--' * 20)

ypred2 = model.predict(x_test)
print(ypred2)

for i in range(ypred2.shape[0]):
    if ypred2[i] > 0.5:
        ypred2[i] = 1
    else:
        ypred2[i] = 0

lgb_xgb = accuracy_score(y_test, ypred2)
print(confusion_matrix(y_test, ypred2))

#||----------------------------------------------------------------------------------------------------------------
from sklearn.metrics import roc_auc_score

xgb_auc = roc_auc_score(y_test, ypred)  # ypred comes from an earlier XGBoost section not shown here
Example #14
tokenizer.fit_on_texts(
    list(trains['comment_text']) + list(tests['comment_text']))
word_index = tokenizer.word_index
train_X = tokenizer.texts_to_sequences(trains['comment_text'])
test_X = tokenizer.texts_to_sequences(tests['comment_text'])
train_X = pad_sequences(train_X, maxlen=220)
test_X = pad_sequences(test_X, maxlen=220)

train_X = np.hstack([train_X, other_trains_1])
test_X = np.hstack([test_X, other_trains_2])

from sklearn.model_selection import StratifiedKFold

params = {
    'max_depth': -1,
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'num_leaves': 2**9 - 1,
    'colsample_bytree': 0.28,
    'objective': 'binary',
    'n_jobs': -1,
    'eval_metric': 'auc'
}
import lightgbm as lgb
xtrain = lgb.Dataset(train_X, label_train)
num_round = 10000
model = lgb.train(params, xtrain, num_round)  # avoid rebinding the lgb module alias
yp = model.predict(test_X)
from sklearn.metrics import roc_auc_score, f1_score

print(roc_auc_score(list(label_test.values), list(yp)))
Example #15
        # the snippet begins mid-call; X here is the assumed feature matrix
        x_train, x_val, y_train, y_val = train_test_split(X,
                                                          y,
                                                          test_size=0.2,
                                                          random_state=42)

        # Dataset
        lgb_train = lgb.Dataset(data=x_train, label=y_train)
        lgb_eval = lgb.Dataset(data=x_val, label=y_val)

        # capture the Booster instead of rebinding lgb, which would break later folds
        model = lgb.train(train_set=lgb_train,
                          valid_sets=lgb_eval,
                          params=params,
                          verbose_eval=200,
                          early_stopping_rounds=early_stopping_rounds,
                          num_boost_round=num_boost_round)

        y_pred = model.predict(x_val)
        score = log_loss(y_val, y_pred)
        logger.info(f'Fold No: {n_fold} | {metric}: {score}')
        logger.info(f"Train Shape: {x_train.shape}")
        for thresh in np.arange(0.1, 0.301, 0.01):
            thresh = np.round(thresh, 2)
            f1 = f1_score(y_val, (y_pred > thresh).astype(int))
            logger.info(f"F1 score at threshold {thresh} is {f1}")

        test_pred = model.predict(test)

        if len(prediction) == 0:
            prediction = test_pred
        else:
            prediction += test_pred
Example #16
print(best_params)

### Training
params['learning_rate'] = 0.01
gbm = lgb.train(  # capture the Booster; the lgb module has no predict/best_iteration
    params,  # parameter dict
    lgb_train,  # training set
    valid_sets=lgb_eval,  # validation set
    num_boost_round=2000,  # number of boosting rounds
    early_stopping_rounds=50  # early-stopping patience
)

### Offline prediction
print("offline prediction")
preds_offline = gbm.predict(offline_test_X,
                            num_iteration=gbm.best_iteration)  # output probabilities
offline = offline_test[['instance_id', 'is_trade']]
offline['preds'] = preds_offline
offline.is_trade = offline['is_trade'].astype(np.float64)
print('log_loss', metrics.log_loss(offline.is_trade, offline.preds))

### Online prediction
print("online prediction")
preds_online = gbm.predict(online_test_X,
                           num_iteration=gbm.best_iteration)  # output probabilities
online = online_test[['instance_id']]
online['preds'] = preds_online
online.rename(columns={'preds': 'predicted_score'}, inplace=True)  # rename the column
online.to_csv("./data/20180405.txt", index=None, sep=' ')  # save the result

### Save the model
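The snippet is cut off at this step. With the booster bound to gbm as above, the usual LightGBM call would be (file name hypothetical):

gbm.save_model('./data/lgb_model.txt', num_iteration=gbm.best_iteration)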
Example #17
                       # (the snippet begins mid-way through a fit(...) call)
                       verbose=100,
                       eval_set=(X_test[predictors], y_test))
        from sklearn.metrics import mean_squared_error
        print("线下误差:{}".format(0.5 * mean_squared_error(
            y_test, self.model.predict(X_test[predictors]))))
        return self

    def predict(self, X):
        # predict on the test set, given the model and the test data
        fea_test = pd.read_csv("./feature/fea_test.csv")
        fea_test1 = pd.read_csv("./feature/fea_test_1.csv")
        fea_test2 = pd.read_csv("./feature/fea_test_2.csv")
        X = pd.merge(X, fea_test, how="left", on="id")
        X = pd.merge(X, fea_test1, how="left", on="id")
        X = pd.merge(X, fea_test2, how="left", on="id")
        X = self.__make_feature(test=X, train=pd.DataFrame())
        predictors = [f for f in list(X.columns) if f not in self.no_use]
        test_pred = self.model.predict(X[predictors])
        print("最大值:{}".format(test_pred.max()))
        return test_pred

    def get_params(self):
        return self.params


biorad = b_model()
train = pd.read_csv("../raw_data/d_train.csv", encoding="gbk")
test = pd.read_csv("../raw_data/d_test_A.csv", encoding="gbk")
lgb = biorad.fit(train)
lgb.predict(test)
Example #18
def train():
    train, train_label, valid, valid_label, test, test_label = get_allData()
    model = ML_model(train, valid, train_label, valid_label)
    import warnings
    warnings.filterwarnings("ignore")
    rf = model.rf()
    print("the model is rf and the test's f1 is: ",
          f1_score(test_label, rf.predict(test), average="macro"))
    print("the model is rf and the test's precision_score is: ",
          precision_score(test_label, rf.predict(test), average="macro"))
    print("the model is rf and the test's recall_score is: ",
          recall_score(test_label, rf.predict(test), average="macro"))
    print(
        "----------------------------------------------------------------------------------------"
    )
    gboost = model.gboost()
    print("the model is gboost and the test's f1 is: ",
          f1_score(test_label, gboost.predict(test), average="macro"))
    print("the model is gboost and the test's precision_score is: ",
          precision_score(test_label, gboost.predict(test), average="macro"))
    print("the model is gboost and the test's recall_score is: ",
          recall_score(test_label, gboost.predict(test), average="macro"))
    print(
        "----------------------------------------------------------------------------------------"
    )
    svm = model.svm()
    print("the model is svm and the test's f1 is: ",
          f1_score(test_label, svm.predict(test), average="macro"))
    print("the model is svm and the test's precision_score is: ",
          precision_score(test_label, svm.predict(test), average="macro"))
    print("the model is svm and the test's recall_score is: ",
          recall_score(test_label, svm.predict(test), average="macro"))
    print(
        "----------------------------------------------------------------------------------------"
    )
    xbg = model.xgboost()
    print("the model is xbg and the test's f1 is: ",
          f1_score(test_label, xbg.predict(test), average="macro"))
    print("the model is xbg and the test's precision_score is: ",
          precision_score(test_label, xbg.predict(test), average="macro"))
    print("the model is xbg and the test's recall_score is: ",
          recall_score(test_label, xbg.predict(test), average="macro"))
    print(
        "----------------------------------------------------------------------------------------"
    )
    lgb = model.lgb()
    print("the model is lgb and the test's f1 is: ",
          f1_score(test_label, lgb.predict(test), average="macro"))
    print("the model is lgb and the test's precision_score is: ",
          precision_score(test_label, lgb.predict(test), average="macro"))
    print("the model is lgb and the test's recall_score is: ",
          recall_score(test_label, lgb.predict(test), average="macro"))
    print(
        "----------------------------------------------------------------------------------------"
    )
    stack = model.stacking()
    print("the model is stack and the test's f1 is: ",
          f1_score(test_label, stack.predict(test), average="macro"))
    print("the model is stack and the test's precision_score is: ",
          precision_score(test_label, stack.predict(test), average="macro"))
    print("the model is stack and the test's recall_score is: ",
          recall_score(test_label, stack.predict(test), average="macro"))
    print(
        "----------------------------------------------------------------------------------------"
    )
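The repeated metric blocks invite a small helper. A sketch that preserves the printed format (names match the function above; the divider width is approximate):

from sklearn.metrics import f1_score, precision_score, recall_score

def report(name, fitted, test, test_label):
    # prints macro-averaged F1 / precision / recall for one fitted model
    pred = fitted.predict(test)
    print("the model is " + name + " and the test's f1 is: ",
          f1_score(test_label, pred, average="macro"))
    print("the model is " + name + " and the test's precision_score is: ",
          precision_score(test_label, pred, average="macro"))
    print("the model is " + name + " and the test's recall_score is: ",
          recall_score(test_label, pred, average="macro"))
    print("-" * 88)  # section divider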