# Example #1
# 0
def baseline_test_score(n_neighbors, Average_weights, lag, outputlength,
                        train_ratio, horizon, weights):
    """Score the iterative regressor on shops 1..2000 and save the losses.

    A regressor is obtained from ``getRegressor`` with the given
    hyper-parameters.  For every shop id the ground truth
    (``get_truth_value_last_elements``) is compared against the output of
    ``ItePredict`` using ``loss``.  The per-shop losses plus one final
    ``'all'`` row (loss over the concatenated frames) are written to a CSV
    below ``HOME/Dropbox/dataset/Analysis/IterativeModel/Tuning``.

    NOTE: if truth and prediction lengths differ for a shop, the shop id is
    printed and the whole process is terminated via ``sys.exit(0)``.

    Returns:
        The loss computed over all shops together.
    """
    print("Getting regressor...")
    regressor = getRegressor(n_neighbors, Average_weights, lag, outputlength,
                             train_ratio, horizon, weights)
    print("Done")

    truth_frames = []
    predicted_frames = []
    loss_rows = []
    for shop_id in range(1, 2001):
        print(shop_id)
        truth = get_truth_value_last_elements(shop_id, lag, outputlength,
                                              horizon)
        truth_frames.append(truth)

        shop_features = get_feature_including_mean_diff(shop_id, lag,
                                                        outputlength)
        prediction = ItePredict(regressor, shop_features, horizon, lag,
                                weights)
        if len(truth) != len(prediction):
            print("found %s" % shop_id)
            sys.exit(0)

        prediction = pd.DataFrame(prediction)
        predicted_frames.append(prediction)
        loss_rows.append([shop_id, loss(prediction, truth)])

    all_truth = pd.concat(truth_frames)
    all_predicted = pd.concat(predicted_frames)
    overall_loss = loss(all_predicted, all_truth)
    loss_rows.append(['all', overall_loss])

    # Report file name encodes the tuned hyper-parameters and today's date.
    target_dir = os.path.join(HOME, "Dropbox", "dataset", "Analysis",
                              "IterativeModel", "Tuning")
    suffix = "neghbours-%s_targetweight-%s_lag-%s_day-%s.csv" % (
        n_neighbors, Average_weights, lag, datetime.datetime.now().date())
    target_path = os.path.join(target_dir,
                               get_name_from_weights(weights) + suffix)

    pd.DataFrame(loss_rows, columns=['shop_id', 'loss']).to_csv(target_path,
                                                                index=False)
    return overall_loss
def xgBoost_out14(source,
                  day,
                  predictors,
                  predictors_type,
                  ifGS=True,
                  target_variables=['Tar_1', 'Tar_2'],
                  ifCompetition=False,
                  useTrainCV=True,
                  cv_folds=5,
                  early_stopping_rounds=50,
                  X_test_comp='l'):
    """Tune or evaluate an XGBoost regressor for one target column.

    The target is ``target_variables[day - 1:day]`` (at most one column).

    * ``ifGS`` truthy: four staged grid searches are run on the full data
      (tree shape, gamma, sampling ratios, reg_alpha), each stage is
      appended to a report file in ``ReportFolder``, and then the process
      is terminated with ``sys.exit(0)``.
    * ``ifGS == False``: ``xgb.cv`` picks the number of boosting rounds for
      a fixed parameter set, a booster is trained on a 90/10 split, and the
      feature scores, parameters, round count and held-out loss are
      appended to a second report file.

    ``ifCompetition``, ``useTrainCV`` and ``X_test_comp`` are accepted but
    not used by this function.
    """

    def run_search(fixed_params, grid, features, labels):
        # One cross-validated grid search around an XGBRegressor.
        search = GridSearchCV(
            estimator=xgb.XGBRegressor(**fixed_params),
            param_grid=grid,
            scoring=scorer,
            cv=KFold(n_splits=cv_folds, shuffle=True, random_state=0),
            n_jobs=-1)
        search.fit(features, labels)
        return search

    def append_report(path, banner, search, grid, fixed_params):
        # Append the outcome of one search stage to the report file.
        with open(path, 'a+') as fw:
            fw.write(banner)
            fw.write(str(search.cv_results_))
            fw.write('\n....:\n')
            fw.write(str(grid) + '\n')
            fw.write(str(fixed_params) + '\n')
            fw.write(str(search.cv_results_['mean_test_score']) + '\n')
            fw.write(
                str(search.best_params_) + str(search.best_score_) + '\n')

    for tar in target_variables[day - 1:day]:
        report_file = os.path.join(
            ReportFolder,
            "xgBoost_14out_removeSHOPID_%s_day_%s.txt" % (predictors_type,
                                                          tar))

        # 1 get data
        X = source[predictors]
        y = source[[tar] + ['shop_id', 'day']]

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.10,
                                                            random_state=0)

        # Fixed number of trees used by every grid-search stage.
        best_rounds = 1000

        if ifGS:
            # 2 Grid search, stage 1: tree depth and minimum child weight.
            shape_grid = {
                'max_depth': [6, 7, 8, 9],
                'min_child_weight': [1, 3]
            }
            shape_fixed = {
                'learning_rate': 0.1,
                'seed': 0,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'objective': 'reg:linear',
                'silent': 1,
                'n_estimators': best_rounds
            }
            search = run_search(shape_fixed, shape_grid, X, y[tar])
            print(search.cv_results_)
            print(type(search.cv_results_))
            print("%s %s" % (search.best_params_, search.best_score_))
            shape_best = search.best_params_
            shape_score = search.best_score_
            append_report(report_file, "-------------------!!!!\:\n", search,
                          shape_grid, shape_fixed)

            # Stage 2, gamma: optional; only kept when it beats stage 1.
            gamma_grid = {'gamma': [0.1, 0.3]}
            gamma_fixed = {
                'learning_rate': 0.1,
                'subsample': 0.8,
                'seed': 0,
                'colsample_bytree': 0.8,
                'objective': 'reg:linear',
                'max_depth': 3,
                'min_child_weight': 1,
                'n_estimators': best_rounds
            }
            gamma_fixed.update(shape_best)
            search = run_search(gamma_fixed, gamma_grid, X, y[tar])
            print(search.cv_results_)
            gamma_best = search.best_params_
            if search.best_score_ < shape_score:
                gamma_best = {'gamma': 0.0}
            # When gamma_best is still the search's own dict, this update is
            # visible in the report written below; it must stay before it.
            gamma_best.update(shape_best)
            append_report(report_file, "-------------------\:\n", search,
                          gamma_grid, gamma_fixed)

            # 3 GridSearch on rest parameters: row and column sampling.
            sampling_grid = {
                'subsample': [0.8, 0.9],
                'colsample_bytree':
                [tenths / 10.0 for tenths in range(8, 10)]
            }
            sampling_fixed = {
                'learning_rate': 0.1,
                'seed': 0,
                'colsample_bytree': 0.8,
                'objective': 'reg:linear',
                'max_depth': 6,
                'min_child_weight': 1,
                'n_estimators': best_rounds
            }
            sampling_fixed.update(gamma_best)
            search = run_search(sampling_fixed, sampling_grid, X, y[tar])
            print(search.cv_results_)
            sampling_best = search.best_params_
            # Same aliasing as above: merge before the report is written.
            sampling_best.update(gamma_best)
            append_report(report_file, "-------------------\:\n", search,
                          sampling_grid, sampling_fixed)

            # Final stage: L1 regularisation strength.
            alpha_grid = {'reg_alpha': [100, 150]}
            alpha_fixed = {
                'learning_rate': 0.1,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'seed': 0,
                'objective': 'reg:linear',
                'max_depth': 6,
                'min_child_weight': 1,
                'n_estimators': best_rounds
            }
            alpha_fixed.update(sampling_best)
            search = run_search(alpha_fixed, alpha_grid, X, y[tar])
            print(search.cv_results_)
            append_report(report_file, "-------------------\:\n", search,
                          alpha_grid, alpha_fixed)
            # Grid-search mode stops the whole process here.
            sys.exit(0)

        # 3 xgb.cv choose the optimized n_estimators
        if ifGS == False:
            xgb_param = {
                'reg_alpha': 100,
                'colsample_bytree': 0.8,
                'learning_rate': 0.05,
                'min_child_weight': 3,
                'subsample': 0.9,
                'seed': 0,
                'objective': 'reg:linear',
                'max_depth': 7,
                'gamma': 0.3
            }
            dtrain = xgb.DMatrix(X_train.values,
                                 y_train[tar].values,
                                 feature_names=X_train.columns.values)
            deval = xgb.DMatrix(X_test.values,
                                y_test[tar].values,
                                feature_names=X_test.columns.values)
            full_matrix = xgb.DMatrix(source[predictors].values,
                                      label=source[tar].values,
                                      feature_names=X_train.columns.values)
            cvresult = xgb.cv(xgb_param,
                              full_matrix,
                              num_boost_round=10000,
                              nfold=cv_folds,
                              early_stopping_rounds=early_stopping_rounds,
                              seed=0,
                              show_stdv=False)

            # Number of rows in the CV result = number of rounds kept.
            print(cvresult.shape[0])

            clf = xgb.train(xgb_param,
                            dtrain,
                            num_boost_round=cvresult.shape[0],
                            evals=[(dtrain, 'train'), (deval, 'val')],
                            early_stopping_rounds=early_stopping_rounds)

            ranked_features = sorted(clf.get_fscore().items(),
                                     key=operator.itemgetter(1),
                                     reverse=True)

            rounds_report = os.path.join(
                ReportFolder,
                "setRounds_xgBoost_14out_removeSHOPID_%s_day_%s.txt" % (
                    predictors_type, tar))

            y_pred = clf.predict(
                xgb.DMatrix(X_test.values,
                            feature_names=X_test.columns.values))
            loss_score = loss(y_pred, y_test[tar], ifchecked=False)
            print("loss_score:  %s" % (loss_score,))

            with open(rounds_report, 'a+') as fw:
                fw.write(
                    "-------------------After choose all paras, test best rounds for a smaller learning rate, 0.04\:\n"
                )
                fw.write(str(ranked_features))
                fw.write(str(xgb_param))
                fw.write("\n")
                fw.write(str(cvresult.shape[0]))
                fw.write('loss_score: ')
                fw.write(str(loss_score))
# Example #3
# 0
def Model_for_competition(algorithm_name,
                          pickle_model_file_name,
                          fgfs,
                          ReportFolder,
                          source,
                          predictors_type,
                          shop_range,
                          key,
                          predictors,
                          target_variables,
                          estimator_output_length=14,
                          required_prediction_length=14,
                          ifSaveModel=False,
                          predict_mode=["drop", "filled"],
                          n_estimators=300,
                          min_samples_split=2,
                          min_samples_leaf=1,
                          iterOrCopy="iterative",
                          estimator_withParams="l",
                          xgb_param="l",
                          early_stopping_rounds=50):
    """Train one XGBoost booster per day (Tar_1..Tar_7) and log the losses.

    For each target ``Tar_<day>`` a booster is trained on every row of
    ``source`` except the last one; the last row is the validation set used
    for early stopping and for the reported loss.  One result row per
    (day, shop in the validation set) is collected and written to
    ``oldFeatures_num_rounds_cluster_shopID_tar.csv`` in the working
    directory.

    Args:
        algorithm_name: only ``"xgBoost"`` is handled; any other value makes
            the function return ``None`` without training.
        source: DataFrame holding ``predictors``, ``target_variables``,
            ``'shop_id'`` and ``'day'`` columns.
        key: value stored in the ``cluster_label`` column of the report.
        predictors: list of feature column names.
        target_variables: list of target column names; must include
            ``Tar_1`` .. ``Tar_7``.

    The remaining parameters are accepted for interface compatibility but
    are not used by this function (the booster parameters, round limit and
    early-stopping patience are fixed below).

    Returns:
        ``True`` after the report was written, ``None`` for other
        algorithm names.
    """
    id_columns = ['shop_id', 'day']
    features = source[id_columns + predictors][predictors]
    targets = source[target_variables + id_columns]

    if algorithm_name != "xgBoost":
        return None

    # Hold out the final row as the validation set.
    X_fit, X_val = features[:-1], features[-1:]
    y_fit, y_val = targets[:-1], targets[-1:]

    booster_params = {
        'subsample': 0.9,
        'reg_alpha': 100,
        'seed': 0,
        'colsample_bytree': 0.8,
        'objective': 'reg:linear',
        'learning_rate': 0.04,
        'max_depth': 6,
        'min_child_weight': 1,
        'gamma': 0.0
    }
    feature_names = features.columns.values
    val_shop_ids = y_val['shop_id'].values

    report_columns = [
        'cluster_label', 'Tar_label', 'loss_score', 'shop_id',
        'loss_theLast14Days', 'bst.best_ntree_limit'
    ]
    report_rows = []

    for day in range(1, 8):
        tar = 'Tar_%s' % day
        dtrain = xgb.DMatrix(X_fit.values,
                             y_fit[tar].values,
                             feature_names=feature_names)
        deval = xgb.DMatrix(X_val.values,
                            y_val[tar].values,
                            feature_names=feature_names)
        clf = xgb.train(booster_params,
                        dtrain,
                        num_boost_round=5000,
                        early_stopping_rounds=100,
                        evals=[(dtrain, 'train'), (deval, 'val')])

        y_pred = clf.predict(
            xgb.DMatrix(X_val.values, feature_names=feature_names))
        loss_score = loss(y_pred, y_val[tar], ifchecked=False)

        # The attribute lives on the booster itself; it is absent in newer
        # xgboost releases, hence the fallback.
        best_ntree_limit = getattr(clf, 'best_ntree_limit', None)

        for shop in sorted(set(val_shop_ids)):
            in_shop = val_shop_ids == shop
            # NOTE(review): predictions are 1-D here, so this mirrors the
            # overall loss call above (ifchecked=False) - confirm that is
            # the intended per-shop metric.
            shop_loss = loss(y_pred[in_shop],
                             y_val[tar][in_shop],
                             ifchecked=False)
            report_rows.append(
                [key, day, loss_score, shop, shop_loss, best_ntree_limit])

    # Written once, after all days, so the file holds every day of this
    # call instead of only the last one.
    results_df = pd.DataFrame(report_rows, columns=report_columns)
    results_df.to_csv('oldFeatures_num_rounds_cluster_shopID_tar.csv',
                      index=False)
    return True
def rfOrETR_para_tuning(source, pickle_model_file_name, ReportFolder, estimator_withParams, fgfs, report_file,report_per_shop, predictors_type, predictors, target_variables, iterOrCopy = ["iterative"], estimator_output_length = 14, required_prediction_length = 14, n_folds = 5, ifSavePickle= False):
    """Fit an estimator on a 90% split and report held-out losses.

    The estimator is fitted on the training part of a 90/10 split
    (``random_state=0``), wrapped in ``IterativePredictionModel`` and
    evaluated on the held-out part once with ``how="iterative"`` and once
    with ``how="copy"``.  For each mode:

    * one summary row (OOB score, loss, settings, feature importances in
      descending order) is appended to ``report_file``;
    * one ``shop_id,loss`` line per shop id in 1..2000 present in the
      held-out data is appended to a per-shop CSV inside ``ReportFolder``.

    Args:
        source: DataFrame with ``predictors``, ``target_variables``,
            ``'shop_id'`` and ``'day'`` columns.
        estimator_withParams: unfitted estimator exposing ``fit``,
            ``feature_importances_`` and ``oob_score_``.
        fgfs: passed through to ``IterativePredictionModel``.
        report_file: path of the summary CSV.
        predictors: list of feature column names.
        target_variables: list of target column names; the first
            ``estimator_output_length`` are used for fitting.

    ``pickle_model_file_name``, ``report_per_shop``, ``iterOrCopy``,
    ``n_folds`` and ``ifSavePickle`` are accepted for interface
    compatibility but not used: both prediction modes are always run and
    the per-shop file name is derived from the settings.

    NOTE(review): ``algorithm_name``, ``n_estimators``,
    ``min_samples_split`` and ``min_samples_leaf`` are not parameters; they
    are read from the enclosing module scope - confirm they are defined.
    """
    id_columns = ['shop_id', 'day']
    target_variables_full = target_variables
    target_variables = target_variables[0:estimator_output_length]
    X = source[id_columns + predictors]
    y = source[target_variables_full + id_columns]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=0)
    # Fit on the training part only, so the held-out rows scored below were
    # not seen during fitting.
    X_fit = X_train[predictors]
    y_fit = y_train[target_variables]
    if len(target_variables) == 1:
        y_fit = y_fit.values.ravel()
    print("fitting regressor..")
    estimator_withParams.fit(X_fit, y_fit)
    print("finish!")

    x = IterativePredictionModel(estimator_withParams, estimator_output_length, required_prediction_length, fgfs, predictors)

    # NOTE(review): the report's "iterLength" is taken to be the estimator
    # output length - confirm.
    iter_length = estimator_output_length

    # Columns of y_test holding target values (everything but the ids).
    target_mask = np.logical_and(y_test.columns != 'shop_id', y_test.columns != 'day')
    test_shop_ids = y_test['shop_id'].values

    for mode in ["iterative", "copy"]:
        per_shop_name = "%s_%s_iterLength_%s_n%s_minS%s_minL%s_iterOrCopy_%s.csv" % (
            algorithm_name, predictors_type, iter_length, n_estimators, min_samples_split, min_samples_leaf, mode)
        per_shop_path = os.path.join(ReportFolder, per_shop_name)

        y_pred = x.do_iterative_prediction(X_test, how=mode)
        loss_score = loss(y_pred, y_test[target_variables_full])
        print("loss:  %s" % (loss_score,))

        # Feature importances, highest first; the first occurrence wins if a
        # predictor name is repeated.
        feature_imp = {}
        for name, importance in zip(predictors, estimator_withParams.feature_importances_):
            feature_imp.setdefault(name, importance)
        ranked_features = sorted(feature_imp.items(), key=operator.itemgetter(1), reverse=True)

        summary_columns = ['oob_score_result', 'loss_score', 'n_estimators', 'min_samples_split',
                           'min_samples_leaf', 'predictors_type', 'iterLength', 'iterOrCopy']
        summary_values = [estimator_withParams.oob_score_, loss_score, n_estimators, min_samples_split,
                          min_samples_leaf, predictors_type, iter_length, mode]
        for name, importance in ranked_features:
            summary_columns.append(name)
            summary_values.append(importance)
        results_df = pd.DataFrame([summary_values], columns=summary_columns)
        # The header is written on every append on purpose: the importance
        # columns are ordered per run, so each row carries its own header.
        write_mode = 'a' if os.path.exists(report_file) else 'w'
        results_df.to_csv(report_file, index=False, mode=write_mode)

        # Gather predicted and true values per shop across held-out rows.
        per_shop = {}
        for i in range(y_test.shape[0]):
            predicted, actual = per_shop.setdefault(test_shop_ids[i], ([], []))
            actual.extend(y_test.iloc[i, target_mask])
            predicted.extend(y_pred[i])

        with open(per_shop_path, 'a+') as fw:
            for shop in range(1, 2001):
                if shop in per_shop:
                    predicted, actual = per_shop[shop]
                    fw.write(','.join(map(str, [shop, loss(predicted, actual)])) + '\n')