Code Example #1
File: model_A4.py Project: WisleyWang/2020-
# Imports assumed by this snippet:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from lightgbm.sklearn import LGBMRegressor

def build_model(train_data, test, pred, label, seed=2099, is_shuffle=True):
    train_pred = np.zeros((train_data.shape[0], ))
    test_pred = np.zeros((test.shape[0], ))
    n_splits = 5
    # Kfold
    fold = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=seed)
    kf_way = fold.split(train_data[pred])
    # params
    #     test_x=np.concatenate([test[pred].values,geohash_test],axis=1)
    # train
    for n_fold, (train_idx, valid_idx) in enumerate(kf_way, start=1):
        train_x, train_y = train_data[pred].iloc[train_idx].values, train_data[
            label].iloc[train_idx]
        valid_x, valid_y = train_data[pred].iloc[valid_idx].values, train_data[
            label].iloc[valid_idx]
        #         geohash_tr_x,geohash_val_x=geohash_train[train_idx],geohash_train[valid_idx]
        #         train_x=np.concatenate([train_x,geohash_tr_x],axis=1)
        #         valid_x=np.concatenate([valid_x,geohash_val_x],axis=1)

        # data loading
        clf = LGBMRegressor(
            learning_rate=0.5,
            n_estimators=6000,
            boosting_type='gbdt',
            objective='regression',
            num_leaves=156,
            subsample=0.8,
            n_jobs=-1,
            max_depth=6,
            reg_lambda=0,
            colsample_bytree=0.8,
            random_state=2019,  # 2019
            metric=['mse'])

        clf.fit(train_x,
                train_y,
                eval_set=[(valid_x, valid_y)],
                eval_metric=['mse'],
                categorical_feature='auto',
                early_stopping_rounds=100,
                verbose=100)

        train_pred[valid_idx] = clf.predict(valid_x,
                                            num_iteration=clf.best_iteration_)

        test_pred += clf.predict(
            test[pred], num_iteration=clf.best_iteration_) / fold.n_splits

    print('mean_squared_error:',
          mean_squared_error(train_data[label].values, train_pred))
    test['label'] = test_pred
    return test[['loadingOrder', 'label']], clf
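A minimal call sketch for build_model above, assuming the pre-4.0 LightGBM fit API the snippet uses (early_stopping_rounds as a fit argument); the frames, the feature list, and the 'loadingOrder' column are hypothetical stand-ins inferred from the function body:

import numpy as np
import pandas as pd

# hypothetical data shaped the way build_model expects
features = ['f0', 'f1', 'f2']
train_df = pd.DataFrame(np.random.rand(100, 3), columns=features)
train_df['label'] = np.random.rand(100)
test_df = pd.DataFrame(np.random.rand(20, 3), columns=features)
test_df['loadingOrder'] = range(20)

sub, model = build_model(train_df, test_df, pred=features, label='label')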
Code Example #2
# `primitive` and `handle_data` are helpers from the surrounding project (not shown).
class LGBMRegressorPrim(primitive):
    def __init__(self, random_state=0):
        super(LGBMRegressorPrim, self).__init__(name='LGBMRegressor')
        self.hyperparams = []
        self.type = 'Regressor'
        self.description = "LightGBM is a gradient boosting framework that uses tree based learning algorithms."
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = LGBMRegressor()
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.model.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        output['predictions'] = self.model.predict(output['X'])
        output['X'] = pd.DataFrame(output['predictions'], columns=[self.name+"Pred"])
        final_output = {0: output}
        return final_output
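A hedged usage sketch for the primitive above, assuming handle_data is a simple pass-through of a dict holding 'X' and 'Y' (the real project's helper may do more):

import numpy as np
import pandas as pd

def handle_data(data):  # hypothetical stand-in for the project's helper
    return data

prim = LGBMRegressorPrim()
data = {'X': pd.DataFrame(np.random.rand(50, 4)), 'Y': np.random.rand(50)}
prim.fit(data)
out = prim.produce(data)
print(out[0]['X'].head())  # predictions wrapped in a one-column DataFrame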
Code Example #3
def get_ntree():
    rmse_t_total, rmse_v_total = [], []
    for ntree in range(10, 500, 10):
        lgb_base = LGBMRegressor(n_estimators=ntree,
                                 objective='regression',
                                 random_state=1234,
                                 n_jobs=2,
                                 colsample_bytree=0.8,
                                 reg_alpha=1,
                                 max_depth=10,
                                 subsample=0.8)

        print('current ntree = %s' % ntree)
        lgb_base.fit(X_t, y_t)
        y_t_pre = lgb_base.predict(X_t)
        y_v_pre = lgb_base.predict(X_v)
        rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre))
        rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre))
        rmse_t_total.append(rmse_t_each)
        rmse_v_total.append(rmse_v_each)
        with open('D:\\workspace python\\statContest\\save\\' +
                  'lgbbase2_rmse_0412.txt',
                  'a',
                  encoding='utf-8') as myfile:
            print(rmse_t_each, ',', rmse_v_each, file=myfile)
    return rmse_t_total, rmse_v_total
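The two RMSE traces returned by get_ntree (which relies on module-level X_t, y_t, X_v, y_v) can be plotted to eyeball a good n_estimators; a minimal sketch assuming matplotlib is available:

import matplotlib.pyplot as plt

rmse_t_total, rmse_v_total = get_ntree()
ntrees = list(range(10, 500, 10))  # matches the loop above
plt.plot(ntrees, rmse_t_total, label='train RMSE')
plt.plot(ntrees, rmse_v_total, label='valid RMSE')
plt.xlabel('n_estimators')
plt.ylabel('RMSE')
plt.legend()
plt.show()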
Code Example #4
File: model_A2.py Project: WisleyWang/2020-
def build_onetrain(train_data, test, pred=features, label='label', seed=1099, est=6000, is_shuffle=True):
    train_x, train_y = train_data[features].values, train_data[label].values
    clf = LGBMRegressor(
        learning_rate=0.01,
        boosting_type='gbdt',
        objective='regression',
        n_estimators=est,
        num_leaves=156,
        subsample=0.8,
        n_jobs=-1,
        max_depth=8,
        reg_lambda=0,
        colsample_bytree=0.8,
        random_state=2019,  # 2019
        metric=['mse'])

    clf.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y)],
        eval_metric=['mse'],
        categorical_feature='auto',
        verbose=100)

    # train_pred = clf.predict(train_x, num_iteration=clf.best_iteration_)

    test_pred = clf.predict(test[pred], num_iteration=clf.best_iteration_)

    # print('mean_squared_error:', mean_squared_error(train_y, train_pred))
    test['label'] = test_pred
    return test[['loadingOrder', 'label']], clf
Code Example #5
def train_lightgbm(verbose=True):
    """Train a boosted tree with LightGBM."""
    if verbose: print("Training with LightGBM")
    df = pd.read_csv(STAGE1_LABELS)
    x = np.array([
        np.mean(np.load(os.path.join(FEATURE_FOLDER, '%s.npy' % str(id))),
                axis=0).flatten() for id in df['id'].tolist()
    ])
    y = df['cancer'].values

    # train_test_split from sklearn.model_selection
    trn_x, val_x, trn_y, val_y = train_test_split(
        x, y, random_state=42, stratify=y, test_size=0.20)
    '''
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2'},
        'num_leaves': 21,
        'learning_rate': 0.001,
        'nthread':24,
        'subsample':0.80,
        'colsample_bytree':0.80,
        'seed':42,
        'verbose': verbose,
    }
    '''

    skf = StratifiedKFold(n_splits=5, random_state=2048, shuffle=True)
    result = []
    clfs = []
    oof_preds = []
    for train_index, test_index in skf.split(x, y):
        trn_x, val_x = x[train_index, :], x[test_index, :]
        trn_y, val_y = y[train_index], y[test_index]

        val_ids = pd.DataFrame(df['id'].iloc[test_index].values, columns=['id'])

        clf = LGBMRegressor(max_depth=50,
                            num_leaves=21,
                            n_estimators=5000,
                            min_child_weight=1,
                            learning_rate=0.001,
                            nthread=24,
                            subsample=0.80,
                            colsample_bytree=0.80,
                            seed=42)

        clf.fit(trn_x,
                trn_y,
                eval_set=[(val_x, val_y)],
                verbose=verbose,
                eval_metric='l2',
                early_stopping_rounds=300)

        val_preds = pd.DataFrame(clf.predict(val_x), columns=["cancer"])
        oof_preds.append(pd.concat([val_ids, val_preds], axis=1))
        clfs.append(clf)

    return clfs, oof_preds
Code Example #6
    def LGB_train(self,X_train, X_valid, labels_train, labels_valid, X_test, lgb_param_all):
        lgb_param_contrl = {'early_stopping_rounds': 100, 'categorical_feature': 'auto'}
        lgb_param = lgb_param_all.copy()
        objective_type = lgb_param['objective_type']
        lgb_param.pop('objective_type')

        for k in ['early_stopping_rounds', 'categorical_feature']:
            if k in lgb_param:
                lgb_param_contrl[k] = lgb_param[k]
                lgb_param.pop(k)

        if not self.config.retrain:
            # incremental training: continue from a previously saved model
            model_load = self.load_model()
            if not model_load:
                print('Model {} does not exist, training from scratch'.format(self.modelName))
                if objective_type == 'regressor':
                    clf = LGBMRegressor(**lgb_param)
                else:
                    clf = LGBMClassifier(**lgb_param)

                clf.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                        early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                        categorical_feature=lgb_param_contrl['categorical_feature'])
            else:
                clf = model_load.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                                     early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                                     categorical_feature=lgb_param_contrl['categorical_feature'])
        else:
            if objective_type == 'regressor':
                clf = LGBMRegressor(**lgb_param)
            else:
                clf = LGBMClassifier(**lgb_param)
            clf.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                    early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                    categorical_feature=lgb_param_contrl['categorical_feature'])


        val_lgb_pre = clf.predict(X_valid.values, num_iteration=clf.best_iteration_)
        test_lgb_pre = clf.predict(X_test.values, num_iteration=clf.best_iteration_)

        metrics_name = self.config.metrics_name
        myMetrics = defindMetrics.MyMetrics(metrics_name)
        score_lgb = myMetrics.metricsFunc(val_lgb_pre, labels_valid)

        self.save_model(clf, self.config.saveModel)
        return val_lgb_pre, test_lgb_pre, score_lgb
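The shape that lgb_param_all must have can be read off LGB_train above; a hypothetical example (key names inferred from the method, values illustrative only):

lgb_param_all = {
    'objective_type': 'regressor',    # popped before the model is constructed
    'n_estimators': 2000,
    'learning_rate': 0.05,
    'num_leaves': 63,
    'early_stopping_rounds': 200,     # moved into lgb_param_contrl
    'categorical_feature': 'auto',    # likewise
}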
Code Example #7
def lightGBM_train_nocross(j, param, x_train, x_test, y_train, y_test):
    # rmse_lightGBM and r2_lightGBM are module-level result lists
    gbm = LGBMRegressor(**param, num_leaves=31, learning_rate=0.01, objective='regression')
    gbm.fit(x_train, y_train)
    y_pred = gbm.predict(x_test)
    y_pred = DataFrame(y_pred)
    rmse_lightGBM.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2_lightGBM.append(r2_score(y_test, y_pred))
    return rmse_lightGBM, r2_lightGBM, gbm
Code Example #8
def lgb(x_train, y_train, x_val, y_val):
    lgb = LGBMRegressor(n_estimators=1000,
                        max_depth=10,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        learning_rate=0.01,
                        random_state=2020)
    lgb.fit(x_train, y_train)
    result = lgb.predict(x_val)
    score = mean_absolute_error(result, y_val)
    return score
Code Example #9
 def get_model_result(self, params: dict) -> dict:
     X, y = self.X, self.Y
     X_test, y_test = self.X_test, self.Y_test
     # X, y = self.X.values, self.Y.values
     # X_test, y_test = self.X_test.values, self.Y_test.values
     if isinstance(self.estimator, lgb.Booster):
         params["metric"] = "auc"
         estimator = lgb.train(params, self.dataset_train)
         pred_train = pd.Series(estimator.predict(self.dataset_train),
                                index=self.X.index)
         pred_test = pd.Series(estimator.predict(self.dataset_test),
                               index=self.X_test.index)
     elif isinstance(self.estimator, LGBMRegressor):
         estimator = LGBMRegressor(**params)
         estimator.fit(X, y, eval_metric="auc")
         pred_train = pd.Series(estimator.predict(X), index=self.X.index)
         pred_test = pd.Series(estimator.predict(X_test),
                               index=self.X_test.index)
     elif isinstance(self.estimator, LGBMClassifier):
         estimator = LGBMClassifier(**params)
         estimator.fit(X, y, eval_metric="auc")
         pred_train = pd.Series(estimator.predict_proba(X)[:, 1],
                                index=self.X.index)
         pred_test = pd.Series(estimator.predict_proba(X_test)[:, 1],
                               index=self.X_test.index)
     else:
         raise TypeError(
             "Input model should be a `lgb.Booster` or `LGBMClassifier`/`LGBMRegressor`!"
         )
     # blank out scores for rows outside hit_indices
     pred_train.loc[~pred_train.index.isin(self.hit_indices)] = np.nan
     pred_test.loc[~pred_test.index.isin(self.hit_indices)] = np.nan
     # compute the model evaluation metrics
     ks_train, ks_test = calc_ks(-pred_train,
                                 y), calc_ks(-pred_test, y_test)
     auc_train, auc_test = calc_auc(pred_train,
                                    y), calc_auc(pred_test, y_test)
     # return {'train': (ks_train, auc_train), 'test': (ks_test, auc_test)}
     return {"ks": (ks_train, ks_test), "auc": (auc_train, auc_test)}
Code Example #10
def get_model(brand_string, train_brand, test_brand):
    brand1 = pd.read_csv(brand_string)
    brand1 = brand1.iloc[90:, :].reset_index(drop=True)
    X_brand1 = brand1.drop(['brand', 'cnt'], axis=1)
    y_train = brand1['cnt'].values

    X_train = pd.concat([X_brand1, train_brand], axis=1)

    X_test = test.drop(['cnt'], axis=1)  # 'test' is a module-level DataFrame
    X_test = pd.concat([X_test, test_brand], axis=1)

    model = LGBMRegressor().fit(X_train, y_train)
    brand1_pre = model.predict(X_test)
    return brand1_pre
Code Example #11
def tune_params():
    rmse_t_total, rmse_v_total = [], []
    for max_depth in range(6, 11):
        for subsample in [0.6, 0.7, 0.8]:
            for colsample_bytree in [0.6, 0.7, 0.8]:
                for reg_alpha in [0.1, 1, 10]:
                    lgb_base = LGBMRegressor(n_estimators=150,
                                             objective='regression',
                                             random_state=1234,
                                             n_jobs=3,
                                             colsample_bytree=colsample_bytree,
                                             reg_alpha=reg_alpha,
                                             max_depth=max_depth,
                                             subsample=subsample)
                    _params = {
                        'max_depth': max_depth,
                        'subsample': subsample,
                        'colsample_bytree': colsample_bytree,
                        'reg_alpha': reg_alpha,
                    }
                    lgb_base.fit(X_t, y_t)
                    y_t_pre = lgb_base.predict(X_t)
                    y_v_pre = lgb_base.predict(X_v)
                    rmse_t_each = np.sqrt(mean_squared_error(y_t, y_t_pre))
                    rmse_v_each = np.sqrt(mean_squared_error(y_v, y_v_pre))
                    rmse_t_total.append(rmse_t_each)
                    rmse_v_total.append(rmse_v_each)
                    print(_params)
                    with open('D:\\workspace python\\statContest\\save\\' +
                              'lgbbase2_saveparams_rmse_0412.txt',
                              'a',
                              encoding='utf-8') as myfile1:
                        print(_params['max_depth'],
                              _params['subsample'],
                              _params['colsample_bytree'],
                              _params['reg_alpha'],
                              file=myfile1)

                    print(rmse_t_each, rmse_v_each)
                    with open('D:\\workspace python\\statContest\\save\\' +
                              'lgbbase2_tunparms_rmse_0412.txt',
                              'a',
                              encoding='utf-8') as myfile:
                        print(rmse_t_each, ',', rmse_v_each, file=myfile)
    return rmse_t_total, rmse_v_total
Code Example #12
def predict(X_train, Y_train, X_test):
    print("Y_train is 1:", Y_train.count(1))
    print("Y_train is 0:", Y_train.count(0))
    clfs = [
        LGBMRegressor(learning_rate=0.0475, max_depth=13, n_estimators=100, num_leaves=80),
        XGBRegressor(learning_rate=0.0475, max_depth=4, n_estimators=300)]
    X = np.array(X_train, dtype='float32')
    y = np.array(Y_train, dtype='float32')
    X_predict = np.array(X_test, dtype='float32')
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)), dtype='float32')
    dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)), dtype='float32')

    '''5-fold stacking'''
    n_folds = 5
    skf = StratifiedKFold(n_splits=n_folds)
    for j, clf in enumerate(clfs):
        '''train each base model in turn'''
        print("clf", j)
        dataset_blend_test_j = np.zeros((X_predict.shape[0], n_folds), dtype='float32')
        for i, (train, test) in enumerate(skf.split(X, y)):
            '''use part i for prediction, train on the remaining parts, and use the predicted output as a new feature for part i.'''
            print("stacking Fold", i)
            X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
            # if j == 0:
            #     class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
            #     clf.class_weight = dict(enumerate(class_weights))
            # else:
            #     class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
            #     clf.scale_pos_weight = class_weights[1] / class_weights[0]
            #     print('scale_pos_weight:', clf.scale_pos_weight)
            clf.fit(X_train, y_train)
            y_submission = clf.predict(X_test)
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict(X_predict)
        '''for the test set, use the mean of the k models' predictions as the new feature'''
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
        del dataset_blend_test_j
        # print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j]))
    # clf = LogisticRegression()
    # clf = GradientBoostingRegressor(learning_rate=0.02, max_depth=6)
    clf = LGBMRegressor()
    class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)
    clf.class_weight = dict(enumerate(class_weights))
    dataset_blend_train = np.append(dataset_blend_train, X, axis=1)
    dataset_blend_test = np.append(dataset_blend_test, X_predict, axis=1)
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict(dataset_blend_test)
    return y_submission
Code Example #13
def train_lgb_model(best_nodes, X_train_scaled, Y_train):

    rsg = LGBMRegressor(
        learning_rate=best_nodes["learning_rate"],
        n_estimators=int(best_nodes["n_estimators"]),
        max_depth=best_nodes["max_depth"],
        #eval_metric=best_nodes["eval_metric"],
        num_leaves=best_nodes["num_leaves"],
        subsample=best_nodes["subsample"],
        colsample_bytree=best_nodes["colsample_bytree"],
        min_child_samples=best_nodes["min_child_samples"],
        min_child_weight=best_nodes["min_child_weight"])

    rsg.fit(X_train_scaled, Y_train)
    Y_pred = rsg.predict(X_train_scaled)
    print("mse:", np.mean((Y_pred - Y_train)**2))
    print("rmse:", np.sqrt(np.mean((Y_pred - Y_train)**2)))
    return rsg
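best_nodes is presumably the result of a hyperparameter search (e.g. hyperopt); a hypothetical dict with exactly the keys the function reads:

best_nodes = {
    'learning_rate': 0.05,
    'n_estimators': 300.0,   # cast to int inside the function
    'max_depth': 7,
    'num_leaves': 63,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_samples': 20,
    'min_child_weight': 0.001,
}
rsg = train_lgb_model(best_nodes, X_train_scaled, Y_train)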
Code Example #14
def lightBGM_model_with_test(X, Y):
    model = LGBMRegressor(num_leaves=36,
                          n_estimators=100,
                          learning_rate=0.07,
                          random_state=0)

    useful_feature = get_useful_features_byLightBGM(X, Y)
    X_U = X[useful_feature]

    x1, x2, y1, y2 = train_test_split(X_U, Y, test_size=0.2)
    y1_log = np.log1p(y1)
    model.fit(x1, y1_log, verbose=True)

    predict_log = model.predict(x2)
    predict = np.expm1(predict_log)
    error = error_fun(predict, y2)[1]

    del x1, x2, y1, y2
    return error
Code Example #15
    def fit(self):
        if self.First_change:  # Box-Cox transform
            act = boxcox(self.train_label + 0.1)[0]
            self.act_ = boxcox(self.train_label + 0.1)[1]
        else:
            act = self.train_label
        steps = self.steps
        actual = act
        n_samples = len(self.train_label)
        y_pred_train = np.zeros(n_samples, np.float32)
        n_estimators_list = self.n_estimators_list
        for i in range(1):
            num = np.random.randint(0, 5000)
            print("----training begin----")
            for step in range(steps):
                print(step)
                actual = actual - y_pred_train  # compute residuals
                if step > 0:  # compress the residual labels (sigmoid), then Box-Cox
                    actual_ = sigmod(actual)
                    actual_box = boxcox(actual_)[0]
                    actual_box_val = boxcox(actual_)[1]
                    self.box_value.append(actual_box_val)
                    actual_used = actual_box
                else:
                    actual_used = actual
                # build the stage model
                model = LGBMRegressor(n_estimators=n_estimators_list[step],
                                      max_depth=3,
                                      learning_rate=0.02,
                                      subsample=1,
                                      colsample_bytree=1)
                model.fit(self.train.values, actual_used)  # train the stage model
                y_pred_train_ = model.predict(self.train.values)  # stage predictions
                if step > 0:  # invert the transforms to recover the stage output
                    y_pred_train = (y_pred_train_ * actual_box_val +
                                    1)**(1 / actual_box_val)
                    y_pred_train = sigmod_trans(y_pred_train)
                else:
                    y_pred_train = y_pred_train_
                self.model_list.append(model)  # store the stage model
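sigmod and sigmod_trans are not shown in the snippet; a plausible minimal sketch, assuming a logistic squash into (0, 1) and its logit inverse (the project's actual definitions may differ):

import numpy as np

def sigmod(x):
    # squash residuals into (0, 1) so Box-Cox can be applied
    return 1.0 / (1.0 + np.exp(-x))

def sigmod_trans(p):
    # logit: inverse of sigmod, mapping values back to the residual scale
    return np.log(p / (1.0 - p))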
Code Example #16
def modelingLGBM(hold_out_train,hold_out_test):
    from sklearn.linear_model import LassoCV as LaCV
    from sklearn.ensemble import RandomForestRegressor as RFR

    from sklearn.linear_model import Ridge
    from sklearn.linear_model import RANSACRegressor
    from sklearn.neural_network import MLPRegressor as MLP
    from xgboost.sklearn import XGBRegressor as XGBR
    from xgboost.sklearn import DMatrix
    from lightgbm.sklearn import LGBMRegressor as LGBM

    traindata=hold_out_train.copy()
    testdata=hold_out_test.copy()
    traindata=traindata.drop(['Store','Customers','Date','Open','PromoInterval','monthstr'],axis=1)
    testdata=testdata.drop(['Store','Customers','Date','Open','PromoInterval','monthstr'],axis=1)
    train_x=traindata.drop(['Sales'],axis=1)
    train_y=np.log1p(traindata['Sales'])
    test_x=testdata.drop(['Sales'],axis=1)

    # # normalization
    # min_max_scaler = MinMaxScaler()
    # train_x = min_max_scaler.fit_transform(train_x)
    # test_x = min_max_scaler.fit_transform(test_x)

    smallest_rmspe = 1000
    subsamples = np.arange(0.5, 0.6, 0.1)
    for subsample in subsamples:
        time1 = time.time()
        lgbmModel = LGBM(n_estimators=8000, subsample=subsample)  # grid over subsample
        print(lgbmModel)
        lgbmModel.fit(train_x, train_y)
        sales_predict = lgbmModel.predict(test_x)
        rmspe = RMSPE(testdata['Sales'], np.expm1(sales_predict))
        print(rmspe)
        time2 = time.time()
        print('elapsed time:', (time2 - time1))
        if smallest_rmspe > rmspe:
            smallest_rmspe = rmspe
            best_model = lgbmModel
    return best_model
Code Example #17
class b_model:
    # class-level (shared) attributes can be defined here
    params = {
        'learning_rate': 0.015,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mse',
        'num_leaves': 12,
        'max_depth': 9,
        'max_bin': 130,
        'feature_fraction': 0.9,
        'reg_lambda': 50,
        'min_data': 25,
        'min_child_weight': 0.001,
        'verbose': -1,
    }
    no_use = [
        "血糖", "blood_sugar", "id", "blood_sugar_log", '体检日期',
        'feature_5_less_25', 'feature_4_less_60', '性别'
    ]

    def __init__(self):
        # parameters needed when the class is instantiated
        self.model = LGBMRegressor(learning_rate=0.015,
                                   objective="regression",
                                   metric='mse',
                                   num_leaves=12,
                                   max_depth=9,
                                   max_bin=130,
                                   feature_fraction=0.9,
                                   reg_lambda=50,
                                   min_data=25,
                                   min_child_weight=0.001,
                                   n_estimators=3000,
                                   random_state=42)

    def __make_feature(self, train, test):
        # build features
        if train.empty:
            test['性别'] = test['性别'].map({'男': 1, '女': 0, '??': 1})
            return test
        if test.empty:
            train['性别'] = train['性别'].map({'男': 1, '女': 0, '??': 1})
            return train
        else:
            train_id = train.id.values.copy()
            test_id = test.id.values.copy()
            data = pd.concat([train, test])
            data['性别'] = data['性别'].map({'男': 1, '女': 0, '??': 1})
            train_feat = data[data.id.isin(train_id)]
            test_feat = data[data.id.isin(test_id)]
            return train_feat, test_feat

    def fit(self, X, y=None):
        X.drop(X[X["年龄"] >= 84].index, inplace=True)
        fea_train = pd.read_csv("./feature/fea_train.csv")
        fea_train1 = pd.read_csv("./feature/fea_train_1.csv")
        fea_train2 = pd.read_csv("./feature/fea_train_2.csv")
        X = pd.merge(X, fea_train, how="left", on="id")
        X = pd.merge(X, fea_train1, how="left", on="id")
        X = pd.merge(X, fea_train2, how="left", on="id")
        X = self.__make_feature(train=X, test=pd.DataFrame())
        if y is None:
            y = X["血糖"].values
        predictors = [f for f in list(X.columns) if f not in self.no_use]
        X_train, X_test, y_train, y_test = train_test_split(X[predictors],
                                                            y,
                                                            test_size=0.1,
                                                            random_state=42)
        self.model.fit(X_train[predictors],
                       y_train,
                       eval_metric="mse",
                       early_stopping_rounds=100,
                       verbose=100,
                       eval_set=[(X_test[predictors], y_test)])
        from sklearn.metrics import mean_squared_error
        print("线下误差:{}".format(0.5 * mean_squared_error(
            y_test, self.model.predict(X_test[predictors]))))
        return self

    def predict(self, X):
        # predict on the test set with the trained model
        fea_test = pd.read_csv("./feature/fea_test.csv")
        fea_test1 = pd.read_csv("./feature/fea_test_1.csv")
        fea_test2 = pd.read_csv("./feature/fea_test_2.csv")
        X = pd.merge(X, fea_test, how="left", on="id")
        X = pd.merge(X, fea_test1, how="left", on="id")
        X = pd.merge(X, fea_test2, how="left", on="id")
        X = self.__make_feature(test=X, train=pd.DataFrame())
        predictors = [f for f in list(X.columns) if f not in self.no_use]
        test_pred = self.model.predict(X[predictors])
        print("最大值:{}".format(test_pred.max()))
        return test_pred

    def get_params(self):
        return self.params
Code Example #18

from lightgbm.sklearn import LGBMRegressor
from xgboost.sklearn import XGBRegressor
from sklearn import ensemble
from sklearn.metrics import mean_squared_error,mean_absolute_error
import pandas as pd

data = pd.read_csv('original_train.csv')
test =  pd.read_csv('original_test.csv')

nn_train = pd.read_csv('nn_train_7day.csv')
nn_test = pd.read_csv('nn_test_7day.csv')

nn_train = nn_train[['nn_4', 'nn_8', 'nn_14', 'nn_7', 'nn_18', 'nn_16', 'nn_22', 'nn_15']]

y_train = data.loc[90:,'count1'].values
y_test = test['count1']

model = LGBMRegressor().fit(nn_train,y_train)
y_pre = model.predict(nn_test)

print(mean_squared_error(y_pre,y_test))
print(mean_absolute_error(y_pre,y_test))
print(sorted(zip(map(lambda x: round(x, 4), model.feature_importances_), 
                 nn_train.columns), 
             reverse=True))



Code Example #19
File: lgb.py Project: Mandule/DIGIX-RANK-2020
    clf = LGBMRegressor(  # constructor opening inferred from the truncated snippet
        learning_rate=0.1,
        num_leaves=255,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=2020,
        metric='RMSE',
        n_jobs=24,
    )
    clf.fit(
        X_trn,
        Y_trn,
        eval_set=[(X_val, Y_val)],
        early_stopping_rounds=200,
        verbose=1000,
    )
    oof[val_idx] = clf.predict(X_val)
    sub += clf.predict(X_test) / skf.n_splits

sub = pd.DataFrame({
    'queryid': test.query_id,
    'documentid': test.doc_id,
    'predict_label': sub,
})

oof = pd.DataFrame({
    'query_id': train.query_id,
    'doc_id': train.doc_id,
    'oof': oof,
    'label': train.label,
})
Code Example #20
ks_auc=pd.DataFrame()
for feature_num in feature_num_range:  # compute test-set KS and AUC for each feature_num
    
    chosen_feature = feat_imp.index[:feature_num]  # top feature_num features by importance
    
    lgbm_model.set_params(n_estimators=500)
    lgbm_param_temp = lgbm_model.get_params()
    
    lgbm_train = lgb.Dataset(X.loc[:,chosen_feature],Y)
    
    cvresult = lgb.cv(lgbm_param_temp, lgbm_train, num_boost_round=lgbm_param_temp['n_estimators'],nfold=5,metrics='auc',early_stopping_rounds=100)
    best_n_estimators_temp=len(cvresult['auc-mean'])
    
    lgbm_model.set_params(n_estimators=best_n_estimators_temp)
    lgbm_model.fit(X.loc[:,chosen_feature],Y,eval_metric='auc')   
    preds=lgbm_model.predict(P_test.loc[:,chosen_feature]) 
    
    ks_value,bad_percent,good_percent=pf.cal_ks(-preds,y_test,section_num=20)
                      
    false_positive_rate,recall,thresholds = roc_curve(y_test, preds)
    roc_auc=auc(false_positive_rate,recall) 
    
    ks_auc=pd.concat([ks_auc,pd.DataFrame([np.max(ks_value),roc_auc]).T])
    
ks_auc.columns=['ks','auc']
ks_auc.index=feature_num_range
    
print(ks_auc)    


'''final_feature_num can be set to the value with the highest KS+AUC, or specified manually'''
Code Example #21
            unuseful_feature.append(i[1])
    use_features = [aa for aa in features if aa not in unuseful_feature]
    print('useful:', len(use_features))
    print('useless:', len(unuseful_feature))
    print('total:', len(features))

    train_X_1 = train_[use_features]

    x1, x2, y1, y2 = train_test_split(train_X_1, train_y_1, test_size=0.2)

    model_1 = LGBMRegressor(learning_rate=0.07,
                            num_leaves=41,
                            n_estimators=110,
                            random_state=0)
    model_1.fit(x1, y1.values.ravel(), verbose=True)
    val_1 = model_1.predict(x2)
    '''
    preds_1 = model_1.predict(test_X)
    '''
    print(error_(val_1, y2))
    val_1_error = error_(val_1, y2)

    del x1, x2, y1, y2
    gc.collect()

    ##############################################################
    train_y_2 = train_['舒张压']

    model_2 = LGBMRegressor(num_leaves=36,
                            n_estimators=140,
                            random_state=0,
Code Example #22
File: fundamental_09.py Project: HC-kang/fundamental
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    results = results.sort_values('RMSLE')

    return results

param_grid = {
    'n_estimators' : [50, 100],
    'max_depth' : [1, 10]
}

model = LGBMRegressor(random_state=random_state)
my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5)

model = LGBMRegressor(max_depth=10, n_estimators=100, random_state=random_state)
model.fit(train, y)
prediction = model.predict(test)
prediction

prediction = np.expm1(prediction)
prediction

submission = pd.read_csv('sample_submission.csv')
submission.head()

submission['price']=prediction
submission.head()

submission_csv_path = ('submission_{}_RMSLE_{}.csv'.format('lgbm', '0.164399'))
submission.to_csv(submission_csv_path, index = False)
print(submission_csv_path)
Code Example #23
def reg_model(labelled_data, unlabelled_data):
    """ Parameters: training dataframe, unknown dataframe
        Returns: results dataframe (Instance, Income)

        ffill on NaN from training data,
        Replaces NaN in test data with ffill, 
        cat-encodes non-numeric fields, 
        scales values,
        80/20 splits data to help verify model, 
        uses LightGBM
    """

    # print("throwing away rows to speed up model")
    # speed up testing by throwing away some data
    # clean_labelled = labelled_data.sample(frac=0.2)
    clean_labelled = labelled_data.copy()
    clean_unlabelled = unlabelled_data.copy()

    print("cleaning data...")
    # get rid of weird value
    clean_labelled.loc[:,
                       "Work Experience in Current Job [years]"] = pandas.to_numeric(
                           labelled_data[
                               "Work Experience in Current Job [years]"],
                           errors="coerce")
    clean_unlabelled.loc[:,
                         "Work Experience in Current Job [years]"] = pandas.to_numeric(
                             unlabelled_data[
                                 "Work Experience in Current Job [years]"],
                             errors="coerce")
    print("mixed type issue fixed..")

    # fix additional income field
    clean_labelled.loc[:, "Yearly Income in addition to Salary (e.g. Rental Income)"] = pandas.to_numeric(
        np.fromiter(map(
            lambda s: s.replace(" EUR", ""),
            clean_labelled[
                "Yearly Income in addition to Salary (e.g. Rental Income)"],
        ),
                    dtype=float),
        errors="coerce")
    clean_unlabelled.loc[:, "Yearly Income in addition to Salary (e.g. Rental Income)"] = pandas.to_numeric(
        np.fromiter(map(
            lambda s: s.replace(" EUR", ""),
            clean_unlabelled[
                "Yearly Income in addition to Salary (e.g. Rental Income)"],
        ),
                    dtype=float),
        errors="coerce")

    # dropping useless columns
    drop_columns(clean_unlabelled)
    drop_columns(clean_labelled)

    # removing NaN values
    clean_labelled.fillna(method="ffill", inplace=True)
    clean_unlabelled = clean_unlabelled[all_columns]
    clean_unlabelled.fillna(method="ffill", inplace=True)

    # input data for final predictions
    unknown_data = clean_unlabelled.drop(["Instance"], axis=1)

    print("splitting data into train and test...")
    # 80/20 split, and separating targets
    split = split_data(clean_labelled)
    train_data, train_target, test_data, test_target = split

    print("encoding categorical data...")
    # categorical encoding
    cat = CatBoostEncoder()
    train_data = cat.fit_transform(train_data, train_target)
    test_data = cat.transform(test_data)
    unknown_data = cat.transform(unknown_data)

    # separate additional income
    train_add_income = train_data[
        "Yearly Income in addition to Salary (e.g. Rental Income)"].values
    test_add_income = test_data[
        "Yearly Income in addition to Salary (e.g. Rental Income)"].values
    unknown_add_income = unknown_data[
        "Yearly Income in addition to Salary (e.g. Rental Income)"].values

    train_data = train_data[no_income_columns]
    test_data = test_data[no_income_columns]
    unknown_data = unknown_data[no_income_columns]

    train_target = train_target[
        "Total Yearly Income [EUR]"].values - train_add_income
    test_target = test_target["Total Yearly Income [EUR]"].values

    print("scaling values...")
    # scaling values
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    unknown_data = scaler.transform(unknown_data)

    print("fitting model...")
    # fit model
    reg = LGBMRegressor()
    # reg = TransformedTargetRegressor(
    #     regressor=mod,
    #     transformer=scaler
    # )
    reg.fit(train_data, train_target)

    print("predicting test data...")
    test_result = reg.predict(test_data)
    # add additional income
    test_result = test_result + test_add_income

    print("analysing test results...")
    # validate test
    error = mean_absolute_error(test_target, test_result)
    score = explained_variance_score(test_target, test_result)
    print("Mean absolute error of test data: ", error)
    print("Score: ", score)

    print("predicting unknown data...")
    # predict and format
    values = reg.predict(unknown_data)
    values = values + unknown_add_income

    results = pandas.DataFrame({
        "Instance": clean_unlabelled["Instance"].values,
        "Total Yearly Income [EUR]": values
    })
    print("Finished.")
    return results
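A hedged call sketch for reg_model; the CSV file names are hypothetical, and the frames must contain the income and work-experience columns the function cleans:

import pandas

labelled = pandas.read_csv('train.csv')      # hypothetical file names
unlabelled = pandas.read_csv('test.csv')
results = reg_model(labelled, unlabelled)
results.to_csv('submission.csv', index=False)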
Code Example #24
File: lgb.py Project: Mandule/CCFDD-2020
    clf = LGBMRegressor(  # constructor opening inferred from the truncated snippet
        n_jobs=24,
    )

    clf.fit(
        X_trn,
        Y_trn,
        sample_weight=W_trn,
        eval_set=[(X_val, Y_val)],
        eval_metric='rmse',
        eval_sample_weight=[W_val],
        early_stopping_rounds=200,
        categorical_feature=category_feats,
        verbose=100,
    )

    oof[val_idx] = clf.predict(X_val)
    sub += clf.predict(X_sub) / gkf.n_splits
    feat_imp_df['imp'] += clf.feature_importances_ / gkf.n_splits

# In[ ]:

pred_sub = search_f1(df_train.label, oof, sub)

# In[ ]:

plt.figure(figsize=(15, 30))
feat_imp_df = feat_imp_df.sort_values('imp', ignore_index=True)
sns.barplot(x='imp', y='feat', data=feat_imp_df)
plt.savefig('imp.png')

# In[ ]:
Code Example #25
File: modelB.py Project: IlyaGusev/turing
    with open(os.path.join(folder, "clf_A.pkl"), 'wb') as file:
        pickle.dump(clf_A, file)
    with open(os.path.join(folder, "clf_B.pkl"), 'wb') as file:
        pickle.dump(clf_B, file)
    with open(os.path.join(folder, "vectorizers.pkl"), 'wb') as file:
        pickle.dump(vectorizers, file)

elif sys.argv[1] == "load":
    print("Loading")

    with open(os.path.join(folder, "clf_A.pkl"), 'rb') as file:
        clf_A = pickle.load(file)
    with open(os.path.join(folder, "clf_B.pkl"), 'rb') as file:
        clf_B = pickle.load(file)
    with open(os.path.join(folder, "vectorizers.pkl"), 'rb') as file:
        vectorizers = pickle.load(file)

print("Loading test")

test, *_ = process_data("../data/test/", train=False, vectorizers=vectorizers)

T = test[features].values
T = np.stack([np.concatenate(T[i]) for i in range(T.shape[0])])

print("Predicting")

pred_A = clf_A.predict(T)
pred_B = clf_B.predict(T)

pd.DataFrame(np.stack([pred_A, pred_B]).T, index=test.index, columns=["Alice", "Bob"]).to_csv("../submitions/answer-B.csv")
Code Example #26
xgb = XGBRegressor(  # constructor opening inferred from the truncated snippet
                   objective='reg:linear',
                   min_child_weight=6,
                   n_estimators=1000,
                   max_depth=7,
                   colsample_bytree=0.6)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
accuracy = xgb.score(X_test, y_test)
'Accuracy: ' + str(np.round(accuracy * 100, 2)) + '%'
mean_absolute_error(y_test, xgb_pred)
mean_squared_error(y_test, xgb_pred)
np.sqrt(mean_squared_error(y_test, xgb_pred))

lgb = LGBMRegressor(objective='regression')
lgb.fit(X_train, y_train)
lgb_pred = lgb.predict(X_test)
accuracy = lgb.score(X_test, y_test)
'Accuracy: ' + str(np.round(accuracy * 100, 2)) + '%'
mean_absolute_error(y_test, lgb_pred)
mean_squared_error(y_test, lgb_pred)
np.sqrt(mean_squared_error(y_test, lgb_pred))

from yellowbrick.regressor import ResidualsPlot
visualizer = ResidualsPlot(xgb)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer

rid_pred_t = rid.predict(X_train)
la_pred_t = la.predict(X_train)
plt.scatter(la_pred_t, y_train, c="blue", marker="s", label="Training data")
Code Example #27
File: lgbm_steam.py Project: zhangyang30003/steam
x_train = all_data[:2888]
x_test = all_data[2888:]
# split into train and validation sets
x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                  y_train,
                                                  test_size=0.2,
                                                  shuffle=True,
                                                  random_state=42)

########################################################################
print('start ML')
score = []
train_range = range(1, 1000, 10)
for i in train_range:
    print(i)
    lgr = LGBMRegressor(learning_rate=0.05,
                        n_estimators=i,
                        subsample=0.8,
                        subsample_freq=1,
                        colsample_bytree=0.8,
                        random_state=2019)
    lgr.fit(x_train, y_train)
    mse = mean_squared_error(y_val, lgr.predict(x_val))
    #    print(mse)
    score.append(mse)

plt.plot(train_range, score)

result = pd.DataFrame(lgr.predict(x_test))
result.to_csv('sub_8-6.txt', index=False, header=0)
Code Example #28
File: hyperparam.py Project: dawidkopczyk/blog
print("BEST PARAMETERS: " + str(best_params))

# Print best CV score
scores = [-trial['result']['loss'] for trial in trials.trials]
print("BEST CV SCORE: " + str(np.max(scores)))

# Print execution time
tdiff = trials.trials[-1]['book_time'] - trials.trials[0]['book_time']
print("ELAPSED TIME: " + str(tdiff.total_seconds() / 60))    

# Set params
est.set_params(**best_params)

# Fit    
est.fit(X_train, y_train)
y_pred = est.predict(X_test)

# Evaluate
score = r2_score(y_test, y_pred)
print("R2 SCORE ON TEST DATA: {}".format(score))

#==============================================================================
# Tree structure of hyperparameter space (Optional)
#============================================================================== 
# You must change the evaluate function in order to extract learning rate 
# and n_estimators from choices. Please add the following code to the start of 
# evaluate function
#    # Choices
#    if 'choices' in params.keys():
#        params['learning_rate'] = params['choices']['learning_rate']
#        params['n_estimators'] = params['choices']['n_estimators']
Code Example #29
for i in range(len(duration)):
    duration_hours.append(int(duration[i].split(sep = "h")[0]))    
    duration_mins.append(int(duration[i].split(sep = "m")[0].split(sep = "h")[-1]))

X["Duration_hours"] = duration_hours
X["Duration_mins"] = duration_mins
X.drop(["Duration"], axis = 1,inplace = True)

X.drop(["Dep_Time"], axis = 1,inplace = True)

X.replace({"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}, inplace = True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

from lightgbm.sklearn import LGBMRegressor
reg = LGBMRegressor()
reg.fit(X_train,y_train)

y_pred=reg.predict(X_test)


from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

filename = 'flightfare.pkl'
pickle.dump(reg, open(filename, 'wb'))
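To reuse the pickled regressor later, a minimal loading sketch (the prediction input must have the same columns as X_train):

import pickle

with open('flightfare.pkl', 'rb') as f:
    loaded_reg = pickle.load(f)
print(loaded_reg.predict(X_test[:5]))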

Code Example #30
def modelIntegrated(hold_out_train, hold_out_test, test):

    traindata = hold_out_train.copy().drop(['Store', 'Customers', 'Date', 'Open', 'PromoInterval', 'monthstr'], axis=1)
    train_x = traindata.drop(['Sales'], axis=1)
    train_y = np.log1p(traindata['Sales'])

    testdata = hold_out_test.copy()
    testdata = testdata.drop(['Store', 'Customers', 'Date', 'Open', 'PromoInterval', 'monthstr'], axis=1)
    ho_test_x = testdata.drop(['Sales'], axis=1)
    ho_test_y=testdata['Sales']

    finaltest_x = test.copy().drop(['Id', 'Store', 'Date', 'Open', 'PromoInterval', 'monthstr'], axis=1)

    predictions=[]
    RMSPES=[]
    start = time.time()
    k=5
    for i in range(0,k):
        lgbmModel = LGBM(n_estimators=8000, subsample=0.8,random_state=i)
        lgbmModel.fit(train_x,train_y)
        sales_predict = lgbmModel.predict(ho_test_x)
        final_predict = lgbmModel.predict(finaltest_x)
        test['sales_predict'] = np.expm1(final_predict)

        smallest_rmspe = RMSPE(testdata['Sales'], np.expm1(sales_predict))
        print(smallest_rmspe)
        hold_out_test['sales_predict'] = np.expm1(sales_predict)
        res = hold_out_test[['Store', 'Date', 'Sales', 'sales_predict']]
        # res2 = hold_out_test[['Store', 'Date', 'Sales', 'sales_predict']]

        # showFigure(res)

        res.loc[:, 'errorabs'] = abs((res['sales_predict'] - res['Sales']) / res['Sales'])
        res.loc[:, 'error'] = ((res['sales_predict'] - res['Sales']) / res['Sales'])
        res.sort_values(['errorabs'], ascending=False, inplace=True)
        # print(res[res['error']>=0].count())
        # print(res[res['error'] <= 0].count())
        # b_w = 0.900
        # for i in range(1, 101):
        #     predict = sales_predict * (0.900 + i / 1000)
        #     rmspe = RMSPE(testdata['Sales'], np.expm1(predict))
        #     if rmspe < smallest_rmspe:
        #         b_w = 0.900 + i / 1000
        #         smallest_rmspe = rmspe
        #         res2.loc[:, 'sales_predict'] = np.expm1(predict)
        # print(smallest_rmspe)
        # print(b_w)
        # showFigure(res2)

        stores = range(1, 1116)
        hold_out_test['w'] = 1
        for store in stores:
            s1 = pd.DataFrame(hold_out_test[hold_out_test['Store'] == store],columns=['Store', 'Date', 'Sales', 'sales_predict'])
            s = []
            for i in range(1, 201):
                error = RMSPE(s1.Sales, s1.sales_predict * (0.800 + i / 1000))
                s.append(error)
            score = pd.Series(s, index=[(0.800 + i / 1000) for i in range(1, 201)])
            BS = score[score.values == score.values.min()]
            a = np.array(BS.index.values)
            hold_out_test.loc[hold_out_test['Store'] == store, 'w'] = a
            test.loc[test['Store'] == store, 'w'] = a

        res3 = hold_out_test[['Store', 'Date', 'Sales', 'sales_predict', 'w']]
        res3['sales_predict'] = hold_out_test['sales_predict'] * hold_out_test['w']
        RMSPES.append(RMSPE(res3['Sales'], res3['sales_predict']))

        finalres = test[['Id']]
        finalres['Sales'] = test['sales_predict'] * test['w']
        predictions.append(finalres['Sales'])

    print(RMSPES)
    finalres = test[['Id']]
    finalres['Sales']=0
    for i in range(0,k):
        finalres['Sales']+=predictions[i]
    finalres['Sales']=finalres['Sales']/k
    end = time.time()
    print((end-start))
    finalres.to_csv('../submissionResult/submissionResult_lightGBM_mean.csv', index=False)