Code example #1
def tune_params():
    # Requires: from lightgbm import LGBMClassifier
    #           from sklearn.metrics import f1_score
    # X_t, y_t (train) and X_v, y_v (validation) are module-level data.
    f1_t_total, f1_v_total = [], []
    for max_depth in range(6, 15):
        for subsample in [0.6, 0.7, 0.8]:
            for colsample_bytree in [0.6, 0.7, 0.8]:
                for reg_alpha in [0.1, 1, 10]:
                    lgb_base = LGBMClassifier(n_estimators=150, objective='binary',
                                              random_state=1234, n_jobs=3,
                                              colsample_bytree=colsample_bytree,
                                              reg_alpha=reg_alpha,
                                              max_depth=max_depth,
                                              subsample=subsample)
                    _params = {'max_depth': max_depth,
                               'subsample': subsample,
                               'colsample_bytree': colsample_bytree,
                               'reg_alpha': reg_alpha}
                    lgb_base.fit(X_t, y_t)
                    y_t_pre = lgb_base.predict(X_t)
                    y_v_pre = lgb_base.predict(X_v)
                    f1_t_each = f1_score(y_t, y_t_pre, average='micro')
                    f1_v_each = f1_score(y_v, y_v_pre, average='micro')
                    f1_t_total.append(f1_t_each)
                    f1_v_total.append(f1_v_each)
                    print(_params)
                    # Append the tried parameters and the scores to log files.
                    with open('D:\\workspace python\\contest\\accu_save\\lgbbase_saveparams_f1_0418.txt',
                              'a', encoding='utf-8') as myfile1:
                        print(_params['max_depth'], _params['subsample'],
                              _params['colsample_bytree'], _params['reg_alpha'],
                              file=myfile1)
                    print(f1_t_each, f1_v_each)
                    with open('D:\\workspace python\\contest\\accu_save\\lgbbase_tunparms_f1_0418.txt',
                              'a', encoding='utf-8') as myfile:
                        print(f1_t_each, ',', f1_v_each, file=myfile)
    return f1_t_total, f1_v_total
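tune_params reads the training and validation sets from module-level names. A minimal sketch of how they might be prepared, assuming a DataFrame df with a binary label column (both hypothetical, not part of the original project):

from sklearn.model_selection import train_test_split

X = df.drop(columns=['label'])
y = df['label']
X_t, X_v, y_t, y_v = train_test_split(X, y, test_size=0.2,
                                      stratify=y, random_state=1234)
f1_train, f1_valid = tune_params()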
Code example #2
    def baseline_xiong(self, profile: Profile, shared: Storage, logger: Logger,
                       converted):
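        # 'converted' appears to hold [labels, accelerometer, magnetometer,
        # gyroscope] arrays of shape (N, 3, T). This is inferred from the
        # a_/m_/g_ prefixes below, not stated in the original.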
        a_std = converted[1].std(-1)
        g_mean = converted[3].mean(-1)
        g_std = converted[3].std(-1)
        m_over_0_count = (converted[2] >= 0.0).sum(-1).astype(np.float32)
        a_mean = converted[1].mean(-1)
        a_l2_std = np.sqrt(converted[1][:, 0, :]**2 +
                           converted[1][:, 1, :]**2 +
                           converted[1][:, 2, :]**2).std(-1)[:, np.newaxis]
        m_l2_std = np.sqrt(converted[2][:, 0, :]**2 +
                           converted[2][:, 1, :]**2 +
                           converted[2][:, 2, :]**2).std(-1)[:, np.newaxis]

        features = np.concatenate(
            (a_std, g_mean, g_std, m_over_0_count, a_mean, a_l2_std, m_l2_std),
            axis=1)
        labels = converted[0]  # onehot.fit_transform(converted[0].reshape(-1, 1)).toarray()

        length = labels.shape[0]

        classifier = LGBMClassifier()
        classifier.fit(features[:int(length * 0.7)],
                       labels[:int(length * 0.7)])

        validate_y = labels[int(length * 0.7):]
        predict_y = classifier.predict(features[int(length * 0.7):])
        logger.info('Xiong')
        logger.info(f'Accuracy: {accuracy_score(validate_y, predict_y)}')
        logger.info(
            f'Precision: {precision_score(validate_y, predict_y, average=None)}'
        )
        logger.info(
            f'Recall: {recall_score(validate_y, predict_y, average=None)}')
Code example #3
def lgb_initialise(param=None):
    # Avoid a mutable default argument; start from LightGBM's own defaults.
    config = LGBMClassifier().get_params()
    config['boosting_type'] = 'gbdt'
    config['class_weight'] = None
    config['colsample_bytree'] = 0.7
    config['importance_type'] = 'split'
    config['is_unbalance'] = True
    config['learning_rate'] = 0.05
    config['max_depth'] = 4
    config['min_child_samples'] = 20
    config['min_child_weight'] = 0.001
    config['min_split_gain'] = 0.0
    config['n_estimators'] = 600
    config['n_jobs'] = -1
    config['nthread'] = 3
    config['num_leaves'] = 8
    config['objective'] = 'binary'
    config['random_state'] = None
    config['reg_alpha'] = 0
    config['reg_lambda'] = 0
    config['seed'] = 777
    config['silent'] = False
    config['subsample'] = 0.8
    config['subsample_for_bin'] = 200000
    config['subsample_freq'] = 0

    config.update(param or {})

    return LGBMClassifier(**config)
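Callers pass only the keys they want to override; everything else keeps the values set above. For example (override values illustrative):

clf = lgb_initialise({'learning_rate': 0.1, 'max_depth': 6})
clf.fit(X_train, y_train)  # X_train / y_train assumed to be defined elsewhere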
Code example #4
def runTrain(train_df, workspace, debug, model_config):
    kfold_setting = model_config['kfold_setting']
    is_single = kfold_setting['num_folds'] < 1

    model_param = model_config['model_param']
    if debug:
        model_param['n_estimators'] = 100
        model_param['learning_rate'] = 0.3

    clf = LGBMClassifier(**model_param)

    feats = [
        f for f in train_df.columns if f not in
        ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    ]
    X, y = train_df[feats], train_df['TARGET']

    if is_single:
        print("INFO: num_folds is less than 1, SINGLE MODEL would be trained.")
        model = clf
        with timer('Train Single LGB Model'):
            clf.fit(X, y, eval_set=[(X, y)], eval_metric='auc', verbose=100)
        workspace.save(model, 'single_model.pkl')
        # No analysis report is prepared for the single model.
    else:
        cv = gen_cv(**kfold_setting)
        model = KFoldClassifier(clf, cv)
        with timer('Train KFold LGB Model'):
            model.fit(X, y)
        workspace.save(model, 'kfold_model.pkl')
        workspace.gen_report('kfold')
    return model
Code example #5
File: models.py Project: lpkirwin/cicada
def fit_lgb_model(model_spec, early_stopping_rounds=10):
    ms = model_spec
    print("loading data using", ms["filename"])
    df = ms["dataset"]()
    print("converting all feature columns to float32")
    for col in ms["features"].values():
        df[col] = df[col].astype("float32")
    print(df.describe().T)
    n_targets_with_null = df["target"].isna().sum()
    print("dropping", n_targets_with_null, "rows with null in target")
    df = df[df["target"].notna()]
    monotone_constraints = [
        ms["monotone_constraints"].get(col, 0) for col in ms["features"].keys()
    ]
    print("monotone constraints:", monotone_constraints)
    model = LGBMClassifier(
        n_estimators=5_000,
        num_leaves=11,
        learning_rate=0.01,
        monotone_constraints=monotone_constraints,
        monotone_constraints_method="advanced",
    )
    X = df[ms["features"].values()]
    y = df["target"]
    n_games = df["game_id"].max() + 1
    game_pct = (df["game_id"] + 1) / n_games
    w = config.GAME_WEIGHTING_FACTOR + (1 - config.GAME_WEIGHTING_FACTOR) * game_pct
    eval_size = ms["validation_size"]
    X_tr, X_te = X.iloc[:-eval_size], X.iloc[-eval_size:]
    y_tr, y_te = y.iloc[:-eval_size], y.iloc[-eval_size:]
    w_tr, w_te = w.iloc[:-eval_size], w.iloc[-eval_size:]
    eval_set = [(X_te.values, y_te.values)]
    model.fit(
        X_tr,
        y_tr,
        sample_weight=w_tr,
        eval_set=eval_set,
        eval_sample_weight=[w_te],
        early_stopping_rounds=early_stopping_rounds,
        verbose=early_stopping_rounds,
    )
    print("refitting model with full dataset")
    model.set_params(n_estimators=model.best_iteration_)
    model.fit(X, y, sample_weight=w)
    pred = pd.Series(model.predict_proba(X)[:, 1])
    print("distribution of predictions:")
    print(pred.describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]))
    feature_importances = [
        (feature, importance)
        for feature, importance in zip(
            model.booster_.feature_name(),
            model.booster_.feature_importance(importance_type="gain"),
        )
    ]
    print("feature importance (by gain):")
    for feature, importance in sorted(feature_importances, key=lambda row: -row[1]):
        print(f"    {feature}: {importance}")
    filepath = os.path.join(FILEPATH, "models", ms["filename"])
    print("saving to", filepath)
    model.booster_.save_model(filepath)
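Because the model is persisted with Booster.save_model, it can be reloaded later without the scikit-learn wrapper. A minimal sketch; the path and X_new are hypothetical stand-ins:

import lightgbm as lgb

booster = lgb.Booster(model_file='path/to/model.txt')
probs = booster.predict(X_new)  # for a binary objective this returns P(y=1)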
Code example #6
def lgb(x_train, y_train, x_val, y_val):
    # Use a name other than 'lgb' so the model does not shadow the function.
    model = LGBMClassifier(n_estimators=1000,
                           max_depth=10,
                           subsample=0.7,
                           colsample_bytree=0.7,
                           learning_rate=0.01,
                           random_state=2020)
    model.fit(x_train, y_train)
    result = model.predict(x_val)
    # f1_score expects (y_true, y_pred); the original call had them swapped.
    score = f1_score(y_val, result)
    return score
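A hedged usage sketch, assuming a feature matrix X and binary labels y already exist:

from sklearn.model_selection import train_test_split

x_tr, x_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=2020)
print('validation F1:', lgb(x_tr, y_tr, x_va, y_va))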
Code example #7
    def setUp(self):
        X_train, y_train, X_test, y_test = titanic_survive()
        train_names, test_names = titanic_names()

        model = LGBMClassifier()
        model.fit(X_train, y_train)

        self.explainer = ClassifierExplainer(
            model, X_test, y_test,
            cats=[{'Gender': ['Sex_female', 'Sex_male', 'Sex_nan']},
                  'Deck', 'Embarked'],
            labels=['Not survived', 'Survived'],
            idxs=test_names)
Code example #8
    def setUp(self):
        X_train, y_train, X_test, y_test = titanic_survive()
        train_names, test_names = titanic_names()

        model = LGBMClassifier()
        model.fit(X_train, y_train)

        self.explainer = ClassifierExplainer(
                            model, X_test, y_test, roc_auc_score, 
                            shap='tree',
                            cats=['Sex', 'Cabin', 'Embarked'],
                            labels=['Not survived', 'Survived'],
                            idxs=test_names)
Code example #9
    def fit(self, X, y):
        sss = StratifiedShuffleSplit(n_splits=self.hsic_splits,
                                     random_state=42)
        idxs = []
        hsics = []
        for train_index, test_index in list(sss.split(X, y)):
            hsic_lasso2 = HSICLasso()
            hsic_lasso2.input(X[train_index], y[train_index])
            hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)
            hsics.append(hsic_lasso2)

            # not just best features - get their neighbors (similar features) too
            all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
            for i in range(len(all_ft_idx)):
                idx = np.array(hsic_lasso2.get_index_neighbors(
                    feat_index=i, num_neighbors=10), dtype=int)
                # Neighbor scores are continuous, so keep them as floats;
                # the original cast to int, which defeats the threshold test.
                score = np.array(hsic_lasso2.get_index_neighbors_score(
                    feat_index=i, num_neighbors=10), dtype=float)
                idx = idx[np.where(score > self.neighbor_threshold)[0]]
                all_ft_idx = np.concatenate((all_ft_idx, idx))
            all_ft_idx = np.unique(all_ft_idx)

            idxs.append(all_ft_idx)
            if len(idxs) == 1:
                self.hsic_idx_ = idxs[0]
            else:
                self.hsic_idx_ = np.intersect1d(idxs[-1], self.hsic_idx_)
        print("HSIC done.", len(self.hsic_idx_))

        print("Upsampling with ADASYN... (features: " +
              str(len(self.hsic_idx_)) + ")")
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors,
                    n_jobs=-1)
        sX, sy = X[:, self.hsic_idx_], y
        if self.adasyn_neighbors > 0:
            try:
                sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
                # Resample once per remaining minority class. The original had
                # len(np.unique(y) - 1), a misplaced parenthesis.
                for i in range(len(np.unique(y)) - 1):
                    sX, sy = sm.fit_resample(sX, sy)
            except Exception:
                # ADASYN can fail (e.g. too few minority samples); fall back
                # to the unresampled data prepared above.
                pass
            print("ADASYN done. Starting clf")

        self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
        print("done")
        return self
Code example #10
def train_LGBM(src_folder, cols, model_save_folder, lr):
    '''
    Load the train and validation sets from src_folder, train an LGBM
    classifier, and save the fitted model to model_save_folder.
    :param src_folder: source folder holding the prepared datasets
    :param cols: feature columns to use
    :param model_save_folder: destination folder for the saved model
    :param lr: learning rate
    :return:
    '''
    os.makedirs(model_save_folder, exist_ok=True)

    def auc_prc(y_true, y_pred):
        return 'AUC_PRC', average_precision_score(y_true, y_pred), True

    train_months_li = [[201803, 201804]]
    val_months_li = [[201806]]
    for train_months, val_months in zip(train_months_li, val_months_li):
        print('************** train Months: {}, val Months: {}**************'.
              format(', '.join([str(i) for i in train_months]),
                     ', '.join([str(i) for i in val_months])))
        trainX, trainy, valX, valy = prepare_Train_Val_set(
            src_folder, train_months, val_months, cols)
        print('trainX shape: {}, valX shape: {}'.format(
            trainX.shape, valX.shape))
        print('trainy value_counts: {}'.format(trainy.value_counts()))
        print('valy value_counts: {}'.format(valy.value_counts()))
        clf = LGBMClassifier(num_leaves=127,
                             learning_rate=lr,
                             n_estimators=10000,
                             objective='binary',
                             is_unbalance=True,
                             subsample=0.8,
                             colsample_bytree=0.8,
                             device_type='gpu',
                             gpu_platform_id=1,
                             gpu_device_id=0)
        t0 = time.time()
        clf.fit(trainX,
                trainy,
                eval_set=[(valX, valy)],
                eval_metric=auc_prc,
                early_stopping_rounds=50,
                verbose=100)
        print('fit time: {:.4f}'.format(time.time() - t0))
        save_name = 'LGBM_' + 'Val_M' + ', '.join(
            [str(i)
             for i in val_months]) + datetime.now().strftime('%Y%m%d_%H%M%S')
        joblib.dump(clf, os.path.join(model_save_folder, save_name))
        print('model is saved to {}'.format(save_name))
        gc.collect()
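Since the estimator is saved with joblib.dump, reloading it for scoring is a one-liner. The names below refer to the function's locals, so this sketch assumes them:

import os
import joblib

clf = joblib.load(os.path.join(model_save_folder, save_name))
scores = clf.predict_proba(valX)[:, 1]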
Code example #11
File: run_lgbm_initial.py Project: tkazusa/titanic
    def find_best_cv(self):
        Util.split_cv(self.X, self.y, self.n_folds_list, ORG_DATA_DIR)

        acc_score_means = []
        acc_score_vars = []
        for num_of_fold in self.n_folds_list:
            print("============")
            logger.info("==evaluating %s fold==" % num_of_fold)
            CV_DIR = os.path.join(ORG_DATA_DIR, "n_folds_%s/" % num_of_fold)
            acc_score = []
            for i in range(num_of_fold):
                logger.info("loading %s th cv data in %s folds" % (i, num_of_fold))
                X_train = pd.read_csv(os.path.join(CV_DIR, "X_train_%s.csv") % i, header=None, sep="\t").values
                X_val = pd.read_csv(os.path.join(CV_DIR, "X_val_%s.csv") % i, header=None, sep="\t").values
                y_train = pd.read_csv(os.path.join(CV_DIR, "y_train_%s.csv") % i, header=None, sep="\t").values
                y_c, y_r = y_train.shape
                y_train = y_train.reshape(y_c, )
                y_val = pd.read_csv(os.path.join(CV_DIR, "y_val_%s.csv") % i, header=None, sep="\t").values
                y_c, y_r = y_val.shape
                y_val = y_val.reshape(y_c, )
                logger.info("end loading %s th cv data in %s folds" % (i, num_of_fold))
                logger.info("X_train.shape: %s %s" % X_train.shape)
                logger.info("X_val.shape: %s %s" % X_val.shape)
                logger.info("y_train.shape: %s" % y_train.shape)
                logger.info("y_val.shape: %s" % y_val.shape)

                clf = LGBMClassifier(objective="binary",
                                     n_estimators=20)

                weight_train = self._calc_w(y_train)

                clf.fit(X_train, y_train,
                        sample_weight=weight_train,
                        eval_set=[(X_val, y_val)],
                        verbose=True)
                y_pred = clf.predict(X_val)
                logger.info("acc socore: %s folds, %s iteration" % (num_of_fold, i))
                acc_score.append(accuracy_score(y_val, y_pred))
            logger.info("mean acc score of %s folds is %s" % (num_of_fold, np.mean(acc_score)))
            acc_score_means.append(np.mean(acc_score))
            logger.info("variance of acc score of %s folds is %s" % (num_of_fold, np.var(acc_score)))
            acc_score_vars.append(np.var(acc_score))
        for i in range(len(self.n_folds_list)):
            logger.info(
                "===%s_folds=== mean acc:%s, var acc: %s " % (self.n_folds_list[i],
                                                              acc_score_means[i],
                                                              acc_score_vars[i])
            )
Code example #12
def fit():
    train, validation, _ = train_validation_holdout_split(read('./data/train_set.csv'))

    steps = [
        preprocess,
        russia_only,
        rouble_only,
        with_transaction_location,
        with_job,
        (partial(fit_categories, ['mcc', 'city', 'terminal_id']), transform_categories),
        partial(calc_is_close, ['transaction_lat', 'transaction_lon'], ['work_add_lat', 'work_add_lon'])
    ]

    pipeline, train = fit_pipeline(steps, train)
    validation = pipeline(validation)

    feature_columns = ['mcc', 'city', 'amount', 'terminal_id']
    print(f'Train size: {len(train)}, Validation size: {len(validation)}')
    print(f'Features: {feature_columns}')
    model = LGBMClassifier()
    model.fit(train[feature_columns], train['is_close'])

    predictions = model.predict_proba(validation[feature_columns])
    accuracy_value = accuracy_score(validation['is_close'], np.argmax(predictions, axis=1))
    logloss_value = log_loss(validation['is_close'], predictions)
    print(f'Accuracy: {accuracy_value:.5f}, Logloss: {logloss_value:.5f}')
    print(classification_report(validation['is_close'], np.argmax(predictions, axis=1)))

    validation['probs'] = predictions[:, 1]
    top1_accuracy = validation.groupby('customer_id').apply(lambda group: group.sort_values('probs').tail(1).is_close.max()).mean()
    top5_accuracy = validation.groupby('customer_id').apply(lambda group: group.sort_values('probs').tail(5).is_close.max()).mean()
    top10_accuracy = validation.groupby('customer_id').apply(lambda group: group.sort_values('probs').tail(10).is_close.max()).mean()
    print(f'Top1: {top1_accuracy:.5f}')
    print(f'Top5: {top5_accuracy:.5f}')
    print(f'Top10: {top10_accuracy:.5f}')

    # contributions = model._Booster.predict(validation[feature_columns], pred_contrib=True)
    # contributions_df = pd.DataFrame(
    #     index=validation.index,
    #     data=contributions,
    #     columns=list(map(lambda col: col + '_contr', feature_columns)) + ['expected_value']
    # )

    # debug_df = pd.concat([validation, contributions_df], axis=1)
    # debug_df.index.name = 'id'
    # debug_df.to_csv('./data/debug.csv')

    import pdb; pdb.set_trace()  # leftover debugging hook: drops into pdb after reporting
Code example #13
def lgb_model(X_train, y_train, X_val, y_val, save_file, folds, param_comb,
              n_jobs, scoring):
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=123)
    lgb = LGBMClassifier(n_jobs=n_jobs, random_state=123)
    params = {
        "num_leaves": [3, 5, 10],
        "max_depth": [-1],
        "learning_rate": [0.2575640770995011],
        "n_estimators": [5000, 10000, 50000],
        "objective": ["binary"],
        "class_weight": ["balanced", None],
        "subsample": [0.7],
        "colsample_bytree": [0.6],
        "reg_lambda": [1.6599030323415402],
        "reg_alpha": [0.7044747533204038],
        "min_child_weight": [7]
    }
    model_lgb = RandomizedSearchCV(estimator=lgb,
                                   param_distributions=params,
                                   n_iter=param_comb,
                                   scoring=scoring,
                                   n_jobs=n_jobs,
                                   cv=skf.split(X_train, y_train),
                                   verbose=0,
                                   random_state=123)
    model_lgb.fit(X_train,
                  y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='auc',
                  early_stopping_rounds=50)
    joblib.dump(model_lgb.best_estimator_,
                os.path.join(os.path.dirname(__file__), f'best_{save_file}'),
                compress=1)
    return model_lgb
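After fitting, the returned search object exposes the usual scikit-learn attributes. A usage sketch; the data arguments and the 'lgb.pkl' filename are assumptions:

search = lgb_model(X_train, y_train, X_val, y_val, 'lgb.pkl',
                   folds=5, param_comb=10, n_jobs=-1, scoring='roc_auc')
print(search.best_params_, search.best_score_)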
Code example #14
    def __init__(self, data, continuous_feature_names, label, score, t='R', showFig=False):
        self.data = data
        self.continuous_feature_names = continuous_feature_names
        self.label = label
        self.score = score
        self.K = len(continuous_feature_names)
        self.T = t
        self.showFig = showFig
        # candidate features
        self.train_X = data[continuous_feature_names]
        self.train_y = data[label]
        self.numNull = self.train_X.isnull().sum().sum()
        self.numInf = np.isinf(self.train_X.values).sum()

        # candidate models
        self.linearRegressionModel = [LinearRegression(), Ridge(), Lasso(), LinearSVR()]
        self.linearClassModel = [LogisticRegression(), LinearSVC(), RidgeClassifier()]

        self.treeRegressionModel = [ExtraTreesRegressor(),
                                    DecisionTreeRegressor(),
                                    RandomForestRegressor(),  # RF is comparatively slow
                                    GradientBoostingRegressor(),
                                    XGBRegressor(n_estimators=100, objective='reg:squarederror'),
                                    LGBMRegressor(n_estimators=100)]
        self.treeClassModel = [ExtraTreesClassifier(),
                               DecisionTreeClassifier(),
                               RandomForestClassifier(),
                               GradientBoostingClassifier(),
                               XGBClassifier(n_estimators=100, objective="binary:logistic"),
                               LGBMClassifier(n_estimators=100)]

        self.nonlinearRegressionModel = self.treeRegressionModel + [SVR(), MLPRegressor(solver='lbfgs', max_iter=100),]
        self.nonlinearClassModel = self.treeClassModel + [SVC(), MLPClassifier(),]
Code example #15
def lgb_model(X_train, y_train, X_val, y_val, save_file, folds, param_comb,
              n_jobs, scoring):
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=123)
    lgb = LGBMClassifier(n_jobs=n_jobs, random_state=123)
    params = {
        "num_leaves": ss.randint(2, 50),
        "max_depth": ss.randint(3, 10),
        "learning_rate": ss.uniform(0.001, 0.5),
        "n_estimators": [1000],
        "objective": ["binary"],
        "class_weight": ["balanced", None],
        "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        "colsample_bytree": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        "reg_lambda": ss.uniform(0.05, 5),
        "reg_alpha": ss.uniform(0.05, 5),
        "min_child_weight": ss.randint(1, 15)
    }
    model_lgb = RandomizedSearchCV(estimator=lgb,
                                   param_distributions=params,
                                   n_iter=param_comb,
                                   scoring=scoring,
                                   n_jobs=n_jobs,
                                   cv=skf.split(X_train, y_train),
                                   verbose=0,
                                   random_state=123)
    model_lgb.fit(X_train,
                  y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='auc',
                  early_stopping_rounds=50)
    joblib.dump(model_lgb.best_estimator_,
                os.path.join(os.path.dirname(__file__), f'best_{save_file}'),
                compress=1)
    return model_lgb
Code example #16
    def autoengineer_ratios(self, ae_params=None, n_iter=1000):
        if ae_params is None:
            ae_params = {
                'boosting_type': 'gbdt',
                'max_depth': -1,
                'objective': 'binary',
                'learning_rate': 0.0212,
                'reg_alpha': 0.8,
                'reg_lambda': 0.4,
                'subsample': 1,
                'feature_fraction': 0.3,
                'device_type': 'gpu',
                'metric': 'auc',
                'random_state': 123,
                'n_estimators': 300,
                'num_leaves': 40,
                'max_bin': 255,
                'min_data_in_leaf': 2400,
                'min_data_in_bin': 5
            }

        def _fn_column_selector(X, k):
            '''Select the first k columns of X.'''
            return X[:, :k]

        ColumnSelector = FunctionTransformer(_fn_column_selector,
                                             validate=False)
        importance_weights = (self.ae_feature_importances /
                              self.ae_feature_importances.sum())
        # scikit-learn >= 0.24 requires shuffle=True when random_state is set
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
        model = Pipeline([('selector', ColumnSelector),
                          ('clf', LGBMClassifier(**ae_params))])

        for i in range(n_iter):
            random_vars = list(
                choice(self.X_train.columns,
                       size=2,
                       p=importance_weights,
                       replace=False))

            X_tmp = self.X_train.loc[:, random_vars]
            X_tmp['_DIV_'.join(
                random_vars)] = X_tmp.iloc[:, 0] / (X_tmp.iloc[:, 1] + 1)

            gs = GridSearchCV(
                estimator=model,
                param_grid={'selector__kw_args': [{
                    'k': 2
                }, {
                    'k': 3
                }]},
                scoring='roc_auc',
                cv=kfold)
            gs.fit(X_tmp.values, self.y_train)
            perf_1, perf_2 = gs.cv_results_.get('mean_test_score')
            if perf_2 > perf_1:
                self.ae_discovery_ratios.append(
                    (random_vars[0], random_vars[1], perf_2 / perf_1))
Code example #17
def optimize_model(character_model):
    """
    Optimizes a classification model with the given param_grid
    Parameters:
    -----------
    model: CharacterPredictiveModel
        Fitted character model for optimizing

    Returns:
    -----------
    CharacterPredictiveModel
        The character model after optimization
    """
    if verbose: print("Performing hyperparameter optimization on best model")

    #Performing hyperparameter optimization on LightGBM
    model = {"best_model": LGBMClassifier(random_state = 123)}
    param_grid = {'best_model__n_estimators'  : [5, 100, 500, 700, 1000, 1500, 4000],
                  'best_model__learning_rate' : [0.01, 0.1, 1],
                  'best_model__max_depth'     : [1, 3, 5, 6, 10],
                  'best_model__subsample'     : [0.15, 0.25, 0.5, 0.75, 1],
                  'best_model__num_leaves'    : [31, 64, 128]
                 }

    model_df = character_model.optimize_model(model, param_grid)

    save_img_large(model_df, output_dir, "optimized_model", filename_prefix)

    if verbose: print("Model optimization complete!")
    return character_model
Code example #18
def compare_decision_tree_forests(character_model):
    """
    Compares a sampling of random forest decision tree type models
    Parameters:
    -----------
    model: CharacterPredictiveModel
        Fitted character model for testing

    Returns:
    -----------
    None
    """
    if verbose: print("Comparing random forest type model(s)")

    # List of decision tree types to compare
    models = {
        "Random Forest Classifier": RandomForestClassifier(random_state=123),
        "XGBClassifier": XGBClassifier(eval_metric="mlogloss",
                                       random_state=123),
        "LGBMClassifier": LGBMClassifier(num_leaves=31, random_state=123),
        "CatBoostClassifier": CatBoostClassifier(random_state=123, verbose=0)
    }
    rf_models_df = character_model.model_compare(models)

    save_img(rf_models_df, output_dir, "forest_model_comparison",
             filename_prefix)
    if verbose: print("Trained Forest model(s)")
    return
Code example #19
    def LGB_train(self,X_train, X_valid, labels_train, labels_valid, X_test, lgb_param_all):
        lgb_param_contrl = {'early_stopping_rounds': 100, 'categorical_feature': 'auto'}
        lgb_param = lgb_param_all.copy()
        objective_type = lgb_param['objective_type']
        lgb_param.pop('objective_type')

        for k in ['early_stopping_rounds', 'categorical_feature']:
            if k in lgb_param:
                lgb_param_contrl[k] = lgb_param[k]
                lgb_param.pop(k)

        if not self.config.retrain:
            # Incrementally train from an existing model if one can be loaded.
            model_load = self.load_model()
            if not model_load:
                print('Model {} not found; training from scratch'.format(self.modelName))
                if objective_type == 'regressor':
                    clf = LGBMRegressor(**lgb_param)
                else:
                    clf = LGBMClassifier(**lgb_param)

                clf.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                        early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                        categorical_feature=lgb_param_contrl['categorical_feature'])
            else:
                clf = model_load.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                                     early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                                     categorical_feature=lgb_param_contrl['categorical_feature'])
        else:
            if objective_type == 'regressor':
                clf = LGBMRegressor(**lgb_param)
            else:
                clf = LGBMClassifier(**lgb_param)
            clf.fit(X_train, labels_train, eval_set=[(X_valid, labels_valid)], eval_metric='rmse',
                    early_stopping_rounds=lgb_param_contrl['early_stopping_rounds'],
                    categorical_feature=lgb_param_contrl['categorical_feature'])


        val_lgb_pre = clf.predict(X_valid.values, num_iteration=clf.best_iteration_)
        test_lgb_pre = clf.predict(X_test.values, num_iteration=clf.best_iteration_)

        metrics_name = self.config.metrics_name
        myMetrics = defindMetrics.MyMetrics(metrics_name)
        score_lgb = myMetrics.metricsFunc(val_lgb_pre, labels_valid)

        self.save_model(clf, self.config.saveModel)
        return val_lgb_pre, test_lgb_pre, score_lgb
Code example #20
File: Model.py Project: xuming1986/ppd_mojing_4
def feature_selcetion(df_final, train_y, num=train_num):
    trn_x, trn_y = df_final[:num], train_y['label'][:num]
    x_train, x_val, y_train, y_val = train_test_split(trn_x, trn_y, train_size=0.8, random_state=2019, stratify=trn_y)
    clf = LGBMClassifier(learning_rate=0.05, n_estimators=10000, subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
                         random_state=2019)
    t = time.time()
    clf.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_val, y_val)],early_stopping_rounds=20, verbose=5)
    print('runtime: {}\n'.format(time.time() - t))
    feature_impo = pd.DataFrame(sorted(zip(clf.feature_importances_, range(df_final.shape[1]))),
                                columns=['Value', 'Feature'])
    feature_impo.to_csv(path + 'feature_impo.csv', index=False)
    # Keep the 1000 most important features (feature_impo is sorted ascending).
    fea_num_0 = df_final.shape[1]
    fea_num_1 = fea_num_0 - 1000
    head_1000_fea = feature_impo[fea_num_1:fea_num_0]['Feature'].values.tolist()
    df_final = df_final.iloc[:, head_1000_fea]  # .iloc is needed for positional indexing
    print('Feature selection complete')
    return df_final
Code example #21
 def __init__(self, estimator=LGBMClassifier(), cv=5, random_state=None, n_repeats=None):
     self.estimator = estimator
     if n_repeats:
         # RepeatedStratifiedKFold takes n_repeats, not a shuffle flag;
         # the original passed True positionally into the n_repeats slot.
         self._kf = RepeatedStratifiedKFold(n_splits=cv, n_repeats=n_repeats,
                                            random_state=random_state)
         self._num_preds = cv * n_repeats
     else:
         self._kf = StratifiedKFold(n_splits=cv, shuffle=True,
                                    random_state=random_state)
         self._num_preds = cv
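For reference, RepeatedStratifiedKFold with n_splits=cv and n_repeats=r yields cv * r train/test splits, which is what _num_preds records. A quick check:

from sklearn.model_selection import RepeatedStratifiedKFold
import numpy as np

X = np.zeros((20, 2))
y = np.array([0, 1] * 10)
rkf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=0)
print(sum(1 for _ in rkf.split(X, y)))  # 15 splits = n_splits * n_repeats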
Code example #22
def main():
    transaction = pd.read_csv('transaction_new.csv')
    dis = pd.read_csv('submit_disv1.csv')
    transaction_new = pd.merge(transaction,
                               dis[['TransactionID', 'score']],
                               on='TransactionID')
    feature = [
        f for f in transaction_new.columns
        if f != 'TransactionID' and f != 'split' and f != 'isFraud'
    ]
    fmap = {}
    for f in feature:
        fmap[f] = f.replace(' ', '_')
    transaction_new = transaction_new.rename(columns=fmap)
    data = transaction_new[transaction_new['split'] == 1]
    valid = transaction_new[transaction_new['split'] == 2]

    train, test = train_test_split(data, test_size=0.3, random_state=42)
    train_x = train[list(fmap.values())]
    test_x = test[list(fmap.values())]
    train_y = train['isFraud'].astype('int')
    test_y = test['isFraud'].astype('int')

    clf = LGBMClassifier(
        boosting_type='gbdt',
        colsample_bytree=0.2,
        drop_rate=0.1,
        importance_type='split',
        learning_rate=0.04,
        max_bin=500,
        max_depth=4,
        min_child_samples=50,
        min_split_gain=0.1,
        n_estimators=500,
        n_jobs=-1,
        num_leaves=9,
        objective=None,
        random_state=24,
        reg_alpha=40,
        reg_lambda=10,
        sigmoid=0.4,
        silent=True,
        #class_weight={0:1,1:10},
        #subsample=0.3,
        subsample_for_bin=24000,
        is_unbalance=True,
        subsample_freq=1)
    clf.fit(train_x, train_y)
    train_y_pred = clf.predict_proba(train_x)[:, 1]
    train_ks = cal_ks_scipy(train_y_pred, train_y)
    y_pred = clf.predict_proba(test_x)[:, 1]
    test_ks = cal_ks_scipy(y_pred, test_y)
    print(train_ks, test_ks)
    tr_auc = metrics.roc_auc_score(train_y, train_y_pred)
    te_auc = metrics.roc_auc_score(test_y, y_pred)
    print(tr_auc, te_auc)

    valid['isFraud'] = clf.predict_proba(valid[clf._Booster.feature_name()])[:, 1]
    valid[['TransactionID', 'isFraud']].to_csv('submit6.csv', index=False)
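cal_ks_scipy is defined elsewhere in this project. Judging by the name and usage, it computes the Kolmogorov-Smirnov statistic between the predicted scores of the two classes; a plausible reconstruction (an assumption, not the project's actual code):

import numpy as np
from scipy.stats import ks_2samp

def cal_ks_scipy(y_pred, y_true):
    # KS statistic between positive- and negative-class score distributions.
    y_pred, y_true = np.asarray(y_pred), np.asarray(y_true)
    return ks_2samp(y_pred[y_true == 1], y_pred[y_true == 0]).statistic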
Code example #23
 def _set_algorithm(self, prms):
     model = LGBMClassifier(objective="binary",
                            n_estimators=1000,
                            learning_rate=0.3,
                            min_child_samples=40,
                            reg_alpha=0.5,
                            reg_lambda=0.5,
                            **prms)
     return model
Code example #24
File: Model.py Project: xuming1986/ppd_mojing_4
def single_model(df_final, train_y,weight=None,metric=None):
    train_values, test_values = df_final[:train_num], df_final[test_num:]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    clf = LGBMClassifier(learning_rate=0.05, n_estimators=10000, subsample=0.8, subsample_freq=1,
                         colsample_bytree=0.8, random_state=2019)
    test_pred_prob = np.zeros((test_values.shape[0], 33))
    for i, (trn_idx, val_idx) in enumerate(skf.split(train_values, train_y['label'])):
        print(i, 'fold...')
        t = time.time()
        trn_x, trn_y = train_values[trn_idx], train_y['label'][trn_idx]
        val_x, val_y = train_values[val_idx], train_y['label'][val_idx]
        train_amt, val_amt = train_y['due_amt'][trn_idx].values, train_y['due_amt'][val_idx].values
        clf.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y), (val_x, val_y)],sample_weight=weight,
                eval_metric=metric, early_stopping_rounds=100, verbose=5)
        test_pred_prob += clf.predict_proba(test_values, num_iteration=clf.best_iteration_) / skf.n_splits
        print('runtime: {}\n'.format(time.time() - t))
    print('Single-model fitting complete')
    return test_pred_prob
Code example #25
File: rf_wc_sklearn.py Project: ririw/kaggle-quora
 def make_cls(self):
     cls = LGBMClassifier(
         n_estimators=2048,
         num_leaves=1024,
         learning_rate=self.learning_rate.get(),
         min_child_samples=self.min_child_samples.get(),
         subsample=0.75,
     )
     return AutoExitingGBMLike(cls, additional_fit_args={'verbose': False})
Code example #26
File: hypothesis.py Project: gregorywalsh/kaggle
 def __init__(self, x, hyper_search_strat, hyper_search_kwargs, n_trees=512,
              preprocessor=None, transformer=None, additional_hyper_dists=None):
     super().__init__(
         estimator=LGBMClassifier(n_estimators=n_trees),
         hyper_search_strat=hyper_search_strat,
         hyper_search_kwargs=hyper_search_kwargs,
         transformer=transformer,
         additional_hyper_dists=additional_hyper_dists
     )
     self.x = x
Code example #27
    def train_stack(self, n_features, n_classes, estimators, tr_stack, y_train,
                    val_stack, y_val, test_stack, score_name):

        tr_pred = []
        val_pred = []
        test_pred = []

        for estimator_name in estimators:
            estimator, params = select_model(estimator_name, n_features,
                                             n_classes)

            # Train 2nd and 3rd layer with val_stack and evaluate with tr_stack
            train_kwargs = {
                'estimator': estimator,
                'params': params,
                'X_train': val_stack,
                'y_train': y_val,
                'X_val': tr_stack,
                'y_val': y_train,
                'n_iter': 100,
                'score_name': score_name,
                'report': False,
                'cv': 3,
                'random_state': 42,
            }

            # Random train with stacked data and get best_params
            params, _, _ = random_model(**train_kwargs)

            if estimator_name == 'xgb':
                clf = XGBClassifier(**params)
            elif estimator_name == 'lgb':
                clf = LGBMClassifier(**params)
            elif estimator_name == 'rfo':
                clf = RandomForestClassifier(**params)
            elif estimator_name == 'log':
                clf = LogisticRegression(**params)
            elif estimator_name == 'svc':
                clf = SVC(**params)
            elif estimator_name == 'knn':
                clf = KNeighborsClassifier(**params)
            elif estimator_name == 'ada':
                clf = AdaBoostClassifier(**params)
            elif estimator_name == 'ext':
                clf = ExtraTreesClassifier(**params)
            else:
                # Fail fast instead of hitting an undefined 'clf' below.
                raise ValueError('unknown estimator: %s' % estimator_name)

            clf.fit(val_stack, y_val)
            tr_prob = clf.predict_proba(tr_stack)
            val_prob = clf.predict_proba(val_stack)
            test_prob = clf.predict_proba(test_stack)

            tr_pred.append(tr_prob)
            val_pred.append(val_prob)
            test_pred.append(test_prob)
        return tr_pred, val_pred, test_pred
Code example #28
def get_ntree():
    f1_t_total, f1_v_total = [], []
    for ntree in range(10, 810, 10):
        lgb_base = LGBMClassifier(n_estimators=ntree, objective='binary',
                                  random_state=1234, n_jobs=2,
                                  colsample_bytree=0.8, reg_alpha=1,
                                  max_depth=15, subsample=0.8)
        print('current ntree = %s' % ntree)
        lgb_base.fit(X_t, y_t)
        y_t_pre = lgb_base.predict(X_t)
        y_v_pre = lgb_base.predict(X_v)
        f1_t_each = f1_score(y_t, y_t_pre, average='micro')
        f1_v_each = f1_score(y_v, y_v_pre, average='micro')
        f1_t_total.append(f1_t_each)
        f1_v_total.append(f1_v_each)
        with open('D:\\workspace python\\contest\\accu_save\\lgbbase_810_2.txt',
                  'a', encoding='utf-8') as myfile:
            print(f1_t_each, ',', f1_v_each, file=myfile)
    return f1_t_total, f1_v_total
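The returned lists can be plotted to spot where the validation F1 plateaus while the training F1 keeps climbing. A quick sketch, assuming matplotlib is available:

import matplotlib.pyplot as plt

f1_t, f1_v = get_ntree()
trees = list(range(10, 810, 10))
plt.plot(trees, f1_t, label='train F1')
plt.plot(trees, f1_v, label='validation F1')
plt.xlabel('n_estimators')
plt.ylabel('micro F1')
plt.legend()
plt.show()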
Code example #29
File: Evaluate.py Project: zuoxiaolei/TencetAD2020
def evaluate_age():
    features = pd.read_csv(
        'data/combine_feature/part-00000-380aaa4b-c838-43f4-8cb7-80164a4256f2-c000.csv'
    )
    y = features.age.values
    features.drop(['user_id', 'age', 'gender'], axis=1, inplace=True)
    print(features.shape)
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        y,
                                                        test_size=0.2)
    lightgbm = LGBMClassifier(n_estimators=200,
                              num_leaves=100,
                              feature_fraction=0.75,
                              bagging_fraction=0.75,  # only takes effect with bagging_freq > 0
                              learning_rate=0.1)
    lightgbm.fit(X_train,
                 y_train,
                 eval_set=[(X_test, y_test)],
                 early_stopping_rounds=5)
    pred = lightgbm.predict(X_test)
    print(classification_report(y_test, pred))
    joblib.dump(lightgbm, 'data/lgb_age')
Code example #30
def get_base_models():
    base_models = []
    base_models.append(('LR', LogisticRegression()))
    base_models.append(('LDA', LinearDiscriminantAnalysis()))
    base_models.append(('KNN', KNeighborsClassifier()))
    base_models.append(('DTC', DecisionTreeClassifier()))
    base_models.append(('NB', GaussianNB()))
    base_models.append(('SVM', SVC(probability=True, max_iter=100)))
    base_models.append(('AB', AdaBoostClassifier()))
    base_models.append(('RF', RandomForestClassifier()))
    base_models.append(('ET', ExtraTreesClassifier()))
    base_models.append(('LGBM', LGBMClassifier()))
    return base_models
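A typical consumer loops over the (name, model) pairs and cross-validates each estimator. A sketch, assuming a prepared feature matrix X and label vector y:

from sklearn.model_selection import cross_val_score

for name, model in get_base_models():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print('%s: %.4f (+/- %.4f)' % (name, scores.mean(), scores.std()))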