Example no. 1
def training(processed_train_csv_file):
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print "Training Random Forest Classifier"
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print "Saving the Random Forest Classifier"
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print "Training Gradient Boosting Classifier"
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    gb_classifier.fit(features, labels)
    print "Saving the Gradient Boosting Classifier"
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print "Training SGD Classifier"
    sgd_classifier = SGDClassifier(loss="modifier_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print "Saving the SGD Classifier"
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
Example no. 2
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    data_test = param["data_test"]
    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df_test = pd.read_csv(data_test, index_col=idx)

    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=50,
        learning_rate=0.3,
    )
    clf.fit(X, y)
    y_pred = clf.predict(X_guest)
    acc = accuracy_score(y_guest, y_pred)
    result = {"accuracy": acc}
    print(result)
    return {}, result
Example no. 3
def classification(model_name, samples, labels, rangex, rangey):
    samples = np.array(samples)
    labels = np.array(labels)

    # build the model
    models = {
        "KNN": KNeighborsClassifier(),
        "LDA": LinearDiscriminantAnalysis(),
        "NB": GaussianNB(),
        "TREE": DecisionTreeClassifier(),
        "RF": RandomForestClassifier(n_estimators=20),
        "SVM": SVC(gamma='scale'),
        "PERC": Perceptron(max_iter=2000),
        "GB": GradientBoostingClassifier()
    }
    model = models.get(model_name)

    # train the model
    model.fit(samples, labels)
    print("classifier ", model, " - created")

    # build the matrix of results using the model
    result = np.zeros([rangex, rangey])
    for x in range(rangex):
        for y in range(rangey):
            sample = np.array([x, y])
            result[x][y] = model.predict(sample.reshape(1, -1))

    return result
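A minimal usage sketch for the function above (the sample points, labels, and grid size are made up for illustration; the scikit-learn classifier imports used inside classification are assumed to be in scope):

samples = [[2, 3], [3, 4], [4, 2], [10, 12], [12, 15], [14, 11]]
labels = [0, 0, 0, 1, 1, 1]

# build a 20x20 decision map with the random forest model
decision_map = classification("RF", samples, labels, rangex=20, rangey=20)
print(decision_map.shape)                         # (20, 20)
print(decision_map[0][0], decision_map[19][19])   # predicted class near each corner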
Example no. 4
def grid_search(data_sets, label_sets):
    param_grid = [{
        'n_estimators': [10, 100],
        'learning_rate': np.arange(0.01, 1, 0.03)
    }]
    # parameters that do not need to be searched can be fixed here
    params = {
        'max_depth': 4,
        'min_samples_split': 2,
        'loss': 'deviance',
        'verbose': 0
    }
    gbc = GradientBoostingClassifier(**params)
    # put the hyper-parameter grid and the model into GridSearchCV for automatic search
    clf = GridSearchCV(gbc, param_grid, cv=5)
    clf.fit(data_sets, label_sets)

    # retrieve the best model found by the search
    best_model = clf.best_estimator_

    # show the best hyper-parameter configuration
    print(clf.best_params_)
    # best_score_ uses the estimator's default scorer (accuracy), since no scoring was specified
    print(clf.best_score_)
    return best_model
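A minimal usage sketch for grid_search (toy data generated with make_classification; the GradientBoostingClassifier, GridSearchCV, and np imports are assumed as in the example above):

from sklearn.datasets import make_classification

X_toy, y_toy = make_classification(n_samples=200, n_features=8, random_state=0)
best_gbc = grid_search(X_toy, y_toy)
# the returned estimator is already refit on the full data with the best parameters
print(best_gbc.n_estimators, best_gbc.learning_rate)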
Example no. 5
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]

    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df = df_guest.join(df_host, rsuffix='host')
    y = df[label_name]
    X = df.drop(label_name, axis=1)

    clf = GradientBoostingClassifier(
        random_state=0,
        n_estimators=120 if 'epsilon' in data_guest else 50,
        learning_rate=0.1)
    clf.fit(X, y)

    y_prob = clf.predict_proba(X)[:, 1]

    try:
        auc_score = roc_auc_score(y, y_prob)
    except ValueError:
        print("no auc score available")
        return

    result = {"auc": auc_score}
    print(result)
    return {}, result
Example no. 6
def model_pred(trainX, trainY, testX, model_type):
    if model_type == "rf":
        clf = RandomForestClassifier(n_estimators=500, n_jobs=20)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    if model_type == "gbdt":
        clf = GradientBoostingClassifier(n_estimators=6,
                                         learning_rate=0.9,
                                         random_state=0)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    if model_type == "fusion":
        prob = np.zeros(len(testX))
        params = [100, 200, 300, 400, 500]
        for param in params:
            clf = RandomForestClassifier(n_estimators=param,
                                         n_jobs=20,
                                         bootstrap=True)
            clf.fit(trainX, trainY)
            prob += clf.predict(testX)
        '''
        params = [1,2,3,4,5,6,7,8,9,10]
        for param in params:
            clf = GradientBoostingClassifier(n_estimators=param,learning_rate=0.9,random_state=0)
            clf.fit(trainX,trainY)
            prob += clf.predict(testX)
        '''
        pred = list(prob >= 3)
    print "the pos rate is:", float(sum(pred)) / len(pred)
    return pred
Example no. 7
def gbdt_lr_train(libsvmFileName):

    # load the sample data
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=42)

    # define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)

    # train the model
    gbdt.fit(X_train, y_train)

    # predict and evaluate AUC
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # train LR on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    # predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on raw features: %.5f' % lr_test_auc)

    # encode the raw features with GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # one-hot encode all leaf-index features
    (train_rows, cols) = X_train_leaves.shape

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # define the LR model
    lr = LogisticRegression()
    # train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], y_train)
    # predict and evaluate AUC
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine the encoded and raw features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print(X_train_ext.shape)
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)
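A minimal usage sketch for gbdt_lr_train: a small synthetic dataset is written in libsvm format and passed in (the file name is illustrative; the imports used by the function, such as load_svmlight_file, train_test_split, OneHotEncoder, LogisticRegression, and scipy's hstack, are assumed to be in scope):

from sklearn.datasets import make_classification, dump_svmlight_file

X_toy, y_toy = make_classification(n_samples=500, n_features=20, random_state=42)
dump_svmlight_file(X_toy, y_toy, "toy_gbdt_lr.libsvm")   # hypothetical file name
gbdt_lr_train("toy_gbdt_lr.libsvm")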
Example no. 8
    def __init__(self,
                 stats,
                 data_node,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 output_dir=None,
                 meta_learner='lightgbm',
                 kfold=5):
        super().__init__(stats=stats,
                         data_node=data_node,
                         ensemble_method='stacking',
                         ensemble_size=ensemble_size,
                         task_type=task_type,
                         metric=metric,
                         output_dir=output_dir)

        self.kfold = kfold
        try:
            from lightgbm import LGBMClassifier
        except ImportError:
            warnings.warn(
                "LightGBM is not installed! Stacking will use a linear model instead!"
            )
            meta_learner = 'linear'

        self.meta_method = meta_learner

        # LightGBM is the default meta-learner
        if self.task_type in CLS_TASKS:
            if meta_learner == 'linear':
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(
                    learning_rate=0.05,
                    subsample=0.7,
                    max_depth=4,
                    n_estimators=250)
            elif meta_learner == 'lightgbm':
                from lightgbm import LGBMClassifier
                self.meta_learner = LGBMClassifier(max_depth=4,
                                                   learning_rate=0.05,
                                                   n_estimators=150,
                                                   n_jobs=1)
        else:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'lightgbm':
                from lightgbm import LGBMRegressor
                self.meta_learner = LGBMRegressor(max_depth=4,
                                                  learning_rate=0.05,
                                                  n_estimators=70,
                                                  n_jobs=1)
Example no. 9
 def init_gbdt(self):
     if self.gbdt_name == 'xgboost':
         gbdt = XGBClassifier()
     elif self.gbdt_name == 'gbdt':
         gbdt = GradientBoostingClassifier()
     elif self.gbdt_name == 'lgb':
         gbdt = LGBMClassifier()
     else:
         raise ValueError('no valid gbdt model: ' + self.gbdt_name)
     return gbdt
Example no. 10
def gradientBoostingClassifier(X_train, y_train, X_dev, y_dev):
    print("\nPerforming Gradient Boosting.")
    gb = GradientBoostingClassifier(n_estimators=50,
                                    learning_rate=0.25,
                                    max_depth=5,
                                    random_state=0)
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_dev)
    accuracy = np.mean(y_dev == y_pred)
    print("Accuracy", accuracy)
    return gb, accuracy
Example no. 11
def test_categorical_gb(n_samples=100000, n_features=10, p=0.7):
    y = numpy.random.random(n_samples) > 0.5
    X = numpy.random.randint(40, size=[n_samples, n_features]) * 2
    X += numpy.random.random(size=[n_samples, n_features]) > p
    X += y[:, numpy.newaxis]

    from sklearn.model_selection import train_test_split

    trainX, testX, trainY, testY = train_test_split(X, y)
    boosters = {
        'old':
        GradientBoostingClassifier(n_estimators=100,
                                   min_samples_split=50,
                                   max_depth=5),
        'cat':
        CommonGradientBoosting(loss=AdaLossFunction(),
                               subsample=0.5,
                               dtype=int,
                               base_estimator=CategoricalTreeRegressor()),
        'cat2':
        TreeGradientBoostingClassifier(
            loss=BinomialDeviance(),
            dtype='int',
            update_tree=False,
            base_estimator=SimpleCategoricalRegressor(n_features=2,
                                                      n_attempts=3,
                                                      method='cv')),
        'cat3':
        TreeGradientBoostingClassifier(
            loss=BinomialDeviance(),
            dtype='int',
            update_tree=False,
            base_estimator=ObliviousCategoricalRegressor(n_features=10,
                                                         n_categories_power=5,
                                                         splits=1,
                                                         pfactor=0.5)),
        'cat2-2':
        TreeGradientBoostingClassifier(
            loss=BinomialDeviance(),
            dtype='int',
            update_tree=False,
            n_threads=2,
            base_estimator=SimpleCategoricalRegressor(n_features=2,
                                                      n_attempts=1)),
        'cat-linear':
        CategoricalLinearClassifier(),
    }
    for name, booster in boosters.items():
        start = time.time()
        booster.fit(trainX, trainY)
        auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
        print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))
Example no. 12
def classify_gbc(data_sets, label_sets):
    params = {
        'n_estimators': 100,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': 0.01,
        'loss': 'deviance',
        'verbose': 0
    }
    clf = GradientBoostingClassifier(**params)
    clf.fit(data_sets, label_sets)
    # print(clf.score(data_sets, label_sets))
    return clf
Example no. 13
    def __init__(self, data, label, task, model_name='lgb', eval_metric=None, importance_threshold=0.0):
        '''
        :param data: DataFrame
        :param label: label column name
        :param task: task type, one of [regression, classification]
        :param model_name: one of ['gbdt', 'xgb', 'lgb']
        :param importance_threshold: features with importance below this threshold are dropped
        '''
        self.data = data
        self.label = label
        self.task = task
        self.model_name = model_name
        self._importance_threshold = importance_threshold

        self.model = None
        # set the evaluation metric based on the task and the label values
        self.eval_metric = None

        if model_name == 'lgb':
            if self.task == 'classification':
                self.model = lgb.LGBMClassifier(**lgb_params)
                if self.data[self.label].unique().shape[0] == 2:
                    self.eval_metric = 'logloss'
                else:
                    self.eval_metric = 'multi_logloss'
            elif self.task == 'regression':
                self.model = lgb.LGBMRegressor(**lgb_params)
                self.eval_metric = 'l2'
            else:
                raise ValueError('Task must be either "classification" or "regression"')
        elif model_name == 'xgb':
            if self.task == 'classification':
                self.model = xgb.XGBClassifier(**xgb_params)
                if self.data[self.label].unique().shape[0] == 2:
                    self.eval_metric = 'logloss'
                else:
                    self.eval_metric = 'mlogloss'
            elif self.task == 'regression':
                self.model = xgb.XGBRegressor(**xgb_params)
                self.eval_metric = 'rmse'
            else:
                raise ValueError('Task must be either "classification" or "regression"')
        else: # gbdt
            if self.task == 'classification':
                self.model = GradientBoostingClassifier(**gbdt_params)
            elif self.task == 'regression':
                self.model = GradientBoostingRegressor(**gbdt_params)
            else:
                raise ValueError('Task must be either "classification" or "regression"')
        # an explicitly passed eval_metric overrides the task-based default
        if eval_metric:
            self.eval_metric = eval_metric
Example no. 14
def gridsearch(params):
    tuning = GridSearchCV(estimator=GradientBoostingClassifier(),
                          param_grid=params,
                          scoring='accuracy',
                          n_jobs=4,
                          cv=5)
    X_train, X_test, y_train, y_test = dataset()
    tuning.fit(X_train, y_train)
    best_params = tuning.best_params_
    score = tuning.score(X_train, y_train)
    print(score)
    print(best_params)
Example no. 15
def models():
    # Building and Cross-Validating the model
    algorithms = []
    names = []

    algorithms.append(('GB_Classifier', GradientBoostingClassifier()))
    algorithms.append(('Random_Forest', RandomForestClassifier()))
    algorithms.append(('ExtraTree_Classifier', ExtraTreesClassifier()))
    algorithms.append(('LDA_Classifier', LinearDiscriminantAnalysis()))
    algorithms.append(('KNN_Classification', KNeighborsClassifier()))
    algorithms.append(('ANN_Classification', MLPClassifier()))
    for name, algo in algorithms:
        names.append(name)
    return algorithms, names
Example no. 16
 def _create_estimator(self):
     return GradientBoostingClassifier(
         loss=self.loss,
         learning_rate=self.learning_rate,
         n_estimators=self.n_estimators,
         min_samples_split=self.min_samples_split,
         min_samples_leaf=self.min_samples_leaf,
         min_weight_fraction_leaf=self.min_weight_fraction_leaf,
         max_depth=self.max_depth,
         init=self.init,
         subsample=self.subsample,
         max_features=self.max_features,
         random_state=self.random_state,
         verbose=self.verbose,
         max_leaf_nodes=self.max_leaf_nodes)
Example no. 17
def get_feature_ranking(X_train, y_train):
    print("feature ranking running....-> LogisticRegression")
    model1 = LogisticRegression(max_iter=500)
    rfe = RFECV(estimator=model1,
                step=1,
                cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    logr_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        logr_ranking.append([d, x])
    logr_ranking = pd.DataFrame(logr_ranking, columns=['features1', 'logr'])
    logr_ranking.sort_values('features1', inplace=True)

    print("feature ranking running....-> GradientBoostingClassifier")
    model2 = GradientBoostingClassifier()
    rfe = RFECV(estimator=model2,
                step=1,
                cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    gboost_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        gboost_ranking.append([d, x])
    gboost_ranking = pd.DataFrame(gboost_ranking,
                                  columns=['features2', 'gboost'])
    gboost_ranking.sort_values('features2', inplace=True)

    print("feature ranking running....-> AdaBoostClassifier")
    model3 = AdaBoostClassifier()
    rfe = RFECV(estimator=model3,
                step=1,
                cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    adaboost_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        adaboost_ranking.append([d, x])
    adaboost_ranking = pd.DataFrame(adaboost_ranking,
                                    columns=['features3', 'adaboost'])
    adaboost_ranking.sort_values('features3', inplace=True)

    feature_sum = logr_ranking['logr'] + gboost_ranking[
        'gboost'] + adaboost_ranking['adaboost']
    df_ranked = pd.concat([logr_ranking['features1'], feature_sum], axis=1)
    df_ranked.sort_values(0, inplace=True)

    return df_ranked
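A minimal usage sketch for get_feature_ranking (the breast-cancer data is used purely for illustration; the LogisticRegression, GradientBoostingClassifier, AdaBoostClassifier, RFECV, and StratifiedKFold imports are assumed as in the function above). Running three RFECV searches can take a few minutes even on this small dataset.

from sklearn.datasets import load_breast_cancer
import pandas as pd

data = load_breast_cancer()
X_train = pd.DataFrame(data.data, columns=data.feature_names)
y_train = data.target

# combined ranking: lower summed rank means the feature survived longer in all three RFECV runs
ranked = get_feature_ranking(X_train, y_train)
print(ranked.head(10))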
Example no. 18
 def __init__(self):
     n_estimators = 600
     max_depth = 3
     learning_rate = 0.01
     self.classifier = GradientBoostingClassifier(
         **{
             'verbose': 1,
             'n_estimators': n_estimators,
             'max_depth': max_depth,
             'learning_rate': learning_rate
         })
     self.name = "gb_n{n}_md{md}_lr{lr}".format(**{
         "n": n_estimators,
         "md": max_depth,
         "lr": learning_rate
     })
Example no. 19
    def __init__(self,
                 stats,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 output_dir=None,
                 meta_learner='xgboost'):
        super().__init__(stats=stats,
                         ensemble_method='blending',
                         ensemble_size=ensemble_size,
                         task_type=task_type,
                         metric=metric,
                         output_dir=output_dir)
        try:
            from xgboost import XGBClassifier
        except ImportError:
            warnings.warn(
                "XGBoost is not installed! Blending will use a linear model instead!"
            )
            meta_learner = 'linear'

        # We use Xgboost as default meta-learner
        if self.task_type in CLS_TASKS:
            if meta_learner == 'linear':
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(
                    learning_rate=0.05,
                    subsample=0.7,
                    max_depth=4,
                    n_estimators=250)
            elif meta_learner == 'xgboost':
                from xgboost import XGBClassifier
                self.meta_learner = XGBClassifier(max_depth=4,
                                                  learning_rate=0.05,
                                                  n_estimators=150)
        else:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'xgboost':
                from xgboost import XGBRegressor
                self.meta_learner = XGBRegressor(max_depth=4,
                                                 learning_rate=0.05,
                                                 n_estimators=70)
Example no. 20
    def __init__(self,
                 model_info,
                 ensemble_size,
                 task_type,
                 metric,
                 evaluator,
                 model_type='ml',
                 meta_learner='xgboost',
                 kfold=3,
                 save_dir=None,
                 random_state=None):
        super().__init__(model_info=model_info,
                         ensemble_size=ensemble_size,
                         task_type=task_type,
                         metric=metric,
                         evaluator=evaluator,
                         model_type=model_type,
                         save_dir=save_dir,
                         random_state=random_state)

        self.kfold = kfold
        # We use Xgboost as default meta-learner
        if self.task_type == CLASSIFICATION:
            if meta_learner == 'logistic':
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(
                    learning_rate=0.05,
                    subsample=0.7,
                    max_depth=4,
                    n_estimators=250)
            elif meta_learner == 'xgboost':
                from xgboost import XGBClassifier
                self.meta_learner = XGBClassifier(max_depth=4,
                                                  learning_rate=0.05,
                                                  n_estimators=150)
        elif self.task_type == REGRESSION:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'xgboost':
                from xgboost import XGBRegressor
                self.meta_learner = XGBRegressor(max_depth=4,
                                                 learning_rate=0.05,
                                                 n_estimators=70)
Example no. 21
 def __init__(self,
              verbose=1,
              n_estimators=5,
              max_depth=6,
              min_samples_leaf=100):
     self.classifier = GradientBoostingClassifier(
         **{
             'verbose': verbose,
             'n_estimators': n_estimators,
             'max_depth': max_depth,
             'min_samples_leaf': min_samples_leaf
         })
     self.name = "gb_n{n}_md{md}_ms{ms}".format(**{
         "n": n_estimators,
         "md": max_depth,
         "ms": min_samples_leaf
     })
Example no. 22
def main(config="../../config.yaml", param="./gbdt_config_binary.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)

    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest),
                           index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host),
                          index_col=idx)
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=120 if 'epsilon' in data_guest else 50, learning_rate=0.1)
    clf.fit(X, y)
    y_prob = clf.predict_proba(X_guest)[:, 1]

    try:
        auc_score = roc_auc_score(y_guest, y_prob)
    except ValueError:
        print("no auc score available")
        return

    result = {"auc": auc_score}
    import time
    print(result)
    print(data_guest)
    time.sleep(3)
    return {}, result
Example no. 23
def defaultModels(df_xmat, df_ymat_cat):

    #### representative common classifiers in sklearn ####
    classifiers = [
        GaussianNB(),
        LogisticRegression(max_iter=500),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(kernel='rbf'),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        RandomForestClassifier(),
    ]

    cv = StratifiedKFold(n_splits=10)

    res = []

    for clf in classifiers:

        print('processing...' + str(clf)[:10])

        metrics_cv = []

        for train_index, test_index in cv.split(df_xmat.values, df_ymat_cat):

            X_train = df_xmat.iloc[train_index, :].values
            X_test = df_xmat.iloc[test_index, :].values
            y_train = [df_ymat_cat[i] for i in train_index]
            y_test = [df_ymat_cat[i] for i in test_index]

            clf.fit(X_train, y_train)

            metrics_cv.append(clf.score(X_test, y_test))

        res.append([
            str(clf)[:10],
            np.array(metrics_cv).mean(axis=0),
            np.array(metrics_cv).std(axis=0)
        ])

    return res
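A minimal usage sketch for defaultModels (the iris data is used only for illustration; the classifier and StratifiedKFold imports are assumed as in the function above). Note that 10-fold stratified CV needs at least 10 samples per class.

from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()
df_xmat = pd.DataFrame(iris.data, columns=iris.feature_names)
df_ymat_cat = list(iris.target)

# 10-fold CV accuracy (mean and std) for each of the default classifiers
for name, mean_acc, std_acc in defaultModels(df_xmat, df_ymat_cat):
    print(name, round(mean_acc, 3), round(std_acc, 3))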
Example no. 24
def apply_gradient_boosting(X_train_preprocessed, X_test_preprocessed, y_train,
                            y_test):

    ## TODO: test hyper-parameters and add cross-validation

    print('Applying Gradient Boosting')

    # Training the classifier
    classifier = GradientBoostingClassifier(n_estimators=100)
    classifier = classifier.fit(X_train_preprocessed, y_train)

    # Testing the classifier on Test Data
    y_test_pred = classifier.predict(X_test_preprocessed)

    #Compute Accuracy Score
    acc = accuracy_score(y_test, y_test_pred, normalize=True)

    print('The accuracy achieved by the Gradient Boosting Classifier Model is:', acc)

    return classifier, acc
Example no. 25
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)

    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest),
                           index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host),
                          index_col=idx)
    df = df_guest.join(df_host, rsuffix='host')
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    clf = GradientBoostingClassifier(random_state=0,
                                     n_estimators=50,
                                     learning_rate=0.3)
    clf.fit(X, y)
    y_pred = clf.predict(X)

    try:
        auc_score = roc_auc_score(y, y_pred)
    except ValueError:
        print("no auc score available")

    acc = accuracy_score(y, y_pred)
    result = {"accuracy": acc}
    print('multi result', result)
    return {}, result
Example no. 26
def classify_gbc(data_sets, label_sets):

    # params = {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 2,
    #           'learning_rate': 0.01, 'loss': 'deviance', 'verbose': 0}

    # grid-search the best hyper-parameters for the GBC
    grid_search(data_sets, label_sets)
    # best parameters found by the grid search CV: n_estimators=100, learning_rate=0.52
    params = {
        'n_estimators': 100,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': 0.52,
        'loss': 'deviance',
        'verbose': 0
    }
    clf = GradientBoostingClassifier(**params)
    clf.fit(data_sets, label_sets)
    print(clf.score(data_sets, label_sets))

    return clf
Example no. 27
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)

    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest),
                           index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host),
                          index_col=idx)

    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=50,
        learning_rate=0.3,
    )
    clf.fit(X, y)
    y_pred = clf.predict(X_guest)
    acc = accuracy_score(y_guest, y_pred)
    result = {"accuracy": acc}
    print(result)
    return {}, result
Example no. 28
    def __init__(self):
        self.random_rate = 33
        clf1 = SVC(C=1.0, random_state=33)
        clf2 = XGBClassifier(n_estimators=220, learning_rate=0.2, min_child_weight=2.3)
        clf3 = RandomForestClassifier(n_estimators=80, random_state=330, n_jobs=-1)
        clf4 = BaggingClassifier(n_estimators=40, random_state=101)
        clf5 = AdaBoostClassifier(n_estimators=70, learning_rate=1.5, random_state=33)
        clf6 = GradientBoostingClassifier(n_estimators=250, learning_rate=0.23, random_state=33)

        clf7 = XGBClassifier(n_estimators=100, learning_rate=0.12, min_child_weight=1)

        base_model = [
            ['svc', clf1],
            ['xgbc', clf2],
            ['rfc', clf3],
            ['bgc', clf4],
            ['adbc', clf5],
            ['gdbc', clf6]
        ]

        self.base_models = base_model
        self.XGB = clf7
Example no. 29
 def gbdt_lr_train(self, Train_tab, Train_libsvm):
     # load the sample data
     X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
     # train/test split
     X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                         y_all,
                                                         test_size=0.1,
                                                         random_state=42)
     # define the GBDT model
     gbdt = GradientBoostingClassifier(n_estimators=40,
                                       max_depth=3,
                                       verbose=0,
                                       max_features=0.5)
     # train the model
     gbdt.fit(X_train, y_train)
     # encode the raw features with GBDT leaf indices
     X_train_leaves = gbdt.apply(X_train)[:, :, 0]
     X_test_leaves = gbdt.apply(X_test)[:, :, 0]
     # one-hot encode all leaf-index features
     (train_rows, cols) = X_train_leaves.shape
     gbdtenc = OneHotEncoder()
     X_trans = gbdtenc.fit_transform(
         np.concatenate((X_train_leaves, X_test_leaves), axis=0))
     # define the LR model
     lr = LogisticRegression(n_jobs=-1)
     # combine the encoded and raw features
     X_train_ext = hstack([X_trans[:train_rows, :], X_train])
     X_test_ext = hstack([X_trans[train_rows:, :], X_test])
     # train LR on the combined features
     lr.fit(X_train_ext, y_train)
     # persist the trained LR model, then reload it for prediction
     filename = 'finalized_model.sav'
     pickle.dump(lr, open(filename, 'wb'))
     # load the model from disk
     loaded_model = pickle.load(open(filename, 'rb'))
     y_pred_gbdtlr2 = loaded_model.predict_proba(X_test_ext)[:, 1]
     print(y_pred_gbdtlr2)
Example no. 30
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    data_test = param["data_test"]

    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=120 if 'epsilon' in data_guest else 50, learning_rate=0.1)
    clf.fit(X, y)
    y_prob = clf.predict_proba(X_guest)[:, 1]

    try:
        auc_score = roc_auc_score(y_guest, y_prob)
    except ValueError:
        print("no auc score available")
        return

    result = {"auc": auc_score}
    import time
    print(result)
    print(data_guest)
    time.sleep(3)
    return {}, result