Example #1
class MyGradientBoostingClassifier(BaseClassifier):
    def __init__(self,
                 verbose=1,
                 n_estimators=5,
                 max_depth=6,
                 min_samples_leaf=100):
        self.classifier = GradientBoostingClassifier(
            **{
                'verbose': verbose,
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_leaf': min_samples_leaf
            })
        self.name = "gb_n{n}_md{md}_ms{ms}".format(**{
            "n": n_estimators,
            "md": max_depth,
            "ms": min_samples_leaf
        })

    def get_name(self):
        return self.name

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        ipts = dict(zip(feat_names, self.classifier.feature_importances_))
        return ipts
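
A minimal usage sketch for the wrapper above (assuming BaseClassifier and sklearn.ensemble.GradientBoostingClassifier are already imported in scope; the data and parameter values below are made up for illustration):

import numpy as np

X = np.random.rand(200, 4)                    # synthetic features
y = (X[:, 0] + X[:, 1] > 1).astype(int)       # synthetic binary labels

model = MyGradientBoostingClassifier(n_estimators=10, max_depth=3,
                                     min_samples_leaf=5)
model.fit(X, y)
proba = model.predict_proba(X)                # shape (200, 2)
print(model.get_name())                       # "gb_n10_md3_ms5"
print(model.get_feature_importances(["f0", "f1", "f2", "f3"]))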
Example #2
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    data_test = param["data_test"]
    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df_test = pd.read_csv(data_test, index_col=idx)

    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=50,
        learning_rate=0.3,
    )
    clf.fit(X, y)
    y_pred = clf.predict(X_guest)
    acc = accuracy_score(y_guest, y_pred)
    result = {"accuracy": acc}
    print(result)
    return {}, result
Example #3
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]

    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df = df_guest.join(df_host, rsuffix='host')
    y = df[label_name]
    X = df.drop(label_name, axis=1)

    clf = GradientBoostingClassifier(
        random_state=0,
        n_estimators=120 if 'epsilon' in data_guest else 50,
        learning_rate=0.1)
    clf.fit(X, y)

    y_prob = clf.predict(X)

    try:
        auc_score = roc_auc_score(y, y_prob)
    except ValueError:
        print("no auc score available")
        return

    result = {"auc": auc_score}
    print(result)
    return {}, result
Example #4
def training(processed_train_csv_file):
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print "Training Random Forest Classifier"
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print "Saving the Random Forest Classifier"
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print "Training Gradient Boosting Classifier"
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    gb_classifier.fit(features, labels)
    print "Saving the Gradient Boosting Classifier"
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print "Training SGD Classifier"
    sgd_classifier = SGDClassifier(loss="modifier_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print "Saving the SGD Classifier"
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
class Boosting():
    '''
    Thin wrapper around a gradient boosting classifier (GB) that predicts on a
    sparse matrix in row chunks so it never has to be densified all at once.
    '''
    def __init__(self):
        self.clf = GB()

    def fit(self, X, y):
        '''
        Fit the underlying classifier.
        :param X: training features
        :param y: training labels
        :return: None
        '''
        self.clf.fit(X, y)

    def predict(self, X):
        '''
        Predict labels for a sparse matrix X, densifying roughly sqrt(n) row
        chunks one at a time to keep memory bounded.
        :param X: sparse feature matrix
        :return: list of predicted labels
        '''
        m = int(X.shape[0] ** 0.5)
        pred = []
        for i in range(m):
            # densify only the current slice of rows before predicting
            pred.extend(self.clf.predict(X[i * X.shape[0] // m:(i + 1) * X.shape[0] // m].toarray()))
        return pred
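The point of the chunked predict above is that X is expected to be a scipy sparse matrix: densifying it slice by slice with .toarray() keeps peak memory bounded. A small sketch, assuming GB is an alias for sklearn.ensemble.GradientBoostingClassifier:

import numpy as np
from scipy import sparse

X_dense = np.random.rand(400, 20)
y = (X_dense[:, 0] > 0.5).astype(int)
X = sparse.csr_matrix(X_dense)      # sparse input, as predict() expects

model = Boosting()
model.fit(X_dense, y)               # fit on the dense copy here for simplicity
preds = model.predict(X)            # densifies ~sqrt(400) = 20 row chunks, one at a time
assert len(preds) == X.shape[0]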
def gradientBoostingClassifier(X_train, y_train, X_dev, y_dev):
    print("\nPerforming Gradient Boosting.")
    gb = GradientBoostingClassifier(n_estimators=50,
                                    learning_rate=0.25,
                                    max_depth=5,
                                    random_state=0)
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_dev)
    accuracy = np.mean(y_dev == y_pred)
    print("Accuracy", accuracy)
    return gb, accuracy
Example #7
class Boosting():
    #TODO: documentation
    def __init__(self):
        self.clf = GB()
    def fit(self,X,y):
        self.clf.fit(X,y)
    def predict(self,X):
        m = int(X.shape[0] ** (0.5))
        pred = []
        for I in range(m):
            pred.extend(self.clf.predict(X[I*X.shape[0]//m:(I+1)*X.shape[0]//m].toarray()))
        return pred
Example #8
def do_training(processed_train_csv_file):
    ## Processed train samples reading
    # read saved processed train samples from the given csv file
    processed_train_samples = pd.read_csv(processed_train_csv_file)

    # inf to nan
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    # nan to 0
    processed_train_samples = processed_train_samples.fillna(value=0)

    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    # the rows were sorted earlier; shuffling them here gives better results
    random.shuffle(processed_train_samples_index_lst)

    # organize new train samples and targets
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples['booking_bool'].values

    # Model training
    # 1 Random Forest Classifier

    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150,
                                           verbose=2,
                                           n_jobs=-1,
                                           min_samples_split=10)
    rf_classifier.fit(features, labels)

    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name='rf_classifier.pkl')

    # 2 Gradient Boosting Classifier
    print("Gradient Boosting  Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150,
                                               verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting  Classifier")
    data_io.save_model(gb_classifier, model_name='gb_classifier.pkl')

    # 3 SGD Classifier
    print("SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2,
                                   n_jobs=-1)
    sgd_classifier.fit(features, labels)

    print("saved the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name='sgd_classifier.pkl')
Example #9
def GN(pth):
    # image_paths, train_labels and img_classes are assumed to be module-level
    # globals defined elsewhere in this script
    train_desc = np.load(pth + '/training_features.npy')
    nbr_occurences = np.sum((train_desc > 0) * 1, axis=0)
    # idf is computed here but not used further below
    idf = np.array(np.log((1.0 * len(image_paths) + 1) / (1.0 * nbr_occurences + 1)), 'float32')

    # Scaling the words
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)
    modelGN = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                         max_depth=1, random_state=0)
    modelGN.fit(train_desc, np.array(train_labels))
    joblib.dump((modelGN, img_classes, stdSlr), pth + "/gn-bof.pkl", compress=3)
    test(pth, "gn-")
def classify_gbc(data_sets, label_sets):
    params = {
        'n_estimators': 100,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': 0.01,
        'loss': 'deviance',
        'verbose': 0
    }
    clf = GradientBoostingClassifier(**params)
    clf.fit(data_sets, label_sets)
    # print(clf.score(data_sets, label_sets))
    return clf
Example #11
 def fit(self, X, y=None):
     self._sklearn_model = SKLModel(**self._hyperparams)
     if (y is not None):
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
Example #12
def classification(model_name, samples, labels, rangex, rangey):
    samples = np.array(samples)
    labels = np.array(labels)

    # build the model
    models = {
        "KNN": KNeighborsClassifier(),
        "LDA": LinearDiscriminantAnalysis(),
        "NB": GaussianNB(),
        "TREE": DecisionTreeClassifier(),
        "RF": RandomForestClassifier(n_estimators=20),
        "SVM": SVC(gamma='scale'),
        "PERC": Perceptron(max_iter=2000),
        "GB": GradientBoostingClassifier()
    }
    model = models.get(model_name)

    # train the model
    model.fit(samples, labels)
    print("classifier ", model, " - created")

    # build the matrix of results using the model
    result = np.zeros([rangex, rangey])
    for x in range(rangex):
        for y in range(rangey):
            sample = np.array([x, y])
            result[x][y] = model.predict(sample.reshape(1, -1))

    return result
def grid_search(data_sets, label_sets):
    param_grid = [{
        'n_estimators': [10, 100],
        'learning_rate': np.arange(0.01, 1, 0.03)
    }]
    # parameters that do not need to be searched can be fixed here
    params = {
        'max_depth': 4,
        'min_samples_split': 2,
        'loss': 'deviance',
        'verbose': 0
    }
    gbc = GradientBoostingClassifier(**params)
    # put the hyperparameter grid and the model into GridSearchCV for an automatic search
    clf = GridSearchCV(gbc, param_grid, cv=5)
    clf.fit(data_sets, label_sets)

    # get the best model selected by the search
    best_model = clf.best_estimator_

    # show the best hyperparameter configuration that was selected
    print(clf.best_params_)
    # the score reported here uses gbc's default scoring criterion
    print(clf.best_score_)
    return best_model
Example #14
def model_pred(trainX, trainY, testX, model_type):
    if model_type == "rf":
        clf = RandomForestClassifier(n_estimators=500, n_jobs=20)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    if model_type == "gbdt":
        clf = GradientBoostingClassifier(n_estimators=6,
                                         learning_rate=0.9,
                                         random_state=0)
        clf.fit(trainX, trainY)
        pred = clf.predict(testX)
    if model_type == "fusion":
        prob = np.zeros(len(testX))
        params = [100, 200, 300, 400, 500]
        for param in params:
            clf = RandomForestClassifier(n_estimators=param,
                                         n_jobs=20,
                                         bootstrap=True)
            clf.fit(trainX, trainY)
            prob += clf.predict(testX)
        '''
        params = [1,2,3,4,5,6,7,8,9,10]
        for param in params:
            clf = GradientBoostingClassifier(n_estimators=param,learning_rate=0.9,random_state=0)
            clf.fit(trainX,trainY)
            prob += clf.predict(testX)
        '''
        pred = list(prob >= 3)
    print "the pos rate is:", float(sum(pred)) / len(pred)
    return pred
Example #15
def trainGBT(requestsQ, responsesQ):
    while True:
        args = requestsQ.get()
        if args[0] == 'KILL':
            break

        vectors = args[1]     
        # expected in the order of learningRate, maxTrees, minSplitSize, maxDepth
        hyperparams = args[2]

        model =   GradientBoostingClassifier(learning_rate=hyperparams[0], n_estimators=hyperparams[1], min_samples_split=hyperparams[2], max_depth=hyperparams[3])
        
        model.fit(vectors['Xtrain'], vectors['Ytrain'])
        score = accuracy_score(vectors['Ytest'], model.predict(vectors['Xtest']))        
        responsesQ.put((model, score), True)

    return 0
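A hypothetical driver for the queue-based worker above; the request layout (a tag, the vectors dict, and the hyperparameter list) mirrors how trainGBT reads its arguments, and Xtr/ytr/Xte/yte stand for pre-split arrays you supply yourself:

from multiprocessing import Process, Queue

requestsQ, responsesQ = Queue(), Queue()
worker = Process(target=trainGBT, args=(requestsQ, responsesQ))
worker.start()

# Xtr, ytr, Xte, yte: your own pre-split train/test arrays (placeholders here)
vectors = {'Xtrain': Xtr, 'Ytrain': ytr, 'Xtest': Xte, 'Ytest': yte}
# hyperparams order expected by the worker: learningRate, maxTrees, minSplitSize, maxDepth
requestsQ.put(('TRAIN', vectors, [0.1, 100, 2, 3]))
model, score = responsesQ.get()

requestsQ.put(('KILL',))            # sentinel that makes the worker exit its loop
worker.join()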
Example #16
 def __init__(self, verbose=1, n_estimators = 200, max_depth=8, min_samples_leaf=10000):
     self.classifier = GradientBoostingClassifier( **{'verbose': verbose,
                                                  'n_estimators': n_estimators,
                                                  'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf
                                                  })
     self.name = "gb_n{n}_md{md}_ms{ms}".format(
         **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
     )
Example #17
 def __init__(self,
              verbose=1,
              n_estimators=5,
              max_depth=6,
              min_samples_leaf=100):
     self.classifier = GradientBoostingClassifier(
         **{
             'verbose': verbose,
             'n_estimators': n_estimators,
             'max_depth': max_depth,
             'min_samples_leaf': min_samples_leaf
         })
     self.name = "gb_n{n}_md{md}_ms{ms}".format(**{
         "n": n_estimators,
         "md": max_depth,
         "ms": min_samples_leaf
     })
Example #18
def main(config="../../config.yaml", param="./gbdt_config_binary.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)

    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest),
                           index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host),
                          index_col=idx)
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=120 if 'epsilon' in data_guest else 50, learning_rate=0.1)
    clf.fit(X, y)
    y_prob = clf.predict(X_guest)

    try:
        auc_score = roc_auc_score(y_guest, y_prob)
    except ValueError:
        print("no auc score available")
        return

    result = {"auc": auc_score}
    import time
    print(result)
    print(data_guest)
    time.sleep(3)
    return {}, result
Example #19
def gbdt_lr_train(libsvmFileName):

    # load the samples
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=42)

    # define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)

    # train it
    gbdt.fit(X_train, y_train)

    # predict and evaluate with AUC
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # train LR on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)    # predict and evaluate with AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on the original features: %.5f' % lr_test_auc)

    # encode the original features with GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # one-hot encode all leaf-index features
    (train_rows, cols) = X_train_leaves.shape

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # define the LR model
    lr = LogisticRegression()
    # train LR on the GBDT-encoded samples
    lr.fit(X_trans[:train_rows, :], y_train)
    # predict and evaluate with AUC
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on the GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine the encoded and original features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print(X_train_ext.shape)
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # predict and evaluate with AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on the combined features: %.5f' % gbdt_lr_auc2)
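The core trick above is gbdt.apply(X)[:, :, 0], which maps each sample to the index of the leaf it reaches in every tree; one-hot encoding those leaf indices gives the feature space the logistic regression is trained on. A standalone sketch of just that step on synthetic data (not part of the original snippet):

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

rng = np.random.RandomState(0)
X = rng.rand(500, 8)
y = (X[:, 0] + X[:, 1] > 1).astype(int)

gbdt = GradientBoostingClassifier(n_estimators=20, max_depth=3).fit(X, y)
leaves = gbdt.apply(X)[:, :, 0]          # shape (500, 20): one leaf index per tree
enc = OneHotEncoder()
X_leaves = enc.fit_transform(leaves)     # sparse one-hot leaf features
lr = LogisticRegression(max_iter=1000).fit(X_leaves, y)
print(lr.score(X_leaves, y))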
Example #20
    def __init__(self,
                 stats,
                 data_node,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 output_dir=None,
                 meta_learner='lightgbm',
                 kfold=5):
        super().__init__(stats=stats,
                         data_node=data_node,
                         ensemble_method='stacking',
                         ensemble_size=ensemble_size,
                         task_type=task_type,
                         metric=metric,
                         output_dir=output_dir)

        self.kfold = kfold
        try:
            from lightgbm import LGBMClassifier
        except ImportError:
            warnings.warn(
                "Lightgbm is not imported! Stacking will use linear model instead!"
            )
            meta_learner = 'linear'

        self.meta_method = meta_learner

        # lightgbm is used as the default meta-learner
        if self.task_type in CLS_TASKS:
            if meta_learner == 'linear':
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(
                    learning_rate=0.05,
                    subsample=0.7,
                    max_depth=4,
                    n_estimators=250)
            elif meta_learner == 'lightgbm':
                from lightgbm import LGBMClassifier
                self.meta_learner = LGBMClassifier(max_depth=4,
                                                   learning_rate=0.05,
                                                   n_estimators=150,
                                                   n_jobs=1)
        else:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'lightgbm':
                from lightgbm import LGBMRegressor
                self.meta_learner = LGBMRegressor(max_depth=4,
                                                  learning_rate=0.05,
                                                  n_estimators=70,
                                                  n_jobs=1)
Example #21
 def init_gbdt(self):
     if self.gbdt_name == 'xgboost':
         gbdt = XGBClassifier()
     elif self.gbdt_name == 'gbdt':
         gbdt = GradientBoostingClassifier()
     elif self.gbdt_name == 'lgb':
         gbdt = LGBMClassifier()
     else:
         print('no valid gbdt model')
     return gbdt
Example #22
def apply_gradient_boosting(X_train_preprocessed, X_test_preprocessed, y_train,
                            y_test):

    ## TODO: Testing Hyper Parameters and Cross Validation

    print('Applying Gradient Boosting')

    # Training the classifier
    classifier = GradientBoostingClassifier(n_estimators=100)
    classifier = classifier.fit(X_train_preprocessed, y_train)

    # Testing the classifier on Test Data
    y_test_pred = classifier.predict(X_test_preprocessed)

    #Compute Accuracy Score
    acc = accuracy_score(y_test, y_test_pred, normalize=True)

    print('The accuracy achieved by the Gradient Boosting Classifier Model is: ', acc)

    return classifier, acc
Example #23
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)

    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest),
                           index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host),
                          index_col=idx)
    df = df_guest.join(df_host, rsuffix='host')
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    clf = GradientBoostingClassifier(random_state=0,
                                     n_estimators=50,
                                     learning_rate=0.3)
    clf.fit(X, y)
    y_pred = clf.predict(X)

    try:
        auc_score = roc_auc_score(y, y_pred)
    except ValueError:
        print("no auc score available")

    acc = accuracy_score(y, y_pred)
    result = {"accuracy": acc}
    print('multi result', result)
    return {}, result
def classify_gbc(data_sets, label_sets):

    # params = {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 2,
    #           'learning_rate': 0.01, 'loss': 'deviance', 'verbose': 0}

    # grid-search the best GBC hyperparameters
    grid_search(data_sets, label_sets)
    # the best parameters found by the grid CV search: 100, 0.52
    params = {
        'n_estimators': 100,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': 0.52,
        'loss': 'deviance',
        'verbose': 0
    }
    clf = GradientBoostingClassifier(**params)
    clf.fit(data_sets, label_sets)
    print(clf.score(data_sets, label_sets))

    return clf
Example #25
class MyGradientBoostingClassifier(BaseClassifier):
    def __init__(self, verbose=1, n_estimators = 200, max_depth=8, min_samples_leaf=10000):
        self.classifier = GradientBoostingClassifier( **{'verbose': verbose,
                                                     'n_estimators': n_estimators,
                                                     'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf
                                                     })
        self.name = "gb_n{n}_md{md}_ms{ms}".format(
            **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
        )

    def get_name(self):
        return self.name

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self):
        return self.classifier.feature_importances_
Example #26
class GradientBoostingClassifierImpl():
    def __init__(self,
                 loss='deviance',
                 learning_rate=0.1,
                 n_estimators=100,
                 subsample=1.0,
                 criterion='friedman_mse',
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0,
                 max_depth=3,
                 min_impurity_decrease=0.0,
                 min_impurity_split=None,
                 init=None,
                 random_state=None,
                 max_features=None,
                 verbose=0,
                 max_leaf_nodes=None,
                 warm_start=False,
                 presort='auto',
                 validation_fraction=0.1,
                 n_iter_no_change=None,
                 tol=0.0001):
        self._hyperparams = {
            'loss': loss,
            'learning_rate': learning_rate,
            'n_estimators': n_estimators,
            'subsample': subsample,
            'criterion': criterion,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_depth': max_depth,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'init': init,
            'random_state': random_state,
            'max_features': max_features,
            'verbose': verbose,
            'max_leaf_nodes': max_leaf_nodes,
            'warm_start': warm_start,
            'presort': presort,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'tol': tol
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
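A minimal usage sketch for the wrapper class above, assuming SKLModel is an alias for sklearn.ensemble.GradientBoostingClassifier and that the installed scikit-learn version still accepts the presort and min_impurity_split arguments listed in the constructor:

import numpy as np

X = np.random.rand(300, 5)
y = (X[:, 0] > 0.5).astype(int)

impl = GradientBoostingClassifierImpl(n_estimators=50, max_depth=2)
impl.fit(X, y)                          # the sklearn model is only built here
print(impl.predict(X[:5]))
print(impl.predict_proba(X[:5]).shape)  # (5, 2)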
Example #27
def test_categorical_gb(n_samples=100000, n_features=10, p=0.7):
    y = numpy.random.random(n_samples) > 0.5
    X = numpy.random.randint(40, size=[n_samples, n_features]) * 2
    X += numpy.random.random(size=[n_samples, n_features]) > p
    X += y[:, numpy.newaxis]

    from sklearn.cross_validation import train_test_split

    trainX, testX, trainY, testY = train_test_split(X, y)
    boosters = {
        'old':
        GradientBoostingClassifier(n_estimators=100,
                                   min_samples_split=50,
                                   max_depth=5),
        'cat':
        CommonGradientBoosting(loss=AdaLossFunction(),
                               subsample=0.5,
                               dtype=int,
                               base_estimator=CategoricalTreeRegressor()),
        'cat2':
        TreeGradientBoostingClassifier(
            loss=BinomialDeviance(),
            dtype='int',
            update_tree=False,
            base_estimator=SimpleCategoricalRegressor(n_features=2,
                                                      n_attempts=3,
                                                      method='cv')),
        'cat3':
        TreeGradientBoostingClassifier(
            loss=BinomialDeviance(),
            dtype='int',
            update_tree=False,
            base_estimator=ObliviousCategoricalRegressor(n_features=10,
                                                         n_categories_power=5,
                                                         splits=1,
                                                         pfactor=0.5)),
        'cat2-2':
        TreeGradientBoostingClassifier(
            loss=BinomialDeviance(),
            dtype='int',
            update_tree=False,
            n_threads=2,
            base_estimator=SimpleCategoricalRegressor(n_features=2,
                                                      n_attempts=1)),
        'cat-linear':
        CategoricalLinearClassifier(),
    }
    for name, booster in boosters.items():
        start = time.time()
        booster.fit(trainX, trainY)
        auc = roc_auc_score(testY, booster.predict_proba(testX)[:, 1])
        print(name, "spent:{:3.2f} auc:{}".format(time.time() - start, auc))
Example #28
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)

    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest),
                           index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host),
                          index_col=idx)

    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=50,
        learning_rate=0.3,
    )
    clf.fit(X, y)
    y_pred = clf.predict(X_guest)
    acc = accuracy_score(y_guest, y_pred)
    result = {"accuracy": acc}
    print(result)
    return {}, result
    def __init__(self, data, label, task, model_name='lgb', eval_metric=None, importance_threshold=0.0):
        '''
        :param data: DataFrame
        :param label: label name
        :param task: task type, one of [regression, classification]
        :param model: one of ['gbdt', 'xgb', 'lgb']
        :param importance_threshold: features with importance below this threshold are dropped
        '''
        self.data = data
        self.label = label
        self.task = task
        self.model_name = model_name
        self._importance_threshold = importance_threshold

        self.model = None
        # set the evaluation metric according to the task and the label values
        self.eval_metric = None

        if model_name == 'lgb':
            if self.task == 'classification':
                self.model = lgb.LGBMClassifier(**lgb_params)
                if self.data[self.label].unique().shape[0] == 2:
                    self.eval_metric = 'logloss'
                else:
                    self.eval_metric = 'logloss'
            elif self.task == 'regression':
                self.model = lgb.LGBMRegressor(**lgb_params)
                self.eval_metric = 'l2'
            else:
                raise ValueError('Task must be either "classification" or "regression"')
        elif model_name == 'xgb':
            if self.task == 'classification':
                self.model = xgb.XGBClassifier(**xgb_params)
                if self.data[self.label].unique().shape[0] == 2:
                    self.eval_metric = 'logloss'
                else:
                    self.eval_metric = 'mlogloss'
            elif self.task == 'regression':
                self.model = xgb.XGBRegressor(**xgb_params)
                self.eval_metric = 'rmse'
            else:
                raise ValueError('Task must be either "classification" or "regression"')
        else: # gbdt
            if self.task == 'classification':
                self.model = GradientBoostingClassifier(**gbdt_params)
            elif self.task == 'regression':
                self.model = GradientBoostingRegressor(**gbdt_params)
            else:
                raise ValueError('Task must be either "classification" or "regression"')
        if eval_metric:
            self.eval_metric = eval_metric
Example #30
def models():
    # Building and Cross-Validating the model
    algorithms = []
    names = []

    algorithms.append(('GB_Classifier', GradientBoostingClassifier()))
    algorithms.append(('Random_Forest', RandomForestClassifier()))
    algorithms.append(('ExtraTree_Classifier', ExtraTreesClassifier()))
    algorithms.append(('LDA_Classifier', LinearDiscriminantAnalysis()))
    algorithms.append(('KNN_Classification', KNeighborsClassifier()))
    algorithms.append(('ANN_Classification', MLPClassifier()))
    for name, algo in algorithms:
        names.append(name)
    return algorithms, names
Example #31
def gridsearch(params):
    tuning = GridSearchCV(estimator=GradientBoostingClassifier(),
                          param_grid=params,
                          scoring='accuracy',
                          n_jobs=4,
                          iid=False,
                          cv=5)
    X_train, X_test, y_train, y_test = dataset()
    tuning.fit(X_train, y_train)
    best_params = tuning.best_params_
    score = tuning.score(X_train, y_train)
    print(score)
    print(best_params)
Example #32
 def _create_estimator(self):
     return GradientBoostingClassifier(
         loss=self.loss,
         learning_rate=self.learning_rate,
         n_estimators=self.n_estimators,
         min_samples_split=self.min_samples_split,
         min_samples_leaf=self.min_samples_leaf,
         min_weight_fraction_leaf=self.min_weight_fraction_leaf,
         max_depth=self.max_depth,
         init=self.init,
         subsample=self.subsample,
         max_features=self.max_features,
         random_state=self.random_state,
         verbose=self.verbose,
         max_leaf_nodes=self.max_leaf_nodes)
Example #33
 def gbm_model_train(self,train,targets,run_gs):
     
     if run_gs==False:
         gbm0 = GradientBoostingClassifier(random_state=0)
         gbm0.fit(train,targets)
         cv_result=cross_val_score(gbm0,train,targets,cv=5)
         print('gradient boosting cross validation score is ',cv_result.mean() )
         
     else:
         # using grid search CV with a random forest classifier
         rf=RandomForestClassifier(random_state=0)
         parameters = {
              'max_depth' : [6, 8,10],
              'n_estimators': [50, 100,200,400],
              'max_features': ['sqrt', 'auto', 'log2'],
              'min_samples_split': [3,5, 10],
              'min_samples_leaf': [5, 10, 15],
              'bootstrap': [True, False],
              'criterion':['gini','entropy']
              }
         grid_sear=GridSearchCV(rf,param_grid=parameters,scoring='accuracy',cv=10)
         grid=grid_sear.fit(train,targets)
         print(grid.best_score_)
         print(grid.best_params_)
Example #34
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    data_test = param["data_test"]

    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=120 if 'epsilon' in data_guest else 50, learning_rate=0.1)
    clf.fit(X, y)
    y_prob = clf.predict(X_guest)

    try:
        auc_score = roc_auc_score(y_guest, y_prob)
    except ValueError:
        print("no auc score available")
        return

    result = {"auc": auc_score}
    import time
    print(result)
    print(data_guest)
    time.sleep(3)
    return {}, result
Example #35
def get_feature_ranking(X_train, y_train):
    print("feature ranking running....-> LogisticRegression")
    model1 = LogisticRegression(max_iter=500)
    rfe = RFECV(estimator=model1,
                step=1,
                cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    logr_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        logr_ranking.append([d, x])
    logr_ranking = pd.DataFrame(logr_ranking, columns=['features1', 'logr'])
    logr_ranking.sort_values('features1', inplace=True)

    print("feature ranking running....-> GradientBoostingClassifier")
    model2 = GradientBoostingClassifier()
    rfe = RFECV(estimator=model2,
                step=1,
                cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    gboost_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        gboost_ranking.append([d, x])
    gboost_ranking = pd.DataFrame(gboost_ranking,
                                  columns=['features2', 'gboost'])
    gboost_ranking.sort_values('features2', inplace=True)

    print("feature ranking running....-> AdaBoostClassifier")
    model3 = AdaBoostClassifier()
    rfe = RFECV(estimator=model3,
                step=1,
                cv=StratifiedKFold(2),
                scoring='accuracy')
    rfe = rfe.fit(X_train, y_train)
    adaboost_ranking = []
    for x, d in zip(rfe.ranking_, X_train.columns):
        adaboost_ranking.append([d, x])
    adaboost_ranking = pd.DataFrame(adaboost_ranking,
                                    columns=['features3', 'adaboost'])
    adaboost_ranking.sort_values('features3', inplace=True)

    feature_sum = logr_ranking['logr'] + gboost_ranking[
        'gboost'] + adaboost_ranking['adaboost']
    df_ranked = pd.concat([logr_ranking['features1'], feature_sum], axis=1)
    df_ranked.sort_values(0, inplace=True)

    return df_ranked
def trainModel(param,feat_folder,feat_name):
    #read data from folder
    print('now we read data from folder: %s' % feat_folder)
   
    #start cv
    print('now we need to generate cross_validation')
    accuracy_cv = []
  
    for i in range(0,2):
        print('this is the run:%d cross-validation' % (i+1))
        testIndex = loadCVIndex("%s/test.run%d.txt"%("../data/feat/combine",(i+1)))
        #if we use xgboost to train model ,we need to use svmlib format
        if param['task'] in ['regression']:
            #with xgb we will dump the file with CV,and we will read data 
            train_data = xgb.DMatrix("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            valid_data = xgb.DMatrix("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            watchlist = [(train_data,'train'),(valid_data,'valid')]
            bst = xgb.train(param,train_data,int(param['num_round']),watchlist)
            pred = bst.predict(valid_data)
        
        elif param['task'] in ['clf_skl_lr']:
            train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            train_data  = train_data.tocsr()
            test_data = test_data.tocsr()
            clf = LogisticRegression()
            clf.fit(train_data,train_label)
            pred = clf.predict(test_data)
        
        elif param['task'] == "reg_skl_rf":
                    ## regression with sklearn random forest regressor
                    train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
                    test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
                    rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                               max_features=param['max_features'],
                                               n_jobs=param['n_jobs'],
                                               random_state=param['random_state'])
                    rf.fit(train_data, test_label)
                    pred = rf.predict(test_data)
        
        elif param['task'] == "reg_skl_etr":
                    ## regression with sklearn extra trees regressor
                    train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
                    test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
                    etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                              max_features=param['max_features'],
                                              n_jobs=param['n_jobs'],
                                              random_state=param['random_state'])
                    etr.fit(train_data,test_label)
                    pred = etr.predict(test_data)
                    
        elif param['task'] in ['reg_skl_gbm'] :
            train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            feat_names.remove('cid')
            gbm.fit(train_data,train_label)
            pred = gbm.predict(test_data) 
        
        elif param['task'] in ['reg_skl_ridge']:
            train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            train_data  = train_data.tocsr()
            test_data = test_data.tocsr()
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data,train_label)
            
            predraw = ridge.predict(test_data)
            print(predraw)
            predrank = predraw.argsort().argsort()
            trainIndex = loadCVIndex("%s/train.run%d.txt"%("../data/feat/combine",(i+1)))
            cdf = creatCDF(train, trainIndex)
            pred = getScore(predrank,cdf)
            print(pred)
            
        """
        elif param['task'] in ['regression']:
            
            
        
        elif param['task'] in ['reg_skl_gbm'] :
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            feat_names.remove('cid')
            gbm.fit(train_data[feat_names],train_data['cid'])
            pred = gbm.predict(valid_data[feat_names])
        elif param['task'] in ['reg_skl_ridge']:
            feat_names.remove('cid')
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data[feat_names],train_data['cid'])
            pred = ridge.predict(valid_data[feat_names])
        """
        #now we use the accuracy to evaluate our model
        acc = accuracy_model(pred,train.iloc[testIndex]['cid'])
        print("the model accuracy: %s" % acc)
        accuracy_cv.append(acc)

    #here we compute the mean and std of the cross-validation accuracy
    accuracy_cv_mean = np.mean(accuracy_cv)
    accuracy_cv_std = np.std(accuracy_cv)
    print('the accuracy: %.6f' % accuracy_cv_mean)
    return {'loss':-accuracy_cv_mean,'attachments':{'std':accuracy_cv_std},'status': STATUS_OK}
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas
import output_coursera as coursera
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

data = pandas.read_csv('gbm-data.csv')
X = data[data.columns[1:]].values
y = data[data.columns[0]].values


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

clf = GradientBoostingClassifier(n_estimators=250, random_state=241, verbose=True)


sigmoid = np.vectorize(lambda x: (1 / (1 + math.exp(-x))))


coursera.output('overfitting.txt', 'overfitting')

looses = {}
def plot_score(test_predictions, y_test, train_predictions, y_train, color, learning_rate):
    test_loss = [log_loss(y_test, pred) for pred in test_predictions]
    train_loss = [log_loss(y_train, pred) for pred in train_predictions]

    plt.plot(test_loss, color, linewidth=2)
    plt.plot(train_loss, color+'--', linewidth=2)
    looses[learning_rate] = test_loss
# <codecell>

df2 = df[selected]

# <codecell>

X, y = shuffle(df2[possible_features], df2.bad)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# <codecell>

params = {'init': LogOddsEstimator(), 'n_estimators': 5, 'max_depth': 6, 'learning_rate': 0.1, 'loss': 'bdeviance'}
clf = GradientBoostingClassifier(**params)

# <codecell>

clf = clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

# <codecell>

clf.feature_importances_

# <codecell>

print "Mean Squared Error"
mse = mean_squared_error(y_test, predicted)
print("MSE: %.4f" % mse)
features_data_count = X.count()
missing = features_data_count[features_data_count < matches_count]
missing = missing.apply(lambda x: "missing {} of {}".format(matches_count - x, matches_count))
print(missing)


X = X.fillna(0)


# ===================== GradientBoosting ==============================

size = 0
score = 0
for forest_size in [10, 20, 30, 50, 150, 300]:
    start_time = datetime.datetime.now()
    clf = GradientBoostingClassifier(n_estimators=forest_size)
    k_folder = KFold(X.shape[0], n_folds=5, shuffle=True)
    scores = cross_val_score(clf, X=X, y=y, cv=k_folder, scoring='roc_auc')
    current_score = np.mean(scores)
    print("for {} trees mean score has been {} and time elapsed {}".format(forest_size, current_score, datetime.datetime.now() - start_time))
    if score < current_score:
        score = current_score
        size = forest_size

print("best score was for {} forest size: {}".format(size, score))

# ===================LogisticRegression=================
features = X
def train_logistic(features, target, label):
    scaler = StandardScaler()
    features = scaler.fit_transform(features)