Example 1
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    data_test = param["data_test"]
    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df_test = pd.read_csv(data_test, index_col=idx)

    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=50,
        learning_rate=0.3,
    )
    clf.fit(X, y)
    y_pred = clf.predict(X_guest)
    acc = accuracy_score(y_guest, y_pred)
    result = {"accuracy": acc}
    print(result)
    return {}, result
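A quick usage note: since main() accepts either a file path or an already-loaded mapping, it can be driven without a YAML file. A minimal sketch, assuming hypothetical CSV paths with an "id" index column and a "y" label column (JobConfig, pd, GradientBoostingClassifier and accuracy_score are assumed imported in the snippet's module):

param = {
    "data_guest": "data/breast_hetero_guest.csv",  # hypothetical path
    "data_host": "data/breast_hetero_host.csv",    # hypothetical path
    "data_test": "data/breast_hetero_test.csv",    # hypothetical path
    "idx": "id",            # index column name
    "label_name": "y",      # label column name
}
_, metrics = main(param)    # dict input skips the JobConfig.load_from_file branch
print(metrics["accuracy"])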
Example 2
class GradientBoostingClassifierImpl():
    # thin hyper-parameter-holding wrapper; SKLModel is presumably sklearn's
    # GradientBoostingClassifier (loss='deviance', presort and
    # min_impurity_split date this to an older sklearn release)
    def __init__(self,
                 loss='deviance',
                 learning_rate=0.1,
                 n_estimators=100,
                 subsample=1.0,
                 criterion='friedman_mse',
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0,
                 max_depth=3,
                 min_impurity_decrease=0.0,
                 min_impurity_split=None,
                 init=None,
                 random_state=None,
                 max_features=None,
                 verbose=0,
                 max_leaf_nodes=None,
                 warm_start=False,
                 presort='auto',
                 validation_fraction=0.1,
                 n_iter_no_change=None,
                 tol=0.0001):
        self._hyperparams = {
            'loss': loss,
            'learning_rate': learning_rate,
            'n_estimators': n_estimators,
            'subsample': subsample,
            'criterion': criterion,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_depth': max_depth,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'init': init,
            'random_state': random_state,
            'max_features': max_features,
            'verbose': verbose,
            'max_leaf_nodes': max_leaf_nodes,
            'warm_start': warm_start,
            'presort': presort,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'tol': tol
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
Example 3
class Boosting():
    '''Thin wrapper around a gradient boosting classifier (GB, presumably
    sklearn's GradientBoostingClassifier) that predicts a sparse matrix in
    chunks to limit peak memory.'''
    def __init__(self):
        self.clf = GB()

    def fit(self, X, y):
        '''Fit the underlying classifier.

        :param X: training features
        :param y: training labels
        :return: None
        '''
        self.clf.fit(X, y)

    def predict(self, X):
        '''Predict labels for sparse X, densifying roughly sqrt(n_samples)
        rows at a time instead of calling X.toarray() on the whole matrix.

        :param X: sparse feature matrix
        :return: list of predicted labels
        '''
        m = int(X.shape[0] ** 0.5)  # number of chunks
        pred = []
        for i in range(m):
            # densify only the current slice before predicting
            pred.extend(self.clf.predict(X[i * X.shape[0] // m:(i + 1) * X.shape[0] // m].toarray()))
        return pred
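Why the chunked predict: calling .toarray() on the full sparse matrix can exhaust memory, so the wrapper densifies one slice of roughly sqrt(n_samples) rows at a time. A self-contained usage sketch (assuming GB is sklearn's GradientBoostingClassifier, as the toarray() call suggests):

import numpy as np
from scipy import sparse
from sklearn.ensemble import GradientBoostingClassifier as GB

rng = np.random.default_rng(0)
X = sparse.random(400, 20, density=0.1, format="csr", random_state=0)
y = rng.integers(0, 2, size=400)

model = Boosting()
model.fit(X, y)            # sklearn's GBM accepts sparse input for fit
pred = model.predict(X)    # densifies one ~sqrt(n) slice at a time
print(len(pred))           # 400 predictions, in the original row order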
Example 4
def training(processed_train_csv_file):
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    # .ix was removed from pandas; .loc is the label-based equivalent here
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print("Training Random Forest Classifier")
    # learning_rate is not a RandomForestClassifier parameter and would raise a TypeError
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print("Training Gradient Boosting Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting Classifier")
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print("Training SGD Classifier")
    # "modifier_huber" is a typo; the valid loss name is "modified_huber"
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print("Saving the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
Example 5
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]

    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df = df_guest.join(df_host, rsuffix='host')
    y = df[label_name]
    X = df.drop(label_name, axis=1)

    clf = GradientBoostingClassifier(
        random_state=0,
        n_estimators=120 if 'epsilon' in data_guest else 50,
        learning_rate=0.1)
    clf.fit(X, y)

    # predict() returns hard labels; use predict_proba for a probability-based AUC
    y_prob = clf.predict_proba(X)[:, 1]

    try:
        auc_score = roc_auc_score(y, y_prob)
    except ValueError:
        print("no auc score available")
        return

    result = {"auc": auc_score}
    print(result)
    return {}, result
Example 6
def gbdt_lr_train(libsvmFileName):

    # load the sample data (libsvm format)
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=42)

    # define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)

    # train
    gbdt.fit(X_train, y_train)

    # predict and evaluate AUC
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # train LR on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)    # predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on raw features: %.5f' % lr_test_auc)

    # encode the raw features as GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # one-hot encode all leaf-index features
    (train_rows, cols) = X_train_leaves.shape

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # define the LR model
    lr = LogisticRegression()
    # train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], y_train)
    # predict and evaluate AUC
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine encoded and raw features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print(X_train_ext.shape)
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)
Example 7
def gradientBoostingClassifier(X_train, y_train, X_dev, y_dev):
    print("\nPerforming Gradient Boosting.")
    gb = GradientBoostingClassifier(n_estimators=50,
                                    learning_rate=0.25,
                                    max_depth=5,
                                    random_state=0)
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_dev)
    accuracy = np.mean(y_dev == y_pred)
    print("Accuracy", accuracy)
    return gb, accuracy
Example 8
class Boosting():
    # TODO: documentation (same chunked-predict wrapper as Example 3)
    def __init__(self):
        self.clf = GB()

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        m = int(X.shape[0] ** 0.5)  # number of chunks
        pred = []
        for i in range(m):
            # densify only the current slice before predicting
            pred.extend(self.clf.predict(X[i * X.shape[0] // m:(i + 1) * X.shape[0] // m].toarray()))
        return pred
Example 9
def do_training(processed_train_csv_file):
    ## Processed train samples reading
    # read saved processed train samples from the given csv file
    processed_train_samples = pd.read_csv(processed_train_csv_file)

    # inf to nan
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    # nan to 0
    processed_train_samples = processed_train_samples.fillna(value=0)

    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    # the samples were sorted earlier; shuffling here improves results
    random.shuffle(processed_train_samples_index_lst)

    # organize new train samples and targets
    # .ix was removed from pandas; .loc is the label-based equivalent here
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples['booking_bool'].values

    # Model training
    # 1 Random Forest Classifier

    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150,
                                           verbose=2,
                                           n_jobs=-1,
                                           min_samples_split=10)
    rf_classifier.fit(features, labels)

    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name='rf_classifier.pkl')

    # 2 Gradient Boosting Classifier
    print("Gradient Boosting  Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150,
                                               verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting  Classifier")
    data_io.save_model(gb_classifier, model_name='gb_classifier.pkl')

    # 3 SGD Classifier
    print("SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2,
                                   n_jobs=-1)
    sgd_classifier.fit(features, labels)

    print("saved the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name='sgd_classifier.pkl')
Example 10
def GN(pth):
    train_desc = np.load(pth + '/training_features.npy')
    nbr_occurences = np.sum((train_desc > 0) * 1, axis=0)
    # image_paths, train_labels and img_classes are assumed to be globals in the original module
    idf = np.array(np.log((1.0 * len(image_paths) + 1) / (1.0 * nbr_occurences + 1)), 'float32')

    # scale the visual-word histograms
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)
    modelGN = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                         max_depth=1, random_state=0)
    modelGN.fit(train_desc, np.array(train_labels))
    joblib.dump((modelGN, img_classes, stdSlr), pth + "/gn-bof.pkl", compress=3)
    test(pth, "gn-")
Example 11
def classify_gbc(data_sets, label_sets):
    params = {
        'n_estimators': 100,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': 0.01,
        'loss': 'deviance',
        'verbose': 0
    }
    clf = GradientBoostingClassifier(**params)
    clf.fit(data_sets, label_sets)
    # print(clf.score(data_sets, label_sets))
    return clf
Example 12
class MyGradientBoostingClassifier(BaseClassifier):
    def __init__(self,
                 verbose=1,
                 n_estimators=5,
                 max_depth=6,
                 min_samples_leaf=100):
        self.classifier = GradientBoostingClassifier(
            **{
                'verbose': verbose,
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_leaf': min_samples_leaf
            })
        self.name = "gb_n{n}_md{md}_ms{ms}".format(**{
            "n": n_estimators,
            "md": max_depth,
            "ms": min_samples_leaf
        })

    def get_name(self):
        return self.name

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self, feat_names):
        ipts = dict(zip(feat_names, self.classifier.feature_importances_))
        return ipts
Example 13
def trainGBT(requestsQ, responsesQ):
    while True:
        args = requestsQ.get()
        if args[0] == 'KILL':
            break

        vectors = args[1]
        # expected in the order: learningRate, maxTrees, minSplitSize, maxDepth
        hyperparams = args[2]

        model = GradientBoostingClassifier(learning_rate=hyperparams[0],
                                           n_estimators=hyperparams[1],
                                           min_samples_split=hyperparams[2],
                                           max_depth=hyperparams[3])

        model.fit(vectors['Xtrain'], vectors['Ytrain'])
        score = accuracy_score(vectors['Ytest'], model.predict(vectors['Xtest']))
        responsesQ.put((model, score), True)

    return 0
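trainGBT is written as a queue-driven worker: it blocks on requestsQ, trains one model per request, and ships (model, score) back until it sees a 'KILL' message. A possible driver (a sketch; the dataset, split, and hyperparameter tuple are illustrative, and trainGBT's module is assumed to import GradientBoostingClassifier and accuracy_score):

from multiprocessing import Process, Queue
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    X, y = make_classification(n_samples=500, random_state=0)
    Xtr, Xte, ytr, yte = train_test_split(X, y, random_state=0)
    vectors = {"Xtrain": Xtr, "Ytrain": ytr, "Xtest": Xte, "Ytest": yte}

    requestsQ, responsesQ = Queue(), Queue()
    worker = Process(target=trainGBT, args=(requestsQ, responsesQ))
    worker.start()

    # order: learningRate, maxTrees, minSplitSize, maxDepth
    requestsQ.put(("job", vectors, [0.1, 100, 2, 3]))
    model, score = responsesQ.get()
    print("accuracy:", score)

    requestsQ.put(("KILL",))   # shut the worker down
    worker.join()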
Example 14
def main(config="../../config.yaml", param="./gbdt_config_binary.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)

    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest),
                           index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host),
                          index_col=idx)
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=120 if 'epsilon' in data_guest else 50, learning_rate=0.1)
    clf.fit(X, y)
    # predict() returns hard labels; use predict_proba for a probability-based AUC
    y_prob = clf.predict_proba(X_guest)[:, 1]

    try:
        auc_score = roc_auc_score(y_guest, y_prob)
    except ValueError:
        print("no auc score available")
        return

    result = {"auc": auc_score}
    import time
    print(result)
    print(data_guest)
    time.sleep(3)
    return {}, result
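For reference, a hedged sketch of the two configuration objects loaded above, written as the Python dicts JobConfig would return. The keys come from the code; every value is a hypothetical placeholder:

config = {"data_base_dir": "/path/to/data"}        # hypothetical directory
param = {
    "data_guest": "breast_hetero_guest.csv",       # hypothetical file name
    "data_host": "breast_hetero_host.csv",         # hypothetical file name
    "idx": "id",                                   # index column
    "label_name": "y",                             # label column
}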
Example 15
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)

    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest),
                           index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host),
                          index_col=idx)
    df = df_guest.join(df_host, rsuffix='host')
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    clf = GradientBoostingClassifier(random_state=0,
                                     n_estimators=50,
                                     learning_rate=0.3)
    clf.fit(X, y)
    y_pred = clf.predict(X)

    # roc_auc_score raises on multi-class label predictions; fall back to accuracy
    try:
        auc_score = roc_auc_score(y, y_pred)
    except ValueError:
        print("no auc score available")

    acc = accuracy_score(y, y_pred)
    result = {"accuracy": acc}
    print('multi result', result)
    return {}, result
Example 16
def classify_gbc(data_sets, label_sets):

    # params = {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 2,
    #           'learning_rate': 0.01, 'loss': 'deviance', 'verbose': 0}

    # grid-search the best GBC hyper-parameters (a sketch of such a helper follows this function)
    grid_search(data_sets, label_sets)
    # best parameters found by the grid CV search: 100, 0.52
    params = {
        'n_estimators': 100,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': 0.52,
        'loss': 'deviance',
        'verbose': 0
    }
    clf = GradientBoostingClassifier(**params)
    clf.fit(data_sets, label_sets)
    print(clf.score(data_sets, label_sets))

    return clf
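grid_search() is not defined in this snippet; below is a plausible minimal sketch using sklearn's GridSearchCV over the two values the comment mentions (n_estimators and learning_rate). This is hypothetical, not the author's helper:

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

def grid_search(data_sets, label_sets):
    # illustrative grid; the comment above suggests 100 trees / lr 0.52 won
    param_grid = {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.52, 1.0],
    }
    gs = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5)
    gs.fit(data_sets, label_sets)
    print(gs.best_params_, gs.best_score_)
    return gs.best_params_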
Example 17
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)

    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest),
                           index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host),
                          index_col=idx)

    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=50,
        learning_rate=0.3,
    )
    clf.fit(X, y)
    y_pred = clf.predict(X_guest)
    acc = accuracy_score(y_guest, y_pred)
    result = {"accuracy": acc}
    print(result)
    return {}, result
Example 18
def gbdt_lr_train(self, Train_tab, Train_libsvm):
    # load the sample data (note: the Train_tab / Train_libsvm arguments are unused here)
    X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.1,
                                                        random_state=42)
    # define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)
    # train the model
    gbdt.fit(X_train, y_train)
    # encode the raw features as GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    # one-hot encode all leaf-index features
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine encoded and raw features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)
    # save the fitted model, then reload it for prediction
    filename = 'finalized_model.sav'
    pickle.dump(lr, open(filename, 'wb'))
    # load the model from disk
    loaded_model = pickle.load(open(filename, 'rb'))
    y_pred_gbdtlr2 = loaded_model.predict_proba(X_test_ext)[:, 1]
    print(y_pred_gbdtlr2)
Example 19
def gbm_model_train(self, train, targets, run_gs):

    if not run_gs:
        gbm0 = GradientBoostingClassifier(random_state=0)
        gbm0.fit(train, targets)
        cv_result = cross_val_score(gbm0, train, targets, cv=5)
        print('gradient boosting cross validation score is ', cv_result.mean())

    else:
        # grid search CV with a random forest classifier
        rf = RandomForestClassifier(random_state=0)
        parameters = {
            'max_depth': [6, 8, 10],
            'n_estimators': [50, 100, 200, 400],
            'max_features': ['sqrt', 'auto', 'log2'],
            'min_samples_split': [3, 5, 10],
            'min_samples_leaf': [5, 10, 15],
            'bootstrap': [True, False],
            'criterion': ['gini', 'entropy']
        }
        grid_sear = GridSearchCV(rf, param_grid=parameters, scoring='accuracy', cv=10)
        grid = grid_sear.fit(train, targets)
        print(grid.best_score_)
        print(grid.best_params_)
Example 20
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    data_test = param["data_test"]

    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=120 if 'epsilon' in data_guest else 50, learning_rate=0.1)
    clf.fit(X, y)
    # predict() returns hard labels; use predict_proba for a probability-based AUC
    y_prob = clf.predict_proba(X_guest)[:, 1]

    try:
        auc_score = roc_auc_score(y_guest, y_prob)
    except ValueError:
        print("no auc score available")
        return

    result = {"auc": auc_score}
    import time
    print(result)
    print(data_guest)
    time.sleep(3)
    return {}, result
Example 21
def apply_gradient_boosting(X_train_preprocessed, X_test_preprocessed, y_train,
                            y_test):

    # TODO: hyper-parameter testing and cross-validation (see the sketch after this function)

    print('Applying Gradient Boosting')

    # Training the classifier
    classifier = GradientBoostingClassifier(n_estimators=100)
    classifier = classifier.fit(X_train_preprocessed, y_train)

    # Testing the classifier on Test Data
    y_test_pred = classifier.predict(X_test_preprocessed)

    # Compute Accuracy Score
    acc = accuracy_score(y_test, y_test_pred, normalize=True)

    print('The accuracy achieved by the Gradient Boosting Classifier Model is: ', acc)

    return classifier, acc
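For the TODO above, a minimal sketch of the missing hyper-parameter search with cross-validation (the grid values are illustrative, not from the source):

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

def tune_gradient_boosting(X_train, y_train):
    # small illustrative grid; expand as needed
    grid = {"n_estimators": [50, 100, 200], "max_depth": [2, 3, 5]}
    search = GridSearchCV(GradientBoostingClassifier(), grid, cv=5,
                          scoring="accuracy")
    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_params_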
Example 22
class MyGradientBoostingClassifier(BaseClassifier):
    def __init__(self, verbose=1, n_estimators=200, max_depth=8, min_samples_leaf=10000):
        self.classifier = GradientBoostingClassifier(**{
            'verbose': verbose,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf
        })
        self.name = "gb_n{n}_md{md}_ms{ms}".format(
            **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
        )

    def get_name(self):
        return self.name

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self):
        return self.classifier.feature_importances_
Example 23
# beginning truncated in the source; the keyword arguments identify this as
# a sklearn.datasets.make_classification call (n_samples not recoverable)
X, y = make_classification(n_features=20,
                           n_informative=18,
                           n_redundant=2,
                           n_classes=2,
                           n_clusters_per_class=3,
                           random_state=2017)

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# train directly on the original features, without generating new ones
clf = GradientBoostingClassifier(n_estimators=50)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
print("Original featrues")
print("GBDT_ACC: {:.6f}".format(acc))
print("GBDT_AUC: {:.6f}".format(auc))

# generate new features: apply() returns, for every sample, its leaf index in each tree
X_train_leaves = clf.apply(X_train)[:, :, 0]
X_test_leaves = clf.apply(X_test)[:, :, 0]

# stack X_train_leaves and X_test_leaves along axis=0, then one-hot encode (see the continuation sketch below)
All_leaves = np.r_[X_train_leaves, X_test_leaves]
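The snippet ends right after stacking the leaf matrices; the natural continuation, mirroring the other GBDT-to-LR examples in this collection, would be (my completion, not the original code):

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

enc = OneHotEncoder()
All_trans = enc.fit_transform(All_leaves)        # one-hot over train+test leaves
train_rows = X_train_leaves.shape[0]

lr = LogisticRegression(max_iter=1000)
lr.fit(All_trans[:train_rows, :], y_train)       # LR on the encoded train rows
y_prob_lr = lr.predict_proba(All_trans[train_rows:, :])[:, 1]
print("GBDT-encoded LR AUC: {:.6f}".format(roc_auc_score(y_test, y_prob_lr)))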
Example 24
losses = {}
def plot_score(test_predictions, y_test, train_predictions, y_train, color, learning_rate):
    test_loss = [log_loss(y_test, pred) for pred in test_predictions]
    train_loss = [log_loss(y_train, pred) for pred in train_predictions]

    plt.plot(test_loss, color, linewidth=2)
    plt.plot(train_loss, color+'--', linewidth=2)
    losses[learning_rate] = test_loss

plt.figure()
colors = ['r', 'g', 'b', 'c', 'm']
learn_rates = [1, 0.5, 0.3, 0.2, 0.1]
for index, learning_rate in enumerate(learn_rates):
    clf.learning_rate = learning_rate
    clf.fit(X_train, y_train)
    test_predictions = clf.staged_predict_proba(X_test)
    train_predictions = clf.staged_predict_proba(X_train)
    plot_score(test_predictions, y_test, train_predictions, y_train, color=colors[index], learning_rate=learning_rate)

legends = [["Test {}".format(learn_rate), "Train {}".format(learn_rate)] for learn_rate in learn_rates]
legends = [item for sublist in legends for item in sublist]
plt.legend(legends)
plt.savefig("coursera_out/gradient_boosting.png")

min_loss_on_iteration = np.argmin(losses[0.2])
min_loss = losses[0.2][min_loss_on_iteration]
print("on iteration {} the loss was {}".format(min_loss_on_iteration, min_loss))
coursera.output("min_loose_on_0.2.txt", "{:.2f} {}".format(min_loss, min_loss_on_iteration))

Example 25
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier  # the old sklearn.ensemble.gradient_boosting path was removed
from numpy.ma.testutils import assert_array_almost_equal

# Create some data
m = 10000
X = np.random.normal(size=(m, 10))
thresh = np.random.normal(size=10)
X_transformed = X * (X > thresh)
beta = np.random.normal(size=10)
y = (np.dot(X_transformed, beta) + np.random.normal(size=m)) > 0

# Train a gradient boosting classifier
model = GradientBoostingClassifier()
model.fit(X, y)
print(model.score(X, y))

# Inspect
pred = model.predict_proba(X)

approx = model.loss_._score_to_proba(
    model.learning_rate *
    sum(map(lambda est: est.predict(X), model.estimators_[:, 0])) +
    np.ravel(model.init_.predict(X)))

assert_array_almost_equal(pred, approx)
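The check above reaches into private attributes (loss_._score_to_proba, init_, estimators_) that later scikit-learn releases renamed or removed. The public route to the same quantity is staged_predict_proba: the probabilities after the final boosting stage match predict_proba. A short sketch using that API instead:

from collections import deque

# keep only the last element of the staged generator == full-model predict_proba
last_stage_proba = deque(model.staged_predict_proba(X), maxlen=1).pop()
assert_array_almost_equal(model.predict_proba(X), last_stage_proba)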
Example 26
def gbdt_lr_train(libsvmFileName):

    # load the sample data (libsvm format)
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.3,
                                                        random_state=42)
    print("train data shape: ", X_train.shape)

    # model training
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)
    gbdt.fit(X_train, y_train)

    # predict and evaluate AUC
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # train LR on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)  # predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on raw features: %.5f' % lr_test_auc)

    # encode the raw features as GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    print("gbdt leaves shape: ", X_train_leaves.shape)
    # count the number of distinct leaves each tree produces
    for i in range(0, len(X_train_leaves[0])):
        cateMap = {}
        for j in range(0, len(X_train_leaves)):
            cateMap[X_train_leaves[j][i]] = 0
        print("F%d: %d" % (i, len(cateMap)))

    # one-hot encode all leaf-index features
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder(sparse=False, categories='auto')
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print("gbdt oneHot shape: ", X_trans.shape)
    print("oneHot leaves: ", X_trans[0])
    # define the LR model
    lr = LogisticRegression()
    # train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], y_train)
    # predict and evaluate AUC
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine encoded and raw features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print("gbdt leaves cross", X_train_ext.shape)
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)
Example 27
from sklearn.preprocessing import LabelEncoder

def data_process(data):
    encoder = LabelEncoder()
    data['V2'] = encoder.fit_transform(data['V2'])
    data['V4'] = encoder.fit_transform(data['V4'])
    data['V5'] = encoder.fit_transform(data['V5'])

data_process(train_agg)
data_process(test_agg)

# free intermediates created earlier in the original script
del a,gp,gp_day_mean,gp_day_var,gp1,gp2,gp3,gp4,index1,l,m1,m2,m3,merge_log,ss,ss2,t1,t2,t3,train_flg
# build new features with GBDT
gbdt = GradientBoostingClassifier(loss='exponential', learning_rate=0.12, n_estimators=60, max_depth=3, random_state=42, max_features=None)
X_train = train_agg.drop(['USRID', 'FLAG'], axis=1)
y_train = train_agg['FLAG']
# train
gbdt.fit(X_train, y_train)
# encode the raw features as GBDT leaf indices
X_train_leaves = gbdt.apply(X_train)[:, :, 0]
X_test_leaves = gbdt.apply(test_agg.drop('USRID', axis=1))[:, :, 0]
(train_rows, cols) = X_train_leaves.shape
onehot = OneHotEncoder()
X_trans = onehot.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))

# combine features
X_train_agg = DataFrame(hstack([X_trans[:train_rows, :], train_agg]).toarray())
X_test_agg = DataFrame(hstack([X_trans[train_rows:, :], test_agg]).toarray())
X_train_agg.rename(columns={494: "USRID", 495: "FLAG"}, inplace=True)
X_test_agg.rename(columns={494: "USRID"}, inplace=True)

# train and test sets
Example 28
def inject_bag_of_words(X, features):
    X_pick = np.zeros((features.shape[0], 112))

    for i, match_id in enumerate(features.index):
        for p in range(5):
            # .ix was removed from pandas; .loc is the label-based equivalent here
            X_pick[i, features.loc[match_id, 'r{}_hero'.format(p+1)]-1] = 1
            X_pick[i, features.loc[match_id, 'd{}_hero'.format(p+1)]-1] = -1

    return np.concatenate([X, X_pick], axis=1)


X = inject_bag_of_words(X, features)
clf, scaler = train_logistic(X, y, 'With Bag of Words')

# final test proba
clf.fit(scaler.transform(X), y)


test_features = pandas.read_csv('features_test.csv', index_col='match_id')

X_test = test_features.drop(category_features, axis=1)
X_test = X_test.fillna(0)
X_test = inject_bag_of_words(X_test, test_features)

X_test = scaler.transform(X_test)

proba = clf.predict_proba(X_test)[:, 1]

print("Proba min: {}".format(proba.min()))
print("Proba max: {}".format(proba.max()))
Example 29
# beginning truncated in the source: this block evidently runs inside a loop over
# feature columns f, imputing zeros with the per-outcome median
    temp = groups[f].median()
    for i in range(0, 768):
        if (dataset.loc[i, f] == 0) & (dataset.loc[i, 'outcome'] == 0):
            dataset.loc[i, f] = temp[0]
        if (dataset.loc[i, f] == 0) & (dataset.loc[i, 'outcome'] == 1):
            dataset.loc[i, f] = temp[1]


dataset = dataset.values
X = dataset[:,0:len(dataset[0]) -1]
Y = dataset[:, (len(dataset[0])-1)]


# hyper-parameter sweep for the gradient boosting classifier
# (the original comment said "decision tree", but the model is a GBM)
rows = []
for feats in range(2, 7):
    for dept in range(2, 6):
        for split in range(5, 40, 5):
            for leaf in range(7, 10):
                acc = 0  # reset per parameter combination (the original accumulated across combinations)
                for i in range(20):
                    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
                    classifier = GradientBoostingClassifier(min_samples_split=split, max_depth=dept, max_features=feats, max_leaf_nodes=leaf)
                    classifier.fit(X_train, Y_train)
                    acc = acc + classifier.score(X_test, Y_test)
                acc = acc / 20
                print('feats:', feats, 'Depth:', dept, 'split:', split, 'max_leaf:', leaf, 'acc:', acc * 100)
                rows.append({'feats': feats, 'depth': dept, 'split': split, 'max_leaf': leaf, 'acc': acc})
# DataFrame.append was removed in pandas 2.0; build the frame once instead
df = pd.DataFrame(rows, columns=['feats', 'depth', 'split', 'max_leaf', 'acc'])
df.to_csv('xgboost.csv', sep=',')
Example 30
def gbdt_lr_train(train, test, gbdt_features, lr_features, target, name, isOnline):

    # define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=20, max_depth=3, verbose=0, max_features=0.3)
    # n_estimators=20, max_depth=3, verbose=0, max_features=0.5

    # train
    gbdt.fit(train[gbdt_features], train[target])

    # predict and evaluate log-loss
    if not isOnline:
        y_pred_gbdt = gbdt.predict_proba(test[gbdt_features])[:, 1]
        gbdt_test_log_loss = log_loss(test[target], y_pred_gbdt)
        print('gbdt log_loss: %.5f' % gbdt_test_log_loss)
    else:
        y_pred_gbdt = gbdt.predict_proba(train[gbdt_features].tail(57562))[:, 1]
        gbdt_test_log_loss = log_loss(train[target].tail(57562), y_pred_gbdt)
        print('gbdt log_loss: %.5f' % gbdt_test_log_loss)

    # encode the raw features as GBDT leaf indices
    X_train_leaves = gbdt.apply(train[gbdt_features])[:, :, 0]
    X_test_leaves = gbdt.apply(test[gbdt_features])[:, :, 0]

    # one-hot encode all leaf-index features
    (train_rows, cols) = X_train_leaves.shape

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # define the LR model
    lr = LogisticRegression()

    # train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], train[target])

    # predict and evaluate log-loss
    if not isOnline:
        y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        gbdt_lr_test_log_loss1 = log_loss(test[target], y_pred_gbdtlr1)
        print('LR log_loss on GBDT-encoded features: %.5f' % gbdt_lr_test_log_loss1)
    else:
        print('Online')

    # define the LR model
    lr = LogisticRegression()

    # combine features
    X_train_ext = hstack([X_trans[:train_rows, :], train[lr_features]])
    X_test_ext = hstack([X_trans[train_rows:, :], test[lr_features]])

    print("gbdt output", X_trans[:train_rows, :].shape)
    print("input", train[lr_features].shape)
    print(X_train_ext.shape)

    # train LR on the combined features
    lr.fit(X_train_ext, train[target])

    # predict and evaluate log-loss
    if not isOnline:
        y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
        gbdt_lr_test_log_loss2 = log_loss(test[target], y_pred_gbdtlr2)
        print('LR log_loss on combined features: %.5f' % gbdt_lr_test_log_loss2)
    else:
        print('Online')

        test['predicted_score'] = lr.predict_proba(X_test_ext)[:, 1]
        print(test['predicted_score'].head(5))
        print(len(test))
        test[['instance_id', 'predicted_score']].to_csv('../baseline_' + name + '.csv', index=False, sep=' ')  # save the online submission result
        print('Saved result success!')
Example 31
class Predict():
    def __init__(self):
        self.gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
        self.lr = LogisticRegression(n_jobs=-1)
        Train_tab = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0,
                     0, 0, 0, 0,
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0,
                     0, 0, 0, 0,
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
                     0,
                     0, 0, 0, 0,
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0,
                     0, 0, 0, 0,
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        Train_libsvm = [[1, 1, 1, 1, 1, 1], [2, 2, 2, 1, 2, 2], [1, 1, 1, 1, 3, 1], [2, 2, 2, 1, 4, 1],
                        [3, 3, 2, 1, 5, 2],
                        [2, 2, 2, 1, 6, 1], [4, 4, 3, 1, 6, 2], [5, 5, 3, 1, 7, 2], [2, 2, 2, 1, 8, 1],
                        [2, 2, 2, 1, 6, 1],
                        [2, 2, 2, 1, 9, 2], [6, 6, 2, 1, 8, 3], [1, 1, 1, 1, 10, 1], [2, 2, 2, 1, 4, 2],
                        [2, 2, 2, 1, 4, 1],
                        [2, 2, 2, 1, 10, 2], [1, 1, 1, 1, 8, 1], [1, 1, 1, 1, 11, 1], [2, 2, 2, 1, 12, 1],
                        [2, 2, 2, 1, 2, 1],
                        [5, 5, 3, 1, 13, 2], [2, 2, 2, 1, 14, 1], [7, 7, 2, 1, 15, 2], [1, 1, 1, 1, 16, 1],
                        [1, 1, 1, 1, 8, 1],
                        [1, 1, 1, 1, 17, 1], [5, 5, 3, 1, 18, 2], [2, 2, 2, 1, 19, 2], [1, 1, 1, 1, 2, 1],
                        [2, 2, 2, 1, 20, 1],
                        [2, 2, 2, 1, 10, 1], [2, 2, 2, 1, 14, 2], [5, 5, 3, 1, 15, 2], [5, 5, 3, 1, 21, 2],
                        [2, 2, 2, 1, 21, 1],
                        [1, 1, 1, 1, 22, 1], [6, 6, 2, 1, 5, 2], [2, 2, 2, 1, 1, 2], [8, 8, 2, 1, 15, 3],
                        [4, 4, 3, 1, 23, 2],
                        [9, 9, 2, 2, 6, 2], [1, 1, 1, 1, 21, 1], [2, 2, 2, 1, 10, 2], [5, 5, 3, 1, 24, 2],
                        [2, 2, 2, 1, 20, 1],
                        [2, 2, 2, 1, 8, 1], [5, 5, 3, 1, 2, 2], [6, 6, 2, 1, 3, 3], [1, 1, 1, 1, 19, 1],
                        [2, 2, 2, 1, 12, 2],
                        [2, 2, 2, 1, 25, 1], [1, 1, 1, 1, 2, 1], [4, 4, 3, 1, 11, 2], [2, 2, 2, 1, 10, 1],
                        [1, 1, 1, 1, 21, 1],
                        [2, 2, 2, 1, 14, 2], [1, 1, 1, 1, 19, 1], [2, 2, 2, 1, 14, 1], [2, 2, 2, 1, 9, 1],
                        [2, 2, 2, 1, 20, 2],
                        [2, 2, 2, 1, 4, 2], [1, 1, 1, 1, 4, 1], [2, 2, 2, 1, 26, 1], [2, 2, 2, 1, 14, 1],
                        [2, 2, 2, 1, 4, 2],
                        [2, 2, 2, 1, 23, 1], [5, 5, 3, 1, 13, 2], [3, 3, 2, 1, 22, 2], [2, 2, 2, 1, 11, 2],
                        [2, 2, 2, 1, 1, 2],
                        [2, 2, 2, 1, 9, 1], [1, 1, 1, 1, 9, 1], [2, 2, 2, 1, 12, 2], [2, 2, 2, 1, 20, 1],
                        [2, 2, 2, 1, 1, 2],
                        [1, 1, 1, 1, 14, 1], [10, 10, 2, 1, 23, 3], [5, 5, 3, 1, 21, 2], [1, 1, 1, 1, 1, 1],
                        [2, 2, 2, 1, 19, 2],
                        [1, 1, 1, 1, 23, 1], [2, 2, 2, 1, 20, 1], [1, 1, 1, 1, 14, 1], [4, 4, 3, 1, 11, 2],
                        [2, 2, 2, 1, 19, 1],
                        [5, 5, 3, 1, 19, 2], [2, 2, 2, 1, 1, 2], [2, 2, 2, 1, 14, 1], [11, 11, 2, 1, 10, 1],
                        [2, 2, 2, 1, 14, 2],
                        [1, 1, 1, 1, 22, 1], [9, 9, 2, 2, 27, 2], [4, 4, 3, 1, 1, 2], [4, 4, 3, 1, 12, 2],
                        [2, 2, 2, 1, 6, 1],
                        [4, 4, 3, 1, 8, 2], [1, 1, 1, 1, 16, 1], [1, 1, 1, 1, 28, 1], [2, 2, 2, 1, 15, 2],
                        [1, 1, 1, 1, 3, 1],
                        [2, 2, 2, 1, 14, 1], [1, 1, 1, 1, 21, 1], [2, 2, 2, 1, 24, 2], [2, 2, 2, 1, 23, 1],
                        [2, 2, 2, 1, 8, 1],
                        [2, 2, 2, 1, 21, 2], [6, 6, 2, 1, 6, 2], [1, 1, 1, 1, 2, 1], [2, 2, 2, 1, 12, 1],
                        [5, 5, 3, 1, 23, 2],
                        [1, 1, 1, 1, 29, 1], [1, 1, 1, 1, 8, 1], [4, 4, 3, 1, 2, 2], [1, 1, 1, 1, 8, 1],
                        [1, 1, 1, 1, 30, 1],
                        [2, 2, 2, 1, 8, 1], [1, 1, 1, 1, 8, 1], [4, 4, 3, 1, 23, 2], [5, 5, 3, 1, 9, 2],
                        [4, 4, 3, 1, 1, 2],
                        [9, 9, 2, 2, 19, 2], [1, 1, 1, 1, 11, 1], [2, 2, 2, 1, 1, 2], [10, 10, 2, 1, 30, 1],
                        [9, 9, 2, 2, 24, 2],
                        [5, 5, 3, 1, 14, 2], [2, 2, 2, 1, 4, 1], [2, 2, 2, 1, 22, 2], [2, 2, 2, 1, 26, 1],
                        [2, 2, 2, 1, 14, 1],
                        [2, 2, 2, 1, 1, 1], [4, 4, 3, 1, 2, 2], [3, 3, 2, 1, 29, 2], [2, 2, 2, 1, 6, 2],
                        [2, 2, 2, 1, 9, 2],
                        [2, 2, 2, 1, 16, 2], [5, 5, 3, 1, 13, 2], [13, 13, 2, 1, 3, 2], [2, 2, 2, 1, 27, 1],
                        [2, 2, 2, 1, 1, 2],
                        [2, 2, 2, 1, 4, 1], [2, 2, 2, 1, 1, 2], [2, 2, 2, 1, 29, 2], [3, 3, 2, 1, 12, 2],
                        [2, 2, 2, 1, 2, 2],
                        [2, 2, 2, 1, 5, 1], [5, 5, 3, 1, 28, 2], [6, 6, 2, 1, 22, 3], [1, 1, 1, 1, 5, 1],
                        [1, 1, 1, 1, 2, 1],
                        [2, 2, 2, 1, 21, 2], [2, 2, 2, 1, 1, 1], [2, 2, 2, 1, 19, 1], [2, 2, 2, 1, 4, 1],
                        [4, 4, 3, 1, 11, 2],
                        [2, 2, 2, 1, 4, 2], [5, 5, 3, 1, 18, 2], [2, 2, 2, 1, 18, 1], [1, 1, 1, 1, 23, 1],
                        [9, 9, 2, 2, 25, 2],
                        [2, 2, 2, 1, 1, 2], [2, 2, 2, 1, 5, 1], [10, 10, 2, 1, 2, 3], [2, 2, 2, 1, 9, 2],
                        [2, 2, 2, 1, 14, 2],
                        [1, 1, 1, 1, 26, 1], [1, 1, 1, 1, 3, 1], [14, 14, 2, 1, 23, 2], [4, 4, 3, 1, 2, 2],
                        [2, 2, 2, 1, 23, 2]]
        self.gbdt_lr_train(Train_tab, Train_libsvm)

    def gbdt_lr_train(self, Train_tab, Train_libsvm):
        # load the sample data (note: X_all / y_all are loaded but unused below)
        X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(Train_libsvm, Train_tab, test_size=0.1, random_state=42)
        # fit the GBDT model
        self.gbdt.fit(X_train, y_train)
        # encode the raw features as GBDT leaf indices
        self.X_train_leaves = self.gbdt.apply(X_train)[:, :, 0]
        X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]
        # one-hot encode all leaf-index features
        (self.train_rows, cols) = self.X_train_leaves.shape
        gbdtenc = OneHotEncoder()
        X_trans = gbdtenc.fit_transform(np.concatenate((self.X_train_leaves, X_test_leaves), axis=0))
        X_train_ext = hstack([X_trans[:self.train_rows, :], X_train])
        # train LR on the combined features
        self.lr.fit(X_train_ext, y_train)

    def Predict(self, X_test):
        X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]
        gbdtenc = OneHotEncoder()
        self.X_trans = gbdtenc.fit_transform(np.concatenate((self.X_train_leaves, X_test_leaves), axis=0))
        X_test_ext = hstack([self.X_trans[self.train_rows:, :], X_test])
        y_pred_gbdtlr2 = self.lr.predict_proba(X_test_ext)[:, 1]
        # return the positive-class probabilities as a plain list
        return list(y_pred_gbdtlr2)
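One caveat with Predict(): it refits a fresh OneHotEncoder on every call by re-stacking the stored training leaves, which only works as long as those leaves are kept around. A common alternative (a sketch, not the author's code) is to fit the encoder once at training time and only transform afterwards:

from sklearn.preprocessing import OneHotEncoder

def fit_leaf_encoder(gbdt, X_train):
    # fit the one-hot encoder once, on the training leaves only
    leaves = gbdt.apply(X_train)[:, :, 0]
    enc = OneHotEncoder(handle_unknown='ignore')   # tolerate unseen leaf ids
    return enc, enc.fit_transform(leaves)

def encode_new_rows(gbdt, enc, X_new):
    # reuse the fitted encoder at predict time; no refit, no stored train data needed
    return enc.transform(gbdt.apply(X_new)[:, :, 0])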
Example 32
# beginning truncated in the source: SVM was fitted earlier in the original script
y_pred1 = SVM.predict(X_test)
print(classification_report(y_test, y_pred1))
print(accuracy_score(y_test, y_pred1))

## Random Forest Classifier
print("RandomForrest Classifier Results are as following")
rfc = RandomForestClassifier(n_estimators=200, max_depth=4)
rfc.fit(X_train, y_train)
y_pred2 = rfc.predict(X_test)
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))

# Neural Network
print("Neural Network Classifier Results are as following")

mlp = MLPClassifier(max_iter=500)
mlp.fit(X_train, y_train)
y_pred3 = mlp.predict(X_test)
print(classification_report(y_test, y_pred3))
print(accuracy_score(y_test, y_pred3))

# GradientBoosting Classifier

print("GradientBoosting Classifier Results are as following")

grd = GradientBoostingClassifier()
grd.fit(X_train, y_train)
y_pred4 = grd.predict(X_test)
print(classification_report(y_test, y_pred4))
print(accuracy_score(y_test, y_pred4))
Example 33
# <codecell>

X, y = shuffle(df2[possible_features], df2.bad)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# <codecell>

# note: 'bdeviance' and LogOddsEstimator come from an old scikit-learn release;
# recent versions spell the binomial deviance loss 'log_loss'
params = {'init': LogOddsEstimator(), 'n_estimators': 5, 'max_depth': 6, 'learning_rate': 0.1, 'loss': 'bdeviance'}
clf = GradientBoostingClassifier(**params)

# <codecell>

clf = clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

# <codecell>

clf.feature_importances_

# <codecell>

print "Mean Squared Error"
mse = mean_squared_error(y_test, predicted)
print("MSE: %.4f" % mse)
print 

# <codecell>
Example 34
print('Best parameters: {}'.format(grid_search.best_params_))

# In[192]:

y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(accuracy_rf)

# In[193]:

######## Trying Gradient Boost ######

# In[194]:

gbc = GradientBoostingClassifier(n_estimators=100)
gbc.fit(X_train, y_train)

# In[195]:

y_pred_gbc = gbc.predict(X_test)
accuracy_gbc = accuracy_score(y_test, y_pred_gbc)
print(accuracy_gbc)

# In[196]:

############## Model evaluation ##############

# In[197]:

confusion_matrix(y_test, y_pred_lr)
Example 35
def gbdt_lr_train():
    cv_lr_scores = []
    cv_lr_trans_scores = []
    cv_lr_trans_raw_scores = []
    cv_gbdt_scores = []

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    for train_index, valid_index in skf.split(X, y):
        X_train = X[train_index]
        X_valid = X[valid_index]
        y_train = y[train_index]
        y_valid = y[valid_index]

        # define the GBDT model
        gbdt = GradientBoostingClassifier(n_estimators=60, max_depth=3, verbose=0, max_features=0.5)
        # train
        gbdt.fit(X_train, y_train)
        y_pred_gbdt = gbdt.predict_proba(X_valid)[:, 1]
        gbdt_auc = roc_auc_score(y_valid, y_pred_gbdt)
        print('gbdt AUC on raw features: %.5f' % gbdt_auc)
        cv_gbdt_scores.append(gbdt_auc)

        # train LR on the raw features
        lr = LogisticRegression()
        lr.fit(X_train, y_train)  # predict and evaluate AUC
        y_pred_test = lr.predict_proba(X_valid)[:, 1]
        lr_valid_auc = roc_auc_score(y_valid, y_pred_test)
        print('LR AUC on raw features: %.5f' % lr_valid_auc)
        cv_lr_scores.append(lr_valid_auc)

        # encode the raw features as GBDT leaf indices
        X_train_leaves = gbdt.apply(X_train)[:, :, 0]
        X_valid_leaves = gbdt.apply(X_valid)[:, :, 0]

        # one-hot encode all leaf-index features
        (train_rows, cols) = X_train_leaves.shape

        gbdtenc = OneHotEncoder()
        X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_valid_leaves), axis=0))

        # define the LR model
        lr = LogisticRegression()
        # train LR on the GBDT-encoded features
        lr.fit(X_trans[:train_rows, :], y_train)
        # predict and evaluate AUC
        y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        gbdt_lr_auc1 = roc_auc_score(y_valid, y_pred_gbdtlr1)
        print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)
        cv_lr_trans_scores.append(gbdt_lr_auc1)

        # define the LR model
        lr = LogisticRegression(n_jobs=-1)
        # combine features
        X_train_ext = hstack([X_trans[:train_rows, :], X_train])
        X_valid_ext = hstack([X_trans[train_rows:, :], X_valid])

        print(X_train_ext.shape)
        # train LR on the combined features
        lr.fit(X_train_ext, y_train)

        # predict and evaluate AUC
        y_pred_gbdtlr2 = lr.predict_proba(X_valid_ext)[:, 1]
        gbdt_lr_auc2 = roc_auc_score(y_valid, y_pred_gbdtlr2)
        print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)
        cv_lr_trans_raw_scores.append(gbdt_lr_auc2)

    cv_lr = np.mean(cv_lr_scores)
    cv_lr_trans = np.mean(cv_lr_trans_scores)
    cv_lr_trans_raw = np.mean(cv_lr_trans_raw_scores)
    cv_gbdt = np.mean(cv_gbdt_scores)
    print("==" * 20)
    print("gbdt on raw features, cv_gbdt:", cv_gbdt)
    print("lr on raw features, cv_lr:", cv_lr)
    print("lr on gbdt-encoded features, cv_lr_trans:", cv_lr_trans)
    print("lr on gbdt-encoded plus raw features, cv_lr_trans_raw:", cv_lr_trans_raw)
Example 36
def trainModel(param, feat_folder, feat_name):
    # read data from folder
    print('now we read data from folder:%s' % (feat_folder))

    # start cv
    print('now we need to generate cross_validation')
    accuracy_cv = []

    for i in range(0, 2):
        print('this is the run:%d cross-validation' % (i + 1))
        testIndex = loadCVIndex("%s/test.run%d.txt" % ("../data/feat/combine", (i + 1)))
        # if we use xgboost to train the model, we need the svmlight format
        if param['task'] in ['regression']:
            # with xgb we dump the file per CV run, then read the data back
            train_data = xgb.DMatrix("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            valid_data = xgb.DMatrix("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            watchlist = [(train_data, 'train'), (valid_data, 'valid')]
            bst = xgb.train(param, train_data, int(param['num_round']), watchlist)
            pred = bst.predict(valid_data)

        elif param['task'] in ['clf_skl_lr']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            train_data = train_data.tocsr()
            test_data = test_data.tocsr()
            clf = LogisticRegression()
            clf.fit(train_data, train_label)
            pred = clf.predict(test_data)

        elif param['task'] == "reg_skl_rf":
            # regression with sklearn random forest regressor
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                       max_features=param['max_features'],
                                       n_jobs=param['n_jobs'],
                                       random_state=param['random_state'])
            # the original fit on test_label, whose length cannot match train_data; train_label is intended
            rf.fit(train_data, train_label)
            pred = rf.predict(test_data)

        elif param['task'] == "reg_skl_etr":
            # regression with sklearn extra trees regressor
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                      max_features=param['max_features'],
                                      n_jobs=param['n_jobs'],
                                      random_state=param['random_state'])
            etr.fit(train_data, train_label)
            pred = etr.predict(test_data)

        elif param['task'] in ['reg_skl_gbm']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            feat_names.remove('cid')
            gbm.fit(train_data, train_label)
            pred = gbm.predict(test_data)

        elif param['task'] in ['reg_skl_ridge']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            train_data = train_data.tocsr()
            test_data = test_data.tocsr()
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data, train_label)

            predraw = ridge.predict(test_data)
            print(predraw)
            predrank = predraw.argsort().argsort()
            trainIndex = loadCVIndex("%s/train.run%d.txt" % ("../data/feat/combine", (i + 1)))
            cdf = creatCDF(train, trainIndex)
            pred = getScore(predrank, cdf)
            print(pred)

        """
        elif param['task'] in ['regression']:



        elif param['task'] in ['reg_skl_gbm'] :
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            feat_names.remove('cid')
            gbm.fit(train_data[feat_names],train_data['cid'])
            pred = gbm.predict(valid_data[feat_names])
        elif param['task'] in ['reg_skl_ridge']:
            feat_names.remove('cid')
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data[feat_names],train_data['cid'])
            pred = ridge.predict(valid_data[feat_names])
        """
        # now we use the accuracy to evaluate the model
        acc = accuracy_model(pred, train.iloc[testIndex]['cid'])
        print("the model accuracy:%s" % (acc))
        accuracy_cv.append(acc)

    # aggregate the cross-validation statistics
    accuracy_cv_mean = np.mean(accuracy_cv)
    accuracy_cv_std = np.std(accuracy_cv)
    print('the mean cv accuracy: %.6f' % (accuracy_cv_mean))
    return {'loss': -accuracy_cv_mean, 'attachments': {'std': accuracy_cv_std}, 'status': STATUS_OK}
Example 37
# beginning truncated in the source: the tail of a `features = [...]` list follows
        'user_query_day_hour',
        'context_page_id',
        'hour',
        'shop_id',
        'shop_review_num_level',
        'shop_star_level',
        'shop_review_positive_rate',
        'shop_score_service',
        'shop_score_delivery',
        'shop_score_description',
    ]
    target = ['is_trade']

    X_train = train[features]
    X_test = test[features]
    Y_train = train[target]

    # define the GBDT model (parameters after tuning)
    gbdt = GradientBoostingClassifier(n_estimators=170,
                                      min_samples_split=3,
                                      min_samples_leaf=8)

    # train
    gbdt.fit(X_train, Y_train)
    # predict the positive-class probability
    Y_predict_gbdt = gbdt.predict_proba(X_test)[:, 1]

    pd.DataFrame({'instance_id': test['instance_id'], 'predicted_score': Y_predict_gbdt}). \
        to_csv('D:\kaggle\\alimm\\baseline_06.csv', index=False, sep=' ')
Example 38
ax = treeplot.randomforest(model_dt)
tree.plot_tree(model_dt)

# %% RandomForest EXAMPLE
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators=100,
                                  max_depth=2,
                                  random_state=0).fit(X, y)

ax = treeplot.randomforest(model_rf, export='png')
ax = treeplot.randomforest(model_rf, export='pdf')

# %% Gradientboosting example
from sklearn.ensemble import GradientBoostingClassifier  # the old sklearn.ensemble.gradient_boosting path was removed
gb = GradientBoostingClassifier()
model_gradientboost = gb.fit(X, y)

ax = treeplot.plot(model_gradientboost)

# %% XGBOOST EXAMPLE
import xgboost as xgb
model_xgb = xgb.XGBClassifier(n_estimators=100, max_depth=2,
                              random_state=0).fit(X, y)

ax = treeplot.plot(model_xgb)
ax = treeplot.xgboost(model_xgb, plottype='vertical')

# %% XGBOOST EXAMPLE
from xgboost import XGBClassifier
model_xgb = XGBClassifier(n_estimators=100, max_depth=2,
                          random_state=0).fit(X, y)