from sklearn.ensemble import GradientBoostingClassifier as GB  # assumed alias


class Boosting():
    '''Thin wrapper around sklearn's GradientBoostingClassifier that
    predicts a sparse matrix in row chunks to limit peak memory.'''
    def __init__(self):
        self.clf = GB()

    def fit(self, X, y):
        '''Fit the underlying classifier.

        :param X: training features
        :param y: training labels
        :return: None
        '''
        self.clf.fit(X, y)

    def predict(self, X):
        '''Predict labels for X, densifying roughly sqrt(n) row chunks one
        at a time so the full matrix is never converted at once.

        :param X: features to predict (sparse matrix)
        :return: list of predicted labels
        '''
        m = int(X.shape[0] ** 0.5)
        pred = []
        for i in range(m):
            chunk = X[i * X.shape[0] // m:(i + 1) * X.shape[0] // m]
            pred.extend(self.clf.predict(chunk.toarray()))
        return pred
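
A minimal usage sketch, assuming GB aliases sklearn.ensemble.GradientBoostingClassifier and that X is a scipy sparse matrix (the chunked .toarray() calls above only make sense for sparse input):

# Hypothetical usage of the Boosting wrapper above; sparse input is an
# assumption implied by the .toarray() calls in predict().
import numpy as np
from scipy.sparse import random as sparse_random

X_train = sparse_random(1000, 20, density=0.1, format='csr', random_state=0)
y_train = np.random.RandomState(0).randint(0, 2, 1000)

booster = Boosting()
booster.fit(X_train, y_train)     # GradientBoostingClassifier accepts sparse X
preds = booster.predict(X_train)  # densified chunk by chunk to bound memory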
Example 2
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    data_test = param["data_test"]
    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df_test = pd.read_csv(data_test, index_col=idx)

    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=50,
        learning_rate=0.3,
    )
    clf.fit(X, y)
    y_pred = clf.predict(X_guest)
    acc = accuracy_score(y_guest, y_pred)
    result = {"accuracy": acc}
    print(result)
    return {}, result
Example 3
class GradientBoostingClassifierImpl():
    def __init__(self,
                 loss='deviance',
                 learning_rate=0.1,
                 n_estimators=100,
                 subsample=1.0,
                 criterion='friedman_mse',
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0,
                 max_depth=3,
                 min_impurity_decrease=0.0,
                 min_impurity_split=None,
                 init=None,
                 random_state=None,
                 max_features=None,
                 verbose=0,
                 max_leaf_nodes=None,
                 warm_start=False,
                 presort='auto',
                 validation_fraction=0.1,
                 n_iter_no_change=None,
                 tol=0.0001):
        self._hyperparams = {
            'loss': loss,
            'learning_rate': learning_rate,
            'n_estimators': n_estimators,
            'subsample': subsample,
            'criterion': criterion,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_depth': max_depth,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'init': init,
            'random_state': random_state,
            'max_features': max_features,
            'verbose': verbose,
            'max_leaf_nodes': max_leaf_nodes,
            'warm_start': warm_start,
            'presort': presort,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'tol': tol
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
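
A brief usage sketch; it assumes SKLModel aliases sklearn.ensemble.GradientBoostingClassifier, as the hyperparameter list implies. Note the design choice that the model is only constructed inside fit, so the stored hyperparameters can still be adjusted beforehand:

# Hypothetical usage of the wrapper above; SKLModel is assumed to be
# sklearn.ensemble.GradientBoostingClassifier.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, random_state=0)
impl = GradientBoostingClassifierImpl(n_estimators=50, max_depth=2)
impl._hyperparams['learning_rate'] = 0.05  # tweak before the model exists
probs = impl.fit(X, y).predict_proba(X)
print(probs.shape)  # (200, 2)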
Example 4
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]

    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df = df_guest.join(df_host, rsuffix='host')
    y = df[label_name]
    X = df.drop(label_name, axis=1)

    clf = GradientBoostingClassifier(
        random_state=0,
        n_estimators=120 if 'epsilon' in data_guest else 50,
        learning_rate=0.1)
    clf.fit(X, y)

    y_prob = clf.predict_proba(X)[:, 1]

    try:
        auc_score = roc_auc_score(y, y_prob)
    except ValueError:
        print("no auc score available")
        return

    result = {"auc": auc_score}
    print(result)
    return {}, result
Example 5

def gradientBoostingClassifier(X_train, y_train, X_dev, y_dev):
    print("\nPerforming Gradient Boosting.")
    gb = GradientBoostingClassifier(n_estimators=50,
                                    learning_rate=0.25,
                                    max_depth=5,
                                    random_state=0)
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_dev)
    accuracy = np.mean(y_dev == y_pred)
    print("Accuracy", accuracy)
    return gb, accuracy
Example 6
class Boosting():
    # TODO: documentation
    def __init__(self):
        self.clf = GB()

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        m = int(X.shape[0] ** 0.5)
        pred = []
        for i in range(m):
            chunk = X[i * X.shape[0] // m:(i + 1) * X.shape[0] // m]
            pred.extend(self.clf.predict(chunk.toarray()))
        return pred
Example 7
def trainGBT(requestsQ, responsesQ):
    while True:
        args = requestsQ.get()
        if args[0] == 'KILL':
            break

        vectors = args[1]
        # expected in the order of learningRate, maxTrees, minSplitSize, maxDepth
        hyperparams = args[2]

        model = GradientBoostingClassifier(learning_rate=hyperparams[0],
                                           n_estimators=hyperparams[1],
                                           min_samples_split=hyperparams[2],
                                           max_depth=hyperparams[3])

        model.fit(vectors['Xtrain'], vectors['Ytrain'])
        score = accuracy_score(vectors['Ytest'], model.predict(vectors['Xtest']))
        responsesQ.put((model, score), True)

    return 0
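
A minimal sketch of the producer side of this queue protocol; the 'KILL' sentinel and message layout follow the worker above, while the multiprocessing setup and the pre-split arrays (X_train and friends) are assumptions:

# Hypothetical driver for trainGBT; X_train/y_train/X_test/y_test are
# assumed to exist already.
from multiprocessing import Process, Queue

requestsQ, responsesQ = Queue(), Queue()
worker = Process(target=trainGBT, args=(requestsQ, responsesQ))
worker.start()

vectors = {'Xtrain': X_train, 'Ytrain': y_train,
           'Xtest': X_test, 'Ytest': y_test}
hyperparams = [0.1, 100, 2, 3]  # learningRate, maxTrees, minSplitSize, maxDepth
requestsQ.put(('TRAIN', vectors, hyperparams))

model, score = responsesQ.get()  # blocks until the worker responds
requestsQ.put(('KILL',))
worker.join()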
Example 8
def main(config="../../config.yaml", param="./gbdt_config_binary.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)

    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest),
                           index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host),
                          index_col=idx)
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=120 if 'epsilon' in data_guest else 50, learning_rate=0.1)
    clf.fit(X, y)
    y_prob = clf.predict_proba(X_guest)[:, 1]

    try:
        auc_score = roc_auc_score(y_guest, y_prob)
    except ValueError:
        print("no auc score available")
        return

    result = {"auc": auc_score}
    import time
    print(result)
    print(data_guest)
    time.sleep(3)
    return {}, result
Example 9
def apply_gradient_boosting(X_train_preprocessed, X_test_preprocessed, y_train,
                            y_test):

    # TODO: test hyperparameters and add cross-validation

    print('Applying Gradient Boosting')

    # Training the classifier
    classifier = GradientBoostingClassifier(n_estimators=100)
    classifier = classifier.fit(X_train_preprocessed, y_train)

    # Testing the classifier on test data
    y_test_pred = classifier.predict(X_test_preprocessed)

    # Compute accuracy score
    acc = accuracy_score(y_test, y_test_pred, normalize=True)

    print('The accuracy achieved by the Gradient Boosting Classifier model is:', acc)

    return classifier, acc
Example 10
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)

    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest),
                           index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host),
                          index_col=idx)
    df = df_guest.join(df_host, rsuffix='host')
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    clf = GradientBoostingClassifier(random_state=0,
                                     n_estimators=50,
                                     learning_rate=0.3)
    clf.fit(X, y)
    y_pred = clf.predict(X)

    try:
        auc_score = roc_auc_score(y, y_pred)
    except ValueError:
        print("no auc score available")

    acc = accuracy_score(y, y_pred)
    result = {"accuracy": acc}
    print('multi result', result)
    return {}, result
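
For reference, roc_auc_score(y, y_pred) raises on a multi-class label, which is why the except branch fires above; a hedged sketch of a working multi-class AUC (scikit-learn >= 0.22), using class probabilities instead of hard predictions:

# Sketch: multi-class AUC needs per-class probabilities.
# Assumes clf, X, y from the function above.
y_score = clf.predict_proba(X)
auc_ovr = roc_auc_score(y, y_score, multi_class='ovr')
print({"auc_ovr": auc_ovr})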
Example 11
def main(config="../../config.yaml", param="./gbdt_config_multi.yaml"):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)

    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # prepare data
    df_guest = pd.read_csv(os.path.join(data_base_dir, data_guest),
                           index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, data_host),
                          index_col=idx)

    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=50,
        learning_rate=0.3,
    )
    clf.fit(X, y)
    y_pred = clf.predict(X_guest)
    acc = accuracy_score(y_guest, y_pred)
    result = {"accuracy": acc}
    print(result)
    return {}, result
Example 12
def main(param=""):
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    data_test = param["data_test"]

    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]
    clf = GradientBoostingClassifier(
        n_estimators=120 if 'epsilon' in data_guest else 50, learning_rate=0.1)
    clf.fit(X, y)
    y_prob = clf.predict_proba(X_guest)[:, 1]

    try:
        auc_score = roc_auc_score(y_guest, y_prob)
    except ValueError:
        print("no auc score available")
        return

    result = {"auc": auc_score}
    import time
    print(result)
    print(data_guest)
    time.sleep(3)
    return {}, result
Example 13
normal = y[:, 3]
pod = y[:, 4]
smurf = y[:, 5]
teardrop = y[:, 6]

print(x.shape)  # np.shape gives (rows, columns)
print(normal.shape)

y = teardrop  # predict only the teardrop class for now

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=0)  # hold out 40% for testing, with a fixed seed
print('data load finish.....')

print(np.sum(y_train))
print(np.sum(y_test))  # no cross-validation here

clf = GradientBoostingClassifier(n_estimators=50,
                                 learning_rate=0.1,
                                 max_depth=5,
                                 verbose=1)
# GBDT setup: too many boosting iterations overfit, too few underfit; also
# set the learning rate, maximum tree depth, and verbose logging
clf.fit(X_train, y_train)  # train
y_ = clf.predict(X_test)  # predict

score = f1_score(y_test, y_)  # F1 score on the test set

print(score)

joblib.dump(clf, 'model/teardrop_clf.m')
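
The matching load step, sketched assuming the same path and environment are available at inference time:

# Reload the persisted GBDT and reuse it on the held-out split.
clf2 = joblib.load('model/teardrop_clf.m')
print(f1_score(y_test, clf2.predict(X_test)))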
Example 14
test['Embarked'] = lbd2.transform(test['Embarked'])

#label encode categorical variables
lbd3 = LabelEncoder()
data['Sex'] = lbd3.fit_transform(data['Sex'])
test['Sex'] = lbd3.transform(test['Sex'])

#add total family members
data['SibSp'] = data['SibSp'] + data['Parch'] + 1
test['SibSp'] = test['SibSp'] + test['Parch'] + 1

#perform standard scaling
for x in ['Age', 'SibSp', 'Parch', 'Fare']:
    StndSc = StandardScaler()
    data[x] = StndSc.fit_transform(data[x].values.reshape(-1, 1))
    test[x] = StndSc.transform(test[x].values.reshape(-1, 1))

X = data.iloc[:, 1:]
y = data.iloc[:, 0]

#train the gradient boosted classifier
gdb = GradientBoostingClassifier(n_estimators=250, max_depth=3)
gdb.fit(X, y)

#draw predictions
pred = gdb.predict(test)

#configuring the submission files
df = pd.DataFrame({'PassengerId': Id, 'Survived': pred})
df.to_csv("H://Submissions_Titanic.csv", index=False)
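
As a design note on the per-column loop above: StandardScaler scales each column independently, so the four columns can be fitted in one shot. A minimal equivalent sketch, assuming the same frames:

# Equivalent one-shot scaling; same column names as above.
num_cols = ['Age', 'SibSp', 'Parch', 'Fare']
scaler = StandardScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])
test[num_cols] = scaler.transform(test[num_cols])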
Example 15
                           n_informative=18,
                           n_redundant=2,
                           n_classes=2,
                           n_clusters_per_class=3,
                           random_state=2017)

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# train directly on the original features, without generating new ones
clf = GradientBoostingClassifier(n_estimators=50)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
print("Original featrues")
print("GBDT_ACC: {:.6f}".format(acc))
print("GBDT_AUC: {:.6f}".format(auc))

# new features: apply() returns each sample's leaf index in every tree
X_train_leaves = clf.apply(X_train)[:, :, 0]
X_test_leaves = clf.apply(X_test)[:, :, 0]

# stack X_train_leaves and X_test_leaves along axis=0, then one-hot encode
All_leaves = np.r_[X_train_leaves, X_test_leaves]

# the leaf-index columns are not 0/1 binary features, so OneHotEncoder is required
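
The snippet breaks off at the encoding step; a sketch of the usual continuation of this GBDT-leaf pipeline, assuming the variables above. The logistic-regression stage is the conventional follow-up and is not part of the original:

# One-hot encode the leaf indices, then feed them to a logistic regression.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
enc.fit(All_leaves)  # fit on train+test leaves so all indices are known
X_train_enc = enc.transform(X_train_leaves)
X_test_enc = enc.transform(X_test_leaves)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_enc, y_train)
y_prob_lr = lr.predict_proba(X_test_enc)[:, 1]
print("GBDT+LR_AUC: {:.6f}".format(roc_auc_score(y_test, y_prob_lr)))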
Example 16
y_pred1 = SVM.predict(X_test)
print(classification_report(y_test, y_pred1))
print(accuracy_score(y_test, y_pred1))

## Random Forest Classifier
print("RandomForrest Classifier Results are as following")
rfc = RandomForestClassifier(n_estimators=200, max_depth=4)
rfc.fit(X_train, y_train)
y_pred2 = rfc.predict(X_test)
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))

# Neural Network
print("Neural Network Classifier Results are as following")

mlp = MLPClassifier(max_iter=500)
mlp.fit(X_train, y_train)
y_pred3 = mlp.predict(X_test)
print(classification_report(y_test, y_pred3))
print(accuracy_score(y_test, y_pred3))

# GradientBoosting Classifier

print("GradientBoosting Classifier Results are as following")

grd = GradientBoostingClassifier()
grd.fit(X_train, y_train)
y_pred4 = grd.predict(X_test)
print(classification_report(y_test, y_pred4))
print(accuracy_score(y_test, y_pred4))
Example 17
# current best: GBC gradient boosting
############################################################
# visualize the comparison results after standardization
_Fig = plt.figure()
_Fig.suptitle(t="STANDARDSCALED ALGORITHM COMPARISON")
_Ax = _Fig.add_subplot(111)
plt.boxplot(x=_STANDARDSCALED_ALGORITHM_CMP_RESULT_LIST)
_Ax.set_xticklabels(labels=_STANDARDSCALED_MODELS.keys())
plt.show()
################################################################################
# GBC predictions after standardization
_STANDARDSCALED_GBC_MODEL = GBC()
_STANDARDSCALED_GBC_SCALER = preprocessing.StandardScaler().fit(X=_X_TRAIN)
_STANDARDSCALED_GBC_MODEL.fit(
    X=_STANDARDSCALED_GBC_SCALER.transform(X=_X_TRAIN), y=_Y_TRAIN)
_STANDARDSCALED_GBC_PREDICTIONS = _STANDARDSCALED_GBC_MODEL.predict(
    X=_STANDARDSCALED_GBC_SCALER.transform(X=_X_VAL))
print(
    "GBC predictions after standardization:\n",
    " " * 4,
    "ACCURACY_SCORE:\n",
    " " * 8,
    metrics.accuracy_score(y_true=_Y_VAL,
                           y_pred=_STANDARDSCALED_GBC_PREDICTIONS),
    "\n",
    " " * 4,
    "CONFUSION_MATRIX:\n",
    metrics.confusion_matrix(y_true=_Y_VAL,
                             y_pred=_STANDARDSCALED_GBC_PREDICTIONS),
    "\n",
)

Example 18
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(accuracy_rf)

# In[193]:

######## Trying Gradient Boost ######

# In[194]:

gbc = GradientBoostingClassifier(n_estimators=100)
gbc.fit(X_train, y_train)

# In[195]:

y_pred_gbc = gbc.predict(X_test)
accuracy_gbc = accuracy_score(y_test, y_pred_gbc)
print(accuracy_gbc)

# In[196]:

############## Model evaluation ##############

# In[197]:

confusion_matrix(y_test, y_pred_lr)

# In[198]:

confusion_matrix(y_test, y_pred_rf)
Example 19

# y_hat = rfc.predict(test_x) # get output for predicted test_x data
#
# # Format to correct output CSV file
# y_hat_cols = np.reshape(y_hat, (y_hat.shape[0], 1))
# index = np.arange(y_hat.shape[0]).reshape((y_hat.shape[0], 1))
# csv_output = np.concatenate((index, y_hat_cols), axis=1)
# np.savetxt("output_predictions.csv", csv_output, fmt="%s", delimiter=',')

#################### SVM #########################
# svc = SVC(gamma='auto')
# svc.fit(train_x, train_y)
# y_hat = svc.predict(test_x)
#
# # Format to correct output CSV file
# y_hat_cols = np.reshape(y_hat, (y_hat.shape[0], 1))
# index = np.arange(y_hat.shape[0]).reshape((y_hat.shape[0], 1))
# csv_output = np.concatenate((index, y_hat_cols), axis=1)
# np.savetxt("output_predictions.csv", csv_output, fmt="%s", delimiter=',')

#################### GBC #########################
gbc = GBC(learning_rate=0.2, n_estimators=200)
gbc.fit(train_x, train_y)
y_hat = gbc.predict(test_x)

# Format to correct output CSV file
y_hat_cols = np.reshape(y_hat, (y_hat.shape[0], 1))
index = np.arange(y_hat.shape[0]).reshape((y_hat.shape[0], 1))
csv_output = np.concatenate((index, y_hat_cols), axis=1)
np.savetxt("output_predictions.csv", csv_output, fmt="%s", delimiter=',')
print("DONEEEE !!!!!!!")
    trainData = dataTrain.values[:, 1:]
    trainLabel = dataTrain.values[:, 0]
    preData = dataPre.values[:, :]
    return trainData, trainLabel, preData


# Data
X, y, _ = opencsv()
#X = X[:int(0.2*len(X)), :]
#y = y[:len(X)]
splitRatio = 0.9
X_train = X[:int(splitRatio * len(X)), :]
y_train = y[:int(splitRatio * len(X))]
X_test = X[int(splitRatio * len(X)):, :]
y_test = y[int(splitRatio * len(X)):]

print('Data split')

#param_search_n_estimators = {'n_estimators':range(20, 300, 20)}

#gsearch_gbct = GridSearchCV(GradientBoostingClassifier(), param_grid=param_search_n_estimators, scoring='accuracy', iid=False, cv=5)
gbct = GradientBoostingClassifier(n_estimators=200, subsample=0.1)
gbct.fit(X_train, y_train)

print('Fit finished')
#print(gsearch_gbct.grid_scores_, gsearch_gbct.best_params_, gsearch_gbct.best_score_)

y_pre = gbct.predict(X_test)
print(accuracy_score(y_pre, y_test))

print('Time cost: ', time.clock() - c_time)
Example 21
def trainModel(param, feat_folder, feat_name):
    # read data from folder
    print('now we read data from folder: %s' % feat_folder)

    # start cv
    print('now we need to generate cross_validation')
    accuracy_cv = []

    for i in range(0, 2):
        print('this is run %d of the cross-validation' % (i + 1))
        testIndex = loadCVIndex("%s/test.run%d.txt"%("../data/feat/combine",(i+1)))
        # if we use xgboost to train the model, we need svmlight format
        if param['task'] in ['regression']:
            # with xgb we dump the CV files and then read the data back
            train_data = xgb.DMatrix("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            valid_data = xgb.DMatrix("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            watchlist = [(train_data,'train'),(valid_data,'valid')]
            bst = xgb.train(param,train_data,int(param['num_round']),watchlist)
            pred = bst.predict(valid_data)
        
        elif param['task'] in ['clf_skl_lr']:
            train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            train_data  = train_data.tocsr()
            test_data = test_data.tocsr()
            clf = LogisticRegression()
            clf.fit(train_data,train_label)
            pred = clf.predict(test_data)
        
        elif param['task'] == "reg_skl_rf":
                    ## regression with sklearn random forest regressor
                    train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
                    test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
                    rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                               max_features=param['max_features'],
                                               n_jobs=param['n_jobs'],
                                               random_state=param['random_state'])
                    rf.fit(train_data, test_label)
                    pred = rf.predict(test_data)
        
        elif param['task'] == "reg_skl_etr":
                    ## regression with sklearn extra trees regressor
                    train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
                    test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
                    etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                              max_features=param['max_features'],
                                              n_jobs=param['n_jobs'],
                                              random_state=param['random_state'])
                    etr.fit(train_data,test_label)
                    pred = etr.predict(test_data)
                    
        elif param['task'] in ['reg_skl_gbm'] :
            train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            gbm.fit(train_data, train_label)
            pred = gbm.predict(test_data)
        
        elif param['task'] in ['reg_skl_ridge']:
            train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            train_data  = train_data.tocsr()
            test_data = test_data.tocsr()
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data,train_label)
            
            predraw = ridge.predict(test_data)
            print(predraw)
            predrank = predraw.argsort().argsort()
            trainIndex = loadCVIndex("%s/train.run%d.txt"%("../data/feat/combine",(i+1)))
            cdf = creatCDF(train, trainIndex)
            pred = getScore(predrank,cdf)
            print(pred)
            
        """
        elif param['task'] in ['regression']:
            
            
        
        elif param['task'] in ['reg_skl_gbm'] :
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            feat_names.remove('cid')
            gbm.fit(train_data[feat_names],train_data['cid'])
            pred = gbm.predict(valid_data[feat_names])
        elif param['task'] in ['reg_skl_ridge']:
            feat_names.remove('cid')
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data[feat_names],train_data['cid'])
            pred = ridge.predict(valid_data[feat_names])
        """
        # now we use the accuracy to evaluate our model
        acc = accuracy_model(pred, train.iloc[testIndex]['cid'])
        print("the model accuracy: %s" % acc)
        accuracy_cv.append(acc)

    # aggregate the cross-validation accuracy
    accuracy_cv_mean = np.mean(accuracy_cv)
    accuracy_cv_std = np.std(accuracy_cv)
    print('mean CV accuracy: %.6f' % accuracy_cv_mean)
    return {'loss': -accuracy_cv_mean, 'attachments': {'std': accuracy_cv_std}, 'status': STATUS_OK}
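
A sketch of how this return format plugs into hyperopt's fmin; the search space below is purely illustrative, and feat_folder/feat_name are assumed to be defined as in the signature above:

# Hypothetical hyperopt driver for trainModel; the space is an assumption.
from hyperopt import Trials, fmin, hp, tpe

space = {
    'task': 'reg_skl_gbm',
    'n_estimators': hp.quniform('n_estimators', 100, 500, 50),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_features': hp.uniform('max_features', 0.5, 1.0),
    'max_depth': hp.choice('max_depth', [3, 4, 5, 6, 7, 8]),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'random_state': 2015,
}
trials = Trials()
best = fmin(lambda p: trainModel(p, feat_folder, feat_name),
            space, algo=tpe.suggest, max_evals=20, trials=trials)
print(best)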
Example 22
# <codecell>

X, y = shuffle(df2[possible_features], df2.bad)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# <codecell>

params = {'init': LogOddsEstimator(), 'n_estimators': 5, 'max_depth': 6, 'learning_rate': 0.1, 'loss': 'bdeviance'}
clf = GradientBoostingClassifier(**params)

# <codecell>

clf = clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

# <codecell>

clf.feature_importances_

# <codecell>

print "Mean Squared Error"
mse = mean_squared_error(y_test, predicted)
print("MSE: %.4f" % mse)
print 

# <codecell>

params = clf.get_params()
Example 23
X_train, X_test, y_train, y_test = train_test_split(df_train,
                                                    is_promoted,
                                                    test_size=0.3)
#y_train = y_train.reshape(-1,1)
#y_test = y_test.reshape(-1,1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# In[19]:

#

# In[24]:

gboost = GradientBoostingClassifier(max_depth=5, n_estimators=200)
gboost.fit(df_train, is_promoted)
result = gboost.predict(df_test)

print('lol')

# In[26]:

#WNS_solution = open('WNS_solution.csv','w')
WNS_solution = pd.DataFrame()
WNS_solution['employee_id'] = emp_id
result = pd.DataFrame(result)
WNS_solution['is_promoted'] = result
WNS_solution.to_csv('WNS_solution4.csv', index=False)

# In[ ]:

print('lol')

Example 24

data = pd.read_csv("D:/Sai/JavaDoc/Cousera/5/2/gbm-data.csv")

# first column is the label; the remaining columns are the features
y = data[data.columns[0]].values
x = data[data.columns[1:]].values

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

ls = [1, 0.5, 0.3, 0.2, 0.1]

import numpy as np
from sklearn.metrics import log_loss

for lr in ls:
    clf = GradientBoostingClassifier(n_estimators=250,
                                     verbose=True,
                                     random_state=241,
                                     learning_rate=lr)

    clf.fit(x_train, y_train)

    # staged_decision_function yields the decision score after every boosting
    # stage; squash it with the sigmoid to get probabilities, then track log-loss
    test_loss = [log_loss(y_test, 1.0 / (1.0 + np.exp(-d.ravel())))
                 for d in clf.staged_decision_function(x_test)]
    train_loss = [log_loss(y_train, 1.0 / (1.0 + np.exp(-d.ravel())))
                  for d in clf.staged_decision_function(x_train)]

    plt.figure()
    plt.plot(test_loss, 'r', linewidth=2)
    plt.plot(train_loss, 'g', linewidth=2)
    plt.legend(['test', 'train'])
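    # Follow-up sketch (an addition, not in the original): report the best
    # boosting stage for this learning rate from the held-out log-loss curve.
    best_iter = int(np.argmin(test_loss)) + 1
    print('learning_rate=%s: min test log-loss %.4f at stage %d'
          % (lr, min(test_loss), best_iter))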