Example #1
def runXGBoost(x_train, y_train, x_test, y_test, p):

    # Here we instantiate the extreme gradient boosting (XGBoost) classifier
    clf = XGBClassifier()
    clf.set_params(**p)

    clf.fit(x_train, y_train)

    # now, make the predictions using our classifier
    xgb_predictions = clf.predict(x_test)

    # now we have to compute the classification accuracy
    # think about what two variables we have to compare
    xgb_score = accuracy_score(y_test, xgb_predictions)
    print("XGB classification accuracy on test data is " + str(xgb_score),
          file=sys.stderr)

    dt_score = accuracy_score(y_test, xgb_predictions)
    print("accuracy score on test data: " + str(dt_score), file=sys.stderr)
    train_score = accuracy_score(y_train, clf.predict(x_train))
    print("accuracy score on training data: " + str(train_score),
          file=sys.stderr)

    return (train_score, dt_score)
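
# Hypothetical usage sketch (not part of the original example): calling the
# helper above on synthetic data with an illustrative parameter dict. Assumes
# sys, accuracy_score and XGBClassifier are imported as the helper requires.
import sys
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
params = {"max_depth": 3, "learning_rate": 0.1, "n_estimators": 100}  # illustrative values only
train_score, test_score = runXGBoost(x_train, y_train, x_test, y_test, params)
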
def train_model(train_data, train_label, test_data, test_label):
    model = XGBClassifier(learning_rate=0.1,
                          n_estimators=160,
                          max_depth=6,
                          min_child_weight=3,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective='multi:softmax',
                          num_class=2,
                          nthread=4,
                          scale_pos_weight=1,
                          seed=0)
    # dtrain = xgb.DMatrix(data=train_data,label=train_label)
    # train the model
    print('XGBoost training start')
    start_time = time.time()
    model.fit(train_data, train_label)
    print('XGBoost training finished')
    print('training time: %d' % (time.time() - start_time))
    # save the model
    joblib.dump(model, 'model/XGBoost_model_80w.pkl')
    print('model saved')
    # evaluation results
    print('train data result')
    train_result = model.predict(train_data)
    print(metrics.classification_report(train_label, train_result))
    print('test data result')
    test_result = model.predict(test_data)
    print(metrics.classification_report(test_label, test_result))
Example #3
def model_na_train(data_offline_filter,data_online,col,predictors):
    data_offline_filter_col_nona = data_offline_filter.loc[pd.notnull(data_offline_filter.loc[:,col]),:]
    data_offline_filter_col_na = data_offline_filter.loc[pd.isnull(data_offline_filter.loc[:,col]),:]
    data_online_col_na =  data_online.loc[pd.isnull( data_online.loc[:,col]),:]
    data_online_col_nona =  data_online.loc[pd.notnull( data_online.loc[:,col]),:]
    k= pd.qcut(data_offline_filter_col_nona[col].tolist()+[data_offline_filter_col_nona[col].min()-1], 10, retbins=True, labels=False,duplicates ='drop')
    cutoffs = k[1]
    data_offline_filter_col_nona[col+'_dis'] = np.digitize(data_offline_filter_col_nona[col], cutoffs, right=True)
    data_online_col_nona[col+'_dis'] = np.digitize(data_online_col_nona[col], cutoffs, right=True)
    dep = col+'_dis'
    train_col = data_offline_filter_col_nona.loc[data_offline_filter_col_nona.loc[:,'date'].isin(date_sorted[:40]),:]
    valid_col = data_offline_filter_col_nona.loc[data_offline_filter_col_nona.loc[:,'date'].isin(date_sorted[40:46]),:]
    xgb1 = XGBClassifier(
     learning_rate =0.05,
     n_estimators=3000,
     max_depth=6,
     min_child_weight=1,
     gamma=0.1,
     subsample=0.8,
     colsample_bytree=0.8,
     objective= 'binary:logistic',
     n_jobs=16,
     scale_pos_weight=1,
     seed=1,
     reg_alpha=0.5,
     reg_lambda =10,
     silent=False)
    xgb1.fit(train_col[predictors],train_col[dep],eval_set=[(train_col[predictors],train_col[dep]),(valid_col[predictors],valid_col[dep])],eval_metric='mlogloss',early_stopping_rounds=10)
    save_obj(xgb1,col+'_xgb') # save the model
    save_obj(cutoffs,col+'_cut') # save the cutoffs
    data_offline_filter_col_na_pred = xgb1.predict(data_offline_filter_col_na[predictors])
    data_online_col_na_pred = xgb1.predict(data_online_col_na[predictors])
    data_offline_filter_col_na[col+'_dis'] = data_offline_filter_col_na_pred
    data_online_col_na[col+'_dis'] = data_online_col_na_pred    
    return pd.concat([data_offline_filter_col_nona.loc[:,['id',col+'_dis']],data_offline_filter_col_na.loc[:,['id',col+'_dis']]]),pd.concat([data_online_col_nona.loc[:,['id',col+'_dis']],data_online_col_na.loc[:,['id',col+'_dis']]])
Example #4
def xgboost_submission(data):
    # load the dense feature matrices and labels, then hold out a validation split
    train_X = data['training_data'].toarray()
    train_y = data['training_labels'].reshape(train_X.shape[0], 1)
    test_X = data['test_data'].toarray()
    X_train, X_val, y_train, y_val = train_test_split(train_X,
                                                      train_y,
                                                      test_size=0.20,
                                                      random_state=42)
    clf = XGBClassifier(max_depth=5,
                        min_child_weight=1,
                        gamma=0,
                        subsample=0.8,
                        colsample_bytree=0.8).fit(X_train, y_train)
    predicted_test = clf.predict(test_X)
    predicted_val = clf.predict(X_val)

    print('Accuracy:', accuracy_score(y_val, predicted_val))
    print('F1 score:', f1_score(y_val, predicted_val))
    print('Recall:', recall_score(y_val, predicted_val))
    print('Precision:', precision_score(y_val, predicted_val))
    print('\n classification report:\n',
          classification_report(y_val, predicted_val))
    print('\n confusion matrix:\n', confusion_matrix(y_val, predicted_val))

    return predicted_test
def train():
    """
    Train model and save model:
    feature: tf_idf
    Classifier: XGBClassifier
    Model path: './model'
    :return:
    """

    print('read data...')
    data = pd.read_csv('./data/intend_data_1.csv')
    data = data.sample(frac=1.0, replace=True, random_state=42)

    print('clean data...')
    data['sentence'] = data['sentence'].apply(clean)

    label_subject = dict(zip(range(0, len(set(data['label']))), sorted(list(set(data['label'])))))
    subject_label = dict(zip(sorted(list(set(data['label']))), range(0, len(set(data['label'])))))

    data['label'] = data['label'].map(subject_label)

    X_train, X_test, y_train, y_test = train_test_split(list(data['sentence']), list(data['label']), test_size=0.1, random_state=42)


    print('extract tfidf feature')
    vec = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 2))

    tfidf_model = vec.fit(data['sentence'].tolist())
    trn_term_doc = tfidf_model.transform(X_train)
    test_term_doc = tfidf_model.transform(X_test)

    print('train topic model')
    classifier_model = XGBClassifier(learning_rate=0.30,
                              n_estimators=300,
                              max_depth=5,
                              objective='multi:softmax',
                              seed=42)

    classifier_model.fit(trn_term_doc, y_train, eval_metric='mlogloss')

    train_preds = classifier_model.predict(trn_term_doc)
    print('result in train:')
    print(metrics.classification_report(y_train, train_preds))

    test_preds = classifier_model.predict(test_term_doc)
    print('result in test:')
    print(metrics.classification_report(y_test, test_preds))
    print('train semantic model end')

    with open('./model/model.pk', 'wb') as file:
        save = {
            'label_subject': label_subject,
            'tfidfVectorizer': tfidf_model,
            'classifier_model': classifier_model
        }
        pickle.dump(save, file)
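
# Hypothetical usage sketch (not part of the original snippet): reloading the
# pickled bundle saved above and classifying a new sentence. Assumes the same
# clean() helper used during training is available.
import pickle

with open('./model/model.pk', 'rb') as f:
    bundle = pickle.load(f)

features = bundle['tfidfVectorizer'].transform([clean('a new sentence to classify')])
label_id = bundle['classifier_model'].predict(features)[0]
print(bundle['label_subject'][label_id])   # map the numeric label back to its subject
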
Example #6
  def ranking_borda_xgboost(self):
    a = 0
    rankings = np.zeros(len(self.X.columns),)
    std = np.zeros(len(self.X.columns),)

    for x in range(self.loops):
      seed = randint(0, 10000)
  
  #Splits the train/val set by a seed that generates randomly each loop.
      X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y, test_size=0.30, random_state= seed)
  #Initializing an XGBoost classifier
      rf = XGBClassifier()
  #Fits the XGBoost classifier and calculates the Matthews score.
      rf.fit(X_train, y_train)
      mattheworiginal = matthews_corrcoef(y_fr, rf.predict(X_fr))
  #We initialize 2 lists to append values from the next loop.
      matthewscores= []
      columnsrf= []
  

      for x in self.X.columns:
    
        X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y, test_size=0.30, random_state = seed)
    #We drop a different column each loop. 
        X_train = X_train.drop([x], axis=1)
        X_fr = X_fr.drop([x], axis=1)
    #We fit the classifier again, but this time our training dataset lacks a feature.
        rf.fit(X_train, y_train)
        matthew = matthews_corrcoef(y_fr, rf.predict(X_fr))
    #We append to the list each column that we dropped.
        columnsrf.append(x)
    #And we also append the drop (or gain) in the Matthews score that we got when the feature was missing.
        matthewscores.append(mattheworiginal - matthew)
  
      a += 1 
      outcome = np.array(list(zip(columnsrf, matthewscores)))
      outcomepd = pd.DataFrame(data=outcome, columns=['Variables', 'r2-punish'])
      outcomepd['ranking'] = outcomepd['r2-punish'].rank(ascending = False)
     
      rankings = np.add(outcomepd['ranking'].to_numpy(), rankings)
      # We stack each value vertically to get a 2d numpy array
      std = np.vstack((outcomepd['ranking'].to_numpy(), std))
    
    std = np.delete(std, -1, axis = 0)
    std = np.std(std, axis = 0)
    std = np.dstack((columnsrf, std))
    featuresranks = np.dstack((columnsrf, rankings))
    std = pd.DataFrame(data = np.squeeze(std, axis = 0), columns =['Categories', 'STD'])
    borda = pd.DataFrame(data = np.squeeze(featuresranks, axis=0), columns=['Categories', 'Borda-Score'])
    borda = borda.merge(std, on = 'Categories',)
    borda['Borda-Score'] = pd.to_numeric(borda['Borda-Score'])
    borda['Borda-Average'] = borda['Borda-Score'] / self.loops
    borda['ranking'] = borda['Borda-Score'].rank(ascending = True)
    borda.sort_values(by='Borda-Score', inplace = True)
    
    return borda
Example #7
def runXGBoost(x_train,y_train,x_test,y_test):

    parameter_grid = {
        'reg_lambda': np.linspace(27, 28, 2),  # to curb overfitting
        'reg_alpha': np.linspace(27, 28, 2),
        "learning_rate": np.linspace(.0001, .001, 3), # usually between .05 and .3
        "max_depth": [2],
        "num_boosting_rounds": [1000],
        'nthread': [10]
    }

    # Here we instantiate the extreme gradient boosting (XGBoost) classifier
    clf = XGBClassifier()
    grid_search = GridSearchCV(clf, n_jobs=40, return_train_score=True, param_grid=parameter_grid, cv=StratifiedKFold(n_splits=10))
    grid_search.fit(x_train,y_train)

    print('Best score: {}'.format(grid_search.best_score_), file=sys.stderr)
    print('Best parameters: {}'.format(grid_search.best_params_), file=sys.stderr)

    # refit and train the model to the best features and training data
    clf = grid_search.best_estimator_

    importances = clf.feature_importances_
    print(importances, file=sys.stderr)

    # Print the feature ranking
    print("Feature ranking:", file=sys.stderr)
    importanceDict = {'names': [], 'imp': []}
    for name, importance in zip(x_train.columns, clf.feature_importances_):
        importanceDict['names'] += [name]
        importanceDict['imp'] += [importance]
    fRank = pd.DataFrame.from_dict(importanceDict)
    fRank = fRank.sort_values(by='imp', ascending=False)
    i = 0
    for index, row in fRank.iterrows():
        print("%d. %s %f" % (i, row['names'], row['imp']), file=sys.stderr)
        i += 1

    cv_results = pd.DataFrame(grid_search.cv_results_)[['rank_test_score', 'params','mean_test_score','mean_train_score']]
    sorted_results = cv_results.sort_values(by='rank_test_score').head(5)

    print("\nTop 5 best Parameters: ", file=sys.stderr)
    for index, row in sorted_results.iterrows():
        print("%d. %s train: %s test: %s" % (row['rank_test_score'], str(row['params']), str(row['mean_train_score']), str(row['mean_test_score'])), file=sys.stderr)

    # now we have to compute the classification accuracy
    # think about what two variables we have to compare
    etc_predictions = clf.predict(x_test)
    dt_score = accuracy_score(y_test, etc_predictions)
    print("accuracy score on test data: " + str(dt_score), file=sys.stderr)
    train_score = accuracy_score(y_train, clf.predict(x_train))
    print("accuracy score on training data: " + str(train_score), file=sys.stderr)

    return (train_score, dt_score)
Example #8
  def ranking_by_matthew_punishment_xgb(self):

    std = np.zeros(len(self.X.columns),)
    rankings = np.zeros(len(self.X.columns),)

    for x in range(self.loops):
      seed = randint(0, 10000)
    #Splits the train/val set by a seed that generates randomly each loop.
      X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y, test_size=0.30, random_state= seed)
    #Initializing an XGBoost classifier
      rf = XGBClassifier()
  #Fits the XGBoost classifier and calculates the Matthews score.
      rf.fit(X_train, y_train)
      r2original = matthews_corrcoef(y_fr, rf.predict(X_fr))
  #We initialize 2 lists to append values from the next loop.
      r2fr= []
      columnsrf= []
  

      for x in self.X.columns:

        X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y, test_size=0.30, random_state = seed)
    #We drop a different column each loop.
        X_train = X_train.drop([x], axis=1)
        X_fr = X_fr.drop([x], axis=1)
    #We fit the classifier again, but this time our training dataset lacks a feature.
        rf.fit(X_train, y_train)
        r2 = matthews_corrcoef(y_fr, rf.predict(X_fr))
    #We append to the list each column that we dropped.
        columnsrf.append(x)
    #And we also append the drop (or gain) in the Matthews score that we got when the feature was missing.
        r2fr.append(r2original - r2)

      outcome = np.array(r2fr)
      rankings = np.add(outcome, rankings)
      std = np.vstack((outcome, std))
    
    rankings = np.true_divide(rankings, self.loops)
    std = np.delete(std, -1, axis = 0)
    std = np.std(std, axis = 0)
    std = np.dstack((columnsrf, std))
    std = pd.DataFrame(data = np.squeeze(std, axis = 0), columns =['Categories', 'SD_of_mtt_punishment'])
    featuresranks = np.dstack((columnsrf, rankings))
    borda = pd.DataFrame(data = np.squeeze(featuresranks, axis=0), columns=['Categories', 'average-mtt-punishment'])
    borda['ranking'] = borda['average-mtt-punishment'].rank(ascending = False)
    borda = borda.merge(std, on = 'Categories',)
    borda.sort_values(by='average-mtt-punishment', inplace = True, ascending = False)

    return borda
Example #9
 def fit_model(self, X_train, y_train, X_test, y_test):
     clf = XGBClassifier(learning_rate=self.learning_rate,
                         n_estimators=self.n_estimators,
                         max_depth=self.max_depth,
                         min_child_weight=self.min_child_weight,
                         gamma=self.gamma,
                         subsample=self.subsample,
                         colsample_bytree=self.colsample_bytree,
                         objective=self.objective,
                         nthread=self.nthread,
                         scale_pos_weight=self.scale_pos_weight,
                         reg_alpha=self.reg_alpha,
                         reg_lambda=self.reg_lambda,
                         seed=self.seed)
     clf.fit(X_train, y_train)
     y_pre = clf.predict(X_test)
     y_pro = clf.predict_proba(X_test)[:, 1]
     print "pred_leaf=T  AUC Score : %f" % metrics.roc_auc_score(
         y_test, y_pro)
     print "pred_leaf=T  Accuracy : %.4g" % metrics.accuracy_score(
         y_test, y_pre)
     new_feature = clf.apply(X_train)
     X_train_new = self.mergeToOne(X_train, new_feature)
     new_feature_test = clf.apply(X_test)
     X_test_new = self.mergeToOne(X_test, new_feature_test)
     print "Training set sample number remains the same"
     return X_train_new, y_train, X_test_new, y_test
 def fit_model_split(self, X_train, y_train, X_test, y_test):
     ## X_train_1 is used to fit the model; X_train_2 is combined with the new features to form the new training set
     X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
         X_train, y_train, test_size=0.6, random_state=0)
     clf = XGBClassifier(learning_rate=self.learning_rate,
                         n_estimators=self.n_estimators,
                         max_depth=self.max_depth,
                         min_child_weight=self.min_child_weight,
                         gamma=self.gamma,
                         subsample=self.subsample,
                         colsample_bytree=self.colsample_bytree,
                         objective=self.objective,
                         nthread=self.nthread,
                         scale_pos_weight=self.scale_pos_weight,
                         reg_alpha=self.reg_alpha,
                         reg_lambda=self.reg_lambda,
                         seed=self.seed)
     clf.fit(X_train_1, y_train_1)
     y_pre = clf.predict(X_train_2)
     y_pro = clf.predict_proba(X_train_2)[:, 1]
     print("pred_leaf=T AUC Score : %f" %
           metrics.roc_auc_score(y_train_2, y_pro))
     print("pred_leaf=T  Accuracy : %.4g" %
           metrics.accuracy_score(y_train_2, y_pre))
     new_feature = clf.apply(X_train_2)
     X_train_new2 = self.mergeToOne(X_train_2, new_feature)
     new_feature_test = clf.apply(X_test)
     X_test_new = self.mergeToOne(X_test, new_feature_test)
     print("Training set of sample size 0.4 fewer than before")
     return X_train_new2, y_train_2, X_test_new, y_test
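
# mergeToOne() is not shown in this snippet. A common (hypothetical) way to
# combine the leaf indices returned by clf.apply() with the original features
# is to one-hot encode the leaves before stacking, e.g.:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

def merge_to_one_sketch(X, leaf_indices, encoder=None):
    # Fit the encoder on the training leaves; reuse it for validation/test leaves.
    if encoder is None:
        encoder = OneHotEncoder(handle_unknown='ignore')
        leaves = encoder.fit_transform(leaf_indices)
    else:
        leaves = encoder.transform(leaf_indices)
    return np.hstack([np.asarray(X), leaves.toarray()]), encoder
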
Example #11
def XGBoost(returns, factRet):
    [timeN, factorN] = factRet.shape
    [timeN, assetN] = returns.shape
    # Prepare training and predicting data
    colName = list(factRet.columns)
    f_bar = factRet.tail(2).mean()
    f_bar = pd.DataFrame(f_bar).T
    f_bar.columns = colName

    factRet = factRet.head(len(factRet) - 1)
    xgb = XGBClassifier(learning_rate=0.1,
                        n_estimators=10,
                        max_depth=7,
                        min_child_weight=2,
                        gamma=0.2,
                        subsample=0.8,
                        colsample_bytree=0.6,
                        objective='reg:linear',
                        scale_pos_weight=1,
                        seed=10)
    mu = []
    for i in range(assetN):
        xgb.fit(factRet, returns.iloc[:, i])
        mu.append(float(xgb.predict(f_bar)))
    mu = np.array(mu)
    Q = np.array(returns.cov())
    return mu, Q
Example #12
def xgbt_base_rmse_mode(train_input, train_target, test_input, test_target):
    param = {
        'n_estimators':10,
        'learning_rate': 0.01,
        }

    adj_params = {
        'n_estimators':[10],
        'learning_rate': [0.01],
    #    'n_estimators':[10,50,100,200,300,400,500,1000],
    #    'learning_rate': [0.01, 0.1, 1] 
    }

    xgbt = XGBClassifier(**param)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    cscv = GridSearchCV(xgbt, adj_params, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    cscv.fit(train_input, train_target)
    print("cv_results_:",cscv.cv_results_)
    print("best_params_: ",cscv.best_params_)
    xgbt= XGBClassifier(**cscv.best_params_)
    xgbt.fit(train_input,train_target.ravel())  
    predicted = xgbt.predict(test_input) 
    xgbt_base_rmse = np.sqrt(metrics.mean_squared_error(test_target, predicted))
    print("xgbt_base_rmse: ", xgbt_base_rmse)
    #print ("RMSE:", np.sqrt(metrics.mean_squared_error(test_target, predicted))) 
    return xgbt_base_rmse
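
# Note (not part of the original snippet): the function above fits a classifier
# but reports RMSE. If the target is continuous, a regression setup is the more
# natural fit; a minimal sketch with the same structure, assuming np and
# metrics are imported as above:
from xgboost import XGBRegressor

def xgbt_base_rmse_regression(train_input, train_target, test_input, test_target):
    reg = XGBRegressor(n_estimators=10, learning_rate=0.01)
    reg.fit(train_input, train_target.ravel())
    predicted = reg.predict(test_input)
    return np.sqrt(metrics.mean_squared_error(test_target, predicted))
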
Example #13
def modelfit(useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    alg = XGBClassifier(**params)
    df = data.sample(frac=0.3)
    pX = df.drop('LABEL', axis=1)
    py = df['LABEL']
    if useTrainCV:
        print("start use cv")
        xgb_param = alg.get_xgb_params()
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=xgb_param['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        print(cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])
        params['n_estimators'] = cvresult.shape[0]
        print("best tree size is {}".format(cvresult.shape[0]))
    # Fit the algorithm on the data
    alg.fit(X, y, eval_metric='auc')
    y_pred = alg.predict(pX)
    accuracy = metrics.accuracy_score(py, y_pred)
    print("精确率Accuracy: %.2f%%" % (accuracy * 100.0))
    print('auc:', metrics.roc_auc_score(py, y_pred))
    train_report = metrics.classification_report(py, y_pred)
    print(train_report)
    feat_imp = pd.Series(
        alg.get_booster().get_fscore()).sort_values(ascending=False)
    print(feat_imp)
    return alg
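
# The function above relies on module-level names (data, X, y, params, xgtrain)
# that are not shown in this snippet. A hypothetical setup consistent with the
# calls it makes might look like this (path and parameter values are placeholders):
import pandas as pd
import xgboost as xgb

data = pd.read_csv('train.csv')        # placeholder path
X = data.drop('LABEL', axis=1)
y = data['LABEL']
params = {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}
xgtrain = xgb.DMatrix(X.values, label=y.values)
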
Example #14
def FraudXGB(trainX, trainY, testX, testY):
    # Long computation in this cell (~1.8 minutes)
    clf_xgb = XGBClassifier(max_depth=7,
                            learning_rate=0.05,
                            n_estimators=400,
                            objective="binary:hinge",
                            booster='gbtree',
                            n_jobs=-1,
                            nthread=None,
                            gamma=0,
                            min_child_weight=1,
                            max_delta_step=0,
                            subsample=1,
                            colsample_bytree=1,
                            colsample_bylevel=1,
                            reg_alpha=0,
                            reg_lambda=1,
                            scale_pos_weight=1,
                            base_score=0.5,
                            random_state=42)

    pred_prob = clf_xgb.fit(trainX, trainY).predict_proba(testX)
    predY_xgb = clf_xgb.predict(testX)
    modelName = 'XGBoostClassifier'
    model_perf = createOutParam(modelName, testX, testY, predY_xgb, pred_prob)
    XGBoostClassifier_pkl_filename = obj.model_path + '/XGBoostClassifier_20200202.pkl'
    # Open the file to save as pkl file
    XGBoostClassifier_model_pkl = open(XGBoostClassifier_pkl_filename, 'wb')
    pickle.dump(clf_xgb, XGBoostClassifier_model_pkl)
    # Close the pickle instances
    XGBoostClassifier_model_pkl.close()

    return model_perf
Example #15
def do_simple_xgboost_regression(x_train, y_train, x_test, y_test):
    xg_reg = XGBClassifier(silent=False,
                           scale_pos_weight=1,
                           learning_rate=0.01,
                           colsample_bytree=0.4,
                           subsample=0.8,
                           objective='binary:logistic',
                           n_estimators=1000,
                           reg_alpha=0.3,
                           max_depth=4,
                           gamma=10)

    eval_set = [(x_train, y_train), (x_test, y_test)]
    eval_metric = ["auc", "error"]
    xg_reg.fit(x_train,
               y_train,
               eval_metric=eval_metric,
               eval_set=eval_set,
               verbose=True)
    train_accuracy = compute_accuracy(xg_reg, x_train, y_train)
    test_accuracy = compute_accuracy(xg_reg, x_test, y_test)

    print('train set accuracy: {}'.format(train_accuracy))
    print('test set accuracy: {}'.format(test_accuracy))

    y_score = xg_reg.predict(x_test)

    score = metrics.roc_auc_score(y_test, y_score)
    print('score {}'.format(score))
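
# compute_accuracy() is not defined in this snippet; a minimal sketch consistent
# with how it is called above:
from sklearn.metrics import accuracy_score

def compute_accuracy(model, x, y):
    return accuracy_score(y, model.predict(x))
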
Example #16
    def runxgBoostClassifier(self, bDetailReport=False):
        print("m_X_train  size", len(self.m_X_train))

        boosters = ['gbtree', 'gblinear']

        for depth in range(3, 4):
            for rate in (range(2, 3, 1)):
                for estimator in (range(220, 240, 20)):
                    #                    for bster in boosters:
                    clf = XGBClassifier(max_depth=depth,
                                        learning_rate=(float(rate) / 10),
                                        n_estimators=estimator,
                                        silent=True,
                                        objective='binary:logistic',
                                        seed=400)
                    clf.fit(self.m_X_train, self.m_y_train)
                    y = clf.predict(self.m_X_test)
                    print(
                        "\nxgBoostClassifier depth={} rate={} estimator={}\n".
                        format(depth, (float(rate) / 10), estimator))
                    print(classification_report(self.m_y_test, y))
                    print(clf.feature_importances_)
                    # plot
                    pyplot.bar(range(len(clf.feature_importances_)),
                               clf.feature_importances_)
                    pyplot.show()

                    plot_importance(clf)
                    if (bDetailReport):
                        self.ClassifierDetailReport(self.m_y_test, y)
Example #17
def train_model(train, test):
    x_train, y_train = get_fea_lab(train)
    x_test, y_test = get_fea_lab(test)
    xgb = XGBClassifier()
    print(xgb)

    paras = {
        'max_depth': range(1, 3),
        'min_child_weight': [i / 10 for i in range(0, 10)],
        'scale_pos_weight': range(10, 100, 10)
    }
    gscv = GridSearchCV(estimator=xgb,
                        param_grid=paras,
                        cv=5,
                        scoring='roc_auc')
    gscv.fit(x_train, y_train)
    print(gscv.best_params_)
    print(gscv.best_score_)
    print(gscv.score(x_test, y_test))
    result = gscv.predict(x_test)
    print(confusion_matrix(y_test, result))
    print(classification_report(y_test, result))

    xgb.fit(x_train, y_train)
    test_result = xgb.predict(x_test)
    print(confusion_matrix(y_test, test_result))
    print(classification_report(y_test, test_result))
Example #18
def XGBoost(returns, factRet):
    """
    :param returns: asset return series
    :param factRet: factor return series
    """
    [timeN, factorN] = factRet.shape
    [timeN, assetN] = returns.shape
    f_bar = []
    for i in range(factorN):
        f_bar.append(np.prod(factRet.iloc[:, i] + 1)**(1 / timeN) - 1)

    colName = list(factRet.columns)
    f_bar = pd.DataFrame(f_bar).T
    f_bar.columns = colName
    xgb = XGBClassifier(learning_rate=0.1,
                        n_estimators=10,
                        max_depth=7,
                        min_child_weight=2,
                        gamma=0.2,
                        subsample=0.8,
                        colsample_bytree=0.6,
                        objective='reg:linear',
                        scale_pos_weight=1,
                        seed=10)
    mu = []
    for i in range(assetN):
        xgb.fit(factRet, returns.iloc[:, i])
        mu.append(float(xgb.predict(f_bar)))
    mu = np.array(mu)
    Q = np.array(returns.cov())
    return mu, Q
Example #19
def xgb(X_train, y_train, X_test, y_test, lime_flag=False,
                      max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, 
       objective='binary:logistic', booster='gbtree', n_jobs=-1, nthread=None, gamma=0, 
       min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, 
       colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, 
       base_score=0.5, random_state=42, seed=None, missing=0):
    
    '''
    Parameters:
    X_train, y_train, X_test, y_test- Learning set
    lime_flag-  enable or disable lime
    '''
    start_time          = time.time()
    # create an instance
    xgb= XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, silent=silent, 
       objective=objective, booster=booster, n_jobs=n_jobs, nthread=nthread, gamma=gamma, 
       min_child_weight=min_child_weight, max_delta_step=max_delta_step, subsample=subsample, colsample_bytree=colsample_bytree, 
       colsample_bylevel=colsample_bylevel, reg_alpha=reg_alpha, reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight, 
       base_score=base_score, random_state=random_state, seed=seed, missing=missing)

    xgb.fit(X_train,y_train)
    #Predict on test set
    y_pred= xgb.predict(X_test)

    # understand the model through lime
    #if lime_flag:
    #    lime_explainer(X_train, y_train, X_test, y_test, df_row=2,  model_predictor= xgb, alogorithm_name="XGB")                                                
    time_end=time.time() - start_time
    # Scores
    model_evaluation(X_train,y_train, X_test, y_test,y_pred, xgb, time_end, alg_name='XGB') 
    # return the model object
    return xgb
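
# model_evaluation() is not defined in this snippet; a minimal placeholder
# consistent with the call above could report a few standard scores, e.g.:
from sklearn.metrics import accuracy_score, classification_report

def model_evaluation(X_train, y_train, X_test, y_test, y_pred, model, elapsed, alg_name='XGB'):
    print(alg_name, 'training time: %.2fs' % elapsed)
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
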
Example #20
def XgbTrain(X, y):
    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=0)  ## test_size is the fraction held out for testing
    test_preds = pd.DataFrame({"label": test_y})
    clf = XGBClassifier(
        learning_rate=0.3,  # learning rate (default 0.3)
        n_estimators=50,  # number of trees
        max_depth=10, # maximum tree depth
        objective='multi:softmax',
        min_child_weight=3,
        gamma=0.3, # gamma parameter
        eta=0.1,
        subsample=0.7, # fraction of training instances sampled per tree
        colsample_bytree=0.6,
        nthread=4,  # number of CPU threads
        scale_pos_weight=1,
        reg_alpha=1e-05,
        reg_lambda=1,
        num_class=10,
        seed=10
    )
    clf.fit(train_x, train_y)
    test_preds['y_pred'] = clf.predict(test_x)
    test_preds['cha'] = test_preds['y_pred'] - test_preds['label']
    test_preds.to_csv('E:/xinyong/xgbmodelfile/result191-501.csv', index=None)
    stdm = metrics.accuracy_score(test_preds['label'], test_preds['y_pred'])
    import matplotlib.pyplot as plt  # plot the prediction results
    p = test_preds[['label', 'y_pred']].plot(subplots=True, style=['b-o', 'r-*'])
    plt.show()
    return stdm, clf
Example #21
def model_train(xtrain, ytrain):
    X_train, X_test, y_train, y_test = train_test_split(xtrain,
                                                        ytrain,
                                                        test_size=0.2,
                                                        random_state=0)
    cls = XGBClassifier()
    start_time = time.time()
    cls.fit(X_train, y_train)
    end_time = time.time()
    print('It took %d seconds to train the model!' % (end_time - start_time))
    print()
    y_pred = cls.predict(X_test)
    print("模型及模型参数:")
    print(str(cls))
    print("模型评估:")
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('F1 score:', f1_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('\n classification report:\n', classification_report(y_test, y_pred))
    print('\n confusion matrix:\n', confusion_matrix(y_test, y_pred))

    # save the model
    model_name = "./model/" + "xgb_model"
    joblib.dump(cls, model_name)
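
# Hypothetical usage sketch (not part of the original snippet): reloading the
# saved model with joblib and scoring new data; `new_data` is a placeholder for
# any feature matrix with the same columns used in training.
import joblib

cls = joblib.load("./model/xgb_model")
new_pred = cls.predict(new_data)
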
Example #22
def featureimportance(model, X_train, X_test, y_train, y_test):
    thresholds = sort(model.feature_importances_)
    bestthresh = 0
    bestN = 0
    bestaccuracy = 0
    for thresh in thresholds:
        # select features using threshold
        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)
        select_X_test = selection.transform(X_test)
        # train model
        
        selection_model = XGBClassifier(**model.get_xgb_params())
        selection_model = modelfit(selection_model, select_X_train, select_X_test, y_train, y_test, featureimportance=True)
        # eval model
        y_pred = selection_model.predict(select_X_test)
        predictions = [round(value) for value in y_pred]
        accuracy = metrics.accuracy_score(y_test, predictions)
        print(f"Thresh={thresh}, n={select_X_train.shape[1]}, Accuracy: {accuracy*100}%")
        if accuracy > bestaccuracy:
            bestthresh = thresh
            bestN = select_X_train.shape[1]
            bestaccuracy = accuracy

    print(f"Best Run: Thresh={bestthresh}, n={bestN}, Accuracy: {bestaccuracy*100}%")
def pvXBOOST(trainX, testX, trainY, testY):
    train = np.append(trainX, trainY, axis=1)
    # test = np.append(testX, testY, axis=1)
    X = train[:, 0:-1]
    Y = train[:, -1]

    # sklearn interface
    clf = XGBClassifier(
        n_estimators=100,  # number of trees
        learning_rate=0.2,
        max_depth=3,
        min_child_weight=1,
        gamma=0.3,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=12,
        scale_pos_weight=1,
        reg_lambda=1,
        seed=27)

    model_sklearn = clf.fit(X, Y)
    preds = clf.predict(testX)
    conMar = confusion_matrix(testY, preds)
    # feature importance
    print(clf.feature_importances_)
    # plot
    pyplot.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
    pyplot.show()
    FeatureImportance(clf.feature_importances_)
    print(conMar)
    cnm.writelines('\n**XGBoosting-confusion martix\n')
    cnm.write(np.array2string(conMar))
    return classification_report(testY, preds)
Example #24
class Classifier(object):
    def __init__(self, conf, task, train=None, test=None):
        self.conf = conf
        self.task = task
        self.train_ = train
        self.test_ = test
        self.features = [
            "hasWith", "hasIn", "simiBucket", "textPos", "hasOf", "hasAnd",
            "startEntity", "distance", "hasFrom", "endEntity", "similarity",
            "hasThan", "hasVerb"
        ]
        self.labels = ["relation"]
        self.num_round = 500
        self.eval_set = list()
        self.early_stopping_rounds = 20
        self.classifier = XGBClassifier(max_depth=4,
                                        learning_rate=0.1,
                                        n_estimators=1000,
                                        gamma=4,
                                        verbosity=1,
                                        objective='multi:softmax',
                                        num_class=6,
                                        booster='gbtree',
                                        n_jobs=4,
                                        seed=27)

    def train(self):
        train_X, test_X, train_y, test_y = train_test_split(
            self.train_[self.features],
            self.train_[self.labels],
            test_size=0.4,
            random_state=42)

        self.eval_set = [(train_X.values, train_y.values),
                         (test_X.values, test_y.values)]
        self.classifier.fit(train_X.values,
                            train_y.values,
                            eval_metric='merror',
                            eval_set=self.eval_set,
                            early_stopping_rounds=self.early_stopping_rounds,
                            verbose=True)

        self.classifier.save_model(self.conf.model_path.format(self.task))
        return 'Model has been saved!'

    def test(self):
        test_set = self.test_[self.features].values
        self.classifier.load_model(self.conf.model_path.format(self.task))
        self.classifier._le = LabelEncoder().fit([
            'USAGE', 'TOPIC', 'MODEL-FEATURE', 'PART_WHOLE', 'RESULT',
            'COMPARE'
        ])
        pred = self.classifier.predict(test_set)
        predictions = pd.concat([
            self.test_[self.features],
            pd.DataFrame(pred, columns=["relation"])
        ],
                                axis=1)

        return predictions
def XGB_class_evaluation(individual):
    N_SPLITS = N_splits
    kf = KFold(n_splits=N_SPLITS)
    fc = XGBClassifier(learning_rate=individual[0],
                       n_estimators=100,
                       silent=True,
                       nthread=-1,
                       gamma=0,
                       min_child_weight=individual[1],
                       max_depth=individual[2],
                       subsample=individual[3],
                       colsample_bylevel=individual[4],
                       seed=0)
    M_pos = 0
    M_mid = 0
    M_neg = 0
    for train, test in kf.split(trainX):
        fc.fit(trainX[train, :], trainY[train])
        testY_pre = fc.predict(trainX[test, :])
        Ind_pos = (trainY[test] == 1)
        Ind_mid = (trainY[test] == 0)
        Ind_neg = (trainY[test] == -1)
        M_pos += len(np.where(np.array(testY_pre[Ind_pos]) == 1)[0]) / len(
            np.where(Ind_pos)[0])
        M_mid += len(np.where(np.array(testY_pre[Ind_mid]) == 0)[0]) / len(
            np.where(Ind_mid)[0])
        M_neg += len(np.where(np.array(testY_pre[Ind_neg]) == -1)[0]) / len(
            np.where(Ind_neg)[0])

    correct = map(lambda x: x / N_SPLITS, [M_pos, M_mid, M_neg])
    return (tuple(correct))
Example #26
def get_ntree():
    f1_t_total, f1_v_total = [], []
    for ntree in range(10, 810, 10):
        xgb_base = XGBClassifier(objective='binary:logistic',
                                 n_estimators=ntree,
                                 random_state=1234,
                                 silent=0,
                                 booster='gbtree',
                                 subsample=0.8,
                                 colsample_bytree=0.8,
                                 reg_alpha=1,
                                 reg_lambda=0,
                                 learning_rate=0.1,
                                 max_depth=6)

        print('current ntree = %s' % ntree)
        xgb_base.fit(X_t, y_t)
        y_t_pre = xgb_base.predict(X_t)
        y_v_pre = xgb_base.predict(X_v)
        f1_t_each = f1_score(y_t, y_t_pre, average='micro')
        f1_v_each = f1_score(y_v, y_v_pre, average='micro')
        f1_t_total.append(f1_t_each)
        f1_v_total.append(f1_v_each)
        myfile = open('D:\\workspace python\\contest\\accu_save\\' +
                      'xgbbase_810_1.txt',
                      'a',
                      encoding='utf-8')
        print(f1_t_each, ',', f1_v_each, file=myfile)
        myfile.close()
    return f1_t_total, f1_v_total
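
# Hypothetical follow-up (not in the original snippet): plotting the two F1
# curves returned above to eyeball a reasonable n_estimators. Assumes the
# module-level X_t, y_t, X_v, y_v used by get_ntree() are already defined.
import matplotlib.pyplot as plt

f1_t_total, f1_v_total = get_ntree()
trees = list(range(10, 810, 10))
plt.plot(trees, f1_t_total, label='train (micro F1)')
plt.plot(trees, f1_v_total, label='validation (micro F1)')
plt.xlabel('n_estimators')
plt.ylabel('micro F1')
plt.legend()
plt.show()
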
Example #27
def xgboost(train_features,
            train_labels,
            test_features,
            feature_list=None,
            hfo_type_name=None):
    clf = XGBClassifier(nthread=-1)
    '''
    #clf = XGBClassifier(learning_rate=0.05,
                        n_estimators=1000, #100
                        max_depth=6,
                        min_child_weight=3,
                        gamma=0.05,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        reg_alpha=0.005,
                        objective='binary:logistic',
                        nthread=-1,
                        scale_pos_weight=1,
                        seed=10,
                        eval_metric='aucpr' #'aucpr'
                        )
    '''
    clf.fit(train_features, train_labels)
    # Predict over test
    clf_predictions = clf.predict(test_features)
    clf_probs = clf.predict_proba(test_features)[:, 1]

    # graphics.feature_importances(feature_list, clf.feature_importances_, hfo_type_name, fig_id)
    return clf_predictions, clf_probs, clf
Example #28
def leaveoneout(dataset, labels):
    '''Classifier: XGBoost; cross-validation: leave-one-out.'''
    leaveoo = LeaveOneOut()
    #    Y_true = []
    #    Y_pre  = []
    # XGBoost parameters fall into three groups:
    '''1. General parameters
       2. Booster parameters: control each individual booster
       3. Learning-task parameters: control how the training objective is evaluated'''
    for train_index, test_index in leaveoo.split(dataset):
        x_train, x_test = dataset[train_index], dataset[test_index]
        y_train, y_test = [labels[i] for i in train_index
                           ], [labels[i] for i in test_index]
        estimator = XGBClassifier(
            silent=0,  # 1 suppresses run-time messages; 0 (recommended) prints them
            min_child_weight=1,
            gamma=0,  # minimum loss reduction required for a further split on a leaf; larger is more conservative (typically 0.1-0.2)
            max_delta_step=1,  # maximum delta step allowed for each tree's weight estimate
            colsample_bytree=0.8,  # column subsampling when building each tree
            nthread=4,
            objective='binary:logistic',  # loss to minimize; binary logistic regression returns predicted probabilities (not classes)
            reg_lambda=1,  # L2 regularization on the weights; larger values make the model less prone to overfitting
            scale_pos_weight=1,
            n_estimators=200,  # number of trees
            seed=1000  # random seed
        )
        estimator.fit(x_train, y_train)
        print(estimator.get_params())
        y_true, y_pre = y_test, list(estimator.predict(x_test))
    print("Accuracy : %.6g" % metrics.accuracy_score(y_true, y_pre))
Example #29
def gradient_boosted_trees(train_features: np.array, train_labels: np.array,
                           test_features: np.array, **kwargs):
    """Gradient Boosted Trees classifier.

    Parameters
    ----------
    train_features : np.array
        Training sample features.
    train_labels: np.array
        Training sample classes.
    test_features: np.array
        Test sample features.
    kwargs: extra parameters
        All parameters allowed by sklearn.XGBClassifier

    Returns
    -------
    predictions: np.array
        Predicted classes.
    prob: np.array
        Classification probability for all objects, [pIa, pnon-Ia].
    """

    #create classifier instance
    clf = XGBClassifier(**kwargs)

    clf.fit(train_features, train_labels)  # train
    predictions = clf.predict(test_features)  # predict
    prob = clf.predict_proba(test_features)  # get probabilities

    return predictions, prob, clf
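
# Hypothetical call (not part of the original docstring): any keyword accepted
# by XGBClassifier can be forwarded through **kwargs; the arrays below are
# placeholders for the caller's data.
predictions, prob, clf = gradient_boosted_trees(train_features, train_labels,
                                                test_features,
                                                n_estimators=200, max_depth=4)
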
Example #30
    def compute_cv_metric(split, cross_val_data, bayes_trials_results):

        #Create clasifier for cross validation results
        clf = XGBClassifier(random_state=0,
                            n_jobs=-1,
                            **bayes_trials_results[0]['params'])

        train_x = cross_val_data[split][0]
        train_y = cross_val_data[split][1]
        test_x = cross_val_data[split][2]
        test_y = cross_val_data[split][3]

        clf.fit(train_x, train_y)

        y_pred_cv = clf.predict(test_x)
        y_pred_prob_cv = clf.predict_proba(test_x)

        tn = confusion_matrix(test_y, y_pred_cv)[0, 0]
        tp = confusion_matrix(test_y, y_pred_cv)[1, 1]
        fp = confusion_matrix(test_y, y_pred_cv)[0, 1]
        fn = confusion_matrix(test_y, y_pred_cv)[1, 0]

        npv = tn / (tn + fn)
        specificity = tn / (tn + fp)

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)

        roc_auc_cv = roc_auc_score(test_y, y_pred_prob_cv[:, 1])

        f1_cv = 2 * (precision * recall) / (precision + recall)

        return npv, specificity, precision, recall, roc_auc_cv, f1_cv, y_pred_prob_cv
#define X y
X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#ClusterCentroids
cc = ClusterCentroids(random_state=0)
os_X,os_y = cc.fit_sample(X_train,y_train)

#XGboost
clf_XG = XGBClassifier(learning_rate= 0.3, min_child_weight=1,
                       max_depth=6,gamma=0,subsample=1, max_delta_step=0, colsample_bytree=1,
                       reg_lambda=1, n_estimators=100, seed=1000, scale_pos_weight=1000)  
clf_XG.fit(os_X, os_y,eval_set=[(os_X, os_y), (X_test, y_test)],eval_metric='auc',verbose=False)  
evals_result = clf_XG.evals_result()  
y_true, y_pred = y_test, clf_XG.predict(X_test)  

#F1_score, precision, recall, specifity, G score
print "F1_score : %.4g" % metrics.f1_score(y_true, y_pred)  
print "Recall : %.4g" % metrics.recall_score(y_true, y_pred)
recall = metrics.recall_score(y_true, y_pred)  
print "Precision : %.4g" % metrics.precision_score(y_true, y_pred)
 
#Compute confusion matrix
cnf_matrix = confusion_matrix(y_test,y_pred)
np.set_printoptions(precision=2)
print "Specifity: " , float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1])
specifity = float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1]) 
print "G score: " , math.sqrt(recall/ specifity) 

#Plot non-normalized confusion matrix
Example #32
#  reg_alpha=0.1,
#  seed=27)
# modelfit(xgb1, df_train, predictors, targetname, early_stopping_rounds=50)


xgb1 = XGBClassifier(
 learning_rate=0.01,
 n_estimators=700,
 max_depth=5,
 min_child_weight=8,
 gamma=0.3,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 scale_pos_weight=1,
 seed=27)



xgb1.fit(df_train[predictors], df_train[targetname])
df_test['target'] = xgb1.predict(df_test[predictors])




df_test['target'] = df_test['target'].apply(lambda x: 'Y' if x==1 else 'N')

submission = pd.DataFrame()
submission['Loan_ID'] = df_test['Loan_ID']
submission['Loan_Status'] = df_test['target']
submission.to_csv('submission_XGB_retunned.csv', index=False)
Example #33
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier
import dataGatherer as dg
isTest = int(sys.argv[1])
if isTest == 1:
	train, test, feature_train, feature_test, label_train, label_test = dg.test_data(.8)
else:
	train, test, feature_train, feature_test, label_train = dg.prod_data()
f_train = pd.concat([train,feature_train], axis = 1)
f_test = pd.concat([test,feature_test], axis = 1)
xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=50, objective='multi:softprob', subsample=1.0, colsample_bytree=1, seed=0)
le = LabelEncoder()
y = le.fit_transform(label_train.values)

xgb.fit(f_train.values, y)
y_pred = xgb.predict(f_test.values)
y_pred = le.inverse_transform(y_pred)
if isTest == 1 :
	y_f = y_pred == label_test.values
	print("misclassified = " + str(len(y_f[y_f==False])))
	print("currect class = " + str(len(y_f[y_f==True])))
	print("score = " + str(len(y_f[y_f==True])/len(y_f)))
	t = test[~y_f]
	l = label_test[~y_f]
	l_p = y_pred[~y_f]
	for i in range(0, len(l)):
		di.draw(t[i:i+1].values[0,], "images/prob_" + str(i) + "_" + str(l.values[i]) + "_" + str(l_p[i]) )
else :
	index = list(range(1,len(y_pred)+1))
	index = pd.DataFrame(index, columns = ['ImageId'])
	y_pred = pd.DataFrame(y_pred, columns = ['Label'])
Example #34
class TrollClassifier:
    def set_train_path(self, path):
        self.train_path = path

    def pre_process(self, json, istrain):
        mecab = Mecab()

        data = []

        for cnt, article in enumerate(json):
            if cnt % 10000 == 0:
                print(cnt)
                
            text = bs(article["text"], "html.parser").text
            #title_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["title"])]
            #author_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["author"])]
            text_pos = ["%s_%s" % (first, second) for first, second in mecab.pos(text)]

            data.append({
                #"title_pos": title_pos,
                #"title_pos_sentences" : " ".join(title_pos),
                #"author_pos": author_pos,
                #"author_pos_sentences" : " ".join(author_pos),
                "text":article["text"],
                "text_pos": text_pos,
                "text_pos_sentences" : " ".join(text_pos),
                #"forumid": article["forumid"],                    
                "pk": article["pk"]
            })

            if istrain == True:
                data[cnt]["istroll"] = article["is_troll"]

        data = pd.DataFrame.from_dict(data)
        data = data.set_index('pk')

        return data

    def fit(self, json_train, n_estimators = 10, is_xgb = True):

        train = self.pre_process(json_train, istrain = True)
        
        bow_vectorizer = BagOfWordsVectorizer()
        word2vec_model = Word2VecModel()
        tag_counter_model = TagCounterModel()

        # word2vec_model.fit(train["author_pos_sentences"], 500)
        # author_features = word2vec_model.transform(train["author_pos_sentences"], "author")
        # self.author_model = word2vec_model.get_model()

#        bow_vectorizer.fit(train["title_pos_sentences"], 1000)
#        title_features = bow_vectorizer.transform(train["title_pos_sentences"], "title")
#        self.title_model = bow_vectorizer.get_vectorizer()

        bow_vectorizer.fit(train["text_pos_sentences"], 1000)
        text_features = bow_vectorizer.transform(train["text_pos_sentences"], "text")
        self.text_model = bow_vectorizer.get_vectorizer()

#        tag_features = tag_counter_model.fit_transform(train["text"])
#        self.tag_model = tag_counter_model.get_col()

        train = pd.concat([train, text_features], axis = 1)

        #le = preprocessing.LabelEncoder()

        # train["forumid"] = le.fit_transform(train["forumid"])
        
        label = train['istroll']
        train = train.drop('istroll', axis=1)
        train = train.drop(['text', 'text_pos', 'text_pos_sentences'], axis=1)
        
        print(train.columns)

        train.columns = [str(x) for x in range(len(train.columns))]
        
        if is_xgb == False:
            self.model = RandomForestClassifier(n_estimators, n_jobs=-1)
        else:
            self.model = XGBClassifier(n_estimators = n_estimators, max_depth = 10)

        print(train.shape)
        self.model.fit(train, label)

    def save_model(self, save_path = "predict_model"):

        if not os.path.exists(save_path):
            os.makedirs(save_path)

        #pickle.dump(self.author_model, open("%s/author_model.p" % save_path, "wb"), protocol = pickle.HIGHEST_PROTOCOL)
        #pickle.dump(self.title_model, open("%s/title_model.p" % save_path, "wb"), protocol = pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.text_model, open("%s/text_model.p" % save_path, "wb"), protocol = pickle.HIGHEST_PROTOCOL)
        #pickle.dump(self.tag_model, open("%s/tag_model.p" % save_path,"wb"), protocol = pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.model, open("%s/predict_model.p" % save_path,"wb"), protocol = pickle.HIGHEST_PROTOCOL)

    def load_model(self, save_path = "predict_model"):
        #self.author_model = pickle.load(open("%s/author_model.p" % save_path, "rb"))
        #self.title_model = pickle.load(open("%s/title_model.p" % save_path, "rb"))
        self.text_model = pickle.load(open("%s/text_model.p" % save_path, "rb"))
        #self.tag_model = pickle.load(open("%s/tag_model.p" % save_path, "rb"))
        self.model = pickle.load(open("%s/predict_model.p" % save_path,"rb"))

    def _predict(self, json_test):
        
        test = self.pre_process(json_test, istrain = False)

        bow_vectorizer = BagOfWordsVectorizer()
        word2vec_model = Word2VecModel()
        tag_counter_model = TagCounterModel()

        # word2vec_model.set_model(self.author_model)
        # author_features = word2vec_model.transform(test["author_pos_sentences"], "author")

        #bow_vectorizer.set_vectorizer(self.title_model)
        #title_features = bow_vectorizer.transform(test["title_pos_sentences"], "title")

        bow_vectorizer.set_vectorizer(self.text_model)
        text_features = bow_vectorizer.transform(test["text_pos_sentences"], "text")

        #tag_counter_model.set_col(self.tag_model)
        #tag_features = tag_counter_model.transform(test["text"])

        test = pd.concat([test, text_features], axis = 1)

        #le = preprocessing.LabelEncoder()

        #test["forumid"] = le.fit_transform(test["forumid"])

        test = test.drop(['text', 'text_pos', 'text_pos_sentences'], axis=1)

        test.columns = [str(x) for x in range(len(test.columns))]

        return test

        
    def predict(self, json_test):
        result = self.model.predict(self._predict(json_test))
        
        return result

    def predict_proba(self, json_test):
        result = self.model.predict_proba(self._predict(json_test)).T

        #if results are all False, it's not 2-dimensional so return only first col

        if result.shape[0] < 2:
            return result[0]
        else:
            return result[1]
Example #35
data_x=pd.get_dummies(data.action_type,prefix="action_type")
cols=["combined_shot_type","game_event_id","period","playoffs",
      "shot_type","shot_zone_area","shot_zone_basic","shot_zone_range",
      "matchup","opponent","game_date","shot_distance","minutes_remaining","seconds_remaining",
      "loc_x","loc_y"]
for col in cols:
    data_x=pd.concat([data_x,pd.get_dummies(data[col],prefix=col),],axis=1)
train_x=data_x[pd.notnull(data.shot_made_flag)]
test_x=data_x[pd.isnull(data.shot_made_flag)]
train_y=data.shot_made_flag[pd.notnull(data.shot_made_flag)]

clf = XGBClassifier(max_depth=6, learning_rate=0.01, n_estimators=550,
                     subsample=0.5, colsample_bytree=0.5, seed=0)
clf.fit(train_x, train_y)
y_pred = clf.predict(train_x)
print("Number of mislabeled points out of a total %d points : %d"  % (train_x.shape[0],(train_y != y_pred).sum()))

def logloss(act, pred):
    epsilon = 1e-15
    pred = np.maximum(epsilon, pred)
    pred = np.minimum(1-epsilon, pred)
    ll = sum(act*np.log(pred) + np.subtract(1,act)*np.log(np.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    print(ll)
    return ll
    
logloss(train_y,clf.predict_proba(train_x)[:,1])

test_y=clf.predict_proba(test_x)[:,1]
test_id=data[pd.isnull(data.shot_made_flag)]["shot_id"]
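
# Hypothetical follow-up (not in the original snippet): writing the predicted
# probabilities to a Kaggle-style submission keyed by shot_id.
submission = pd.DataFrame({"shot_id": test_id, "shot_made_flag": test_y})
submission.to_csv("submission.csv", index=False)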