Code example #1
File: avazu_ctr.py Project: xuefenga616/mygit
def fit(X_vec, y_vec):
    # Split the dataset
    cv = cross_validation.ShuffleSplit(len(X_vec),
                                       n_iter=10,
                                       test_size=0.2,
                                       random_state=0)

    # Random forest (commented out)
    # for train, test in cv:
    #     svc = RandomForestClassifier(n_estimators=100).fit(X_vec[train], y_vec[train])
    #     print("train score: %.3f, test score: %.3f\n" % (
    #         svc.score(X_vec[train], y_vec[train]), svc.score(X_vec[test], y_vec[test])
    #     ))

    # gbdt
    # for train, test in cv:
    #     svc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1).fit(X_vec[train], y_vec[train])
    #     print("train score: %.3f, test score: %.3f\n" % (
    #         svc.score(X_vec[train], y_vec[train]), svc.score(X_vec[test], y_vec[test])
    #     ))

    # xgboost
    for train, test in cv:
        svc = XGBClassifier(max_depth=10,
                            gamma=0.001).fit(X_vec[train], y_vec[train])
        print("train score: %.3f, test score: %.3f\n" % (svc.score(
            X_vec[train], y_vec[train]), svc.score(X_vec[test], y_vec[test])))
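Example #1 uses the pre-0.18 sklearn cross_validation module; below is a minimal sketch of the same loop on the current model_selection API, with placeholder data standing in for X_vec/y_vec.

import numpy as np
from sklearn.model_selection import ShuffleSplit
from xgboost import XGBClassifier

X_vec = np.random.rand(100, 5)        # placeholder features
y_vec = np.random.randint(0, 2, 100)  # placeholder binary labels

cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
for train, test in cv.split(X_vec):   # split() now takes the data itself
    svc = XGBClassifier(max_depth=10, gamma=0.001).fit(X_vec[train], y_vec[train])
    print("train score: %.3f, test score: %.3f" %
          (svc.score(X_vec[train], y_vec[train]), svc.score(X_vec[test], y_vec[test])))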
Code example #2
def xxgboost(training, cv, testing):
    xgb = XGBClassifier(max_depth=6,
                        n_estimators=25,
                        objective='multi:softprob',
                        subsample=0.5,
                        colsample_bytree=0.5)

    xgb.fit(training, cv.ravel())
    XGBtrainscore = xgb.score(training, cv.ravel())  #Train Score

    kf = KFold(len(cv), n_folds=5)  # 5-fold cross-validation
    scores = cross_val_score(xgb, training, cv.ravel(), cv=kf)
    XGBvalidation = abs(scores.mean())

    XGBy_pred = xgb.predict_proba(testing)

    le = LabelEncoder()
    y = le.fit_transform(labels)

    idlist = []  #id list
    listcty = []  #countries list

    for i in range(len(testid)):
        idi = testid[i]
        idlist += [idi] * 5
        listcty += le.inverse_transform(np.argsort(
            XGBy_pred[i])[::-1])[:5].tolist()

    XGBsub = pd.DataFrame(np.column_stack((idlist, listcty)),
                          columns=['id', 'country'])
    XGBsub.to_csv('XGsub_%s.csv' % csvname, index=False)
    print("XGBtrainscore", XGBtrainscore)
    print("XGBvalidation", XGBvalidation)
Code example #3
File: dbMgr.py Project: BennJB/predict_B2
def modeling_RF():
    abc = None  # defined up front so the return below cannot raise NameError
    try:
        df1 = pd.read_csv('last_total.csv', encoding='cp949')

        df_dummy = pd.get_dummies(df1)
        train, test = train_test_split(df_dummy,
                                       test_size=0.2,
                                       random_state=1234)
        train_x = train.drop('target_bool', axis=1)
        train_y = train['target_bool']
        test_x = test.drop('target_bool', axis=1)
        test_y = test['target_bool']

        xgb = XGBClassifier(random_state=1234,
                            learning_rate=0.6000000000000001,
                            max_depth=9,
                            n_estimators=200)
        xgb.fit(train_x, train_y)
        abc = xgb.score(train_x, train_y)

    except Exception as e:
        print(e)
    finally:
        pass

    return abc
Code example #4
    def _XGBoost(self):
        clf = XGBClassifier()
        clf.fit(self.X_train, self.y_train)
        score = clf.score(self.X_test, self.y_test)
        print('Accuracy rate of XGBoost: {0:.3f}'.format(score))
        y_pred = clf.predict_proba(self.X_test)
        ks(y_pred.T[0], self.y_test)  # ks: project-specific helper (presumably a KS-statistic plot)
Code example #5
def train_model(mall_id):
    # Start training the model
    random_state = 10
    metrix, tar = utils.get_data(mall_id)
    x_train, x_test, y_train, y_test = train_test_split(
        metrix, tar, test_size=0.1, random_state=random_state)
    # XGBoost, based on boosted trees
    # with these parameters, training is slow
    clf_name = "xgboost"
    save_dir = "./model/" + clf_name + "_" + mall_id + "_model.m"
    n_est = 50
    clf = XGBClassifier(
        learning_rate=0.1,  # learning rate; typical values 0.01-0.2
        n_estimators=n_est,
        max_depth=5,  # maximum tree depth, usually 3-10
        min_child_weight=1,  # minimum sum of instance weights in a child; larger values guard against overfitting, too large underfits
        gamma=0,  # minimum loss reduction required to split a node; larger values make the algorithm more conservative
        subsample=0.8,  # row-sampling ratio per tree; lower is more conservative, too low underfits; typical 0.5-1
        colsample_bytree=0.8,  # column-sampling ratio per tree
        objective='binary:logistic',  # binary classification
        nthread=4,  # number of threads
        scale_pos_weight=1,  # set to a positive value for faster convergence on highly imbalanced classes
        seed=0)  # random seed, for reproducible results
    print(utils.get_time(), ' ', mall_id, ' starts...')
    train_time = time.time()
    clf.fit(x_train, y_train)
    train_time = time.time() - train_time
    score = clf.score(x_test, y_test)
    joblib.dump(clf, save_dir)
    print(utils.get_time(), ' saved a model for ', mall_id, ' score: ', score,
          '  train time : ', train_time)
    train_time = int(train_time)
    return (score, n_est, train_time)
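The parameter hints in the comments above are usually explored with a grid search; a hedged sketch follows (the ranges below are illustrative assumptions, not values from this project):

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
param_grid = {'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]}
search = GridSearchCV(XGBClassifier(learning_rate=0.1, n_estimators=50,
                                    objective='binary:logistic'),
                      param_grid, cv=3)
search.fit(X, y)
print(search.best_params_, search.best_score_)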
Code example #6
File: hotel_train.py Project: wandaoyi/hotel_pred
    def best_param_xgboost(self,
                           estimator=10,
                           depth=1,
                           lr=0.1,
                           gama=0.1,
                           subsamples=1.0,
                           bytree=0.3,
                           n_thread=1,
                           child_weight=1,
                           seed_num=7):
        best_model = XGBClassifier(n_estimators=estimator,
                                   max_depth=depth,
                                   learning_rate=lr,
                                   gamma=gama,
                                   subsample=subsamples,
                                   colsample_bytree=bytree,
                                   nthread=n_thread,
                                   min_child_weight=child_weight,
                                   seed=seed_num,
                                   objective='binary:logistic')

        best_model.fit(self.x_train, self.y_train)
        y_pred = best_model.predict(self.x_val)
        acc_score = metrics.accuracy_score(self.y_val, y_pred)
        print("acc_score: {}".format(acc_score))
        print("score: {}".format(best_model.score(self.x_val, self.y_val)))

        save_path = self.model_save_path + "acc={:.6f}".format(
            acc_score) + ".m"
        # If a model file with this name already exists, delete it
        if os.path.exists(save_path):
            os.remove(save_path)
            pass

        # Save the model
        joblib.dump(best_model, save_path)

        print("AUC Score: {}".format(metrics.roc_auc_score(self.y_val,
                                                           y_pred)))

        # Plot the ROC curve
        self.plt_roc(best_model)

        pass
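A model saved with joblib.dump as above can be reloaded for inference; a minimal sketch, assuming a recent xgboost where the fitted estimator exposes sklearn's n_features_in_ attribute ('model.m' is a placeholder path):

import joblib
import numpy as np

model = joblib.load("model.m")                    # placeholder path
x_new = np.random.rand(1, model.n_features_in_)   # one synthetic sample
print(model.predict(x_new))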
Code example #7
def decision_tree_algo(original_df: pd.DataFrame):
    """
    Mon propre test du decision tree pour essayer de l'ameliorer et proposer mes idees a Max.
    Je l'ai mis pour montrer comment j'ai fait, mais la fonction pourrait etre ameliorer
    :param original_df: la DF originale sur laquelle construire le decision tree
    :return: rien
    """
    copied_df = original_df.copy()

    data = copied_df.iloc[:, :-1]
    target = copied_df.iloc[:, -1]
    print()
    print("========================================================")
    print("========================================================")
    print("In decision tree algorithm.")
    d1 = dt.datetime.now()
    xtrain, xtest, ytrain, ytest = tts(data, target, train_size=0.8)
    boost = XGBClassifier(max_depth=4, n_estimators=500)
    boost.fit(xtrain, ytrain)
    boost_prediction = boost.predict(xtest)
    print("Score Train:", round(boost.score(xtest, ytest) * 100, 2), " %")
    d2 = dt.datetime.now()
    print("Took ", d2 - d1)
    print("End decision tree algorithm.")

    labels = 'Found', 'Not found'
    hamming = distance.hamming(ytest, boost_prediction)
    rates = [1 - hamming, hamming]
    fig1, ax1 = plt.subplots()
    ax1.pie(rates, labels=labels, autopct='%0.2f%%')
    plt.show()

    plot_tree(boost, rankdir='LR')
    fig = plt.gcf()
    fig.set_size_inches(150, 50)
    # fig.savefig("tree.png")
    plt.show()
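The pie chart relies on the fact that the Hamming distance between two label vectors equals the misclassification rate, so 1 - hamming is the accuracy; a quick standalone check:

import numpy as np
from scipy.spatial import distance
from sklearn.metrics import accuracy_score

y_true = np.array([0, 1, 1, 0, 1])
y_hat = np.array([0, 1, 0, 0, 1])
assert np.isclose(1 - distance.hamming(y_true, y_hat),
                  accuracy_score(y_true, y_hat))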
Code example #8
File: mlp_news.py Project: spinning210/FTIAS
def run():
    company_news = source.get_company_news()

    cross_over_keys, cross_under_keys = source.get_cross_keywords()
    cross_over_keys = cross_over_keys[:128]
    cross_under_keys = cross_under_keys[:128]
    company_news['post_time'] = pd.to_datetime(company_news['post_time']).dt.date
    company_news_train = company_news[int(len(company_news)*0.9):]


    tmp_content = jieba_analyse.cut_to_list(company_news_train)

    done = []
    row_list = []
    label = []  # '-' = price falls, '+' = price rises

    for article in tqdm(tmp_content):
        xx = {}
        tmp_content_score = pd.DataFrame()

        for _, index in cross_over_keys.iterrows():
            if index['key'] in article[1]:
                sss = {index['key']: float(index['weight']) }
            else:
                sss = {index['key']: 0}
            xx.update(sss)
        for _, index in cross_under_keys.iterrows():
            if index['key'] in article[1]:
                sss = {index['key']: float(index['weight']*-1)}
            else:
                sss = {index['key']: 0}
            xx.update(sss)
        sss = {'date': article[0]}
        xx.update(sss)
        row_list.append(xx)

    tmp_content_score = pd.DataFrame(row_list)
    a = tmp_content_score.columns.values.tolist()


    df_score = tmp_content_score.set_index('date')
    print(df_score)

    index_b = source.get_company_higher_lower_index()
    index_b = index_b.set_index('date')
    index_b = index_b['result']

    result = pd.concat([df_score, index_b], axis=1, join='inner')

    print(result)
    print(len(result))
        
    a = result.columns.values.tolist()
    a.pop(len(a)-1)


    Y = result['result'].to_numpy()
    X = result[a].to_numpy()
    validation_size = 0.20
    seed = 7
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed, stratify=Y ) 
    #===============mlp
    print('mlp===============================================================')
    model_MLP = MLPClassifier(hidden_layer_sizes=(256, 256,), max_iter=256)
    model_MLP.fit(X_train, Y_train)
    print(model_MLP.score(X_train, Y_train))

    

    predictions = model_MLP.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))
    print('==================================================================')

    #===============RandomForest
    print('RandomForest======================================================')
    model_RandomForest = RandomForestClassifier()
    model_RandomForest.fit(X_train, Y_train)
    print(model_RandomForest.score(X_train, Y_train))

    predictions = model_RandomForest.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))
    print('==================================================================')

    #===============XGBClassifier
    print('XGBClassifier=====================================================')
    model_XGBClassifier = XGBClassifier()
    model_XGBClassifier.fit(X_train, Y_train)
    print(model_XGBClassifier.score(X_train, Y_train))

    predictions = model_XGBClassifier.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))
    print('==================================================================')

    #===============LogisticRegression
    print('LogisticRegression================================================')
    model_LogisticRegression = LogisticRegression()
    model_LogisticRegression.fit(X_train, Y_train)
    print(model_LogisticRegression.score(X_train, Y_train))

    predictions = model_LogisticRegression.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))
    print('==================================================================')
Code example #9
# Confusion Matrix (pred comes from a model fitted earlier in the notebook)

cm = confusion_matrix(y_test, pred)
plt.figure(figsize=(12, 8))
ax = sns.heatmap(cm, fmt="f", square=True, annot=True, cbar=False)
ax.set_xlabel('Predicted Labels', fontsize=15)
ax.set_ylabel('True Labels', fontsize=15)
plt.show()

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

xgc = XGBClassifier(max_depth=3, random_state=22)
xgc.fit(X_train, y_train)

print("Accuracy of train: ", xgc.score(X_train, y_train))
print("Accuracy of test: ", xgc.score(X_test, y_test))

importances = xgc.feature_importances_
sns.barplot(x=importances, y=X_train.columns)
plt.show()

pred = xgc.predict(X_test)
print(classification_report(y_test, pred))
print("*" * 100, "\n")

# Metrics
print("Precision = {}".format(precision_score(y_test, pred, average='macro')))
print("Recall = {}".format(recall_score(y_test, pred, average='macro')))
print("Accuracy = {}".format(accuracy_score(y_test, pred)))
print("F1 Score = {}\n".format(f1_score(y_test, pred, average='macro')))
Code example #10
    def train(self, train_set, dev_set):
        logger.log('Get features from training set')
        if os.path.exists(train_features_file):
            train_features = np.load(train_features_file)
            _, _, train_labels, _, _ = self.get_minibatch(
                train_set, 0, len(train_set))
        else:
            train_features = None
            train_labels = []
            total_batch = (len(train_set) - 1) // self.batch_size + 1
            for i in tqdm(range(total_batch)):
                minibatch_premise_vectors, minibatch_hypothesis_vectors, minibatch_labels, \
                    minibatch_prem_dep, minibatch_hypo_dep = \
                    self.get_minibatch(train_set, i * self.batch_size, (i+1) * self.batch_size)
                feed_dict = {
                    self.model.premise_x: minibatch_premise_vectors,
                    self.model.hypothesis_x: minibatch_hypothesis_vectors,
                    self.model.y: minibatch_labels,
                    self.model.keep_rate_ph: 1.0
                }
                if 'dep_avg' in self.model_type:
                    feed_dict[self.model.prem_dep] = minibatch_prem_dep
                    feed_dict[self.model.hypo_dep] = minibatch_hypo_dep
                minibatch_features = self.sess.run([self.model.features],
                                                   feed_dict)
                train_features = minibatch_features[0] if train_features is None \
                    else np.concatenate((train_features, minibatch_features[0]))
                train_labels += minibatch_labels

            np.save(train_features_file, train_features)

        logger.log('Get features from dev set')
        if os.path.exists(dev_features_file):
            dev_features = np.load(dev_features_file)
            _, _, dev_labels, _, _ = self.get_minibatch(
                dev_set, 0, len(dev_set))
        else:
            dev_features = None
            dev_labels = []
            total_batch = (len(dev_set) - 1) // self.batch_size + 1
            for i in tqdm(range(total_batch)):
                minibatch_premise_vectors, minibatch_hypothesis_vectors, minibatch_labels, \
                    minibatch_prem_dep, minibatch_hypo_dep = \
                    self.get_minibatch(dev_set, i * self.batch_size, (i+1) * self.batch_size)
                feed_dict = {
                    self.model.premise_x: minibatch_premise_vectors,
                    self.model.hypothesis_x: minibatch_hypothesis_vectors,
                    self.model.y: minibatch_labels,
                    self.model.keep_rate_ph: 1.0
                }
                if 'dep_avg' in self.model_type:
                    feed_dict[self.model.prem_dep] = minibatch_prem_dep
                    feed_dict[self.model.hypo_dep] = minibatch_hypo_dep
                minibatch_features = self.sess.run([self.model.features],
                                                   feed_dict)
                dev_features = minibatch_features[0] if dev_features is None \
                    else np.concatenate((dev_features, minibatch_features[0]))
                dev_labels += minibatch_labels

            np.save(dev_features_file, dev_features)

        tuned_parameters = {'max_depth': [4, 6, 8], 'n_estimators': [100, 200]}

        best_score = 0.
        best_params = []
        for g in ParameterGrid(tuned_parameters):
            clf = XGBClassifier(nthread=24)
            clf.set_params(**g)
            clf.fit(train_features, train_labels)
            score = clf.score(dev_features, dev_labels)
            logger.log('%s: %f' % (str(g), score))
            if best_score < score:
                best_score = score
                best_params = g
                self.clf = clf

        logger.log('Best score: %s %f' % (str(best_params), best_score))
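The manual ParameterGrid loop above scores each candidate on a fixed dev set; the same search can be expressed with GridSearchCV and PredefinedSplit. A sketch with stand-in arrays (the real train/dev features come from the code above):

import numpy as np
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from xgboost import XGBClassifier

train_features, train_labels = np.random.rand(80, 16), np.random.randint(0, 3, 80)
dev_features, dev_labels = np.random.rand(20, 16), np.random.randint(0, 3, 20)

# -1 marks rows that stay in training; 0 marks the single dev fold
fold = np.concatenate([np.full(len(train_features), -1), np.zeros(len(dev_features))])
search = GridSearchCV(XGBClassifier(),
                      {'max_depth': [4, 6, 8], 'n_estimators': [100, 200]},
                      cv=PredefinedSplit(fold), refit=False)
search.fit(np.concatenate([train_features, dev_features]),
           np.concatenate([train_labels, dev_labels]))
print(search.best_params_, search.best_score_)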
Code example #11
def decision_tree():
    print(
        "Microsoft Malware Prediction using a Decision Tree Algorithm (XGBoost)"
    )
    d1 = dt.datetime.now()
    print("Data processing started at", "%02d:%02d" % (d1.hour, d1.minute))

    # Data loading
    with open('../../data/json/datatypes.json') as file:
        dtype = json.load(file)
    df = pd.read_csv('../../data/csv/microsoft-malware.csv', dtype=dtype)

    # Dropping categorical
    binary = []
    categorical = []
    numerical = []

    for key, value in dtype.items():
        if value in ['int8']:
            binary.append(key)
        elif value in ['int16', 'category']:
            categorical.append(key)
        else:
            numerical.append(key)

    categorical.remove('MachineIdentifier')  # already dropped via iloc below
    df = df.drop(columns=list(categorical))

    # Cleaning NaN
    for i in df.columns:
        s = df.loc[:, i]
        if i in numerical:  # set NaNs in numerical features to -1
            s.fillna(-1, inplace=True)
        elif i in binary:  # set NaNs in binary feature to the most frequent one
            s.fillna(s.mode().iloc[0], inplace=True)
        df[i] = s.values
        if df[i].dtype == "int64" or df[i].dtype == "float64":
            df.loc[df[i].value_counts(normalize=True)[df[i]].values < 0.05,
                   i] = -1

    # Splitting dataset
    data = df.iloc[:, 1:-1]  # Dropping MachineIdentifier & HasDetections
    target = df.iloc[:, -1]  # Selecting HasDetections
    xtrain, xtest, ytrain, ytest = tts(data, target, train_size=0.8)

    # Training model
    boost = XGBClassifier(max_depth=2, n_estimators=200)
    boost.fit(xtrain, ytrain)
    boost_prediction = boost.predict(xtest)
    print("Score Train :", round(boost.score(xtest, ytest) * 100, 2), " %")
    d2 = dt.datetime.now()
    print("Took ", d2 - d1)

    # Plotting result
    labels = 'Found', 'Not found'
    hamming = distance.hamming(ytest, boost_prediction)
    rates = [1 - hamming, hamming]
    fig1, ax1 = plt.subplots()
    ax1.pie(rates, labels=labels, autopct='%0.2f%%')
    plt.show()

    # Decision tree
    print('Plotting decision tree')
    plot_tree(boost, rankdir='LR')
    fig = plt.gcf()
    fig.set_size_inches(150, 50)
    # fig.savefig("tree.png")
    plt.show()
Code example #12
File: code.py Project: Mk09878/Digit-Recognizer
images_train.isnull().any().describe()
labels_train.isnull().any().describe()

#from xgboost import XGBClassifier
classifier = XGBClassifier(silent=0,
                           eta=0.1,
                           max_depth=8,
                           subsample=0.75,
                           colsample_bytree=0.75)
classifier.fit(images, labels)

# Predicting the Test set results
y_pred = classifier.predict(test)

#Checking score (Accuracy)
classifier.score(images_test, labels_test)

#Creating the joblib file
dump(classifier, 'xgb.joblib')
"""#Getting the joblib file
classifier = load('random_forest.joblib')"""
#Generating the final dataframe
y_pred = pd.DataFrame(y_pred, columns=['Label'])
y_pred['ImageId'] = pd.Series(data=np.arange(1, 28001), index=y_pred.index)
#y_pred = y_pred.drop(columns = ['ImageId'])
columnsTitles = ["ImageId", "Label"]
y_pred = y_pred.reindex(columns=columnsTitles)

#Exporting the dataframe
Code example #13
def train_model_xgb_cv(X_train, X_test, y_train, y_test):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    xgb_sklearn = XGBClassifier(learning_rate=0.1,
                                n_estimators=300,
                                max_depth=3,
                                min_child_weight=1,
                                gamma=0.3,
                                subsample=0.6,
                                colsample_bytree=0.7,
                                objective='binary:logistic',
                                nthread=4,
                                seed=27,
                                reg_lambda=0.01)

    xgb_params = xgb_sklearn.get_params()
    cvresult = xgb.cv(xgb_params,
                      dtrain,
                      num_boost_round=xgb_params['n_estimators'],
                      nfold=5,
                      metrics='auc',
                      early_stopping_rounds=5)
    n_estimators = cvresult.shape[0]
    print("n_estimators: ", n_estimators)
    xgb_sklearn.set_params(n_estimators=n_estimators)
    xgb_sklearn.fit(np.array(X_train), np.array(y_train), eval_metric='auc')

    pred_y = xgb_sklearn.predict(X_test)
    pred_y_prob = xgb_sklearn.predict_proba(X_test)[:, 1]
    # auc
    auc = roc_auc_score(y_test, pred_y_prob)
    print('AUC: ', auc)
    # error
    score = xgb_sklearn.score(X_test, y_test)
    print('error: ', 1 - score)

    # grid search
    params = {'max_depth': [2, 3, 4, 5, 6, 7, 8]}
    model = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.1,
            n_estimators=300,
            # max_depth=3,
            min_child_weight=1,
            gamma=0.3,
            subsample=0.6,
            colsample_bytree=0.7,
            objective='binary:logistic',
            nthread=4,
            seed=27,
            reg_lambda=0.01),
        param_grid=params,
        cv=2)
    model.fit(np.array(X_train), np.array(y_train), eval_metric='auc')
    print(model.cv_results_, model.best_params_, model.best_score_)

    feat_imp = pd.Series(xgb_sklearn.get_booster().get_fscore(
        fmap='xgb.fmap')).sort_values(ascending=True)
    feat_imp.plot(kind='barh', color='black', legend=False, figsize=(10, 6))
    plt.ylabel('Feature name')
    plt.xlabel('Feature score')
    plt.savefig(
        'C:/Users/Administrator.USER-20161227PQ/Desktop/paper figure/figure5.png',
        dpi=300)
    plt.show()
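get_fscore(fmap='xgb.fmap') above expects a feature-map file that the snippet never writes; here is a sketch of the conventional three-column format (index, name, type), with hypothetical feature names:

def write_fmap(features, path='xgb.fmap'):
    # one line per feature: index <tab> name <tab> type ('q' = quantitative)
    with open(path, 'w') as f:
        for i, feat in enumerate(features):
            f.write('{0}\t{1}\tq\n'.format(i, feat))

write_fmap(['age', 'income', 'clicks'])  # hypothetical feature names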
Code example #14
score = random_forest.score(X, y)

Y_pred = random_forest.predict(X_test)

# In[14]:

#Classifier
xgb = XGBClassifier(max_depth=6,
                    learning_rate=0.3,
                    n_estimators=25,
                    objective='multi:softprob',
                    subsample=0.5,
                    colsample_bytree=0.5,
                    seed=0)
xgb.fit(X, y)
score = xgb.score(X, y)
y_pred = xgb.predict_proba(X_test)

# In[15]:

print(score)

# In[21]:

# for Random forest

#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=1000,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27).fit(X_train, np.ravel(y_train))

print('Accuracy of XGBOOST classifier on training set: {:.2f}'
     .format(xgb1.score(X_train, np.ravel(y_train))))
print('Accuracy of XGBOOST classifier on validation set: {:.2f}'
     .format(xgb1.score(X_val, np.ravel(y_val))))


# **GridSearchCV with XGBoost**

# In[ ]:


grid_values = {'n_estimators': [300], 'learning_rate': [0.05], 'max_depth': [5],
               'min_child_weight': [1], 'colsample_bytree': [0.8],
               'subsample': [0.6], 'gamma': [0]}
clf_xgb_grid = XGBClassifier(seed=2, objective='binary:logistic', nthread=-1,
                             scale_pos_weight=1)

clf_xgb_grid_acc = GridSearchCV(clf_xgb_grid, param_grid=grid_values)
clf_xgb_grid_acc.fit(X_train, np.ravel(y_train))
Code example #16
X_train, X_test, y_train, y_test = train_test_split(trainX,
                                                    trainY,
                                                    test_size=0.5)
del train
del trainTopSiteid
del trainX
del trainY
del listTopSiteids

model_train_siteid = XGBClassifier(n_estimators=10,
                                   nthread=-1,
                                   silent=False,
                                   seed=125,
                                   learning_rate=0.2)
model_train_siteid.fit(X_train, y_train)
model_train_siteid.score(X_test, y_test)

pred = model_train_siteid.predict(testTopSiteid)
testTopSiteid = train_real[train_real['siteid'].isnull()].copy()
testTopSiteid['siteid'] = pred
train_real[train_real['siteid'].isnull()] = testTopSiteid
train_real.to_csv('data\\train_br_dev_site_pp4.csv', index=False)

####################################################
#
# --------------------------- For Test Data
#
####################################################

# --------------------- Loading datasets
Code example #17

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_pred = clf.predict(X_test)
print("Decision Tree Classifier")
print("Train Accuracy:", clf.score(X_train, y_train))
print("Test Accuracy:", metrics.accuracy_score(y_test, clf_pred))
print("")

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
print("Xgboost Classifier")
print("Train Accuracy:", xgb.score(X_train, y_train))
print("Test Accuracy:", metrics.accuracy_score(y_test, xgb_pred))
print("")

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print("Random Forest Classifier")
print("Train Accuracy:", rfc.score(X_train, y_train))
print("Test Accuracy:", metrics.accuracy_score(y_test, rfc_pred))
print("")
Code example #18
tweets_transform = pipe.fit_transform(tweets_tfidf)


send_event("Explained Variance: " + str(pipe.get_params()['svd'].explained_variance_ratio_.sum()))
send_event("Dimension Reduction - Execution time: %s seconds ---" % (time.time() - start_time))
print("Explained Variance: " + str(pipe.get_params()['svd'].explained_variance_ratio_.sum()))
print("Dimension Rediction - Execution time: %s seconds ---" % (time.time() - start_time))

print('Start model training...')
start_time = time.time()
X_train, X_test, y_train, y_test = train_test_split(tweets_transform, y, test_size=0.3)

xgb_model = XGBClassifier(max_depth=5,
                          min_child_weight=5,
                          gamma=0.1,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          scale_pos_weight=1,
                          random_state=10,
                          n_estimators=5000,
                          learning_rate=0.01,
                          n_jobs=-1)


xgb_model.fit(X_train, y_train)

send_event("Test Set Score: " + str(xgb_model.score(X_test, y_test)))
send_event("Train - Execution time: %s seconds ---" % (time.time() - start_time))
print("Test Set Score: " + str(xgb_model.score(X_test, y_test)))
print("Train - Execution time: %s seconds ---" % (time.time() - start_time))
Code example #19
x_train, x_test, y_train, y_test = train_test_split(datasets.data,
                                                    datasets.target,
                                                    train_size=0.8,
                                                    random_state=104)

#2

# model  = GradientBoostingClassifier(max_depth=4)
model = XGBClassifier(n_jobs=-1, use_label_encoder=False)

#3

model.fit(x_train, y_train, eval_metric='mlogloss')

#4

acc = model.score(x_test, y_test)

print(model.feature_importances_)
print('acc : ', acc)
'''
def plot_feature_importances_dataset(model):
    n_features = datasets.data.shape[1]
    plt.barh(np.arange(n_features),model.feature_importances_,
            align='center')
    plt.yticks(np.arange(n_features),datasets.feature_names)
    plt.xlabel("Feature Importances")
    plt.ylim(-1, n_features)

plot_feature_importances_dataset(model)
'''
Code example #20
estimator = XGBClassifier(base_score=0.2,
                          n_estimators=200,
                          seed=random_seed,
                          max_depth=8)

# In[29]:

estimator.fit(dfTr2model[columns], dfTr2model.Cod_Prod)

# **Evaluation of the test data**
#
# In order to observe the results of the test predictions, the trained classifier is evaluated on the subset of test data.

# In[30]:

tsScore = estimator.score(dfTs2eval[columns], dfTs2eval.Cod_Prod)

print("Score obtained in test: " + str(tsScore))

# <a id="predict"> </a>
# ## **Prediction**
#
# We make the prediction of the future products to be hired by the customers of the test dataset.

# In[31]:

Cod_Prod_predicted = estimator.predict(dfTs2predict[columns])

# **Creation of the results dataframe**
#
# In the next cell, the creation of a dataframe with the customer's ID and the product code to be purchased is carried out.
Code example #21
Lr_predicted = Lr.predict(x_test)

clf = neighbors.KNeighborsClassifier()
clf.fit(x_train, y_train)
clf_predicted = clf.predict(x_test)

svc_linear = SVC()
svc_linear.fit(x_train, y_train)
svc_linear_predicted = svc_linear.predict(x_test)

gaussian = GaussianNB()
gaussian.fit(x_train, y_train)
gauss_predicted = gaussian.predict(x_test)

#Calculate the accuracy
print("XGB Classifier accuracy :", model.score(x_test, y_test),
      confusion_matrix(y_test, model_predicted),
      classification_report(y_test, model_predicted))
print("Random Forest Classifier accuracy :", Rf.score(x_test, y_test),
      confusion_matrix(y_test, Rf_predicted),
      classification_report(y_test, Rf_predicted))
print("Logistic Regression accuracy :", Lr.score(x_test, y_test),
      confusion_matrix(y_test, Lr_predicted),
      classification_report(y_test, Lr_predicted))
print("KNeighborsClassifier accuracy :", clf.score(x_test, y_test),
      confusion_matrix(y_test, clf_predicted),
      classification_report(y_test, clf_predicted))
print("SVC accuracy :", svc_linear.score(x_test, y_test),
      confusion_matrix(y_test, svc_linear_predicted),
      classification_report(y_test, gauss_predicted))
print("GaussianNB accuracy :", gaussian.score(x_test, y_test),
Code example #22
from xgboost.sklearn import XGBClassifier

X_train, X_test, y_train, y_test = train_test_split(data,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

clf = DecisionTreeClassifier()
#we have to define max_depth to prevent overfitting
clf.fit(X_train, y_train)
print("Train Accuracy of clf:", clf.score(X_train, y_train))
print("Test Accuracy of clf", clf.score(X_test, y_test))

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
print("Train Accuracy of xgb:", xgb.score(X_train, y_train))
print("Test Accuracy of xgb:", xgb.score(X_test, y_test))

#%%
from sklearn.model_selection import GridSearchCV

#GridSearch on Xgboost Classifier
param_dict = {
    'max_depth': [2, 3, 4],
    'min_child_weight': [1, 2, 6],
    'learning_rate': [0.00001, 0.001, 0.01, 0.1],
    'n_estimators': [10, 50, 100]
}

xgb_ = GridSearchCV(xgb, param_dict, cv=3, n_jobs=-1).fit(X_train, y_train)
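Continuing from the fitted search above, the tuned model is typically inspected and evaluated like this (standard GridSearchCV attributes):

print(xgb_.best_params_, xgb_.best_score_)
best_model = xgb_.best_estimator_   # refit on the full training set by default
print("Test Accuracy of tuned xgb:", best_model.score(X_test, y_test))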
Code example #23
# classification report
cr = classification_report(y_valid, y_pred_rf)
print(cr)

# modelling
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

model_xgb = XGBClassifier()
model_xgb.fit(x_train, y_train)

y_pred_xgb = model_xgb.predict(x_valid)

# evaluating the model
print("Training Accuracy :", model_xgb.score(x_train, y_train))
print("Validation Accuracy :", model_xgb.score(x_valid, y_valid))

# confusion matrix
cm = confusion_matrix(y_valid, y_pred_xgb)
print(cm)

# classification report
cr = classification_report(y_valid, y_pred_xgb)
print(cr)

# blending the predictions of the two models with a weighted average

boosted_predictions = 0.4 * y_pred_rf + 0.6 * y_pred_xgb
boosted_predictions
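Averaging hard 0/1 labels as above yields fractional values; a common alternative (a sketch, not the original author's code) blends predicted probabilities and then thresholds. model_rf is a hypothetical name for the random-forest model fitted earlier:

proba_rf = model_rf.predict_proba(x_valid)[:, 1]    # model_rf: assumed earlier RF model
proba_xgb = model_xgb.predict_proba(x_valid)[:, 1]
blended = 0.4 * proba_rf + 0.6 * proba_xgb
blended_labels = (blended >= 0.5).astype(int)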
Code example #24
##param_test7 = {
## 'reg_alpha':[1e-7, 1e-6, 0.05e-5, 1e-5, 1e-4, 0.5e-4]
##}
##gsearch7 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=234, max_depth=9,
## min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
## objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),
## param_grid = param_test7, scoring='f1_macro',n_jobs=4,iid=False, cv=5)
##gsearch7.fit(train_data[predictors],train_data[target])
##print(gsearch7.cv_results_, gsearch7.best_params_, gsearch7.best_score_)

xgb_model = XGBClassifier(learning_rate=0.1,
                          n_estimators=175,
                          max_depth=9,
                          min_child_weight=2,
                          gamma=0.0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          reg_alpha=5e-05,
                          objective='binary:logistic',
                          nthread=4,
                          scale_pos_weight=1,
                          seed=27)
xgb_model.fit(data, data_label)

print(xgb_model)

print("score : " + str(xgb_model.score(train_data, train_label)))
pred = pd.DataFrame(xgb_model.predict(test_data))
print("Accuracy : " + str(metrics.accuracy_score(test_label, pred)))
print("F1 score : " + str(metrics.f1_score(test_label, pred)))
Code example #25
class XGBoostModel(BaseModel):
    """RandomForest classifier."""
    def __init__(self,
                 max_depth=3,
                 learning_rate=0.1,
                 n_estimators=100,
                 objective="binary:logistic",
                 booster='gbtree',
                 silent=True,
                 n_jobs=1,
                 gamma=0,
                 min_child_weight=1,
                 max_delta_step=0,
                 subsample=1,
                 colsample_bytree=1,
                 colsample_bylevel=1,
                 reg_alpha=0,
                 reg_lambda=1,
                 scale_pos_weight=1,
                 base_score=0.5,
                 random_state=0,
                 missing=None):
        """"""
        super(XGBoostModel).__init__()
        self.model = XGBClassifier(max_depth=max_depth,
                                   learning_rate=learning_rate,
                                   n_estimators=n_estimators,
                                   silent=silent,
                                   objective=objective,
                                   booster=booster,
                                   n_jobs=n_jobs,
                                   gamma=gamma,
                                   min_child_weight=min_child_weight,
                                   max_delta_step=max_delta_step,
                                   subsample=subsample,
                                   colsample_bytree=colsample_bytree,
                                   colsample_bylevel=colsample_bylevel,
                                   reg_alpha=reg_alpha,
                                   reg_lambda=reg_lambda,
                                   scale_pos_weight=scale_pos_weight,
                                   base_score=base_score,
                                   random_state=random_state,
                                   missing=missing)

    def predict(self, features):
        super().predict(features)
        labels = self.model.predict(features)
        return labels

    def predict_prob(self, features):
        super().predict_prob(features)
        probs = self.model.predict_proba(features)
        return probs

    def predict_log_prob(self, features):
        super().predict_log_prob(features)
        log_probs = np.log(self.model.predict_proba(features))
        return log_probs

    def train(self, features, targets):
        super().train(features, targets)
        start = time.time()
        self.model.fit(X=features, y=targets)
        print('Finished, time %s' % (time.time() - start))

    def accuracy_score(self, features, targets):
        super().accuracy_score(features, targets)
        score = self.model.score(features, targets)
        return score

    def abs_errors(self, features, targets):
        targets_pred = self.predict(features)
        result = abs(targets_pred - targets)
        return result

    def rmse_score(self, y_pred, y_true):
        """
        Compute an RMSE-based score. To reflect that classes 0, 1 and 2 have
        different importance, mistakes on classes 1 and 2 are penalised more:
        the squared errors are weighted by factors 1, 2 and 2.5 respectively.
        np.average((y_true - y_pred) ** 2, axis=0, weights=weights)
        :param y_pred: predicted labels
        :param y_true: true labels
        :return: score
        """
        weight_dict = {0: 1, 1: 2, 2: 2.5}  # misclassification penalty weight per class
        weights = [weight_dict[l] for l in y_true]
        mse = np.average((y_true - y_pred)**2, axis=0, weights=weights)
        score = 1 / (1 + np.sqrt(mse))
        return score
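A standalone check of the weighted-RMSE formula in rmse_score, with made-up labels; the single mistake here falls on a class-2 sample and is penalised by weight 2.5:

import numpy as np

weight_dict = {0: 1, 1: 2, 2: 2.5}
y_true = np.array([0, 2, 2])
y_pred = np.array([0, 1, 2])
weights = [weight_dict[l] for l in y_true]
mse = np.average((y_true - y_pred) ** 2, axis=0, weights=weights)
print(1 / (1 + np.sqrt(mse)))  # ~0.61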
Code example #26
class Example_XGB:
    def __init__(self, filePath, cols):
        mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font so plots can render Chinese labels
        mpl.rcParams['axes.unicode_minus'] = False  # keep the minus sign from rendering as a box in saved figures
        # Read the input table
        self.all = pd.read_csv(filePath, encoding='UTF-8')
        own_feature = self.all.columns.values  # features present in the dataset, including the label
        self.feature_cols = self.get_feature(
            cols, own_feature)  # cols: requested features; feature_cols: their intersection with own_feature
        # if self.feature_cols == "err":
        #     err = "err&model loading failed or the test file could not be read!"
        self.y_pred = []

        self.model = XGBClassifier()
        print("Initialisation complete...")

    def split_file(self, test_file_path, train_file_path):
        if len(self.feature_cols) == 0:
            return "err"
        X = self.all
        y = X.pop(Label)  # pop() removes the label column and returns it
        X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
            X, y, test_size=0.3)

        X_test = X_test[self.feature_cols]
        X_train = X_train[self.feature_cols]

        train_data = pd.concat([X_train, Y_train], axis=1)
        train_data.to_csv(train_file_path,
                          index=False,
                          encoding='UTF-8',
                          mode='w')  # training-set file
        test_data = pd.concat([X_test, Y_test], axis=1)
        test_data.to_csv(test_file_path,
                         index=False,
                         encoding='UTF-8',
                         mode='w')  # test-set file

        return ""

    # Feature handling
    def get_feature(self, feature1, feature2):
        # Set arithmetic to intersect the requested features with the available ones
        feature3 = (set(feature1) - set(feature2))  # requested but unavailable features
        feature4 = list(set(feature1) - feature3)  # requested and available features
        if len(feature4) <= 1:
            return "err"
        else:
            return feature4

        # Produces the tree plot and the bar chart
        # filePath: file path
        # feature_cols: feature labels

    def process(self, train, path1, path2):
        # Data preparation and model invocation
        # self.plot_feature_importance(train, path2)
        fmap_filename = "picture/xgb_2.fmap"
        self.tree_pic(self.feature_cols, fmap_filename, path1)
        self.plot_feature_importance(train, path2)

    def train_model(self, train_file, model_file):
        train_data = pd.read_csv(train_file)
        x_train = train_data
        y_train = x_train.pop(Label)
        self.model.fit(x_train, y_train)
        if not os.path.exists(model_file):
            f = open(model_file, mode='ab')
        else:
            f = open(model_file, mode="wb")
        pickle.dump(self.model, f)  # save the model

        return x_train

    def result(self, all_file, model_file, test_file_path, img_path):
        if model_file != "" or all_file == "" or test_file_path == "":
            f = open(model_file, 'rb')
            self.model = pickle.load(f)  # 读取模型
        else:
            return "err"

        train_data = pd.read_csv(test_file_path)
        x_test = train_data
        y_test = x_test.pop(Label)

        y_pred = self.model.predict(x_test)  # run the model on the test set

        # Compute the evaluation metric
        accuracy = self.model.score(x_test, y_test)
        accuracy = '%.4f%%' % (accuracy * 100)
        ret = "Accuracy: {0}".format(accuracy)
        print("ret:", ret)

        y_pred = y_pred.reshape(y_pred.shape[0], 1)
        res = pd.read_csv(test_file_path, encoding='utf-8')
        # a = pd.read_csv(all_file, encoding='utf-8')
        # res = a.loc[y_test.index]
        res['pred'] = y_pred
        res = res.iloc[0:500]
        res.to_csv(img_path, mode='w', index=False, encoding='UTF-8')
        return ret

    def tree_pic(self, features, fmap_filename, path_1):
        outfile = open(fmap_filename, 'w')
        i = 0
        for feat in features:
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
            i = i + 1
        outfile.close()
        from xgboost import plot_tree
        plot_tree(self.model, num_trees=0, fmap=fmap_filename)
        fig = plt.gcf()
        fig.set_size_inches(15, 10)
        fig.savefig(path_1)
        # im = Image.open(path_1)
        # im.show()

    def plot_feature_importance(self, x_train, path2):
        plt.clf()  # clear the current figure
        feat_labels = x_train.columns
        importances = self.model.feature_importances_
        indices = np.argsort(importances)[::-1]
        for f in range(x_train.shape[1]):
            print("%2d)  %-*s  %f" %
                  (f + 1, 30, feat_labels[f], importances[indices[f]]))
        plt.title('Feature importance analysis', fontsize=18)
        plt.bar(range(x_train.shape[1]),
                importances[indices],
                color='lightblue',
                align='center')
        font2 = {'size': 18}
        plt.xlabel(u'Feature', font2)
        plt.ylabel(u'Importance', font2)
        plt.xticks(range(x_train.shape[1]),
                   feat_labels,
                   rotation=0,
                   fontsize=16)
        plt.yticks(fontsize=18)
        plt.xlim([-1, x_train.shape[1]])
        plt.tight_layout()
        plt.savefig(path2)
Code example #27
for clf, label in zip(
    [clf1, clf2, clf3, eclf],
    ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
    scores = cross_validation.cross_val_score(clf,
                                              X,
                                              y,
                                              cv=5,
                                              scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
          (scores.mean(), scores.std(), label))

##################################

xgbc = XGBClassifier(learning_rate=0.1)
xgbc.fit(X_trainval, y_trainval)

print("XGBoost预测准确率:", xgbc.score(X_test, y_test))  # 0.7872340425531915

rfc = RandomForestClassifier()
rfc.fit(X_trainval, y_trainval)
y_predst = rfc.predict(X_test)
# accuracy = accuracy_score(y_test, y_predst)
# print("Random forest accuracy: %.2f%%" % (accuracy*100.0))

joblib.dump(rfc, 'xgb_down_model.joblib')  # saves the random forest, despite the 'xgb' filename

predd_xgb = pd.DataFrame({'y_test': y_test, 'y_pred': y_predst})
predd_xgb.to_csv('xgb_predd.csv')

print("随机森林预测准确率:", rfc.score(X_test, y_test))  # 0.7811550151975684

########################################################################
Code example #28
class Trainer(object):
    def __init__(self, actions, c_config, c_i, data_manager):
        self.actions = actions
        self.c_config = c_config
        self.c_i = c_i
        self.data_manager = data_manager

        self.method = None
        # param = self.getparams()
        if c_i == 0:
            self.method = RandomForestClassifier(
                n_estimators=int(actions[0]),
                max_depth=int(actions[1]),
                min_samples_split=int(actions[2]),
                min_samples_leaf=int(actions[3]),
                max_features=actions[4],
                bootstrap=True,
                n_jobs=-1)
        elif c_i == 1:
            self.method = XGBClassifier(max_depth=int(actions[0]),
                                        learning_rate=float(actions[1]),
                                        n_estimators=int(actions[2]),
                                        gamma=float(actions[3]),
                                        min_child_weight=int(actions[4]),
                                        subsample=float(actions[5]),
                                        colsample_bytree=float(actions[6]),
                                        colsample_bylevel=float(actions[7]),
                                        reg_alpha=float(actions[8]),
                                        reg_lambda=float(actions[9]),
                                        nthread=-1)
        else:
            assert False, "Trainer.__init__: unexpected classifier index!"

    def getparams(self):
        param = []
        key_value = self.c_config.methods_dict[self.c_i][1:]
        assert len(
            self.actions) == len(key_value), "Trainer.getparams: dimensions must match!"
        for i in range(len(key_value)):
            param.append(key_value[i][1][self.actions[i]])
        return param

    def run(self):
        self.fit()
        accuracy = self.estimate()
        return accuracy

    # Training variant that evaluates with cross-validation
    def run_CV(self):
        results = cross_val_score(self.method,
                                  self.data_manager.data_cv['data_cv'],
                                  self.data_manager.data_cv['labels_cv'],
                                  cv=2,
                                  n_jobs=1)
        accuracy = np.mean(results)
        return accuracy

    def fit(self):
        self.method.fit(self.data_manager.data_cv['data_cv'],
                        self.data_manager.data_cv['labels_cv'])

    def predict(self, x):
        return self.method.predict(x)

    def estimate(self):
        return self.method.score(self.data_manager.data_cv["data_test"],
                                 self.data_manager.data_cv["labels_test"])
Code example #29
            clf = XGBClassifier(
                learning_rate=0.1,  # learning rate; typical values 0.01-0.2
                n_estimators=n_est,
                max_depth=5,  # maximum tree depth, usually 3-10
                min_child_weight=1,  # minimum sum of instance weights in a child; larger values guard against overfitting, too large underfits
                gamma=0,  # minimum loss reduction required to split a node; larger values make the algorithm more conservative
                subsample=0.8,  # row-sampling ratio per tree; lower is more conservative, too low underfits; typical 0.5-1
                colsample_bytree=0.8,  # column-sampling ratio per tree
                objective='binary:logistic',  # binary classification
                nthread=4,  # number of threads
                scale_pos_weight=1,  # set to a positive value for faster convergence on highly imbalanced classes
                seed=0)  # random seed, for reproducible results
            print(get_time(), ' ', mall_id, ' starts...')
            train_time = time.time()
            clf.fit(x_train, y_train)
            train_time = time.time() - train_time
            print('time : ', train_time)
            score = clf.score(x_test, y_test)
            joblib.dump(clf, save_dir)
            print(get_time(), ' saved a model for ', mall_id, ' score: ',
                  score)
            train_time = int(train_time)
            sql = "UPDATE scores SET xgb='{s}',xgb_itr_times={t}, xgb_train_time={tt} WHERE mall_id='{m}'".format(
                s=score, t=n_est, m=mall_id, tt=train_time)
            cur.execute(sql)
            conn.commit()
            # print('test done... spent time = {s}'.format(s=get_time() - time))
        else:
            print(mall_id, ' has already been handled.')
    cur.close()
    conn.close()
Code example #30
random_forest = RandomForestClassifier(random_state=1, n_estimators=45,
                                       min_samples_split=3, min_samples_leaf=2)

random_forest.fit(X, y)
score = random_forest.score(X, y)

Y_pred = random_forest.predict(X_test)


# In[14]:

#Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5,
                    colsample_bytree=0.5, seed=0)
xgb.fit(X, y)
score = xgb.score(X, y)
y_pred = xgb.predict_proba(X_test)


# In[15]:

print(score)


# In[21]:

# for Random forest


#Taking the 5 classes with highest probabilities
Code example #31
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_valid = scalar.transform(X_valid)
test_X = scalar.transform(test_X)

from xgboost.sklearn import XGBClassifier
from sklearn.metrics import roc_auc_score

modelXG = XGBClassifier()
modelXG.fit(X_train, Y_train)

Y_predXG = modelXG.predict(X_valid)

print("Train Accuracy: ", modelXG.score(X_train, Y_train))
print("Validation Accuracy: ", modelXG.score(X_valid, Y_valid))

print("AUROC Score of XGBoost = ", roc_auc_score(Y_valid, Y_predXG))

from sklearn.ensemble import RandomForestClassifier

modelRF = RandomForestClassifier()
modelRF.fit(X_train, Y_train)

Y_predRF = modelRF.predict(X_valid)

print("Train Accuracy: ", modelRF.score(X_train, Y_train))
print("Validation Accuracy: ", modelRF.score(X_valid, Y_valid))

print("AUROC Score of Random Forest = ", roc_auc_score(Y_valid, Y_predRF))