Example #1
Without much preprocessing or parameter tuning, a simple LGBMClassifier should already work decently.
"""

# Split into training and test data
enc = LabelEncoder()
label_encoded = enc.fit_transform(label)
X_train, X_test, y_train, y_test = train_test_split(tsne_data,
                                                    label_encoded,
                                                    random_state=3)

# Create the model
lgbm = LGBMClassifier(n_estimators=500, random_state=3)
lgbm = lgbm.fit(X_train, y_train)

# Test the model
score = accuracy_score(y_true=y_test, y_pred=lgbm.predict(X_test))
print('Accuracy on test set:\t{:.4f}\n'.format(score))
"""With a basic untuned model the **activity of the smartphone user** can be predicted with an **accuracy of 95%.**<br>
This is pretty striking regarding six equally distributed labels.

**Summary:**<br>
If the smartphone or an App wants to know what you are doing, this is feasible.

## <a id=5>Participant Exploration</a>
### <a id=5.1>How Well Can the Participants Be Separated?</a>

As we saw in the second t-SNE plot, the separability of the participants seems to vary with the activity. Let us investigate this by fitting the same basic model to the data of each activity separately (see the sketch after this excerpt).
"""

# Store the data
data = []
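
The excerpt breaks off here. A minimal sketch of the per-activity loop it leads into, assuming NumPy arrays and a `subject` array holding the participant id per row (not shown in this excerpt), could look like this:

import numpy as np

# Hypothetical sketch of the per-activity separability check.
for activity in np.unique(label):
    mask = label == activity
    X_tr, X_te, y_tr, y_te = train_test_split(tsne_data[mask],
                                              subject[mask],  # assumption: participant ids
                                              random_state=3)
    model = LGBMClassifier(n_estimators=500, random_state=3)
    acc = accuracy_score(y_te, model.fit(X_tr, y_tr).predict(X_te))
    data.append({'activity': activity, 'accuracy': acc})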
Example #2
                       random_state=19,
                       max_depth=4,
                       num_leaves=30,
                       objective='binary',
                       learning_rate=0.01,
                       colsample_bytree=1,
                       subsample=1,
                       verbose=-1)
    }
    stack_train, stack_test = stack(k=5,  # stack() is not shown here; see the sketch after this example
                                    models=models,
                                    train_X=train_X,
                                    train_y=train_y,
                                    test_X=test_X)
    # Main training Process
    lgb_stack = LGBMClassifier(n_estimators=2000,
                               silent=False,
                               random_state=19,
                               max_depth=4,
                               num_leaves=20,
                               objective='binary',
                               learning_rate=0.005,
                               colsample_bytree=1,
                               subsample=1,
                               verbose=-1).fit(stack_train, train.label)
    stack_pred = lgb_stack.predict(stack_test)
    pd.DataFrame({
        "a": stack_pred
    }).to_csv(PATH + "/submission.csv",  # forward slash avoids the invalid "\s" escape in the original path
              header=None,
              index=None)  # save submission to PATH
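
The `stack` helper called above is not part of this excerpt. A minimal sketch of what such a k-fold stacking routine typically does, assuming NumPy arrays and a binary target (a hypothetical reimplementation, not the original helper):

import numpy as np
from sklearn.model_selection import KFold

def stack(k, models, train_X, train_y, test_X):
    # Out-of-fold predictions become the train meta-features;
    # fold-averaged test predictions become the test meta-features.
    stack_train = np.zeros((len(train_X), len(models)))
    stack_test = np.zeros((len(test_X), len(models)))
    for m, model in enumerate(models.values()):
        kf = KFold(n_splits=k, shuffle=True, random_state=19)
        for train_idx, valid_idx in kf.split(train_X):
            model.fit(train_X[train_idx], train_y[train_idx])
            stack_train[valid_idx, m] = model.predict_proba(train_X[valid_idx])[:, 1]
            stack_test[:, m] += model.predict_proba(test_X)[:, 1] / k
    return stack_train, stack_test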
Example #3
y = dataset.target

x_train, x_test, y_train, y_test = tts(x, y, train_size=0.8, random_state=66)

lgbm = LGBMClassifier(n_estimators=100, learning_rate=0.1, n_jobs=-1)

lgbm.fit(x_train,
         y_train,
         verbose=True,
         eval_metric=["logloss", "rmse"],
         eval_set=[(x_train, y_train), (x_test, y_test)],
         early_stopping_rounds=20)

#rmse,mae,logloss,error,auc

y_pre = lgbm.predict(x_test)

r2 = r2_score(y_test, y_pre)
score = lgbm.score(x_test, y_test)
print(__file__)
print("r2")
print(r2)
print("score")
print(score)

# 6) SelectFromModel (the loop is cut off here; a sketch follows this example)

thresholds = np.sort(lgbm.feature_importances_)

idx_max = -1
max = r2
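
The snippet is cut off at this point. A plausible continuation of the SelectFromModel loop it sets up, mirroring the pattern in Example #18 below (a sketch, not the original code):

from sklearn.feature_selection import SelectFromModel

for i, thresh in enumerate(thresholds):
    # Keep only the features whose importance reaches the current threshold.
    selection = SelectFromModel(lgbm, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    sel_model = LGBMClassifier(n_estimators=100, learning_rate=0.1, n_jobs=-1)
    sel_model.fit(select_x_train, y_train)
    sel_r2 = r2_score(y_test, sel_model.predict(select_x_test))
    if sel_r2 > max:  # `max` and `idx_max` track the best threshold, as set up above
        max, idx_max = sel_r2, i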
Example #4
lgbmBO = BayesianOptimization(lt.lgbm_evaluate, {  # lt.lgbm_evaluate is not shown; see the sketch after this example
    'min_child_weight': (0.01, 1),
    'learning_rate': (1, 10),
    'max_depth': (-1, 15),
    'num_leaves': (5, 50)
},
                              random_state=3)

lgbmBO.maximize(init_points=3, n_iter=10)

# In[24]:

params = lt.clean_param(lgbmBO.res['max']['max_params'])
lgbm_model = LGBMClassifier(**params)
lgbm_model.fit(x_pci_train, y_pci_train)
y_pci_pred = lgbm_model.predict(x_pci_test)
predictions = [round(value) for value in y_pci_pred]
accuracy = accuracy_score(y_pci_test, predictions)
print(accuracy)

# In[13]:

params = {
    'learning_rate': 0.099387,
    'max_depth': 14,
    'min_child_weight': 0,
    'num_leaves': 5
}
lgbm_model = LGBMClassifier(**params)
lgbm_model.fit(x_pci_train, y_pci_train)
y_pci_pred = lgbm_model.predict(x_pci_test)
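
`lt.lgbm_evaluate` and `lt.clean_param` come from a helper module that is not shown. A hypothetical evaluate function returning a cross-validated score; the odd `learning_rate` bounds of (1, 10) above suggest `clean_param` rescales them, here assumed to be a division by 100:

from sklearn.model_selection import cross_val_score

def lgbm_evaluate(min_child_weight, learning_rate, max_depth, num_leaves):
    # Hypothetical sketch: cast/rescale the raw values the optimizer proposes.
    model = LGBMClassifier(min_child_weight=min_child_weight,
                           learning_rate=learning_rate / 100,  # assumed rescaling
                           max_depth=int(round(max_depth)),
                           num_leaves=int(round(num_leaves)),
                           random_state=3)
    return cross_val_score(model, x_pci_train, y_pci_train,
                           scoring='roc_auc', cv=3).mean()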
Example #5
#%% Modeling
# xgboost
xgb = XGBClassifier()
xgb.fit(tfidf_train, y_train.values.ravel())
xgb_pred = xgb.predict(tfidf_test)
print("xgboost")
print(" Accuracy: ", accuracy_score(y_test, xgb_pred))
print(" Precision: ", precision_score(y_test, xgb_pred, pos_label='1'))
print(" Recall: ", recall_score(y_test, xgb_pred, pos_label='1'))
print(" F-measure: ", f1_score(y_test, xgb_pred, pos_label='1'))

# GBDT
gbr = GradientBoostingClassifier()
gbr.fit(tfidf_train, y_train.values.ravel())
gbr_pred = gbr.predict(tfidf_test)
print("GBDT")
print(" Accuracy: ", accuracy_score(y_test, gbr_pred))
print(" Precision: ", precision_score(y_test, gbr_pred, pos_label='1'))
print(" Recall: ", recall_score(y_test, gbr_pred, pos_label='1'))
print(" F-measure: ", f1_score(y_test, gbr_pred, pos_label='1'))

# LightGBM
lgbm = LGBMClassifier()
lgbm.fit(tfidf_train, y_train.values.ravel())
lgbm_pred = lgbm.predict(tfidf_test)
print("LightGBM")
print(" Accuracy: ", accuracy_score(y_test, lgbm_pred))
print(" Precision: ", precision_score(y_test, lgbm_pred, pos_label='1'))
print(" Recall: ", recall_score(y_test, lgbm_pred, pos_label='1'))
print(" F-measure: ", f1_score(y_test, lgbm_pred, pos_label='1'))
Example #6
# %%
# Shuffle the data
state = np.random.get_state()
np.random.shuffle(data_x)
np.random.set_state(state)
np.random.shuffle(data_y)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data_x,
                                                    data_y,
                                                    test_size=0.3)

# Convert to LightGBM Dataset format (note: these Dataset objects are not used by the sklearn-API model below)
train_data = lgb.Dataset(X_train, label=y_train)
validation_data = lgb.Dataset(X_test, label=y_test)

lgbm_model = LGBMClassifier(boosting_type='gbdt',
                            num_leaves=300,
                            max_depth=-1,
                            learning_rate=0.03,
                            n_estimators=100,
                            subsample_for_bin=200000,
                            objective='binary')
lgbm_model.fit(X_train, y_train)
# Apply the fitted LightGBM model to the training and test sets to make predictions
y_train_pred = lgbm_model.predict_proba(X_train)[:, 1]  # probabilities, not hard labels, for a meaningful AUC
y_test_pred = lgbm_model.predict_proba(X_test)[:, 1]

print('Train set: {:.4f}'.format(roc_auc_score(y_train, y_train_pred)))
print('Test set: {:.4f}'.format(roc_auc_score(y_test, y_test_pred)))
Example #7
    eval_metric=lgb_f1,  # custom F1 eval metric; a sketch follows this example
    early_stopping_rounds=100,
    verbose=10,
)

print('best score', lgb.best_score_)

# ==============================================================
# Train on all of the train data with the tuned number of rounds, then predict on the test data
# ==============================================================
print("=" * 25)
print('predicting')

lgb.n_estimators = lgb.best_iteration_
lgb.fit(all_train_x, all_train_y)
test_y = lgb.predict(test_x)


# ==============================================================
# Create the submission.csv file
# ==============================================================
print("=" * 25)
print("submission file")
print("=" * 25)

df_sub = pd.concat([df_test['sid'], pd.Series(test_y)], axis = 1)
df_sub.columns = ['sid', 'label']
df_sub.to_csv('/Users/zfwang/project/mlproj/projects/move_ad_fraud/submission_file/submit-{}.csv' \
    .format(datetime.now().strftime('%m%d_%H%M%S')), sep = ',', index = False)
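
`lgb_f1` is defined elsewhere in this script. A minimal sketch of such a custom F1 metric for LightGBM's sklearn API, which passes predicted probabilities and expects a `(name, value, is_higher_better)` tuple:

import numpy as np
from sklearn.metrics import f1_score

def lgb_f1(y_true, y_pred):
    # y_pred arrives as probabilities; threshold at 0.5 for a binary F1.
    return 'f1', f1_score(y_true, np.round(y_pred)), True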

Example #8
from sklearn.metrics import *

dia = pd.read_csv("10.1 diabetes.csv.csv")
df = dia.copy()
df = df.dropna()
y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)
#X=df["Pregnancies"]
X = pd.DataFrame(X)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=42)

lgbm_model = LGBMClassifier().fit(X_train, y_train)
y_pred = lgbm_model.predict(X_test)
print(accuracy_score(y_test, y_pred))
lgbm_params = {
    'n_estimators': [100, 500, 1000, 2000],
    'subsample': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.1, 0.01, 0.02, 0.05],
    'min_child_samples': [20, 5, 10]
}
lgbm = LGBMClassifier()
lgbm_cv_model = GridSearchCV(lgbm, lgbm_params, cv=10, n_jobs=-1, verbose=2)
lgbm_cv_model.fit(X_train, y_train)
print("En iyi paramereler:" + str(lgbm_cv_model.best_params_))
xgb = LGBMClassifier(learning_rate=0.01,
                     n_estimators=500,
                     max_depth=3,
Example #9
class LGBBaseline(BaseBaseline):
    def __init__(self):
        super(LGBBaseline, self).__init__(name="lgb")

    def fit(self, X_train, y_train, X_val, y_val, categoricals=None):
        results = dict()

        self.num_classes = len(np.unique(y_train))
        self.config["num_class"] = self.num_classes

        self.all_nan = np.all(np.isnan(X_train), axis=0)
        X_train = X_train[:, ~self.all_nan]
        X_val = X_val[:, ~self.all_nan]

        X_train = np.nan_to_num(X_train)
        X_val = np.nan_to_num(X_val)

        early_stopping = 150 if X_train.shape[0] > 10000 else max(
            round(150 * 10000 / X_train.shape[0]), 10)
        self.config["early_stopping_rounds"] = early_stopping

        categoricals = [
            ind for ind in range(X_train.shape[1])
            if isinstance(X_train[0, ind], str)
        ]
        # encode_categoricals is defined elsewhere; a sketch follows this example
        X_train, X_val, self.encode_dicts = encode_categoricals(
            X_train, X_val, encode_dicts=None)

        self.model = LGBMClassifier(**self.config)
        self.model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

        pred_train = self.model.predict_proba(X_train)
        pred_val = self.model.predict_proba(X_val)

        # This fixes a bug
        if self.num_classes == 2:
            pred_train = pred_train.transpose()[0:len(y_train)]
            pred_val = pred_val.transpose()[0:len(y_val)]

        results["val_preds"] = pred_val.tolist()
        results["labels"] = y_val.tolist()

        pred_train = np.argmax(pred_train, axis=1)
        pred_val = np.argmax(pred_val, axis=1)

        results["train_acc"] = metrics.accuracy_score(y_train, pred_train)
        results["train_balanced_acc"] = metrics.balanced_accuracy_score(
            y_train, pred_train)
        results["val_acc"] = metrics.accuracy_score(y_val, pred_val)
        results["val_balanced_acc"] = metrics.balanced_accuracy_score(
            y_val, pred_val)

        return results

    def score(self, X_test, y_test):
        results = dict()

        y_pred = self.predict(X_test)

        results["test_acc"] = metrics.accuracy_score(y_test, y_pred)
        results["test_balanced_acc"] = metrics.balanced_accuracy_score(
            y_test, y_pred)

        return results

    def predict(self, X_test, predict_proba=False):
        X_test = X_test[:, ~self.all_nan]
        X_test = np.nan_to_num(X_test)
        X_test, _, _ = encode_categoricals(X_test,
                                           encode_dicts=self.encode_dicts)

        if predict_proba:
            y_pred_proba = self.model.predict_proba(X_test)
            if self.num_classes == 2:
                y_pred_proba = y_pred_proba.transpose()[0:len(X_test)]
            return y_pred_proba

        y_pred = self.model.predict(X_test)
        if self.num_classes == 2:
            y_pred = y_pred.transpose()[0:len(X_test)]
        y_pred = np.argmax(y_pred, axis=1)
        return y_pred
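
`encode_categoricals` is not part of this excerpt. A minimal sketch consistent with both call sites above, assuming object-dtype 2-D arrays (a hypothetical reimplementation):

import numpy as np

def encode_categoricals(X_train, X_val=None, encode_dicts=None):
    # Build an ordinal mapping per string column on first use, then reapply it.
    if encode_dicts is None:
        encode_dicts = {}
        for col in range(X_train.shape[1]):
            if isinstance(X_train[0, col], str):
                values = np.unique(X_train[:, col])
                encode_dicts[col] = {v: i for i, v in enumerate(values)}
    for col, mapping in encode_dicts.items():
        X_train[:, col] = [mapping.get(v, -1) for v in X_train[:, col]]
        if X_val is not None:
            X_val[:, col] = [mapping.get(v, -1) for v in X_val[:, col]]
    return X_train, X_val, encode_dicts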
Example #10
print('processing lightgbm.............')
model2 = LGBMClassifier(learning_rate=0.1,
                        max_depth=3,
                        num_leaves=15,
                        n_estimators=300)
model2.fit(x_train, y_train)
feature = model2.feature_importances_
idxsorted = np.argsort(-feature)
lgb_feature = [colnames[i] for i in idxsorted]
lgb_feature_score = [feature[i] for i in idxsorted]
lgb_feature_final = pd.DataFrame(lgb_feature_score,
                                 index=lgb_feature,
                                 columns=['feature_score'])

train_pred = model2.predict_proba(x_train)
test_pred = model2.predict_proba(x_test)
train_pred_label = model2.predict(x_train)
test_pred_label = model2.predict(x_test)

lgb_cf = confusion_matrix_score(test_pred_label)
lgb_acc = Counter(test_pred_label == y_test['First_label'])[1] / len(
    y_test['First_label'])
lgb_acc_train = Counter(train_pred_label == y_train)[1] / len(y_train)
Normal, AF, I_AVF, LBBB, RBBB, PAC, PVC, STD, STE, F1 = ecg_score(lgb_cf)
print('acc:', lgb_acc, 'f1:', F1)

#lgb_feature_final.to_csv('D:/ecg12/feture_score_train.csv')
elapsed = (time.perf_counter() - start)  # time.clock() was removed in Python 3.8; 'start' must come from perf_counter() too
print("Time used:", elapsed)
Example #11
        subsample=.8,
        max_depth=10,
        reg_alpha=.1,
        reg_lambda=.05,
        min_split_gain=.005
    )

lgbm_class.fit(features_train, 
        target_train,
        eval_set= [(features_train, target_train), (features_test, target_test)], 
        eval_metric='auc', 
        verbose=0, 
        early_stopping_rounds=30
       )

pred = lgbm_class.predict(features_test)

print('\n Percentage accuracy')
print(classification_report(pred, target_test))

#%% [markdown]
# Now, since LGBM performed the best (as expected), train it on all of the data. I won't be able to see the accuracy this time. 

#%%
final_model = LGBMClassifier(
        n_estimators=300,
        num_leaves=30,
        colsample_bytree=.8,
        subsample=.8,
        max_depth=10,
        reg_alpha=.1,
Example #12
lbs_valid = np.load(path_lbs_valid)
fts_train = np.load(path_fts_train)
lbs_train = np.load(path_lbs_train)
fts_train.shape, lbs_train.shape, fts_valid.shape, lbs_valid.shape
# %%


def report_intermediate_result(env):
    nni.report_intermediate_result(env.evaluation_result_list[1][2])
    # print(env.evaluation_result_list)


# %%
params = nni.get_next_parameter()
lgb = LGBMClassifier(n_jobs=-1, **params)

lgb.fit(fts_train,
        lbs_train,
        eval_set=[(fts_valid, lbs_valid)],
        eval_metric='multi_error',
        verbose=100,
        callbacks=[report_intermediate_result],
        early_stopping_rounds=50)

# %%
preds = lgb.predict(fts_valid)
score = accuracy_score(lbs_valid, preds)
nni.report_final_result(1 - score)

# %%
Example #13
            # model = RandomForestClassifier(n_jobs=-1, **experiment)
            model = LGBMClassifier(boosting_type="rf",
                                   verbose=-1,
                                   **experiment)
            accuracies = []
            kappas = []
            # f1_scores = []
            times = []

            for folds in splits.values():
                for fold in folds.values():
                    for repeat in fold.values():
                        start = time.time()
                        train_index, test_index = repeat.train, repeat.test
                        model.fit(X[train_index], y[train_index])
                        y_pred = model.predict(X[test_index])
                        end = time.time()

                        times.append(end - start)
                        accuracies.append(accuracy_score(
                            y[test_index], y_pred))
                        kappas.append(cohen_kappa_score(y[test_index], y_pred))
                        # f1_scores.append(f1_score(y[test_index], y_pred))

            eval_time = np.sum(times)
            mean_acc = np.mean(accuracies)
            # mean_f1 = np.mean(f1_scores)
            mean_kappa = np.mean(kappas)

            db.insert({
                "task_id": task_id,
Example #14
from pprint import pprint
print('Parameters Currently In Use:\n')
pprint(classifier_lgbm_corr.get_params())

# Fit and prediction

import time
start = time.time()

classifier_lgbm_corr.fit(X_corr_train, Y_corr_train)

end = time.time()
print("Tempo de Execução: {} sec".format(end - start))

Y_pred_lgbm_corr = classifier_lgbm_corr.predict(X_corr_test)

# Metric analysis

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

#Accuracy Score

mtrc_accuracy_score_lgbm_corr = accuracy_score(Y_corr_test, Y_pred_lgbm_corr)
print('Accuracy Score : ' + str(mtrc_accuracy_score_lgbm_corr))

#Precision Score

mtrc_precision_score_lgbm_corr = precision_score(Y_corr_test, Y_pred_lgbm_corr)
print('Precision Score : ' + str(mtrc_precision_score_lgbm_corr))
Example #15
# evals = [(x_test, y_test)]
# model.fit(x_train, y_train, early_stopping_rounds= 100, eval_metric= 'logloss', eval_set=evals, verbose=True)
# model.fit(x_train, y_train,  eval_metric= 'logloss')
model = LGBMClassifier()
model.fit(x_train, y_train)

# model & weight save
pickle.dump(model, open('C:/nmb/nmb_data/h5/LGBM0.data', 'wb'))  # wb : write
# print("== save complete ==")

# model load
# model = pickle.load(open('E:/nmb/nmb_data/cp/m03_mels_SVC.data', 'rb'))  # rb : read
# time >>  0:01:07.868304

# evaluate
y_pred = model.predict(x_test)
# print(y_pred[:100])
# print(y_pred[100:])

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
hamm_loss = hamming_loss(y_test, y_pred)
hinge = hinge_loss(y_test, y_pred)  # renamed to avoid shadowing the import; hinge_loss normally expects decision scores
logloss = log_loss(y_test, y_pred)  # renamed to avoid shadowing the import; log_loss normally expects probabilities

print("accuracy : \t", accuracy)
print("recall : \t", recall)
print("precision : \t", precision)
print("f1 : \t", f1)
Example #16
score = []
for train_index, test_index in skf.split(data, data_y):
    # Training set
    train_data = data.iloc[train_index]
    train_data_y = train_data['tz_students'].values
    train_data = train_data.drop(['STUDENTCODE', 'tz_students', 'FACTTUITION', 'STUDYMODE_1',
                                  'STUDYMODE_2', 'earliestchoosefrom2'], axis=1)

    # Test set
    test_data = data.iloc[test_index]
    test_y = data_y[test_index]

    # Train the model
    clf = LGBMClassifier(num_leaves=8, learning_rate=0.05, max_depth=8, n_estimators=300, subsample=0.8,
                         colsample_bytree=1, min_child_weight=1, )
    clf.fit(X=train_data, y=train_data_y)

    # Predict
    test_x = test_data.drop(['STUDENTCODE', 'tz_students', 'FACTTUITION', 'STUDYMODE_1',
                             'STUDYMODE_2', 'earliestchoosefrom2'], axis=1)
    test_data['pre'] = clf.predict(test_x)
    # test_data.at[test_data[test_data.STUDYMODE_1 == 1].index, 'pre'] = 0
    # tmp_score = metrics.roc_auc_score(y_true=test_y, y_score=pred)
    tmp_score = metrics.f1_score(y_true=test_y, y_pred=test_data['pre'].values, average='macro')
    # print(tmp_score)
    score.append(tmp_score)

print(score)
print('f1:', sum(score)/len(score))

Example #17
def cv_scores(df,
              num_folds,
              params,
              stratified=False,
              verbose=-1,
              save_train_prediction=True,
              train_prediction_file_name='train_prediction.csv',
              save_test_prediction=True,
              test_prediction_file_name='test_prediction.csv'):
    warnings.simplefilter('ignore')

    clf = LGBMClassifier(**params)

    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)

    # Create arrays and dataframes to store results
    train_pred = np.zeros(train_df.shape[0])
    train_pred_proba = np.zeros(train_df.shape[0])

    test_pred = np.zeros(train_df.shape[0])
    test_pred_proba = np.zeros(train_df.shape[0])

    prediction = np.zeros(test_df.shape[0])

    feats = [
        f for f in train_df.columns if f not in
        ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    ]

    df_feature_importance = pd.DataFrame(index=feats)

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        print('Fold', n_fold, 'started at', time.ctime())
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'TARGET'].iloc[valid_idx]

        clf.fit(train_x,
                train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc',
                verbose=verbose,
                early_stopping_rounds=200)

        train_pred[train_idx] = clf.predict(train_x,
                                            num_iteration=clf.best_iteration_)
        train_pred_proba[train_idx] = clf.predict_proba(
            train_x, num_iteration=clf.best_iteration_)[:, 1]
        test_pred[valid_idx] = clf.predict(valid_x,
                                           num_iteration=clf.best_iteration_)
        test_pred_proba[valid_idx] = clf.predict_proba(
            valid_x, num_iteration=clf.best_iteration_)[:, 1]

        prediction += \
            clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        df_feature_importance[n_fold] = pd.Series(clf.feature_importances_,
                                                  index=feats)

        print('Fold %2d AUC : %.6f' %
              (n_fold, roc_auc_score(valid_y, test_pred_proba[valid_idx])))
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    roc_auc_train = roc_auc_score(train_df['TARGET'], train_pred_proba)
    precision_train = precision_score(train_df['TARGET'],
                                      train_pred,
                                      average=None)
    recall_train = recall_score(train_df['TARGET'], train_pred, average=None)

    roc_auc_test = roc_auc_score(train_df['TARGET'], test_pred_proba)
    precision_test = precision_score(train_df['TARGET'],
                                     test_pred,
                                     average=None)
    recall_test = recall_score(train_df['TARGET'], test_pred, average=None)

    print('Full AUC score %.6f' % roc_auc_test)

    df_feature_importance.fillna(0, inplace=True)
    df_feature_importance['mean'] = df_feature_importance.mean(axis=1)

    # Write prediction files
    if save_train_prediction:
        df_prediction = train_df[['SK_ID_CURR', 'TARGET']].copy()  # .copy() avoids SettingWithCopyWarning
        df_prediction['Prediction'] = test_pred_proba
        df_prediction.to_csv(train_prediction_file_name, index=False)
        del df_prediction
        gc.collect()

    if save_test_prediction:
        df_prediction = test_df[['SK_ID_CURR']].copy()
        df_prediction['TARGET'] = prediction
        df_prediction.to_csv(test_prediction_file_name, index=False)
        del df_prediction
        gc.collect()

    return df_feature_importance, \
           [roc_auc_train, roc_auc_test,
            precision_train[0], precision_test[0], precision_train[1], precision_test[1],
            recall_train[0], recall_test[0], recall_train[1], recall_test[1], 0]
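
A hypothetical call of `cv_scores` (the parameter values here are illustrative, not from the original):

params = {'n_estimators': 10000, 'learning_rate': 0.02, 'num_leaves': 34,
          'colsample_bytree': 0.95, 'subsample': 0.87, 'max_depth': 8}
feature_importance, metrics_list = cv_scores(df, num_folds=5, params=params,
                                             stratified=True)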
Example #18
[ 25  26  33  46  47  48  49  51  59  62  63  64  70  71  83  85  85  94
  95  98 110 112 112 124 128 138 146 168 175 302]
'''


for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    selection_model = LGBMClassifier(n_estimators=300, learning_rate=0.1, n_jobs=-1)
    selection_model.fit(select_x_train, y_train, verbose=False, eval_metric='logloss',
                        eval_set=[(select_x_train, y_train), (select_x_test, y_test)],
                        early_stopping_rounds=20)
    
    y_pred = selection_model.predict(select_x_test)

    acc = accuracy_score(y_test, y_pred)

    # get_clf_eval(y_test, y_pred)

    print('Thresh=%.3f, n=%d, acc: %.2f%%' %(thresh, select_x_train.shape[1], acc*100.0))
    # model.save_model('./model/xgb_save/cancer_n=%d_acc=%.3f.model' %(select_x_train.shape[1], acc))

def get_clf_eval(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    AUC = roc_auc_score(y_test, y_pred)
Example #19
import lightgbm as lgb

train_data = lgb.Dataset(x_train, y_train, free_raw_data=False, categorical_feature=cat_feat)
valid_data = lgb.Dataset(x_valid, y_valid, free_raw_data=False, categorical_feature=cat_feat)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}

from lightgbm import LGBMClassifier
num_round = 1000
lgbm = LGBMClassifier(num_leaves=180, max_depth=-1, n_estimators=2000, n_jobs=16,
                      random_state=4, subsample=0.9, colsample_bytree=0.85, max_bin=512,
                      gpu_id=0, tree_method='gpu_hist')  # note: gpu_id/tree_method are XGBoost options, not LightGBM's
lgbm.fit(X=x_train, y=y_train,
         eval_set=[(x_train, y_train), (x_valid, y_valid)],
         eval_metric=['binary_logloss'], early_stopping_rounds=70)
# model = lgb.train(parameter, train_data, num_round, valid_sets = [train_data, valid_data], verbose_eval = 100, early_stopping_rounds = 50)

pred_lgb = lgbm.predict(X_test)

idx = []
for i in range (X_test.shape[0]):
    idx.append(i)

mysubmit = pd.DataFrame({'id': idx, 'up_down': pred_lgb})
mysubmit.to_csv('submission.csv', index=True)
Example #20
pickle.dump(model, open(model_file, 'wb'))

# calculate the fpr and tpr for all thresholds of the classification

probs = model.predict_proba(images_validation)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(labels_validation, preds)
roc_score = roc_auc_score(labels_validation, preds)
print("ROC score: %s" % roc_score)
roc_auc = metrics.auc(fpr, tpr)

# ploting to a file

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

plt.savefig(gw_roc_file, bbox_inches = 'tight',pad_inches = 0)


# Training report

target_names = [1, 0]  # note: defined but never passed to classification_report below
pred_labels = model.predict(images_validation)
print(classification_report(labels_validation, pred_labels))
Example #21
                     seed=42,
                     feature_fraction_seed=42,
                     bagging_seed=42,
                     drop_seed=42,
                     data_random_seed=42,
                     boost_from_average=True,
                     scale_pos_weight=w)

clf.fit(train_x,
        train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        eval_metric='auc',
        early_stopping_rounds=200,
        verbose=10)

ho_pred = clf.predict(valid_x)
ho_proba = clf.predict_proba(valid_x)[:, 1]
v_pred = roc_auc_score(valid_y, ho_pred)
v_proba = roc_auc_score(valid_y, ho_proba)
print('##################')
print('Training : Single Model Hold Out Pred AUC=', v_pred)
print('##################')
print('Training : Single Model Hold Out Proba AUC=', v_proba)
ct = pd.crosstab(valid_y,
                 ho_pred,
                 rownames=['Actual'],
                 colnames=['Predicted'],
                 margins=True)
print(ct)
print(classification_report(valid_y, ho_pred))
Example #22
feature_importance_df = pd.DataFrame(data=None,
                                     columns=['feature', 'importances'])
feature_importance_df['importances'] = rf.feature_importances_
feature_importance_df['feature'] = Xtrain.columns

feature_importance_df = feature_importance_df.sort_values(by='importances',
                                                          ascending=False)
feature_importance_df.tail()
feature_names = feature_importance_df.head(56)['feature']
Xtrain = train[feature_names]
Xtest = test[feature_names]
print(Xtrain.shape, Ytrain.shape, Xtest.shape)
params = {
    'n_estimators': 1222,
    'learning_rate': 0.07307234151834806,
    'num_leaves': 96,
    'colsample_bytree': 0.8972376156262298,
    'subsample': 0.9312856106293543,
    'min_child_samples': 1
}

lightgbm = LGBMClassifier(random_state=18,
                          subsample_freq=1,
                          silent=False,
                          **params)
lightgbm.fit(Xtrain, Ytrain)
predictions = lightgbm.predict(Xtest)
submission['Cover_Type'] = predictions
submission.to_csv('LGBSingleModel.csv')
submission.head()
Example #23
    cnt = 0
    for flat_data in flats_data:
        x = []
        y = flat_data['rating']
        for k, v in flat_data.items():
            if k != 'rating':
                x.append(v)
        X.append(x)
        Y.append(y)
        cnt += 1
        if cnt == n:
            break
    return X, Y

train_x, train_y = get_xy(train_flats_data)
test_x, test_y = get_xy(test_flats_data)

model = LGBMClassifier()
model.fit(train_x, train_y)

yhat = list(model.predict(test_x))

errs = {0:0, 1:0, 2:0, 3:0, 4:0}
for i in range(len(test_y)):
    err = abs(test_y[i] - yhat[i])
    errs[err] += 1

for k, v in errs.items():
    print(k, v)

print(get_quality_rmse(test_y, yhat))
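
`get_quality_rmse` is not shown in this excerpt; a minimal sketch consistent with its use on the integer rating labels:

import math

def get_quality_rmse(y_true, y_pred):
    # Plain RMSE over the rating labels.
    return math.sqrt(sum((t - p) ** 2 for t, p in zip(y_true, y_pred)) / len(y_true))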
Example #24
def objective(config):
    # Get and log parameters
    params = {
        "num_leaves": config["num_leaves"],
        "learning_rate": config["learning_rate"],
        "n_estimators": config["n_estimators"],
        "objective": config["objective"],
        "reg_alpha": config["reg_alpha"],
        "reg_lambda": config["reg_lambda"],
        "tree_learner": config["tree_learner"],
        "subsample": config["subsample"],
        "subsample_freq": config["subsample_freq"],
        "feature_sel": fet_sel_dict[config["feature_sel"]]
    }

    mlflow.log_params(params)

    model = LGBMClassifier(**params, random_state=0)

    X_train, X_test, y_train, y_test = give_data(
        feature_sel=config["feature_sel"])  # give_data is not shown; see the sketch after this example

    model.fit(X_train,
              np.ravel(y_train),
              eval_set=[(X_test, np.ravel(y_test))],
              verbose=False,
              early_stopping_rounds=50,
              callbacks=[LightGBMCallback])

    eval_results = classification_report(np.ravel(y_test),
                                         model.predict(X_test),
                                         output_dict=True)
    eval_results["accuracy"] = accuracy_score(y_test, model.predict(X_test))
    eval_results["auroc"] = roc_auc_score(y_test,
                                          model.predict_proba(X_test)[:, 1])

    mlflow.log_metric("val_auroc", eval_results["auroc"])

    fold_accuracy = eval_results["accuracy"]
    mlflow.log_metric("val_accuracy", fold_accuracy)

    fold_f1 = eval_results["1"]["f1-score"]
    mlflow.log_metric("val_f1-score-1", fold_f1)
    mlflow.log_metric("val_f1-score-0", eval_results["0"]["f1-score"])

    fold_precision = eval_results["1"]["precision"]
    mlflow.log_metric("val_precision", fold_precision)

    fold_recall = eval_results["1"]["recall"]
    mlflow.log_metric("val_recall", fold_recall)

    eval_results_tr = classification_report(np.ravel(y_train),
                                            model.predict(X_train),
                                            output_dict=True)
    eval_results_tr["accuracy"] = accuracy_score(y_train,
                                                 model.predict(X_train))

    fold_accuracy_tr = eval_results_tr["accuracy"]
    mlflow.log_metric("tr_accuracy", fold_accuracy_tr)

    fold_f1_tr = eval_results_tr["1"]["f1-score"]
    mlflow.log_metric("tr_f1-score", fold_f1_tr)

    fold_precision_tr = eval_results_tr["1"]["precision"]
    mlflow.log_metric("tr_precision", fold_precision_tr)

    fold_recall_tr = eval_results_tr["1"]["recall"]
    mlflow.log_metric("tr_recall", fold_recall_tr)

    tune.report(auroc=eval_results["auroc"], done=True)
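
`give_data`, `fet_sel_dict`, and `LightGBMCallback` come from the surrounding script and are not shown. A hypothetical `give_data` consistent with its call above (all names here are placeholders):

from sklearn.model_selection import train_test_split

def give_data(feature_sel):
    # Hypothetical sketch: pick the feature subset for this trial, then split.
    X = features_df[fet_sel_dict[feature_sel]]  # placeholder DataFrame of features
    y = labels_df                               # placeholder target
    return train_test_split(X, y, test_size=0.2, random_state=0)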
Example #25
class PHSICAdasynLGBM(BaseEstimator):
    """
    An estimator upsampling minority classes, finding a small set of 
    stable biomarkers, and fitting a gradient boosting model over them

    Parameters
    ----------
    n_features : int, optional (default=30)
        Max. number of biomarkers (important features) to be selected

    adasyn_neighbors : int, optional (default=10)
        K neighbors for ADASYN upsampling algorithm
        
    B : int, optional (default=20)
        Block size for Block HSIC Lasso
        
    M : int, optional (default=10)
        Max allowed permutations of samples for Block HSIC Lasso

    hsic_splits : int, optional (default=3)
        number of folds for verifying feature stability

    stability_minimum_across_splits : int, optional (default=2)
        minimum number of folds in which a feature must recur to be kept

    feature_neighbor_threshold : float, optional (default=0.4)
        threshold for considering neighbors of important features in stability check
    """
    def __init__(self,
                 n_features=30,
                 adasyn_neighbors=10,
                 B=20,
                 M=10,
                 hsic_splits=3,
                 stability_minimum_across_splits=2,
                 feature_neighbor_threshold=0.4):
        self.n_features = n_features
        self.adasyn_neighbors = adasyn_neighbors
        self.M = M
        self.B = B
        self.hsic_splits = hsic_splits
        self.neighbor_threshold = feature_neighbor_threshold
        self.stability_minimum_across_splits = stability_minimum_across_splits

    def fit(self, X, y):
        if X.shape[1] > 10000:
            #clf = RandomForestClassifier(n_estimators=1000,n_jobs=-1).fit(X,y)
            clf = LGBMClassifier(n_estimators=1000, n_jobs=-1).fit(X, y)
            ftimp = clf.feature_importances_
            relevant = np.where(ftimp > 0)[0]
            print("relevant ft:", len(relevant), "/", X.shape[1])
        else:
            relevant = np.arange(X.shape[1])

        sss = StratifiedShuffleSplit(n_splits=self.hsic_splits,
                                     random_state=42)
        idxs = []
        hsics = []
        for train_index, test_index in list(sss.split(X, y)):
            hsic_lasso2 = HSICLasso()
            hsic_lasso2.input(X[:, relevant][train_index], y[train_index])
            hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)
            hsics.append(hsic_lasso2)

            # not just best features - get their neighbors (similar features) too
            all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
            for i in range(len(all_ft_idx)):
                idx = np.array(hsic_lasso2.get_index_neighbors(
                    feat_index=i, num_neighbors=10),
                               dtype=int)
                score = np.array(hsic_lasso2.get_index_neighbors_score(
                    feat_index=i, num_neighbors=10),
                                 dtype=float)  # float, not int: compared to a 0.4 threshold below
                idx = idx[np.where(score > self.neighbor_threshold)[0]]
                all_ft_idx = np.concatenate((all_ft_idx, idx))
            all_ft_idx = np.unique(all_ft_idx)

            idxs.append(relevant[all_ft_idx])
            #if len(idxs) == 1:
            #    self.hsic_idx_ = idxs[0]
            #else:
            #    self.hsic_idx_ = np.intersect1d(idxs[-1], self.hsic_idx_)
        self.hsic_idx_ = []

        stability_concession = 0
        while len(self.hsic_idx_) == 0:
            featurecandidates = np.unique(np.concatenate(idxs))
            for candidate in featurecandidates:
                occurrences = np.sum(
                    [1 if candidate in idx else 0 for idx in idxs])
                if occurrences > self.stability_minimum_across_splits - stability_concession:
                    self.hsic_idx_.append(candidate)
            if len(self.hsic_idx_) > 1:
                break
            else:
                # failed to find commonly occurring features - reduce threshold
                stability_concession += 1
        print("HSIC done.", len(self.hsic_idx_), "(out of ",
              len(featurecandidates), " candidates)")

        print("Upsampling with ADASYN... (features: " +
              str(len(self.hsic_idx_)) + ")")
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors,
                    n_jobs=-1)
        sX, sy = X[:, self.hsic_idx_], y
        if self.adasyn_neighbors > 0:
            try:
                sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
                for i in range(len(np.unique(y)) - 1):  # fixed misplaced parenthesis: len(...) - 1, not np.unique(y) - 1
                    sX, sy = sm.fit_resample(sX, sy)
            except Exception:  # ADASYN can fail when a class has too few samples; fall back to the raw data
                pass
            print("ADASYN done. Starting clf")

        self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
        print("done")
        return self

    def predict_proba(self, X):
        return self.clf_.predict_proba(X[:, self.hsic_idx_])

    def predict(self, X):
        return self.clf_.predict(X[:, self.hsic_idx_])
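
A hypothetical usage of the estimator (`X_train`, `y_train`, and `X_test` are placeholder NumPy arrays):

clf = PHSICAdasynLGBM(n_features=30, adasyn_neighbors=10)
clf.fit(X_train, y_train)
proba = clf.predict_proba(X_test)  # probabilities over the selected biomarkers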
Example #26
     learner.fit(cv=3, optimizer='scikit-bayes')
     score_auto_skopt = accuracy_score(y_test, learner.predict(X_test))
     xgb_default = XGBClassifier()
     cat_default = CatBoostClassifier(logging_level='Silent')
     lgbm_default = LGBMClassifier()
     X_train, X_test, y_train = _feature_preprocessor.transform(
         X_train), _feature_preprocessor.transform(
             X_test), _target_preprocessor.transform(
                 np.array(y_train).reshape(-1, 1)).ravel()
     print('training defaults')
     xgb_default.fit(X_train, y_train)
     cat_default.fit(X_train, y_train)
     lgbm_default.fit(X_train, y_train)
     score_xgb = accuracy_score(y_test, xgb_default.predict(X_test))
     score_cat = accuracy_score(y_test, cat_default.predict(X_test))
     score_lgbm = accuracy_score(y_test, lgbm_default.predict(X_test))
     del xgb_default, cat_default, lgbm_default, learner
     results['name'].append(name)
     results['xgboost'].append(score_xgb)
     results['lightgbm'].append(score_lgbm)
     results['catboost'].append(score_cat)
     results['automl-grid'].append(score_auto_grid)
     results['automl-hyperopt'].append(score_auto_hyperopt)
     results['automl-skopt'].append(score_auto_skopt)
     print('RESULTS:')
     print(
         f"Name: {name}, XGB-default: {score_xgb}, LGBM-default: {score_lgbm}, CAT-default: {score_cat}, GRID: {score_auto_grid}, HYPEROPT: {score_auto_hyperopt}, SKOPT: {score_auto_skopt}"
     )
 except Exception as ex:
     print(f'{name} failed')
     print(ex)
Example #27
)

x1_train, x1_test, y1_train, y1_test = train_test_split(x1,
                                                        y1,
                                                        test_size=0.2,
                                                        random_state=42,
                                                        stratify=y1)
lg1.fit(x1_train, y1_train)
print("YOUR R2 MACHINE LEARNED WITH THIS ACCURACY : ",
      lg1.score(x1_test, y1_test))

print(
    "\n.................................................................................\n"
)

y1_pred = lg1.predict(x1_test)

print(classification_report(y1_test, y1_pred))

precision, recall, fscore, support = score(y1_test, y1_pred)

print('precision: ', np.mean(precision))
print('recall: ', np.mean(recall))
print('fscore: ', np.mean(fscore))

print("---+++---+++---+++---+++---")

# This part checks resident 2 with LightGBM:

for i in range(10):
    globals()["url0" + str(
Example #28
# Scaling the features (note: tree ensembles like LightGBM are largely insensitive to feature scaling, so this mainly matters if other estimators share the pipeline)
sc = StandardScaler()
train_features = sc.fit_transform(train_features)
test_features = sc.transform(test_features)

# Using Light Gradient Boosting Model classification with a maximum tree depth of 4
model = LGBMClassifier(max_depth=4)

# Fitting the model
model.fit(train_features, train_labels)

# Extract feature importances
fi = pd.DataFrame({'feature': list(feature_cols),
                   'importance': model.feature_importances_}).\
                    sort_values('importance', ascending = False)

fi.head(20)

# Predicting the Test set results
predictions = model.predict(test_features)

# Making the Confusion Matrix
pd.crosstab(test_labels,
            predictions,
            rownames=['Actual'],
            colnames=['Predicted'])

# Accuracy Score
accuracy_score(test_labels, predictions)
Example #29
    tprs_knn[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs_knn.append(roc_auc)

    clf_rf = clf_rf.fit(X[train], y[train])
    ac_rf.append(accuracy_score(y[test], clf_rf.predict(X[test])))
    mean_fpr = np.linspace(0, 1, 100)
    probas_ = clf_rf.fit(X[train], y[train]).predict_proba(X[test])
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs_rf.append(interp(mean_fpr, fpr, tpr))
    tprs_rf[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs_rf.append(roc_auc)

    clf_lgbc = clf_lgbc.fit(X[train], y[train])
    ac_lgbc.append(accuracy_score(y[test], clf_lgbc.predict(X[test])))
    mean_fpr = np.linspace(0, 1, 100)
    probas_ = clf_lgbc.fit(X[train], y[train]).predict_proba(X[test])
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs_lgbc.append(interp(mean_fpr, fpr, tpr))
    tprs_lgbc[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs_lgbc.append(roc_auc)

    clf_xgb = clf_xgb.fit(X[train], y[train])
    ac_xgb.append(accuracy_score(y[test], clf_xgb.predict(X[test])))
    mean_fpr = np.linspace(0, 1, 100)
    probas_ = clf_xgb.fit(X[train], y[train]).predict_proba(X[test])
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs_xgb.append(interp(mean_fpr, fpr, tpr))
    tprs_xgb[-1][0] = 0.0
Example #30
                                   lowercase=True,
                                   use_idf=True)

# %%
# Apply to train and test
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

# %%
# Model
model = LGBMClassifier(learning_rate=0.1,
                       num_leaves=128,
                       min_child_samples=100,
                       subsample=0.96,  # fixed typo: was "ubsample"
                       colsample_bytree=0.28,
                       random_state=0,
                       subsample_freq=1,
                       n_estimators=100)
model.fit(tfidf_train, y_train)

y_pred = model.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

# %%
# Saving model
jl.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl.z')
jl.dump(model, 'model.pkl.z')

# %%
Example #31
    #aa = model_lr.coef_
    
if cond01 == 3:
    from sklearn.naive_bayes import GaussianNB # Gaussian Naive Bayes
    model_nb = GaussianNB(); model_nb.fit(X_train, Y_train)
    predicted_nb = model_nb.predict(X_test) ; print("Gaussian Naive Bayes",metrics.accuracy_score(Y_test, predicted_nb),"\n")

if cond01 == 4:
    from sklearn.ensemble import GradientBoostingClassifier # GradientBoosting
    model_gb = GradientBoostingClassifier(); model_gb.fit(X_train, Y_train)
    predicted_gb = model_gb.predict(X_test) ; print("GradientBoosting",metrics.accuracy_score(Y_test, predicted_gb),"\n")

if cond01 == 5:
    from lightgbm import LGBMClassifier  # LightGBM
    model_lgbm = LGBMClassifier(); model_lgbm.fit(X_train, Y_train)
    predicted_lgbm = model_lgbm.predict(X_test); print("LightGBM",metrics.accuracy_score(Y_test, predicted_lgbm),"\n")



#
##http://myenigma.hatenablog.com/entry/2015/10/09/223629
#import seaborn as sns
#iris = sns.load_dataset("iris") # sample dataset
##sns.pairplot(iris);
#sns.pairplot(iris,hue="species");
#sns.plt.savefig("iris.png")
#sns.plt.show()
#