Code Example #1
File: A_TL_LGB_LGB.py  Project: tomzhang/QH_FInSight
def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectFromModel
    clf = LGBMClassifier(n_estimators=400)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Print how many features have non-zero importance
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of non-zero features: ' + str(len(feature_score_dict) - m))

    # Print the feature importances
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/lgb_feature_importance.txt', 'w')
    f.write(str(th))
    f.write('\nRank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # Print which fields were actually used
    how_long = matrix_x.shape[1]  # matrix_x is the input matrix after feature selection
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('features_chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/lgb_feature_chose.txt', 'w')
    f.write('Feature Chose Name :\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # Find the names of the fields that were not used
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    return matrix_x, feature_not_used_name[:], len(feature_used_name)
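
A minimal usage sketch of the function above (not from the original project): the data is synthetic, the feature names are made up, the LGBMClassifier/SelectFromModel imports are assumed to be at module level, and the relative ../eda/ directory the function writes to must already exist.

import numpy as np
from sklearn.datasets import make_classification

# synthetic stand-in data (assumption, purely for illustration)
X_demo, y_demo = make_classification(n_samples=500, n_features=20, random_state=0)
demo_names = ['f{}'.format(i) for i in range(X_demo.shape[1])]

selected_matrix, unused_names, n_used = lgb_feature_selection(
    demo_names, X_demo, y_demo, th='0.1*mean')
print(selected_matrix.shape, n_used)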
Code Example #2
File: practice.py  Project: Ewen2015/Kaggle
def main():
	# load the data
	print('\nloading...')
	wd = '/Users/ewenwang/Documents/credit/data'
	os.chdir(wd)
	dataFile = 'creditcard.csv'
	dataset = pd.read_csv(dataFile, low_memory=False)

	# set target and predictors
	target = 'Class'
	predictors = [x for x in dataset.columns if x not in [target]]

	# split the data into training and test sets 
	seed = 2017
	dtrain, dtest = train_test_split(dataset, test_size=0.33, random_state=seed)

	# build the classifier
	gbm = LGBMClassifier(
		learning_rate=0.01,
		n_estimators=5000,
		objective='binary',
		metric='auc',
		max_depth=10,
		subsample=0.83,
		colsample_bytree=0.63,
		save_binary=True,
		is_unbalance=True,
		random_state=seed
	)

	# train the model
	print('\nfitting...')
	gbm.fit(dtrain[predictors], dtrain[target])

	# report
	report(gbm, dtrain, dtest, predictors, target)

	return None
train_X, valid_X, train_Y, valid_Y = train_test_split(X, Y, test_size=0.2, random_state=2018)


# In[ ]:


clf = LGBMClassifier(n_estimators=200, learning_rate=0.01)


# In[ ]:


clf.fit(
        train_X,
        train_Y,
        eval_set=[(train_X, train_Y), (valid_X, valid_Y)],
        eval_metric='auc',
        early_stopping_rounds=50,
        verbose=False
       )


# In[ ]:


plot_importance(clf, figsize=(10,10))


# In[ ]:


#print("only showing the distribution for the first few columns, edit the counter to show all distribution")
Code Example #4
xgbc_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
xgbc_valid_auc, xgbc_valid_accuracy, xgbc_valid_recall, xgbc_valid_precision, \
xgbc_valid_specificity = print_report(y_valid,y_valid_preds, thresh)
# .......................... (11)xgboost END ..........................

# .......................... (12)lightgbm ..........................
from evaluate_metrix import *
from lightgbm import LGBMClassifier
import lightgbm as lgb

x_valid = x_train
y_valid = y_train

lgbc = LGBMClassifier()
lgbc.fit(x_train, y_train)

y_train_preds = lgbc.predict_proba(x_train)[:,1]
y_valid_preds = lgbc.predict_proba(x_valid)[:,1]

print('Light Gradient Boosting Machine Classifier')
print('Training:')
lgbc_train_auc, lgbc_train_accuracy, lgbc_train_recall, lgbc_train_precision, \
lgbc_train_specificity = print_report(y_train,y_train_preds, thresh)
print('Validation:')
lgbc_valid_auc, lgbc_valid_accuracy, lgbc_valid_recall, lgbc_valid_precision, \
lgbc_valid_specificity = print_report(y_valid,y_valid_preds, thresh)
# .......................... (12)lightgbm END ..........................

# -------------------------------------------------------------------------------------------------------------------
# ------------------------ build traditional models and evaluate the model END ------------------------------
Code Example #5
    def AllModelsClass(self,X_train,y_train,X_test,y_test):
        loj1 = LogisticRegression()  # there are other solvers besides liblinear; they give different results depending on the data structure
        loj_model1 = loj1.fit(X_train,y_train)



#mlp_regres = MLPClassifier().fit(StandardScaler().fit_transform(X_train),y_train)


        cart = DecisionTreeClassifier()
        cart_model = cart.fit(X_train, y_train)


        rf_model = RandomForestClassifier().fit(X_train, y_train)


        Svc = SVC().fit(X_train, y_train)


        bayes = GaussianNB()
        bayes_model = bayes.fit(X_train,y_train)


        lgbm_ = LGBMClassifier()
        lgbm_model = lgbm_.fit(X_train,y_train)



        knn = KNeighborsClassifier()
        knn_model = knn.fit(X_train, y_train)


        gbm_model = GradientBoostingClassifier().fit(X_train, y_train)


        xgb_model = XGBClassifier().fit(X_train, y_train)

        cat_model = CatBoostClassifier().fit(X_train, y_train)
    
        modeller = [
        loj_model1,
        cart_model,
        rf_model,
        Svc,
        bayes_model,
        lgbm_model,
        knn_model,
        gbm_model,
        xgb_model,
        cat_model]


        for model in modeller:
            isimler = model.__class__.__name__
            y_pred = model.predict(X_test)     
            dogruluk = accuracy_score(y_test, y_pred)
            print("-"*28)
            print(isimler + ":" )
            print("Accuracy: {:.4%}".format(dogruluk))
        sonuc = []

        sonuclar = pd.DataFrame(columns= ["Modeller","Accuracy"])

        for model in modeller:
            isimler = model.__class__.__name__
            y_pred = model.predict(X_test)
            dogruluk = accuracy_score(y_test, y_pred)    
            sonuc = pd.DataFrame([[isimler, dogruluk*100]], columns= ["Modeller","Accuracy"])
            sonuclar = sonuclar.append(sonuc)
    
    
        sns.barplot(x= 'Accuracy', y = 'Modeller', data=sonuclar, color="b")
        plt.xlabel('Accuracy %')
        plt.title('Model Accuracy Rates');
Code Example #6
# +
X = df.drop(columns=["y"])
_X = pd.get_dummies(X, "c")
y = df.y

clf = tree.DecisionTreeClassifier(random_state=117, max_depth=5, min_samples_leaf=10)
clf.fit(_X, y)

pred = clf.predict(_X)
accuracy_score(y, pred)

# +
# plot_tree(clf, X, y)
# -

# What happened here?
#
# What we want to show here is that some representations or encodings are not always the best; it depends on the model we are using.
#
# For example, using a different model...

# +
X = df.drop(columns=["y"])
X.c = X.c.astype("category")
y = df.y

lgbm_tree = LGBMClassifier(n_estimators=1)
lgbm_tree.fit(X, y)
pred = lgbm_tree.predict(X)
accuracy_score(y, pred)
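
For reference, a self-contained sketch of the same comparison on a synthetic df (the real df is not shown in this snippet): one high-cardinality categorical column c whose category determines y.

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier

rng = np.random.default_rng(117)
codes = rng.integers(0, 50, size=2000)
df = pd.DataFrame({"c": pd.Series(codes).map("cat_{}".format), "y": codes % 2})

# one-hot encoding + a shallow decision tree
X_dum = pd.get_dummies(df[["c"]], prefix="c")
clf = tree.DecisionTreeClassifier(random_state=117, max_depth=5, min_samples_leaf=10)
clf.fit(X_dum, df.y)
print("tree + one-hot :", accuracy_score(df.y, clf.predict(X_dum)))

# LightGBM's native categorical handling, a single tree
X_cat = df[["c"]].astype("category")
lgbm_tree = LGBMClassifier(n_estimators=1)
lgbm_tree.fit(X_cat, df.y)
print("lgbm + category:", accuracy_score(df.y, lgbm_tree.predict(X_cat)))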
Code Example #7
svc_rbf_disp = plot_roc_curve(svc_rbf, X_test, y_test, ax=ax, alpha=0.8)
gauss_disp = plot_roc_curve(gauss, X_test, y_test, ax=ax, alpha=0.8)
tree_disp = plot_roc_curve(tree, X_test, y_test, ax=ax, alpha=0.8)
forest_disp = plot_roc_curve(forest, X_test, y_test, ax=ax, alpha=0.8)
histgrad_disp = plot_roc_curve(histgrad, X_test, y_test, ax=ax, alpha=0.8)
gbm_disp = plot_roc_curve(gbm, X_test, y_test, ax=ax, alpha=0.8)
xgboost_disp = plot_roc_curve(xgboost, X_test, y_test, ax=ax, alpha=0.8)
lightgbm_disp = plot_roc_curve(lightgbm, X_test, y_test, ax=ax, alpha=0.8)
plt.legend(loc = 'best', prop={'size': 16})
plt.show()

roc_curve_values=dict()
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
lgbm=LGBMClassifier(learning_rate= 0.02 , max_depth= 4, subsample= 0.6, n_estimators= 1000, min_child_samples= 5)
lgbm_tuned=lgbm.fit(X_train,y_train)
y_pred=lgbm_tuned.predict(X_test)
roc_curve_values["Light GBM Classifier"]=roc_auc_score(y_test,y_pred)
roc_auc_score(y_test,y_pred)

from xgboost import XGBClassifier
xgb=XGBClassifier()
xgb_tuned=xgb.fit(X_train,y_train)
y_pred=xgb_tuned.predict(X_test)
roc_curve_values["XGBoost Classifier"]=roc_auc_score(y_test,y_pred)
roc_auc_score(y_test,y_pred)

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

histgrad=HistGradientBoostingClassifier(max_depth=8,max_leaf_nodes=14,learning_rate=0.1)
Code Example #8
File: lgb_base0.py  Project: shinnnne/zhihucup2019
for _, (train_idx,
        val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
    break

X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
                                 y_train_all.iloc[train_idx], \
                                 y_train_all.iloc[val_idx]
del X_train_all

model_lgb = LGBMClassifier(n_estimators=2000,
                           n_jobs=-1,
                           objective='binary',
                           seed=1000,
                           silent=True)
model_lgb.fit(X_train,
              y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50)

sub = test1.copy()
sub_size = len(sub)
sub['label'] = model_lgb.predict_proba(test[feature_cols])[:, 1]

sub.to_csv('result.txt', index=None, header=None, sep='\t')

pd.set_option('display.max_rows', None)
print(
    pd.DataFrame({
        'column': feature_cols,
        'importance': model_lgb.feature_importances_
    }).sort_values(by='importance', ascending=False))
Code Example #9
File: 06_Modeling_6_LGBM.py  Project: Taerimmm/ML
from sklearn.metrics import precision_score

# Importing the dataset
X = np.load('./project/mini/data/X.npy')
y = pd.read_csv('./project/mini/data/y_label.csv', header=0).iloc[:, 0]

X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])

# Splitting the dataset into the Training set and Test set
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

# Feature Scaling
x_train /= -80
x_test /= -80

model = LGBMClassifier(objective='multiclass')

model.fit(x_train, y_train, categorical_feature=[0, 12])

print('feature_importances :', model.feature_importances_)

y_pred = model.predict(x_test)
print('Final accuracy :', model.score(x_test, y_test))

# Final accuracy : 0.5326016785022595
Code Example #10
import matplotlib.pyplot as plt

# 1. Data

x, y = load_iris(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=88)

# 2. Model

model = LGBMClassifier(n_estimators=1000, n_jobs=-1, objective="multiclass")

# 3. Training

model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)],
          eval_metric=["multi_error", "multi_logloss"], early_stopping_rounds=100)

score = model.score(x_test, y_test)

# 3-1. Create thresholds, one per feature column

thresholds = np.sort(model.feature_importances_)

print(thresholds)

# [0.01818451 0.01885792 0.3417337  0.62122387]


# 3-2. Create a SelectFromModel for each threshold

for thresh in thresholds:
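
The snippet is cut off at the loop header. A typical continuation (a sketch, not the original code; it assumes the x_train/x_test/y_train/y_test split above and the usual imports) selects the columns whose importance is at least thresh, refits, and compares scores:

from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)

    selection_model = LGBMClassifier(n_estimators=1000, n_jobs=-1, objective="multiclass")
    selection_model.fit(select_x_train, y_train,
                        eval_set=[(select_x_test, y_test)],
                        eval_metric="multi_logloss",
                        early_stopping_rounds=100,
                        verbose=False)
    y_pred = selection_model.predict(select_x_test)
    acc = accuracy_score(y_test, y_pred)
    print("Thresh=%.5f, n=%d, acc: %.2f%%" % (thresh, select_x_train.shape[1], acc * 100))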
Code Example #11
# Load the data
digits = datasets.load_digits()
print(digits.data.shape)  # dimensionality of the feature space
print(digits.target.shape)  # dimensionality of the labels

# Split the data
x_train, x_test, y_train, y_test = train_test_split(digits.data,
                                                    digits.target,
                                                    test_size=0.3,
                                                    random_state=30)

params = {
    'objective': 'multiclass',
    'num_iterations': 193,
    'num_leaves': 31,
    'learning_rate': 0.1,
}
gbm = LGBMClassifier(**params)

# Train
gbm.fit(x_train,
        y_train,
        eval_set=[(x_test, y_test)],
        eval_metric='multi_logloss',
        early_stopping_rounds=15)

# predict
y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration_)
print(f'Best iterations: {gbm.best_iteration_}')
print(accuracy_score(y_test, y_pred))
lgbm_model = LGBMClassifier(
        n_estimators=10000,
        learning_rate=0.03,
        num_leaves = 22,
        colsample_bytree=0.8,
        subsample=0.8,
        max_depth=6,
        reg_alpha=0.1,
        reg_lambda=0.1,
        min_split_gain=0.01,
        min_child_weight=100,
        silent=-1,
        verbose=-1)


X_train, X_test,y_train, y_test = train_test_split(train_F_scaled , train_response,test_size =0.4, random_state=42)
lgbm_model.fit(X_train, y_train)
y_pred_prob =  lgbm_model.predict_proba( X_test)[:,1]
roc_auc_score(y_test, y_pred_prob)


# In[ ]:


from sklearn.model_selection import cross_val_score

print( np.mean(cross_val_score(lgbm_model , train_F_scaled,train_response, scoring = 'roc_auc', cv=5)))


# In[ ]:

Code Example #13
File: m37_LGBM2.py  Project: chankyu11/Study
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)

## Modeling
model = LGBMClassifier(
    n_estimators=1000,  # number of boosting rounds, analogous to epochs
    num_leaves=50,
    subsample=0.8,
    min_child_samples=60,
    max_depth=-1)

model.fit(
    x_train,
    y_train,
    verbose=True,
    eval_metric=['auc', 'binary_error'],  # multiple metrics can be passed as a list
    eval_set=[(x_train, y_train), (x_test, y_test)],
    early_stopping_rounds=30)  # early stopping

# eval_metric options: rmse, mae, logloss, binary_error (an error of 0.2 means accuracy 0.8), auc (closely related to accuracy/precision)
thresholds = np.sort(model.feature_importances_)
print(thresholds)

import pickle

for thresh in thresholds:
    # Iterate once per feature column.
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    selection_x_train = selection.transform(x_train)
Code Example #14
    predicted_lr = model_lr.predict(X_test) ; print("LogisticRegression",metrics.accuracy_score(Y_test, predicted_lr),"\n")
    #aa = model_lr.coef_    
    
if cond01 == 3:
    from sklearn.naive_bayes import GaussianNB # Gaussian Naive Bayes
    model_nb = GaussianNB(); model_nb.fit(X_train, Y_train)
    predicted_nb = model_nb.predict(X_test) ; print("Gaussian Naive Bayes",metrics.accuracy_score(Y_test, predicted_nb),"\n")

if cond01 == 4:
    from sklearn.ensemble import GradientBoostingClassifier # GradientBoosting
    model_gb = GradientBoostingClassifier(); model_gb.fit(X_train, Y_train)
    predicted_gb = model_gb.predict(X_test) ; print("GradientBoosting",metrics.accuracy_score(Y_test, predicted_gb),"\n")

if cond01 == 5:
    from lightgbm import LGBMClassifier  # LightGBM
    model_lgbm = LGBMClassifier(); model_lgbm.fit(X_train, Y_train)
    predicted_lgbm = model_lgbm.predict(X_test); print("LightGBM",metrics.accuracy_score(Y_test, predicted_lgbm),"\n")



#
##http://myenigma.hatenablog.com/entry/2015/10/09/223629
#import seaborn as sns
#iris = sns.load_dataset("iris")  # sample dataset
##sns.pairplot(iris);
#sns.pairplot(iris,hue="species");
#sns.plt.savefig("iris.png")
#sns.plt.show()
#

Code Example #15
File: Property.py  Project: Ewen2015/Kaggle
ip_test = data_ip[test_index,:]

gbm = LGBMClassifier(
	objective='binary',
	num_leaves=24,
	max_depth=3,
	learning_rate=0.1,
	seed=2018,
	colsample_bytree=0.3,
	subsample=0.8,
	n_jobs=-1,
	n_estimators=2000
	)

print('fitting...')
gbm.fit(ip_train, train.loc[train_index, 'is_trade'], eval_set=[(ip_test, train.loc[test_index, 'is_trade'])], 
		early_stopping_rounds=10)

property_df = pd.DataFrame(columns=['instance_id', 'item_property_prob'])
property_df['instance_id'] = data['instance_id']
property_df['item_property_prob'] = gbm.predict_proba(data_ip)[:, 1]

def NatureLP(data, columns):
	
	pass

print('saving...')
property_df.to_csv(wd+out_put[0], index=False, sep=' ')



Code Example #16
from sklearn.ensemble import GradientBoostingClassifier

modelGB = GradientBoostingClassifier()
modelGB.fit(X_train, Y_train)

Y_predGB = modelGB.predict(X_valid)

print("Training Accuracy: ", modelGB.score(X_train, Y_train))
print('Testing Accuracy: ', modelGB.score(X_valid, Y_valid))

print("AUROC Score of Gradient Boosting = ", roc_auc_score(Y_valid, Y_predGB))

from lightgbm import LGBMClassifier

modelLGBM = LGBMClassifier()
modelLGBM.fit(X_train, Y_train)

Y_predLGBM = modelLGBM.predict(X_valid)

print("Training Accuracy: ", modelLGBM.score(X_train, Y_train))
print('Testing Accuracy: ', modelLGBM.score(X_valid, Y_valid))

print("AUROC Score of LGBM = ", roc_auc_score(Y_valid, Y_predLGBM))

test_Y_RF = modelRF.predict(test_X)
test_Y_XG = modelXG.predict(test_X)
test_Y_AB = modelAB.predict(test_X)
test_Y_LGBM = modelLGBM.predict(test_X)
test_Y_GB = modelGB.predict(test_X)
test_Y_pred = []
Code Example #17
def OOFPreds(X, y, test_X, params, n_splits=5, random_state=23, clf='lgb'):
    """
    Inputs are expected to be pandas DataFrames.
    Returns pandas Series.
    """

    # For later feature-importance analysis
    feature_importance = pd.DataFrame(columns=['feature', 'importance', 'fold'])

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # oof holds the out-of-fold (CV) predictions; sub holds the test-set predictions
    oof_preds, sub_preds = np.zeros(X.shape[0]), np.zeros(test_X.shape[0])

    oof_train = np.zeros(X.shape[0])

    print(X.shape, test_X.shape)

    valid_scores = []
    train_scores = []

    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        trn_x, trn_y = X.iloc[trn_idx], y.iloc[trn_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

        # Initialize the base scores that are passed to fit as init_score
        trn_init_score = pd.Series([0.95] * len(trn_x), index=trn_x.index)
        val_init_score = pd.Series([0.95] * len(val_x), index=val_x.index)

        # Build the model and run the prediction step
        if clf == 'lgb':
            with timer('fold {} training time:'.format(n_fold)) as time:
                gbm = LGBMClassifier(**params)
                gbm.fit(trn_x, trn_y, init_score=trn_init_score,
                        eval_set=[(trn_x, trn_y), (val_x, val_y)],
                        eval_init_score=[trn_init_score, val_init_score],
                        eval_metric='auc', verbose=30, early_stopping_rounds=100)

                print('best iteration: {}'.format(gbm.best_iteration_))
                print('training time per 100 rounds: {:.3f}'.format(time*100/gbm.best_iteration_))

                pred_val = gbm.predict_proba(val_x, num_iteration=gbm.best_iteration_)[:, 1]
                pred_test = gbm.predict_proba(test_X, num_iteration=gbm.best_iteration_)[:, 1]

            # Record the validation predictions and scores
            oof_preds[val_idx] = pred_val
            sub_preds += pred_test / folds.n_splits

            print(gbm.best_score_)

            valid_score = gbm.best_score_['valid_1']['auc']
            train_score = gbm.best_score_['training']['auc']

            valid_scores.append(valid_score)
            train_scores.append(train_score)

            feature_importance = feature_importance.append(pd.DataFrame({
                'importance': gbm.feature_importances_,
                'fold': [n_fold + 1] * X.shape[1],
                'feature': X.columns.tolist()}))


        else:
            # Plug in your own model here
            # Task 1: build the model and generate predictions
            # Task 2: record the prediction scores and results
            # Task 3: record the feature importances
            clf = LogisticRegression(**params)
            clf.fit(trn_x, trn_y)

            pred_train = clf.predict_proba(trn_x)[:, 1]
            pred_val = clf.predict_proba(val_x)[:, 1]
            pred_test = clf.predict_proba(test_X)[:, 1]

            oof_preds[val_idx] = pred_val
            sub_preds += pred_test / folds.n_splits


            valid_score = roc_auc_score(val_y, pred_val)
            train_score = roc_auc_score(trn_y, pred_train)

            valid_scores.append(valid_score)
            train_scores.append(train_score)

            feature_importance = feature_importance.append(pd.DataFrame({
                'importance': clf.coef_[0],
                'fold': [n_fold + 1] * X.shape[1],
                'feature': X.columns.tolist()}))


        print('Fold {:02d} train AUC: {:.6f} valid AUC: {:.6f}'.format(n_fold + 1, train_score, valid_score))
        del trn_x, trn_y, val_x, val_y;
        gc.collect()

    feature_importance['importance'] = feature_importance['importance'].astype(float)

    fold_names = list(range(folds.n_splits))
    fold_names.append('overall')

    valid_auc = roc_auc_score(y, oof_preds)

    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Build a DataFrame recording the per-fold scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    oof_preds = pd.Series(oof_preds.flatten(), index=X.index).rename('TARGET')
    sub_preds = pd.Series(sub_preds.flatten(), index=test_X.index).rename('TARGET')

    return oof_preds, sub_preds, feature_importance, metrics
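
A hypothetical driver for OOFPreds (not part of the original source). It assumes the module-level names the function uses (StratifiedKFold, LGBMClassifier, LogisticRegression, roc_auc_score, gc and the timer context manager) are already defined, and the parameter values below are placeholders.

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

X_arr, y_arr = make_classification(n_samples=2000, n_features=30, random_state=23)
X = pd.DataFrame(X_arr[:1500])
y = pd.Series(y_arr[:1500])
test_X = pd.DataFrame(X_arr[1500:])

lgb_params = dict(n_estimators=2000, learning_rate=0.05, num_leaves=31,
                  objective='binary', n_jobs=-1)

oof_preds, sub_preds, feature_importance, metrics = OOFPreds(
    X, y, test_X, lgb_params, n_splits=5, clf='lgb')
print(metrics)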
Code Example #18
# FIT AND PREDICTION OF THE CLASSIFIERS

from lightgbm import LGBMClassifier

# Classifier 04

classifier_lgbm_4 = LGBMClassifier( max_depth = 100, 
                                    learning_rate = 0.3,
                                    num_leaves = 500,
                                    n_estimators = 500 )

import time
start = time.time()
classifier_lgbm_4.fit(X_train, Y_train)
end = time.time()
print("Tempo de Execução: {:.2f} min".format((end - start)/60))

Tempo de Execução: 1.49 min

Y_pred_lgbm_4 = classifier_lgbm_4.predict(X_test)

#Classificador 07

classifier_lgbm_7 = LGBMClassifier( max_depth = 1000, 
                                    learning_rate = 0.15,
                                    num_leaves = 2000,
                                    min_data_in_leaf = 200,
                                    n_estimators = 2000 )

import time
start = time.time()
Code Example #19
def kfold_lightgbm(training_file, testing_file, num_folds, stratified=False):
    # Divide in training/validation and test data

    train_df = pd.read_csv(training_file)
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    test_df = pd.read_csv(testing_file)
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    # Feature names: training must not use 'TARGET'; the '*_ID_*' columns merely identify a sample.
    # If your data uses a different ID column, adjust the exclusion list below accordingly.
    feats = [
        f for f in train_df.columns if f not in
        ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    ]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=100,
            learning_rate=0.01,
            num_leaves=40,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1,
        )

        clf.fit(train_x,
                train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc',
                verbose=100,
                early_stopping_rounds=100)

        oof_preds[valid_idx] = clf.predict_proba(
            valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(
            test_df[feats],
            num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    test_df['TARGET'] = sub_preds
    test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
    #display_importances(feature_importance_df)
    return feature_importance_df
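
A hypothetical call of kfold_lightgbm (not from the original file): the CSV paths are placeholders, the global submission_file_name that the function writes to must exist, and the module-level imports it relies on (pandas, numpy, sklearn, lightgbm, gc) are assumed.

submission_file_name = 'lgbm_submission.csv'  # placeholder global used inside the function
feature_importance_df = kfold_lightgbm('train_processed.csv',
                                       'test_processed.csv',
                                       num_folds=5,
                                       stratified=True)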
Code Example #20
from _00_imports import *

# Use LGBM to select the most important 100 features
if __name__ == '__main__':
    train_feat = np.load('../data/feat_train.npy')
    test_feat = np.load('../data/feat_test.npy')

    train_input = np.load('../data/feat_train.npy')
    lgbm = LGBMClassifier(n_estimators=5000)
    train_x, test_x, train_y, test_y = train_test_split(train_input,
                                                        truth,
                                                        test_size=0.2,
                                                        random_state=1)

    lgbm.fit(train_x, train_y)
    pred = lgbm.predict(test_x)
    print(f1_score(test_y, pred))
    indices = np.argsort(lgbm.feature_importances_)
    importance = lgbm.feature_importances_
    top_indices = indices[-100:][::-1]

    train_feat_df = pd.read_csv('../data/feat_train.csv')
    test_feat_df = pd.read_csv('../data/feat_test.csv')
    feat_names = train_feat_df.columns.values
    selected_feat_names = feat_names[top_indices]

    train_feat_df[selected_feat_names].to_csv(
        '../data/simplified_train_feat.csv', index=False)
    test_feat_df[selected_feat_names].to_csv(
        '../data/simplified_test_feat.csv', index=False)
Code Example #21
model = LGBMClassifier(num_leaves=452,
                       num_iterations=5500,
                       learning_rate=0.01,
                       min_data_in_leaf=17,
                       max_bin=800,
                       bagging_fraction=0.74,
                       max_depth=50,
                       objective='binary')
"""
grid = GridSearchCV(model,param_grid)
grid.fit(res_train, feature)
# summarize the results of the grid search
print(grid.best_params_)
"""

model.fit(res_train, feature)

y_pred = model.predict(res_test)

my_submission = pd.DataFrame({
    'building_id': index_test,
    'damage_grade': y_pred
})
clean_submission = {
    "damage_grade": {
        1: "Grade 1",
        2: "Grade 2",
        3: "Grade 3",
        4: "Grade 4",
        5: "Grade 5"
    }
Code Example #22
lgbm = LGBMClassifier(
    num_leaves=10,
    colsample_bytree=.8,
    subsample=.9,
    max_depth=7,
    reg_alpha=.1,
    reg_lambda=.1,
    min_split_gain=.01,
    min_child_weight=2,
    silent=-1,
    verbose=-1,
)

lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric='auc',
    verbose=100,
    early_stopping_rounds=10  #30
)

# Use 5-fold Cross Validation to get the accuracy # 0.6192
cv_score = model_selection.cross_val_score(lgbm, X_train, y_train, cv=5)
print('Model accuracy of LGBM is:', cv_score.mean())

### Part 4.2: Use Grid Search to Find Optimal Hyperparameters
# Choose the number of trees
parameters = {'n_estimators': [60, 80, 100]}
Grid_RF = GridSearchCV(LGBMClassifier(), parameters, cv=5)
Grid_RF.fit(X_train, y_train)

# best number of tress
Code Example #23
File: model.py  Project: ppstacy/automl3_starting_kit
class Model:
    def __init__(self, datainfo, timeinfo):
        '''
        This constructor is supposed to initialize data members.
        Use triple quotes for function documentation.
        '''
        # Just logging.info some info from the datainfo variable
        logging.info("The Budget for this data set is: %d seconds" %
                     datainfo['time_budget'])

        logging.info(
            "Loaded %d time features, %d numerical Features, %d categorical features and %d multi valued categorical variables"
            % (datainfo['loaded_feat_types'][0],
               datainfo['loaded_feat_types'][1],
               datainfo['loaded_feat_types'][2],
               datainfo['loaded_feat_types'][3]))
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)
        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
        self.clf = LGBMClassifier(**params)
        # Here you may have parameters and hyper-parameters

    def fit(self, F, y, datainfo, timeinfo):
        '''
        This function trains the model parameters.
        Args:
            X: Training data matrix of dim num_train_samples * num_feat.
            y: Training label matrix of dim num_train_samples * num_labels.
        Both inputs are numpy arrays.
        If fit is called multiple times on incremental data (train, test1, test2, etc.)
        you should warm-start your training from the pre-trained model. Past data will
        NOT be available for re-training.
        '''
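        # Sketch (not part of the original code): to honor the warm-start note in the
        # docstring, later calls could continue from the previously fitted booster,
        # assuming the installed lightgbm exposes ``init_model`` in its sklearn fit():
        #   prev_booster = self.clf.booster_ if self.is_trained else None
        #   ...
        #   self.clf.fit(X_trn, y_trn, eval_set=(X_val, y_val),
        #                early_stopping_rounds=10, verbose=10, init_model=prev_booster)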

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Frequency encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            self.cat_encs = FrequencyEncoder()
            X_cat = self.cat_encs.fit_transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        self.num_train_samples = X.shape[0]
        self.num_feat = X.shape[1]
        num_train_samples = y.shape[0]

        self.DataX = X
        self.DataY = y
        logging.info("The whole available data is: ")
        logging.info(
            ("Real-FIT: dim(X)= [{:d}, {:d}]").format(self.DataX.shape[0],
                                                      self.DataX.shape[1]))
        logging.info(
            ("Real-FIT: dim(y)= [{:d}, {:d}]").format(self.DataY.shape[0],
                                                      self.num_labels))

        X_trn, X_val, y_trn, y_val = train_test_split(X,
                                                      y,
                                                      test_size=.25,
                                                      random_state=SEED)
        self.clf.fit(X_trn,
                     y_trn,
                     eval_set=(X_val, y_val),
                     early_stopping_rounds=10,
                     verbose=10)

        if (self.num_train_samples != num_train_samples):
            logging.info("ARRGH: number of samples in X and y do not match!")
        self.is_trained = True

    def predict(self, F, datainfo, timeinfo):
        '''
        This function should provide predictions of labels on (test) data.
        Here we return the positive-class probability from the trained classifier.
        Make sure that the predicted values are in the correct format for the scoring
        metric. For example, binary classification problems often expect predictions
        in the form of a discriminant value (if the area under the ROC curve is the
        metric) rather than predictions of the class labels themselves.
        The predict function can eventually return probabilities or continuous values.
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Frequency encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            X_cat = self.cat_encs.transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        num_test_samples = X.shape[0]
        if X.ndim > 1: num_feat = X.shape[1]
        logging.info(
            ("PREDICT: dim(X)= [{:d}, {:d}]").format(num_test_samples,
                                                     num_feat))
        if (self.num_feat != num_feat):
            logging.info(
                "ARRGH: number of features in X does not match training data!")
        logging.info(
            ("PREDICT: dim(y)= [{:d}, {:d}]").format(num_test_samples,
                                                     self.num_labels))
        y = self.clf.predict_proba(X)[:, 1]
        y = np.transpose(y)
        return y

    def save(self, path="./"):
        pickle.dump(self, open(path + '_model.pickle', "wb"))

    def load(self, path="./"):
        modelfile = path + '_model.pickle'
        if isfile(modelfile):
            with open(modelfile, "rb") as f:
                self = pickle.load(f)
            logging.info("Model reloaded from: " + modelfile)
        return self
Code Example #24
# Make copies of X, Y to be used within CV
X = X_train.drop(["msno", "is_churn"], axis=1).copy()
y = X_train["is_churn"].copy()

# (stratified) Cross validation
for train_index, validation_index in kf.split(X, y):
    print("Cross-validation, Fold %d" % (len(log_loss_val) + 1))

    # Split data into training and testing set
    X_train = X.iloc[train_index, :].copy()
    X_validate = X.iloc[validation_index, :].copy()
    y_train = y[train_index]
    y_validate = y[validation_index]

    # Train the model
    model = model.fit(X_train, y_train)

    # Test the model
    log_loss_val.append(log_loss(y_validate, model.predict_proba(X_validate)))
    print("Log loss: %f" % log_loss_val[-1])

    # Make predictions
    y_pred.append(np.log(model.predict_proba(X_test[X.columns])[:, 1]))

    # delete temporary dataframes
    del X_train, X_validate, y_train, y_validate

# Evaluate results from CV
print("Log loss %f +/- %f" % (np.mean(log_loss_val), 2 * np.std(log_loss_val)))

## =========================== 4. Output results =========================== ##
Code Example #25
    def cal_subject_mul(train_vec, train_subject, test_id, test_vec, iter,
                        baseline):
        # param = { 'boosting_type':'gbdt', 'num_leaves':55, 'reg_alpha':0.0, 'reg_lambda':1,
        #           'max_depth':15, 'n_estimators':6000, 'objective':'binary',
        #           'subsample':0.8, 'colsample_bytree':0.8, 'subsample_freq':1,
        #           'learning_rate':0.06, 'min_child_weight':1, 'random_state':20, 'n_jobs':4}
        # clf = LGBMClassifier(param)
        # clf = svm.LinearSVC(max_iter=100000)
        N = 10
        train_vec = np.array(train_vec)
        train_subject = np.array(train_subject)
        kf = StratifiedKFold(n_splits=N,
                             random_state=2018).split(train_vec, train_subject)
        clf = LGBMClassifier(boosting_type='gbdt',
                             num_leaves=80,
                             reg_alpha=0.1,
                             reg_lambda=1,
                             max_depth=8,
                             n_estimators=iter,
                             objective='binary',
                             subsample=0.8,
                             colsample_bytree=0.8,
                             subsample_freq=1,
                             learning_rate=0.06,
                             min_child_weight=1,
                             random_state=20,
                             n_jobs=4)
        # iter_list = [803,61,69,314,196,223,64,153,55,284,173]

        test_res = list()

        for l in range(len(test_id)):
            test_res.append(list())

        subject_vocab = {
            '价格': 0,
            '配置': 1,
            '操控': 2,
            '舒适性': 3,
            '油耗': 4,
            '动力': 5,
            '内饰': 6,
            '安全性': 7,
            '空间': 8,
            '外观': 9
        }

        for l in range(len(test_id)):
            test_res[l].append(subject_vocab[baseline['subject'][l]])

        value_list = list()

        for l in range(len(test_id)):
            value_list.append(list())

        for l in range(len(test_id)):
            value_list[l].append(baseline['sentiment_value'][l])

        res_sub = np.zeros([10, len(test_id), N])
        for k, (train_fold, test_fold) in enumerate(kf):
            for i in range(10):
                train_subject_kf = train_subject[train_fold]
                train_label_onehot = train_subject_kf.copy()
                for l in range(len(train_subject_kf)):
                    if train_subject_kf[l] != i:
                        train_label_onehot[l] = 0
                    else:
                        train_label_onehot[l] = 1
                # print(train_label_onehot)
                # print(train_subject)
                # clf = LGBMClassifier(boosting_type='gbdt', num_leaves=80, reg_alpha=0.1, reg_lambda=1,
                #                      max_depth=8, n_estimators=iter_list[i], objective='binary',
                #                      subsample=0.8, colsample_bytree=0.8, subsample_freq=1,
                #                      learning_rate=0.06, min_child_weight=1, random_state=20, n_jobs=4)
                clf.fit(train_vec[train_fold], train_label_onehot)
                res_onehot = clf.predict(test_vec)
                for l in range(len(test_id)):
                    res_sub[i][l][k] = res_onehot[l]

        res_onehot = np.zeros([10, len(test_id)])
        for i in range(10):
            for j in range(len(test_id)):
                tmp = []
                for k in range(N):
                    tmp.append(res_sub[i][j][k])
                if sum(tmp) > 7:
                    res_onehot[i][j] = 1

        for i in range(10):
            for l in range(len(test_id)):
                if res_onehot[i][l] == 1 and i not in test_res[l]:
                    test_res[l].append(i)
                    value_list[l].append(0)
                    # value_list[l].append(value_list[l][0])
        # clf = LGBMClassifier(boosting_type='gbdt', num_leaves=80, reg_alpha=0.1, reg_lambda=1,
        #                      max_depth=8, n_estimators=iter_list[10], objective='binary',
        #                      subsample=0.8, colsample_bytree=0.8, subsample_freq=1,
        #                      learning_rate=0.06, min_child_weight=1, random_state=20, n_jobs=4)

        return test_id, test_res, value_list
Code Example #26
# Block 02: Model parameterization

from lightgbm import LGBMClassifier

classifier_lgbm_kpca = LGBMClassifier(max_depth=500,
                                      learning_rate=0.01,
                                      num_leaves=1000,
                                      min_data_in_leaf=200,
                                      n_estimators=2000,
                                      objective='binary',
                                      metric='binary_logloss',
                                      random_state=42)

# Block 03: Fit and prediction

classifier_lgbm_kpca.fit(X_train_kpca_new, Y_train)
Y_pred_lgbm_kpca = classifier_lgbm_kpca.predict(X_test_kpca_new)

# Block 04: Metrics analysis

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

#Accuracy Score

mtrc_accuracy_score_lgbm_kpca = accuracy_score(Y_test, Y_pred_lgbm_kpca)
print('Accuracy Score : ' + str(mtrc_accuracy_score_lgbm_kpca))

#Precision Score

mtrc_precision_score_lgbm_kpca = precision_score(Y_test, Y_pred_lgbm_kpca)
print('Precision Score : ' + str(mtrc_precision_score_lgbm_kpca))
def kfold_lightgbm(df, num_folds, stratified = False, debug= False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    best_model = None
    best_score = 0.0
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 200, early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        roc_auc_score_kfold = roc_auc_score(valid_y, oof_preds[valid_idx])
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score_kfold))
        if roc_auc_score_kfold > best_score:
            best_score = roc_auc_score_kfold
            best_model = clf
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
    #display_importances(feature_importance_df)
    datasets = {'feats':feats,'train_x':train_df[feats],'train_y':train_df['TARGET'],'valid_x':test_df[feats],'valid_y':test_df['TARGET']}
    return feature_importance_df, datasets, best_model
Code Example #28
x = np.concatenate([f_ds, m_ds], 0)
x = x.reshape(x.shape[0], x.shape[1]*x.shape[2])
y = np.concatenate([f_lb, m_lb], 0)
print(x.shape)  # (2141, 110336)
print(y.shape)  # (2141,)

# Preprocessing
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True, test_size=0.2, random_state=42)
print(x_train.shape)    # (1712, 110336)
print(x_test.shape)     # (429, 110336)
print(y_train.shape)    # (1712,)
print(y_test.shape)     # (429,)

# Build the model
model = LGBMClassifier(device='gpu')
model.fit(x_train, y_train)

# model & weight save
# pickle.dump(model, open('E:/nmb/nmb_data/cp/m04_mfcc_LGBMClassifier.data', 'wb')) # wb : write
# print("== save complete ==")

# model load
model = pickle.load(open('E:/nmb/nmb_data/cp/m04_mfcc_LGBMClassifier.data', 'rb'))  # rb : read
# time >>

# evaluate
y_pred = model.predict(x_test)
# print(y_pred[:100])
# print(y_pred[100:])

accuracy = accuracy_score(y_test, y_pred)
Code Example #29
def find_markers(
    data: AnnData,
    label_attr: str,
    de_key: str = "de_res",
    n_jobs: int = -1,
    min_gain: float = 1.0,
    random_state: int = 0,
    remove_ribo: bool = False,
) -> Dict[str, Dict[str, List[str]]]:
    """Find markers using gradient boosting method.

    Parameters
    ----------
    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.

    label_attr: ``str``
        Cluster labels used for finding markers. Must exist in ``data.obs``.

    de_key: ``str``, optional, default: ``"de_res"``
        Keyword of DE analysis result stored in ``data.varm``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to used. If ``-1``, use all available threads.

    min_gain: ``float``, optional, default: ``1.0``
        Only report genes with a feature importance score (in gain) of at least ``min_gain``.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    remove_ribo: ``bool``, optional, default: ``False``
        If ``True``, remove ribosomal genes with either RPL or RPS as prefixes.

    Returns
    -------
    markers: ``Dict[str, Dict[str, List[str]]]``
        A Python dictionary containing marker information in structure ``dict[cluster_id]['up' or 'down'][dataframe]``.

    Examples
    --------
    >>> marker_dict = pg.find_markers(adata, label_attr = 'leiden_labels')
    """

    n_jobs = effective_n_jobs(n_jobs)

    if remove_ribo:
        data = data[:,
                    np.vectorize(lambda x: not x.startswith("RPL") and not x.
                                 startswith("RPS"))(data.var_names), ]

    X_train, X_test, y_train, y_test = train_test_split(
        data.X,
        data.obs[label_attr],
        test_size=0.1,
        random_state=random_state,
        stratify=data.obs[label_attr],
    )

    # start = time.time()
    # xgb = XGBClassifier(n_jobs = n_jobs, n_gpus = 0)
    # xgb.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_test, y_test)], eval_metric = 'merror')
    # # print(xgb.evals_result())
    # end = time.time()
    # print("XGBoost used {:.2f}s to train.".format(end - start))

    # from xgboost import XGBClassifier
    try:
        from lightgbm import LGBMClassifier
    except ImportError:
        print("Need lightgbm! Try 'pip install lightgbm'.")
    start_lgb = time.time()
    lgb = LGBMClassifier(n_jobs=n_jobs,
                         metric="multi_error",
                         importance_type="gain")
    lgb.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        early_stopping_rounds=1,
    )
    end_lgb = time.time()
    logger.info("LightGBM used {:.2f}s to train.".format(end_lgb - start_lgb))

    ntot = (lgb.feature_importances_ >= min_gain).sum()
    ords = np.argsort(lgb.feature_importances_)[::-1][:ntot]

    log_exprs = [
        x for x in data.varm[de_key].dtype.names
        if x.startswith("mean_logExpr:")
    ]
    labels = [x.rpartition(":")[2] for x in log_exprs]

    titles = [("down", "down_gain"), ("weak", "weak_gain"),
              ("strong", "strong_gain")]
    markers = defaultdict(lambda: defaultdict(list))

    kmeans = KMeans(n_clusters=3, random_state=random_state)
    for gene_id in ords:
        gene_symbol = data.var_names[gene_id]
        mydat = [[x] for x in data.varm[de_key][log_exprs][gene_id]]
        kmeans.fit(mydat)
        kmeans_label_mode = pd.Series(kmeans.labels_).mode()[0]
        for i, kmeans_label in enumerate(
                np.argsort(kmeans.cluster_centers_[:, 0])):
            if kmeans_label != kmeans_label_mode:
                for pos in (kmeans.labels_ == kmeans_label).nonzero()[0]:
                    clust_label = labels[pos]
                    markers[clust_label][titles[i][0]].append(gene_symbol)
                    markers[clust_label][titles[i][1]].append("{:.2f}".format(
                        lgb.feature_importances_[gene_id]))

    return markers
Code Example #30
        TARGET_COL
    ]
]
len(features)

from lightgbm import LGBMClassifier
clf = LGBMClassifier(n_estimators=550,
                     learning_rate=0.03,
                     min_child_samples=40,
                     random_state=1,
                     colsample_bytree=0.5,
                     reg_alpha=2,
                     reg_lambda=2)

clf.fit(trn[features],
        trn[TARGET_COL],
        eval_set=[(val[features], val[TARGET_COL])],
        verbose=50,
        eval_metric='auc',
        early_stopping_rounds=100)

preds = clf.predict_proba(test[features])[:, 1]

fi = pd.Series(index=features, data=clf.feature_importances_)
fi.sort_values(ascending=False)[0:20][::-1].plot(kind='barh')

sub = pd.DataFrame({"Patient_ID": test.Patient_ID.values})
sub["Health_Camp_ID"] = test.Health_Camp_ID.values
sub["Outcome"] = preds
sub.to_csv("lgbmblending.csv", index=False)
Code Example #31
data = pd.read_csv('feature.csv')
data_y = data['tz_students'].values
data_y = 1 - data_y
data = data.drop(['STUDENTCODE', 'tz_students'], axis=1)

# lightgbm
clf = LGBMClassifier(num_leaves=40,
                     learning_rate=0.05,
                     max_depth=20,
                     n_estimators=300,
                     subsample=0.8,
                     colsample_bytree=1,
                     min_child_weight=1)

# Compute feature importances
clf.fit(X=data, y=data_y)
score = clf.feature_importances_
score = [(data.columns[i], score[i]) for i in range(len(score))]
score = sorted(score, key=lambda k: k[1], reverse=True)
name_list = []
for i in range(len(score)):
    if score[i][1] > 0:
        name_list.append(score[i][0])
        print(i, score[i])
    else:
        break
print(name_list)

# Compute a baseline score
tmp_data = data[name_list]
start = time.time()
Code Example #32

############################################################
############################################################
############################################################
############################################################
ColumnSelect=np.asarray(["C"+str(X) for X in range(1,15)])
TempTrain=TrainTransaction[ColumnSelect]
TempTrain=TempTrain.join([pd.get_dummies(data=TrainTransaction["ProductCD"]), pd.get_dummies(data=TrainTransaction["P_emaildomain"]), pd.get_dummies(data=TrainTransaction["QuantileAmt"])])

#Train and test sets
X_train, X_test, y_train, y_test = train_test_split(TempTrain, TrainTransaction['isFraud'], test_size=0.1, random_state=42)

#Set up the LGBM model (no grid search is actually performed here)
LGBMModel=LGBMClassifier()
LGBMModel.fit(X_train, y_train)

#Predict
Predictions=LGBMModel.predict(X_test)

#Metrics
print(confusion_matrix(y_test, Predictions))
print(classification_report(y_test, Predictions))

#Save Parameters
text_file = open("Params_V5.txt", "w")
text_file.write("%s\n" % confusion_matrix(y_test, Predictions))
text_file.write("%s\n" % classification_report(y_test, Predictions))
text_file.close()

#Try with test
Code Example #33
File: Model-1.py  Project: vgaurav3011/100-Days-of-ML
sns.catplot(x="Crop_Damage", y="Season", hue="Crop_Damage", kind="bar", data=train);

X = train.drop(labels=['Crop_Damage'], axis=1)
y = train['Crop_Damage']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

y_test.value_counts

X_train_data = X_train.iloc[:, 1:9]
X_test_data = X_test.iloc[:, 1:9]
X_train_data

lgbm = LGBMClassifier()
lgbm_pred = lgbm.fit(X_train_data, y_train)
y_pred = lgbm_pred.predict(X_test_data)
print(accuracy_score(y_pred, y_test))

test2 = test.iloc[:, 1:9]
test_pred = lgbm_pred.predict(test2)
test2['Crop_Damage'] = test_pred
test2['ID'] = test['ID']
test2

output=pd.DataFrame(data={"ID":test2["ID"],"Crop_Damage":test2["Crop_Damage"]}).to_csv("Sol.csv", index=False)
from google.colab import files
files.download('Sol.csv')

output
Code Example #34
def gen_sub_by_para():
    args = locals()
    logger.debug(f'Run train dnn:{args}')

    from code_felix.tiny.util import get_stable_feature
    feature_label = get_stable_feature('1003')
    #feature_label = get_dynamic_feature()
    logger.debug(f'The input feature:{feature_label.shape}')

    test = feature_label[feature_label['sex'].isnull()]
    train = feature_label[feature_label['sex'].notnull()]
    train['sex_age'] = train['sex_age'].astype('category')

    X_train, X_test, y_train, y_test = split_train(train)

    gbm = LGBMClassifier(
        n_estimators=20000,
        boosting_type='gbdt',
        objective='multiclass',
        num_class=22,
        random_state=47,
        metric=['multi_logloss'],
        verbose=-1,
        max_depth=3,
        feature_fraction=0.2,
        subsample=0.5,
        min_data_in_leaf=1472,
        reg_alpha=2,
        reg_lambda=4,

        ##########
        learning_rate=0.05,  # 0.1
        colsample_bytree=None,  #1
        min_child_samples=None,  #20
        min_child_weight=None,  #0.001
        min_split_gain=None,  #0
        num_leaves=None,  #31
        subsample_for_bin=None,  #200000
        subsample_freq=None,  #1
        nthread=-1,
        #device='gpu'
    )

    # gbm.set_params(**params)

    logger.debug(gbm)

    res = gbm.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  early_stopping_rounds=100,
                  verbose=True)
    print(f'Fit return type:{type(res)}')

    print('Feature importances:', list(gbm.feature_importances_))

    print_imp_list(train, gbm)

    best = round(gbm.best_score_.get('valid_1').get('multi_logloss'), 5)
    best_score = best
    best_epoch = gbm.best_iteration_

    print(gbm)

    best = "{:.5f}".format(best)

    pre_x = test.drop(['sex', 'age', 'sex_age', 'device'], axis=1)
    # sub = pd.DataFrame(gbm.predict_proba(pre_x.values, num_iteration=gbm.best_iteration_))
    #
    # sub.columns=train.sex_age.cat.categories
    # sub['DeviceID']=test['device'].values
    # sub=sub[['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', '1-7','1-8', '1-9', '1-10', '2-0', '2-1', '2-2', '2-3', '2-4', '2-5', '2-6', '2-7', '2-8', '2-9', '2-10']]
    #
    # # from sklearn.metrics import log_loss
    # # loss = log_loss(y_test, gbm.predict_proba(X_test,num_iteration=gbm.best_iteration_))
    # #
    # # print(f'Loss={loss}, best={best}')
    # #lgb.plot_importance(gbm, max_num_features=20)
    #
    # #print(f'=============Final train feature({len(feature_label.columns)}):\n{list(feature_label.columns)} \n {len(feature_label.columns)}')
    #
    # file = f'./sub/baseline_lg_sci_{best}_{args}.csv'
    # file = replace_invalid_filename_char(file)
    # print(f'sub file save to {file}')
    # sub.to_csv(file,index=False)

    ###Save result for ensemble
    train_bk = pd.DataFrame(gbm.predict_proba(
        train.drop(['sex', 'age', 'sex_age', 'device'], axis=1)),
                            index=train.device,
                            columns=train.sex_age.cat.categories)

    test_bk = pd.DataFrame(gbm.predict_proba(pre_x),
                           index=test.device,
                           columns=train.sex_age.cat.categories)

    from code_felix.tiny.util import save_result_for_ensemble
    save_result_for_ensemble(
        f'{best_score}_{best_epoch}_lgb_{args}',
        train=train_bk,
        test=test_bk,
        label=None,
    )
Code Example #35
File: check_7.py  Project: vlirsub/sdsj2018-automl
    "bagging_fraction": 0.70,
    'bagging_freq': 4,
    "max_depth": -1,
    "verbosity": -1,
    "reg_alpha": 0.3,
    "reg_lambda": 0.1,
    # "min_split_gain":0.2,
    "min_child_weight": 10,
    'zero_as_missing': True,
    'num_threads': 8,
}

model = lgb.train(params, lgb.Dataset(X_values, label=y_train), 600)

model = LGBMClassifier(n_estimators=100)
model.fit(X_values, y_train)
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
score = cross_val_score(model, X_values, y_train, cv=kfold, n_jobs=1, scoring='roc_auc', verbose=0)
print('score {:.4}'.format(score.mean()))

prediction = model.predict_proba(X_test)[:, 1]
prediction = model.predict(X_test)

result = y_true.copy()
result['prediction'] = prediction

metric = roc_auc_score(result['target'], result['prediction'])
print('roc auc: {:.4}'.format(metric))
# 0.8453 0.8461
# 0.8317
Code Example #36
File: A_TL_LGB_LGB.py  Project: tomzhang/QH_FInSight
def without_cv_transfer_a_to_b_modeling():
    """

    :return:
    """

    '''Data input'''
    data_a_train = pd.read_csv('../data/A_train_final.csv', index_col='no')
    data_b_train = pd.read_csv('../data/B_train_final.csv', index_col='no')
    y_of_b_train = data_b_train['flag']
    data_b_test = pd.read_csv('../data/B_test_final.csv', index_col='no')

    '''Feature engineering for A_train'''
    data_a_train_without_label = data_a_train.drop('flag', axis=1)

    data_a_train_without_label['UserInfo_222x82'] = data_a_train_without_label['UserInfo_82'] * data_a_train_without_label['UserInfo_222']

    '''Fill missing values'''
    data_a_train_filled = data_a_train_without_label.fillna(value=10)

    '''Feature names'''
    feature_name = list(data_a_train_without_label.columns.values)
    data_b_test_user_id = list(data_b_test.index.values)

    '''Build the training and test sets'''
    x_temp = data_a_train_filled.iloc[:, :].values  # predictors
    y = data_a_train.iloc[:, -1].values  # target

    '''Feature selection. Note: if more features are added, feature_name must be updated as well.'''
    X, dropped_feature_name, len_feature_choose = lgb_feature_selection(feature_name, x_temp, y, "0.1*mean")

    '''Feature engineering for B_train'''
    data_b_train_without_label = data_b_train.drop('flag', axis=1)

    data_b_train_without_label['UserInfo_222x82'] = data_b_train_without_label['UserInfo_82'] * data_b_train_without_label['UserInfo_222']
    data_b_train_filled = data_b_train_without_label.fillna(value=10)

    '''Feature engineering for B_test'''
    data_b_test['UserInfo_222x82'] = data_b_test['UserInfo_82'] * data_b_test['UserInfo_222']
    data_b_test_filled = data_b_test.fillna(value=10)

    '''Feature filtering'''
    data_b_train_filled_after_feature_selection = data_test_feature_drop(data_b_train_filled, dropped_feature_name)
    data_b_test_filled_after_feature_selection = data_test_feature_drop(data_b_test_filled, dropped_feature_name)

    '''Train on A_train and predict B_train'''

    print('Start time (min):')
    print(time.perf_counter() / 60)

    parameter_n_estimators = 400
    classifier = LGBMClassifier(n_estimators=parameter_n_estimators)

    a_model = classifier.fit(X, y)

    prob_of_b_train = a_model.predict_proba(data_b_train_filled_after_feature_selection)

    print('Training finished (min):')
    print(time.perf_counter() / 60)

    '''Plot the ROC curve'''
    fpr, tpr, thresholds = roc_curve(y_of_b_train, prob_of_b_train[:, 1])

    roc_auc = auc(fpr, tpr)

    print('\nauc=' + str(roc_auc))

    '''Predict B_test'''

    prob_of_b_test = a_model.predict_proba(data_b_test_filled_after_feature_selection)

    result_file_name = '../result/B_test_predict_using_A_LGBLGB_without_cv_fillna_10' + '_N_' + str(parameter_n_estimators) + '_features_' + \
                       str(len_feature_choose) + '_offline_'+str(roc_auc)+'.csv'

    write_predict_results_to_csv(result_file_name, data_b_test_user_id, prob_of_b_test[:, 1].tolist())