Example #1
def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectFromModel
    clf = LGBMClassifier(n_estimators=400)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Print how many features have non-zero importance
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of non-zero features: ' + str(len(feature_score_dict) - m))

    # Print the feature importances
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/lgb_feature_importance.txt', 'w')
    f.write(str(th))
    f.write('\nRank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # Print which features were actually used
    how_long = matrix_x.shape[1]  # matrix_x is the input matrix after feature selection
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('features chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/lgb_feature_chose.txt', 'w')
    f.write('Feature Chose Name :\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # Collect the names of the unused features
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    return matrix_x, feature_not_used_name[:], len(feature_used_name)
Example #2
def main():
	# load the data
	print('\nloading...')
	wd = '/Users/ewenwang/Documents/credit/data'
	os.chdir(wd)
	dataFile = 'creditcard.csv'
	dataset = pd.read_csv(dataFile, low_memory=False)

	# set target and predictors
	target = 'Class'
	predictors = [x for x in dataset.columns if x not in [target]]

	# split the data into training and test sets 
	seed = 2017
	dtrain, dtest = train_test_split(dataset, test_size=0.33, random_state=seed)

	# build the classifier
	gbm = LGBMClassifier(
		learning_rate=0.01,
		n_estimators=5000,
		objective='binary',
		metric='auc',
		max_depth=10,
		subsample=0.83,
		colsample_bytree=0.63,
		save_binary=True,
		is_unbalance=True,
		random_state=seed
	)

	# train the model
	print('\nfitting...')
	gbm.fit(dtrain[predictors], dtrain[target])

	# report
	report(gbm, dtrain, dtest, predictors, target)

	return None
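
# `report` is not defined in this snippet. A minimal sketch of a compatible
# helper, assuming it prints train/test AUC (the body and the metric choice
# are assumptions, not the original implementation):
def report(model, dtrain, dtest, predictors, target):
    from sklearn.metrics import roc_auc_score
    for name, df in [('train', dtrain), ('test', dtest)]:
        proba = model.predict_proba(df[predictors])[:, 1]
        print('%s AUC: %.6f' % (name, roc_auc_score(df[target], proba)))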
Example #3
    #     param_distributions={
    #         "n_estimators": np.arange(10, 100, 10),
    #         "learning_rate": np.arange(0.1, 1, 0.1)
    #     },
    #     n_iter=1,
    #     scoring=SCORING,
    #     refit=SCORING[0],
    #     cv=CV,
    #     return_train_score=False,
    #     random_state=RANDOM_STATE
    # )
    # model.add_model("AdaBoost", adb_rs)

    # 4. LightGBM
    lgbm_rs = RandomizedSearchCV(
        estimator=LGBMClassifier(),
        param_distributions={
            # "n_estimators": np.arange(40, 80, 10),
            # "learning_rate": np.arange(0.1, 0.4, 0.1),
            # "max_depth": np.arange(30, 60, 10),
            "n_estimators": [60],
            "learning_rate": [0.2],
            "max_depth": [60],
        },
        n_iter=1,
        scoring=SCORING,
        refit=SCORING[0],
        cv=CV,
        return_train_score=False,
        random_state=RANDOM_STATE
    )
Example #4
data = train_set[['label_list', 'course_vecs2']]
labels = data['label_list'].values.tolist()
y = [item for elem in labels for item in elem]
course_info = data['course_vecs2'].values.tolist()
course_list = [item for elem in course_info for item in elem]

model_lgb = LGBMClassifier(boosting_type='gbdt',
                           num_leaves=64,
                           learning_rate=0.01,
                           n_estimators=2000,
                           max_bin=425,
                           subsample_for_bin=50000,
                           objective='binary',
                           min_split_gain=0,
                           min_child_weight=5,
                           min_child_samples=10,
                           subsample=0.8,
                           subsample_freq=1,
                           colsample_bytree=1,
                           reg_alpha=3,
                           reg_lambda=5,
                           seed=1000,
                           n_jobs=-1,
                           silent=True)

model_lgb.fit(course_list,
              y,
              eval_names=['train'],
              eval_metric=['logloss', 'auc'],
              eval_set=[(course_list, y)],
              early_stopping_rounds=10)
Example #5
import numpy
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
data = load_iris()
X = data.data[:, :2]
y = data.target

ind = numpy.arange(X.shape[0])
numpy.random.shuffle(ind)
X = X[ind, :].copy()
y = y[ind].copy()

pipe = Pipeline([('scaler', StandardScaler()),
                 ('lgbm', LGBMClassifier(n_estimators=1, max_depth=1))])
pipe.fit(X, y)

##################################
# The conversion happens here and fails.

try:
    model_onnx = convert_sklearn(pipe, 'pipeline',
                                 [('input', FloatTensorType([1, 2]))])
except Exception as e:
    print(e)

###################################
# *sklearn-onnx* needs to know the appropriate converter
# for class *LGBMClassifier*, and that converter needs to be registered.
# The converter comes in two pieces: a shape calculator, which computes the
# output shapes from the input shapes, and the converter itself, which
# translates the fitted model into ONNX operators.
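
# A minimal sketch of that registration, assuming the public APIs of
# skl2onnx and onnxmltools (the alias and options below follow their
# documented LightGBM example):
from skl2onnx import update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm

update_registered_converter(
    LGBMClassifier, 'LightGbmLGBMClassifier',
    calculate_linear_classifier_output_shapes, convert_lightgbm,
    options={'nocl': [True, False], 'zipmap': [True, False]})

# Once registered, the convert_sklearn call above succeeds.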
Example #6
params = {  # dict opener reconstructed; any earlier entries are truncated
    'n_estimators': 376,
    'reg_alpha': 1.05,
    'reg_lambda': 2.53,
    'objective': 'multiclass',
    'boosting_type': 'gbdt',
    'subsample': 0.7,
    'random_state': 42,
    'colsample_bytree': 0.7
}

X_train, X_test, y_train, y_test = train_test_split(ttrain,
                                                    train['Segmentation'],
                                                    test_size=0.2,
                                                    random_state=42)

clf = LGBMClassifier(**params)
clf.fit(X_train, y_train)
p = clf.predict(X_test)
accuracy_score(y_test, p)
confusion_matrix(y_test, p)

plt.barh(colx, clf.feature_importances_)

clf.fit(ttrain, train["Segmentation"])
pred = clf.predict(ttest)
testx["Segmentation"] = pred

test = pd.merge(test, testx[['ID', 'Segmentation']], on='ID', how='left')
test["Segmentation"] = np.where(test["Segmentation_x"].isnull(),
                                test["Segmentation_y"], test["Segmentation_x"])
test[["ID", "Segmentation"]].to_csv(link + "\output.csv", index=False)
Example #7
X = final_train[cols]
y = final_train['like']

folds = KFold(n_splits=10, shuffle=True, random_state=123)
oof_preds = np.zeros(final_train.shape[0])
sub_preds = np.zeros(final_test.shape[0])
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X)):
    trn_x, trn_y = X.iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    print(trn_x.shape, trn_y.shape, val_x.shape, val_y.shape)
    clf = LGBMClassifier(
        n_estimators=2000,
        learning_rate=0.01,
        num_leaves=100,
        colsample_bytree=.8,
        subsample=.9,
        max_depth=20,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01,
        min_child_weight=2
    )
    
    clf.fit(trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric='auc', verbose=500, early_stopping_rounds=400)
    
    oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(final_test[cols], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
    
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))

Example #8
vals = [0.26, 0.27, 0.28, 0.29, 0.30]
lgb_test_preds = []
for val in vals:
    f1_threshold = val
    mod = LGBMClassifier(boosting_type='gbdt',
                         class_weight=None,
                         colsample_bytree=0.5,
                         learning_rate=0.1,
                         max_depth=-1,
                         metric='None',
                         min_child_samples=20,
                         min_child_weight=20,
                         min_split_gain=0.0,
                         n_estimators=10000,
                         n_jobs=8,
                         num_leaves=30,
                         objective=None,
                         random_state=None,
                         reg_alpha=0.0,
                         reg_lambda=0.0,
                         silent=True,
                         subsample=1.0,
                         subsample_for_bin=200000,
                         subsample_freq=1)
    n = preds(df_train=fe_train, y=y, seed=100, df_test=fe_test, mod=mod)
    lgb_test_preds.append((pd.Series(np.column_stack(n[1]).mean(axis=1)) >
                           f1_threshold).astype('int'))
    print((pd.Series(np.column_stack(n[1]).mean(axis=1)) >
           f1_threshold).value_counts(1))
    print(pd.Series(n[0] > f1_threshold).value_counts(1))
Example #9
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            is_unbalance=False,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=30,
            colsample_bytree=0.05,
            subsample=1,
            max_depth=8,
            reg_alpha=0,
            reg_lambda=100,
            min_split_gain=0.5,
            min_child_weight=70,
            silent=-1,
            verbose=-1,
            max_bin=300,
            subsample_freq=1,
        )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', verbose=1000, early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
    display_importances(feature_importance_df)
    return feature_importance_df
Example #10
def classification(X_train, X_test, y_train, y_test, label_type, pca_dim=100):
    scaler = StandardScaler()
    scaler.fit(np.vstack([X_train, X_test]))

    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    if label_type != 'labels':
        scaler = MinMaxScaler()
        temp1 = y_train.loc[:, ['valence', 'activation']]
        temp2 = y_test.loc[:, ['valence', 'activation']]
        temp = pd.concat((temp1, temp2))
        scaler.fit(temp.values)

        y_train = scaler.transform(temp1.values)
        y_test = scaler.transform(temp2.values)
    else:
        y_train = y_train.loc[:, 'cur_label'].values
        y_test = y_test.loc[:, 'cur_label'].values

    if pca_dim > 0:
        pca_model = PCA(n_components=min(pca_dim, X_train.shape[1])).fit(
            np.array(X_train))
        X_train = pca_model.transform(np.array(X_train))
        X_test = pca_model.transform(np.array(X_test))

    with open('train_test_data' + '.pickle', 'rb') as f:
        [train_data, test_data] = pickle.load(f)

    # with open('best_rf_cl' + '.pickle', 'rb') as f:
    #     clf = pickle.load(f)
    clf = LGBMClassifier(boosting_type='gbdt',
                         num_leaves=31,
                         max_depth=-1,
                         learning_rate=0.001,
                         n_estimators=1000,
                         objective=None,
                         min_split_gain=0,
                         min_child_weight=3,
                         min_child_samples=10,
                         subsample=0.8,
                         subsample_freq=1,
                         colsample_bytree=0.7,
                         reg_alpha=0.3,
                         reg_lambda=0,
                         seed=17)

    if label_type == 'valence':
        print('VALENCE')
        for i in combs:
            print('train {}, test {}'.format(cleaned(i[0]), cleaned(i[1])))
            X_temp_train, y_temp_train = cut_extreme_values(
                y_train[:, 0], X_train, i[0][0], i[0][1])
            y_temp_train = [extreme_features(x) for x in y_temp_train]

            X_temp_test, y_temp_test = cut_extreme_values(
                y_test[:, 0], X_test, i[1][0], i[1][1])
            y_temp_test = [extreme_features(x) for x in y_temp_test]

            clf.fit(X_temp_train, y_temp_train)
            y_pred = clf.predict(X_temp_test)
            print('f1_score= {}'.format(
                round(f1_score(y_pred, y_temp_test, average='macro'), 3)))
            # print(classification_report(y_pred, y_test))

        # y_test = [extreme_features(x) for x in y_test]
    elif label_type == 'arousal':
        print('AROUSAL')
        for i in combs:
            print('train {}, test {}'.format(cleaned(i[0]), cleaned(i[1])))
            X_temp_train, y_temp_train = cut_extreme_values(
                y_train[:, 1], X_train, i[0][0], i[0][1])
            y_temp_train = [extreme_features(x) for x in y_temp_train]

            X_temp_test, y_temp_test = cut_extreme_values(
                y_test[:, 1], X_test, i[1][0], i[1][1])
            y_temp_test = [extreme_features(x) for x in y_temp_test]

            clf.fit(X_temp_train, y_temp_train)
            y_pred = clf.predict(X_temp_test)
            print('f1_score= {}'.format(
                round(f1_score(y_pred, y_temp_test, average='macro'), 3)))
            # print(classification_report(y_pred, y_test))

    elif label_type == 'labels':

        # STRATIFICATION
        # X = np.hstack((X_train, y_train[:, np.newaxis]))
        # X_pd = pd.DataFrame(X)
        # X_new = np.zeros((1, X.shape[1]))
        # max_num = max(X_pd.iloc[:, -1].value_counts())
        # for label in np.unique(y_train):
        #     indexes_to_add = np.random.choice(a=X_pd[X_pd.iloc[:, -1] == label].index, size=(max_num,))
        #     X_additional = X_pd.loc[indexes_to_add, :]
        #     X_new = np.vstack((X_new, X_additional.values))
        # X_new = X_new[1:,:]
        # X_train = X_new[:,:-1]
        # y_train = X_new[:, -1]
        # # ============================================================
        # shuffle

        combined = list(zip(X_train, y_train))
        random.shuffle(combined)
        X_train[:], y_train[:] = zip(*combined)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print('f1_score= {}'.format(f1_score(y_pred, y_test, average='macro')))
    else:
        raise ValueError('unknown label_type: {}'.format(label_type))
Example #11
def gbdt_lgb_cv_modeling():
    """LightGBM modeling with feature selection, stratified CV and ROC plots.

    :return: None; writes prediction CSVs and a ROC figure under ../result/
    """
    '''Data input'''
    data_train = pd.read_csv('../data/train.csv', index_col='ID')
    data_predict = pd.read_csv('../data/pred.csv', index_col='ID')
    '''Feature engineering on the training set'''
    data_train_without_label = data_train.drop('Label', axis=1)
    # del data_train_without_label['V17']
    # data_train_without_label['V14×V17'] = data_train_without_label['V14'] * data_train_without_label['V17']
    # data_train_without_label['V14×V4'] = data_train_without_label['V14'] * data_train_without_label['V4']
    # data_train_without_label['V14×V20'] = data_train_without_label['V14'] * data_train_without_label['V20']
    # data_train_without_label['V14×V7'] = data_train_without_label['V14']*data_train_without_label['V7']
    # data_train_without_label['V14×V10'] = data_train_without_label['V14'] * data_train_without_label['V10']
    #
    # data_train_without_label['V17×V4'] = data_train_without_label['V17'] * data_train_without_label['V4']
    # data_train_without_label['V17×V20'] = data_train_without_label['V17'] * data_train_without_label['V20']
    # data_train_without_label['V17×V7'] = data_train_without_label['V17'] * data_train_without_label['V7']
    # data_train_without_label['V17×V10'] = data_train_without_label['V17'] * data_train_without_label['V10']
    #
    # data_train_without_label['V4×V20'] = data_train_without_label['V4'] * data_train_without_label['V20']
    # data_train_without_label['V4×V7'] = data_train_without_label['V4'] * data_train_without_label['V7']
    # data_train_without_label['V4×V10'] = data_train_without_label['V4'] * data_train_without_label['V10']
    #
    # data_train_without_label['V20×V7'] = data_train_without_label['V20'] * data_train_without_label['V7']
    # data_train_without_label['V20×V10'] = data_train_without_label['V20'] * data_train_without_label['V10']
    #
    # data_train_without_label['V7×V10'] = data_train_without_label['V7'] * data_train_without_label['V10']

    feature_name = list(data_train_without_label.columns.values)
    data_predict_user_id = list(data_predict.index.values)
    '''Fill missing values'''
    frames = [data_train_without_label, data_predict]
    data_all = pd.concat(frames)
    data_train_filled = data_train_without_label.fillna(
        value=data_all.median())
    '''Build the training matrix and labels'''
    x_temp = data_train_filled.iloc[:, :].values  # predictors
    y = data_train.iloc[:, -1].values  # target
    '''Feature selection'''
    X, dropped_feature_name, len_feature_choose = gbdt_feature_selection(
        feature_name, x_temp, y, '0.0005*mean')
    # threshold '0.1*mean' selects 10 features
    # threshold '0.00001*mean' selects 14 features
    '''Prepare the validation set B_test'''
    # del data_predict['V17']

    data_predict_filled = data_predict.fillna(value=data_all.median())
    data_predict_filled_after_feature_selection = data_test_feature_drop(
        data_predict_filled, dropped_feature_name)
    '''Split train/test data sets'''
    cv = StratifiedKFold(n_splits=5, shuffle=True,
                         random_state=0)  # stratified sampling; cv = cross-validation
    '''Choose a classification model'''
    parameter_n_estimators = 100
    classifier = LGBMClassifier(n_estimators=parameter_n_estimators,
                                learning_rate=0.1)
    '''Model fit, predict and ROC'''
    colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue'])
    lw = 2
    mean_f1 = 0.0
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 500)
    i_of_roc = 0
    a = 0

    th = 0.3

    for (train_indice, test_indice), color in zip(cv.split(X, y), colors):
        a_model = classifier.fit(X[train_indice], y[train_indice])

        # y_predict_label = a_model.predict(X[test_indice])

        probas_ = a_model.predict_proba(X[test_indice])

        fpr, tpr, thresholds = roc_curve(y[test_indice], probas_[:, 1])

        a += 1  # increment the fold counter

        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0

        roc_auc = auc(fpr, tpr)
        plt.plot(fpr,
                 tpr,
                 lw=lw,
                 color=color,
                 label='ROC fold %d (area = %0.4f)' % (i_of_roc, roc_auc))
        i_of_roc += 1

        label_transformed = probas_[:, 1]
        for i in range(len(label_transformed)):
            if label_transformed[i] > th:
                label_transformed[i] = 1
            else:
                label_transformed[i] = 0
        lt = label_transformed.astype('int32')
        f1 = f1_score(y[test_indice], lt)
        mean_f1 += f1  # 0.7739

    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')

    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('mean_auc=' + str(mean_auc))
    print('mean_f1=' + str(mean_f1 / 5))
    plt.plot(mean_fpr,
             mean_tpr,
             color='g',
             linestyle='--',
             label='Mean ROC (area = %0.4f)' % mean_auc,
             lw=lw)
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate mean_f1:' + str(mean_f1))
    plt.ylabel('True Positive Rate')

    plt.title('ROC_gbdt_' + str(len_feature_choose) + '_features_f1_' +
              str(mean_f1 / 5))
    plt.legend(loc="lower right")
    plt.savefig('../result/pred_ROC_GL' + '_N_' + str(parameter_n_estimators) +
                '_features_' + str(len_feature_choose) +
                '_proba_to_label_using_th_' + str(th) + '.png')
    # plt.show()

    a_model = classifier.fit(X, y)

    # label_predict = a_model.predict(data_predict_filled_after_feature_selection)  # predict on B_test
    proba_predict = a_model.predict_proba(
        data_predict_filled_after_feature_selection)
    '''Write the predicted probabilities'''
    result_file_name = '../result/pred_result_GL_N_' + str(
        parameter_n_estimators) + '_features_' + str(
            len_feature_choose) + '_proba.csv'
    write_predict_results_to_csv(result_file_name, data_predict_user_id,
                                 proba_predict[:, 1].tolist())
    '''Write the submission labels'''
    label_transformed = proba_predict[:, 1]
    sum_of_1 = 0
    for i in range(len(label_transformed)):
        if label_transformed[i] > th:
            label_transformed[i] = 1
            sum_of_1 += 1
        else:
            label_transformed[i] = 0
    lt = label_transformed.astype('int32')
    result_file_name = '../result/pred_result_GL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + \
                       '_proba_to_label_using_th_' + str(th) + '_' + str(sum_of_1) + '.csv'
    write_predict_results_to_csv(result_file_name, data_predict_user_id,
                                 lt.tolist())
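
# write_predict_results_to_csv is defined elsewhere in this project; a
# minimal sketch of a compatible helper (the column names are assumptions):
def write_predict_results_to_csv(file_name, user_ids, predictions):
    pd.DataFrame({'ID': user_ids,
                  'Label': predictions}).to_csv(file_name, index=False)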
Example #12
probility = np.zeros((len(test_df), label.shape[1]))
i = 0
model_type = 'ensemble'
# model_type ='single'
# K-fold cross-validation training
for train_index, valid_index in kf.split(train_df, label):
    print("\nFold {}".format(i + 1))
    i += 1
    X_train, label_train = train_df[train_index], label[train_index]
    X_valid, label_valid = train_df[valid_index], label[valid_index]

    clf1 = OneVsRestClassifier(
        XGBClassifier(eval_metric='mlogloss',
                      use_label_encoder=False,
                      n_estimators=150))
    clf2 = LGBMClassifier()
    clf3 = LogisticRegression(max_iter=500, n_jobs=20)
    # Ensemble method 1: weighted soft voting
    if model_type == 'ensemble':
        # XGB alone outperforms the other two models, so the weights are 2:1:1
        model = OneVsRestClassifier(
            EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                   weights=[2, 1, 1],
                                   voting='soft',
                                   verbose=2))
    # Ensemble method 2: stacking
    elif model_type == 'stacking':
        lr = LogisticRegression()
        base = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                  use_probas=True,
                                  average_probas=False,
Example #13
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
             'scale_pos_weight': [100, 200, 300, 400, 500, 600]}


fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 100,
            'categorical_feature': 'auto'}

from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(n_jobs=-1, scale_pos_weight=578)
lgbm_gs = RandomizedSearchCV(
    lgbm, param_distributions=param_test,
    n_iter=100, scoring=scorer, cv=k_fold, refit=True, n_jobs=-1,
)
lgbm_gs.fit(X_train, y_train)
logger.info(f'LGBM finetuned best scores: {lgbm_gs.best_score_}')
logger.info(f'LGBM finetuned best params: {lgbm_gs.best_params_}')

logger.info('Evaluating models on test set...')

logger.info('LR score:')
logger.info(f'{rs.score(X_test, y_test)}')
logger.info('LGBM score:')
logger.info(f'{lgbm_gs.score(X_test, y_test)}')
Example #14
def HyperOptPipeline(algo, n_iter=-1):
    if algo in ['linreg', 'logreg', 'svr', 'svc']:
        ss = StandardScaler()
        mms = MinMaxScaler()
        
    if algo == 'linreg':
        model_linreg = LinearRegression()
        model_lasso = Lasso()
        model_ridge = Ridge()
        model_elasticnet = ElasticNet()
        
        params = [
            {
                'scaler': [ss, mms],
                'estimator': [model_linreg]
            },{
                'scaler': [ss, mms],
                'estimator__alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator': [model_lasso]
            },{
                'scaler': [ss, mms],
                'estimator__alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator': [model_ridge]
            },{
                'scaler': [ss, mms],
                'estimator__alpha': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
                'estimator': [model_elasticnet]
            }
        ]
        
        pipeline = Pipeline([('scaler', ss), ('estimator', model_linreg)])
        
    if algo == 'logreg':
        model_logreg = LogisticRegression(class_weight='balanced', solver='saga', max_iter=100_000)

        params = [
            {
                'scaler': [ss, mms],
                'estimator__penalty': ['l1', 'l2'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
            },
            {
                'scaler': [ss, mms],
                'estimator__penalty': ['elasticnet'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator__l1_ratio': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
            },
            {
                'scaler': [ss, mms],
                'estimator__penalty': ['none'],
            },
        ]
        
        pipeline = Pipeline([('scaler', ss), ('estimator', model_logreg)])

    if algo in ['svc', 'svr']:
        
        model = SVC(class_weight='balanced') if algo == 'svc' else SVR()
        
        params = [
            {
                'scaler': [ss, mms],
                'estimator__kernel': ['linear'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
            },
            {
                'scaler': [ss, mms],
                'estimator__kernel': ['rbf', 'sigmoid'],
                'estimator__gamma': ['scale', 'auto'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
            },
            {
                'scaler': [ss, mms],
                'estimator__kernel': ['poly'],
                'estimator__gamma': ['scale', 'auto'],
                'estimator__C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                'estimator__degree': [2, 3, 4, 5]
            }
        ]
        
        pipeline = Pipeline([('scaler', ss), ('estimator', model)])
        
    if algo in ['ctree', 'rtree']:
        if algo == 'ctree':
            model_rf = RandomForestClassifier(class_weight='balanced')
            model_gb = GradientBoostingClassifier()
            model_et = ExtraTreesClassifier(class_weight='balanced')
            model_xgb = XGBClassifier()
            model_xgbrf = XGBRFClassifier()
            model_cb = CatBoostClassifier(bootstrap_type='Bernoulli')
            model_lgbm = LGBMClassifier(class_weight='balanced')
        else:
            model_rf = RandomForestRegressor()
            model_gb = GradientBoostingRegressor()
            model_et = ExtraTreesRegressor()
            model_xgb = XGBRegressor()
            model_xgbrf = XGBRFRegressor()
            model_cb = CatBoostRegressor(bootstrap_type='Bernoulli')
            model_lgbm = LGBMRegressor()

        params =  [
            {
                'estimator': [model_rf],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__max_depth': [5, 10, 15, 25, 30, None],
                'estimator__min_samples_split': [1.0, 2, 5, 10, 15, 100],
                'estimator__min_samples_leaf': [1, 2, 5, 10],
            },
            {
                'estimator': [model_gb],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                'estimator__min_samples_split': [1.0, 2, 5, 10, 15, 100],
                'estimator__min_samples_leaf': [1, 2, 5, 10],
            },
            {
                'estimator': [model_et],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__max_depth': [5, 10, 15, 25, 30, None],
                'estimator__min_samples_split': [1.0, 2, 5, 10, 15, 100],
                'estimator__min_samples_leaf': [1, 2, 5, 10],
            },
            {
                'estimator': [model_xgb],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                'estimator__gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
                'estimator__min_child_weight': [1, 3, 5, 7],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
                'estimator__reg_alpha': [0, 0.1, 0.5, 1.0],
            },
            {
                'estimator': [model_xgbrf],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                'estimator__gamma': [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
                'estimator__min_child_weight': [1, 3, 5, 7],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
                'estimator__reg_alpha': [0, 0.1, 0.5, 1.0],
            },
            {
                'estimator': [model_cb],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [3, 5, 7, 9, 12, 15, 16],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
            },
            {
                'estimator': [model_lgbm],
                'estimator__n_estimators': [10, 50, 100, 250, 500],
                'estimator__learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
                'estimator__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
                'estimator__min_child_samples': [1, 2, 5, 10, 15, 100],
                'estimator__min_child_weight': [1, 3, 5, 7],
                'estimator__reg_lambda': [0.01, 0.1, 1.0],
                'estimator__reg_alpha': [0, 0.1, 0.5, 1.0],
            } 
        ]  
        
        pipeline = Pipeline([('estimator', model_rf)]) 
    
    n_params = 0        
    for param_dict in params:    
        n = 1
        for v in param_dict.values():
            n *= len(v)
        n_params += n
        
    print(n_params, 'parameter settings identified')
    if n_iter == -1:
        return GridSearchCV(pipeline, params, cv=ShuffleSplit(test_size=0.1, n_splits=1, random_state=19))
    return RandomizedSearchCV(pipeline, params, n_iter=n_iter, cv=ShuffleSplit(test_size=0.1, n_splits=1, random_state=19), random_state=19)
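
# A hypothetical use of the helper above on toy data; the dataset and the
# variable names below are illustrative, not from the original project:
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
search = HyperOptPipeline('logreg', n_iter=25)
search.fit(X, y)
print(search.best_params_)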
Example #15
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features= 10000)
all_review = tfidf.fit_transform(clean_reviews)
all_review=all_review.toarray()

print(len(y))


X_train=all_review[:len(y)]

# Use LightGBM
from lightgbm import LGBMClassifier


lgbm_model = LGBMClassifier(n_estimators=220, learning_rate=0.2, num_leaves=120, random_state=77)
lgbm_model.fit(X_train,y)


X_test = all_review[len(y):]
preds = lgbm_model.predict(X_test)



submit=pd.DataFrame(all["id"][len(y):])
submit['sentiment']=preds
submit.head()


# Function that strips the "" around id values
def remove_d(word):
Example #16
print('transforming...')
count_vec = TfidfVectorizer()
data_ip = count_vec.fit_transform(data['item_property_list'])

train = data[data.is_trade.notnull()]
train_index = list(train[train.day < 24].index) 
test_index = list(train[train.day == 24].index) 
ip_train = data_ip[train_index,:]
ip_test = data_ip[test_index,:]

gbm = LGBMClassifier(
	objective='binary',
	num_leaves=24,
	max_depth=3,
	learning_rate=0.1,
	seed=2018,
	colsample_bytree=0.3,
	subsample=0.8,
	n_jobs=-1,
	n_estimators=2000
	)

print('fitting...')
gbm.fit(ip_train, train.loc[train_index, 'is_trade'], eval_set=[(ip_test, train.loc[test_index, 'is_trade'])], 
		early_stopping_rounds=10)

property_df = pd.DataFrame(columns=['instance_id', 'item_property_prob'])
property_df['instance_id'] = data['instance_id']
property_df['item_property_prob'] = gbm.predict_proba(data_ip)[:, 1]

def NatureLP(data, columns):
Example #17
def OOFPreds(X, y, test_X, params, n_splits=5, random_state=23, clf='lgb'):
    """
    输入要求数据为 Dataframe
    返回数据 Series
    """

    # kept for downstream feature-importance analysis
    feature_importance = pd.DataFrame(columns=['feature', 'importance', 'fold'])

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # oof holds the cross-validation predictions; sub the test-set predictions
    oof_preds, sub_preds = np.zeros(X.shape[0]), np.zeros(test_X.shape[0])

    oof_train = np.zeros(X.shape[0])

    print(X.shape, test_X.shape)

    valid_scores = []
    train_scores = []

    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        trn_x, trn_y = X.iloc[trn_idx], y.iloc[trn_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

        # constant initial scores (base margin) passed to LightGBM via init_score
        trn_init_score = pd.Series([0.95] * len(trn_x), index=trn_x.index)
        val_init_score = pd.Series([0.95] * len(val_x), index=val_x.index)

        # build the model and predict
        if clf == 'lgb':
            with timer('fold {} training time:'.format(n_fold)) as time:
                gbm = LGBMClassifier(**params)
                gbm.fit(trn_x, trn_y, init_score=trn_init_score,
                        eval_set=[(trn_x, trn_y), (val_x, val_y)],
                        eval_init_score=[trn_init_score, val_init_score],
                        eval_metric='auc', verbose=30, early_stopping_rounds=100)

                print('best iteration: {}'.format(gbm.best_iteration_))
                print('time per 100 iterations: {:.3f}'.format(time * 100 / gbm.best_iteration_))

                pred_val = gbm.predict_proba(val_x, num_iteration=gbm.best_iteration_)[:, 1]
                pred_test = gbm.predict_proba(test_X, num_iteration=gbm.best_iteration_)[:, 1]

            # record prediction scores and results
            oof_preds[val_idx] = pred_val
            sub_preds += pred_test / folds.n_splits

            print(gbm.best_score_)

            valid_score = gbm.best_score_['valid_1']['auc']
            train_score = gbm.best_score_['training']['auc']

            valid_scores.append(valid_score)
            train_scores.append(train_score)

            feature_importance = pd.concat([feature_importance, pd.DataFrame({
                'importance': gbm.feature_importances_,
                'fold': [n_fold + 1] * X.shape[1],
                'feature': X.columns.tolist()})])


        else:
            # your own model
            # task 1: build the model and make predictions
            # task 2: record prediction scores and results
            # task 3: record feature importances
            clf = LogisticRegression(**params)
            clf.fit(trn_x, trn_y)

            pred_train = clf.predict_proba(trn_x)[:, 1]
            pred_val = clf.predict_proba(val_x)[:, 1]
            pred_test = clf.predict_proba(test_X)[:, 1]

            oof_preds[val_idx] = pred_val
            sub_preds += pred_test / folds.n_splits


            valid_score = roc_auc_score(val_y, pred_val)
            train_score = roc_auc_score(trn_y, pred_train)

            valid_scores.append(valid_score)
            train_scores.append(train_score)

            feature_importance = pd.concat([feature_importance, pd.DataFrame({
                'importance': clf.coef_[0],
                'fold': [n_fold + 1] * X.shape[1],
                'feature': X.columns.tolist()})])


        print('Fold {:02d} train AUC: {:.6f} valid AUC: {:.6f}'.format(n_fold + 1, train_score, valid_score))
        del trn_x, trn_y, val_x, val_y
        gc.collect()

    feature_importance['importance'] = feature_importance['importance'].astype(float)

    fold_names = list(range(folds.n_splits))
    fold_names.append('overall')

    valid_auc = roc_auc_score(y, oof_preds)

    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # build the DataFrame that records the scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    oof_preds = pd.Series(oof_preds.flatten(), index=X.index).rename('TARGET')
    sub_preds = pd.Series(sub_preds.flatten(), index=test_X.index).rename('TARGET')

    return oof_preds, sub_preds, feature_importance, metrics
Example #18
     'regression':
     LGBMRegressor(boosting_type='gbdt',
                   learning_rate=0.05,
                   num_iterations=1200,
                   max_depth=5,
                   n_estimators=1000,
                   verbose=-1,
                   num_leaves=2**5,
                   silent=True,
                   n_jobs=4),
     'classification':
     LGBMClassifier(boosting_type='gbdt',
                    learning_rate=0.05,
                    num_iterations=1200,
                    max_depth=5,
                    n_estimators=1000,
                    verbose=-1,
                    num_leaves=2**5,
                    silent=True,
                    n_jobs=4),
     'type':
     'gbdt'
 },
 #{'classification': CatBoostClassifier(
 #    **{**CATBOOST_PARAMS, **{
 #        'loss_function': 'MultiClass',
 #        'verbose': False,
 #        'thread_count': 4,
 #        'random_seed': 0}
 #    }
 #), 'regression': CatBoostRegressor(
Example #19
train = get_interval_ratio_feat(train)
test = get_interval_ratio_feat(test)

cate_feature = [
    'gender', 'age', 'edu', 'play_mday', 'play_weekday', 'play_isweekend',
    'fav_click_type', 'fav_like_type', 'fav_play_type', 'fav_show_type'
]
feature = list(train.columns)

lgb_model = LGBMClassifier(
    boosting_type="gbdt",
    num_leaves=64,
    reg_alpha=3,
    reg_lambda=3,
    max_depth=-1,
    n_estimators=10000,
    subsample=0.8,
    colsample_bytree=0.8,
    subsample_freq=1,
    learning_rate=0.01,
    random_state=1230,
    n_jobs=-1,
)
predict_result = pd.DataFrame()
predict_result['userid'] = test_id
predict_result['rentention_rate'] = 0
best_score = []
skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True)
st = time.time()
for index, (train_index, test_index) in enumerate(skf.split(train, y)):
    print('Start', index + 1, ' Fold')
    train_x, test_x, train_y, test_y = train.loc[train_index], train.loc[
Example #20
##KNeighborsClassifier(n_neighbors=1)
rnf = RandomForestClassifier(n_estimators=181,
                             max_features='sqrt',
                             bootstrap=False,
                             max_depth=60,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             random_state=1)
etr = ExtraTreesClassifier(n_estimators=500,
                           max_features=X_copy.shape[1],
                           min_samples_split=5,
                           min_samples_leaf=1,
                           random_state=1)
lgb = LGBMClassifier(objective='multiclass',
                     num_class=7,
                     learning_rate=0.2,
                     num_leaves=X_copy.shape[1],
                     random_state=1)  #num_leaves=109,
lrg = LogisticRegression(C=1000,
                         multi_class='multinomial',
                         solver='newton-cg',
                         random_state=1)
mlp = MLPClassifier(activation='logistic', max_iter=500)
#xgb = xgb.XGBClassifier(objective='multi:softmax')
#------------------------------------------------------------------------------
rf_param = {
    'n_estimators': [250, 300, 350, 400],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [None, 50, 60, 70, 80, 90],
    #'min_samples_split' : [2, 5, 10],
    #'min_samples_leaf' : [1, 2, 4],
Example #21
File: train.py  Project: 3v1l91l/age
def train_model(train):
    excluded_features = [
        'target',
        'user_hash',
        # 'City_post_HOME', 'City_post_WORK',
        # 'Raion_post_HOME', 'Raion_post_WORK',
        # 'City_post_HOME', 'City_post_WORK',
        'lat_quad_home',
        'lat_quad_work',
        'lon_quad_home',
        'lon_quad_work',
        'LAT_WORK',
        'LAT_HOME',
        'LON_WORK',
        'LON_HOME'
    ]  # , 'data_type_3_m1', 'data_type_1_m1', 'data_type_2_m1']
    train_features = [x for x in train.columns if x not in excluded_features]

    cats = list(train.dtypes[train.dtypes == 'object'].index.values)
    cats = [x for x in cats if x not in excluded_features]

    for f in cats:
        train[f], indexer = pd.factorize(train[f])

    importances = pd.DataFrame()
    importances['feature'] = train_features
    importances['gain'] = 0
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    for (train_index, valid_index) in kf.split(train, train['target']):
        trn_x = train[train_features].iloc[train_index]
        trn_y = train['target'].iloc[train_index]
        val_x = train[train_features].iloc[valid_index]
        val_y = train['target'].iloc[valid_index]
        clf = LGBMClassifier(
            objective='multiclass',
            num_class=6,
            num_leaves=16,
            max_depth=5,
            learning_rate=0.06,
            n_estimators=1000,
            subsample=.9,
            colsample_bytree=.8,
            #         lambda_l1=10,
            #         lambda_l2=0.01,
            random_state=1)
        clf.fit(
            trn_x,
            trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            #         eval_names=['train', 'valid'],
            early_stopping_rounds=50,
            verbose=50,
            categorical_feature=cats)
        importances['gain'] += clf.booster_.feature_importance(
            importance_type='gain') / n_splits
        y_pred = clf.predict(val_x)
        acc = accuracy_score(val_y, y_pred)
        print(f'accuracy_score={acc}')
        plt.figure(figsize=(12, 16))
        sns.barplot(x='gain',
                    y='feature',
                    data=importances.sort_values('gain', ascending=False)[:60])
        plt.savefig('importance.png')
Example #22
     'PROVINCE_NAME_广西壮族自治区', 'LCENTERTYPERANK_3.0', 'LCENTERTYPERANK_4.0', 'm1_maxmin_LOG_DAY',
     'm0_maxmin_KJ_CLICK_NUM', 'm4_LM_CLICK_NUM', 'SEX_1', 'MARRIAGE_其它', 'm5_maxmin_LOGIN_DURATION',
     'm0_KJ_CLICK_NUM', 'm2_maxmin_LOG_DAY', 'LCENTERTYPERANK_2.0', 'PROVINCE_NAME_江西省', 'm5_maxmin_LOG_DAY',
     'm3_KJ_CLICK_NUM', 'm2_LOGIN_NUM', 'LCENTERTYPENAME_直属', 'm1_KJ_CLICK_NUM', 'm4_LOGIN_NUM', 'm3_LOGIN_NUM',
     'STD_LM_CLICK_NUM', 'm1_maxmin_LOGIN_DURATION', 'm4_KJ_CLICK_NUM', 'LCENTERTYPERANK_1.0', 'PROVINCE_NAME_福建省',
     'PROVINCE_NAME_内蒙古自治区', 'm0_maxmin_LOG_DAY', 'm1_LOGIN_NUM', 'm3_maxmin_LOG_DAY', 'm2_maxmin_LOGIN_NUM',
     'm0_LOGIN_NUM', 'm5_maxmin_LOGIN_NUM', 'm5_LOGIN_DURATION', 'DAY_MEAN_LM_CLICK_NUM', 'm4_maxmin_LOG_DAY',
     'm1_maxmin_LOGIN_NUM', 'mean_LOG_DAY', 'm3_maxmin_LOGIN_NUM', 'm2_maxmin_LOGIN_DURATION',
     'm3_maxmin_LOGIN_DURATION', 'm2_LOGIN_DURATION', 'm0_maxmin_LOGIN_NUM', 'w_mLOGIN_NUM', 'mean_LOGIN_NUM',
     'w_stdKJ_CLICK_NUM', 'm4_maxmin_LOGIN_NUM', 'm4_maxmin_LOGIN_DURATION', 'w_mLOG_DAY', 'm0_maxmin_LOGIN_DURATION',
     'w_mKJ_CLICK_NUM', 'mean_KJ_CLICK_NUM', 'm3_LOGIN_DURATION', 'w_stdLOGIN_NUM', 'm1_LOGIN_DURATION',
     'w_mLOGIN_DURATION', 'STD_LOG_DAY', 'm4_LOGIN_DURATION', 'mean_LOGIN_DURATION', 'w_stdLOGIN_DURATION']
]

# lightgbm
clf = LGBMClassifier(num_leaves=40, learning_rate=0.05, max_depth=20, n_estimators=300, subsample=0.8,
                     colsample_bytree=1, min_child_weight=1)

# compute feature importances
clf.fit(X=data, y=data_y)
score = clf.feature_importances_
score = [(data.columns[i], score[i]) for i in range(len(score))]
score = sorted(score, key=lambda k: k[1], reverse=True)
for i in range(len(score)):
    print(i, score[i])
# start = time.time()
# # cross-validation
# score_name = 'roc_auc'
# score = model_selection.cross_val_score(estimator=clf, X=data, y=data_y,
#                                         cv=model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=8),
#                                         scoring=score_name, groups=data_y)
# print(score)
Example #23
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import lightgbm
from lightgbm import LGBMClassifier, Dataset, train as train_lgbm
import onnxruntime as rt
import skl2onnx
import onnxmltools
from onnxconverter_common.data_types import FloatTensorType
from onnxmltools.convert import convert_lightgbm

iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y)
clr = LGBMClassifier()
clr.fit(X_train, y_train)
print(clr)

###########################
# Convert a model into ONNX
# +++++++++++++++++++++++++

initial_type = [('float_input', FloatTensorType([None, 4]))]
onx = convert_lightgbm(clr, initial_types=initial_type)

###################################
# Compute the predictions with onnxruntime
# ++++++++++++++++++++++++++++++++++++++++

sess = rt.InferenceSession(onx.SerializeToString())
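
# A minimal sketch of the prediction step, assuming the standard
# onnxruntime API; the input name matches the 'float_input' declared above:
input_name = sess.get_inputs()[0].name
pred_onx = sess.run(None, {input_name: X_test.astype('float32')})
print('predicted labels:', pred_onx[0])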
Example #24
start = time.time()
for train_index, test_index in skf.split(data, data_y):
    # training set
    train_data = data.iloc[train_index]
    train_y = data_y[train_index]

    # test set
    test_data = data.iloc[test_index]
    test_y = data_y[test_index]
    print(len(train_data), len(test_data))

    # train the model
    model = LGBMClassifier(num_leaves=6,
                           learning_rate=0.05,
                           max_depth=6,
                           n_estimators=200,
                           subsample=1,
                           colsample_bytree=1,
                           min_child_weight=1)
    model.fit(X=train_data, y=train_y)

    # predict
    pred = model.predict(test_data)
    # tmp_score = metrics.roc_auc_score(y_true=test_y, y_score=pred)
    tmp_score = metrics.f1_score(y_true=test_y, y_pred=pred, average='macro')
    print(tmp_score)
    score.append(tmp_score)

print(score)
print('f1:', sum(score) / len(score), 'time:', time.time() - start)
Example #25
images = images.reshape(images.shape[0], SHAPE_SIZE_X * SHAPE_SIZE_Y)
labels = labels.astype('int')
images_validation = images_validation.reshape(images_validation.shape[0],
                                              SHAPE_SIZE_X * SHAPE_SIZE_Y)
labels_validation = labels_validation.astype('int')

images, labels = reduced_dataset(images, labels)

# # Train the light GBM
# model = LGBMClassifier()
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# n_scores = cross_val_score(model, images, labels, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# fit the model on the whole dataset
model = LGBMClassifier(objective="binary", class_weight="balanced")

start_time = time.time()

model = model.fit(images, labels)
print("Train Light GBM --- %s seconds ---" % (time.time() - start_time))

start_time = time.time()
basic_score = model.score(images_validation, labels_validation)
print("Validation Light GBM --- %s seconds ---" % (time.time() - start_time))

print("Light GBM scikit learn basic score: %0.4f" % basic_score)

# Validating the model and evaluation
start_time = time.time()
scores = cross_validate(model,
Example #26
config['cat_columns'] = cat_feat

if os.path.exists('../trans_data/train_1000_10.pkl'):
    train = pickle.load(open('../trans_data/train_1000_10.pkl', 'rb'))
    test = pickle.load(open('../trans_data/test_1000_10.pkl', 'rb'))
else:
    d={'add':'+', 'sub':'-', 'mul':'*', 'div':'/'}
    feat0 = feat.copy()
    for i in trange(len(feat)):
        df_temp=train[feat0].copy()
        for j in range(i+1,len(feat)):
            df_temp['%s|%s|add'%(feat[i],feat[j])] = train[feat[i]]+train[feat[j]]
            df_temp['%s|%s|sub'%(feat[i],feat[j])] = train[feat[i]]-train[feat[j]]
            df_temp['%s|%s|mul'%(feat[i],feat[j])] = train[feat[i]]*train[feat[j]]
            df_temp['%s|%s|div'%(feat[i],feat[j])] = train[feat[i]]/train[feat[j]]
        model = LGBMClassifier(n_estimators=1000, learning_rate=0.08, max_depth=7, subsample=0.8, colsample_bytree=0.6, n_jobs=4)
        model.fit(df_temp.values, train_y)
        qq = pd.Series(model.feature_importances_, index=df_temp.columns).sort_values()
        for col in set(qq.loc[qq>10].index)-set(feat0):
            f0, f1, f2 = col.split('|')
            train[col] = df_temp[col]
            test[col] = eval("test['%s']%stest['%s']"%(f0,d[f2],f1))
        feat0.extend(list(set(qq.loc[qq>10].index)-set(feat0)))
    pickle.dump(train, open('../trans_data/train_1000_10.pkl','wb'))
    pickle.dump(test, open('../trans_data/test_1000_10.pkl','wb'))

def gen_feat(data):
    for col in cat_feat:
        data[col] = data[col].fillna('empty').astype(str)
    for col in data.columns:
        if '年' not in col and '|' not in col and data[col].isna().sum()>0:
Example #27
for file in test_list:
    df_action = pd.read_csv(file)
    df_test.append(df_action)
df_test = pd.concat(df_test, axis=0, ignore_index=True)

print(df_train['sign'].value_counts())
print(df_test['sign'].value_counts())
time.sleep(2)

# Prepare data
y_train = df_train.pop('sign')
x_train = df_train.values
y_test = df_test.pop('sign')
x_test = df_test.values

# Model
# print('Training random forest')
# model = RandomForestClassifier(n_estimators=9)
# model.fit(x_train, y_train)

model = LGBMClassifier(n_estimators=50)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of LightGBM: ', accuracy)

# Save model
with open('../model/md.pickle', 'wb') as f:
    pickle.dump(model, f)
Example #28
lgb_params['min_child_samples'] = 500
lgb_params['seed'] = 99

lgb_params2 = {}
lgb_params2['n_estimators'] = 1090
lgb_params2['learning_rate'] = 0.02
lgb_params2['colsample_bytree'] = 0.3
lgb_params2['subsample'] = 0.7
lgb_params2['subsample_freq'] = 2
lgb_params2['num_leaves'] = 16
lgb_params2['seed'] = 99

log_params = {}
log_params['class_weight'] = {0: 1, 1: 4.5}
log_params['random_state'] = 99
lgb_model = LGBMClassifier(**lgb_params)

lgb_model2 = LGBMClassifier(**lgb_params2)

log_model = LogisticRegression(**log_params)

stack = Ensemble(n_splits=3,
                 stacker=log_model,
                 base_models=(lgb_model, lgb_model2))
train = train.drop([
    'PRXYDATA_98', 'PRXYDATA_97', 'PRXYDATA_2', 'PRXRETRY_98', 'PRXRETRY_97',
    'PRVHLTIN_98', 'PRVHLTIN_97', 'PRVHLTIN_85', 'MEDICARE_98', 'MEDICARE_97',
    'MEDICARE_85', 'IRWELMOS_7', 'IRWELMOS_10', 'IRPRVHLT_2', 'IRPINC3_7',
    'IRPINC3_6', 'IROTHHLT_99', 'IROTHHLT_1', 'IIOTHHLT_3', 'IIOTHHLT_1',
    'IIMEDICR_3', 'IIHHSIZ2_3', 'IIHHSIZ2_1', 'HLTINNOS_99', 'HLTINNOS_98',
    'HLTINNOS_97', 'HLTINNOS_94', 'HLTINNOS_2', 'HLTINNOS_1', 'HLNVSOR_98',
Example #29
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.under_sampling import InstanceHardnessThreshold

from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

from collections import Counter
r = 1001

models = [
    DecisionTreeClassifier(random_state=r),
    BaggingClassifier(random_state=r),
    RandomForestClassifier(random_state=r),
    GradientBoostingClassifier(random_state=r),
    LGBMClassifier(),
    XGBClassifier(random_state=r),
    CatBoostClassifier(random_state=r, verbose=False),
]
names = [
    "Decision Tree",
    "Ensemble-Bagging",
    "Ensemble-Random Forest",
    "Ensemble-Gradient Boosting",
    "Light Gradient Boosting",
    "XG Boost",
    "Cat Boost",
]

samplers = [
    # imbalanced-learn Over
Example #30
#cat_params['l2_leaf_reg'] = 3.5
#cat_params['border_count'] = 8
#cat_params['gradient_iterations'] = 4

# Regularized Greedy Forest params
#rgf_params = {}
#rgf_params['max_leaf'] = 2000
#rgf_params['learning_rate'] = 0.5
#rgf_params['algorithm'] = "RGF_Sib"
#rgf_params['test_interval'] = 100
#rgf_params['min_samples_leaf'] = 3
#rgf_params['reg_depth'] = 1.0
#rgf_params['l2'] = 0.5
#rgf_params['sl2'] = 0.005

lgb_model_1 = LGBMClassifier(**lgb_params_1)

lgb_model_2 = LGBMClassifier(**lgb_params_2)

lgb_model_3 = LGBMClassifier(**lgb_params_3)

#rf_model = RandomForestClassifier(**rf_params)

#et_model = ExtraTreesClassifier(**et_params)

#xgb_model = XGBClassifier(**xgb_params)

#cat_model = CatBoostClassifier(**cat_params)

#rgf_model = RGFClassifier(**rgf_params)
Example #31
def compute_part_of_the_stacking(X_train_nn, X_train_sk, X_train_basic,
                                 X_train_GoQ, y_train, X_test_nn, X_test_sk,
                                 X_test_basic, X_test_GoQ, max_size,
                                 embedding_matrix, glove_embedding,
                                 word_vector_dim, drop_rate, my_optimizer,
                                 batch_size, nb_epoch, my_patience=0):
    '''
    Compute the stacking features from the given input matrices.
    The remaining parameters configure the neural-network sub-models.
    '''

    X_train_basic_GoQ = np.concatenate((X_train_basic, X_train_GoQ), axis=1)
    X_test_basic_GoQ = np.concatenate((X_test_basic, X_test_GoQ), axis=1)

    mcp1 = ModelCheckpoint('weights.stack.hdf5',
                           monitor="val_acc",
                           save_best_only=True,
                           save_weights_only=False)

    model = create_nn_model(max_size, embedding_matrix, word_vector_dim,
                            drop_rate, my_optimizer)
    model.fit(
        X_train_nn,
        y_train,
        batch_size=batch_size,
        epochs=nb_epoch,
        validation_split=0.1,
        callbacks=[mcp1],
    )

    keras.backend.clear_session()

    mcp2 = ModelCheckpoint('weights.stack_gv.hdf5',
                           monitor="val_acc",
                           save_best_only=True,
                           save_weights_only=False)

    print('Computing neural network for GloVe features')
    model_gv = create_nn_model(max_size, glove_embedding, 200, drop_rate,
                               my_optimizer)
    model_gv.fit(
        X_train_nn,
        y_train,
        batch_size=batch_size,
        epochs=nb_epoch,
        validation_split=0.1,
        callbacks=[mcp2],
    )
    keras.backend.clear_session()

    X_train_nn_GoQ = X_train_nn + [X_train_GoQ]
    X_test_nn_GoQ = X_test_nn + [X_test_GoQ]

    mcp3 = ModelCheckpoint('weights.stack_1.hdf5',
                           monitor="val_acc",
                           save_best_only=True,
                           save_weights_only=False)

    model_v2 = create_nn_model_v2(max_size, embedding_matrix, word_vector_dim,
                                  drop_rate, my_optimizer,
                                  X_train_GoQ.shape[1])
    model_v2.fit(
        X_train_nn_GoQ,
        y_train,
        batch_size=batch_size,
        epochs=nb_epoch,
        validation_split=0.1,
        callbacks=[mcp3],
    )

    keras.backend.clear_session()

    mcp4 = ModelCheckpoint('weights.stack_gv_1.hdf5',
                           monitor="val_acc",
                           save_best_only=True,
                           save_weights_only=False)

    model_gv_v2 = create_nn_model_v2(max_size, glove_embedding, 200, drop_rate,
                                     my_optimizer, X_train_GoQ.shape[1])
    model_gv_v2.fit(
        X_train_nn_GoQ,
        y_train,
        batch_size=batch_size,
        epochs=nb_epoch,
        validation_split=0.1,
        callbacks=[mcp4],
    )

    keras.backend.clear_session()

    X_train_nn_basic = X_train_nn + [X_train_basic]
    X_test_nn_basic = X_test_nn + [X_test_basic]

    mcp5 = ModelCheckpoint('weights.stack_2.hdf5',
                           monitor="val_acc",
                           save_best_only=True,
                           save_weights_only=False)

    model_v3 = create_nn_model_v2(max_size, embedding_matrix, word_vector_dim,
                                  drop_rate, my_optimizer,
                                  X_train_basic.shape[1])
    model_v3.fit(
        X_train_nn_basic,
        y_train,
        batch_size=batch_size,
        epochs=nb_epoch,
        validation_split=0.1,
        callbacks=[mcp5],
    )

    keras.backend.clear_session()

    mcp6 = ModelCheckpoint('weights.stack_gv_2.hdf5',
                           monitor="val_acc",
                           save_best_only=True,
                           save_weights_only=False)

    model_gv_v3 = create_nn_model_v2(max_size, glove_embedding, 200, drop_rate,
                                     my_optimizer, X_train_basic.shape[1])
    model_gv_v3.fit(
        X_train_nn_basic,
        y_train,
        batch_size=batch_size,
        epochs=nb_epoch,
        validation_split=0.1,
        callbacks=[mcp6],
    )

    keras.backend.clear_session()

    X_train_nn_basic_GoQ = X_train_nn + [X_train_basic_GoQ]
    X_test_nn_basic_GoQ = X_test_nn + [X_test_basic_GoQ]

    mcp7 = ModelCheckpoint('weights.stack_3.hdf5',
                           monitor="val_acc",
                           save_best_only=True,
                           save_weights_only=False)

    model_v4 = create_nn_model_v2(max_size, embedding_matrix, word_vector_dim,
                                  drop_rate, my_optimizer,
                                  X_train_basic_GoQ.shape[1])
    model_v4.fit(
        X_train_nn_basic_GoQ,
        y_train,
        batch_size=batch_size,
        epochs=nb_epoch,
        validation_split=0.1,
        callbacks=[mcp7],
    )

    keras.backend.clear_session()

    mcp8 = ModelCheckpoint('weights.stack_gv_3.hdf5',
                           monitor="val_acc",
                           save_best_only=True,
                           save_weights_only=False)

    model_gv_v4 = create_nn_model_v2(max_size, glove_embedding, 200, drop_rate,
                                     my_optimizer, X_train_basic_GoQ.shape[1])
    model_gv_v4.fit(
        X_train_nn_basic_GoQ,
        y_train,
        batch_size=batch_size,
        epochs=nb_epoch,
        validation_split=0.1,
        callbacks=[mcp8],
    )

    keras.backend.clear_session()

    print('Computing scikit-learn models')

    rf_model = RandomForestClassifier(n_estimators=50)
    elastic_net_model = ElasticNet()
    log_reg_model = LogisticRegression()
    lin_reg_model = LinearRegression()
    km_model = KNeighborsClassifier(n_neighbors=50)
    svm_model = SVC(probability=True)
    xgb_model = XGBClassifier(max_depth=6,
                              n_estimators=100,
                              reg_lambda=1,
                              seed=5)
    lightgbm_model = LGBMClassifier(max_depth=6,
                                    n_estimators=100,
                                    reg_lambda=1,
                                    seed=5)

    rf_model_GoQ = RandomForestClassifier(n_estimators=50)
    elastic_net_model_GoQ = ElasticNet()
    log_reg_model_GoQ = LogisticRegression()
    lin_reg_model_GoQ = LinearRegression()
    km_model_GoQ = KNeighborsClassifier(n_neighbors=50)
    svm_model_GoQ = SVC(probability=True)
    xgb_model_GoQ = XGBClassifier(max_depth=6,
                                  n_estimators=100,
                                  reg_lambda=1,
                                  seed=5)
    lightgbm_model_GoQ = LGBMClassifier(max_depth=6,
                                        n_estimators=100,
                                        reg_lambda=1,
                                        seed=5)

    rf_model_basic = RandomForestClassifier(n_estimators=50)
    elastic_net_model_basic = ElasticNet()
    log_reg_model_basic = LogisticRegression()
    lin_reg_model_basic = LinearRegression()
    km_model_basic = KNeighborsClassifier(n_neighbors=50)
    svm_model_basic = SVC(probability=True)
    xgb_model_basic = XGBClassifier(max_depth=6,
                                    n_estimators=100,
                                    reg_lambda=1,
                                    seed=5)
    lightgbm_model_basic = LGBMClassifier(max_depth=6,
                                          n_estimators=100,
                                          reg_lambda=1,
                                          seed=5)

    rf_model_basic_GoQ = RandomForestClassifier(n_estimators=200)
    elastic_net_model_basic_GoQ = ElasticNet()
    log_reg_model_basic_GoQ = LogisticRegression()
    lin_reg_model_basic_GoQ = LinearRegression()
    km_model_basic_GoQ = KNeighborsClassifier(n_neighbors=50)
    svm_model_basic_GoQ = SVC(probability=True)
    xgb_model_basic_GoQ = XGBClassifier(max_depth=6,
                                        n_estimators=150,
                                        reg_lambda=1,
                                        seed=5)
    lightgbm_model_basic_GoQ = LGBMClassifier(max_depth=6,
                                              n_estimators=150,
                                              reg_lambda=1,
                                              seed=5)

    print('XGB Model')
    xgb_model.fit(X_train_sk, y_train)
    xgb_model_basic.fit(X_train_basic, y_train)
    xgb_model_GoQ.fit(X_train_GoQ, y_train)
    xgb_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)

    print('Light GBM')
    lightgbm_model.fit(X_train_sk, y_train)
    lightgbm_model_basic.fit(X_train_basic, y_train)
    lightgbm_model_GoQ.fit(X_train_GoQ, y_train)
    lightgbm_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)

    print('Train RF Model')
    rf_model.fit(X_train_sk, y_train)
    rf_model_basic.fit(X_train_basic, y_train)
    rf_model_GoQ.fit(X_train_GoQ, y_train)
    rf_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)

    print('ElasticNet Model')
    elastic_net_model.fit(X_train_sk, y_train)
    elastic_net_model_basic.fit(X_train_basic, y_train)
    elastic_net_model_GoQ.fit(X_train_GoQ, y_train)
    elastic_net_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)

    print('Logistic Reg Model')
    log_reg_model.fit(X_train_sk, y_train)
    log_reg_model_basic.fit(X_train_basic, y_train)
    log_reg_model_GoQ.fit(X_train_GoQ, y_train)
    log_reg_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)

    print('Linear Reg Model')
    lin_reg_model.fit(X_train_sk, y_train)
    lin_reg_model_basic.fit(X_train_basic, y_train)
    lin_reg_model_GoQ.fit(X_train_GoQ, y_train)
    lin_reg_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)

    print('KNN Model')
    km_model.fit(X_train_sk, y_train)
    km_model_basic.fit(X_train_basic, y_train)
    km_model_GoQ.fit(X_train_GoQ, y_train)
    km_model_basic_GoQ.fit(X_train_basic_GoQ, y_train)
    '''#Too long to compute
    print('SVM Model')
    svm_model.fit(X_train_sk,y_train)
    svm_model_basic.fit(X_train_basic,y_train)
    svm_model_GoQ.fit(X_train_GoQ,y_train)
    svm_model_basic_GoQ.fit(X_train_basic_GoQ,y_train)
    '''
    print('Predict Output Test')

    model_gv_v4 = keras.models.load_model('weights.stack_gv_3.hdf5')

    model_v4 = keras.models.load_model('weights.stack_3.hdf5')

    model_gv_v3 = keras.models.load_model('weights.stack_gv_2.hdf5')

    model_v3 = keras.models.load_model('weights.stack_2.hdf5')

    model_gv_v2 = keras.models.load_model('weights.stack_gv_1.hdf5')

    model_v2 = keras.models.load_model('weights.stack_1.hdf5')

    model_gv = keras.models.load_model('weights.stack_gv.hdf5')

    model = keras.models.load_model('weights.stack.hdf5')

    outcome_nn_test = model.predict(X_test_nn)
    outcome_nn_test_gv = model_gv.predict(X_test_nn)

    outcome_nn_test_v2 = model_v2.predict(X_test_nn_GoQ)
    outcome_nn_test_gv_v2 = model_gv_v2.predict(X_test_nn_GoQ)

    outcome_nn_test_v3 = model_v3.predict(X_test_nn_basic)
    outcome_nn_test_gv_v3 = model_gv_v3.predict(X_test_nn_basic)

    outcome_nn_test_v4 = model_v4.predict(X_test_nn_basic_GoQ)
    outcome_nn_test_gv_v4 = model_gv_v4.predict(X_test_nn_basic_GoQ)

    keras.backend.clear_session()

    outcome_rf_test = rf_model.predict_proba(X_test_sk)[:, 1].reshape((-1, 1))
    outcome_enet_test = elastic_net_model.predict(X_test_sk).reshape((-1, 1))
    outcome_log_reg_model_test = log_reg_model.predict_proba(
        X_test_sk)[:, 1].reshape((-1, 1))
    outcome_lin_model_test = lin_reg_model.predict(X_test_sk).reshape((-1, 1))
    outcome_knn_test = km_model.predict_proba(X_test_sk)[:, 1].reshape(
        (-1, 1))
    #outcome_svm_test = svm_model.predict(X_test_sk).reshape((-1,1))
    outcome_xgb_test = xgb_model.predict_proba(X_test_sk)[:, 1].reshape(
        (-1, 1))
    outcome_lgb_test = lightgbm_model.predict_proba(X_test_sk)[:, 1].reshape(
        (-1, 1))

    outcome_rf_test_basic = rf_model_basic.predict_proba(
        X_test_basic)[:, 1].reshape((-1, 1))
    outcome_enet_test_basic = elastic_net_model_basic.predict(
        X_test_basic).reshape((-1, 1))
    outcome_log_reg_model_test_basic = log_reg_model_basic.predict_proba(
        X_test_basic)[:, 1].reshape((-1, 1))
    outcome_lin_model_test_basic = lin_reg_model_basic.predict(
        X_test_basic).reshape((-1, 1))
    outcome_knn_test_basic = km_model_basic.predict_proba(
        X_test_basic)[:, 1].reshape((-1, 1))
    #outcome_svm_test_basic = svm_model_basic.predict(X_test_basic).reshape((-1,1))
    outcome_xgb_test_basic = xgb_model_basic.predict_proba(
        X_test_basic)[:, 1].reshape((-1, 1))
    outcome_lgb_test_basic = lightgbm_model_basic.predict_proba(
        X_test_basic)[:, 1].reshape((-1, 1))

    outcome_rf_test_GoQ = rf_model_GoQ.predict_proba(
        X_test_GoQ)[:, 1].reshape((-1, 1))
    outcome_enet_test_GoQ = elastic_net_model_GoQ.predict(
        X_test_GoQ).reshape((-1, 1))
    outcome_log_reg_model_test_GoQ = log_reg_model_GoQ.predict_proba(
        X_test_GoQ)[:, 1].reshape((-1, 1))
    outcome_lin_model_test_GoQ = lin_reg_model_GoQ.predict(
        X_test_GoQ).reshape((-1, 1))
    outcome_knn_test_GoQ = km_model_GoQ.predict_proba(
        X_test_GoQ)[:, 1].reshape((-1, 1))
    #outcome_svm_test_GoQ = svm_model_GoQ.predict(X_test_GoQ).reshape((-1,1))
    outcome_xgb_test_GoQ = xgb_model_GoQ.predict_proba(
        X_test_GoQ)[:, 1].reshape((-1, 1))
    outcome_lgb_test_GoQ = lightgbm_model_GoQ.predict_proba(
        X_test_GoQ)[:, 1].reshape((-1, 1))

    outcome_rf_test_basic_GoQ = rf_model_basic_GoQ.predict_proba(
        X_test_basic_GoQ)[:, 1].reshape((-1, 1))
    outcome_enet_test_basic_GoQ = elastic_net_model_basic_GoQ.predict(
        X_test_basic_GoQ).reshape((-1, 1))
    outcome_log_reg_model_test_basic_GoQ = log_reg_model_basic_GoQ.predict_proba(
        X_test_basic_GoQ)[:, 1].reshape((-1, 1))
    outcome_lin_model_test_basic_GoQ = lin_reg_model_basic_GoQ.predict(
        X_test_basic_GoQ).reshape((-1, 1))
    outcome_knn_test_basic_GoQ = km_model_basic_GoQ.predict_proba(
        X_test_basic_GoQ)[:, 1].reshape((-1, 1))
    #outcome_svm_test_basic_GoQ = svm_model_basic_GoQ.predict(X_test_basic_GoQ).reshape((-1,1))
    outcome_xgb_test_basic_GoQ = xgb_model_basic_GoQ.predict_proba(
        X_test_basic_GoQ)[:, 1].reshape((-1, 1))
    outcome_lgb_test_basic_GoQ = lightgbm_model_basic_GoQ.predict_proba(
        X_test_basic_GoQ)[:, 1].reshape((-1, 1))
    '''Earlier variant that also included the (slow) SVM outputs:
    X_test = np.concatenate([
        outcome_nn_test_gv, outcome_nn_test,
        outcome_svm_test, outcome_rf_test, outcome_enet_test,
        outcome_log_reg_model_test, outcome_lin_model_test, outcome_knn_test,
        outcome_rf_test_basic, outcome_enet_test_basic,
        outcome_log_reg_model_test_basic, outcome_lin_model_test_basic,
        outcome_knn_test_basic, outcome_svm_test_basic,
        outcome_rf_test_GoQ, outcome_enet_test_GoQ,
        outcome_log_reg_model_test_GoQ, outcome_lin_model_test_GoQ,
        outcome_knn_test_GoQ, outcome_svm_test_GoQ,
        outcome_rf_test_basic_GoQ, outcome_enet_test_basic_GoQ,
        outcome_log_reg_model_test_basic_GoQ,
        outcome_lin_model_test_basic_GoQ,
        outcome_knn_test_basic_GoQ, outcome_svm_test_basic_GoQ], axis=1)'''

    X_test = np.concatenate([
        outcome_nn_test_gv, outcome_nn_test,
        outcome_nn_test_v2, outcome_nn_test_gv_v2,
        outcome_nn_test_v3, outcome_nn_test_gv_v3,
        outcome_nn_test_v4, outcome_nn_test_gv_v4,
        outcome_rf_test, outcome_enet_test, outcome_log_reg_model_test,
        outcome_lin_model_test, outcome_knn_test,
        outcome_rf_test_basic, outcome_enet_test_basic,
        outcome_log_reg_model_test_basic, outcome_lin_model_test_basic,
        outcome_knn_test_basic,
        outcome_rf_test_GoQ, outcome_enet_test_GoQ,
        outcome_log_reg_model_test_GoQ, outcome_lin_model_test_GoQ,
        outcome_knn_test_GoQ,
        outcome_rf_test_basic_GoQ, outcome_enet_test_basic_GoQ,
        outcome_log_reg_model_test_basic_GoQ,
        outcome_lin_model_test_basic_GoQ,
        outcome_knn_test_basic_GoQ,
        outcome_xgb_test, outcome_lgb_test,
        outcome_xgb_test_basic, outcome_lgb_test_basic,
        outcome_xgb_test_GoQ, outcome_lgb_test_GoQ,
        outcome_xgb_test_basic_GoQ, outcome_lgb_test_basic_GoQ], axis=1)

    return X_test
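
# Downstream, the returned matrix serves as level-1 (meta) features; a hedged
# usage sketch (the meta-learner and a matching train-side stack are
# assumptions, not shown in the original):
#   from sklearn.linear_model import LogisticRegression
#   meta = LogisticRegression()
#   meta.fit(X_train_stack, y_train)   # X_train_stack built analogously
#   final_pred = meta.predict_proba(X_test_stack)[:, 1]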
Ejemplo n.º 32
0
    # Save the classification model
    ml_model_name = model_folder + label + "_clf.model"
    pickle.dump(clf, open(ml_model_name, 'wb'))
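    # Restoring the saved classifier later is the mirror operation (a sketch,
    # not part of the original snippet):
    #   with open(ml_model_name, 'rb') as f:
    #       clf = pickle.load(f)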


labels = [
    'CVSS2_Conf', 'CVSS2_Integrity', 'CVSS2_Avail', 'CVSS2_AccessVect',
    'CVSS2_AccessComp', 'CVSS2_Auth', 'CVSS2_Severity'
]

clfs = {
    'CVSS2_Conf': {
        'LGBM':
        LGBMClassifier(num_leaves=100,
                       max_depth=-1,
                       objective='multiclass',
                       n_jobs=-1,
                       random_state=42)
    },
    'CVSS2_Integrity': {
        'XGB':
        XGBClassifier(objective='multi:softprob',
                      max_depth=0,
                      max_leaves=100,
                      grow_policy='lossguide',
                      n_jobs=-1,
                      random_state=42,
                      tree_method='hist')
    },
    'CVSS2_Avail': {
        'LGBM':
    predicted_lr = model_lr.predict(X_test)
    print("LogisticRegression", metrics.accuracy_score(Y_test, predicted_lr), "\n")
    #aa = model_lr.coef_

if cond01 == 3:
    from sklearn.naive_bayes import GaussianNB  # Gaussian Naive Bayes
    model_nb = GaussianNB()
    model_nb.fit(X_train, Y_train)
    predicted_nb = model_nb.predict(X_test)
    print("Gaussian Naive Bayes", metrics.accuracy_score(Y_test, predicted_nb), "\n")

if cond01 == 4:
    from sklearn.ensemble import GradientBoostingClassifier  # GradientBoosting
    model_gb = GradientBoostingClassifier()
    model_gb.fit(X_train, Y_train)
    predicted_gb = model_gb.predict(X_test)
    print("GradientBoosting", metrics.accuracy_score(Y_test, predicted_gb), "\n")

if cond01 == 5:
    from lightgbm import LGBMClassifier  # LightGBM
    model_lgbm = LGBMClassifier()
    model_lgbm.fit(X_train, Y_train)
    predicted_lgbm = model_lgbm.predict(X_test)
    print("LightGBM", metrics.accuracy_score(Y_test, predicted_lgbm), "\n")
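
# The cond01 branches above could be driven by a single mapping instead; a
# refactoring sketch (not from the original):
#   MODELS = {3: GaussianNB, 4: GradientBoostingClassifier, 5: LGBMClassifier}
#   model = MODELS[cond01]()
#   model.fit(X_train, Y_train)
#   print(metrics.accuracy_score(Y_test, model.predict(X_test)))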



#
## http://myenigma.hatenablog.com/entry/2015/10/09/223629
#import seaborn as sns
#import matplotlib.pyplot as plt
#iris = sns.load_dataset("iris")  # sample dataset
##sns.pairplot(iris);
#sns.pairplot(iris, hue="species")
#plt.savefig("iris.png")  # seaborn no longer re-exports plt; use matplotlib
#plt.show()
#

Ejemplo n.º 34
0
             "./model2/sample_mydata_model_xgboost{}.pickle.dat".format(
                 cnt), "wb"))
 else:
     print("LGBMClassifier")
     model = LGBMClassifier(
         boosting_type='gbdt',
         task='train',
         num_leaves=2**depth - 1,
         num_iterations=steps,
         learning_rate=0.01,
         n_estimators=2000,
         max_bin=425,
         subsample_for_bin=50000,
         objective='binary',
         min_split_gain=0,
         min_child_weight=5,
         min_child_samples=10,
         feature_fraction=0.9,
         feature_fraction_bynode=0.8,
         drop_rate=0.05,
         subsample=0.8,
         subsample_freq=1,
         colsample_bytree=1,
         reg_alpha=3,
         reg_lambda=5,
         seed=1000,
         # n_jobs=4,
         silent=True)
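     # Note: num_iterations/n_estimators and feature_fraction/colsample_bytree
     # are LightGBM alias pairs; passing both (as above) makes LightGBM warn
     # and keep only one value per pair.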
     # It is recommended to train and predict with cross-validation.
     model.fit(
         train_x,
         train_y,
Ejemplo n.º 35
0
	else:
		pipeline.fit(audit_X, audit_y)
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_X.sample(n = 3, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_X.sample(n = 3, random_state = 13))
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
	store_csv(pandas.concat((adjusted, adjusted_proba), axis = 1), name)

if "Audit" in datasets:
	build_audit(DecisionTreeClassifier(min_samples_leaf = 7, random_state = 13), "DecisionTreeAudit", compact = False, flat = True)
	build_audit(GradientBoostingClassifier(n_estimators = 71, random_state = 13), "GradientBoostingAudit")
	build_audit(LGBMClassifier(objective = "binary", n_estimators = 71, random_state = 13), "LightGBMAudit")
	build_audit(LogisticRegression(multi_class = "ovr", solver = "liblinear", random_state = 13), "LogisticRegressionAudit")
	build_audit(RandomForestClassifier(n_estimators = 17, random_state = 13), "RandomForestAudit", compact = False, flat = False)
	build_audit(XGBClassifier(objective = "binary:logistic", ntree_limit = 71, random_state = 13), "XGBoostAudit")

sparsify("Audit")

audit_X, audit_y = load_audit("AuditNA")

if ("Audit" in datasets) or ("AuditNA" in datasets):
	build_audit(LGBMClassifier(objective = "binary", n_estimators = 71, random_state = 13), "LightGBMAuditNA")
	build_audit(XGBClassifier(objective = "binary:logistic", ntree_limit = 71, random_state = 13), "XGBoostAuditNA")

def load_sentiment(name):
	df = load_csv(name)
	return (df["Sentence"], df["Score"])
Ejemplo n.º 36
0
def without_cv_transfer_a_to_b_modeling():
    """

    :return:
    """

    '''Data input'''
    data_a_train = pd.read_csv('../data/A_train_final.csv', index_col='no')
    data_b_train = pd.read_csv('../data/B_train_final.csv', index_col='no')
    y_of_b_train = data_b_train['flag']
    data_b_test = pd.read_csv('../data/B_test_final.csv', index_col='no')

    '''A_train feature engineering'''
    data_a_train_without_label = data_a_train.drop('flag', axis=1)

    data_a_train_without_label['UserInfo_222x82'] = data_a_train_without_label['UserInfo_82'] * data_a_train_without_label['UserInfo_222']

    '''Missing-value imputation'''
    data_a_train_filled = data_a_train_without_label.fillna(value=10)

    '''Feature names'''
    feature_name = list(data_a_train_without_label.columns.values)
    data_b_test_user_id = list(data_b_test.index.values)

    '''Build the training and test sets'''
    x_temp = data_a_train_filled.iloc[:, :].values  # predictors
    y = data_a_train.iloc[:, -1].values  # target

    '''Feature selection (note: if features are added, feature_name must be updated accordingly)'''
    X, dropped_feature_name, len_feature_choose = lgb_feature_selection(feature_name, x_temp, y, "0.1*mean")
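    # SelectFromModel (inside lgb_feature_selection) accepts string thresholds
    # such as "0.1*mean": features whose importance falls below 0.1 x the mean
    # importance are dropped.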

    '''B_train feature engineering'''
    data_b_train_without_label = data_b_train.drop('flag', axis=1)

    data_b_train_without_label['UserInfo_222x82'] = data_b_train_without_label['UserInfo_82'] * data_b_train_without_label['UserInfo_222']
    data_b_train_filled = data_b_train_without_label.fillna(value=10)

    '''B_test feature engineering'''
    data_b_test['UserInfo_222x82'] = data_b_test['UserInfo_82'] * data_b_test['UserInfo_222']
    data_b_test_filled = data_b_test.fillna(value=10)

    '''Feature filtering'''
    data_b_train_filled_after_feature_selection = data_test_feature_drop(data_b_train_filled, dropped_feature_name)
    data_b_test_filled_after_feature_selection = data_test_feature_drop(data_b_test_filled, dropped_feature_name)

    '''Fit on A_train, then predict B_train'''

    print 'start time'
    print time.clock()*1.0/60

    parameter_n_estimators = 400
    classifier = LGBMClassifier(n_estimators=parameter_n_estimators)

    a_model = classifier.fit(X, y)

    prob_of_b_train = a_model.predict_proba(data_b_train_filled_after_feature_selection)

    print 'training end time'
    print time.clock()*1.0/60

    '''Compute the ROC curve and AUC'''
    fpr, tpr, thresholds = roc_curve(y_of_b_train, prob_of_b_train[:, 1])

    roc_auc = auc(fpr, tpr)

    print '\nauc='+str(roc_auc)

    '''Predict B_test'''

    prob_of_b_test = a_model.predict_proba(data_b_test_filled_after_feature_selection)

    result_file_name = '../result/B_test_predict_using_A_LGBLGB_without_cv_fillna_10' + '_N_' + str(parameter_n_estimators) + '_features_' + \
                       str(len_feature_choose) + '_offline_'+str(roc_auc)+'.csv'

    write_predict_results_to_csv(result_file_name, data_b_test_user_id, prob_of_b_test[:, 1].tolist())
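
# `write_predict_results_to_csv` is not shown in this excerpt; a minimal
# Python 2 sketch consistent with the call above (signature and column names
# are assumptions):
import csv

def write_predict_results_to_csv(file_name, user_ids, probs):
    """Write (id, probability) rows to a CSV file."""
    with open(file_name, 'wb') as f:  # Python 2 csv wants binary mode
        writer = csv.writer(f)
        writer.writerow(['no', 'pred'])
        for uid, p in zip(user_ids, probs):
            writer.writerow([uid, p])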