Example #1
def tune_params():
    # Grid search over max_depth / subsample / colsample_bytree / reg_alpha for a
    # LightGBM baseline; X_t, y_t, X_v, y_v are the train/validation splits
    # defined elsewhere in the script.
    f1_t_total, f1_v_total = [], []
    for max_depth in range(6, 15):
        for subsample in [0.6, 0.7, 0.8]:
            for colsample_bytree in [0.6, 0.7, 0.8]:
                for reg_alpha in [0.1, 1, 10]:
                    lgb_base = LGBMClassifier(n_estimators=150, objective='binary',
                                              random_state=1234, n_jobs=3,
                                              colsample_bytree=colsample_bytree,
                                              reg_alpha=reg_alpha,
                                              max_depth=max_depth, subsample=subsample)
                    _params = {'max_depth': max_depth,
                               'subsample': subsample,
                               'colsample_bytree': colsample_bytree,
                               'reg_alpha': reg_alpha}
                    lgb_base.fit(X_t, y_t)
                    y_t_pre = lgb_base.predict(X_t)
                    y_v_pre = lgb_base.predict(X_v)
                    f1_t_each = f1_score(y_t, y_t_pre, average='micro')
                    f1_v_each = f1_score(y_v, y_v_pre, average='micro')
                    f1_t_total.append(f1_t_each)
                    f1_v_total.append(f1_v_each)
                    print(_params)
                    # Append this parameter combination to disk.
                    with open('D:\\workspace python\\contest\\accu_save\\' + 'lgbbase_saveparams_f1_0418.txt',
                              'a', encoding='utf-8') as myfile1:
                        print(_params['max_depth'], _params['subsample'],
                              _params['colsample_bytree'], _params['reg_alpha'], file=myfile1)
                    print(f1_t_each, f1_v_each)
                    # Append the matching train/validation micro-F1 scores.
                    with open('D:\\workspace python\\contest\\accu_save\\' + 'lgbbase_tunparms_f1_0418.txt',
                              'a', encoding='utf-8') as myfile:
                        print(f1_t_each, ',', f1_v_each, file=myfile)
    return f1_t_total, f1_v_total
    def baseline_xiong(self, profile: Profile, shared: Storage, logger: Logger,
                       converted):
        # Hand-crafted window statistics; converted[0] holds the labels and
        # converted[1..3] the per-axis sensor channels (the a_/g_/m_ prefixes
        # suggest accelerometer, gyroscope and magnetometer).
        a_std = converted[1].std(-1)
        g_mean = converted[3].mean(-1)
        g_std = converted[3].std(-1)
        m_over_0_count = (converted[2] >= 0.0).sum(-1).astype(np.float32)
        a_mean = converted[1].mean(-1)
        a_l2_std = np.sqrt(converted[1][:, 0, :]**2 +
                           converted[1][:, 1, :]**2 +
                           converted[1][:, 2, :]**2).std(-1)[:, np.newaxis]
        m_l2_std = np.sqrt(converted[2][:, 0, :]**2 +
                           converted[2][:, 1, :]**2 +
                           converted[2][:, 2, :]**2).std(-1)[:, np.newaxis]

        features = np.concatenate(
            (a_std, g_mean, g_std, m_over_0_count, a_mean, a_l2_std, m_l2_std),
            axis=1)
        labels = converted[
            0]  # onehot.fit_transform(converted[0].reshape(-1, 1)).toarray()

        length = labels.shape[0]

        classifier = LGBMClassifier()
        classifier.fit(features[:int(length * 0.7)],
                       labels[:int(length * 0.7)])

        validate_y = labels[int(length * 0.7):]
        predict_y = classifier.predict(features[int(length * 0.7):])
        logger.info('Xiong')
        logger.info(f'Accuracy: {accuracy_score(validate_y, predict_y)}')
        logger.info(
            f'Precision: {precision_score(validate_y, predict_y, average=None)}'
        )
        logger.info(
            f'Recall: {recall_score(validate_y, predict_y, average=None)}')
Example #3
def criteo_gdbtlr(X_idx, X_value, y):
    import os
    import numpy as np
    from sklearn.metrics import roc_auc_score, accuracy_score
    from sklearn.linear_model import LogisticRegression
    from lightgbm.sklearn import LGBMClassifier

    X_idx = X_idx.values.tolist()
    y = y.values.tolist()
    num_leaves = 31
    model = LGBMClassifier(num_leaves=num_leaves)
    model.fit(X_idx, y)
    # pwd_path is expected to be defined at module level.
    model_path = os.path.join(pwd_path, 'gbdtlr_model1.pt')
    # pred_leaf=True returns, for each sample, the index of the leaf it falls into in every tree.
    y_pred = model.predict(X_idx, pred_leaf=True)
    y_pred_gbdt = model.predict(X_idx, pred_leaf=False)
    acc = model.score(X_idx, y)
    print("gbdt train acc:", acc)
    s = roc_auc_score(y, y_pred_gbdt)
    print('gbdt auc:', s)
    a = accuracy_score(y, y_pred_gbdt)
    print('gbdt train acc:', a)
    import pickle  # pickle module

    # Save the model (note: the target folder must already exist, otherwise this raises an error)
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    # # Load the model back
    # with open('save/clf.pickle', 'rb') as f:
    #     clf2 = pickle.load(f)

    # One-hot encode the leaf assignments:
    # column index = tree_index * num_leaves + leaf_index.
    transformed_matrix = np.zeros(
        [len(y_pred), len(y_pred[0]) * num_leaves], dtype=np.int64)
    for i in range(0, len(y_pred)):
        temp = np.arange(len(y_pred[0])) * num_leaves + np.array(y_pred[i])
        transformed_matrix[i][temp] += 1

    lr_model = LogisticRegression()
    lr_model.fit(transformed_matrix, y)
    y_pred_lr = lr_model.predict(transformed_matrix)
    print("truth_y:", y[:100], 'y_pred_lr:', y_pred_lr[:100])

    s = roc_auc_score(y, y_pred_lr)
    print('auc:', s)
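A note on the leaf encoding in this example: the dense loop above can also be written with scikit-learn's OneHotEncoder, which produces an equivalent (sparse) design matrix without the manual index arithmetic. A minimal sketch, assuming y_pred is the (n_samples, n_trees) leaf-index array returned by predict(..., pred_leaf=True) and num_leaves matches the trained model:

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

# Treat each tree's leaf index as one categorical feature with num_leaves levels.
n_trees = len(y_pred[0])
encoder = OneHotEncoder(categories=[list(range(num_leaves))] * n_trees)
leaf_features = encoder.fit_transform(y_pred)  # sparse (n_samples, n_trees * num_leaves)

# The logistic-regression stage is unchanged; it simply consumes the sparse matrix.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(leaf_features, y)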
Example #4
        def score(params, skf=skf, sample_weight=sample_weight):
            params = {"max_depth": int(params["max_depth"]),
                      "subsample": params["subsample"],
                      "colsample_bytree": params['colsample_bytree'],
                      "num_leaves": int(params['num_leaves']),
                      "n_jobs": -2
                      }

            clf = LGBMClassifier(n_estimators=500, learning_rate=0.05, **params)

            list_score_acc = []
            list_score_logloss = []

            for train, val in skf.split(self.X, self.y):
                X_train, X_val = self.X[train], self.X[val]
                y_train, y_val = self.y[train], self.y[val]

                weight_train = sample_weight[train]
                weight_val = sample_weight[val]

                clf.fit(X_train, y_train,
                        sample_weight=weight_train,
                        eval_sample_weight=[weight_val],
                        eval_set=[(X_val, y_val)],
                        eval_metric="logloss",
                        early_stopping_rounds=0,
                        verbose=False
                        )

                _score_acc = accuracy_score(y_val, clf.predict(X_val), sample_weight=weight_val)
                _score_logloss = log_loss(y_val, clf.predict_proba(X_val), sample_weight=weight_val)

                list_score_acc.append(_score_acc)
                list_score_logloss.append(_score_logloss)
                """
                ##n_estimaters=0 causes error at .fit()
                if clf.best_iteration_ != -1:
                    list_best_iter.append(clf.best_iteration_)
                else:
                    list_best_iter.append(params['n_estimators'])
                break
                """
            # logger.info("n_estimators: {}".format(list_best_iter))
            # params["n_estimators"] = np.mean(list_best_iter, dtype=int)

            score_acc = (np.mean(list_score_acc), np.min(list_score_acc), np.max(list_score_acc))
            # logger.info("score_acc %s" % np.mean(list_score_acc))

            # score_logloss = (np.mean(list_score_logloss), np.min(list_score_logloss), np.max(list_score_logloss))
            # score_f1 = (np.mean(list_score_f1), np.min(list_score_f1), np.max(list_score_f1))
            # score_auc = (np.mean(list_score_auc), np.min(list_score_auc), np.max(list_score_auc))

            logloss = np.mean(list_score_logloss)
            return {'loss': logloss, 'status': STATUS_OK, 'localCV_acc': score_acc}
Example #5
def lgb(x_train, y_train, x_val, y_val):
    model = LGBMClassifier(n_estimators=1000,
                           max_depth=10,
                           subsample=0.7,
                           colsample_bytree=0.7,
                           learning_rate=0.01,
                           random_state=2020)
    model.fit(x_train, y_train)
    result = model.predict(x_val)
    # f1_score expects (y_true, y_pred).
    score = f1_score(y_val, result)
    return score
Example #6
    def find_best_cv(self):
        Util.split_cv(self.X, self.y, self.n_folds_list, ORG_DATA_DIR)

        acc_score_means = []
        acc_score_vars = []
        for num_of_fold in self.n_folds_list:
            print("============")
            logger.info("==evaluating %s fold==" % num_of_fold)
            CV_DIR = os.path.join(ORG_DATA_DIR, "n_folds_%s/" % num_of_fold)
            acc_score = []
            for i in range(num_of_fold):
                logger.info("loading %s th cv data in %s folds" % (i, num_of_fold))
                X_train = pd.read_csv(os.path.join(CV_DIR, "X_train_%s.csv") % i, header=None, sep="\t").values
                X_val = pd.read_csv(os.path.join(CV_DIR, "X_val_%s.csv") % i, header=None, sep="\t").values
                y_train = pd.read_csv(os.path.join(CV_DIR, "y_train_%s.csv") % i, header=None, sep="\t").values
                y_c, y_r = y_train.shape
                y_train = y_train.reshape(y_c, )
                y_val = pd.read_csv(os.path.join(CV_DIR, "y_val_%s.csv") % i, header=None, sep="\t").values
                y_c, y_r = y_val.shape
                y_val = y_val.reshape(y_c, )
                logger.info("end loading %s th cv data in %s folds" % (i, num_of_fold))
                logger.info("X_train.shape: %s %s" % X_train.shape)
                logger.info("X_val.shape: %s %s" % X_val.shape)
                logger.info("y_train.shape: %s" % y_train.shape)
                logger.info("y_val.shape: %s" % y_val.shape)

                clf = LGBMClassifier(objective="binary",
                                     n_estimators=20)

                weight_train = self._calc_w(y_train)

                clf.fit(X_train, y_train,
                        sample_weight=weight_train,
                        eval_set=[(X_val, y_val)],
                        verbose=True)
                y_pred = clf.predict(X_val)
                logger.info("acc socore: %s folds, %s iteration" % (num_of_fold, i))
                acc_score.append(accuracy_score(y_val, y_pred))
            logger.info("mean acc score of %s folds is %s" % (num_of_fold, np.mean(acc_score)))
            acc_score_means.append(np.mean(acc_score))
            logger.info("variance of acc score of %s folds is %s" % (num_of_fold, np.var(acc_score)))
            acc_score_vars.append(np.var(acc_score))
        for i in range(len(self.n_folds_list)):
            logger.info(
                "===%s_folds=== mean acc:%s, var acc: %s " % (self.n_folds_list[i],
                                                              acc_score_means[i],
                                                              acc_score_vars[i])
            )
Example #7
def get_ntree():
    # Sweep n_estimators to pick a reasonable tree count; X_t, y_t, X_v, y_v are
    # the train/validation splits defined elsewhere in the script.
    f1_t_total, f1_v_total = [], []
    for ntree in range(10, 810, 10):
        lgb_base = LGBMClassifier(n_estimators=ntree, objective='binary',
                                  random_state=1234, n_jobs=2, colsample_bytree=0.8,
                                  reg_alpha=1, max_depth=15, subsample=0.8)

        print('current ntree = %s' % ntree)
        lgb_base.fit(X_t, y_t)
        y_t_pre = lgb_base.predict(X_t)
        y_v_pre = lgb_base.predict(X_v)
        f1_t_each = f1_score(y_t, y_t_pre, average='micro')
        f1_v_each = f1_score(y_v, y_v_pre, average='micro')
        f1_t_total.append(f1_t_each)
        f1_v_total.append(f1_v_each)
        # Append the train/validation micro-F1 for this tree count.
        with open('D:\\workspace python\\contest\\accu_save\\' + 'lgbbase_810_2.txt',
                  'a', encoding='utf-8') as myfile:
            print(f1_t_each, ',', f1_v_each, file=myfile)
    return f1_t_total, f1_v_total
Example #8
def evaluate_age():
    features = pd.read_csv(
        'data/combine_feature/part-00000-380aaa4b-c838-43f4-8cb7-80164a4256f2-c000.csv'
    )
    y = features.age.values
    features.drop(['user_id', 'age', 'gender'], axis=1, inplace=True)
    print(features.shape)
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        y,
                                                        test_size=0.2)
    lightgbm = LGBMClassifier(n_estimators=200,
                              num_leaves=100,
                              feature_fraction=0.75,
                              bagging_fraction=0.75,
                              learning_rate=0.1)
    lightgbm.fit(X_train,
                 y_train,
                 eval_set=[(X_test, y_test)],
                 early_stopping_rounds=5)
    pred = lightgbm.predict(X_test)
    print(classification_report(y_test, pred))
    joblib.dump(lightgbm, 'data/lgb_age')
class PHSICAdasynLGBM(BaseEstimator):
    """
    An estimator upsampling minority classes, finding a small set of 
    stable biomarkers, and fitting a gradient boosting model over them

    Parameters
    ----------
    n_features : int, optional (default=30)
        Max. number of biomarkers (important features) to be selected

    adasyn_neighbors : int, optional (default=10)
        K neighbors for ADASYN upsampling algorithm
        
    B : int, optional (default=20)
        Block size for Block HSIC Lasso
        
    M : int, optional (default=10)
        Max allowed permutations of samples for Block HSIC Lasso

    hsic_splits : int, optional (default=3)
        Number of folds for verifying feature stability

    stability_minimum_across_splits : int, optional (default=2)
        Minimum number of splits in which a feature must be selected to be kept

    feature_neighbor_threshold : float, optional (default=0.4)
        Threshold for considering neighbors of important features in the stability check
    """
    def __init__(self,
                 n_features=30,
                 adasyn_neighbors=10,
                 B=20,
                 M=10,
                 hsic_splits=3,
                 stability_minimum_across_splits=2,
                 feature_neighbor_threshold=0.4):
        self.n_features = n_features
        self.adasyn_neighbors = adasyn_neighbors
        self.M = M
        self.B = B
        self.hsic_splits = hsic_splits
        self.neighbor_threshold = feature_neighbor_threshold
        self.stability_minimum_across_splits = stability_minimum_across_splits

    def fit(self, X, y):
        if X.shape[1] > 10000:
            #clf = RandomForestClassifier(n_estimators=1000,n_jobs=-1).fit(X,y)
            clf = LGBMClassifier(n_estimators=1000, n_jobs=-1).fit(X, y)
            ftimp = clf.feature_importances_
            relevant = np.where(ftimp > 0)[0]
            print("relevant ft:", len(relevant), "/", X.shape[1])
        else:
            relevant = np.arange(X.shape[1])

        sss = StratifiedShuffleSplit(n_splits=self.hsic_splits,
                                     random_state=42)
        idxs = []
        hsics = []
        for train_index, test_index in list(sss.split(X, y)):
            hsic_lasso2 = HSICLasso()
            hsic_lasso2.input(X[:, relevant][train_index], y[train_index])
            hsic_lasso2.classification(
                self.n_features, B=self.B,
                M=self.M)  #(self.n_features, B=self.B, M=self.M)
            hsics.append(hsic_lasso2)

            # not just best features - get their neighbors (similar features) too
            all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
            for i in range(len(all_ft_idx)):
                idx = np.array(hsic_lasso2.get_index_neighbors(
                    feat_index=i, num_neighbors=10),
                               dtype=int)
                # Neighbor scores are similarities in [0, 1], so keep them as
                # floats for the threshold comparison below.
                score = np.array(hsic_lasso2.get_index_neighbors_score(
                    feat_index=i, num_neighbors=10),
                                 dtype=float)
                idx = idx[np.where(score > self.neighbor_threshold)[0]]
                all_ft_idx = np.concatenate((all_ft_idx, idx))
            all_ft_idx = np.unique(all_ft_idx)

            idxs.append(relevant[all_ft_idx])
            #if len(idxs) == 1:
            #    self.hsic_idx_ = idxs[0]
            #else:
            #    self.hsic_idx_ = np.intersect1d(idxs[-1], self.hsic_idx_)
        self.hsic_idx_ = []

        stability_concession = 0
        while len(self.hsic_idx_) == 0:
            featurecandidates = np.unique(np.concatenate(idxs))
            for candidate in featurecandidates:
                occurrences = np.sum(
                    [1 if candidate in idx else 0 for idx in idxs])
                if occurrences > self.stability_minimum_across_splits - stability_concession:
                    self.hsic_idx_.append(candidate)
            if len(self.hsic_idx_) > 1:
                break
            else:
                # failed to find commonly occurring features - reduce threshold
                stability_concession += 1
        print("HSIC done.", len(self.hsic_idx_), "(out of ",
              len(featurecandidates), " candidates)")

        print("Upsampling with ADASYN... (features: " +
              str(len(self.hsic_idx_)) + ")")
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors,
                    n_jobs=-1)
        sX, sy = X[:, self.hsic_idx_], y
        if self.adasyn_neighbors > 0:
            try:
                sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
                for i in range(len(np.unique(y)) - 1):
                    sX, sy = sm.fit_resample(sX, sy)
            except:
                pass
            print("ADASYN done. Starting clf")

        self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
        print("done")
        return self

    def predict_proba(self, X):
        return self.clf_.predict_proba(X[:, self.hsic_idx_])

    def predict(self, X):
        return self.clf_.predict(X[:, self.hsic_idx_])
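PHSICAdasynLGBM follows the scikit-learn fit/predict convention, so a minimal usage sketch with made-up synthetic data (purely illustrative; it assumes numpy, pyHSICLasso, imbalanced-learn and lightgbm are installed) looks like:

import numpy as np

# Illustrative only: random data stands in for a real expression matrix.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 500))
y = rng.integers(0, 2, size=200)

est = PHSICAdasynLGBM(n_features=10, hsic_splits=3)
est.fit(X, y)
probabilities = est.predict_proba(X)
predictions = est.predict(X)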
Example #10
                     subsample_for_bin=800,
                     n_jobs=4)
# # specify your configurations as a dict
# param_grid_xgboost={'min_child_samples':np.arange(10,100,10)}
# start_time=time.clock()
# grid_lgb=GridSearchCV(lgb,param_grid_xgboost,cv=5,scoring='accuracy')
# grid_lgb.fit(X,y)
# endtime=time.clock()
# print('score',grid_lgb.grid_scores_)
# print('Xgboost_best_estimator_',grid_lgb.best_estimator_)
# print('Xgboost_best_score_',grid_lgb.best_score_)
# print('Xgboost_best_params_',grid_lgb.best_params_)
# print("run_time",endtime-start_time)

start_time = time.perf_counter()
score_all = 0
kf = KFold(n_splits=5, shuffle=True)
for train, test in kf.split(X):
    print(len(train), len(test))
    X_train = X[train]
    X_test = X[test]
    y_train = y[train]
    y_test = y[test]
    lgb.fit(X_train, y_train)
    preds = lgb.predict(X_test)
    score = accuracy_score(y_test, preds)
    print("score:", score)
    score_all = score_all + score
print("score_all", score_all / 5)
endtime = time.perf_counter()
print("run_time", endtime - start_time)
Example #11
def zip_process(zip_code):
    # Keep only the base ZIP code (before any '-') and map it through zip_dic.
    z = int(str(zip_code).split('-')[0])
    return zip_dic[z]


df['gender'] = df['gender'].apply(gender_process)
df['age'] = df['age'].apply(age_process)
df['genres'] = df['genres'].apply(genres_process)
df['zip'] = df['zip'].apply(zip_process)

y = df['label'].values
df.drop(
    columns=['user_id', 'movie_id', 'rating', 'timestamp', 'title', 'label'],
    inplace=True)
x = df.values

length = int(len(x) * 0.9)
x_train = x[0:length]
y_train = y[0:length]
x_test = x[length:]
y_test = y[length:]

model = LGBMClassifier(n_estimators=1200)
model.fit(x_train, y_train)
prediction = model.predict(x_test)
acc = accuracy_score(y_test, prediction)
print(acc)
Example #12
            subsample_freq=1,
            subsample=0.8,
            colsample_bytree=0.8,
            min_child_weight=5,
            random_state=2020,
            n_jobs=24,
        )
        clf.fit(
            X_trn,
            Y_trn,
            eval_set=[(X_val, Y_val)],
            early_stopping_rounds=500,
            verbose=200,
        )
        print('val_acc: {:.5f}'.format(
            accuracy_score(Y_val, clf.predict(X_val))))
        oof[val_idx] = clf.predict_proba(X_val)
        sub += clf.predict_proba(X_sub) / skf.n_splits

    print('cv_acc : {:.5f}'.format(accuracy_score(Y_train,
                                                  oof.argmax(axis=1))))
    print(
        classification_report(Y_train,
                              oof.argmax(axis=1),
                              target_names=lbl.classes_))

    oof_files = [
        'bert_oof0_2', 'cnn_oof0_2', 'mlp_oof0_2', 'bert_oof1', 'cnn_oof1',
        'mlp_oof1'
    ]
    sub_files = [
Example #13
print(
    "Select best LGB model with n_estimators = {}  with best_score={}".format(
        best_clf.best_params_['n_estimators'], best_clf.best_score_))
#%%
for a in [100, 300, 600, 1000]:
    for b in [0.0001, 0.001, 0.01, 0.1, 1, 10]:
        LBMclf = LGBMClassifier(random_state=50,
                                n_jobs=-1,
                                n_estimators=a,
                                reg_lambda=b)
        LBMclf.fit(train_data, train_label)
        print(
            "The reuslt AUC_ROC of the lightGBM with n_estimators={} and reg_lambda={} on test data is"
            .format(a, b),
            roc_auc_score(test_label.tolist(),
                          LBMclf.predict(test_data).tolist()))

#%%
# Make the model with the specified regularization parameter
clf = LogisticRegression()
best_clf = GridSearchCV(clf,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1,
                        param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]})
best_clf.fit(train_data, train_label)
print("Select best Logistic Regression model with C = {} with best_score={}".
      format(best_clf.best_params_['C'], best_clf.best_score_))
#%%
for c in [0.001, 0.01, 0.1, 1, 10, 100]:
    test_clf = LogisticRegression(C=c)
Example #14
                            drop_first=True)).reshape(X.shape[0])

# split the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.21,
                                                    random_state=42)

# build the model

LightGBM = LGBMClassifier()

LightGBM.fit(X_train, y_train)

# make predictions with the model
pred = LightGBM.predict(X_test)

# baseline accuracy score
print(f"Baseline accuracy: {accuracy_score(y_test, pred)}")

# choose hyperparameters

hiperparams = {
    'max_depth': np.arange(2, 10, 2),
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1],
    'n_estimators': np.arange(200, 1000, 200)
}

model_cv = GridSearchCV(LightGBM, hiperparams, cv=10,
                        n_jobs=-1).fit(X_train, y_train)
print(model_cv.best_params_)
x_train, x_test, y_train, y_test = train_test_split(data_features_part,
                                                    data_target_part,
                                                    test_size=0.2,
                                                    random_state=2021)

## Import the LightGBM model
from lightgbm.sklearn import LGBMClassifier

# Define the LightGBM model
clf = LGBMClassifier()

# Train the LightGBM model on the training set
clf.fit(x_train, y_train)

# Use the trained model to predict on the training and test sets respectively
train_predict = clf.predict(x_train)
test_predict = clf.predict(x_test)

from sklearn import metrics

# Evaluate the model with accuracy (the fraction of samples predicted correctly)
print('The accuracy of LightGBM on the training set is:',
      metrics.accuracy_score(y_train, train_predict))
print('The accuracy of LightGBM on the test set is:',
      metrics.accuracy_score(y_test, test_predict))

# Inspect the confusion matrix (counts of true vs. predicted classes)
confusion_matrix_result = metrics.confusion_matrix(y_test, test_predict)
print('The confusion matrix result:\n', confusion_matrix_result)

# Visualize the result with a heatmap
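# One way to render it (illustrative sketch, assuming matplotlib and seaborn are available):
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix_result, annot=True, cmap='Blues')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.show()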
def classes(data, label, test):
    model = LGBMClassifier()
    model.fit(data, label)
    ans = model.predict(test)
    estimate(model, data)
    return ans
Example #17
    n_folds = 3
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
    folds = list(skf.split(x_train, y_train))
    train_sets = np.zeros((x_train.shape[0], len(clfs)))
    test_sets = np.zeros((x_test.shape[0], len(clfs)))
    for j, clf in enumerate(clfs):
        '''Train each base model in turn.'''
        print(j, clf)
        test_j = np.zeros((x_test.shape[0], len(folds)))
        for i, (trn_idx, val_idx) in enumerate(folds):
            '''Use fold i for prediction and the remaining folds for training;
            the out-of-fold predictions become the new feature for fold i.'''
            # print("Fold", i)
            trn_x, trn_y = x_train[trn_idx], y_train[trn_idx]
            val_x, val_y = x_train[val_idx], y_train[val_idx]
            clf.fit(trn_x, trn_y)
            # Use the positive-class probability as the stacked feature.
            y_submission = clf.predict_proba(val_x)[:, 1]
            train_sets[val_idx, j] = y_submission
            test_j[:, i] = clf.predict_proba(x_test)[:, 1]
        '''For the test set, average the k fold models' predictions as its new feature.'''
        test_sets[:, j] = test_j.mean(axis=1)

    print(train_sets)
    print(test_sets)

    lgb.fit(train_sets, y_train)
    result = lgb.predict_proba(test_sets)[:, 1]
    result = (result - result.min()) / (result.max() - result.min())
    print(result)
    print(result.shape)
class PHSICAdasynLGBM(BaseEstimator):
    """
    An estimator upsampling minority classes, finding a small set of 
    stable biomarkers, and fitting a gradient boosting model over them

    Parameters
    ----------
    n_features : int, optional (default=30)
        Max. number of biomarkers (important features) to be selected

    adasyn_neighbors : int, optional (default=10)
        K neighbors for ADASYN upsampling algorithm
        
    B : int, optional (default=20)
        Block size for Block HSIC Lasso
        
    M : int, optional (default=10)
        Max allowed permutations of samples for Block HSIC Lasso

    hsic_splits :  int, optional (default=5)
        number of folds for verifying feature stability

    feature_neighbor_threshold : float, optional (default=0.4)
        threshold for considering neighbors of important features in stability check
    """
    def __init__(self,
                 n_features=30,
                 adasyn_neighbors=10,
                 B=20,
                 M=10,
                 hsic_splits=5,
                 feature_neighbor_threshold=0.4):
        self.n_features = n_features
        self.adasyn_neighbors = adasyn_neighbors
        self.M = M
        self.B = B
        self.hsic_splits = hsic_splits
        self.neighbor_threshold = feature_neighbor_threshold

    def fit(self, X, y):
        sss = StratifiedShuffleSplit(n_splits=self.hsic_splits,
                                     random_state=42)
        idxs = []
        hsics = []
        for train_index, test_index in list(sss.split(X, y)):
            hsic_lasso2 = HSICLasso()
            hsic_lasso2.input(X[train_index], y[train_index])
            hsic_lasso2.classification(
                self.n_features, B=self.B,
                M=self.M)  #(self.n_features, B=self.B, M=self.M)
            hsics.append(hsic_lasso2)

            # not just best features - get their neighbors (similar features) too
            all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
            for i in range(len(all_ft_idx)):
                idx = np.array(hsic_lasso2.get_index_neighbors(
                    feat_index=i, num_neighbors=10),
                               dtype=int)
                # Neighbor scores are similarities in [0, 1], so keep them as
                # floats for the threshold comparison below.
                score = np.array(hsic_lasso2.get_index_neighbors_score(
                    feat_index=i, num_neighbors=10),
                                 dtype=float)
                idx = idx[np.where(score > self.neighbor_threshold)[0]]
                all_ft_idx = np.concatenate((all_ft_idx, idx))
            all_ft_idx = np.unique(all_ft_idx)

            idxs.append(all_ft_idx)
            if len(idxs) == 1:
                self.hsic_idx_ = idxs[0]
            else:
                self.hsic_idx_ = np.intersect1d(idxs[-1], self.hsic_idx_)
        print("HSIC done.", len(self.hsic_idx_))

        print("Upsampling with ADASYN... (features: " +
              str(len(self.hsic_idx_)) + ")")
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors,
                    n_jobs=-1)
        sX, sy = X[:, self.hsic_idx_], y
        if self.adasyn_neighbors > 0:
            try:
                sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
                for i in range(len(np.unique(y)) - 1):
                    sX, sy = sm.fit_resample(sX, sy)
            except:
                pass
            print("ADASYN done. Starting clf")

        self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
        print("done")
        return self

    def predict_proba(self, X):
        return self.clf_.predict_proba(X[:, self.hsic_idx_])

    def predict(self, X):
        return self.clf_.predict(X[:, self.hsic_idx_])
    #"""# 调参后训练
    time1 = time.time()
    lgb = LGBMClassifier(boosting_type='gbdt',
                         learning_rate=l_r,
                         n_estimators=n_e,
                         num_leaves=num_leaves,
                         subsample=ss,
                         colsample_bytree=c_b,
                         objective='binary',
                         random_state=10)
    # Load the previously trained model from disk rather than refitting the one above.
    lgb = joblib.load('LightGBM_model.pkl')
    dtime1 = time.time() - time1

    # Predict on test set 1
    time2 = time.time()
    test_pre = lgb.predict(data['test_data'])
    dtime2 = time.time() - time2
    test_preb = lgb.predict_proba(data['test_data'])
    acc = metrics.accuracy_score(data['test_label'], test_pre)
    t = [i[1] for i in test_preb]
    auc = metrics.roc_auc_score(data['test_label'], t)
    recall = metrics.recall_score(data['test_label'], test_pre)
    prec = metrics.precision_score(data['test_label'], test_pre)
    f1 = metrics.f1_score(data['test_label'], test_pre)
    print(lgb.get_params())
    print(lgb.feature_importances_)
    print('''train time: %d
    predict time: %d
    acc: %f
    auc: %f
    recall: %f
def loadDataFrame():
    iris = load_iris()
    iris_data = iris.data
    iris_target = iris.target
    iris_df = pd.DataFrame(iris_data, columns=iris.feature_names)
    iris_df['target'] = pd.Series(iris_target)

    return iris_df


if __name__ == "__main__":
    iris_df = loadDataFrame()
    value_counts = iris_df['target'].value_counts()
    print(value_counts)
    X_iris = iris_df.iloc[:, :-1].values
    y_iris = iris_df.iloc[:, -1].values

    X_train, X_test, y_train, y_test = train_test_split(X_iris,
                                                        y_iris,
                                                        test_size=0.3,
                                                        stratify=y_iris)

    model = LGBMClassifier()

    model.fit(X_train, y_train)
    print(model.get_params())

    y_pred = model.predict(X_test)
    print("accuracy: %s" % np.mean(y_pred == y_test))
Example #21
def multi_machine_learing_models(data_train, data_cv):
    print('Training models...')
    # Note: data_cv is concatenated into the training set, so the scores reported
    # below are computed on data the models have already seen.
    data_train = pd.concat([data_train, data_cv], axis=0)
    y_train = data_train['label'].apply(lambda x: 0 if x == 'good' else 1)
    y_test = data_cv['label'].apply(lambda x: 0 if x == 'good' else 1)

    X_train = data_train.drop(['URL', 'label'], axis=1)
    X_test = data_cv.drop(['URL', 'label'], axis=1)

    filename_bayes = r'classifier_model\c_bayes.model'
    filename_LGB = r'classifier_model\c_LGB.model'
    filename_ada = r'classifier_model\c_ada.model'
    filename_rf = r'classifier_model\c_rf.model'
    filename_decision_tree = r'classifier_model\c_decision_tree.model'
    filename_lgs = r'classifier_model\c_lgs.model'

    vote = [0] * len(y_test)

    bayes = BernoulliNB()
    bayes.fit(X_train, y_train)
    print('\nBayes model accuracy:', bayes.score(X_test, y_test))
    predict = bayes.predict(X_test)
    vote = list(map(lambda x: x[0] + x[1], zip(predict, vote)))
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precision:", precision)
    print("recall:", recall)
    joblib.dump(bayes, filename_bayes)

    gbc = LGBMClassifier(n_estimators=200, objective='binary')
    gbc.fit(X_train, y_train)
    print('LGBMClassifier model accuracy:', gbc.score(X_test, y_test))
    predict = gbc.predict(X_test)
    vote = list(map(lambda x: 3 * x[0] + x[1], zip(predict, vote)))
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precision:", precision)
    print("recall:", recall)
    joblib.dump(gbc, filename_LGB)

    ada = AdaBoostClassifier(n_estimators=100)  # 100 boosting iterations
    ada.fit(X_train, y_train)
    print('AdaBoost model accuracy:', ada.score(X_test, y_test))
    predict = ada.predict(X_test)
    vote = list(map(lambda x: 2 * x[0] + x[1], zip(predict, vote)))
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precision:", precision)
    print("recall:", recall)
    joblib.dump(ada, filename_ada)

    rf = RandomForestClassifier(n_estimators=100, oob_score=True)
    rf.fit(X_train, y_train)
    print('\nRandom forest model accuracy:', rf.score(X_test, y_test))
    predict = rf.predict(X_test)
    vote = list(map(lambda x: x[0] * 3 + x[1], zip(predict, vote)))
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precision:", precision)
    print("recall:", recall)
    joblib.dump(rf, filename_rf)

    decision_tree = tree.DecisionTreeClassifier()
    decision_tree.fit(X_train, y_train)
    print('\nDecision tree model accuracy:', decision_tree.score(X_test, y_test))
    predict = decision_tree.predict(X_test)
    vote = list(map(lambda x: x[0] * 2 + x[1], zip(predict, vote)))
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precision:", precision)
    print("recall:", recall)
    joblib.dump(decision_tree, filename_decision_tree)

    lgs = LogisticRegression()
    lgs.fit(X_train, y_train)
    print('\nLogisticRegression model accuracy:', lgs.score(X_test, y_test))
    predict = lgs.predict(X_test)
    vote = list(map(lambda x: x[0] * 2 + x[1], zip(predict, vote)))
    precision = metrics.precision_score(y_test, predict)
    recall = metrics.recall_score(y_test, predict)
    print("precision:", precision)
    print("recall:", recall)
    joblib.dump(lgs, filename_lgs)

    print('\nVoting result:')
    vote_r = []
    for i in range(len(vote)):
        if vote[i] >= 3:
            vote_r.append(1)
        else:
            vote_r.append(0)
    precision = metrics.precision_score(y_test, vote_r)
    recall = metrics.recall_score(y_test, vote_r)
    acc = metrics.accuracy_score(y_test, vote_r)
    print('Accuracy:', acc)
    print("precision:", precision)
    print("recall:", recall)