def test_probability():
    """Check that predicted probabilities are normalised and match log-probas.

    Two ensembles are fitted on an imbalanced resampling of iris: one with
    decision trees, and one with logistic regression restricted to
    ``max_samples=5``.  For each, every row of ``predict_proba`` must sum to
    one and ``predict_proba`` must equal ``exp(predict_log_proba)``.
    """
    features, labels = make_imbalance(iris.data, iris.target,
                                      ratio={0: 20, 1: 25, 2: 50},
                                      random_state=0)
    X_train, X_test, y_train, _ = train_test_split(features, labels,
                                                   random_state=0)

    candidates = (
        # Normal case
        BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                  random_state=0),
        # Degenerate case, where some classes are missing
        BalancedBaggingClassifier(base_estimator=LogisticRegression(),
                                  random_state=0,
                                  max_samples=5),
    )

    # Divide/invalid warnings are silenced for the whole fit-and-check phase.
    with np.errstate(divide="ignore", invalid="ignore"):
        for candidate in candidates:
            ensemble = candidate.fit(X_train, y_train)
            proba = ensemble.predict_proba(X_test)
            assert_array_almost_equal(proba.sum(axis=1),
                                      np.ones(len(X_test)))
            assert_array_almost_equal(
                proba, np.exp(ensemble.predict_log_proba(X_test)))
def cross_validation(name):
    """Cross-validate a BalancedBaggingClassifier and print per-fold scores.

    The pickle at ``../data/conv_pred/train_data_ad_ignore_<name>.pickle`` is
    loaded, its ``'X'`` entry is vectorised with ``DictVectorizer`` and its
    ``'y'`` entry is used as the label array.  For every fold the two scores
    returned by the scoring helper and the averaged feature importances
    (sorted descending) are printed; the mean of both scores over all folds is
    printed at the end.

    name: suffix of the pickle file to load.
    """
    n_splits = 5

    # NOTE(review): pickle.load can execute arbitrary code; only point this
    # at data files produced by a trusted pipeline.
    with open('../data/conv_pred/train_data_ad_ignore_' + name + '.pickle',
              'rb') as f:
        data = pickle.load(f)

    v = DictVectorizer()
    X = v.fit_transform(data['X'])
    y = np.array(data['y'])

    kf = KFold(n_splits=n_splits)
    fscore = 0
    ftscore = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model = BalancedBaggingClassifier(n_estimators=100, n_jobs=8)
        model.fit(X_train, y_train)
        predict = model.predict_proba(X_test)

        # NOTE(review): `eval` is presumably a project scoring helper that
        # shadows the builtin -- confirm against the rest of the file.
        score, t_score = eval(y_test, predict)

        # Each fitted member is indexed as a pipeline; its second step is the
        # estimator exposing feature_importances_.
        mean_importances = np.mean(
            [est.steps[1][1].feature_importances_
             for est in model.estimators_],
            axis=0)
        pprint(
            sorted(zip(mean_importances, v.feature_names_),
                   key=lambda pair: pair[0],
                   reverse=True))

        print('score : ', str(score))
        print('true_score : ', str(t_score))
        fscore += score
        ftscore += t_score

    print('\n')
    # Average over the number of folds actually run (the totals used to be
    # divided by a hard-coded 10 although only 5 folds are evaluated).
    print('final score : ', str(fscore / n_splits))
    print('final true_score : ', str(ftscore / n_splits))
def Model_3(train, test):
    '''
    Fits a balanced-bagging ensemble of random forests and writes two CSVs.

    train : Training set; its 'Sequence' and 'label' columns are read
    test : Test set; its 'Sequence' and 'ID' columns are read

    "Submission_3.csv" receives the second column of predict_proba as
    "Label"; "Predictions_3.csv" receives the predicted labels as "Label".
    '''
    # Preprocessing: DPC is applied to every sequence
    train_features = list(map(DPC, train['Sequence']))
    test_features = list(map(DPC, test['Sequence']))
    targets = train['label']

    # Training
    forest = RandomForestClassifier(bootstrap=False,
                                    n_estimators=450,
                                    random_state=6)
    clf = BalancedBaggingClassifier(base_estimator=forest,
                                    n_estimators=25,
                                    n_jobs=-1,
                                    random_state=6,
                                    verbose=1)
    clf.fit(train_features, targets)

    # Predicting
    predicted_labels = clf.predict(test_features)
    second_column_proba = [row[1] for row in clf.predict_proba(test_features)]

    result = pd.DataFrame()
    result["ID"] = test["ID"]
    outputs = (
        (second_column_proba, "Submission_3.csv"),
        (predicted_labels, "Predictions_3.csv"),
    )
    for label_values, csv_name in outputs:
        result["Label"] = label_values
        result.to_csv(csv_name, index=False)
def cross_validation(x):
    """Run 5-fold cross-validation and print the averaged metrics.

    Loads ``../data/conv_pred/train_data_<x>.pickle``, prints the loaded data
    and the number of labels equal / not equal to 0, then fits a
    BalancedBaggingClassifier per fold.  The four values returned by the
    scoring helper are accumulated and their per-fold means printed.

    x: suffix of the pickle file to load.
    """
    with open('../data/conv_pred/train_data_' + x + '.pickle', 'rb') as f:
        data = pickle.load(f)
    print(data)

    v = DictVectorizer()
    X = v.fit_transform(data['X'])
    y = np.array(data['y'])

    # Class balance: labels equal to 0 versus everything else.
    zero = sum(1 for label in y if label == 0)
    one = len(y) - zero
    print(zero)
    print(one)

    cv = 5
    precision_total = 0
    recall_total = 0
    f_value_total = 0
    all_precision_total = 0

    for train_index, test_index in tqdm(KFold(n_splits=cv).split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model = BalancedBaggingClassifier(n_estimators=100, n_jobs=8)
        model.fit(X_train, y_train)
        predict = model.predict_proba(X_test)

        # NOTE(review): `eval` is presumably a project scoring helper that
        # shadows the builtin -- confirm against the rest of the file.
        precision, recall, f_value, all_pre = eval(y_test, predict)
        all_precision_total += all_pre
        precision_total += precision
        recall_total += recall
        f_value_total += f_value

        # NOTE(review): the source formatting was lost; this report is
        # assumed to be printed once per fold -- confirm.
        averaged = np.mean(
            [est.steps[1][1].feature_importances_
             for est in model.estimators_],
            axis=0)
        ranked = sorted(zip(averaged, v.feature_names_),
                        key=lambda pair: pair[0],
                        reverse=True)
        pprint(ranked)

    print('\n')
    print('final precision : ', str(precision_total / cv))
    print('final recall : ', str(recall_total / cv))
    print('final f-value : ', str(f_value_total / cv))
    print('final all_precision : ', str(all_precision_total / cv))
class Classifier(BaseEstimator):
    """Thin wrapper exposing ``fit`` / ``predict_proba`` of a balanced ensemble.

    Mimics a balanced random forest by combining BalancedBaggingClassifier
    with a DecisionTreeClassifier that subsamples features at each split.
    """

    def __init__(self):
        # NOTE(review): `ratio=` and `max_features='auto'` are spellings from
        # older imbalanced-learn / scikit-learn releases -- confirm they are
        # still accepted by the installed versions.
        self.bbc = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(max_features='auto'),
            ratio=determine_ratio,
            random_state=0,
            n_estimators=50,
            n_jobs=1)

    def fit(self, X, y):
        """Fit the wrapped ensemble on ``X``, ``y`` and return ``self``.

        Returning the estimator follows the scikit-learn convention, so the
        wrapper can be chained (``Classifier().fit(X, y).predict_proba(X)``).
        """
        self.bbc.fit(X, y)
        return self

    def predict_proba(self, X):
        """Return the class probabilities predicted by the wrapped ensemble."""
        return self.bbc.predict_proba(X)
def cross_validation_another(x):
    """Fit on the day-'A' training pickle and score on the day-'A' test pickle.

    Prints the number of training labels equal / not equal to 0, then the
    four values returned by the scoring helper for the held-out file.

    x: not read by this function.
       NOTE(review): the file names hard-code 'A'; confirm whether ``x`` was
       meant to select the file.
    """

    def _load(path):
        # Read one pickled dataset from disk.
        with open(path, 'rb') as f:
            return pickle.load(f)

    data = _load('../data/conv_pred/super_train_data_day_' + 'A' + '.pickle')
    test = _load('../data/conv_pred/super_test_data_day_' + 'A' + '.pickle')

    # The vectoriser is fitted on the training features only and then reused
    # for the test features.
    v = DictVectorizer()
    X_train = v.fit_transform(data['X'])
    y_train = np.array(data['y'])
    X_test = v.transform(test['X'])
    y_test = np.array(test['y'])

    zero = sum(1 for label in y_train if label == 0)
    one = len(y_train) - zero
    print(zero)
    print(one)

    model = BalancedBaggingClassifier(n_estimators=100,
                                      n_jobs=8,
                                      max_samples=0.6)
    model.fit(X_train, y_train)
    predict = model.predict_proba(X_test)

    # NOTE(review): `eval` is presumably a project scoring helper that
    # shadows the builtin -- confirm against the rest of the file.
    precision, recall, f_value, all_pre = eval(y_test, predict)

    print('\n')
    print('final precision : ', str(precision))
    print('final recall : ', str(recall))
    print('final f-value : ', str(f_value))
    print('final all_precision : ', str(all_pre))
X_train_o = X_train[:, 0:original_len] X_test_o = X_test[:, 0:original_len] X_train_n = X_train[:, original_len:] X_test_n = X_test[:, original_len:] for clf, clf_name in zip(clf_list, clf_name_list): print('processing', clf_name, 'round', i + 1) if clf_name != 'xgb': clf = BalancedBaggingClassifier(base_estimator=clf, ratio='auto', replacement=False) # fully supervised clf.fit(X_train_o, y_train.ravel()) y_pred = clf.predict_proba(X_test_o) roc_score = roc_auc_score(y_test, y_pred[:, 1]) prec_n = get_precn(y_test, y_pred[:, 1]) result_dict[clf_name + 'ROC' + 'o'].append(roc_score) result_dict[clf_name + 'PRC@n' + 'o'].append(prec_n) # unsupervised clf.fit(X_train_n, y_train.ravel()) y_pred = clf.predict_proba(X_test_n) roc_score = roc_auc_score(y_test, y_pred[:, 1]) prec_n = get_precn(y_test, y_pred[:, 1]) result_dict[clf_name + 'ROC' + 'n'].append(roc_score)
class Models(object):
    """Book classifier combining text, image, BERT, LDA and autoencoder features."""

    def __init__(self,
                 model_path=None,
                 feature_engineer=False,
                 train_mode=True):
        '''
        @description: initialize the feature extractors and the classifier
        @param {type} :
        model_path: path handed to `load` when `train_mode` is False
        feature_engineer: accepted but not read by this constructor
        train_mode: True creates a fresh LGBMClassifier, False loads a
                    saved model together with the index-to-label mapping
        Attributes set here:
        res_model: res network model
        resnext_model: resnext network model
        wide_model: wide res network model
        bert: bert model
        ml_data: new mldata class
        @return: No return
        '''
        # Load the image models (resnet, resnext, wide resnet) and move each
        # one to config.device.
        self.res_model = torchvision.models.resnet152(pretrained=True)
        self.res_model = self.res_model.to(config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True)
        self.resnext_model = self.resnext_model.to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(pretrained=True)
        self.wide_model = self.wide_model.to(config.device)

        # Load the BERT tokenizer and model; the model goes to config.device.
        self.bert_tonkenizer = BertTokenizer.from_pretrained(
            config.root_path + '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path + '/model/bert')
        self.bert = self.bert.to(config.device)

        # Dataset wrapper; train_mode is forwarded unchanged.
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)

        if train_mode:
            self.model = lgb.LGBMClassifier(objective='multiclass',
                                            n_jobs=10,
                                            num_class=33,
                                            num_leaves=30,
                                            reg_alpha=10,
                                            reg_lambda=200,
                                            max_depth=3,
                                            learning_rate=0.05,
                                            n_estimators=2000,
                                            bagging_freq=1,
                                            bagging_fraction=0.9,
                                            feature_fraction=0.8,
                                            seed=1440)
        else:
            # Not training: load the trained model for prediction.
            self.load(model_path)
            # NOTE(review): source indentation was lost; the label mapping is
            # assumed to be loaded only in prediction mode (it is read only
            # by `predict`) -- confirm.
            with open(config.root_path + '/data/label2id.json',
                      encoding='utf-8') as label_file:
                label_name_to_index = json.load(label_file)
            self.ix2label = {v: k for k, v in label_name_to_index.items()}

    def feature_engineer(self):
        '''
        @description: This function is building all kinds of features
        @param {type} None
        @return:
        X_train, feature of train set
        X_test, feature of test set
        y_train, label of train set
        y_test, label of test set
        '''
        logger.info("generate embedding feature ")
        # tf-idf features and word2vec features (the original note says the
        # word2vec vectors are not aggregated here).
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.em.tfidf,
                                                   self.ml_data.em.w2v)
        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.em.tfidf,
                                                 self.ml_data.em.w2v)

        logger.info("generate autoencoder feature ")
        # Autoencoder embedding, taken from the encoder (not the decoder).
        ae = self.ml_data.em.ae
        train_ae = get_autoencoder_feature(train,
                                           ae.max_features,
                                           ae.max_len,
                                           ae.encoder,
                                           tokenizer=ae.tokenizer)
        test_ae = get_autoencoder_feature(test,
                                          ae.max_features,
                                          ae.max_len,
                                          ae.encoder,
                                          tokenizer=ae.tokenizer)

        logger.info("generate basic feature ")
        # Basic NLP features.
        train = get_basic_feature(train)
        test = get_basic_feature(test)

        logger.info("generate modal feature ")
        # File names of the available book covers; a set keeps the per-row
        # membership test below constant-time.
        cover_dir = config.root_path + '/data/book_cover/'
        cover = set(os.listdir(cover_dir))

        def cover_path(title):
            # Path of the cover matching the title, or '' when none exists.
            file_name = title + '.jpg'
            return cover_dir + file_name if file_name in cover else ''

        train['cover'] = train['title'].progress_apply(cover_path)
        test['cover'] = test['title'].progress_apply(cover_path)

        # One embedding column per image model, train first and then test.
        for column, cv_model in (('res_embedding', self.res_model),
                                 ('resnext_embedding', self.resnext_model),
                                 ('wide_embedding', self.wide_model)):
            for frame in (train, test):
                frame[column] = frame['cover'].progress_apply(
                    lambda x, m=cv_model: get_img_embedding(x, m))

        logger.info("generate bert feature ")
        for frame in (train, test):
            frame['bert_embedding'] = frame['text'].progress_apply(
                lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                                 self.bert))

        logger.info("generate lda feature ")
        lda = self.ml_data.em.lda
        # Bag-of-words representation of the stop-word-filtered tokens.
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: lda.id2word.doc2bow(x))
        # LDA embedding computed on top of the bag of words.
        train['lda'] = [get_lda_features(lda, doc) for doc in train['bow']]
        test['lda'] = [get_lda_features(lda, doc) for doc in test['bow']]

        logger.info("formate data")
        # Concatenate all features.
        train = formate_data(train, train_tfidf, train_ae)
        test = formate_data(test, test_tfidf, test_ae)

        # Split into features and integer labels.
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        X_train = train[cols]
        X_test = test[cols]
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        '''
        @description: use param search tech to find best param
        @param {type}
        search_method: two options. grid or bayesian optimization
        @return: the result of bayes_parameter_opt_lgb for 'bayesian';
                 None otherwise ('grid' replaces self.model instead)
        '''
        param = None
        if search_method == 'grid':
            logger.info("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            logger.info("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            # The value needs a placeholder; passing it as a bare extra
            # argument makes the logging module report a formatting error.
            logger.info("best param %s", param)
        return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        '''
        @description: handle unbalance data, then fit, evaluate and save
        @param {type}
        imbalance_method, three option, under_sampling for ClusterCentroids,
        SMOTE for over_sampling, ensemble for BalancedBaggingClassifier
        search_method: two options. grid or bayesian optimization
                       (currently not read: the parameter search is
                       disabled and fixed values are used instead)
        @return: None
        '''
        logger.info("get all freature")
        # Build every feature.
        (self.X_train, self.X_test, self.y_train,
         self.y_test) = self.feature_engineer()
        model_name = None

        # Choose how to handle class imbalance: over-sampling,
        # under-sampling or a balanced ensemble.
        if imbalance_method == 'over_sampling':
            logger.info("Use SMOTE deal with unbalance data ")
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            # Resample the test split from the test data itself; it used to
            # be overwritten with resampled *training* data, so the model
            # was evaluated on the samples it was fitted on.
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            logger.info("Use ClusterCentroids deal with unbalance data ")
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'

        logger.info('search best param')
        if imbalance_method != 'ensemble':
            # The call to param_search is disabled; these fixed values are
            # applied to the model through set_params.
            param = {'params': {'num_leaves': 3, 'max_depth': 5}}
            self.model = self.model.set_params(**param['params'])

        logger.info('fit model ')
        # Fit, then report the scores.
        self.model.fit(self.X_train, self.y_train)
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)
        # First value returned by get_score, logged as train accuracy.
        logger.info('Train accuracy %s' % per)
        # Test-set accuracy.
        logger.info('test accuracy %s' % acc)
        # Recall.
        logger.info('test recall %s' % recall)
        # F1 score.
        logger.info('test F1_score %s' % f1)
        self.save(model_name)

    def process(self, title, desc):
        '''
        @description: build the one-row feature frame needed for prediction
        @param {type}
        title: book title
        desc: book description
        @return: DataFrame holding every column except `labelIndex`
        '''
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x:
            [word for word in x if word not in self.ml_data.em.stopWords])
        df_tfidf, df = get_embedding_feature(df, self.ml_data.em.tfidf,
                                             self.ml_data.em.w2v)

        print("generate basic feature ")
        df = get_basic_feature(df)

        print("generate modal feature ")
        # No cover image is supplied at prediction time, so the image models
        # are fed the empty path.
        df['cover'] = ''
        for column, cv_model in (('res_embedding', self.res_model),
                                 ('resnext_embedding', self.resnext_model),
                                 ('wide_embedding', self.wide_model)):
            df[column] = df.cover.progress_apply(
                lambda x, m=cv_model: get_img_embedding(x, m))

        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        print("generate lda feature ")
        lda = self.ml_data.em.lda
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: lda.id2word.doc2bow(x))
        df['lda'] = [get_lda_features(lda, doc) for doc in df.bow]

        print("generate autoencoder feature ")
        ae = self.ml_data.em.ae
        df_ae = get_autoencoder_feature(df,
                                        ae.max_features,
                                        ae.max_len,
                                        ae.encoder,
                                        tokenizer=ae.tokenizer)

        print("formate data")
        # Placeholder label; it is filtered out of the returned columns.
        df['labelIndex'] = 1
        df = formate_data(df, df_tfidf, df_ae)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        return df[cols]

    def predict(self, title, desc):
        '''
        @description: predict the category of a book from its title and desc
        @param {type}
        title, input
        desc: input
        @return: (label, proba) where proba is the largest predicted
                 probability
        '''
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        '''
        @description:save model
        @param {type}
        model_name, file name for saving
        @return: None
        '''
        # config.root_path is the base directory used everywhere else in
        # this class.
        joblib.dump(self.model,
                    config.root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        '''
        @description: load model
        @param {type}
        path: model path
        @return:None
        '''
        self.model = joblib.load(path)
# ADD CODE HERE #first_test(X_train, y_train, X_test, y_test) #second_test(X_train, y_train, X_test, y_test) third_test(X_train, y_train, X_test, y_test) #fourth_test(X_train, y_train, X_test, y_test) #fifth_test(X_train, y_train, X_test, y_test) #sixth_test(X_train, y_train, X_test, y_test) #X, y = SMOTETomek(n_jobs=-1).fit_sample(X_LS, y_LS) #do_cv_RF(X, y) #score = cross_val_score(model, X_LS, y_LS, cv=10, scoring="roc_auc") # print(np.mean(score)) exit("No need to make submission now") with measure_time('Training'): model.fit(X_LS, y_LS) # PREDICTION TS = load_from_csv(args.ts) X_TS = create_fingerprints(TS["SMILES"].values) # Predict y_pred = model.predict_proba(X_TS)[:, 1] # Estimated AUC of the model auc_predicted = 0.75 # it seems a bit pessimistic, right? # Making the submission file fname = make_submission(y_pred, auc_predicted, 'Bagging_model') print('Submission file "{}" successfully written'.format(fname))
def model_baseline3(x_train, y_train, x_test, y_test):
    """Compare BaggingClassifier and BalancedBaggingClassifier on one split.

    Both models are fitted with ``random_state=0``.  Their ROC curves
    (interpolated on a 100-point grid, labelled with AUC and log-loss of the
    two-decimal-rounded scores) are drawn on the figure named 'Bagging', and
    a confusion matrix is drawn on a separate figure for each model.

    Returns ``(cm1, cm2, fig)``: the two confusion-matrix figures (plain
    bagging first) and the ROC figure.
    """

    def _roc_summary(estimator):
        # Scores are the second predict_proba column rounded to two decimals.
        scores = [
            float('%.2f' % p) for p in estimator.predict_proba(x_test)[:, 1]
        ]
        loss = log_loss(y_test, scores)
        fpr, tpr, _ = roc_curve(y_test, scores)
        grid = np.linspace(0, 1, 100)
        return grid, interp(grid, fpr, tpr), auc(fpr, tpr), loss

    def _confusion_figure(estimator, title):
        # The matrix is computed first, then drawn on a brand-new figure.
        matrix = confusion_matrix(y_test, estimator.predict(x_test))
        figure = plt.figure()
        plot_confusion_matrix(matrix, classes=[0, 1], title=title)
        return figure

    bagging = BaggingClassifier(random_state=0)
    balanced_bagging = BalancedBaggingClassifier(random_state=0)
    bagging.fit(x_train, y_train)
    balanced_bagging.fit(x_train, y_train)

    contenders = (
        (bagging, 'base_Bagging', 'Confusion matrix of BaggingClassifier'),
        (balanced_bagging, 'base_Balanced_Bagging',
         'Confusion matrix of BalancedBagging'),
    )

    fig = None
    ax = None
    cm_figures = []
    for estimator, name, cm_title in contenders:
        mean_fpr, mean_tpr, x_auc, loss_val = _roc_summary(estimator)

        # Creates the ROC figure on the first pass, re-selects it afterwards.
        roc_figure = plt.figure('Bagging')
        if ax is None:
            fig = roc_figure
            ax = fig.add_subplot(1, 1, 1)
        plt.plot(mean_fpr,
                 mean_tpr,
                 linestyle='--',
                 label='{} (area = %0.2f, logloss = %0.2f)'.format(name) %
                 (x_auc, loss_val),
                 lw=2)

        cm_figures.append(_confusion_figure(estimator, cm_title))

    cm1, cm2 = cm_figures

    # Back to the ROC figure for the chance diagonal and the cosmetics.
    plt.figure('Bagging')
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', label='Luck')

    # make nice plotting
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    for side in ('left', 'bottom'):
        ax.spines[side].set_position(('outward', 10))

    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()
    return cm1, cm2, fig
def buildModel(X, y):
    """Fit five classifiers on a 70/30 split and report their test metrics.

    ``X`` is standardised with a StandardScaler fitted on all of ``X`` and
    then split (``test_size=0.3``, ``random_state=19``).  Balanced bagging,
    SVM, MLP, AdaBoost and logistic regression are fitted.  For each (in
    that order) the Matthews correlation coefficient, the confusion matrix
    and the imbalanced classification report are printed.  ROC plots are
    written under ``Plots/``.

    Every ``print`` takes a single parenthesised argument so the function
    produces the same output on Python 2 and Python 3 (the bare ``print x``
    statements used before are syntax errors on Python 3).
    """
    # X = np.reshape(X,(X.shape[0],X.shape[1] * X.shape[2]))
    print("%s %s" % (X.shape, y.shape))

    scaler = StandardScaler()
    print(scaler.fit(X))
    scaled_train_x = scaler.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(scaled_train_x,
                                                        y,
                                                        random_state=19,
                                                        test_size=0.3)

    bag = BalancedBaggingClassifier(n_estimators=200, random_state=19)
    svm = SVC(class_weight='balanced',
              random_state=19,
              decision_function_shape='ovr')
    neural = MLPClassifier(max_iter=500,
                           random_state=19,
                           solver='lbfgs',
                           alpha=1e-5,
                           hidden_layer_sizes=(49, 8, 4))
    ada = AdaBoostClassifier(n_estimators=100, random_state=19)
    logistic = LogisticRegression(solver='lbfgs', max_iter=500)

    # Fitting, prediction and reporting all follow this order.
    classifiers = [bag, svm, neural, ada, logistic]
    for classifier in classifiers:
        classifier.fit(X_train, y_train)

    predictions = [classifier.predict(X_test) for classifier in classifiers]

    # One metric at a time across all models, matching the report layout.
    for metric in (matthews_corrcoef, confusion_matrix,
                   classification_report_imbalanced):
        for y_pred in predictions:
            print(metric(y_test, y_pred))

    probs_ada = ada.predict_proba(X_test)
    probs_bag = bag.predict_proba(X_test)
    probs_neural = neural.predict_proba(X_test)
    probs_logistic = logistic.predict_proba(X_test)
    # The SVC contributes decision_function scores, not predict_proba.
    probs_svm = svm.decision_function(X_test)

    ROCplot(probs_ada, y_test, "Plots/ROCplotADA-organelle.png")
    ROCplot(probs_logistic, y_test, "Plots/ROCplotLogistic-organelle.png")
    ROCplot(probs_bag, y_test, "Plots/ROCplotBAG-organelle.png")
    ROCplot(probs_neural, y_test, "Plots/ROCplotNeural-organelle.png")
    ROCplot(probs_svm, y_test, "Plots/ROCplotSVM-organelle.png")
    multiROCplot(
        [probs_ada, probs_logistic, probs_bag, probs_neural, probs_svm],
        y_test, "Plots/multiROCplot.png",
        ['AdaBoost', 'Logistic', 'Bagging Classifier', 'MLP', 'SVM'])
class Models(object):
    """Book classifier; trains LightGBM or compares common ML models."""

    def __init__(self, feature_engineer=False):
        '''
        @description: initialize the feature extractors and the model(s)
        @param {type} :
        feature_engineer: whether using feature engineering,
                          if `False`, then compare common ML models
        Attributes set here:
        res_model: res network model
        resnext_model: resnext network model
        wide_model: wide res network model
        bert: bert model
        ml_data: new mldata class
        model: LGBMClassifier (only when feature_engineer is True)
        models: list of baseline classifiers (only when it is False)
        @return: No return
        '''
        print("load")
        # resnet152 is built without pretrained weights and then loaded from
        # a local state-dict file.
        self.res_model = torchvision.models.resnet152(pretrained=False)
        self.res_model.load_state_dict(
            torch.load(config.root_path +
                       '/model/resnet150/resnet152-b121ed2d.pth'))
        self.res_model = self.res_model.to(config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True)
        self.resnext_model = self.resnext_model.to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(pretrained=True)
        self.wide_model = self.wide_model.to(config.device)

        # BERT tokenizer and model.
        self.bert_tonkenizer = BertTokenizer.from_pretrained(
            config.root_path + '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path + '/model/bert')
        self.bert = self.bert.to(config.device)

        self.ml_data = MLData(debug_mode=True)

        if feature_engineer:
            self.model = lgb.LGBMClassifier(objective='multiclass',
                                            device='gpu',
                                            n_jobs=10,
                                            num_class=33,
                                            num_leaves=30,
                                            reg_alpha=10,
                                            reg_lambda=200,
                                            max_depth=3,
                                            learning_rate=0.05,
                                            n_estimators=2000,
                                            bagging_freq=1,
                                            bagging_fraction=0.9,
                                            feature_fraction=0.8,
                                            seed=1440)
        else:
            self.models = [
                RandomForestClassifier(n_estimators=500,
                                       max_depth=5,
                                       random_state=0),
                LogisticRegression(solver='liblinear', random_state=0),
                MultinomialNB(),
                SVC(),
                lgb.LGBMClassifier(objective='multiclass',
                                   n_jobs=10,
                                   num_class=33,
                                   num_leaves=30,
                                   reg_alpha=10,
                                   reg_lambda=200,
                                   max_depth=3,
                                   learning_rate=0.05,
                                   n_estimators=2000,
                                   bagging_freq=1,
                                   bagging_fraction=0.8,
                                   feature_fraction=0.8),
            ]

    def feature_engineer(self):
        '''
        @description: This function is building all kinds of features
        @param {type} None
        @return:
        X_train, feature of train set
        X_test, feature of test set
        y_train, label of train set
        y_test, label of test set
        '''
        logger.info("generate embedding feature ")
        train_tfidf, test_tfidf, train, test = get_embedding_feature(
            self.ml_data)

        logger.info("generate basic feature ")
        # Basic NLP features.
        train = get_basic_feature(train)
        test = get_basic_feature(test)
        print(test.loc[0])

        logger.info("generate modal feature ")
        # File names of the available book covers; a set keeps the per-row
        # membership test below constant-time.
        cover_dir = config.root_path + '/data/book_cover/'
        cover = set(os.listdir(cover_dir))

        def cover_path(title):
            # Path of the cover matching the title, or '' when none exists.
            file_name = title + '.jpg'
            return cover_dir + file_name if file_name in cover else ''

        train['cover'] = train.title.progress_apply(cover_path)
        test['cover'] = test.title.progress_apply(cover_path)

        # Only the resnet embedding is computed in this method; the resnext
        # and wide-resnet models loaded in __init__ are not used here.
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        print(len(test.loc[0, 'res_embedding']))

        logger.info("generate bert feature ")
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))
        print(test.loc[0])

        logger.info("generate lda feature ")
        lda = self.ml_data.em.lda
        # Bag-of-words representation of the stop-word-filtered tokens.
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: lda.id2word.doc2bow(x))
        print(test['queryCutRMStopWord'])
        print(test['bow'])
        # LDA embedding computed on top of the bag of words.
        train['lda'] = [get_lda_features(lda, doc) for doc in train['bow']]
        test['lda'] = [get_lda_features(lda, doc) for doc in test['bow']]
        print(test['lda'])
        print(test.loc[0])

        logger.info("formate data")
        print(test)
        print(test_tfidf)
        train, test = formate_data(train, test, train_tfidf, test_tfidf)
        print(test)
        print(test.loc[0])

        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        print(cols)
        X_train = train[cols]
        X_test = test[cols]
        print(X_test)
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        print(y_test)
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        '''
        @description: use param search tech to find best param
        @param {type}
        search_method: two options. grid or bayesian optimization
        @return: the result of bayes_parameter_opt_lgb for 'bayesian';
                 None otherwise ('grid' replaces self.model instead)
        '''
        param = None
        if search_method == 'grid':
            logger.info("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            logger.info("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            # The value needs a placeholder; passing it as a bare extra
            # argument makes the logging module report a formatting error.
            logger.info("best param %s", param)
        return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        '''
        @description: handle unbalance data, then search best param,
                      fit, evaluate and save the model
        @param {type}
        imbalance_method, three option, under_sampling for ClusterCentroids,
        SMOTE for over_sampling, ensemble for BalancedBaggingClassifier
        search_method: two options. grid or bayesian optimization
        @return: None
        '''
        logger.info("get all freature")
        (self.X_train, self.X_test, self.y_train,
         self.y_test) = self.feature_engineer()
        model_name = None

        if imbalance_method == 'over_sampling':
            logger.info("Use SMOTE deal with unbalance data ")
            print(self.y_train)
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            print(self.y_train)
            # Resample the test split from the test data itself; it used to
            # be overwritten with resampled *training* data, so the model
            # was evaluated on the samples it was fitted on.
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            logger.info("Use ClusterCentroids deal with unbalance data ")
            print(self.X_train)
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            print(self.X_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'

        logger.info('search best param')
        if imbalance_method != 'ensemble':
            param = self.param_search(search_method=search_method)
            # Grid search replaces self.model and yields no parameter dict;
            # only the bayesian result has values to apply.
            if param is not None:
                param['params']['num_leaves'] = int(
                    param['params']['num_leaves'])
                param['params']['max_depth'] = int(
                    param['params']['max_depth'])
                self.model = self.model.set_params(**param['params'])

        logger.info('fit model ')
        self.model.fit(self.X_train, self.y_train)

        # Predict on both splits and compute the scores.
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)
        # First value returned by get_score, logged as train accuracy.
        logger.info('Train accuracy %s' % per)
        # Test-set accuracy.
        logger.info('test accuracy %s' % acc)
        # Recall.
        logger.info('test recall %s' % recall)
        # F1 score.
        logger.info('test F1_score %s' % f1)
        self.save(model_name)

    def model_select(self,
                     X_train,
                     X_test,
                     y_train,
                     y_test,
                     feature_method='tf-idf'):
        '''
        @description: using different embedding feature to train common ML models
        @param {type}
        X_train, feature of train
        X_test, feature of test set
        y_train, label of train set
        y_test, label of test set
        feature_method, three options , tfidf, word2vec and fasttext
                        (not read by this method)
        @return: None
        '''
        for model in self.models:
            model_name = model.__class__.__name__
            print(model_name)
            clf = model.fit(X_train, y_train)
            Test_predict_label = clf.predict(X_test)
            Train_predict_label = clf.predict(X_train)
            per, acc, recall, f1 = get_score(y_train, y_test,
                                             Train_predict_label,
                                             Test_predict_label)
            # First value returned by get_score, logged as train accuracy.
            logger.info(model_name + '_' + 'Train accuracy %s' % per)
            # Test-set accuracy.
            logger.info(model_name + '_' + ' test accuracy %s' % acc)
            # Recall.
            logger.info(model_name + '_' + 'test recall %s' % recall)
            # F1 score.
            logger.info(model_name + '_' + 'test F1_score %s' % f1)

    def predict(self, title, desc):
        '''
        @description: predict the category of a book from its title and desc
        @param {type}
        title, input
        desc: input
        @return: (label, proba) where proba is the largest predicted
                 probability
        '''
        # NOTE(review): `self.process` and `self.ix2label` are not defined
        # anywhere in this class as shown -- confirm where they come from.
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        '''
        @description: save model
        @param {type}
        model_name, file name for saving
        @return: None
        '''
        # config.root_path is the base directory used everywhere else in
        # this class.
        joblib.dump(self.model,
                    config.root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        '''
        @description: load model
        @param {type}
        path: model path
        @return: None
        '''
        self.model = joblib.load(path)
random_state=0, n_estimators=num_emtimators, replacement=True, n_jobs=num_jobs) balanced_RF.fit(xx_train, yy_train) y_pred = balanced_RF.predict(xx_test) print('testdataset-BalancedBaggingClassifier:') print(classification_report_imbalanced(yy_test, y_pred)) y_pred = balanced_RF.predict(xx_train) print('traindataset-BalancedBaggingClassifier:') print(classification_report_imbalanced(yy_train, y_pred)) yy_probability = balanced_RF.predict_proba(xx_test) listFilePath_test = rootpath + 'testlist.list' L_file = open(listFilePath_test, 'r') domaindata_path = '/home/shiqiang/feature_extraction/DeepDomFeatures/train/' k = 0 startLen = 0 for line in L_file: if line.strip() == "": continue chain_name = line.split()[0] labelPath = domaindata_path + chain_name + '/' + chain_name + 'new.label' test_label = np.loadtxt(labelPath, dtype=np.int64) seqLength = test_label.shape[0] if seqLength > 700: seqLength = 700 endLen = startLen + seqLength
AUC_model3(best_clf, X_train, y_train, X_test, y_test, n_classes) # ### Make prediction # Data to be predicted data_predict = data2pred.drop(['pat', 'indication'], axis=1) data_predict_index = data_predict.index data_predict_pipeline = pd.DataFrame(pipeline.fit_transform(data_predict)) print(data_predict_pipeline.shape) data_predict_pipeline.index = data_predict_index data_predict_pipeline.head() # Final prediction dataset needs to have the same contents as the training and testing set pred_final_model = pd.DataFrame( best_clf.predict(data_predict_pipeline)) ## predicted indications pred_final_prob = pd.DataFrame( best_clf.predict_proba(data_predict_pipeline)) ## predicted probabilities pred_final_model.index = data_predict_index pred_final_prob.index = data_predict_index # Plot top features (use RandomUnderSampler which maybe a bit different from BalancedBaggingClassifier) rus = RandomUnderSampler(random_state=1) X_resampled, y_resampled = rus.fit_sample(X_train, y_train) # Adapt X_train, y_train X_train2 = X_resampled.copy() y_train2 = y_resampled.copy() GBM_clf.fit(X_train2, y_train2) # Plot top features feature_importances = pd.concat([ pd.DataFrame(x_data_pipeline.columns),
def run_training(fold_):
    """Fit a balanced-bagging logistic regression on one fold and score it.

    Rows whose ``kfold`` column equals ``fold_`` form the hold-out set; every
    other row is used for training.  Features are standardised with
    statistics computed on the training rows only, and the target is
    collapsed to the two labels ``'open'`` and ``'closed'`` before fitting.

    Args:
        fold_: Value of the ``kfold`` column selecting the hold-out rows.

    Returns:
        A ``(predictions, roc_auc)`` tuple.  ``predictions`` is the hold-out
        frame restricted to the columns ``id``, ``target``, ``kfold`` and
        ``lr_bagging_pred``; ``roc_auc`` is the ROC AUC of
        ``lr_bagging_pred`` against the binary hold-out target.
    """
    t0 = time.time()
    df = pd.read_hdf(path_or_buf="../input/tiny_data/full_data_folds.h5",
                     key='dataset')
    print("time to read file", time.time() - t0)

    print(f"fold: {fold_}")
    t0 = time.time()
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)

    # Features: everything except the fold marker and the label.
    xtrain = train_df.drop(["kfold", "target"], axis=1)
    xtest = test_df.drop(["kfold", "target"], axis=1)

    # Fit the scaler on the training rows only so that no hold-out
    # statistics leak into the model.
    sc = StandardScaler()
    sc.fit(xtrain)
    xtrain = sc.transform(xtrain)
    xtest = sc.transform(xtest)

    # Make the target binary: anything that is not 'open' becomes 'closed'.
    train_df.target = train_df.target.apply(
        lambda x: 'open' if x == 'open' else 'closed')
    test_df.target = test_df.target.apply(
        lambda x: 'open' if x == 'open' else 'closed')
    ytrain = train_df.target
    ytest = test_df.target

    n_estimators = 500
    model = BalancedBaggingClassifier(
        linear_model.LogisticRegression(penalty='l2',
                                        C=10,
                                        class_weight='balanced',
                                        max_iter=5000,
                                        solver='liblinear'),
        n_estimators=n_estimators,
        n_jobs=-1,
        max_samples=0.2,
        max_features=0.6,
    )
    model.fit(xtrain, ytrain)

    preds = model.predict(xtest)
    # Column 1 is the probability of model.classes_[1]; scikit-learn sorts
    # class labels, so with 'closed'/'open' that is the 'open' class.
    preds_proba = model.predict_proba(xtest)[:, 1]
    print('time to fit model:', time.time() - t0)

    conf_m = confusion_matrix(ytest, preds)
    print("confusion m\n", conf_m)
    roc_score = roc_auc_score(ytest, preds_proba)
    print('ROC AUC score\n', roc_score)

    test_df.loc[:, "lr_bagging_pred"] = preds_proba
    # The score is returned directly; it used to be averaged out of a
    # one-element [[fold, auc]] list, which also required a numeric fold id.
    return test_df[["id", "target", "kfold", "lr_bagging_pred"]], roc_score
from imblearn.ensemble import EasyEnsemble
from sklearn.metrics import recall_score, precision_score
from sklearn.tree import DecisionTreeClassifier
from load_data import load_data
import logistic_regression
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from imblearn.ensemble import BalancedBaggingClassifier
from roc import calculate_roc, evaluate

# Script entry point: fit a balanced bagging ensemble of decision trees on
# one train/test split of the "car-vgood" data set and hand the predicted
# class probabilities to ``evaluate``.
if __name__ == '__main__':
    X_train, y_train = load_data(
        './dataset/car/car-vgood-5-fold/car-vgood-5-2tra.dat')
    X_test, y_test = load_data(
        './dataset/car/car-vgood-5-fold/car-vgood-5-2tst.dat')

    # Convert whatever load_data returned into numpy arrays.
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    y_test = np.array(y_test)

    # NOTE(review): ``ratio`` was renamed ``sampling_strategy`` in newer
    # imbalanced-learn releases - confirm against the installed version.
    classifier = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        ratio='auto',
        replacement=False,
        random_state=0,
    )
    classifier.fit(X_train, y_train)

    probabilities = classifier.predict_proba(X_test)
    evaluate(y_test, probabilities)
def method13_notRec(self, name, test_ids):
    """Build a recommendation list of at most 22 product ids per test user.

    For every user the previously seen products are ranked by a
    time-weighted sum of per-event weights.  For data sets other than 'C'
    and 'D' the products predicted as positive (averaged probability >= 0.5
    over five BalancedBaggingClassifier models) are moved to the front.
    For data sets other than 'D' the list is then topped up to 22 entries
    with the highest-scoring NMF candidates not already present.

    Args:
        name: Data-set key; the code distinguishes 'A', 'B', 'C' and 'D'.
        test_ids: Iterable of user ids to produce recommendations for.

    Returns:
        dict mapping each user id to its list of recommended product ids.
    """
    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    with open('../data/time_weight/fitting_balanced_' + name + '.pickle', 'rb') as f:
        time_weight = pickle.load(f)
    # Per data set weight of each event kind in the ranking score.
    parm_dic = {'A': {'conv': 0, 'click': 0.20701892, 'view': 0.78720054, 'cart': 0.19557122},
                'B': {'conv': 1, 'click': 0.43314098, 'view': 0.5480186, 'cart': 1},
                'C': {'conv': 0, 'click': 0, 'view': 0.71978554, 'cart': 1},
                'D': {'conv': 1, 'click': 0, 'view': 0.82985685, 'cart': 0}}
    if name != 'D':
        # NMF factors are only consumed by the top-up step, which 'D' skips.
        with open('../data/matrix/all_time_weighted_' + name + '.pickle', 'rb') as f:
            sparse_data = pickle.load(f)
        with open('../data/matrix/all_id_dic_time_weighted_' + name + '.pickle', 'rb') as f:
            id_dic = pickle.load(f)
        model = NMF(n_components=128, max_iter=1024, tol=0.001)
        user_feature_matrix = model.fit_transform(sparse_data)
        item_feature_matrix = model.components_
        # NOTE(review): the nesting of this block was reconstructed from a
        # whitespace-collapsed source; the classifiers are only used on the
        # non-'C', non-'D' path below - confirm against the original file.
        if name != 'C':
            with open('../data/conv_pred/train_data_' + name + '.pickle', 'rb') as f:
                data = pickle.load(f)
            with open('../data/conv_pred/test_X_cut_origin_' + name + '.pickle', 'rb') as f:
                name_dic_train = pickle.load(f)
            v = DictVectorizer()
            X = v.fit_transform(data['X'])
            y = np.array(data['y'])
            # Five models that differ only in their seed; their predicted
            # probabilities are averaged further down.
            forest = BalancedBaggingClassifier(n_estimators=500, n_jobs=1,random_state=777)
            forest.fit(X, y)
            forest2 = BalancedBaggingClassifier(n_estimators=500, n_jobs=1, random_state=1234)
            forest2.fit(X, y)
            forest3 = BalancedBaggingClassifier(n_estimators=500, n_jobs=1, random_state=1919)
            forest3.fit(X, y)
            forest4 = BalancedBaggingClassifier(n_estimators=500, n_jobs=1, random_state=114514)
            forest4.fit(X, y)
            forest5 = BalancedBaggingClassifier(n_estimators=500, n_jobs=1, random_state=334)
            forest5.fit(X, y)
            # Disabled "notRec" model, kept by the original author:
            # with open('../data/conv_pred/train_data_notRec_' + name + '.pickle', 'rb') as f:
            #     data = pickle.load(f)
            # X = v.transform(data['X'])
            # y = np.array(data['y'])
            #
            # notRecforest = BalancedBaggingClassifier(n_estimators=100, n_jobs=1)
            # notRecforest.fit(X, y)
    test_min = datetime.datetime(year=2017, month=5, day=1)
    predict_test = {}
    for i in tqdm.tqdm(test_ids):
        # Get the unique item ids this user interacted with.
        tmp_dict = {}
        past_items = pd.unique(self.personal_train[name][i]['product_id'])
        # Compute each product's weight from the past events.
        for j in past_items:
            tmp_dict[j] = 0
            for _, row in self.personal_train[name][i][
                    self.personal_train[name][i]['product_id'] == j].iterrows():
                # time_weight is indexed by the negated day offset between
                # the event and test_min.
                if row['event_type'] == 1:
                    tmp_dict[j] += parm_dic[name]['view'] * time_weight[
                        -1 * (row['time_stamp'] - test_min).days]
                elif row['event_type'] == 0:
                    tmp_dict[j] += parm_dic[name]['cart'] * time_weight[
                        -1 * (row['time_stamp'] - test_min).days]
                elif row['event_type'] == 2:
                    tmp_dict[j] += parm_dic[name]['click'] * time_weight[
                        -1 * (row['time_stamp'] - test_min).days]
                elif row['event_type'] == 3:
                    tmp_dict[j] += parm_dic[name]['conv'] * time_weight[
                        -1 * (row['time_stamp'] - test_min).days]
        # Product ids ordered by descending accumulated weight.
        sorted_list = sorted(tmp_dict.items(), key=itemgetter(1), reverse=True)
        sorted_list = [x for x, y in sorted_list]
        # Same list object as sorted_list (an alias, not a copy).
        old_set = sorted_list
        if name == 'D':
            if len(sorted_list) > 22:
                sorted_list = sorted_list[:22]
            predict_test[i] = sorted_list
        else:
            if name != 'C':
                # Keep only the products for which a non-empty feature entry exists.
                sorted_list2 = []
                input_data = []
                for k in sorted_list:
                    if k in name_dic_train[i].keys() and len(name_dic_train[i][k]) != 0:
                        sorted_list2.append(k)
                        input_data.append(name_dic_train[i][k])
                if len(input_data) != 0:
                    X = v.transform(input_data)
                    pred = forest.predict_proba(X)[:,1]
                    pred2 = forest2.predict_proba(X)[:, 1]
                    pred3 = forest3.predict_proba(X)[:, 1]
                    pred4 = forest4.predict_proba(X)[:, 1]
                    pred5 = forest5.predict_proba(X)[:, 1]
                    # Average the five models' column-1 probabilities.
                    pred=(pred+pred2+pred3+pred4+pred5)/5
                    #pred_notRec = notRecforest.predict_proba(X)[:,1]
                    conv_list = []
                    rec_list=[]
                    mysort = sorted(zip(sorted_list2, pred), key=lambda x: x[1], reverse=True)
                    #notRecsort = sorted(zip(sorted_list2, pred_notRec), key=lambda x: x[1], reverse=False)
                    # for k in range(len(notRecsort)):
                    #     if notRecsort[k][1] >= 0.5:
                    #         conv_list.append(notRecsort[k][0])
                    # Products with averaged probability >= 0.5 come first,
                    # ordered by descending probability ...
                    for k in range(len(mysort)):
                        if mysort[k][1] >= 0.5:
                            rec_list.append(mysort[k][0])
                    # ... followed by the remaining products in weight order.
                    for k in old_set:
                        if k not in rec_list:
                            rec_list.append(k)
                    sorted_list = rec_list
            if len(sorted_list) > 22:
                sorted_list = sorted_list[:22]
            # elif name == 'A':
            #     for k in conv_list:
            #         if len(sorted_list) >= 22:
            #             break
            #         if k not in sorted_list:
            #             sorted_list.append(k)
            # Number of slots still free after the history-based ranking.
            nmf_number = 22 - len(sorted_list)
            if len(sorted_list) > 22:
                sorted_list = sorted_list[:22]
            if nmf_number > 0:
                # Score every product for this user from the NMF factors.
                est_user_eval = np.dot(user_feature_matrix[id_dic['user_id'].index(i)], item_feature_matrix)
                # est_user_eval = cm.dot(cm.CUDAMatrix(user_feature_matrix[id_dic['user_id'].index(i):id_dic['user_id'].index(i) + 1]),cm.CUDAMatrix(item_feature_matrix)).asarray()[0]
                tmp = sorted(zip(est_user_eval, id_dic['product_id']), key=lambda x: x[0], reverse=True)
                predict = list(zip(*tmp))[1]
                add_list = []
                num = 0
                # NOTE(review): indexing predict[num] raises IndexError if the
                # candidates run out before nmf_number new products are found.
                while len(add_list) != nmf_number:
                    if predict[num] not in sorted_list:
                        add_list.append(predict[num])
                    num += 1
                sorted_list.extend(add_list)
            predict_test[i] = sorted_list
    return predict_test
class Models(object):
    """Feature pipeline and classical ML models for book classification.

    In training mode the instance holds either one LightGBM classifier
    (``self.model``) or a list of baseline classifiers (``self.models``).
    In prediction mode a saved model and the index-to-label mapping
    (``self.ix2label``) are loaded instead.
    """

    def __init__(self,
                 model_path=None,
                 feature_engineer=False,
                 train_mode=True):
        """Load the embedding backbones and set up the classifier(s).

        Args:
            model_path: Path of a saved model; only read when ``train_mode``
                is false.
            feature_engineer: In training mode, ``True`` creates a single
                LightGBM classifier in ``self.model``; ``False`` creates the
                list ``self.models`` of baseline classifiers to compare.
            train_mode: ``True`` to train, ``False`` to load ``model_path``
                and the label mapping for prediction.
        """
        # Image backbones (ResNet, ResNeXt, Wide ResNet), moved to
        # config.device.
        self.res_model = torchvision.models.resnet152(pretrained=True).to(
            config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True).to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(
            pretrained=True).to(config.device)
        # BERT tokenizer and model, the model moved to config.device.
        self.bert_tonkenizer = BertTokenizer.from_pretrained(
            config.root_path + '/model/bert')
        self.bert = BertModel.from_pretrained(
            config.root_path + '/model/bert').to(config.device)
        # Per the original author's note, debug_mode=True makes MLData use
        # only part of the data; train_mode says whether we are training.
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)
        if not train_mode:
            # Prediction only: load the trained model and the label mapping.
            self.load(model_path)
            with open(config.root_path + '/data/label2id.json',
                      encoding='utf-8') as label_file:
                labelNameToIndex = json.load(label_file)
            self.ix2label = {v: k for k, v in labelNameToIndex.items()}
        else:
            # With feature engineering train LightGBM; otherwise compare a
            # set of classical machine-learning models.
            if feature_engineer:
                self.model = lgb.LGBMClassifier(objective='multiclass',
                                                n_jobs=10,
                                                num_class=33,
                                                num_leaves=30,
                                                reg_alpha=10,
                                                reg_lambda=200,
                                                max_depth=3,
                                                learning_rate=0.05,
                                                n_estimators=2000,
                                                bagging_freq=1,
                                                bagging_fraction=0.9,
                                                feature_fraction=0.8,
                                                seed=1440)
            else:
                self.models = [
                    RandomForestClassifier(n_estimators=500,
                                           max_depth=5,
                                           random_state=0),
                    LogisticRegression(solver='liblinear', random_state=0),
                    MultinomialNB(),
                    SVC(),
                    lgb.LGBMClassifier(objective='multiclass',
                                       n_jobs=10,
                                       num_class=33,
                                       num_leaves=30,
                                       reg_alpha=10,
                                       reg_lambda=200,
                                       max_depth=3,
                                       learning_rate=0.05,
                                       n_estimators=2000,
                                       bagging_freq=1,
                                       bagging_fraction=0.8,
                                       feature_fraction=0.8),
                ]

    def feature_engineer(self):
        """Build every feature for the train and dev splits of ``ml_data``.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` where the X frames hold
            all columns except ``labelIndex`` and the y series hold
            ``labelIndex`` cast to int.
        """
        print(" generate embedding feature ")
        # TF-IDF features and word2vec features.
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.tfidf,
                                                   self.ml_data.w2v)
        # Per the original author's notes, get_embedding_feature adds these
        # columns to the pandas frame:
        #   w2v             each word replaced by its w2v vector, [seq, 300]
        #   w2v_label_mean  sentence-embedding / label relation feature, [300]
        #   w2v_label_max   sentence-embedding / label relation feature, [300]
        #   w2v_mean        [seq, 300] -> [300]
        #   w2v_max         [seq, 300] -> [300]
        #   w2v_win_{2,3,4}_{mean,max}  sliding-window features, [300]
        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.tfidf,
                                                 self.ml_data.w2v)

        print("generate basic feature ")
        # Basic NLP features.
        train = get_basic_feature(train)
        test = get_basic_feature(test)

        print("generate lda feature ")
        # Bag-of-words rows, e.g. [(10, 1), (78, 1), (162, 3), ...].
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        # LDA embedding on top of the bag of words; per the original
        # author's note each row is a distribution over 30 topics.
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc),
                test['bow']))

        print("generate modal feature ")
        # File names of the available book covers; a set keeps the
        # per-row membership test cheap.
        cover = set(os.listdir(config.book_cover_path))
        # Match a cover to each title ('' when there is none).
        train['cover'] = train['title'].progress_apply(
            lambda x: config.book_cover_path + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test.title.progress_apply(
            lambda x: config.book_cover_path + x + '.jpg'
            if x + '.jpg' in cover else '')

        # Image embeddings of the cover from each backbone.
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        train['resnext_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        test['resnext_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        train['wide_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        test['wide_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))

        # TODO: autoencoder features (get_autoencoder_feature, taken from
        # the encoder rather than the decoder) are not wired in yet.

        print("formate data")
        # Concatenate all features.
        train = formate_data(train, train_tfidf)
        test = formate_data(test, test_tfidf)

        # Split into model inputs and labels.
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        X_train = train[cols]
        X_test = test[cols]
        print(X_test)
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        """Search hyper-parameters by grid search or Bayesian optimisation.

        Args:
            search_method: ``'grid'`` replaces ``self.model`` with the result
                of ``Grid_Train_model``; ``'bayesian'`` runs
                ``bayes_parameter_opt_lgb`` on the training data.

        Returns:
            The parameters found by the Bayesian search, or ``None`` when
            the chosen method does not produce a parameter dict.
        """
        param = None
        if search_method == 'grid':
            print("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            print("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            print("best param", param)
        return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        """Build features, rebalance the data, train, report and save.

        Args:
            imbalance_method: ``'over_sampling'`` (SMOTE),
                ``'under_sampling'`` (ClusterCentroids) or ``'ensemble'``
                (BalancedBaggingClassifier over decision trees).
            search_method: Forwarded to :meth:`param_search`; not used for
                ``'ensemble'``.
        """
        print("get all feature")
        # Generate every feature.
        self.X_train, self.X_test, self.y_train, self.y_test = \
            self.feature_engineer()
        model_name = None
        # Choose how to deal with the class imbalance.
        if imbalance_method == 'over_sampling':
            print("Use SMOTE deal with unbalance data ")
            # https://www.zhihu.com/question/269698662
            # https://www.cnblogs.com/kamekin/p/9824294.html
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            # Resample the test split from the test data.  This used to
            # resample the training data again, so the model was evaluated
            # on its own training samples.
            # NOTE(review): resampling the evaluation split changes its
            # class distribution, as the under-sampling branch also does.
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            print("Use ClusterCentroids deal with unbalance data")
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        print('search best param')
        if imbalance_method != 'ensemble':
            param = self.param_search(search_method=search_method)
            # Grid search updates self.model itself and yields no parameter
            # dict, so only apply parameters when some were returned.
            if param is not None:
                param['params']['num_leaves'] = int(
                    param['params']['num_leaves'])
                param['params']['max_depth'] = int(
                    param['params']['max_depth'])
                self.model = self.model.set_params(**param['params'])
        print('fit model ')
        # Train and report the scores.
        self.model.fit(self.X_train, self.y_train)
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)
        print('Train accuracy %s' % per)
        print('test accuracy %s' % acc)
        print('test recall %s' % recall)
        print('test F1_score %s' % f1)
        self.save(model_name)

    def model_select(self,
                     X_train,
                     X_test,
                     y_train,
                     y_test,
                     feature_method='tf-idf'):
        """Fit every model in ``self.models`` and print its scores.

        Args:
            X_train, X_test: Training and test features.
            y_train, y_test: Training and test labels.
            feature_method: Not used by this method.
        """
        for model in self.models:
            model_name = model.__class__.__name__
            print(model_name)
            clf = model.fit(X_train, y_train)
            Test_predict_label = clf.predict(X_test)
            Train_predict_label = clf.predict(X_train)
            per, acc, recall, f1 = get_score(y_train, y_test,
                                             Train_predict_label,
                                             Test_predict_label)
            print(model_name + '_' + 'Train accuracy %s' % per)
            print(model_name + '_' + ' test accuracy %s' % acc)
            print(model_name + '_' + 'test recall %s' % recall)
            print(model_name + '_' + 'test F1_score %s' % f1)

    def process(self, title, desc):
        """Turn one (title, desc) pair into the model's feature frame.

        Args:
            title: Book title.
            desc: Book description.

        Returns:
            A one-row frame holding every feature column except
            ``labelIndex``.
        """
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        # Fetch the stop words once instead of once per word.
        stop_words = get_stop_word_list()
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x: [word for word in x if word not in stop_words])

        df_tfidf, df = get_embedding_feature(df, self.ml_data.tfidf,
                                             self.ml_data.w2v)

        print("generate basic feature ")
        df = get_basic_feature(df)

        print("generate modal feature ")
        # No cover image is available at prediction time.
        df['cover'] = ''
        df['res_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        df['resnext_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        df['wide_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer,
                                             self.bert))

        print("generate lda feature ")
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        df['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc), df.bow))

        # Autoencoder features are not computed; the message is kept as is.
        print("generate autoencoder feature ")

        print("formate data")
        # Placeholder label so the frame has the same columns as in training.
        df['labelIndex'] = 1
        df = formate_data(df, df_tfidf)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        X_train = df[cols]
        return X_train

    def predict(self, title, desc):
        """Predict the category of a book from its title and description.

        Args:
            title: Book title.
            desc: Book description.

        Returns:
            ``(label, proba)``: the predicted label name and the largest
            predicted class probability.
        """
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        """Dump ``self.model`` to ``<root_path>/model/ml_model/<model_name>``.

        Args:
            model_name: File name to save the model under.
        """
        # Use config.root_path like the rest of the class.
        joblib.dump(self.model,
                    config.root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        """Load a model saved with :meth:`save` into ``self.model``.

        Args:
            path: Path of the saved model file.
        """
        self.model = joblib.load(path)
# Lazily load one validation matrix per instance.
val_vecs = (joblib.load("./vectorized_data/val_%s" % i) for i in instances)

# Each entry is an (n_estimators, n_jobs) pair for BalancedBaggingClassifier.
params = [(100, -1), (50, -1)]

for data in train_vecs:
    # Advance the validation generator in step with the training data so
    # both always refer to the same instance.
    val_vec = next(val_vecs)
    train_matrix = data[0]
    data_file = data[1]

    for param in params:
        n_estimators, n_jobs = param
        print("Processing %s %s" % (data_file, n_estimators))

        # NOTE(review): ``ratio`` was renamed ``sampling_strategy`` in newer
        # imbalanced-learn releases - confirm against the installed version.
        bb_model = BalancedBaggingClassifier(ratio="not minority",
                                             n_estimators=n_estimators,
                                             n_jobs=n_jobs)
        print("Fitting...")
        bb_model.fit(train_matrix, train_labels)

        print("Testing...")
        preds = bb_model.predict_proba(val_vec)
        positive_proba = preds[:, 1]
        auc = roc_auc_score(val_labels, positive_proba)
        brier = brier_score_loss(val_labels, positive_proba)

        # NOTE(review): DataFrame.append no longer exists in pandas 2.x -
        # confirm the pandas version this script targets.
        result_row = {
            "data_file": data_file,
            "bb_n_est": n_estimators,
            "auc": auc,
            "brier": brier,
        }
        results = results.append(result_row, ignore_index=True)
        # Rewrite the CSV after every run so partial results are kept.
        results.to_csv("./classifier_results.csv", index=False)
        print("AUC: %.3f, BRIER: %.3f" % (auc, brier))