def model_select(self, X_train, X_test, y_train, y_test, feature_method='tf-idf'):
    '''
    @description: train every candidate ML model in self.models on the given
                  embedding features and log train/test metrics for each.
    @param {type}
    X_train, feature matrix of the train set
    X_test, feature matrix of the test set
    y_train, labels of the train set
    y_test, labels of the test set
    feature_method, which embedding produced the features: tfidf, word2vec or fasttext
    @return: None
    '''
    # Compare tfidf / word2vec / fasttext embeddings across the common ML models.
    for candidate in self.models:
        name = candidate.__class__.__name__
        print(name)
        fitted = candidate.fit(X_train, y_train)
        # Predict on both splits so we can report train vs. test performance.
        pred_test = fitted.predict(X_test)
        pred_train = fitted.predict(X_train)
        per, acc, recall, f1 = get_score(y_train, y_test, pred_train, pred_test)
        # Train-set accuracy
        logger.info(name + '_' + 'Train accuracy %s' % per)
        # Test-set accuracy
        logger.info(name + '_' + ' test accuracy %s' % acc)
        # Test-set recall
        logger.info(name + '_' + 'test recall %s' % recall)
        # Test-set F1-score
        logger.info(name + '_' + 'test F1_score %s' % f1)
def unbalance_helper(self, imbalance_method='under_sampling', search_method='grid'):
    '''
    @description: handle unbalanced data, then set model params and train.
    @param {type}
    imbalance_method, three options: under_sampling for ClusterCentroids,
                      over_sampling for SMOTE, ensemble for BalancedBaggingClassifier
    search_method: two options, grid or bayesian optimization
    @return: None
    '''
    logger.info("get all feature")
    # Build all features for train/test splits.
    self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
    )
    model_name = None
    # Choose how to handle the class imbalance: over-sampling, under-sampling or ensemble.
    if imbalance_method == 'over_sampling':
        logger.info("Use SMOTE deal with unbalance data ")
        self.X_train, self.y_train = SMOTE().fit_resample(
            self.X_train, self.y_train)
        # BUG FIX: previously the test split was overwritten with a resample of
        # the TRAIN data, so evaluation ran on (resampled) training samples.
        # Resample the test split from itself instead.
        # NOTE(review): resampling the test set at all is methodologically
        # questionable — consider evaluating on the untouched test split.
        self.X_test, self.y_test = SMOTE().fit_resample(
            self.X_test, self.y_test)
        model_name = 'lgb_over_sampling'
    elif imbalance_method == 'under_sampling':
        logger.info("Use ClusterCentroids deal with unbalance data ")
        self.X_train, self.y_train = ClusterCentroids(
            random_state=0).fit_resample(self.X_train, self.y_train)
        self.X_test, self.y_test = ClusterCentroids(
            random_state=0).fit_resample(self.X_test, self.y_test)
        model_name = 'lgb_under_sampling'
    elif imbalance_method == 'ensemble':
        # Ensemble of decision trees with per-bag balanced resampling.
        self.model = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(),
            sampling_strategy='auto',
            replacement=False,
            random_state=0)
        model_name = 'ensemble'
    logger.info('search best param')
    # Apply (placeholder) hyper-parameters via set_params; the real search is
    # left as a TODO — see self.param_search(search_method=search_method).
    if imbalance_method != 'ensemble':
        # param = self.param_search(search_method=search_method)
        # param['params']['num_leaves'] = int(param['params']['num_leaves'])
        # param['params']['max_depth'] = int(param['params']['max_depth'])
        param = {'params': {'num_leaves': 3, 'max_depth': 5}}
        self.model = self.model.set_params(**param['params'])
    logger.info('fit model ')
    # Train, then report metrics on both splits.
    self.model.fit(self.X_train, self.y_train)
    Test_predict_label = self.model.predict(self.X_test)
    Train_predict_label = self.model.predict(self.X_train)
    per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                     Train_predict_label, Test_predict_label)
    # Train-set accuracy
    logger.info('Train accuracy %s' % per)
    # Test-set accuracy
    logger.info('test accuracy %s' % acc)
    # Test-set recall
    logger.info('test recall %s' % recall)
    # Test-set F1-score
    logger.info('test F1_score %s' % f1)
    self.save(model_name)
def unbalance_helper(self, imbalance_method='under_sampling', search_method='grid'):
    '''
    @description: handle unbalanced data, then search best params and train.
    @param {type}
    imbalance_method, three options: under_sampling for ClusterCentroids,
                      over_sampling for SMOTE, ensemble for BalancedBaggingClassifier
    search_method: two options, grid or bayesian optimization
    @return: None
    '''
    logger.info("get all feature")
    self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
    )
    model_name = None
    if imbalance_method == 'over_sampling':
        logger.info("Use SMOTE deal with unbalance data ")
        # 1. Handle imbalance via over-sampling (SMOTE).
        self.X_train, self.y_train = SMOTE().fit_resample(
            self.X_train, self.y_train)
        # BUG FIX: the test split was previously overwritten with a resample of
        # the TRAIN data, leaking training samples into evaluation. Resample the
        # test split from itself instead.
        # NOTE(review): resampling the test set at all is methodologically
        # questionable — consider evaluating on the untouched test split.
        self.X_test, self.y_test = SMOTE().fit_resample(
            self.X_test, self.y_test)
        model_name = 'lgb_over_sampling'
    elif imbalance_method == 'under_sampling':
        logger.info("Use ClusterCentroids deal with unbalance data ")
        # 1. Handle imbalance via under-sampling (ClusterCentroids).
        self.X_train, self.y_train = ClusterCentroids(
            random_state=0).fit_resample(self.X_train, self.y_train)
        self.X_test, self.y_test = ClusterCentroids(
            random_state=0).fit_resample(self.X_test, self.y_test)
        model_name = 'lgb_under_sampling'
    elif imbalance_method == 'ensemble':
        # Ensemble of decision trees with per-bag balanced resampling.
        self.model = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(),
            sampling_strategy='auto',
            replacement=False,
            random_state=0)
        model_name = 'ensemble'
    logger.info('search best param')
    if imbalance_method != 'ensemble':
        # Search for the best hyper-parameters, coerce the tree-structure
        # params to int (the search returns floats), and apply them.
        param = self.param_search(search_method=search_method)
        param['params']['num_leaves'] = int(param['params']['num_leaves'])
        param['params']['max_depth'] = int(param['params']['max_depth'])
        self.model = self.model.set_params(**param['params'])
    logger.info('fit model ')
    self.model.fit(self.X_train, self.y_train)
    # 1. Predict labels for the test set.
    # 2. Predict labels for the train set.
    # 3. Compute precision, accuracy, recall and F1-score.
    Test_predict_label = self.model.predict(self.X_test)
    Train_predict_label = self.model.predict(self.X_train)
    per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                     Train_predict_label, Test_predict_label)
    # Train-set accuracy
    logger.info('Train accuracy %s' % per)
    # Test-set accuracy
    logger.info('test accuracy %s' % acc)
    # Test-set recall
    logger.info('test recall %s' % recall)
    # Test-set F1-score
    logger.info('test F1_score %s' % f1)
    self.save(model_name)