Code example #1
File: models.py  Project: zingp/bookClassification
    def model_select(self,
                     X_train,
                     X_test,
                     y_train,
                     y_test,
                     feature_method='tf-idf'):
        '''
        @description: use different embedding features to train common ML models
        @param {type}
        X_train, features of the train set
        X_test, features of the test set
        y_train, labels of the train set
        y_test, labels of the test set
        feature_method, three options: tfidf, word2vec and fasttext
        @return: None
        '''
        # Compare tfidf, word2vec and fasttext embeddings across common ML models
        for model in self.models:
            model_name = model.__class__.__name__
            print(model_name)
            clf = model.fit(X_train, y_train)
            Test_predict_label = clf.predict(X_test)
            Train_predict_label = clf.predict(X_train)
            per, acc, recall, f1 = get_score(y_train, y_test,
                                             Train_predict_label,
                                             Test_predict_label)
            # Log the training-set accuracy
            logger.info(model_name + '_' + 'Train accuracy %s' % per)

            # Log the test-set accuracy
            logger.info(model_name + '_' + 'Test accuracy %s' % acc)

            # Log the test-set recall
            logger.info(model_name + '_' + 'Test recall %s' % recall)

            # Log the test-set F1-score
            logger.info(model_name + '_' + 'Test F1_score %s' % f1)
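The method above unpacks a get_score helper into per, acc, recall and f1, so the helper must return train accuracy, test accuracy, test recall and test F1 in that order. Its body is not shown in these examples; the following is a minimal sketch of what it could look like, assuming multi-class labels and micro-averaged metrics (everything beyond the name and the argument order is an assumption):

    from sklearn.metrics import accuracy_score, f1_score, recall_score

    def get_score(y_train, y_test, train_pred, test_pred):
        # Hypothetical reconstruction: the callers unpack the result as
        # per, acc, recall, f1, i.e. (train accuracy, test accuracy,
        # test recall, test F1) in that order.
        train_acc = accuracy_score(y_train, train_pred)
        test_acc = accuracy_score(y_test, test_pred)
        test_recall = recall_score(y_test, test_pred, average='micro')
        test_f1 = f1_score(y_test, test_pred, average='micro')
        return train_acc, test_acc, test_recall, test_f1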
Code example #2
 def unbalance_helper(self,
                      imbalance_method='under_sampling',
                      search_method='grid'):
     '''
     @description: handle unbalanced data, then search for the best parameters
     @param {type}
     imbalance_method, three options: under_sampling (ClusterCentroids), over_sampling (SMOTE), ensemble (BalancedBaggingClassifier)
     search_method: two options, grid or bayesian optimization
     @return: None
     '''
     logger.info("get all freature")
     # 生成所有feature
     self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
     )
     model_name = None
     # Optionally rebalance the data: over-sampling, under-sampling, or ensemble
     ###########################################
     #          TODO: module 4 task 1.1        #
     ###########################################
     if imbalance_method == 'over_sampling':
         logger.info("Use SMOTE deal with unbalance data ")
         self.X_train, self.y_train = SMOTE().fit_resample(
             self.X_train, self.y_train)
         self.X_test, self.y_test = SMOTE().fit_resample(
             self.X_test, self.y_test)
         model_name = 'lgb_over_sampling'
     elif imbalance_method == 'under_sampling':
         logger.info("Use ClusterCentroids deal with unbalance data ")
         self.X_train, self.y_train = ClusterCentroids(
             random_state=0).fit_resample(self.X_train, self.y_train)
         self.X_test, self.y_test = ClusterCentroids(
             random_state=0).fit_resample(self.X_test, self.y_test)
         model_name = 'lgb_under_sampling'
     elif imbalance_method == 'ensemble':
         self.model = BalancedBaggingClassifier(
             base_estimator=DecisionTreeClassifier(),
             sampling_strategy='auto',
             replacement=False,
             random_state=0)
         model_name = 'ensemble'
     logger.info('search best param')
     # Use set_params to apply the best parameters found by the search
     if imbalance_method != 'ensemble':
         ###########################################
         #          TODO: module 4 task 1.2        #
         ###########################################
         # param = self.param_search(search_method=search_method)
         # param['params']['num_leaves'] = int(param['params']['num_leaves'])
         # param['params']['max_depth'] = int(param['params']['max_depth'])
         # Placeholder values standing in for the commented-out search above
         param = {'params': {'num_leaves': 3, 'max_depth': 5}}
         self.model = self.model.set_params(**param['params'])
     logger.info('fit model ')
     # Train the model and log its results
     self.model.fit(self.X_train, self.y_train)
     ###########################################
     #          TODO: module 4 task 1.3        #
     ###########################################
     Test_predict_label = self.model.predict(self.X_test)
     Train_predict_label = self.model.predict(self.X_train)
     per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                      Train_predict_label,
                                      Test_predict_label)
     # Log the training-set accuracy
     logger.info('Train accuracy %s' % per)
     # Log the test-set accuracy
     logger.info('test accuracy %s' % acc)
     # Log the test-set recall
     logger.info('test recall %s' % recall)
     # Log the test-set F1-score
     logger.info('test F1_score %s' % f1)
     self.save(model_name)
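The three branches above rely on the imbalanced-learn package: SMOTE for over-sampling, ClusterCentroids for under-sampling, and BalancedBaggingClassifier for the ensemble route. As a self-contained reference, here is a minimal sketch of the same calls on a toy dataset (the dataset and its 90/10 class weights are made up for illustration; newer imbalanced-learn releases rename base_estimator to estimator):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.tree import DecisionTreeClassifier
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import ClusterCentroids
    from imblearn.ensemble import BalancedBaggingClassifier

    # Toy 90/10 imbalanced dataset, purely for illustration.
    X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)

    # Over-sampling: SMOTE synthesizes new minority-class samples.
    X_over, y_over = SMOTE().fit_resample(X, y)
    # Under-sampling: ClusterCentroids replaces majority clusters by their centroids.
    X_under, y_under = ClusterCentroids(random_state=0).fit_resample(X, y)
    print(np.bincount(y), np.bincount(y_over), np.bincount(y_under))

    # Ensemble: no explicit resampling; each bagging draw is balanced internally.
    clf = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                    sampling_strategy='auto',
                                    replacement=False,
                                    random_state=0).fit(X, y)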
Code example #3
    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        '''
        @description: handle unbalanced data, then search for the best parameters
        @param {type}
        imbalance_method, three options: under_sampling (ClusterCentroids), over_sampling (SMOTE), ensemble (BalancedBaggingClassifier)
        search_method: two options, grid or bayesian optimization
        @return: None
        '''
        logger.info("get all freature")
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
        )
        model_name = None
        if imbalance_method == 'over_sampling':
            logger.info("Use SMOTE deal with unbalance data ")

            # 1. Use over-sampling to handle the class imbalance
            print(self.y_train)  # inspect labels before resampling
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            print(self.y_train)  # inspect labels after resampling
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
            model_name = 'lgb_over_sampling'

        elif imbalance_method == 'under_sampling':
            logger.info("Use ClusterCentroids deal with unbalance data ")

            # 1. Use under-sampling to handle the class imbalance
            print(self.X_train)  # inspect features before resampling
            #print(self.y_train)
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            print(self.X_train)  # inspect features after resampling
            #print(self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'

        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        logger.info('search best param')

        if imbalance_method != 'ensemble':
            param = self.param_search(search_method=search_method)
            param['params']['num_leaves'] = int(param['params']['num_leaves'])
            param['params']['max_depth'] = int(param['params']['max_depth'])
            self.model = self.model.set_params(**param['params'])

        logger.info('fit model ')
        self.model.fit(self.X_train, self.y_train)

        # 1. Predict labels for the test set
        # 2. Predict labels for the train set
        # 3. Compute precision, accuracy, recall and F1-score

        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)

        # Log the training-set accuracy
        logger.info('Train accuracy %s' % per)
        # Log the test-set accuracy
        logger.info('test accuracy %s' % acc)
        # Log the test-set recall
        logger.info('test recall %s' % recall)
        # Log the test-set F1-score
        logger.info('test F1_score %s' % f1)
        self.save(model_name)
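Example #3 calls self.param_search, which is defined elsewhere in the class; the hard-coded num_leaves and max_depth in example #2 merely stand in for its result. Since those are LightGBM hyperparameters, the grid branch plausibly looks something like the sketch below; the helper name, the grid values, and the use of GridSearchCV are all assumptions, with only the {'params': {...}} return shape fixed by the caller:

    from lightgbm import LGBMClassifier
    from sklearn.model_selection import GridSearchCV

    def param_search_sketch(X_train, y_train, search_method='grid'):
        # Hypothetical stand-in for self.param_search: grid-search two
        # LightGBM hyperparameters and return them in the {'params': {...}}
        # shape that unbalance_helper expects.
        if search_method != 'grid':
            raise NotImplementedError('only the grid branch is sketched here')
        grid = GridSearchCV(LGBMClassifier(),
                            param_grid={'num_leaves': [15, 31, 63],
                                        'max_depth': [3, 5, 7]},
                            cv=3, scoring='f1_micro')
        grid.fit(X_train, y_train)
        return {'params': grid.best_params_}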