def clear_un_img():
    """Delete collected images that are no longer referenced by any weibo data.

    Images live under ``collect/img``; an image is kept only when some weibo
    record under ``collect`` (or any of its sub-directories) still references
    it via its ``img`` field.
    """
    # Directory holding all downloaded images.
    all_img_url = os.path.join(RESOURCE_BASE_URL, "collect/img")
    # Weibo data under this directory decides which images must be kept.
    leave_img_url = os.path.join(RESOURCE_BASE_URL, "collect")
    if FileUtil.isempty(leave_img_url):
        # No weibo data at all -> nothing is referenced; drop every image.
        FileUtil.empty(all_img_url)
    else:
        all_imgs = FileUtil.listdir(all_img_url)
        # The data directory plus every sub-directory beneath it.
        dirs = [leave_img_url]
        for parent, dirnames, _ in os.walk(leave_img_url):
            for dirname in dirnames:
                dirs.append(os.path.join(parent, dirname))
        # Gather every image path still referenced by the weibo records.
        # A set makes the per-file membership test below O(1) instead of the
        # original O(n) list scan.
        leave_imgs = set()
        for dir_ in dirs:
            imglist = collect.read_weibo(dir_, isreadimg=True)
            leave_imgs.update(
                flatten([img.get("img") for img in imglist if img.get("img")]))
        # Remove the unreferenced images.  An explicit loop replaces the old
        # map(lambda ...) call: map() for side effects builds a throwaway list
        # on Python 2 and would silently do nothing on Python 3.
        for img_path in all_imgs:
            if img_path not in leave_imgs:
                os.remove(img_path)
def get_classificator(self, train_datas, class_label, iscrossvalidate=False,
                      isbalance=False, minority_target=None):
    """Train the underlying bayes classifier and return ``self``.

    :param train_datas: raw training samples; converted via ``self.fit_data``
    :param class_label: class label for each training sample
    :param iscrossvalidate: if True, restrict training to the indices saved by
        a previous ``cross_validation()`` run
    :param isbalance: if True, re-balance the training data with SMOTE
    :param minority_target: minority class(es); only used when ``isbalance``
    :return: self (fluent style, so calls can be chained)
    :raises ValueError: when ``iscrossvalidate`` is set but the saved index
        file does not exist or is empty
    """
    out = os.path.join(TEXT_OUT, "best_train_test_index/train_index.txt")
    if iscrossvalidate and (not FileUtil.isexist(out) or FileUtil.isempty(out)):
        raise ValueError("please use cross_validation() firstly")
    # fit data
    fit_train_datas = self.fit_data(train_datas)
    class_label = np.asarray(class_label)
    if iscrossvalidate:
        train_index = np.loadtxt(out, dtype=int)
    else:
        # Use every sample; np.arange avoids materialising a Python range
        # before converting it to an array.
        train_index = np.arange(fit_train_datas.shape[0])
    fit_train_datas, class_label = fit_train_datas[train_index], class_label[train_index]
    # No per-sample weighting is applied; kept explicit so the fit() call
    # below documents the full set of knobs in use.
    sample_weight = None
    # SMOTE over-sampling of the minority class(es).
    if isbalance:
        fit_train_datas, class_label = preprocessing.my_smote(
            fit_train_datas, class_label, minority_target, per=0.5)
    # Train the model.
    self.bayes.fit(fit_train_datas, class_label, sample_weight=sample_weight)
    return self
def get_classificator(self, train_datas, class_label, iscrossvalidate=False, isbalance=False, minority_target=None):
    """
    Build (train) the classifier.
    :param train_datas: raw training samples; converted via ``self.fit_data``
    :param class_label: class label for each training sample
    :param iscrossvalidate: whether to load the train indices saved by a
        previous cross_validation() run
    :param isbalance: whether to re-balance the training data (SMOTE)
    :param minority_target: minority class; only used when isbalance is True
    :return: self
    """
    out = os.path.join(TEXT_OUT, "best_train_test_index/train_index.txt")
    # Cross-validation mode requires the index file written by cross_validation().
    if iscrossvalidate and (not FileUtil.isexist(out) or FileUtil.isempty(out)):
        raise ValueError("please use cross_validation() firstly")
    # fit data
    fit_train_datas = self.fit_data(train_datas)
    class_label = np.asarray(class_label)
    if iscrossvalidate:
        train_index = np.loadtxt(out, dtype=int)
    else:
        # Default: train on every sample.
        train_index = np.array(range(fit_train_datas.shape[0]))
    fit_train_datas, class_label = fit_train_datas[train_index], class_label[train_index]
    sample_weight = None
    # SMOTE over-sampling when re-balancing is requested.
    if isbalance:
        fit_train_datas, class_label = preprocessing.my_smote(fit_train_datas, class_label, minority_target, per=0.5)
        # sample_weight = _weights._balance_weights(class_label)
    # sample_weight
    # if isbalance:
    #     sample_weight = _weights._balance_weights(class_label)
    # Train the model.
    self.bayes.fit(fit_train_datas, class_label, sample_weight=sample_weight)
    return self
def clear_un_img():
    """Delete collected images no longer referenced by any weibo record.

    Images are stored under ``collect/img``; the weibo data under ``collect``
    (including sub-directories) determines which images are still in use.
    """
    # Directory where the images are stored.
    all_img_url = os.path.join(RESOURCE_BASE_URL, "collect/img")
    # Images referenced by data under this directory must be kept.
    leave_img_url = os.path.join(RESOURCE_BASE_URL, "collect")
    if FileUtil.isempty(leave_img_url):
        # No data remains, so no image is referenced: clear the image dir.
        FileUtil.empty(all_img_url)
    else:
        all_imgs = FileUtil.listdir(all_img_url)
        # Walk the data directory tree and record every directory to scan.
        dirs = [leave_img_url]
        for parent, dirnames, _ in os.walk(leave_img_url):
            for dirname in dirnames:
                dirs.append(os.path.join(parent, dirname))
        # Referenced image paths, as a set for O(1) membership tests
        # (the original list made each test O(n)).
        leave_imgs = set()
        for dir_ in dirs:
            imglist = collect.read_weibo(dir_, isreadimg=True)
            leave_imgs.update(
                flatten([img.get("img") for img in imglist if img.get("img")]))
        # Delete the extra images.  Explicit loop instead of map(): map() for
        # side effects allocates a useless list on Python 2 and is lazy (no-op)
        # on Python 3.
        for img_path in all_imgs:
            if img_path not in leave_imgs:
                os.remove(img_path)
def _get_splited_train(self):
    """
    Return the word-segmented training set, preferring the cached file.

    The cache file name depends on the concrete subclass and on whether the
    subjective or the objective corpus is requested.
    :return: list of segmented sentences (each entry also carries its class
        information)
    """
    dir_ = os.path.join(TEXT_OUT, "split")
    if self.subjective:
        split_txt = os.path.join(dir_, self.__class__.__name__ + ".txt")
        training_datas = Load.load_training_balance()
    else:
        split_txt = os.path.join(dir_, self.__class__.__name__ + "_objective.txt")
        training_datas = Load.load_training_objective_balance()
    # Re-segment when forced (self.f) or when no usable cache exists.
    if self.f or not FileUtil.isexist(split_txt) or FileUtil.isempty(split_txt):
        # Load the training set; each sentence also carries its class info.
        splited_words_list = Feature.__split(flatten(training_datas))
        # splited_words_list = Feature.__del_low_frequency_word(splited_words_list)
        # Persist the segmentation result for later runs.
        FileUtil.write(split_txt, splited_words_list)
    else:
        splited_words_list = FileUtil.read(split_txt)
    return splited_words_list
# Build the weighted training matrix (skip when already sparse/weighted).
train = train_datas
if not sp.issparse(train_datas):
    train = feature.cal_weight_improve(train_datas, class_label)

# Load and prepare the test set for the bayes classifier.
test = Load.load_test_balance()
test_datas, test_label, _ = feature.get_key_words(test)
test = test_datas
# Build a data set suitable for bayes classification.
if not sp.issparse(test_datas):
    test = feature.cal_weight_improve(test_datas, test_label)

crossvalidate = False
# When not cross-validating, remember to adjust the train ratio in load_sample.py.
if crossvalidate:
    out = os.path.join(TEXT_OUT, "best_train_test_index/test_index.txt")
    # Generate the saved test indices on first use.
    if not FileUtil.isexist(out) or FileUtil.isempty(out):
        clf0 = Classification()
        clf0.cross_validation(train, class_label, score="recall")
    test_index = np.loadtxt(out, dtype=int)
    test = train[test_index]
    test_label = np.asanyarray(class_label)[test_index].tolist()

# Incremental-update strategies to compare, their plot labels and line styles.
method_options = ("second", "four", "five")
method_options_0 = ("B", "C", "D")
linestyle = (':', '--', '-')
plot.get_instance()
for i in range(len(method_options)):
    bayes = IncrBayes()
    clf = Classification(bayes=bayes)
    # Train the base classifier before the incremental phase.
    clf.get_classificator(train, class_label, iscrossvalidate=crossvalidate,
                          isbalance=False, minority_target=EMOTION_CLASS.keys())
def get_incr_classificator_thread(self, incr_datas, incr_class_label, test_datas, test_class_label):
    """
    Process the incremental part of the incremental Bayes classifier
    (thread-pool variant).
    :param incr_datas: [{"emorion-1-type": value, "sentence": {}},...]
        (emotion-1-type and sentence are optional)
    :param incr_class_label: labels of the incremental samples
    :param test_datas: test samples used to estimate the classification loss
    :param test_class_label: labels of the test samples
    :return: self
    """
    def func1(i0):
        # Variant that evaluates sample i0 on a deep copy of the classifier,
        # so the shared bayes parameters need no lock.
        c_true0 = incr_class_label[i0:i0 + 1][0]
        text0 = fit_incr_datas.getrow(i0)
        c_pred0 = self.predict(text0)[0]
        if c_true0 == c_pred0:
            loss0 = 0
        else:
            clf0 = copy.deepcopy(self)
            clf0.bayes.class_log_prior_, clf0.bayes.feature_log_prob_ = clf0.bayes.update(c_pred0, text0, copy=True)
            loss0 = clf0.metrics_my_zero_one_loss(test_datas)
            # clf0.bayes.class_log_prior_ = origin_class_log_prob_
            # clf0.bayes.feature_log_prob_ = origin_feature_log_prob_
        # lock1 guards the shared result lists.
        if lock1.acquire():
            text.append(text0)
            c_pred.append(c_pred0)
            loss.append(loss0)
            lock1.release()

    def func(i0):
        # Tentatively update the shared classifier with sample i0, measure the
        # loss, then roll the parameters back.  lock0 guards the bayes
        # parameters; lock1 guards the shared result lists.
        c_true0 = incr_class_label[i0:i0 + 1][0]
        text0 = fit_incr_datas.getrow(i0)
        c_pred0 = self.predict(text0)[0]
        if c_true0 == c_pred0:
            loss0 = 0
        else:
            if lock0.acquire():
                self.bayes.class_log_prior_, self.bayes.feature_log_prob_ = self.bayes.update(c_pred0, text0, copy=True)
                loss0 = self.metrics_my_zero_one_loss(test_datas)
                # Roll back the tentative update.
                self.bayes.class_log_prior_ = origin_class_log_prob_
                self.bayes.feature_log_prob_ = origin_feature_log_prob_
                lock0.release()
        if lock1.acquire():
            text.append(text0)
            c_pred.append(c_pred0)
            loss.append(loss0)
            lock1.release()

    print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
    # Write/read the fitted bayes parameters so later runs can skip the loop.
    dir_ = os.path.join(TEXT_OUT, "bayes_args")
    FileUtil.mkdirs(dir_)
    class_count_out = os.path.join(dir_, "class_count.txt")
    class_log_prob_out = os.path.join(dir_, "class_log_prob.txt")
    feature_count_out = os.path.join(dir_, "feature_count.txt")
    feature_log_prob_out = os.path.join(dir_, "feature_log_prob.txt")
    out = (class_count_out, class_log_prob_out, feature_count_out, feature_log_prob_out)
    if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
        if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
            raise ValueError("please use get_classificator() to get classificator firstly")
        fit_incr_datas = self.fit_data(incr_datas)
        n_samples, _ = fit_incr_datas.shape
        incr_class_label = np.array(incr_class_label)
        lock0 = threading.Lock()
        lock1 = threading.Lock()
        # threadpool
        poolsize = 30
        pool = ThreadPool(poolsize)
        # Each outer iteration absorbs exactly one incremental sample: the one
        # whose tentative update yields the smallest loss.
        for i in range(n_samples):
            if i % 5 == 0:
                print "Begin Increment Classification_%d: %s" % (i / 5, time.strftime('%Y-%m-%d %H:%M:%S'))
            # Classification losses; the minimum decides the winner.
            loss = []
            # Candidate texts preferred for updating the classifier parameters.
            text = []
            # Predicted class for each candidate text.
            c_pred = []
            # Index of each candidate text.
            # index = 0
            origin_class_log_prob_ = self.bayes.class_log_prior_
            origin_feature_log_prob_ = self.bayes.feature_log_prob_
            # threadpool
            requests = makeRequests(func, range(fit_incr_datas.shape[0]))
            [pool.putRequest(req) for req in requests]
            pool.wait()
            # for i0 in range(fit_incr_datas.shape[0]):
            #     threading.Thread(target=func, args=(i0, )).start()
            # Commit the best candidate and drop it from the incremental set.
            minindex = np.argmin(loss)
            self.bayes.update(c_pred[minindex], text[minindex])
            fit_incr_datas = sp.vstack([fit_incr_datas[:minindex, :], fit_incr_datas[minindex + 1:, :]])
        bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                      self.bayes.feature_count_, self.bayes.feature_log_prob_)
        map(lambda x: np.savetxt(x[0], x[1]), zip(out, bayes_args))
    else:
        # Fast path: restore the previously saved parameters.
        self.bayes.class_count_ = np.loadtxt(out[0])
        self.bayes.class_log_prior_ = np.loadtxt(out[1])
        self.bayes.feature_count_ = np.loadtxt(out[2])
        self.bayes.feature_log_prob_ = np.loadtxt(out[3])
    print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    return self
def get_incr_classificator(self, incr_datas, incr_class_label, test_datas, test_class_label, method="first"):
    """
    Process the incremental part of the incremental Bayes classifier.
    :param incr_datas: [{"emorion-1-type": value, "sentence": {}},...]
        (emotion-1-type and sentence are optional)
    :param incr_class_label: labels of the incremental samples
    :param test_datas: test samples used to estimate the classification loss
    :param test_class_label: labels of the test samples
    :param method: selection strategy, one of
        ("first", "second", "third", "four", "five")
    :return: self
    """
    def func(x, y):
        # reduce() helper: x and y are consecutive selected entries
        # (loss, text, c_pred, index); slices the unselected samples between
        # them into `block`/`label_block` and the selected row into `block0`.
        block.append(fit_incr_datas[x[3] + 1:y[3], :])
        label_block.append(incr_class_label[x[3] + 1:y[3]])
        block0.append(fit_incr_datas[y[3]:y[3] + 1, :])
        return y

    def handle(clf, method):
        # Dispatch to the strategy implementation selected by `method`.
        if method == "zero":
            return handle_zero(clf)
        elif method == "first":
            return handle_first(clf)
        elif method == "second":
            return handle_second(clf)
        elif method == "third":
            return handle_third(clf)
        elif method == "four":
            return handle_four(clf)
        elif method == "five":
            return handle_five(clf)
        else:
            pass

    def handle_zero(clf):
        """
        Find the samples the current classifier already predicts correctly.
        :param clf: classifier under evaluation
        :return: list of (loss, text, predicted_class, index) tuples
        """
        incr_pre_label = clf.predict(fit_incr_datas)
        # Indices of the correctly predicted samples.
        true_index = (incr_class_label == incr_pre_label).nonzero()
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        res = []
        for i0 in true_index[0]:
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = incr_pre_label[i0]
            # Tentative update (copy=True returns new parameter arrays).
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            res.append((loss0, text0, c_pred0, i0))
            # Roll back before evaluating the next candidate.
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return res

    def handle_first(clf):
        # The original classification-loss computation.
        # Classification loss; minimised over candidates.
        loss = 9999
        # Candidate text preferred for updating the classifier parameters.
        text = None
        # Predicted class of that candidate.
        c_pred = None
        # Index of that candidate in the incremental set.
        index = 0
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        for i0 in range(fit_incr_datas.shape[0]):
            c_true0 = incr_class_label[i0:i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]
            if c_true0 == c_pred0:
                # A correctly predicted sample wins outright (loss 0).
                loss = 0
                text = text0
                c_pred = c_pred0
                index = i0
                break
            else:
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_my_zero_one_loss(test_proba)
                if loss0 < loss:
                    loss = loss0
                    text = text0
                    c_pred = c_pred0
                    index = i0
            # Roll back the tentative update for the next candidate.
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return [(loss, text, c_pred, index)]

    def handle_second(clf):
        # An alternative classification-loss computation (loss measured
        # against the pre-update probabilities).
        # predict_true = handle(clf, "zero")
        # if predict_true:
        #     return predict_true
        # Classification loss; minimised over candidates.
        loss = 9999
        # Candidate text preferred for updating the classifier parameters.
        text = None
        # Predicted class of that candidate.
        c_pred = None
        # Index of that candidate in the incremental set.
        index = 0
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        for i0 in range(fit_incr_datas.shape[0]):
            c_true0 = incr_class_label[i0:i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]
            if c_true0 == c_pred0:
                loss = 0
                text = text0
                c_pred = c_pred0
                index = i0
                break
            else:
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                if loss0 < loss:
                    loss = loss0
                    text = text0
                    c_pred = c_pred0
                    index = i0
            # Roll back the tentative update for the next candidate.
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return [(loss, text, c_pred, index)]

    def handle_third(clf):
        # todo
        # How to obtain a suitable threshold?
        def get_fit(e0):
            # Obtain a suitable threshold (currently a fixed constant).
            return 20
            # while len((r >= e0).nonzero()[0]) == 0:
            #     e0 = int(e0 / 2)
            # return e0
        global e
        # Class-support computation: ratio of the best to the second-best
        # class probability for each incremental sample.
        proba = clf.predict_proba(fit_incr_datas)
        label = clf.predict(fit_incr_datas)
        max_proba = np.max(proba, axis=1).reshape(-1, 1)
        second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
        # Support ratio.
        r = np.divide(max_proba, second_max_proba)
        # Threshold.
        e = get_fit(e)
        # Select samples whose support exceeds the threshold.
        select_indices = (r >= e).nonzero()
        return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0]) for indice in select_indices[0]]

    def handle_third_another(clf):
        # Class-support computation: ratio of the best class probability to
        # the sum of all the remaining class probabilities.
        proba = clf.predict_proba(fit_incr_datas)
        label = clf.predict(fit_incr_datas)
        max_proba = np.max(proba, axis=1).reshape(-1, 1)
        leave_proba = np.sum(proba, axis=1).reshape(-1, 1) - max_proba
        # Support ratio.
        r = np.divide(max_proba, leave_proba)
        # Threshold.
        e = 5
        # Select samples whose support exceeds the threshold.
        select_indices = (r >= e).nonzero()
        return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0]) for indice in select_indices[0]]

    def handle_four(clf):
        # My Own Idea
        # Holds the Test results.
        predict_true = handle(clf, "zero")
        if predict_true:
            return predict_true
        f_res = []
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        origin_label = clf.predict(test_datas)
        for i0 in range(fit_incr_datas.shape[0]):
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            label = clf.predict(test_datas)
            # Account for class flips: a sample may have a high max probability
            # both before and after the update while the predicted class
            # changed; negate those entries.
            smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
            np.multiply(test_proba, smooth, test_proba)
            # Paired significance test between pre- and post-update probabilities.
            f_test0 = pair_test(origin_proba, test_proba)
            if f_test0:
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            else:
                loss0 = -1
            f_res.append((loss0, text0, c_pred0, i0, f_test0))
            # Roll back the tentative update for the next candidate.
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        # Keep only the candidates that passed the significance test.
        res = filter(lambda x: x[4], f_res)
        return [(r[0], r[1], r[2], r[3]) for r in res]

    def handle_five(clf):
        """
        Combination of class support and the no-significant-difference test.
        :param clf: classifier under evaluation
        :return: list of (loss, text, predicted_class, index) tuples
        """
        predict_true = handle(clf, "zero")
        if predict_true:
            return predict_true
        # Pre-filter candidates by class support.
        fit_for_class_support = handle(clf, "third")
        print "The result of class-support: %d samples" % len(fit_for_class_support)
        # fit_for_class_support = filter(lambda x: x[4] > clf.bayes.class_log_prior_[np.where(clf.bayes.classes_ == x[2])[0][0]], fit_for_class_support)
        # print "The result of class-support: %d samples" % len(fit_for_class_support)
        # My Own Idea
        # Holds the Test results.
        f_res = []
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        origin_label = clf.predict(test_datas)
        for i0 in range(len(fit_for_class_support)):
            text0 = fit_for_class_support[i0][1]
            c_pred0 = fit_for_class_support[i0][2]
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            label = clf.predict(test_datas)
            # Account for class flips between the pre- and post-update
            # classifiers (see handle_four).
            smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
            np.multiply(test_proba, smooth, test_proba)
            f_test0 = pair_test(origin_proba, test_proba)
            if f_test0:
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            else:
                loss0 = -1
            f_res.append((loss0, text0, c_pred0, i0, f_test0))
            # Roll back the tentative update for the next candidate.
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        # Keep only the candidates that passed the significance test.
        res = filter(lambda x: x[4], f_res)
        return [(r[0], r[1], r[2], r[3]) for r in res]

    method_options = ("first", "second", "third", "four", "five")
    if method not in method_options:
        raise ValueError("method has to be one of " + str(method_options))
    print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
    # Write/read the fitted bayes parameters so later runs can skip the loop.
    dir_ = os.path.join(TEXT_OUT, "bayes_args")
    FileUtil.mkdirs(dir_)
    suffix = ".blp"
    class_count_out = os.path.join(dir_, "class_count_" + method + suffix)
    class_log_prob_out = os.path.join(dir_, "class_log_prob_" + method + suffix)
    feature_count_out = os.path.join(dir_, "feature_count_" + method + suffix)
    feature_log_prob_out = os.path.join(dir_, "feature_log_prob_" + method + suffix)
    out = (class_count_out, class_log_prob_out, feature_count_out, feature_log_prob_out)
    if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
        if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
            raise ValueError("please use get_classificator() to get classificator firstly")
        fit_incr_datas = self.fit_data(incr_datas)
        incr_class_label = np.asanyarray(incr_class_label)
        # Data that should be appended to the key_words.txt document.
        add_to_key_words = []
        i = 0
        while fit_incr_datas.nnz > 0:
            print
            print "Begin Increment Classification_%d: %s" % (i, time.strftime('%Y-%m-%d %H:%M:%S'))
            need_to_update = handle(self, method)
            # If nothing can be updated, the remaining incremental set does not
            # suit the current classifier, so it is discarded.
            # The incremental set shrinks on every update round.
            block = []
            label_block = []
            # The training set grows on every update round.
            block0 = []
            if need_to_update:
                # Apply the updates in ascending-loss order.
                accord_to_loss = sorted(need_to_update, key=lambda x: x[0])
                for data in accord_to_loss:
                    self.bayes.update(data[2], data[1])
                # Re-sort by sample index to slice out the consumed rows.
                accord_to_index = sorted(need_to_update, key=lambda x: x[3])
                # index = [index0[3] for index0 in accord_to_index]
                # [add_to_key_words.append(raw_incr_datas[index0]) for index0 in index]
                # raw_incr_datas = [raw for index0, raw in enumerate(raw_incr_datas) if index0 not in index]
                block0.append(test_datas)
                # func() slices the gaps between consecutive selected indices.
                reduce(func, accord_to_index, (0.0, "", "", -1))
                block.append(fit_incr_datas[accord_to_index[-1][3] + 1:, :])
                label_block.append(incr_class_label[accord_to_index[-1][3] + 1:])
                test_datas = sp.vstack(block0)
                print "This times updates %d samples" % len(need_to_update)
            else:
                # Nothing usable left: empty the incremental set to end the loop.
                block.append(fit_incr_datas[0:0, :])
                label_block.append(incr_class_label[0:0])
                print "Finally leaving %d samples that unnecessary added to train sets" % fit_incr_datas.shape[0]
            fit_incr_datas = sp.vstack(block)
            incr_class_label = np.concatenate(label_block)
            i += 1
        bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                      self.bayes.feature_count_, self.bayes.feature_log_prob_)
        # Save the parameters to disk (bloscpack format).
        map(lambda x: bp.pack_ndarray_file(x[0], x[1]), zip(bayes_args, out))
        # Append.
        # path = os.path.join(TEXT_OUT, "key_words/CHIFeature.txt")
        # FileUtil.write(path, add_to_key_words, "a")
    else:
        # speed up
        self.bayes.class_count_ = bp.unpack_ndarray_file(out[0])
        self.bayes.class_log_prior_ = bp.unpack_ndarray_file(out[1])
        self.bayes.feature_count_ = bp.unpack_ndarray_file(out[2])
        self.bayes.feature_log_prob_ = bp.unpack_ndarray_file(out[3])
        # self.bayes.class_count_ = np.loadtxt(out[0])
        # self.bayes.class_log_prior_ = np.loadtxt(out[1])
        # self.bayes.feature_count_ = np.loadtxt(out[2])
        # self.bayes.feature_log_prob_ = np.loadtxt(out[3])
    print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    print
    return self
def _collect(self, splited_words_list, sentence_size):
    """
    Score every word of the first ``sentence_size`` sentences, normalise and
    reduce the result, and return the key-word representation.

    Results are cached in a per-class file; training runs write the cache,
    non-training runs read it.
    :param splited_words_list: segmented sentences; the first sentence_size
        entries are scored, the remainder (if any) serve as the scoring corpus
    :param sentence_size: number of sentences to score
    :return: (res, class_label, danger_index) where danger_index lists the
        positions of sentences that became empty after scoring/reduction
    """
    dir_ = os.path.join(TEXT_OUT, "key_words")
    if self.subjective:
        key_words_txt = os.path.join(dir_, self.__class__.__name__ + ".txt")
    else:
        key_words_txt = os.path.join(dir_, self.__class__.__name__ + "_objective.txt")
    # NOTE(review): a commented-out draft of per-sample norm()/reduce_dim()
    # helpers lived here; superseded by self.norm()/self.reduce_dim() below.
    if not self.istrain or self.f or not FileUtil.isexist(key_words_txt) or FileUtil.isempty(key_words_txt):
        print "Cal Scores: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # When extra sentences follow the first sentence_size entries they are
        # used as the scoring corpus; otherwise the scored sentences are.
        if len(splited_words_list) == sentence_size:
            train_range = slice(sentence_size)
        else:
            train_range = slice(sentence_size, len(splited_words_list))
        # Texts grouped per class.
        all_class_datas = Feature.all_class_text(splited_words_list[train_range], self.getclasses())
        # Class labels of the scored sentences.
        class_label = [d.get("emotion-1-type") for d in splited_words_list[:sentence_size]]
        # return term/frequency or term/score
        res = []
        for splited_words_dict in splited_words_list[0:sentence_size]:
            splited_words = splited_words_dict.get("sentence")
            label = splited_words_dict.get("emotion-1-type")
            # Score each word: scores = {word: [score, frequency], ...}
            scores = {splited_word: [self.cal_score(splited_word, splited_words, label, all_class_datas,
                                                    [d.get("sentence") for d in splited_words_list[train_range]]),
                                     frequency]
                      for splited_word, frequency in splited_words.items()}
            # Normalisation.
            # norm(scores)
            # Dimensionality reduction.
            sorted_words = scores
            # if not self.istrain:
            #     sorted_words = reduce_dim(scores)
            # Collection
            # if False return term/score
            # if True return term/frequency
            # if False:
            #     for k in sorted_words.keys():
            #         sorted_words[k] = splited_words.count(k)
            res.append({"sentence": sorted_words,
                        "emotion-1-type": splited_words_dict.get("emotion-1-type")})
        print "Cal Scores Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # FileUtil.write(TEST_BASE_URL + "scores.txt", res)
        print "Begin Normalization: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # Normalisation (in place).
        self.norm(res)
        # FileUtil.write(TEST_BASE_URL + "norm.txt", res)
        print "Normalization Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
        print "Begin Reduce: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # Dimensionality reduction (in place).
        self.reduce_dim(res)
        print "Reduce Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # Try Convert term/score to term/frequency
        # if False return term/score
        # if True return term/frequency
        for d in res:
            ws = d.get("sentence")
            for k, v in ws.items():
                ws[k] = v[0]
                if True:
                    ws[k] = v[1]
        # Segmentation/reduction may leave some samples without any key word;
        # collect the indices of those empty samples and drop them.
        danger_index = []
        res = filter(lambda x: danger_index.append(x[0]) if not x[1].get("sentence") else x, enumerate(res))
        res = list(zip(*res)[1])
        class_label = [c for i, c in enumerate(class_label) if i not in danger_index]
        # Write the cache file.
        if self.istrain:
            FileUtil.write(key_words_txt, res)
    else:
        # Cache hit: read the previously computed key words.
        res = FileUtil.read(key_words_txt)
        class_label = [r["emotion-1-type"] for r in res]
        danger_index = []
    # Print statistics.
    if False:
        self.__print_top_key_word(res)
    return res, class_label, danger_index
def get_incr_classificator_thread(self, incr_datas, incr_class_label, test_datas, test_class_label):
    """
    Process the incremental part of the incremental Bayes classifier
    (thread-pool variant).
    :param incr_datas: [{"emorion-1-type": value, "sentence": {}},...]
        (emotion-1-type and sentence are optional)
    :param incr_class_label: labels of the incremental samples
    :param test_datas: test samples used to estimate the classification loss
    :param test_class_label: labels of the test samples
    :return: self
    """
    def func1(i0):
        # Variant that evaluates sample i0 on a deep copy of the classifier,
        # so no lock is needed on the shared bayes parameters.
        c_true0 = incr_class_label[i0: i0 + 1][0]
        text0 = fit_incr_datas.getrow(i0)
        c_pred0 = self.predict(text0)[0]
        if c_true0 == c_pred0:
            loss0 = 0
        else:
            clf0 = copy.deepcopy(self)
            clf0.bayes.class_log_prior_, clf0.bayes.feature_log_prob_ = clf0.bayes.update(c_pred0, text0, copy=True)
            loss0 = clf0.metrics_my_zero_one_loss(test_datas)
            # clf0.bayes.class_log_prior_ = origin_class_log_prob_
            # clf0.bayes.feature_log_prob_ = origin_feature_log_prob_
        # lock1 guards the shared result lists.
        if lock1.acquire():
            text.append(text0)
            c_pred.append(c_pred0)
            loss.append(loss0)
            lock1.release()

    def func(i0):
        # Tentatively update the shared classifier with sample i0, measure the
        # loss, then roll the parameters back.  lock0 guards the bayes
        # parameters; lock1 guards the shared result lists.
        c_true0 = incr_class_label[i0: i0 + 1][0]
        text0 = fit_incr_datas.getrow(i0)
        c_pred0 = self.predict(text0)[0]
        if c_true0 == c_pred0:
            loss0 = 0
        else:
            if lock0.acquire():
                self.bayes.class_log_prior_, self.bayes.feature_log_prob_ = self.bayes.update(c_pred0, text0, copy=True)
                loss0 = self.metrics_my_zero_one_loss(test_datas)
                # Roll back the tentative update.
                self.bayes.class_log_prior_ = origin_class_log_prob_
                self.bayes.feature_log_prob_ = origin_feature_log_prob_
                lock0.release()
        if lock1.acquire():
            text.append(text0)
            c_pred.append(c_pred0)
            loss.append(loss0)
            lock1.release()

    print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
    # Write/read the fitted bayes parameters so later runs can skip the loop.
    dir_ = os.path.join(TEXT_OUT, "bayes_args")
    FileUtil.mkdirs(dir_)
    class_count_out = os.path.join(dir_, "class_count.txt")
    class_log_prob_out = os.path.join(dir_, "class_log_prob.txt")
    feature_count_out = os.path.join(dir_, "feature_count.txt")
    feature_log_prob_out = os.path.join(dir_, "feature_log_prob.txt")
    out = (class_count_out, class_log_prob_out, feature_count_out, feature_log_prob_out)
    if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
        if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
            raise ValueError("please use get_classificator() to get classificator firstly")
        fit_incr_datas = self.fit_data(incr_datas)
        n_samples, _ = fit_incr_datas.shape
        incr_class_label = np.array(incr_class_label)
        lock0 = threading.Lock()
        lock1 = threading.Lock()
        # threadpool
        poolsize = 30
        pool = ThreadPool(poolsize)
        # Each outer iteration absorbs exactly one incremental sample: the one
        # whose tentative update yields the smallest loss.
        for i in range(n_samples):
            if i % 5 == 0:
                print "Begin Increment Classification_%d: %s" % (i / 5, time.strftime('%Y-%m-%d %H:%M:%S'))
            # Classification losses; the minimum decides the winner.
            loss = []
            # Candidate texts preferred for updating the classifier parameters.
            text = []
            # Predicted class for each candidate text.
            c_pred = []
            # Index of each candidate text.
            # index = 0
            origin_class_log_prob_ = self.bayes.class_log_prior_
            origin_feature_log_prob_ = self.bayes.feature_log_prob_
            # threadpool
            requests = makeRequests(func, range(fit_incr_datas.shape[0]))
            [pool.putRequest(req) for req in requests]
            pool.wait()
            # for i0 in range(fit_incr_datas.shape[0]):
            #     threading.Thread(target=func, args=(i0, )).start()
            # Commit the best candidate and drop it from the incremental set.
            minindex = np.argmin(loss)
            self.bayes.update(c_pred[minindex], text[minindex])
            fit_incr_datas = sp.vstack([fit_incr_datas[:minindex, :], fit_incr_datas[minindex + 1:, :]])
        bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                      self.bayes.feature_count_, self.bayes.feature_log_prob_)
        map(lambda x: np.savetxt(x[0], x[1]), zip(out, bayes_args))
    else:
        # Fast path: restore the previously saved parameters.
        self.bayes.class_count_ = np.loadtxt(out[0])
        self.bayes.class_log_prior_ = np.loadtxt(out[1])
        self.bayes.feature_count_ = np.loadtxt(out[2])
        self.bayes.feature_log_prob_ = np.loadtxt(out[3])
    print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    return self
def get_incr_classificator(self, incr_datas, incr_class_label, test_datas, test_class_label, method="first"):
    """
    Process the increment set of the incremental Bayes classifier.

    Repeatedly selects samples from the increment set with the chosen
    ``method`` strategy, merges them into the classifier (``self.bayes.update``),
    moves them into the test set, and shrinks the increment set, until the
    increment set is exhausted or the strategy selects nothing.  The final
    Bayes parameters are cached on disk (bloscpack ``.blp`` files) keyed by
    ``method`` so later runs can skip recomputation unless ``self.f`` is set.

    :param incr_datas: [{"emorion-1-type": value, "sentence": {}},...]
        (emotion-1-type and sentence are optional)
    :param incr_class_label: true labels of the incremental samples
    :param test_datas: data the candidate updates are evaluated on
    :param test_class_label: unused here; kept for interface symmetry
    :param method: selection strategy, one of
        "first" / "second" / "third" / "four" / "five"
    :return: self
    """
    def func(x, y):
        # reduce() step over index-sorted selections x, y (4-tuples whose
        # [3] element is the row number): rows strictly between the two
        # selected rows stay in the increment set (block / label_block),
        # while the selected row y itself is moved into block0 (later
        # stacked onto the test set).
        block.append(fit_incr_datas[x[3] + 1: y[3], :])
        label_block.append(incr_class_label[x[3] + 1: y[3]])
        block0.append(fit_incr_datas[y[3]:y[3] + 1, :])
        return y

    def handle(clf, method):
        # Dispatch to the selection strategy by name; unknown names fall
        # through and return None (only reachable for the internal "zero").
        if method == "zero":
            return handle_zero(clf)
        elif method == "first":
            return handle_first(clf)
        elif method == "second":
            return handle_second(clf)
        elif method == "third":
            return handle_third(clf)
        elif method == "four":
            return handle_four(clf)
        elif method == "five":
            return handle_five(clf)
        else:
            pass

    def handle_zero(clf):
        """
        Find the samples that the current classifier already predicts
        correctly.
        :param clf: classifier wrapper whose ``bayes`` model is probed
        :return: list of (loss, sample_row, predicted_label, row_index)
        """
        incr_pre_label = clf.predict(fit_incr_datas)
        # indices of the correctly predicted samples
        true_index = (incr_class_label == incr_pre_label).nonzero()
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        res = []
        for i0 in true_index[0]:
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = incr_pre_label[i0]
            # tentative update; parameters are restored right after scoring
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            res.append((loss0, text0, c_pred0, i0))
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return res

    def handle_first(clf):
        # The original (most basic) classification-loss computation:
        # scan all samples and keep the single one minimizing the loss.
        # classification loss; minimum-seeking approach
        loss = 9999
        # the sample preferred for updating the classifier parameters
        text = None
        # the class label corresponding to that sample
        c_pred = None
        # the row index corresponding to that sample
        index = 0
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        for i0 in range(fit_incr_datas.shape[0]):
            c_true0 = incr_class_label[i0: i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]
            if c_true0 == c_pred0:
                # a correctly predicted sample wins immediately (loss 0)
                loss = 0
                text = text0
                c_pred = c_pred0
                index = i0
                break
            else:
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_my_zero_one_loss(test_proba)
                if loss0 < loss:
                    loss = loss0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                # undo the tentative update before trying the next sample
                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return [(loss, text, c_pred, index)]

    def handle_second(clf):
        # An alternative classification-loss computation: the loss compares
        # post-update probabilities against the pre-update ones.
        # predict_true = handle(clf, "zero")
        # if predict_true:
        #     return predict_true
        # classification loss; minimum-seeking approach
        loss = 9999
        # the sample preferred for updating the classifier parameters
        text = None
        # the class label corresponding to that sample
        c_pred = None
        # the row index corresponding to that sample
        index = 0
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        for i0 in range(fit_incr_datas.shape[0]):
            c_true0 = incr_class_label[i0: i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]
            if c_true0 == c_pred0:
                loss = 0
                text = text0
                c_pred = c_pred0
                index = i0
                break
            else:
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                if loss0 < loss:
                    loss = loss0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                # undo the tentative update before trying the next sample
                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return [(loss, text, c_pred, index)]

    def handle_third(clf):
        # todo
        # how to obtain a suitable threshold
        def get_fit(e0):
            # obtain a suitable threshold (currently hard-coded to 20)
            return 20
            # while len((r >= e0).nonzero()[0]) == 0:
            #     e0 = int(e0 / 2)
            # return e0
        # NOTE(review): `e` is a module-level threshold mutated here —
        # confirm where it is initialized.
        global e
        # class-support computation
        proba = clf.predict_proba(fit_incr_datas)
        label = clf.predict(fit_incr_datas)
        max_proba = np.max(proba, axis=1).reshape(-1, 1)
        # second-largest per-row probability via a partial sort
        second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
        # support: ratio of the best to the second-best class probability
        r = np.divide(max_proba, second_max_proba)
        # threshold
        e = get_fit(e)
        # select all samples whose support reaches the threshold
        select_indices = (r >= e).nonzero()
        # 5-tuples: extra [4] element is the winning class probability
        return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0]) for indice in select_indices[0]]

    def handle_third_another(clf):
        # Class-support variant: support is best probability over the sum
        # of all the remaining probabilities.  (Not wired into handle().)
        proba = clf.predict_proba(fit_incr_datas)
        label = clf.predict(fit_incr_datas)
        max_proba = np.max(proba, axis=1).reshape(-1, 1)
        leave_proba = np.sum(proba, axis=1).reshape(-1, 1) - max_proba
        # support
        r = np.divide(max_proba, leave_proba)
        # threshold
        e = 5
        # select
        select_indices = (r >= e).nonzero()
        return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0]) for indice in select_indices[0]]

    def handle_four(clf):
        # My Own Idea
        # holds the Test results
        # Prefer correctly-predicted samples if any exist.
        predict_true = handle(clf, "zero")
        if predict_true:
            return predict_true
        f_res = []
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        origin_label = clf.predict(test_datas)
        for i0 in range(fit_incr_datas.shape[0]):
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            label = clf.predict(test_datas)
            # Account for the class label itself: a sample may have a high
            # probability for some class both before and after the update,
            # yet the two predicted classes may differ — flip the sign of
            # the probability where the prediction changed.
            smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
            np.multiply(test_proba, smooth, test_proba)
            # significance test between the pre- and post-update probabilities
            f_test0 = pair_test(origin_proba, test_proba)
            if f_test0:
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            else:
                loss0 = -1
            f_res.append((loss0, text0, c_pred0, i0, f_test0))
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        # keep only the samples that passed the significance test
        res = filter(lambda x: x[4], f_res)
        return [(r[0], r[1], r[2], r[3]) for r in res]

    def handle_five(clf):
        """
        Combination of class support and no-significant-difference testing.
        :param clf: classifier wrapper whose ``bayes`` model is probed
        :return: list of (loss, sample_row, predicted_label, row_index)
        """
        # Prefer correctly-predicted samples if any exist.
        predict_true = handle(clf, "zero")
        if predict_true:
            return predict_true
        # Pre-filter candidates by class support ("third" strategy).
        fit_for_class_support = handle(clf, "third")
        print "The result of class-support: %d samples" % len(fit_for_class_support)
        # fit_for_class_support = filter(lambda x: x[4] > clf.bayes.class_log_prior_[np.where(clf.bayes.classes_ == x[2])[0][0]], fit_for_class_support)
        # print "The result of class-support: %d samples" % len(fit_for_class_support)
        # My Own Idea
        # holds the Test results
        f_res = []
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        origin_label = clf.predict(test_datas)
        for i0 in range(len(fit_for_class_support)):
            text0 = fit_for_class_support[i0][1]
            c_pred0 = fit_for_class_support[i0][2]
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            label = clf.predict(test_datas)
            # Account for the class label itself: a sample may have a high
            # probability for some class both before and after the update,
            # yet the two predicted classes may differ — flip the sign of
            # the probability where the prediction changed.
            smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
            np.multiply(test_proba, smooth, test_proba)
            f_test0 = pair_test(origin_proba, test_proba)
            if f_test0:
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            else:
                loss0 = -1
            f_res.append((loss0, text0, c_pred0, i0, f_test0))
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        # keep only the samples that passed the significance test
        res = filter(lambda x: x[4], f_res)
        return [(r[0], r[1], r[2], r[3]) for r in res]

    method_options = ("first", "second", "third", "four", "five")
    if method not in method_options:
        raise ValueError("method has to be one of " + str(method_options))

    print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
    # Paths used to cache / reload the trained Bayes parameters (one set
    # of bloscpack files per selection method).
    dir_ = os.path.join(TEXT_OUT, "bayes_args")
    FileUtil.mkdirs(dir_)
    suffix = ".blp"
    class_count_out = os.path.join(dir_, "class_count_" + method + suffix)
    class_log_prob_out = os.path.join(dir_, "class_log_prob_" + method + suffix)
    feature_count_out = os.path.join(dir_, "feature_count_" + method + suffix)
    feature_log_prob_out = os.path.join(dir_, "feature_log_prob_" + method + suffix)
    out = (class_count_out, class_log_prob_out, feature_count_out, feature_log_prob_out)
    if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
        if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
            raise ValueError("please use get_classificator() to get classificator firstly")
        fit_incr_datas = self.fit_data(incr_datas)
        incr_class_label = np.asanyarray(incr_class_label)
        # data to be appended to the key_words.txt document
        add_to_key_words = []
        i = 0
        while fit_incr_datas.nnz > 0:
            print
            print "Begin Increment Classification_%d: %s" % (i, time.strftime('%Y-%m-%d %H:%M:%S'))
            need_to_update = handle(self, method)
            # If nothing is selected, the remaining increment set does not
            # suit the current classifier, so it is discarded.
            # The increment set keeps shrinking while updating.
            block = []
            label_block = []
            # The training (test) set keeps growing while updating.
            block0 = []
            if need_to_update:
                # sort ascending by loss and apply the updates in that order
                accord_to_loss = sorted(need_to_update, key=lambda x: x[0])
                for data in accord_to_loss:
                    self.bayes.update(data[2], data[1])
                # sort by row index for the re-blocking pass below
                accord_to_index = sorted(need_to_update, key=lambda x: x[3])
                # index = [index0[3] for index0 in accord_to_index]
                # [add_to_key_words.append(raw_incr_datas[index0]) for index0 in index]
                # raw_incr_datas = [raw for index0, raw in enumerate(raw_incr_datas) if index0 not in index]
                block0.append(test_datas)
                # Walk the index-sorted selections pairwise (sentinel row -1):
                # unselected rows go back into block, selected rows go into
                # block0 — see func() above.
                reduce(func, accord_to_index, (0.0, "", "", -1))
                # keep the tail rows after the last selected row
                block.append(fit_incr_datas[accord_to_index[-1][3] + 1:, :])
                label_block.append(incr_class_label[accord_to_index[-1][3] + 1:])
                test_datas = sp.vstack(block0)
                print "This times updates %d samples" % len(need_to_update)
            else:
                # empty blocks terminate the while loop (nnz becomes 0)
                block.append(fit_incr_datas[0:0, :])
                label_block.append(incr_class_label[0:0])
                print "Finally leaving %d samples that unnecessary added to train sets" % fit_incr_datas.shape[0]
            fit_incr_datas = sp.vstack(block)
            incr_class_label = np.concatenate(label_block)
            i += 1
        bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_, self.bayes.feature_count_, self.bayes.feature_log_prob_)
        # persist the parameters to disk (Python 2: map() runs eagerly)
        map(lambda x: bp.pack_ndarray_file(x[0], x[1]), zip(bayes_args, out))
        # append
        # path = os.path.join(TEXT_OUT, "key_words/CHIFeature.txt")
        # FileUtil.write(path, add_to_key_words, "a")
    else:
        # speed up
        self.bayes.class_count_ = bp.unpack_ndarray_file(out[0])
        self.bayes.class_log_prior_ = bp.unpack_ndarray_file(out[1])
        self.bayes.feature_count_ = bp.unpack_ndarray_file(out[2])
        self.bayes.feature_log_prob_ = bp.unpack_ndarray_file(out[3])
        # self.bayes.class_count_ = np.loadtxt(out[0])
        # self.bayes.class_log_prior_ = np.loadtxt(out[1])
        # self.bayes.feature_count_ = np.loadtxt(out[2])
        # self.bayes.feature_log_prob_ = np.loadtxt(out[3])
    print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    print
    return self