Example #1
def clear_un_img():
    # directory where the downloaded images are stored
    all_img_url = os.path.join(RESOURCE_BASE_URL, "collect/img")
    # images referenced under this directory should be kept
    leave_img_url = os.path.join(RESOURCE_BASE_URL, "collect")

    if FileUtil.isempty(leave_img_url):
        FileUtil.empty(all_img_url)
    else:
        all_imgs = FileUtil.listdir(all_img_url)

        dirs = [leave_img_url]
        for parent, dirnames, filenames in os.walk(leave_img_url):
            for dirname in dirnames:
                dirs.append(os.path.join(parent, dirname))

        leave_imgs = []
        for dir_ in dirs:
            imglist = collect.read_weibo(dir_, isreadimg=True)
            imglist = flatten(
                [img.get("img") for img in imglist if img.get("img")])
            leave_imgs += imglist

        # remove images that are no longer referenced by any weibo record
        for p in all_imgs:
            if p not in leave_imgs:
                os.remove(p)
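A minimal usage sketch under the project's assumed layout, where RESOURCE_BASE_URL/collect holds the weibo text files and collect/img the downloaded images (process_img is the helper shown in a later example; the flow below is hypothetical):

# download the referenced images first, then prune everything that is no longer referenced
urls = [["http://example.com/a.jpg"], ["http://example.com/b.jpg"]]
process_img(urls)     # saves the files under RESOURCE_BASE_URL/collect/img
clear_un_img()        # removes images not referenced by any weibo file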
Example #2
def write_to_file(path, data, call):
    parent_dir = FileUtil.getparentdir(path)
    FileUtil.mkdirs(parent_dir)

    with open(path, "w") as fp:
        s = call(data)
        if isinstance(s, basestring):
            fp.write(s)
        else:
            for s0 in s:
                fp.write(s0)
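The call argument controls serialization: if it returns a single string that string is written as-is, otherwise every item of the returned iterable is written in turn. A hedged usage sketch (the callables and paths are hypothetical; FileUtil is the project's own helper):

import json

# one JSON blob: call returns a single string
write_to_file("out/result.json", {"a": 1}, json.dumps)

# line by line: call returns a list of strings
write_to_file("out/lines.txt", ["first", "second"],
              lambda rows: [row + "\n" for row in rows])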
Example #3
def write_to_file(path, data, call):
    parent_dir = FileUtil.getparentdir(path)
    FileUtil.mkdirs(parent_dir)

    with open(path, "w") as fp:
        s = call(data)
        if isinstance(s, basestring):
            fp.write(s)
        else:
            for s0 in s:
                fp.write(s0)
    def __save_result(self, sentences):
        FileUtil.mkdirs(ImageClassification.image_train_path)
        current = time.strftime('%Y-%m-%d %H:%M:%S')
        out = os.path.join(ImageClassification.image_train_path, current + ".txt")

        with open(out, "w") as fp:
            for sentence in sentences:
                s = ("sentence:" + sentence.get("sentence") + "\n" +
                     "img:" + ",".join(sentence.get("img")) + "\n" +
                     "label:" + sentence.get("label")) + "\n"
                fp.write(s)
    def __process_img(self, img_urls):
        dir_ = os.path.join(ImageClassification.image_train_path, "img")
        FileUtil.mkdirs(dir_)

        def copy_img(img_url):
            filename = os.path.split(img_url)[1]
            filepath = os.path.join(dir_, filename)
            cv2.imwrite(filepath, cv2.imread(img_url))
            return filepath

        return [copy_img(img_url) for img_url in img_urls]
Example #6
    def __process_img(self, img_urls):
        dir_ = os.path.join(ImageClassification.image_train_path, "img")
        FileUtil.mkdirs(dir_)

        def copy_img(img_url):
            filename = os.path.split(img_url)[1]
            filepath = os.path.join(dir_, filename)
            cv2.imwrite(filepath, cv2.imread(img_url))
            return filepath

        return [copy_img(img_url) for img_url in img_urls]
Example #7
    def __save_result(self, sentences):
        FileUtil.mkdirs(ImageClassification.image_train_path)
        current = time.strftime('%Y-%m-%d %H:%M:%S')
        out = os.path.join(ImageClassification.image_train_path,
                           current + ".txt")

        with open(out, "w") as fp:
            for sentence in sentences:
                s = ("sentence:" + sentence.get("sentence") + "\n" + "img:" +
                     ",".join(sentence.get("img")) + "\n" + "label:" +
                     sentence.get("label")) + "\n"
                fp.write(s)
    def get_classificator(self,
                          train_datas,
                          class_label,
                          iscrossvalidate=False,
                          isbalance=False,
                          minority_target=None):
        """
        Build the classifier.
        :param train_datas
        :param class_label
        :param iscrossvalidate: whether to load the saved cross-validation result to build the classifier
        :param isbalance: whether to balance the training data
        :param minority_target: the minority class; only used when isbalance is True
        :return:
        """
        out = os.path.join(TEXT_OUT, "best_train_test_index/train_index.txt")
        if iscrossvalidate and (not FileUtil.isexist(out)
                                or FileUtil.isempty(out)):
            raise ValueError("please use cross_validation() firstly")

        # fit data
        fit_train_datas = self.fit_data(train_datas)
        class_label = np.asarray(class_label)

        if iscrossvalidate:
            train_index = np.loadtxt(out, dtype=int)
        else:
            train_index = np.array(range(fit_train_datas.shape[0]))

        fit_train_datas, class_label = fit_train_datas[
            train_index], class_label[train_index]
        sample_weight = None

        # SMOTE
        if isbalance:
            fit_train_datas, class_label = preprocessing.my_smote(
                fit_train_datas, class_label, minority_target, per=0.5)
#            sample_weight = _weights._balance_weights(class_label)

        # sample_weight
#        if isbalance:
#            sample_weight = _weights._balance_weights(class_label)

        # train the model
        self.bayes.fit(fit_train_datas,
                       class_label,
                       sample_weight=sample_weight)
        return self
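A hedged usage sketch (Classification and EMOTION_CLASS come from the surrounding project, as in the later plotting example; train_datas and class_label are whatever the feature extraction step produced):

clf = Classification()
# plain fit on the full training set
clf.get_classificator(train_datas, class_label)

# or balance the classes with SMOTE before fitting
clf.get_classificator(train_datas, class_label,
                      isbalance=True, minority_target=EMOTION_CLASS.keys())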
Example #9
    def cal_weight_improve(self, key_words, class_label):
        """
        Compute the weight matrix after the feature words have been extracted.
        :param key_words: [{'sentence': {}}, ...] or [{}, ...]; may be test-set or training-set data
        :return:
        """
        print "Cal Improve Weight: ", time.strftime('%Y-%m-%d %H:%M:%S')
        if not self.istrain:
            dir_ = os.path.join(TEXT_OUT, "key_words")
            filename = self.__class__.__name__ + ".txt" if self.subjective else self.__class__.__name__ + "_objective.txt"
            url = os.path.join(dir_, filename)
            train_key_words = FileUtil.read(url)
            train_class_label = [d.get("emotion-1-type") for d in train_key_words]
        else:
            train_key_words = key_words
            train_class_label = class_label
        train_key_words = [d.get("sentence") if "sentence" in d else d for d in train_key_words]
        key_words = [d.get("sentence") if "sentence" in d else d for d in key_words]
        # term frequency
        key_words = [{k: v / sum(d.values()) for k, v in d.items()} for d in key_words]
        fit_train_key_words = Feature_Hasher.transform(train_key_words)
        fit_key_words = Feature_Hasher.transform(key_words)
        tfidf = TfidfImprove()
        # fit the idf on the training data
        tfidf.fit(fit_train_key_words, train_class_label)
        weight_matrix = tfidf.transform(fit_key_words, class_label)
        print "Cal Weight Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
        print
        return weight_matrix
def _check_feature_size(url):
    l = []
    for line in FileUtil.read(url):
        line = ",".join(line.get("sentence"))
        line = line.split(",")
        l.append(line)

    feature_size = set(flatten(l))
    return len(feature_size)
    def get_classificator(self, train_datas, class_label, iscrossvalidate=False, isbalance=False, minority_target=None):
        """
        Build the classifier.
        :param train_datas
        :param class_label
        :param iscrossvalidate: whether to load the saved cross-validation result to build the classifier
        :param isbalance: whether to balance the training data
        :param minority_target: the minority class; only used when isbalance is True
        :return:
        """
        out = os.path.join(TEXT_OUT, "best_train_test_index/train_index.txt")
        if iscrossvalidate and (not FileUtil.isexist(out) or FileUtil.isempty(out)):
            raise ValueError("please use cross_validation() firstly")

        # fit data
        fit_train_datas = self.fit_data(train_datas)
        class_label = np.asarray(class_label)

        if iscrossvalidate:
            train_index = np.loadtxt(out, dtype=int)
        else:
            train_index = np.array(range(fit_train_datas.shape[0]))

        fit_train_datas, class_label = fit_train_datas[train_index], class_label[train_index]
        sample_weight = None

        # SMOTE
        if isbalance:
            fit_train_datas, class_label = preprocessing.my_smote(fit_train_datas, class_label, minority_target, per=0.5)
#            sample_weight = _weights._balance_weights(class_label)

        # sample_weight
#        if isbalance:
#            sample_weight = _weights._balance_weights(class_label)

        # train the model
        self.bayes.fit(fit_train_datas, class_label, sample_weight=sample_weight)
        return self
Example #12
def process_img(urls):
    dir_ = os.path.join(RESOURCE_BASE_URL, "collect/img")
    FileUtil.mkdirs(dir_)

    if isinstance(urls, basestring):
        urls = [[urls]]
    filepath = []
    for i, row in enumerate(urls):
        p = []
        for j, url in enumerate(row):
            filename = FileUtil.getfilename(url)
            filepath0 = os.path.join(dir_, filename)
            try:
                r = requests.get(url, timeout=1)
                if r.status_code == 200:
                    img = Image.open(StringIO(r.content))
                    if img.mode != "RGB":
                        img = img.convert("RGB")
                    img.save(filepath0)
                    p.append(filepath0)
            except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
                print "Timeout"
        filepath.append(p)
    return filepath
Example #13
def process_img(urls):
    dir_ = os.path.join(RESOURCE_BASE_URL, "collect/img")
    FileUtil.mkdirs(dir_)

    if isinstance(urls, basestring):
        urls = [[urls]]
    filepath = []
    for i, row in enumerate(urls):
        p = []
        for j, url in enumerate(row):
            filename = FileUtil.getfilename(url)
            filepath0 = os.path.join(dir_, filename)
            try:
                r = requests.get(url, timeout=1)
                if r.status_code == 200:
                    img = Image.open(StringIO(r.content))
                    if img.mode != "RGB":
                        img = img.convert("RGB")
                    img.save(filepath0)
                    p.append(filepath0)
            except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
                print "Timeout"
        filepath.append(p)
    return filepath
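A usage sketch: urls may be a single URL string or a nested list with one row of image URLs per record, and with the return added above the function yields one list of saved file paths per row (the URLs below are placeholders):

paths = process_img([
    ["http://example.com/1.jpg", "http://example.com/2.jpg"],  # images of record 1
    [],                                                         # record 2 has no images
])
# paths -> [["<RESOURCE_BASE_URL>/collect/img/1.jpg", ...], []]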
Example #14
    def _get_splited_train(self):
        """
        Prefer loading the segmented training set from file if it already exists.
        :return:
        """
        dir_ = os.path.join(TEXT_OUT, "split")
        if self.subjective:
            split_txt = os.path.join(dir_, self.__class__.__name__ + ".txt")
            training_datas = Load.load_training_balance()
        else:
            split_txt = os.path.join(dir_, self.__class__.__name__ + "_objective.txt")
            training_datas = Load.load_training_objective_balance()

        if self.f or not FileUtil.isexist(split_txt) or FileUtil.isempty(split_txt):
            # load the training set
            # each sentence also carries its class label
            splited_words_list = Feature.__split(flatten(training_datas))
#            splited_words_list = Feature.__del_low_frequency_word(splited_words_list)

            FileUtil.write(split_txt, splited_words_list)
        else:
            splited_words_list = FileUtil.read(split_txt)

        return splited_words_list
def clear_un_img():
    # directory where the downloaded images are stored
    all_img_url = os.path.join(RESOURCE_BASE_URL, "collect/img")
    # images referenced under this directory should be kept
    leave_img_url = os.path.join(RESOURCE_BASE_URL, "collect")

    if FileUtil.isempty(leave_img_url):
        FileUtil.empty(all_img_url)
    else:
        all_imgs = FileUtil.listdir(all_img_url)

        dirs = [leave_img_url]
        for parent, dirnames, filenames in os.walk(leave_img_url):
            for dirname in dirnames:
                dirs.append(os.path.join(parent, dirname))

        leave_imgs = []
        for dir_ in dirs:
            imglist = collect.read_weibo(dir_, isreadimg=True)
            imglist = flatten([img.get("img") for img in imglist if img.get("img")])
            leave_imgs += imglist

        # remove images that are no longer referenced by any weibo record
        for p in all_imgs:
            if p not in leave_imgs:
                os.remove(p)
    def read_train(self, path):
        def handle_read(datas):
            l = []
            d = dict()
            for data in datas:
                if data.startswith("sentence"):
                    d = dict()
                    d["sentence"] = data[data.find(":") + 1:]
                    l.append(d)
                elif data.startswith("img"):
                    d["img"] = filter(lambda x: x, data[data.find(":") + 1:].split(","))
                elif data.startswith("label"):
                    d["label"] = data[data.find(":") + 1:]
            return l

        path = path if path.startswith(RESOURCE_BASE_URL) else os.path.join(RESOURCE_BASE_URL, path)
        filenames = FileUtil.listdir(path, isrecursion=False)
        return flatten([CommonUtil.read_from_file(filename, handle_read) for filename in filenames])
Example #17
def read_weibo(path, isreadimg=False):
    def handle_read(datas):
        fit_datas = datas
        if not isreadimg:
            fit_datas = [data for data in datas if not data.startswith("img")]

        l = []
        d = dict()
        for data in fit_datas:
            if data.startswith("sentence"):
                d = dict()
                d["sentence"] = data[data.find(":") + 1:]
                l.append(d)
            elif data.startswith("img"):
                d["img"] = filter(lambda x: x, data[data.find(":") + 1:].split(","))
        return l

    path = path if path.startswith(RESOURCE_BASE_URL) else os.path.join(RESOURCE_BASE_URL, path)
    filenames = FileUtil.listdir(path, isrecursion=False)
    return flatten([read_from_file(filename, handle_read) for filename in filenames])
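Both read_weibo() and read_train() parse plain-text files in which every "sentence:" line starts a new record and the optional "img:" / "label:" lines attach to it. A hypothetical input file and the structure it would produce:

# contents of a file under RESOURCE_BASE_URL/collect/ (format assumed from handle_read):
#   sentence:the weather is nice today
#   img:/path/to/img/1.jpg,/path/to/img/2.jpg
#   sentence:a second sentence with no image
#
# read_weibo("collect", isreadimg=True) would then return something like:
#   [{"sentence": "the weather is nice today",
#     "img": ["/path/to/img/1.jpg", "/path/to/img/2.jpg"]},
#    {"sentence": "a second sentence with no image"}]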
Example #18
    def read_train(self, path):
        def handle_read(datas):
            l = []
            d = dict()
            for data in datas:
                if data.startswith("sentence"):
                    d = dict()
                    d["sentence"] = data[data.find(":") + 1:]
                    l.append(d)
                elif data.startswith("img"):
                    d["img"] = filter(lambda x: x,
                                      data[data.find(":") + 1:].split(","))
                elif data.startswith("label"):
                    d["label"] = data[data.find(":") + 1:]
            return l

        path = path if path.startswith(RESOURCE_BASE_URL) else os.path.join(
            RESOURCE_BASE_URL, path)
        filenames = FileUtil.listdir(path, isrecursion=False)
        return flatten([
            CommonUtil.read_from_file(filename, handle_read)
            for filename in filenames
        ])
Example #19
def read_weibo(path, isreadimg=False):
    def handle_read(datas):
        fit_datas = datas
        if not isreadimg:
            fit_datas = [data for data in datas if not data.startswith("img")]

        l = []
        d = dict()
        for data in fit_datas:
            if data.startswith("sentence"):
                d = dict()
                d["sentence"] = data[data.find(":") + 1:]
                l.append(d)
            elif data.startswith("img"):
                d["img"] = filter(lambda x: x,
                                  data[data.find(":") + 1:].split(","))
        return l

    path = path if path.startswith(RESOURCE_BASE_URL) else os.path.join(
        RESOURCE_BASE_URL, path)
    filenames = FileUtil.listdir(path, isrecursion=False)
    return flatten(
        [read_from_file(filename, handle_read) for filename in filenames])
Example #20
    train = train_datas
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    test = Load.load_test_balance()
    test_datas, test_label, _ = feature.get_key_words(test)
    test = test_datas
    # build a data set suitable for the Bayes classifier
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, test_label)

    crossvalidate = False
    # if not cross-validating, remember to adjust the train ratio loaded in load_sample.py
    if crossvalidate:
        out = os.path.join(TEXT_OUT, "best_train_test_index/test_index.txt")
        if not FileUtil.isexist(out) or FileUtil.isempty(out):
            clf0 = Classification()
            clf0.cross_validation(train, class_label, score="recall")
        test_index = np.loadtxt(out, dtype=int)
        test = train[test_index]
        test_label = np.asanyarray(class_label)[test_index].tolist()

    method_options = ("second", "four", "five")
    method_options_0 = ("B", "C", "D")
    linestyle = (':', '--', '-')
    plot.get_instance()
    for i in range(len(method_options)):
        bayes = IncrBayes()
        clf = Classification(bayes=bayes)
        clf.get_classificator(train, class_label, iscrossvalidate=crossvalidate,
                              isbalance=False, minority_target=EMOTION_CLASS.keys())
def count_img():
    # directory where the downloaded images are stored
    all_img_url = os.path.join(RESOURCE_BASE_URL, "collect/img")
    print "It's have %d images" % len(FileUtil.listdir(all_img_url))
Example #22
    def get_incr_classificator_thread(self, incr_datas, incr_class_label,
                                      test_datas, test_class_label):
        """
        Process the increment set for the incremental Bayes classifier.
        :param incr_datas: [{"emotion-1-type": value, "sentence": {}},...]
                            (emotion-1-type and sentence are optional)
        :param incr_class_label:
        :param test_datas:
        :param test_class_label:
        :return:
        """
        def func1(i0):
            c_true0 = incr_class_label[i0:i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = self.predict(text0)[0]

            if c_true0 == c_pred0:
                loss0 = 0
            else:
                clf0 = copy.deepcopy(self)
                clf0.bayes.class_log_prior_, clf0.bayes.feature_log_prob_ = clf0.bayes.update(
                    c_pred0, text0, copy=True)
                loss0 = clf0.metrics_my_zero_one_loss(test_datas)

                # clf0.bayes.class_log_prior_ = origin_class_log_prob_
                # clf0.bayes.feature_log_prob_ = origin_feature_log_prob_

            if lock1.acquire():
                text.append(text0)
                c_pred.append(c_pred0)
                loss.append(loss0)

                lock1.release()

        def func(i0):
            c_true0 = incr_class_label[i0:i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = self.predict(text0)[0]

            if c_true0 == c_pred0:
                loss0 = 0
            else:
                if lock0.acquire():
                    self.bayes.class_log_prior_, self.bayes.feature_log_prob_ = self.bayes.update(
                        c_pred0, text0, copy=True)
                    loss0 = self.metrics_my_zero_one_loss(test_datas)

                    self.bayes.class_log_prior_ = origin_class_log_prob_
                    self.bayes.feature_log_prob_ = origin_feature_log_prob_

                    lock0.release()

            if lock1.acquire():
                text.append(text0)
                c_pred.append(c_pred0)
                loss.append(loss0)

                lock1.release()

        print "Begin Increment Classification: ", time.strftime(
            '%Y-%m-%d %H:%M:%S')
        # write / read the model parameters
        dir_ = os.path.join(TEXT_OUT, "bayes_args")
        FileUtil.mkdirs(dir_)

        class_count_out = os.path.join(dir_, "class_count.txt")
        class_log_prob_out = os.path.join(dir_, "class_log_prob.txt")
        feature_count_out = os.path.join(dir_, "feature_count.txt")
        feature_log_prob_out = os.path.join(dir_, "feature_log_prob.txt")

        out = (class_count_out, class_log_prob_out, feature_count_out,
               feature_log_prob_out)

        if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
            if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(
                    self.bayes, "class_log_prior_"):
                raise ValueError(
                    "please use get_classificator() to get classificator firstly"
                )

            fit_incr_datas = self.fit_data(incr_datas)
            n_samples, _ = fit_incr_datas.shape
            incr_class_label = np.array(incr_class_label)

            lock0 = threading.Lock()
            lock1 = threading.Lock()

            # threadpool
            poolsize = 30
            pool = ThreadPool(poolsize)

            for i in range(n_samples):
                if i % 5 == 0:
                    print "Begin Increment Classification_%d: %s" % (
                        i / 5, time.strftime('%Y-%m-%d %H:%M:%S'))
                # classification loss; the candidate with the minimum loss is chosen
                loss = []
                # texts in the increment set preferred for updating the classifier parameters
                text = []
                # classes corresponding to those preferred texts
                c_pred = []
                # indices corresponding to those preferred texts
                # index = 0

                origin_class_log_prob_ = self.bayes.class_log_prior_
                origin_feature_log_prob_ = self.bayes.feature_log_prob_

                # threadpool
                requests = makeRequests(func, range(fit_incr_datas.shape[0]))
                [pool.putRequest(req) for req in requests]
                pool.wait()
                #                for i0 in range(fit_incr_datas.shape[0]):
                #                    threading.Thread(target=func, args=(i0, )).start()

                minindex = np.argmin(loss)
                self.bayes.update(c_pred[minindex], text[minindex])
                fit_incr_datas = sp.vstack([
                    fit_incr_datas[:minindex, :],
                    fit_incr_datas[minindex + 1:, :]
                ])

            bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                          self.bayes.feature_count_,
                          self.bayes.feature_log_prob_)
            map(lambda x: np.savetxt(x[0], x[1]), zip(out, bayes_args))
        else:
            self.bayes.class_count_ = np.loadtxt(out[0])
            self.bayes.class_log_prior_ = np.loadtxt(out[1])
            self.bayes.feature_count_ = np.loadtxt(out[2])
            self.bayes.feature_log_prob_ = np.loadtxt(out[3])

        print "Increment Classification Done: ", time.strftime(
            '%Y-%m-%d %H:%M:%S')
        return self
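The thread variant above fans func out over a thread pool and guards the shared result lists with threading.Lock. A minimal self-contained sketch of the same pattern, assuming the threadpool package that ThreadPool/makeRequests appear to come from (work() and the numbers are placeholders):

import threading
from threadpool import ThreadPool, makeRequests

results = []
lock = threading.Lock()

def work(i):
    value = i * i          # stand-in for the per-sample computation
    with lock:             # protect the shared list
        results.append(value)

pool = ThreadPool(8)
for req in makeRequests(work, range(100)):
    pool.putRequest(req)
pool.wait()                # block until every request has finished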
Example #23
    def cross_validation(self, train_datas, class_label, score='precision'):
        """
        K-Fold Cross Validation
        Use cross validation to tune the Bayes parameters and
        pick the train/test split with the best score, so the train
        and test sets do not have to be chosen in advance.
        :param train_datas:
        :param class_label:
        :param score:
        :return:
        """
        score_options = ('precision', 'recall', 'f1', 'accuracy')
        if score not in score_options:
            raise ValueError('score has to be one of ' + str(score_options))

        # fit data
        fit_train_datas = self.fit_data(train_datas)

        n_samples = fit_train_datas.shape[0]
        class_label = np.array(class_label)

        max_result = []
        max_index = []
        max_ = 0
        i = 0
        while (max_ < 0.6 and i <= 200):
            i += 1
            print "Seeking %d; max: %f; %s" % (
                i, max_, time.strftime('%Y-%m-%d %H:%M:%S'))

            result = []
            index = []
            cv = cross_validation.KFold(n_samples, n_folds=4, shuffle=True)

            for train_index, test_index in cv:
                train0, train0_label = fit_train_datas[
                    train_index], class_label[train_index]
                test0, test0_label = fit_train_datas[test_index], class_label[
                    test_index]
                self.get_classificator(train0, train0_label)
                c_pred0 = self.predict(test0)

                if score == "precision":
                    result.append(self.metrics_precision(test0_label, c_pred0))
                    index.append((train_index, test_index))
                elif score == "recall":
                    result.append(self.metrics_recall(test0_label, c_pred0))
                    index.append((train_index, test_index))
                elif score == "f1":
                    result.append(self.metrics_f1(test0_label, c_pred0))
                    index.append((train_index, test_index))
                else:
                    result.append(self.metrics_accuracy(test0_label, c_pred0))
                    index.append((train_index, test_index))

            max_ = max(result)
            max_result.append(max_)
            max_index.append(index[np.argmax(result)])

        argmax = np.argmax(max_result)
        print "Seeking Done; max: %f; %s" % (
            max_result[argmax], time.strftime('%Y-%m-%d %H:%M:%S'))

        # retrain on the best split to obtain the optimal parameters
        self.get_classificator(fit_train_datas[max_index[argmax][0]],
                               class_label[max_index[argmax][0]])

        dir_ = os.path.join(TEXT_OUT, "best_train_test_index")
        FileUtil.mkdirs(dir_)
        current = time.strftime('%Y-%m-%d %H:%M:%S')
        train_index_out = os.path.join(dir_, "train_index.txt")
        test_index_out = os.path.join(dir_, "test_index.txt")

        map(lambda x: np.savetxt(x[0], x[1], fmt="%d"),
            zip((train_index_out, test_index_out), (max_index[argmax])))
    def get_incr_classificator(self, incr_datas, incr_class_label, test_datas, test_class_label, method="first"):
        """
        Process the increment set for the incremental Bayes classifier.
        :param incr_datas: [{"emotion-1-type": value, "sentence": {}},...]
                            (emotion-1-type and sentence are optional)
        :param incr_class_label:
        :param test_datas:
        :param test_class_label:
        :return:
        """
        def func(x, y):
            block.append(fit_incr_datas[x[3] + 1: y[3], :])
            label_block.append(incr_class_label[x[3] + 1: y[3]])
            block0.append(fit_incr_datas[y[3]:y[3] + 1, :])
            return y

        def handle(clf, method):
            if method == "zero":
                return handle_zero(clf)
            elif method == "first":
                return handle_first(clf)
            elif method == "second":
                return handle_second(clf)
            elif method == "third":
                return handle_third(clf)
            elif method == "four":
                return handle_four(clf)
            elif method == "five":
                return handle_five(clf)
            else:
                pass

        def handle_zero(clf):
            """
            Find the samples that the current classifier already predicts correctly.
            :param clf:
            :return:
            """
            incr_pre_label = clf.predict(fit_incr_datas)
            # indices of the correctly predicted samples
            true_index = (incr_class_label == incr_pre_label).nonzero()

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)

            res = []
            for i0 in true_index[0]:
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = incr_pre_label[i0]

                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)

                res.append((loss0, text0, c_pred0, i0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return res

        def handle_first(clf):
            # the most basic way of computing the classification loss
            # classification loss; the candidate with the minimum loss is chosen
            loss = 9999
            # text in the increment set preferred for updating the classifier parameters
            text = None
            # class corresponding to that preferred text
            c_pred = None
            # index corresponding to that preferred text
            index = 0

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            for i0 in range(fit_incr_datas.shape[0]):
                c_true0 = incr_class_label[i0: i0 + 1][0]
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]
                if c_true0 == c_pred0:
                    loss = 0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                    break
                else:
                    clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                    test_proba = clf.predict_max_proba(test_datas)
                    loss0 = clf.metrics_my_zero_one_loss(test_proba)
                    if loss0 < loss:
                        loss = loss0
                        text = text0
                        c_pred = c_pred0
                        index = i0

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return [(loss, text, c_pred, index)]

        def handle_second(clf):
            # an alternative way of computing the classification loss
#            predict_true = handle(clf, "zero")
#            if predict_true:
#                return predict_true

            # classification loss; the candidate with the minimum loss is chosen
            loss = 9999
            # text in the increment set preferred for updating the classifier parameters
            text = None
            # class corresponding to that preferred text
            c_pred = None
            # index corresponding to that preferred text
            index = 0

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            for i0 in range(fit_incr_datas.shape[0]):
                c_true0 = incr_class_label[i0: i0 + 1][0]
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]

                if c_true0 == c_pred0:
                    loss = 0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                    break
                else:
                    clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                    test_proba = clf.predict_max_proba(test_datas)
                    loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                    if loss0 < loss:
                        loss = loss0
                        text = text0
                        c_pred = c_pred0
                        index = i0

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return [(loss, text, c_pred, index)]

        def handle_third(clf):
            # todo
            # how to choose a suitable threshold
            def get_fit(e0):
                # pick a suitable threshold
                return 20
#                while len((r >= e0).nonzero()[0]) == 0:
#                    e0 = int(e0 / 2)
#                return e0

            global e
            # class-support computation
            proba = clf.predict_proba(fit_incr_datas)
            label = clf.predict(fit_incr_datas)
            max_proba = np.max(proba, axis=1).reshape(-1, 1)
            second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
            # support ratio
            r = np.divide(max_proba, second_max_proba)
            # threshold
            e = get_fit(e)
            # select
            select_indices = (r >= e).nonzero()
            return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0]) for indice in select_indices[0]]

        def handle_third_another(clf):
            # class-support computation
            proba = clf.predict_proba(fit_incr_datas)
            label = clf.predict(fit_incr_datas)
            max_proba = np.max(proba, axis=1).reshape(-1, 1)
            leave_proba = np.sum(proba, axis=1).reshape(-1, 1) - max_proba
            # support ratio
            r = np.divide(max_proba, leave_proba)
            # threshold
            e = 5
            # select
            select_indices = (r >= e).nonzero()
            return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0]) for indice in select_indices[0]]

        def handle_four(clf):
            # My Own Idea
            # holds the results on the test set
            predict_true = handle(clf, "zero")
            if predict_true:
                return predict_true

            f_res = []

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            origin_label = clf.predict(test_datas)
            for i0 in range(fit_incr_datas.shape[0]):
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                label = clf.predict(test_datas)
                # take the predicted class into account:
                # a sample may have a high probability for some class both before and after update,
                # but the two predicted classes may differ
                smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
                np.multiply(test_proba, smooth, test_proba)

                f_test0 = pair_test(origin_proba, test_proba)
                if f_test0:
                    loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                else:
                    loss0 = -1
                f_res.append((loss0, text0, c_pred0, i0, f_test0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            res = filter(lambda x: x[4], f_res)
            return [(r[0], r[1], r[2], r[3]) for r in res]

        def handle_five(clf):
            """
            Combination of class support and the no-significant-difference test.
            :param clf:
            :return:
            """
            predict_true = handle(clf, "zero")
            if predict_true:
                return predict_true

            fit_for_class_support = handle(clf, "third")
            print "The result of class-support: %d samples" % len(fit_for_class_support)

#            fit_for_class_support = filter(lambda x: x[4] > clf.bayes.class_log_prior_[np.where(clf.bayes.classes_ == x[2])[0][0]], fit_for_class_support)
#            print "The result of class-support: %d samples" % len(fit_for_class_support)
            # My Own Idea
            # holds the results on the test set
            f_res = []

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            origin_label = clf.predict(test_datas)

            for i0 in range(len(fit_for_class_support)):
                text0 = fit_for_class_support[i0][1]
                c_pred0 = fit_for_class_support[i0][2]
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                label = clf.predict(test_datas)
                # take the predicted class into account:
                # a sample may have a high probability for some class both before and after update,
                # but the two predicted classes may differ
                smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
                np.multiply(test_proba, smooth, test_proba)

                f_test0 = pair_test(origin_proba, test_proba)
                if f_test0:
                    loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                else:
                    loss0 = -1
                f_res.append((loss0, text0, c_pred0, i0, f_test0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            res = filter(lambda x: x[4], f_res)
            return [(r[0], r[1], r[2], r[3]) for r in res]

        method_options = ("first", "second", "third", "four", "five")
        if method not in method_options:
            raise ValueError("method has to be one of " + str(method_options))

        print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # write / read the model parameters
        dir_ = os.path.join(TEXT_OUT, "bayes_args")
        FileUtil.mkdirs(dir_)

        suffix = ".blp"
        class_count_out = os.path.join(dir_, "class_count_" + method + suffix)
        class_log_prob_out = os.path.join(dir_, "class_log_prob_" + method + suffix)
        feature_count_out = os.path.join(dir_, "feature_count_" + method + suffix)
        feature_log_prob_out = os.path.join(dir_, "feature_log_prob_" + method + suffix)

        out = (class_count_out, class_log_prob_out, feature_count_out, feature_log_prob_out)

        if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
            if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
                raise ValueError("please use get_classificator() to get classificator firstly")

            fit_incr_datas = self.fit_data(incr_datas)
            incr_class_label = np.asanyarray(incr_class_label)
            # data that should be appended to the key_words.txt file
            add_to_key_words = []

            i = 0
            while fit_incr_datas.nnz > 0:
                print
                print "Begin Increment Classification_%d: %s" % (i, time.strftime('%Y-%m-%d %H:%M:%S'))

                need_to_update = handle(self, method)
                # if nothing can be updated, the remaining increment set does not fit the current classifier and is discarded
                # the increment set keeps shrinking during updates
                block = []
                label_block = []
                # the training set keeps growing during updates
                block0 = []
                if need_to_update:
                    # sort by loss in ascending order
                    accord_to_loss = sorted(need_to_update, key=lambda x: x[0])
                    for data in accord_to_loss:
                        self.bayes.update(data[2], data[1])
                    # sort by index
                    accord_to_index = sorted(need_to_update, key=lambda x: x[3])

#                    index = [index0[3] for index0 in accord_to_index]
#                    [add_to_key_words.append(raw_incr_datas[index0]) for index0 in index]
#                    raw_incr_datas = [raw for index0, raw in enumerate(raw_incr_datas) if index0 not in index]

                    block0.append(test_datas)
                    reduce(func, accord_to_index, (0.0, "", "", -1))
                    block.append(fit_incr_datas[accord_to_index[-1][3] + 1:, :])
                    label_block.append(incr_class_label[accord_to_index[-1][3] + 1:])
                    test_datas = sp.vstack(block0)
                    print "This times updates %d samples" % len(need_to_update)
                else:
                    block.append(fit_incr_datas[0:0, :])
                    label_block.append(incr_class_label[0:0])
                    print "Finally leaving %d samples that unnecessary added to train sets" % fit_incr_datas.shape[0]
                fit_incr_datas = sp.vstack(block)
                incr_class_label = np.concatenate(label_block)
                i += 1

            bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                          self.bayes.feature_count_, self.bayes.feature_log_prob_)
            # save to disk
            map(lambda x: bp.pack_ndarray_file(x[0], x[1]), zip(bayes_args, out))
            # append
#            path = os.path.join(TEXT_OUT, "key_words/CHIFeature.txt")
#            FileUtil.write(path, add_to_key_words, "a")
        else:
            # speed up
            self.bayes.class_count_ = bp.unpack_ndarray_file(out[0])
            self.bayes.class_log_prior_ = bp.unpack_ndarray_file(out[1])
            self.bayes.feature_count_ = bp.unpack_ndarray_file(out[2])
            self.bayes.feature_log_prob_ = bp.unpack_ndarray_file(out[3])

#            self.bayes.class_count_ = np.loadtxt(out[0])
#            self.bayes.class_log_prior_ = np.loadtxt(out[1])
#            self.bayes.feature_count_ = np.loadtxt(out[2])
#            self.bayes.feature_log_prob_ = np.loadtxt(out[3])

        print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
        print
        return self
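All the handle_* helpers above follow the same snapshot / trial-update / restore pattern: save class_log_prior_ and feature_log_prob_, tentatively update() the model with one candidate sample, score the result on the test set, then roll the parameters back before trying the next candidate. A minimal self-contained sketch of that idea with toy names (not the project's API):

import copy

def pick_best_candidate(model, candidates, score):
    """model: any object whose update(sample) mutates its parameters in place.
    candidates: iterable of samples; score(model) returns a loss (lower is better)."""
    best_sample, best_loss = None, float("inf")
    for sample in candidates:
        trial = copy.deepcopy(model)   # snapshot by copying the current parameters
        trial.update(sample)           # tentative update with this candidate
        loss = score(trial)            # evaluate on held-out data
        if loss < best_loss:
            best_sample, best_loss = sample, loss
    return best_sample, best_loss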
Example #25
    def get_incr_classificator(self,
                               incr_datas,
                               incr_class_label,
                               test_datas,
                               test_class_label,
                               method="first"):
        """
        Process the increment set for the incremental Bayes classifier.
        :param incr_datas: [{"emotion-1-type": value, "sentence": {}},...]
                            (emotion-1-type and sentence are optional)
        :param incr_class_label:
        :param test_datas:
        :param test_class_label:
        :return:
        """
        def func(x, y):
            block.append(fit_incr_datas[x[3] + 1:y[3], :])
            label_block.append(incr_class_label[x[3] + 1:y[3]])
            block0.append(fit_incr_datas[y[3]:y[3] + 1, :])
            return y

        def handle(clf, method):
            if method == "zero":
                return handle_zero(clf)
            elif method == "first":
                return handle_first(clf)
            elif method == "second":
                return handle_second(clf)
            elif method == "third":
                return handle_third(clf)
            elif method == "four":
                return handle_four(clf)
            elif method == "five":
                return handle_five(clf)
            else:
                pass

        def handle_zero(clf):
            """
            Find the samples that the current classifier already predicts correctly.
            :param clf:
            :return:
            """
            incr_pre_label = clf.predict(fit_incr_datas)
            # indices of the correctly predicted samples
            true_index = (incr_class_label == incr_pre_label).nonzero()

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)

            res = []
            for i0 in true_index[0]:
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = incr_pre_label[i0]

                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                    c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_another_zero_one_loss(
                    origin_proba, test_proba)

                res.append((loss0, text0, c_pred0, i0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return res

        def handle_first(clf):
            # the most basic way of computing the classification loss
            # classification loss; the candidate with the minimum loss is chosen
            loss = 9999
            # text in the increment set preferred for updating the classifier parameters
            text = None
            # class corresponding to that preferred text
            c_pred = None
            # index corresponding to that preferred text
            index = 0

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            for i0 in range(fit_incr_datas.shape[0]):
                c_true0 = incr_class_label[i0:i0 + 1][0]
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]
                if c_true0 == c_pred0:
                    loss = 0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                    break
                else:
                    clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                        c_pred0, text0, copy=True)
                    test_proba = clf.predict_max_proba(test_datas)
                    loss0 = clf.metrics_my_zero_one_loss(test_proba)
                    if loss0 < loss:
                        loss = loss0
                        text = text0
                        c_pred = c_pred0
                        index = i0

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return [(loss, text, c_pred, index)]

        def handle_second(clf):
            # an alternative way of computing the classification loss
            #            predict_true = handle(clf, "zero")
            #            if predict_true:
            #                return predict_true

            # classification loss; the candidate with the minimum loss is chosen
            loss = 9999
            # text in the increment set preferred for updating the classifier parameters
            text = None
            # class corresponding to that preferred text
            c_pred = None
            # index corresponding to that preferred text
            index = 0

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            for i0 in range(fit_incr_datas.shape[0]):
                c_true0 = incr_class_label[i0:i0 + 1][0]
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]

                if c_true0 == c_pred0:
                    loss = 0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                    break
                else:
                    clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                        c_pred0, text0, copy=True)
                    test_proba = clf.predict_max_proba(test_datas)
                    loss0 = clf.metrics_another_zero_one_loss(
                        origin_proba, test_proba)
                    if loss0 < loss:
                        loss = loss0
                        text = text0
                        c_pred = c_pred0
                        index = i0

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return [(loss, text, c_pred, index)]

        def handle_third(clf):
            # todo
            # how to choose a suitable threshold
            def get_fit(e0):
                # pick a suitable threshold
                return 20
#                while len((r >= e0).nonzero()[0]) == 0:
#                    e0 = int(e0 / 2)
#                return e0

            global e
            # class-support computation
            proba = clf.predict_proba(fit_incr_datas)
            label = clf.predict(fit_incr_datas)
            max_proba = np.max(proba, axis=1).reshape(-1, 1)
            second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
            # support ratio
            r = np.divide(max_proba, second_max_proba)
            # threshold
            e = get_fit(e)
            # select
            select_indices = (r >= e).nonzero()
            return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice,
                     max_proba[indice][0]) for indice in select_indices[0]]

        def handle_third_another(clf):
            # class-support computation
            proba = clf.predict_proba(fit_incr_datas)
            label = clf.predict(fit_incr_datas)
            max_proba = np.max(proba, axis=1).reshape(-1, 1)
            leave_proba = np.sum(proba, axis=1).reshape(-1, 1) - max_proba
            # support ratio
            r = np.divide(max_proba, leave_proba)
            # threshold
            e = 5
            # select
            select_indices = (r >= e).nonzero()
            return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice,
                     max_proba[indice][0]) for indice in select_indices[0]]

        def handle_four(clf):
            # My Own Idea
            # holds the results on the test set
            predict_true = handle(clf, "zero")
            if predict_true:
                return predict_true

            f_res = []

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            origin_label = clf.predict(test_datas)
            for i0 in range(fit_incr_datas.shape[0]):
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                    c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                label = clf.predict(test_datas)
                # take the predicted class into account:
                # a sample may have a high probability for some class both before and after update,
                # but the two predicted classes may differ
                smooth = np.asarray([
                    1 if origin_label[j] == label[j] else -1
                    for j in range(len(origin_label))
                ])
                np.multiply(test_proba, smooth, test_proba)

                f_test0 = pair_test(origin_proba, test_proba)
                if f_test0:
                    loss0 = clf.metrics_another_zero_one_loss(
                        origin_proba, test_proba)
                else:
                    loss0 = -1
                f_res.append((loss0, text0, c_pred0, i0, f_test0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            res = filter(lambda x: x[4], f_res)
            return [(r[0], r[1], r[2], r[3]) for r in res]

        def handle_five(clf):
            """
            Combination of class support and the no-significant-difference test.
            :param clf:
            :return:
            """
            predict_true = handle(clf, "zero")
            if predict_true:
                return predict_true

            fit_for_class_support = handle(clf, "third")
            print "The result of class-support: %d samples" % len(
                fit_for_class_support)

            #            fit_for_class_support = filter(lambda x: x[4] > clf.bayes.class_log_prior_[np.where(clf.bayes.classes_ == x[2])[0][0]], fit_for_class_support)
            #            print "The result of class-support: %d samples" % len(fit_for_class_support)
            # My Own Idea
            # holds the results on the test set
            f_res = []

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            origin_label = clf.predict(test_datas)

            for i0 in range(len(fit_for_class_support)):
                text0 = fit_for_class_support[i0][1]
                c_pred0 = fit_for_class_support[i0][2]
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                    c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                label = clf.predict(test_datas)
                # take the predicted class into account:
                # a sample may have a high probability for some class both before and after update,
                # but the two predicted classes may differ
                smooth = np.asarray([
                    1 if origin_label[j] == label[j] else -1
                    for j in range(len(origin_label))
                ])
                np.multiply(test_proba, smooth, test_proba)

                f_test0 = pair_test(origin_proba, test_proba)
                if f_test0:
                    loss0 = clf.metrics_another_zero_one_loss(
                        origin_proba, test_proba)
                else:
                    loss0 = -1
                f_res.append((loss0, text0, c_pred0, i0, f_test0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            res = filter(lambda x: x[4], f_res)
            return [(r[0], r[1], r[2], r[3]) for r in res]

        method_options = ("first", "second", "third", "four", "five")
        if method not in method_options:
            raise ValueError("method has to be one of " + str(method_options))

        print "Begin Increment Classification: ", time.strftime(
            '%Y-%m-%d %H:%M:%S')
        # write / read the model parameters
        dir_ = os.path.join(TEXT_OUT, "bayes_args")
        FileUtil.mkdirs(dir_)

        suffix = ".blp"
        class_count_out = os.path.join(dir_, "class_count_" + method + suffix)
        class_log_prob_out = os.path.join(dir_,
                                          "class_log_prob_" + method + suffix)
        feature_count_out = os.path.join(dir_,
                                         "feature_count_" + method + suffix)
        feature_log_prob_out = os.path.join(
            dir_, "feature_log_prob_" + method + suffix)

        out = (class_count_out, class_log_prob_out, feature_count_out,
               feature_log_prob_out)

        if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
            if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(
                    self.bayes, "class_log_prior_"):
                raise ValueError(
                    "please use get_classificator() to get classificator firstly"
                )

            fit_incr_datas = self.fit_data(incr_datas)
            incr_class_label = np.asanyarray(incr_class_label)
            # data that should be appended to the key_words.txt file
            add_to_key_words = []

            i = 0
            while fit_incr_datas.nnz > 0:
                print
                print "Begin Increment Classification_%d: %s" % (
                    i, time.strftime('%Y-%m-%d %H:%M:%S'))

                need_to_update = handle(self, method)
                # if nothing can be updated, the remaining increment set does not fit the current classifier and is discarded
                # the increment set keeps shrinking during updates
                block = []
                label_block = []
                # the training set keeps growing during updates
                block0 = []
                if need_to_update:
                    # sort by loss in ascending order
                    accord_to_loss = sorted(need_to_update, key=lambda x: x[0])
                    for data in accord_to_loss:
                        self.bayes.update(data[2], data[1])
                    # sort by index
                    accord_to_index = sorted(need_to_update,
                                             key=lambda x: x[3])

                    #                    index = [index0[3] for index0 in accord_to_index]
                    #                    [add_to_key_words.append(raw_incr_datas[index0]) for index0 in index]
                    #                    raw_incr_datas = [raw for index0, raw in enumerate(raw_incr_datas) if index0 not in index]

                    block0.append(test_datas)
                    reduce(func, accord_to_index, (0.0, "", "", -1))
                    block.append(fit_incr_datas[accord_to_index[-1][3] +
                                                1:, :])
                    label_block.append(
                        incr_class_label[accord_to_index[-1][3] + 1:])
                    test_datas = sp.vstack(block0)
                    print "This times updates %d samples" % len(need_to_update)
                else:
                    block.append(fit_incr_datas[0:0, :])
                    label_block.append(incr_class_label[0:0])
                    print "Finally leaving %d samples that unnecessary added to train sets" % fit_incr_datas.shape[
                        0]
                fit_incr_datas = sp.vstack(block)
                incr_class_label = np.concatenate(label_block)
                i += 1

            bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                          self.bayes.feature_count_,
                          self.bayes.feature_log_prob_)
            # save to disk
            map(lambda x: bp.pack_ndarray_file(x[0], x[1]),
                zip(bayes_args, out))
            # append
#            path = os.path.join(TEXT_OUT, "key_words/CHIFeature.txt")
#            FileUtil.write(path, add_to_key_words, "a")
        else:
            # speed up
            self.bayes.class_count_ = bp.unpack_ndarray_file(out[0])
            self.bayes.class_log_prior_ = bp.unpack_ndarray_file(out[1])
            self.bayes.feature_count_ = bp.unpack_ndarray_file(out[2])
            self.bayes.feature_log_prob_ = bp.unpack_ndarray_file(out[3])


#            self.bayes.class_count_ = np.loadtxt(out[0])
#            self.bayes.class_log_prior_ = np.loadtxt(out[1])
#            self.bayes.feature_count_ = np.loadtxt(out[2])
#            self.bayes.feature_log_prob_ = np.loadtxt(out[3])

        print "Increment Classification Done: ", time.strftime(
            '%Y-%m-%d %H:%M:%S')
        print
        return self
    def cross_validation(self, train_datas, class_label, score='precision'):
        """
        K-Fold Cross Validation
        Use cross validation to tune the Bayes parameters and
        pick the train/test split with the best score, so the train
        and test sets do not have to be chosen in advance.
        :param train_datas:
        :param class_label:
        :param score:
        :return:
        """
        score_options = ('precision', 'recall', 'f1', 'accuracy')
        if score not in score_options:
            raise ValueError('score has to be one of ' +
                             str(score_options))

        # fit data
        fit_train_datas = self.fit_data(train_datas)

        n_samples = fit_train_datas.shape[0]
        class_label = np.array(class_label)

        max_result = []
        max_index = []
        max_ = 0
        i = 0
        while(max_ < 0.6 and i <= 200):
            i += 1
            print "Seeking %d; max: %f; %s" % (i, max_, time.strftime('%Y-%m-%d %H:%M:%S'))

            result = []
            index = []
            cv = cross_validation.KFold(n_samples, n_folds=4, shuffle=True)

            for train_index, test_index in cv:
                train0, train0_label = fit_train_datas[train_index], class_label[train_index]
                test0, test0_label = fit_train_datas[test_index], class_label[test_index]
                self.get_classificator(train0, train0_label)
                c_pred0 = self.predict(test0)

                if score == "precision":
                    result.append(self.metrics_precision(test0_label, c_pred0))
                    index.append((train_index, test_index))
                elif score == "recall":
                    result.append(self.metrics_recall(test0_label, c_pred0))
                    index.append((train_index, test_index))
                elif score == "f1":
                    result.append(self.metrics_f1(test0_label, c_pred0))
                    index.append((train_index, test_index))
                else:
                    result.append(self.metrics_accuracy(test0_label, c_pred0))
                    index.append((train_index, test_index))

            max_ = max(result)
            max_result.append(max_)
            max_index.append(index[np.argmax(result)])

        argmax = np.argmax(max_result)
        print "Seeking Done; max: %f; %s" % (max_result[argmax], time.strftime('%Y-%m-%d %H:%M:%S'))

        # Retrain on the best split once more to obtain the optimal parameters
        self.get_classificator(fit_train_datas[max_index[argmax][0]], class_label[max_index[argmax][0]])

        dir_ = os.path.join(TEXT_OUT, "best_train_test_index")
        FileUtil.mkdirs(dir_)
        current = time.strftime('%Y-%m-%d %H:%M:%S')
        train_index_out = os.path.join(dir_, "train_index.txt")
        test_index_out = os.path.join(dir_, "test_index.txt")

        map(lambda x: np.savetxt(x[0], x[1], fmt="%d"),
            zip((train_index_out, test_index_out), max_index[argmax]))
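
    # Hypothetical usage sketch for cross_validation (the variable names below are
    # assumed, not taken from this example):
    #     clf.cross_validation(train_datas, class_label, score='f1')
    # The loop keeps drawing fresh 4-fold splits until one fold scores at least 0.6
    # (or 200 attempts pass), refits on the best-scoring training split, and saves
    # the chosen train/test indices with np.savetxt.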
    def get_incr_classificator_thread(self, incr_datas, incr_class_label, test_datas, test_class_label):
        """
        Process the increment set for the incremental Bayes classifier (threaded version).
        :param incr_datas: [{"emotion-1-type": value, "sentence": {}},...]
                            (emotion-1-type and sentence are optional)
        :param incr_class_label:
        :param test_datas:
        :param test_class_label:
        :return:
        """
        def func1(i0):
            c_true0 = incr_class_label[i0: i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = self.predict(text0)[0]

            if c_true0 == c_pred0:
                loss0 = 0
            else:
                clf0 = copy.deepcopy(self)
                clf0.bayes.class_log_prior_, clf0.bayes.feature_log_prob_ = clf0.bayes.update(c_pred0, text0, copy=True)
                loss0 = clf0.metrics_my_zero_one_loss(test_datas)

                # clf0.bayes.class_log_prior_ = origin_class_log_prob_
                # clf0.bayes.feature_log_prob_ = origin_feature_log_prob_

            if lock1.acquire():
                text.append(text0)
                c_pred.append(c_pred0)
                loss.append(loss0)

                lock1.release()

        def func(i0):
            c_true0 = incr_class_label[i0: i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = self.predict(text0)[0]

            if c_true0 == c_pred0:
                loss0 = 0
            else:
                if lock0.acquire():
                    self.bayes.class_log_prior_, self.bayes.feature_log_prob_ = self.bayes.update(c_pred0, text0, copy=True)
                    loss0 = self.metrics_my_zero_one_loss(test_datas)

                    self.bayes.class_log_prior_ = origin_class_log_prob_
                    self.bayes.feature_log_prob_ = origin_feature_log_prob_

                    lock0.release()

            if lock1.acquire():
                text.append(text0)
                c_pred.append(c_pred0)
                loss.append(loss0)

                lock1.release()

        print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # Write/read the Bayes parameters to/from files
        dir_ = os.path.join(TEXT_OUT, "bayes_args")
        FileUtil.mkdirs(dir_)

        class_count_out = os.path.join(dir_, "class_count.txt")
        class_log_prob_out = os.path.join(dir_, "class_log_prob.txt")
        feature_count_out = os.path.join(dir_, "feature_count.txt")
        feature_log_prob_out = os.path.join(dir_, "feature_log_prob.txt")

        out = (class_count_out, class_log_prob_out, feature_count_out, feature_log_prob_out)

        if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
            if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
                raise ValueError("please use get_classificator() to get classificator firstly")

            fit_incr_datas = self.fit_data(incr_datas)
            n_samples, _ = fit_incr_datas.shape
            incr_class_label = np.array(incr_class_label)

            lock0 = threading.Lock()
            lock1 = threading.Lock()

            # threadpool
            poolsize = 30
            pool = ThreadPool(poolsize)

            for i in range(n_samples):
                if i % 5 == 0:
                    print "Begin Increment Classification_%d: %s" % (i / 5, time.strftime('%Y-%m-%d %H:%M:%S'))
                # Classification loss; the sample with the minimum loss is handled first
                loss = []
                # Texts from the increment set, preferred candidates for updating the classifier parameters
                text = []
                # The predicted labels corresponding to those texts
                c_pred = []
                # The indices corresponding to those texts
                # index = 0

                origin_class_log_prob_ = self.bayes.class_log_prior_
                origin_feature_log_prob_ = self.bayes.feature_log_prob_

                # threadpool
                requests = makeRequests(func, range(fit_incr_datas.shape[0]))
                [pool.putRequest(req) for req in requests]
                pool.wait()
#                for i0 in range(fit_incr_datas.shape[0]):
#                    threading.Thread(target=func, args=(i0, )).start()

                minindex = np.argmin(loss)
                self.bayes.update(c_pred[minindex], text[minindex])
                fit_incr_datas = sp.vstack([fit_incr_datas[:minindex, :], fit_incr_datas[minindex + 1:, :]])

            bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                          self.bayes.feature_count_, self.bayes.feature_log_prob_)
            map(lambda x: np.savetxt(x[0], x[1]), zip(out, bayes_args))
        else:
            self.bayes.class_count_ = np.loadtxt(out[0])
            self.bayes.class_log_prior_ = np.loadtxt(out[1])
            self.bayes.feature_count_ = np.loadtxt(out[2])
            self.bayes.feature_log_prob_ = np.loadtxt(out[3])

        print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
        return self
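
# A minimal, self-contained sketch of the thread-pool pattern used above.
# Assumptions: the third-party `threadpool` package is installed, and the
# `work` function below is only a placeholder task, not part of the example.
import threading
from threadpool import ThreadPool, makeRequests

results = []
lock = threading.Lock()

def work(i):
    # placeholder task: square the index
    value = i * i
    with lock:                      # mirror the lock-protected appends above
        results.append((i, value))

pool = ThreadPool(4)                # 4 worker threads
[pool.putRequest(req) for req in makeRequests(work, range(10))]
pool.wait()                         # block until every queued request is done
print sorted(results)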
Example #28
0
    def _collect(self, splited_words_list, sentence_size):
        dir_ = os.path.join(TEXT_OUT, "key_words")
        if self.subjective:
            key_words_txt = os.path.join(dir_, self.__class__.__name__ + ".txt")
        else:
            key_words_txt = os.path.join(dir_, self.__class__.__name__ + "_objective.txt")
#        def norm(word_scores):
#            """
#            Normalize on a per-sample basis (normalization/regularization).
#            The main idea is to compute the p-norm of each sample and divide every
#            element of the sample by that norm, so that the p-norm (l1-norm, l2-norm)
#            of each processed sample equals 1.
#
#            The p-norm is computed as: ||X||p = (|x1|^p + |x2|^p + ... + |xn|^p)^(1/p)
#
#            This method is mainly used in text classification and clustering.
#
#            :param word_scores: a dict {word: score}
#            """
#            p = 0.0
#            for v in word_scores.values():
#                p += math.pow(math.fabs(v), 2)
#            p = math.pow(p, 1.0 / 2)
#
#            for k, v in word_scores.items():
#                word_scores[k] = v / p

#        def reduce_dim(word_scores):
#            """
#            Dimensionality reduction: keep the feature words whose accumulated weight makes up most of the total
#            """
#            _size = len(word_scores)
#            _max = math.pow(_size, 1.0 / 2) * 0.85
#            res = {}
#            # Sort in descending order
#            sort = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
#            _sum = 0.0
#            for k, v in sort:
#                if(_sum > _max):
#                    break
#                res[k] = v
#                _sum += v
#            return res

        if not self.istrain or self.f or not FileUtil.isexist(key_words_txt) or FileUtil.isempty(key_words_txt):
            print "Cal Scores: ", time.strftime('%Y-%m-%d %H:%M:%S')
            if len(splited_words_list) == sentence_size:
                train_range = slice(sentence_size)
            else:
                train_range = slice(sentence_size, len(splited_words_list))

            # Collect the texts under every class
            all_class_datas = Feature.all_class_text(splited_words_list[train_range], self.getclasses())

            # Get the class labels
            class_label = [d.get("emotion-1-type") for d in splited_words_list[: sentence_size]]

            # return term/frequency or term/score
            res = []
            for splited_words_dict in splited_words_list[0: sentence_size]:
                splited_words = splited_words_dict.get("sentence")
                label = splited_words_dict.get("emotion-1-type")
                # Compute each word's score: scores = {word: [score, frequency], ...}
                scores = {splited_word: [self.cal_score(splited_word, splited_words, label, all_class_datas,
                                                        [d.get("sentence") for d in splited_words_list[train_range]]),
                                         frequency]
                          for splited_word, frequency in splited_words.items()}
                # Normalization
                # norm(scores)
                # Dimensionality reduction
                sorted_words = scores
#                if not self.istrain:
#                    sorted_words = reduce_dim(scores)

                # Collection
                # if False return term/score
                # if True  return term/frequency
#                if False:
#                    for k in sorted_words.keys():
#                        sorted_words[k] = splited_words.count(k)

                res.append({"sentence": sorted_words,
                            "emotion-1-type": splited_words_dict.get("emotion-1-type")})
            print "Cal Scores Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
            # FileUtil.write(TEST_BASE_URL + "scores.txt", res)
            print "Begin Normalization: ", time.strftime('%Y-%m-%d %H:%M:%S')
            # Normalization
            self.norm(res)
            # FileUtil.write(TEST_BASE_URL + "norm.txt", res)
            print "Normalization Done: ", time.strftime('%Y-%m-%d %H:%M:%S')

            print "Begin Reduce: ", time.strftime('%Y-%m-%d %H:%M:%S')
            # Dimensionality reduction
            self.reduce_dim(res)
            print "Reduce Done: ", time.strftime('%Y-%m-%d %H:%M:%S')

            # Try Convert term/score to term/frequency
            # if False return term/score
            # if True  return term/frequency
            for d in res:
                ws = d.get("sentence")
                for k, v in ws.items():
                    ws[k] = v[0]
                    if True:
                        ws[k] = v[1]

            # During word segmentation or dimensionality reduction, a sample may end up
            # empty because it lacks enough informative key words; collect the indices of such samples
            danger_index = []
            res = filter(lambda x: danger_index.append(x[0]) if not x[1].get("sentence") else x,
                         enumerate(res))
            res = list(zip(*res)[1])

            class_label = [c for i, c in enumerate(class_label)
                           if i not in danger_index]

            # Write the result to file
            if self.istrain:
                FileUtil.write(key_words_txt, res)
        else:
            res = FileUtil.read(key_words_txt)
            class_label = [r["emotion-1-type"] for r in res]
            danger_index = []

        # Print statistics
        if False:
            self.__print_top_key_word(res)
        return res, class_label, danger_index
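
# A small standalone sketch of the per-sample L2 normalization described in the
# commented-out `norm` helper above; the data below is illustrative only.
import math

def l2_normalize(word_scores):
    # divide every score by the sample's l2-norm so the resulting norm equals 1
    norm = math.sqrt(sum(v * v for v in word_scores.values()))
    if norm > 0:
        for k in word_scores:
            word_scores[k] /= norm

scores = {"happy": 3.0, "sad": 4.0}
l2_normalize(scores)
print scores    # the scores become 0.6 and 0.8, giving a unit l2-norm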
Example #29
0
def classifict(feature, sentences, incr=False, out=False):
    if isinstance(sentences, basestring):
        sentences = [sentences]

    # Get the subjective/objective classifier
    feature.subjective = False
    objective_clf = get_objective_classification(feature)

    # Test set
    # Subjective/objective part
    test_datas_objective, c_true_objective, danger_index_objective = feature.get_key_words(sentences)

    test_objective = test_datas_objective
    if not sp.issparse(test_datas_objective):
        test_objective = feature.cal_weight_improve(test_datas_objective, c_true_objective)

    c_pred_objective = objective_clf.predict(test_objective)

    # Get the emotion classifier
    feature.subjective = True
    emotion_clf = get_emotion_classification(feature, incr=incr)

    # Test set
    # Emotion part
    test_datas, c_true, danger_index = feature.get_key_words(sentences)

    test = test_datas
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, c_true)

    c_pred = []
    for i in range(len(sentences)):
        if i not in danger_index_objective and i not in danger_index:
            before_i_in_danger_obj = np.sum(np.asarray(danger_index_objective) < i)
            before_i_in_danger_ = np.sum(np.asarray(danger_index) < i)

            c = emotion_clf.predict(test[i - before_i_in_danger_])[0] if c_pred_objective[i - before_i_in_danger_obj] == "Y"\
                else c_pred_objective[i - before_i_in_danger_obj]
            c_pred.append(c)

    if out:
        dir_ = os.path.join(OUT_BASE_URL, "out0")
        FileUtil.mkdirs(dir_)
        current = time.strftime('%Y-%m-%d %H:%M:%S')
        o = os.path.join(dir_, current + ".xml")

        with open(o, "w") as fp:
            for i, s in enumerate(sentences):
                if i not in danger_index_objective and i not in danger_index:
                    before_i_in_danger_obj = np.sum(np.asarray(danger_index_objective) < i)
                    before_i_in_danger_ = np.sum(np.asarray(danger_index) < i)
                    fp.write(
                        """<weibo emotion-type="%s">
    <sentence emotion-1-type="%s" emotion-2-type="none" emotion-tag="%s">
        %s
    </sentence>
</weibo>
""" % (c_pred[i - before_i_in_danger_], c_pred[i - before_i_in_danger_], "N" if c_pred_objective[i - before_i_in_danger_obj] == "N" else "Y", s))
                else:
                    fp.write(
                        """<weibo emotion-type="%s">
    <sentence emotion-1-type="%s" emotion-2-type="none" emotion-tag="%s">
        %s
    </sentence>
</weibo>
""" % ("None", "None", "N", s + "\n Can't recognize because it has insufficient key_words"))

    else:
        print c_pred
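
# A minimal illustration (with made-up data) of the index-offset trick used in
# `classifict` above: sentences dropped for lack of key words (danger_index)
# shorten the prediction arrays, so an original index i must be shifted left by
# the number of dropped indices that precede it.
import numpy as np

danger_index = [1, 4]                       # dropped sentence positions (illustrative)
predictions = ["like", "sadness", "anger"]  # predictions for the 3 surviving sentences (illustrative)

for i in range(5):
    if i not in danger_index:
        offset = np.sum(np.asarray(danger_index) < i)
        print "sentence %d -> prediction %s" % (i, predictions[i - offset])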
Example #30
0
def count_img():
    # Path where the images are stored
    all_img_url = os.path.join(RESOURCE_BASE_URL, "collect/img")
    print "It's have %d images" % len(FileUtil.listdir(all_img_url))