def clear_un_img():
    """Delete collected images that are no longer referenced by any weibo data.

    Images live under ``collect/img``; an image is kept only when some weibo
    record under ``collect`` (or any of its sub-directories) still references
    it via its ``img`` field.
    """
    # Directory holding all downloaded images.
    all_img_url = os.path.join(RESOURCE_BASE_URL, "collect/img")
    # Weibo data under this directory decides which images must be kept.
    leave_img_url = os.path.join(RESOURCE_BASE_URL, "collect")
    if FileUtil.isempty(leave_img_url):
        # No weibo data at all -> nothing is referenced; drop every image.
        FileUtil.empty(all_img_url)
    else:
        all_imgs = FileUtil.listdir(all_img_url)
        # The data directory plus every sub-directory beneath it.
        dirs = [leave_img_url]
        for parent, dirnames, _ in os.walk(leave_img_url):
            for dirname in dirnames:
                dirs.append(os.path.join(parent, dirname))
        # Gather every image path still referenced by the weibo records.
        # A set makes the per-file membership test below O(1) instead of the
        # original O(n) list scan.
        leave_imgs = set()
        for dir_ in dirs:
            imglist = collect.read_weibo(dir_, isreadimg=True)
            leave_imgs.update(
                flatten([img.get("img") for img in imglist if img.get("img")]))
        # Remove the unreferenced images.  An explicit loop replaces the old
        # map(lambda ...) call: map() for side effects builds a throwaway list
        # on Python 2 and would silently do nothing on Python 3.
        for img_path in all_imgs:
            if img_path not in leave_imgs:
                os.remove(img_path)
def get_classificator(self, train_datas, class_label, iscrossvalidate=False,
                      isbalance=False, minority_target=None):
    """Train the underlying bayes classifier and return ``self``.

    :param train_datas: raw training samples; converted via ``self.fit_data``
    :param class_label: class label for each training sample
    :param iscrossvalidate: if True, restrict training to the indices saved by
        a previous ``cross_validation()`` run
    :param isbalance: if True, re-balance the training data with SMOTE
    :param minority_target: minority class(es); only used when ``isbalance``
    :return: self (fluent style, so calls can be chained)
    :raises ValueError: when ``iscrossvalidate`` is set but the saved index
        file does not exist or is empty
    """
    out = os.path.join(TEXT_OUT, "best_train_test_index/train_index.txt")
    if iscrossvalidate and (not FileUtil.isexist(out) or FileUtil.isempty(out)):
        raise ValueError("please use cross_validation() firstly")
    # fit data
    fit_train_datas = self.fit_data(train_datas)
    class_label = np.asarray(class_label)
    if iscrossvalidate:
        train_index = np.loadtxt(out, dtype=int)
    else:
        # Use every sample; np.arange avoids materialising a Python range
        # before converting it to an array.
        train_index = np.arange(fit_train_datas.shape[0])
    fit_train_datas, class_label = fit_train_datas[train_index], class_label[train_index]
    # No per-sample weighting is applied; kept explicit so the fit() call
    # below documents the full set of knobs in use.
    sample_weight = None
    # SMOTE over-sampling of the minority class(es).
    if isbalance:
        fit_train_datas, class_label = preprocessing.my_smote(
            fit_train_datas, class_label, minority_target, per=0.5)
    # Train the model.
    self.bayes.fit(fit_train_datas, class_label, sample_weight=sample_weight)
    return self
def get_classificator(self, train_datas, class_label, iscrossvalidate=False, isbalance=False, minority_target=None):
    """
    Build (train) the classifier.
    :param train_datas: raw training samples; converted via ``self.fit_data``
    :param class_label: class label for each training sample
    :param iscrossvalidate: whether to load the train indices saved by a
        previous cross_validation() run
    :param isbalance: whether to re-balance the training data (SMOTE)
    :param minority_target: minority class; only used when isbalance is True
    :return: self
    """
    out = os.path.join(TEXT_OUT, "best_train_test_index/train_index.txt")
    # Cross-validation mode requires the index file written by cross_validation().
    if iscrossvalidate and (not FileUtil.isexist(out) or FileUtil.isempty(out)):
        raise ValueError("please use cross_validation() firstly")
    # fit data
    fit_train_datas = self.fit_data(train_datas)
    class_label = np.asarray(class_label)
    if iscrossvalidate:
        train_index = np.loadtxt(out, dtype=int)
    else:
        # Default: train on every sample.
        train_index = np.array(range(fit_train_datas.shape[0]))
    fit_train_datas, class_label = fit_train_datas[train_index], class_label[train_index]
    sample_weight = None
    # SMOTE over-sampling when re-balancing is requested.
    if isbalance:
        fit_train_datas, class_label = preprocessing.my_smote(fit_train_datas, class_label, minority_target, per=0.5)
        # sample_weight = _weights._balance_weights(class_label)
    # sample_weight
    # if isbalance:
    #     sample_weight = _weights._balance_weights(class_label)
    # Train the model.
    self.bayes.fit(fit_train_datas, class_label, sample_weight=sample_weight)
    return self
def clear_un_img():
    """Delete collected images no longer referenced by any weibo record.

    Images are stored under ``collect/img``; the weibo data under ``collect``
    (including sub-directories) determines which images are still in use.
    """
    # Directory where the images are stored.
    all_img_url = os.path.join(RESOURCE_BASE_URL, "collect/img")
    # Images referenced by data under this directory must be kept.
    leave_img_url = os.path.join(RESOURCE_BASE_URL, "collect")
    if FileUtil.isempty(leave_img_url):
        # No data remains, so no image is referenced: clear the image dir.
        FileUtil.empty(all_img_url)
    else:
        all_imgs = FileUtil.listdir(all_img_url)
        # Walk the data directory tree and record every directory to scan.
        dirs = [leave_img_url]
        for parent, dirnames, _ in os.walk(leave_img_url):
            for dirname in dirnames:
                dirs.append(os.path.join(parent, dirname))
        # Referenced image paths, as a set for O(1) membership tests
        # (the original list made each test O(n)).
        leave_imgs = set()
        for dir_ in dirs:
            imglist = collect.read_weibo(dir_, isreadimg=True)
            leave_imgs.update(
                flatten([img.get("img") for img in imglist if img.get("img")]))
        # Delete the extra images.  Explicit loop instead of map(): map() for
        # side effects allocates a useless list on Python 2 and is lazy (no-op)
        # on Python 3.
        for img_path in all_imgs:
            if img_path not in leave_imgs:
                os.remove(img_path)
def _get_splited_train(self):
    """
    Return the word-segmented training set, preferring the cached file.

    The cache file name depends on the concrete subclass and on whether the
    subjective or the objective corpus is requested.
    :return: list of segmented sentences (each entry also carries its class
        information)
    """
    dir_ = os.path.join(TEXT_OUT, "split")
    if self.subjective:
        split_txt = os.path.join(dir_, self.__class__.__name__ + ".txt")
        training_datas = Load.load_training_balance()
    else:
        split_txt = os.path.join(dir_, self.__class__.__name__ + "_objective.txt")
        training_datas = Load.load_training_objective_balance()
    # Re-segment when forced (self.f) or when no usable cache exists.
    if self.f or not FileUtil.isexist(split_txt) or FileUtil.isempty(split_txt):
        # Load the training set; each sentence also carries its class info.
        splited_words_list = Feature.__split(flatten(training_datas))
        # splited_words_list = Feature.__del_low_frequency_word(splited_words_list)
        # Persist the segmentation result for later runs.
        FileUtil.write(split_txt, splited_words_list)
    else:
        splited_words_list = FileUtil.read(split_txt)
    return splited_words_list
# Build the weighted training matrix (skip when already sparse/weighted).
train = train_datas
if not sp.issparse(train_datas):
    train = feature.cal_weight_improve(train_datas, class_label)

# Load and prepare the test set for the bayes classifier.
test = Load.load_test_balance()
test_datas, test_label, _ = feature.get_key_words(test)
test = test_datas
# Build a data set suitable for bayes classification.
if not sp.issparse(test_datas):
    test = feature.cal_weight_improve(test_datas, test_label)

crossvalidate = False
# When not cross-validating, remember to adjust the train ratio in load_sample.py.
if crossvalidate:
    out = os.path.join(TEXT_OUT, "best_train_test_index/test_index.txt")
    # Generate the saved test indices on first use.
    if not FileUtil.isexist(out) or FileUtil.isempty(out):
        clf0 = Classification()
        clf0.cross_validation(train, class_label, score="recall")
    test_index = np.loadtxt(out, dtype=int)
    test = train[test_index]
    test_label = np.asanyarray(class_label)[test_index].tolist()

# Incremental-update strategies to compare, their plot labels and line styles.
method_options = ("second", "four", "five")
method_options_0 = ("B", "C", "D")
linestyle = (':', '--', '-')
plot.get_instance()
for i in range(len(method_options)):
    bayes = IncrBayes()
    clf = Classification(bayes=bayes)
    # Train the base classifier before the incremental phase.
    clf.get_classificator(train, class_label, iscrossvalidate=crossvalidate,
                          isbalance=False, minority_target=EMOTION_CLASS.keys())
def get_incr_classificator_thread(self, incr_datas, incr_class_label, test_datas, test_class_label):
    """
    Process the incremental part of the incremental Bayes classifier
    (thread-pool variant).
    :param incr_datas: [{"emorion-1-type": value, "sentence": {}},...]
        (emotion-1-type and sentence are optional)
    :param incr_class_label: labels of the incremental samples
    :param test_datas: test samples used to estimate the classification loss
    :param test_class_label: labels of the test samples
    :return: self
    """
    def func1(i0):
        # Variant that evaluates sample i0 on a deep copy of the classifier,
        # so the shared bayes parameters need no lock.
        c_true0 = incr_class_label[i0:i0 + 1][0]
        text0 = fit_incr_datas.getrow(i0)
        c_pred0 = self.predict(text0)[0]
        if c_true0 == c_pred0:
            loss0 = 0
        else:
            clf0 = copy.deepcopy(self)
            clf0.bayes.class_log_prior_, clf0.bayes.feature_log_prob_ = clf0.bayes.update(c_pred0, text0, copy=True)
            loss0 = clf0.metrics_my_zero_one_loss(test_datas)
            # clf0.bayes.class_log_prior_ = origin_class_log_prob_
            # clf0.bayes.feature_log_prob_ = origin_feature_log_prob_
        # lock1 guards the shared result lists.
        if lock1.acquire():
            text.append(text0)
            c_pred.append(c_pred0)
            loss.append(loss0)
            lock1.release()

    def func(i0):
        # Tentatively update the shared classifier with sample i0, measure the
        # loss, then roll the parameters back.  lock0 guards the bayes
        # parameters; lock1 guards the shared result lists.
        c_true0 = incr_class_label[i0:i0 + 1][0]
        text0 = fit_incr_datas.getrow(i0)
        c_pred0 = self.predict(text0)[0]
        if c_true0 == c_pred0:
            loss0 = 0
        else:
            if lock0.acquire():
                self.bayes.class_log_prior_, self.bayes.feature_log_prob_ = self.bayes.update(c_pred0, text0, copy=True)
                loss0 = self.metrics_my_zero_one_loss(test_datas)
                # Roll back the tentative update.
                self.bayes.class_log_prior_ = origin_class_log_prob_
                self.bayes.feature_log_prob_ = origin_feature_log_prob_
                lock0.release()
        if lock1.acquire():
            text.append(text0)
            c_pred.append(c_pred0)
            loss.append(loss0)
            lock1.release()

    print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
    # Write/read the fitted bayes parameters so later runs can skip the loop.
    dir_ = os.path.join(TEXT_OUT, "bayes_args")
    FileUtil.mkdirs(dir_)
    class_count_out = os.path.join(dir_, "class_count.txt")
    class_log_prob_out = os.path.join(dir_, "class_log_prob.txt")
    feature_count_out = os.path.join(dir_, "feature_count.txt")
    feature_log_prob_out = os.path.join(dir_, "feature_log_prob.txt")
    out = (class_count_out, class_log_prob_out, feature_count_out, feature_log_prob_out)
    if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
        if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
            raise ValueError("please use get_classificator() to get classificator firstly")
        fit_incr_datas = self.fit_data(incr_datas)
        n_samples, _ = fit_incr_datas.shape
        incr_class_label = np.array(incr_class_label)
        lock0 = threading.Lock()
        lock1 = threading.Lock()
        # threadpool
        poolsize = 30
        pool = ThreadPool(poolsize)
        # Each outer iteration absorbs exactly one incremental sample: the one
        # whose tentative update yields the smallest loss.
        for i in range(n_samples):
            if i % 5 == 0:
                print "Begin Increment Classification_%d: %s" % (i / 5, time.strftime('%Y-%m-%d %H:%M:%S'))
            # Classification losses; the minimum decides the winner.
            loss = []
            # Candidate texts preferred for updating the classifier parameters.
            text = []
            # Predicted class for each candidate text.
            c_pred = []
            # Index of each candidate text.
            # index = 0
            origin_class_log_prob_ = self.bayes.class_log_prior_
            origin_feature_log_prob_ = self.bayes.feature_log_prob_
            # threadpool
            requests = makeRequests(func, range(fit_incr_datas.shape[0]))
            [pool.putRequest(req) for req in requests]
            pool.wait()
            # for i0 in range(fit_incr_datas.shape[0]):
            #     threading.Thread(target=func, args=(i0, )).start()
            # Commit the best candidate and drop it from the incremental set.
            minindex = np.argmin(loss)
            self.bayes.update(c_pred[minindex], text[minindex])
            fit_incr_datas = sp.vstack([fit_incr_datas[:minindex, :], fit_incr_datas[minindex + 1:, :]])
        bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                      self.bayes.feature_count_, self.bayes.feature_log_prob_)
        map(lambda x: np.savetxt(x[0], x[1]), zip(out, bayes_args))
    else:
        # Fast path: restore the previously saved parameters.
        self.bayes.class_count_ = np.loadtxt(out[0])
        self.bayes.class_log_prior_ = np.loadtxt(out[1])
        self.bayes.feature_count_ = np.loadtxt(out[2])
        self.bayes.feature_log_prob_ = np.loadtxt(out[3])
    print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    return self
def get_incr_classificator(self, incr_datas, incr_class_label, test_datas, test_class_label, method="first"):
    """
    Process the incremental part of the incremental Bayes classifier.
    :param incr_datas: [{"emorion-1-type": value, "sentence": {}},...]
        (emotion-1-type and sentence are optional)
    :param incr_class_label: labels of the incremental samples
    :param test_datas: test samples used to estimate the classification loss
    :param test_class_label: labels of the test samples
    :param method: selection strategy, one of
        ("first", "second", "third", "four", "five")
    :return: self
    """
    def func(x, y):
        # reduce() helper: x and y are consecutive selected entries
        # (loss, text, c_pred, index); slices the unselected samples between
        # them into `block`/`label_block` and the selected row into `block0`.
        block.append(fit_incr_datas[x[3] + 1:y[3], :])
        label_block.append(incr_class_label[x[3] + 1:y[3]])
        block0.append(fit_incr_datas[y[3]:y[3] + 1, :])
        return y

    def handle(clf, method):
        # Dispatch to the strategy implementation selected by `method`.
        if method == "zero":
            return handle_zero(clf)
        elif method == "first":
            return handle_first(clf)
        elif method == "second":
            return handle_second(clf)
        elif method == "third":
            return handle_third(clf)
        elif method == "four":
            return handle_four(clf)
        elif method == "five":
            return handle_five(clf)
        else:
            pass

    def handle_zero(clf):
        """
        Find the samples the current classifier already predicts correctly.
        :param clf: classifier under evaluation
        :return: list of (loss, text, predicted_class, index) tuples
        """
        incr_pre_label = clf.predict(fit_incr_datas)
        # Indices of the correctly predicted samples.
        true_index = (incr_class_label == incr_pre_label).nonzero()
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        res = []
        for i0 in true_index[0]:
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = incr_pre_label[i0]
            # Tentative update (copy=True returns new parameter arrays).
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            res.append((loss0, text0, c_pred0, i0))
            # Roll back before evaluating the next candidate.
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return res

    def handle_first(clf):
        # The original classification-loss computation.
        # Classification loss; minimised over candidates.
        loss = 9999
        # Candidate text preferred for updating the classifier parameters.
        text = None
        # Predicted class of that candidate.
        c_pred = None
        # Index of that candidate in the incremental set.
        index = 0
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        for i0 in range(fit_incr_datas.shape[0]):
            c_true0 = incr_class_label[i0:i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]
            if c_true0 == c_pred0:
                # A correctly predicted sample wins outright (loss 0).
                loss = 0
                text = text0
                c_pred = c_pred0
                index = i0
                break
            else:
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_my_zero_one_loss(test_proba)
                if loss0 < loss:
                    loss = loss0
                    text = text0
                    c_pred = c_pred0
                    index = i0
            # Roll back the tentative update for the next candidate.
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return [(loss, text, c_pred, index)]

    def handle_second(clf):
        # An alternative classification-loss computation (loss measured
        # against the pre-update probabilities).
        # predict_true = handle(clf, "zero")
        # if predict_true:
        #     return predict_true
        # Classification loss; minimised over candidates.
        loss = 9999
        # Candidate text preferred for updating the classifier parameters.
        text = None
        # Predicted class of that candidate.
        c_pred = None
        # Index of that candidate in the incremental set.
        index = 0
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        for i0 in range(fit_incr_datas.shape[0]):
            c_true0 = incr_class_label[i0:i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]
            if c_true0 == c_pred0:
                loss = 0
                text = text0
                c_pred = c_pred0
                index = i0
                break
            else:
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                if loss0 < loss:
                    loss = loss0
                    text = text0
                    c_pred = c_pred0
                    index = i0
            # Roll back the tentative update for the next candidate.
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return [(loss, text, c_pred, index)]

    def handle_third(clf):
        # todo
        # How to obtain a suitable threshold?
        def get_fit(e0):
            # Obtain a suitable threshold (currently a fixed constant).
            return 20
            # while len((r >= e0).nonzero()[0]) == 0:
            #     e0 = int(e0 / 2)
            # return e0
        global e
        # Class-support computation: ratio of the best to the second-best
        # class probability for each incremental sample.
        proba = clf.predict_proba(fit_incr_datas)
        label = clf.predict(fit_incr_datas)
        max_proba = np.max(proba, axis=1).reshape(-1, 1)
        second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
        # Support ratio.
        r = np.divide(max_proba, second_max_proba)
        # Threshold.
        e = get_fit(e)
        # Select samples whose support exceeds the threshold.
        select_indices = (r >= e).nonzero()
        return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0]) for indice in select_indices[0]]

    def handle_third_another(clf):
        # Class-support computation: ratio of the best class probability to
        # the sum of all the remaining class probabilities.
        proba = clf.predict_proba(fit_incr_datas)
        label = clf.predict(fit_incr_datas)
        max_proba = np.max(proba, axis=1).reshape(-1, 1)
        leave_proba = np.sum(proba, axis=1).reshape(-1, 1) - max_proba
        # Support ratio.
        r = np.divide(max_proba, leave_proba)
        # Threshold.
        e = 5
        # Select samples whose support exceeds the threshold.
        select_indices = (r >= e).nonzero()
        return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0]) for indice in select_indices[0]]

    def handle_four(clf):
        # My Own Idea
        # Holds the Test results.
        predict_true = handle(clf, "zero")
        if predict_true:
            return predict_true
        f_res = []
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        origin_label = clf.predict(test_datas)
        for i0 in range(fit_incr_datas.shape[0]):
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            label = clf.predict(test_datas)
            # Account for class flips: a sample may have a high max probability
            # both before and after the update while the predicted class
            # changed; negate those entries.
            smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
            np.multiply(test_proba, smooth, test_proba)
            # Paired significance test between pre- and post-update probabilities.
            f_test0 = pair_test(origin_proba, test_proba)
            if f_test0:
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            else:
                loss0 = -1
            f_res.append((loss0, text0, c_pred0, i0, f_test0))
            # Roll back the tentative update for the next candidate.
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        # Keep only the candidates that passed the significance test.
        res = filter(lambda x: x[4], f_res)
        return [(r[0], r[1], r[2], r[3]) for r in res]

    def handle_five(clf):
        """
        Combination of class support and the no-significant-difference test.
        :param clf: classifier under evaluation
        :return: list of (loss, text, predicted_class, index) tuples
        """
        predict_true = handle(clf, "zero")
        if predict_true:
            return predict_true
        # Pre-filter candidates by class support.
        fit_for_class_support = handle(clf, "third")
        print "The result of class-support: %d samples" % len(fit_for_class_support)
        # fit_for_class_support = filter(lambda x: x[4] > clf.bayes.class_log_prior_[np.where(clf.bayes.classes_ == x[2])[0][0]], fit_for_class_support)
        # print "The result of class-support: %d samples" % len(fit_for_class_support)
        # My Own Idea
        # Holds the Test results.
        f_res = []
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        origin_label = clf.predict(test_datas)
        for i0 in range(len(fit_for_class_support)):
            text0 = fit_for_class_support[i0][1]
            c_pred0 = fit_for_class_support[i0][2]
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            label = clf.predict(test_datas)
            # Account for class flips between the pre- and post-update
            # classifiers (see handle_four).
            smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
            np.multiply(test_proba, smooth, test_proba)
            f_test0 = pair_test(origin_proba, test_proba)
            if f_test0:
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            else:
                loss0 = -1
            f_res.append((loss0, text0, c_pred0, i0, f_test0))
            # Roll back the tentative update for the next candidate.
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        # Keep only the candidates that passed the significance test.
        res = filter(lambda x: x[4], f_res)
        return [(r[0], r[1], r[2], r[3]) for r in res]

    method_options = ("first", "second", "third", "four", "five")
    if method not in method_options:
        raise ValueError("method has to be one of " + str(method_options))
    print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
    # Write/read the fitted bayes parameters so later runs can skip the loop.
    dir_ = os.path.join(TEXT_OUT, "bayes_args")
    FileUtil.mkdirs(dir_)
    suffix = ".blp"
    class_count_out = os.path.join(dir_, "class_count_" + method + suffix)
    class_log_prob_out = os.path.join(dir_, "class_log_prob_" + method + suffix)
    feature_count_out = os.path.join(dir_, "feature_count_" + method + suffix)
    feature_log_prob_out = os.path.join(dir_, "feature_log_prob_" + method + suffix)
    out = (class_count_out, class_log_prob_out, feature_count_out, feature_log_prob_out)
    if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
        if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
            raise ValueError("please use get_classificator() to get classificator firstly")
        fit_incr_datas = self.fit_data(incr_datas)
        incr_class_label = np.asanyarray(incr_class_label)
        # Data that should be appended to the key_words.txt document.
        add_to_key_words = []
        i = 0
        while fit_incr_datas.nnz > 0:
            print
            print "Begin Increment Classification_%d: %s" % (i, time.strftime('%Y-%m-%d %H:%M:%S'))
            need_to_update = handle(self, method)
            # If nothing can be updated, the remaining incremental set does not
            # suit the current classifier, so it is discarded.
            # The incremental set shrinks on every update round.
            block = []
            label_block = []
            # The training set grows on every update round.
            block0 = []
            if need_to_update:
                # Apply the updates in ascending-loss order.
                accord_to_loss = sorted(need_to_update, key=lambda x: x[0])
                for data in accord_to_loss:
                    self.bayes.update(data[2], data[1])
                # Re-sort by sample index to slice out the consumed rows.
                accord_to_index = sorted(need_to_update, key=lambda x: x[3])
                # index = [index0[3] for index0 in accord_to_index]
                # [add_to_key_words.append(raw_incr_datas[index0]) for index0 in index]
                # raw_incr_datas = [raw for index0, raw in enumerate(raw_incr_datas) if index0 not in index]
                block0.append(test_datas)
                # func() slices the gaps between consecutive selected indices.
                reduce(func, accord_to_index, (0.0, "", "", -1))
                block.append(fit_incr_datas[accord_to_index[-1][3] + 1:, :])
                label_block.append(incr_class_label[accord_to_index[-1][3] + 1:])
                test_datas = sp.vstack(block0)
                print "This times updates %d samples" % len(need_to_update)
            else:
                # Nothing usable left: empty the incremental set to end the loop.
                block.append(fit_incr_datas[0:0, :])
                label_block.append(incr_class_label[0:0])
                print "Finally leaving %d samples that unnecessary added to train sets" % fit_incr_datas.shape[0]
            fit_incr_datas = sp.vstack(block)
            incr_class_label = np.concatenate(label_block)
            i += 1
        bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                      self.bayes.feature_count_, self.bayes.feature_log_prob_)
        # Save the parameters to disk (bloscpack format).
        map(lambda x: bp.pack_ndarray_file(x[0], x[1]), zip(bayes_args, out))
        # Append.
        # path = os.path.join(TEXT_OUT, "key_words/CHIFeature.txt")
        # FileUtil.write(path, add_to_key_words, "a")
    else:
        # speed up
        self.bayes.class_count_ = bp.unpack_ndarray_file(out[0])
        self.bayes.class_log_prior_ = bp.unpack_ndarray_file(out[1])
        self.bayes.feature_count_ = bp.unpack_ndarray_file(out[2])
        self.bayes.feature_log_prob_ = bp.unpack_ndarray_file(out[3])
        # self.bayes.class_count_ = np.loadtxt(out[0])
        # self.bayes.class_log_prior_ = np.loadtxt(out[1])
        # self.bayes.feature_count_ = np.loadtxt(out[2])
        # self.bayes.feature_log_prob_ = np.loadtxt(out[3])
    print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    print
    return self
def _collect(self, splited_words_list, sentence_size):
    """
    Score every word of the first ``sentence_size`` sentences, normalise and
    reduce the result, and return the key-word representation.

    Results are cached in a per-class file; training runs write the cache,
    non-training runs read it.
    :param splited_words_list: segmented sentences; the first sentence_size
        entries are scored, the remainder (if any) serve as the scoring corpus
    :param sentence_size: number of sentences to score
    :return: (res, class_label, danger_index) where danger_index lists the
        positions of sentences that became empty after scoring/reduction
    """
    dir_ = os.path.join(TEXT_OUT, "key_words")
    if self.subjective:
        key_words_txt = os.path.join(dir_, self.__class__.__name__ + ".txt")
    else:
        key_words_txt = os.path.join(dir_, self.__class__.__name__ + "_objective.txt")
    # NOTE(review): a commented-out draft of per-sample norm()/reduce_dim()
    # helpers lived here; superseded by self.norm()/self.reduce_dim() below.
    if not self.istrain or self.f or not FileUtil.isexist(key_words_txt) or FileUtil.isempty(key_words_txt):
        print "Cal Scores: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # When extra sentences follow the first sentence_size entries they are
        # used as the scoring corpus; otherwise the scored sentences are.
        if len(splited_words_list) == sentence_size:
            train_range = slice(sentence_size)
        else:
            train_range = slice(sentence_size, len(splited_words_list))
        # Texts grouped per class.
        all_class_datas = Feature.all_class_text(splited_words_list[train_range], self.getclasses())
        # Class labels of the scored sentences.
        class_label = [d.get("emotion-1-type") for d in splited_words_list[:sentence_size]]
        # return term/frequency or term/score
        res = []
        for splited_words_dict in splited_words_list[0:sentence_size]:
            splited_words = splited_words_dict.get("sentence")
            label = splited_words_dict.get("emotion-1-type")
            # Score each word: scores = {word: [score, frequency], ...}
            scores = {splited_word: [self.cal_score(splited_word, splited_words, label, all_class_datas,
                                                    [d.get("sentence") for d in splited_words_list[train_range]]),
                                     frequency]
                      for splited_word, frequency in splited_words.items()}
            # Normalisation.
            # norm(scores)
            # Dimensionality reduction.
            sorted_words = scores
            # if not self.istrain:
            #     sorted_words = reduce_dim(scores)
            # Collection
            # if False return term/score
            # if True return term/frequency
            # if False:
            #     for k in sorted_words.keys():
            #         sorted_words[k] = splited_words.count(k)
            res.append({"sentence": sorted_words,
                        "emotion-1-type": splited_words_dict.get("emotion-1-type")})
        print "Cal Scores Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # FileUtil.write(TEST_BASE_URL + "scores.txt", res)
        print "Begin Normalization: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # Normalisation (in place).
        self.norm(res)
        # FileUtil.write(TEST_BASE_URL + "norm.txt", res)
        print "Normalization Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
        print "Begin Reduce: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # Dimensionality reduction (in place).
        self.reduce_dim(res)
        print "Reduce Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # Try Convert term/score to term/frequency
        # if False return term/score
        # if True return term/frequency
        for d in res:
            ws = d.get("sentence")
            for k, v in ws.items():
                ws[k] = v[0]
                if True:
                    ws[k] = v[1]
        # Segmentation/reduction may leave some samples without any key word;
        # collect the indices of those empty samples and drop them.
        danger_index = []
        res = filter(lambda x: danger_index.append(x[0]) if not x[1].get("sentence") else x, enumerate(res))
        res = list(zip(*res)[1])
        class_label = [c for i, c in enumerate(class_label) if i not in danger_index]
        # Write the cache file.
        if self.istrain:
            FileUtil.write(key_words_txt, res)
    else:
        # Cache hit: read the previously computed key words.
        res = FileUtil.read(key_words_txt)
        class_label = [r["emotion-1-type"] for r in res]
        danger_index = []
    # Print statistics.
    if False:
        self.__print_top_key_word(res)
    return res, class_label, danger_index
def get_incr_classificator_thread(self, incr_datas, incr_class_label, test_datas, test_class_label):
    """
    Process the incremental part of the incremental Bayes classifier
    (thread-pool variant).
    :param incr_datas: [{"emorion-1-type": value, "sentence": {}},...]
        (emotion-1-type and sentence are optional)
    :param incr_class_label: labels of the incremental samples
    :param test_datas: test samples used to estimate the classification loss
    :param test_class_label: labels of the test samples
    :return: self
    """
    def func1(i0):
        # Variant that evaluates sample i0 on a deep copy of the classifier,
        # so no lock is needed on the shared bayes parameters.
        c_true0 = incr_class_label[i0: i0 + 1][0]
        text0 = fit_incr_datas.getrow(i0)
        c_pred0 = self.predict(text0)[0]
        if c_true0 == c_pred0:
            loss0 = 0
        else:
            clf0 = copy.deepcopy(self)
            clf0.bayes.class_log_prior_, clf0.bayes.feature_log_prob_ = clf0.bayes.update(c_pred0, text0, copy=True)
            loss0 = clf0.metrics_my_zero_one_loss(test_datas)
            # clf0.bayes.class_log_prior_ = origin_class_log_prob_
            # clf0.bayes.feature_log_prob_ = origin_feature_log_prob_
        # lock1 guards the shared result lists.
        if lock1.acquire():
            text.append(text0)
            c_pred.append(c_pred0)
            loss.append(loss0)
            lock1.release()

    def func(i0):
        # Tentatively update the shared classifier with sample i0, measure the
        # loss, then roll the parameters back.  lock0 guards the bayes
        # parameters; lock1 guards the shared result lists.
        c_true0 = incr_class_label[i0: i0 + 1][0]
        text0 = fit_incr_datas.getrow(i0)
        c_pred0 = self.predict(text0)[0]
        if c_true0 == c_pred0:
            loss0 = 0
        else:
            if lock0.acquire():
                self.bayes.class_log_prior_, self.bayes.feature_log_prob_ = self.bayes.update(c_pred0, text0, copy=True)
                loss0 = self.metrics_my_zero_one_loss(test_datas)
                # Roll back the tentative update.
                self.bayes.class_log_prior_ = origin_class_log_prob_
                self.bayes.feature_log_prob_ = origin_feature_log_prob_
                lock0.release()
        if lock1.acquire():
            text.append(text0)
            c_pred.append(c_pred0)
            loss.append(loss0)
            lock1.release()

    print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
    # Write/read the fitted bayes parameters so later runs can skip the loop.
    dir_ = os.path.join(TEXT_OUT, "bayes_args")
    FileUtil.mkdirs(dir_)
    class_count_out = os.path.join(dir_, "class_count.txt")
    class_log_prob_out = os.path.join(dir_, "class_log_prob.txt")
    feature_count_out = os.path.join(dir_, "feature_count.txt")
    feature_log_prob_out = os.path.join(dir_, "feature_log_prob.txt")
    out = (class_count_out, class_log_prob_out, feature_count_out, feature_log_prob_out)
    if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
        if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
            raise ValueError("please use get_classificator() to get classificator firstly")
        fit_incr_datas = self.fit_data(incr_datas)
        n_samples, _ = fit_incr_datas.shape
        incr_class_label = np.array(incr_class_label)
        lock0 = threading.Lock()
        lock1 = threading.Lock()
        # threadpool
        poolsize = 30
        pool = ThreadPool(poolsize)
        # Each outer iteration absorbs exactly one incremental sample: the one
        # whose tentative update yields the smallest loss.
        for i in range(n_samples):
            if i % 5 == 0:
                print "Begin Increment Classification_%d: %s" % (i / 5, time.strftime('%Y-%m-%d %H:%M:%S'))
            # Classification losses; the minimum decides the winner.
            loss = []
            # Candidate texts preferred for updating the classifier parameters.
            text = []
            # Predicted class for each candidate text.
            c_pred = []
            # Index of each candidate text.
            # index = 0
            origin_class_log_prob_ = self.bayes.class_log_prior_
            origin_feature_log_prob_ = self.bayes.feature_log_prob_
            # threadpool
            requests = makeRequests(func, range(fit_incr_datas.shape[0]))
            [pool.putRequest(req) for req in requests]
            pool.wait()
            # for i0 in range(fit_incr_datas.shape[0]):
            #     threading.Thread(target=func, args=(i0, )).start()
            # Commit the best candidate and drop it from the incremental set.
            minindex = np.argmin(loss)
            self.bayes.update(c_pred[minindex], text[minindex])
            fit_incr_datas = sp.vstack([fit_incr_datas[:minindex, :], fit_incr_datas[minindex + 1:, :]])
        bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                      self.bayes.feature_count_, self.bayes.feature_log_prob_)
        map(lambda x: np.savetxt(x[0], x[1]), zip(out, bayes_args))
    else:
        # Fast path: restore the previously saved parameters.
        self.bayes.class_count_ = np.loadtxt(out[0])
        self.bayes.class_log_prior_ = np.loadtxt(out[1])
        self.bayes.feature_count_ = np.loadtxt(out[2])
        self.bayes.feature_log_prob_ = np.loadtxt(out[3])
    print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    return self
def get_incr_classificator(self, incr_datas, incr_class_label, test_datas, test_class_label, method="first"):
    """
    Process the increment set of the incremental Bayes classifier.

    Repeatedly selects samples from the increment set with the chosen
    ``method`` strategy, merges them into the classifier (``self.bayes.update``),
    moves them into the test set, and shrinks the increment set, until the
    increment set is exhausted or the strategy selects nothing.  The final
    Bayes parameters are cached on disk (bloscpack ``.blp`` files) keyed by
    ``method`` so later runs can skip recomputation unless ``self.f`` is set.

    :param incr_datas: [{"emorion-1-type": value, "sentence": {}},...]
        (emotion-1-type and sentence are optional)
    :param incr_class_label: true labels of the incremental samples
    :param test_datas: data the candidate updates are evaluated on
    :param test_class_label: unused here; kept for interface symmetry
    :param method: selection strategy, one of
        "first" / "second" / "third" / "four" / "five"
    :return: self
    """
    def func(x, y):
        # reduce() step over index-sorted selections x, y (4-tuples whose
        # [3] element is the row number): rows strictly between the two
        # selected rows stay in the increment set (block / label_block),
        # while the selected row y itself is moved into block0 (later
        # stacked onto the test set).
        block.append(fit_incr_datas[x[3] + 1: y[3], :])
        label_block.append(incr_class_label[x[3] + 1: y[3]])
        block0.append(fit_incr_datas[y[3]:y[3] + 1, :])
        return y

    def handle(clf, method):
        # Dispatch to the selection strategy by name; unknown names fall
        # through and return None (only reachable for the internal "zero").
        if method == "zero":
            return handle_zero(clf)
        elif method == "first":
            return handle_first(clf)
        elif method == "second":
            return handle_second(clf)
        elif method == "third":
            return handle_third(clf)
        elif method == "four":
            return handle_four(clf)
        elif method == "five":
            return handle_five(clf)
        else:
            pass

    def handle_zero(clf):
        """
        Find the samples that the current classifier already predicts
        correctly.
        :param clf: classifier wrapper whose ``bayes`` model is probed
        :return: list of (loss, sample_row, predicted_label, row_index)
        """
        incr_pre_label = clf.predict(fit_incr_datas)
        # indices of the correctly predicted samples
        true_index = (incr_class_label == incr_pre_label).nonzero()
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        res = []
        for i0 in true_index[0]:
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = incr_pre_label[i0]
            # tentative update; parameters are restored right after scoring
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            res.append((loss0, text0, c_pred0, i0))
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return res

    def handle_first(clf):
        # The original (most basic) classification-loss computation:
        # scan all samples and keep the single one minimizing the loss.
        # classification loss; minimum-seeking approach
        loss = 9999
        # the sample preferred for updating the classifier parameters
        text = None
        # the class label corresponding to that sample
        c_pred = None
        # the row index corresponding to that sample
        index = 0
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        for i0 in range(fit_incr_datas.shape[0]):
            c_true0 = incr_class_label[i0: i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]
            if c_true0 == c_pred0:
                # a correctly predicted sample wins immediately (loss 0)
                loss = 0
                text = text0
                c_pred = c_pred0
                index = i0
                break
            else:
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_my_zero_one_loss(test_proba)
                if loss0 < loss:
                    loss = loss0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                # undo the tentative update before trying the next sample
                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return [(loss, text, c_pred, index)]

    def handle_second(clf):
        # An alternative classification-loss computation: the loss compares
        # post-update probabilities against the pre-update ones.
        # predict_true = handle(clf, "zero")
        # if predict_true:
        #     return predict_true
        # classification loss; minimum-seeking approach
        loss = 9999
        # the sample preferred for updating the classifier parameters
        text = None
        # the class label corresponding to that sample
        c_pred = None
        # the row index corresponding to that sample
        index = 0
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        for i0 in range(fit_incr_datas.shape[0]):
            c_true0 = incr_class_label[i0: i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]
            if c_true0 == c_pred0:
                loss = 0
                text = text0
                c_pred = c_pred0
                index = i0
                break
            else:
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                if loss0 < loss:
                    loss = loss0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                # undo the tentative update before trying the next sample
                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return [(loss, text, c_pred, index)]

    def handle_third(clf):
        # todo
        # how to obtain a suitable threshold
        def get_fit(e0):
            # obtain a suitable threshold (currently hard-coded to 20)
            return 20
            # while len((r >= e0).nonzero()[0]) == 0:
            #     e0 = int(e0 / 2)
            # return e0
        # NOTE(review): `e` is a module-level threshold mutated here —
        # confirm where it is initialized.
        global e
        # class-support computation
        proba = clf.predict_proba(fit_incr_datas)
        label = clf.predict(fit_incr_datas)
        max_proba = np.max(proba, axis=1).reshape(-1, 1)
        # second-largest per-row probability via a partial sort
        second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
        # support: ratio of the best to the second-best class probability
        r = np.divide(max_proba, second_max_proba)
        # threshold
        e = get_fit(e)
        # select all samples whose support reaches the threshold
        select_indices = (r >= e).nonzero()
        # 5-tuples: extra [4] element is the winning class probability
        return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0]) for indice in select_indices[0]]

    def handle_third_another(clf):
        # Class-support variant: support is best probability over the sum
        # of all the remaining probabilities.  (Not wired into handle().)
        proba = clf.predict_proba(fit_incr_datas)
        label = clf.predict(fit_incr_datas)
        max_proba = np.max(proba, axis=1).reshape(-1, 1)
        leave_proba = np.sum(proba, axis=1).reshape(-1, 1) - max_proba
        # support
        r = np.divide(max_proba, leave_proba)
        # threshold
        e = 5
        # select
        select_indices = (r >= e).nonzero()
        return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0]) for indice in select_indices[0]]

    def handle_four(clf):
        # My Own Idea
        # holds the Test results
        # Prefer correctly-predicted samples if any exist.
        predict_true = handle(clf, "zero")
        if predict_true:
            return predict_true
        f_res = []
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        origin_label = clf.predict(test_datas)
        for i0 in range(fit_incr_datas.shape[0]):
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            label = clf.predict(test_datas)
            # Account for the class label itself: a sample may have a high
            # probability for some class both before and after the update,
            # yet the two predicted classes may differ — flip the sign of
            # the probability where the prediction changed.
            smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
            np.multiply(test_proba, smooth, test_proba)
            # significance test between the pre- and post-update probabilities
            f_test0 = pair_test(origin_proba, test_proba)
            if f_test0:
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            else:
                loss0 = -1
            f_res.append((loss0, text0, c_pred0, i0, f_test0))
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        # keep only the samples that passed the significance test
        res = filter(lambda x: x[4], f_res)
        return [(r[0], r[1], r[2], r[3]) for r in res]

    def handle_five(clf):
        """
        Combination of class support and no-significant-difference testing.
        :param clf: classifier wrapper whose ``bayes`` model is probed
        :return: list of (loss, sample_row, predicted_label, row_index)
        """
        # Prefer correctly-predicted samples if any exist.
        predict_true = handle(clf, "zero")
        if predict_true:
            return predict_true
        # Pre-filter candidates by class support ("third" strategy).
        fit_for_class_support = handle(clf, "third")
        print "The result of class-support: %d samples" % len(fit_for_class_support)
        # fit_for_class_support = filter(lambda x: x[4] > clf.bayes.class_log_prior_[np.where(clf.bayes.classes_ == x[2])[0][0]], fit_for_class_support)
        # print "The result of class-support: %d samples" % len(fit_for_class_support)
        # My Own Idea
        # holds the Test results
        f_res = []
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        origin_label = clf.predict(test_datas)
        for i0 in range(len(fit_for_class_support)):
            text0 = fit_for_class_support[i0][1]
            c_pred0 = fit_for_class_support[i0][2]
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            label = clf.predict(test_datas)
            # Account for the class label itself: a sample may have a high
            # probability for some class both before and after the update,
            # yet the two predicted classes may differ — flip the sign of
            # the probability where the prediction changed.
            smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
            np.multiply(test_proba, smooth, test_proba)
            f_test0 = pair_test(origin_proba, test_proba)
            if f_test0:
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            else:
                loss0 = -1
            f_res.append((loss0, text0, c_pred0, i0, f_test0))
            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        # keep only the samples that passed the significance test
        res = filter(lambda x: x[4], f_res)
        return [(r[0], r[1], r[2], r[3]) for r in res]

    method_options = ("first", "second", "third", "four", "five")
    if method not in method_options:
        raise ValueError("method has to be one of " + str(method_options))

    print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
    # Paths used to cache / reload the trained Bayes parameters (one set
    # of bloscpack files per selection method).
    dir_ = os.path.join(TEXT_OUT, "bayes_args")
    FileUtil.mkdirs(dir_)
    suffix = ".blp"
    class_count_out = os.path.join(dir_, "class_count_" + method + suffix)
    class_log_prob_out = os.path.join(dir_, "class_log_prob_" + method + suffix)
    feature_count_out = os.path.join(dir_, "feature_count_" + method + suffix)
    feature_log_prob_out = os.path.join(dir_, "feature_log_prob_" + method + suffix)
    out = (class_count_out, class_log_prob_out, feature_count_out, feature_log_prob_out)
    if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
        if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
            raise ValueError("please use get_classificator() to get classificator firstly")
        fit_incr_datas = self.fit_data(incr_datas)
        incr_class_label = np.asanyarray(incr_class_label)
        # data to be appended to the key_words.txt document
        add_to_key_words = []
        i = 0
        while fit_incr_datas.nnz > 0:
            print
            print "Begin Increment Classification_%d: %s" % (i, time.strftime('%Y-%m-%d %H:%M:%S'))
            need_to_update = handle(self, method)
            # If nothing is selected, the remaining increment set does not
            # suit the current classifier, so it is discarded.
            # The increment set keeps shrinking while updating.
            block = []
            label_block = []
            # The training (test) set keeps growing while updating.
            block0 = []
            if need_to_update:
                # sort ascending by loss and apply the updates in that order
                accord_to_loss = sorted(need_to_update, key=lambda x: x[0])
                for data in accord_to_loss:
                    self.bayes.update(data[2], data[1])
                # sort by row index for the re-blocking pass below
                accord_to_index = sorted(need_to_update, key=lambda x: x[3])
                # index = [index0[3] for index0 in accord_to_index]
                # [add_to_key_words.append(raw_incr_datas[index0]) for index0 in index]
                # raw_incr_datas = [raw for index0, raw in enumerate(raw_incr_datas) if index0 not in index]
                block0.append(test_datas)
                # Walk the index-sorted selections pairwise (sentinel row -1):
                # unselected rows go back into block, selected rows go into
                # block0 — see func() above.
                reduce(func, accord_to_index, (0.0, "", "", -1))
                # keep the tail rows after the last selected row
                block.append(fit_incr_datas[accord_to_index[-1][3] + 1:, :])
                label_block.append(incr_class_label[accord_to_index[-1][3] + 1:])
                test_datas = sp.vstack(block0)
                print "This times updates %d samples" % len(need_to_update)
            else:
                # empty blocks terminate the while loop (nnz becomes 0)
                block.append(fit_incr_datas[0:0, :])
                label_block.append(incr_class_label[0:0])
                print "Finally leaving %d samples that unnecessary added to train sets" % fit_incr_datas.shape[0]
            fit_incr_datas = sp.vstack(block)
            incr_class_label = np.concatenate(label_block)
            i += 1
        bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_, self.bayes.feature_count_, self.bayes.feature_log_prob_)
        # persist the parameters to disk (Python 2: map() runs eagerly)
        map(lambda x: bp.pack_ndarray_file(x[0], x[1]), zip(bayes_args, out))
        # append
        # path = os.path.join(TEXT_OUT, "key_words/CHIFeature.txt")
        # FileUtil.write(path, add_to_key_words, "a")
    else:
        # speed up
        self.bayes.class_count_ = bp.unpack_ndarray_file(out[0])
        self.bayes.class_log_prior_ = bp.unpack_ndarray_file(out[1])
        self.bayes.feature_count_ = bp.unpack_ndarray_file(out[2])
        self.bayes.feature_log_prob_ = bp.unpack_ndarray_file(out[3])
        # self.bayes.class_count_ = np.loadtxt(out[0])
        # self.bayes.class_log_prior_ = np.loadtxt(out[1])
        # self.bayes.feature_count_ = np.loadtxt(out[2])
        # self.bayes.feature_log_prob_ = np.loadtxt(out[3])
    print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    print
    return self