def inspect_global_freq(self):
    """Print the global feature-frequency distribution.

    Sums the per-tag feature counts in ``self.total_tag_to_features__dict``
    (tag -> {feature: count}) into one global counter, prints the
    (feature, count) pairs ranked by frequency, then prints the feature
    strings themselves concatenated in that same order.
    """
    global_counts = defaultdict(int)
    for tag1, feature_counts in self.total_tag_to_features__dict.iteritems():
        for feature1, count1 in feature_counts.iteritems():
            global_counts[feature1] += count1
    # Fix: the original recomputed Counter(...).most_common() twice;
    # compute the ranking once and reuse it for both outputs.
    ranked = Counter(global_counts).most_common()
    uprint(ranked)
    uprint(u''.join([pair1[0] for pair1 in ranked]))
def func(counts, is_precision=False): if not is_precision: for1, for2 = original_tags, recommend_tags else: for2, for1 = original_tags, recommend_tags processed = set([]) for method in ["exact", "peer", "child", "parent"]: match_count = 0 for t1 in (for1 - processed): # 其实核心就是对这一层进行遍历 matched_t1 = None for t2 in for2: # 不需要相减,因为其他recommend_tags还要判定关系 n_t1, n_t2 = t2, t1 if verbose: print method, "[n_t1]", n_t1, "[n_t2]", n_t2 if getattr(tags_tree, "is_" + method)(n_t1, n_t2): if verbose: print "√" matched_t1 = n_t2 break if matched_t1: # 这样在这个循环外部的for循环rt1就没有机会重复计算了 processed.add(n_t2) match_count += 1 update(counts, method, match_count) if verbose: uprint("[processed]", processed) counts.unmatch += len(for1 - processed) # 计算是否完全没有 召回|正确 if len(processed) == 0: text = "no_precision" if is_precision else "no_recall" item1['eval_result'].append(text)
def test_test_data(self):
    # NOTE(review): this test is short-circuited — the unconditional
    # return makes everything below unreachable dead code.  Delete the
    # return to re-enable the match-rate check.
    return True
    match_count = 0
    for record1 in data_list_test:  # 4 test items in total
        item1 = FoobarModel(record1)
        # An item counts as matched when the recommended tags and the
        # ground-truth tags share at least one element.
        common_tags1 = set(recommend_tags(item1)) & set(item1.tags)
        uprint('[common_tags]', common_tags1)
        if common_tags1:
            match_count += 1
    match_rate = match_count / float(len(data_list_test))
    print "[match_rate]", match_rate
def inspect(self, name=None):
    """Debug-print one aspect of this object, selected by *name*.

    ``'features_weight'`` dumps every node's feature weights ranked by
    count; no *name* prints the object itself.  ``'name_to_nodes'`` and
    ``'feature_to_nodes'`` are accepted but currently no-ops.
    """
    if name is None:
        uprint(self)
        return
    if name in ('name_to_nodes', 'feature_to_nodes'):
        pass  # recognized selectors, not implemented yet
    if name == 'features_weight':
        for node_name1, node_set1 in self.name_to_nodes.iteritems():
            for node1 in node_set1:
                uprint(node1.name,
                       Counter(node1.features_weight).most_common(),
                       "\n")
def extract_features_weight(self, item1):
    """Build the character-level feature Counter for *item1*.

    Word-segmentation features are currently disabled (the commented
    experiment below showed no benefit), so the result is simply the
    per-character Counter of the item content with stop-list entries
    filtered out in place.
    """
    assert isinstance(item1.item_content, unicode)
    # Disabled experiment — segment features had no effect:
    #segment_features = self.classify.documents_with_segments.get(item1.item_id, None)
    #if not segment_features:
    #    segment_features = Counter(jieba_parse(item1.item_content))
    segment_features = Counter()
    unicode_features = Counter(item1.item_content)
    mix_features = segment_features + unicode_features
    self.filter_by_stop_list(mix_features)
    if self.debug:
        uprint("[mix_features]", mix_features)
    return mix_features
def inspect_result(result, filter_item_ids=set([])): for idx1, two_parts in enumerate(result): print "第", idx1 + 1, "个" original_tags, recommend_data = two_parts if recommend_data['item_id'] not in filter_item_ids: continue print "试题ID", recommend_data['item_id'] print "试题内容", recommend_data['item_content'] uprint(u"关键词列表 => 熵", recommend_data['features_weight']) uprint(u"原始标签:", original_tags) uprint(u"推荐标签:", recommend_data['recommend_tags']) uprint(u"推荐细节:", recommend_data['recommend_tags_detail']) print "\n" * 3
def recommend_tags(item1): """ 参数: 输入的item1一般来说必须是持久化的。 """ # TODO 可能把 node1.features_weight 直接优化成数组来做 # A. 计算特征相似度 result_rule = self.association_rule_learning_engine(item1) # B. 计算文本相似度 result_text = self.text_similarity_engine(item1, result_rule['data']) count_feature, count_text = result_rule['counter'], result_text['counter'] result_mix = sorted([[t1[0], count_feature[t1[0]] + count_text[t1[0]] * self.mix_unicode_coefficient] for t1 in result_rule['data']], key=lambda i1: -i1[1]) print "=" * 60 uprint(u"题目ID", item1.item_id) uprint(u"题目content", item1.item_content) print uprint(u"original 知识点", self.model.tags_model__extract_tags(item1)) uprint(u"features 相似度", result_rule['data']) uprint(u"unicode 相似度", result_text['data']) uprint(u"mix 相似度", result_mix) if result_mix: candidate_tags = result_mix[0:self.default_guess_count] # 用于提升超过两个推荐的"正确度"。 max_score = max([i1[1] for i1 in result_mix]) candidate_tags = filter(lambda i1: i1[1] >= (max_score * self.mix_score_percent), candidate_tags) else: candidate_tags = [] uprint(u"[final]", candidate_tags) print "\n" * 3 candidate_tags = [{"name": name1, "ids": self.tags_tree.fetch_name_ids(name1), "weight": weight1} for name1, weight1 in candidate_tags] return { "item_id": item1.item_id, "item_content": item1.item_content, "recommend_tags": candidate_tags, "original_tags": self.model.tags_model__extract_tags(item1), }
def recommend_tags(item1): """ 参数: 输入的item1一般来说必须是持久化的。 """ # TODO 可能把 node1.features_weight 直接优化成数组来做 # A. 计算特征相似度 result_rule = self.association_rule_learning_engine(item1) # B. 计算文本相似度 result_text = self.text_similarity_engine(item1, result_rule['data']) count_feature, count_text = result_rule['counter'], result_text[ 'counter'] result_mix = sorted([[ t1[0], count_feature[t1[0]] + count_text[t1[0]] * self.mix_unicode_coefficient ] for t1 in result_rule['data']], key=lambda i1: -i1[1]) print "=" * 60 uprint(u"题目ID", item1.item_id) uprint(u"题目content", item1.item_content) print uprint(u"original 知识点", self.model.tags_model__extract_tags(item1)) uprint(u"features 相似度", result_rule['data']) uprint(u"unicode 相似度", result_text['data']) uprint(u"mix 相似度", result_mix) if result_mix: candidate_tags = result_mix[0:self.default_guess_count] # 用于提升超过两个推荐的"正确度"。 max_score = max([i1[1] for i1 in result_mix]) candidate_tags = filter( lambda i1: i1[1] >= (max_score * self.mix_score_percent), candidate_tags) else: candidate_tags = [] uprint(u"[final]", candidate_tags) print "\n" * 3 candidate_tags = [{ "name": name1, "ids": self.tags_tree.fetch_name_ids(name1), "weight": weight1 } for name1, weight1 in candidate_tags] return { "item_id": item1.item_id, "item_content": item1.item_content, "recommend_tags": candidate_tags, "original_tags": self.model.tags_model__extract_tags(item1), }