def get_documents(docs):
    # each doc is a (docid, text, subject-keyword length) tuple
    docs_tokens = []
    for doc in docs:
        tokens, sub_tok_len = get_keywords(doc[1], doc[2], tokenizer)
        docs_tokens.append((doc[0], tokens, sub_tok_len))

    # double the weight of every subject token
    for docid, tokens, sub_tok_len in docs_tokens:
        for i in range(sub_tok_len):
            tokens[i] = (tokens[i][0], tokens[i][1] * 2)

    # merge duplicate tokens, summing their weights, and sort by weight
    def merge_tokens(tokens):
        tmp_tokens = {}
        for tok, count in tokens:
            if tok in tmp_tokens:
                tmp_tokens[tok] += count
            else:
                tmp_tokens[tok] = count
        return sorted(tmp_tokens.items(), key=lambda d: d[1], reverse=True)

    docs_tokens_new = []
    for docid, tokens, sub_tok_len in docs_tokens:
        docs_tokens_new.append((docid, merge_tokens(tokens)))

    # for every token, record the documents it appears in;
    # for every document, record its total term count
    tokens_appear_map = {}
    doc_terms_count = {}
    for docid, tokens in docs_tokens_new:
        terms_count = 0
        for token, count in tokens:
            tokens_appear_map.setdefault(token, []).append(docid)
            terms_count += count
        doc_terms_count[docid] = terms_count

    #for token, appear_list in tokens_appear_map.items():
    #    print(token, appear_list)
    #for docid, tokens in docs_tokens_new:
    #    print(docid, doc_terms_count[docid])

    # print the final result
    #for docid, tokens in docs_tokens_new:
    #    print(docid)
    #    for tok, count in tokens:
    #        print(tok, count)

    documents = []
    for docid, tokens in docs_tokens_new:
        token_map = {}
        for token, count in tokens:
            token_map[token] = {"count": count}
        token_list = [token for token, count in tokens]
        documents.append({
            "text": "id:%s\ntext:%s" % (docid, " ".join(token_list)),
            "docid": docid,
            "token_map": token_map,
            "token_list": token_list,
        })
    return documents, tokens_appear_map, doc_terms_count
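# A minimal usage sketch for get_documents (assumptions: `get_keywords` and
# `tokenizer` come from elsewhere in this module, and the sample docids, texts,
# and subject-keyword lengths below are invented for illustration):
def _demo_get_documents():
    sample_docs = [
        ("doc-1", "raw text of the first post", 2),   # hypothetical document
        ("doc-2", "raw text of the second post", 1),  # hypothetical document
    ]
    documents, tokens_appear_map, doc_terms_count = get_documents(sample_docs)
    for doc in documents:
        # each entry carries the docid, the merged token list, and per-token counts
        print(doc["docid"], doc["token_list"][:5])
    for token, appear_list in tokens_appear_map.items():
        # document frequency of every token
        print(token, len(appear_list))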
def quota_sensitivity(topic, start_ts, end_ts):
    '''
    L is the number of sensitive words among the keywords and N is the number
    of keywords; the sensitivity is L / N. With sensitive-word set1 and
    keyword set2, this is min(1, len(set1 & set2) / len(set2)).
    '''
    limit = 50
    # top-`limit` keywords of the topic within [start_ts, end_ts]
    keywords_set = get_keywords(topic, start_ts, end_ts, limit)

    # classification 1: class-level sensitive words
    class_result = db.session.query(ClassSensitivity).filter(
        ClassSensitivity.topic == topic,
        ClassSensitivity.start_ts == start_ts,
        ClassSensitivity.end_ts == end_ts).first()
    class_sensitivity_set = set(json.loads(class_result.words))
    L = len(class_sensitivity_set & keywords_set)
    ratio_class = min(1.0, float(L) / float(limit))  # cap the ratio at 1
    save_sensitivity_quota(topic, start_ts, end_ts, 1, ratio_class)

    # classification 2: word-level sensitive words
    word_result = db.session.query(WordSensitivity).filter(
        WordSensitivity.topic == topic,
        WordSensitivity.start_ts == start_ts,
        WordSensitivity.end_ts == end_ts).first()
    word_sensitivity_set = set(json.loads(word_result.words))
    L = len(word_sensitivity_set & keywords_set)
    ratio_word = min(1.0, float(L) / float(limit))
    save_sensitivity_quota(topic, start_ts, end_ts, 2, ratio_word)

    # classification 3: place-level sensitive words
    place_result = db.session.query(PlaceSensitivity).filter(
        PlaceSensitivity.topic == topic,
        PlaceSensitivity.start_ts == start_ts,
        PlaceSensitivity.end_ts == end_ts).first()
    place_sensitivity_set = set(json.loads(place_result.words))
    L = len(place_sensitivity_set & keywords_set)
    ratio_place = min(1.0, float(L) / float(limit))
    save_sensitivity_quota(topic, start_ts, end_ts, 3, ratio_place)
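# A minimal sketch of the capped-ratio computation that quota_sensitivity
# applies to each of the three sensitivity classes above; `sensitive_words`
# and `keywords` are hypothetical stand-ins for the sets loaded from the
# database, and N is taken to be the keyword quota `limit`, as in the code:
def _sensitivity_ratio(sensitive_words, keywords, limit=50):
    L = len(set(sensitive_words) & set(keywords))  # overlap size
    return min(1.0, float(L) / float(limit))       # min(1, L / N)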