import util
from tfidf import Tfidf  # assumed import; Tfidf is defined elsewhere in this project


def main():
    words = []
    sentences = []
    with open('../dataset/computer.txt', 'r') as f:
        for line in f:
            words += util.split_words(line)
            sentences += util.split_sentences(line)

    # Tf-idf evaluation (sample_doc_ids is assumed to be defined at module level)
    tfidf = Tfidf(words, sample_doc_ids)
    tfidf.calc_tfidf()

    # Show tf-idf values
    sorted_tfidf = util.sort_dict_by_value(tfidf.tfidf)
    for word, value in sorted_tfidf:
        print('Word: {0:25}; tfidf = {1}'.format(word, value))

    # Work out summary
    summary = tfidf.best_sentences(sentences, 100)
    for sentence in summary:
        print(sentence.text)
        print("Score: {0}\n".format(sentence.score))

    print("-----------\nDONE")
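# main() above assumes a Tfidf class exposing calc_tfidf(), a word -> score
# dict `tfidf`, and best_sentences(); that class is not shown in this excerpt.
# The sketch below is a minimal, hypothetical rendering of the standard
# tf * log(N / df) computation it presumably performs; the names and the
# smoothing choice are assumptions, not the project's actual code.
import math
from collections import Counter


def tfidf_sketch(doc_words, docs):
    """Hypothetical tf-idf over a reference collection `docs`."""
    tf = Counter(doc_words)
    n_docs = len(docs)
    scores = {}
    for word, count in tf.items():
        # Document frequency: how many reference docs contain the word.
        df = sum(1 for d in docs if word in d)
        # Add-one smoothing keeps df = 0 from producing a division by zero.
        scores[word] = (count / len(doc_words)) * math.log((n_docs + 1) / (df + 1))
    return scores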
def split_words_for_index(self, title):
    """Split a title into index terms; include pinyin tokens when pinyin matching is enabled."""
    words = split_words(title)
    if util.pinyin_match:
        words += split_pinyin(title)
    return words
        # plus 1 smoothing
        prob_fe[(fps, eps)] = math.log(count_e[eps] + 1) - math.log(c)
    return prob_fe


def dump_probs(probs, fname):
    with open(fname, "w") as out:
        for ((fps, eps), p) in probs.items():
            if len(fps) > 0 and len(eps) > 0:
                out.write("%s\t%s\t%.4f\n" % (fps, eps, p))


train_src = util.read_file(sys.argv[1])
train_tgt = util.read_file(sys.argv[2])
src_words = util.split_words(train_src)
tgt_words = util.split_words(train_tgt)
alignment_e, alignment_f = util.read_alignment(sys.argv[3])
outf = sys.argv[4]
max_len = int(sys.argv[5])  # max phrase length


def process_i(i):
    return i, phrase_extract(alignment_e[i], alignment_f[i],
                             tgt_words[i], src_words[i], max_len)


NUM_THREADS = 4


def main():
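# The excerpt cuts off at def main(). Given process_i and NUM_THREADS above,
# the driver presumably fans the per-sentence phrase extraction out to a
# worker pool; the sketch below is one plausible reading (multiprocessing.Pool
# is an assumption, the original may use threads or a plain loop instead).
from multiprocessing import Pool


def main_sketch():
    # process_i returns (sentence_index, extracted_phrase_pairs).
    with Pool(NUM_THREADS) as pool:
        results = pool.map(process_i, range(len(src_words)))
    # Downstream, the extracted pairs would be counted and turned into the
    # smoothed log-probabilities that dump_probs() writes to outf.
    return results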
def __init__(self, text):
    self.text = text
    self.score = 0.0
    self.words = util.split_words(self.text)
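# main() in the first snippet calls tfidf.best_sentences(sentences, 100) and
# reads each sentence's .score, but the scorer itself is not shown here. A
# minimal sketch, assuming a sentence scores as the mean tf-idf of its words
# and that the second argument caps the sentences returned (both assumptions):
def best_sentences_sketch(tfidf_values, sentences, n):
    for s in sentences:
        if s.words:
            # Average the tf-idf of the sentence's words; unseen words count 0.
            s.score = sum(tfidf_values.get(w, 0.0) for w in s.words) / len(s.words)
    return sorted(sentences, key=lambda s: s.score, reverse=True)[:n]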
def query(name, text, offset=0, limit=10, sort_field='id', conditions=None):
    """Search the index `name` for `text`, optionally filtered by `conditions`,
    returning up to `limit` records sorted by score."""
    conditions = conditions if isinstance(conditions, dict) and conditions else {}
    tm = time.time()
    result = []
    # If there is neither search text nor conditions, return [] immediately
    if not text.strip() and not conditions:
        return result

    text = utf8(text.strip())
    splited_words = split_words(text)
    words = [mk_sets_key(name, word) for word in splited_words]

    if conditions:
        condition_keys = [mk_condition_key(name, c, utf8(conditions[c]))
                          for c in conditions]
        # Add the condition keys to the keyword set so they join the sinterstore search
        words += condition_keys
    else:
        condition_keys = []

    if not words:
        return result

    temp_store_key = "tmpinterstore:%s" % "+".join(words)

    if len(words) > 1:
        if not util.redis.exists(temp_store_key):
            # Intersect the word sets and store the result in a temporary key
            util.redis.sinterstore(temp_store_key, words)
            # Expire the temporary search result automatically after one day
            util.redis.expire(temp_store_key, 86400)

        # Pinyin search
        if util.pinyin_match:
            splited_pinyin_words = split_pinyin(text)
            pinyin_words = [mk_sets_key(name, w) for w in splited_pinyin_words]
            pinyin_words += condition_keys
            temp_sunion_key = "tmpsunionstore:%s" % "+".join(words)
            temp_pinyin_store_key = "tmpinterstore:%s" % "+".join(pinyin_words)
            # Intersect the pinyin word sets
            util.redis.sinterstore(temp_pinyin_store_key, pinyin_words)
            # Merge the Chinese and pinyin search results
            util.redis.sunionstore(temp_sunion_key, [temp_store_key, temp_pinyin_store_key])
            # Expire the temporary search results automatically after one day
            util.redis.expire(temp_pinyin_store_key, 86400)
            util.redis.expire(temp_sunion_key, 86400)
            temp_store_key = temp_sunion_key
    else:
        temp_store_key = words[0]

    # Fetch the requested slice of ids, sorted by score
    ids = util.redis.sort(temp_store_key, start=offset, num=limit,
                          by=mk_score_key(name, "*"), desc=True)
    result = hmget(name, ids, sort_field=sort_field)
    logging.debug("{}:\"{}\" | Time spent: {}s".format(name, text, time.time() - tm))
    return result
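# Hypothetical usage of query(); the index name "user" and the condition
# field "city" are made-up illustrations, not values from this project.
items = query("user", "张三", offset=0, limit=5, conditions={"city": "beijing"})
for item in items:
    print(item)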