Beispiel #1
0
def main():
    """Summarize a text file via tf-idf sentence scoring.

    Reads the corpus, computes tf-idf weights for its words, prints
    every word with its weight, then prints the best-scoring
    sentences as the summary.
    """
    words = []
    sentences = []
    # Tokenize the corpus line by line into words and sentences.
    with open('..//dataset//computer.txt', 'r') as f:
        for line in f:
            words.extend(util.split_words(line))
            sentences.extend(util.split_sentences(line))

    # Tf-idf evaluation.
    # NOTE(review): sample_doc_ids is not defined in this fragment —
    # presumably a module-level value; confirm against the full file.
    tfidf = Tfidf(words, sample_doc_ids)
    tfidf.calc_tfidf()

    # Dump each (word, weight) pair from the sorted tf-idf table.
    for word, weight in util.sort_dict_by_value(tfidf.tfidf):
        print('Word: {0:25}; tfidf = {1}'.format(word, weight))

    # Pick the highest-scoring sentences as the summary.
    summary = tfidf.best_sentences(sentences, 100)
    for sentence in summary:
        print(sentence.text)
        print("Score: {0}\n".format(sentence.score))

    print("-----------\nDONE")
Beispiel #2
0
    def split_words_for_index(self, title):
        """Tokenize *title* for indexing.

        Always returns the word tokens; when pinyin matching is
        enabled (``util.pinyin_match`` is truthy) the title's pinyin
        tokens are appended as well, so entries become searchable by
        pinyin.
        """
        tokens = split_words(title)
        if util.pinyin_match:
            tokens = tokens + split_pinyin(title)
        return tokens
    def split_words_for_index(self, title):
        """Return the index tokens for *title*.

        Word tokens are always included; pinyin tokens are added
        when the pinyin-match option is on.
        """
        result = list(split_words(title))
        if util.pinyin_match:
            result.extend(split_pinyin(title))
        return result
        # plus 1 smoothing
        prob_fe[(fps, eps)] = math.log(count_e[eps] + 1) - math.log(c)

    return prob_fe


def dump_probs(probs, fname):
    """Write phrase-pair log-probabilities to *fname*.

    Each output line is ``f-phrase<TAB>e-phrase<TAB>prob`` with the
    probability formatted to four decimals.  Pairs where either
    phrase is empty are skipped.
    """
    with open(fname, "w") as out:
        for key, p in probs.items():
            fps, eps = key
            # Skip degenerate entries with an empty phrase on either side.
            if len(fps) == 0 or len(eps) == 0:
                continue
            out.write("%s\t%s\t%.4f\n" % (fps, eps, p))


# Command-line driven setup for phrase extraction:
#   argv[1]: source-side training corpus file
#   argv[2]: target-side training corpus file
#   argv[3]: word-alignment file (yields per-sentence e/f alignments)
#   argv[4]: output file path
#   argv[5]: maximum phrase length
train_src = util.read_file(sys.argv[1])
train_tgt = util.read_file(sys.argv[2])
src_words = util.split_words(train_src)
tgt_words = util.split_words(train_tgt)
alignment_e, alignment_f = util.read_alignment(sys.argv[3])
outf = sys.argv[4]
max_len = int(sys.argv[5])  # max phrase length


def process_i(i):
    """Extract the phrases for sentence pair *i*.

    Returns ``(i, phrases)`` so parallel workers' results can be
    matched back to their sentence index.  Reads the module-level
    alignment/word lists and ``max_len``.
    """
    phrases = phrase_extract(alignment_e[i], alignment_f[i],
                             tgt_words[i], src_words[i], max_len)
    return i, phrases


# Worker pool size for parallel phrase extraction.
NUM_THREADS = 4


def main():
Beispiel #5
0
 def __init__(self, text):
     """Store the sentence text, tokenize it, and start the score at 0."""
     self.text = text
     self.words = util.split_words(self.text)
     self.score = 0.0
def query(name, text, offset=0, limit=10, sort_field='id', conditions=None):
    """Search the Redis-backed index *name* for *text*.

    Splits the query text into words (plus pinyin tokens when
    enabled), intersects the matching Redis sets, optionally narrows
    by *conditions* (a field -> value dict), and returns up to
    *limit* records starting at *offset*, ordered by the per-item
    score key, descending.
    """

    conditions = conditions if isinstance(conditions, dict) and conditions else {}

    tm = time.time()
    result = []

    # No search text and no filter conditions: return [] immediately.
    if not text.strip() and not conditions:
        return result

    text = utf8(text.strip())
    splited_words = split_words(text)

    words = [mk_sets_key(name, word) for word in splited_words]

    if conditions:
        condition_keys = [mk_condition_key(name, c, utf8(conditions[c]))
                          for c in conditions]
        # Fold the condition keys into the keyword keys so they take
        # part in the sinterstore intersection below.
        words += condition_keys
    else:
        condition_keys = []

    if not words:
        return result

    temp_store_key = "tmpinterstore:%s" % "+".join(words)

    if len(words) > 1:
        if not util.redis.exists(temp_store_key):
            # Intersect all the word sets and stash the result in a
            # temporary key.
            util.redis.sinterstore(temp_store_key, words)
            # Let the temporary search result expire after one day.
            util.redis.expire(temp_store_key, 86400)
        # Pinyin search.
        if util.pinyin_match:
            splited_pinyin_words = split_pinyin(text)

            pinyin_words = [mk_sets_key(name, w) for w in splited_pinyin_words]
            pinyin_words += condition_keys
            temp_sunion_key = "tmpsunionstore:%s" % "+".join(words)
            temp_pinyin_store_key = "tmpinterstore:%s" % "+".join(pinyin_words)
            # Intersection of the pinyin sets.
            util.redis.sinterstore(temp_pinyin_store_key, pinyin_words)
            # Union of the Chinese-word and pinyin search results.
            util.redis.sunionstore(temp_sunion_key, [temp_store_key, temp_pinyin_store_key])
            # Both temporaries auto-expire after one day.
            util.redis.expire(temp_pinyin_store_key, 86400)
            util.redis.expire(temp_sunion_key, 86400)
            temp_store_key = temp_sunion_key
    else:
        temp_store_key = words[0]

    # Pull the requested page of ids, ordered by score, descending.
    ids = util.redis.sort(temp_store_key,
                          start=offset,
                          num=limit,
                          by=mk_score_key(name, "*"),
                          desc=True)
    # NOTE(review): the near-identical variant of this function calls
    # util.hmget here; confirm `hmget` is in scope in this module.
    result = hmget(name, ids, sort_field=sort_field)
    logging.debug("{}:\"{}\" | Time spend:{}s".format(name, text, time.time()-tm))
    return result
Beispiel #7
0
def query(name, text, offset=0, limit=10, sort_field='id', conditions=None):
    """Search the index *name* for *text*, optionally filtered by
    *conditions* (a field -> value dict), returning up to *limit*
    matching records starting at *offset*.

    Word sets (and pinyin sets, when enabled) are intersected in
    Redis; temporary result keys expire after one day.
    """

    if not (isinstance(conditions, dict) and conditions):
        conditions = {}

    started = time.time()
    result = []

    # No search text and no filter conditions: return [] immediately.
    stripped = text.strip()
    if not stripped and not conditions:
        return result

    text = utf8(stripped)

    words = [mk_sets_key(name, w) for w in split_words(text)]

    condition_keys = [mk_condition_key(name, field, utf8(conditions[field]))
                      for field in conditions]
    # Fold the condition keys into the keyword keys so they take part
    # in the sinterstore intersection below.
    words += condition_keys

    if not words:
        return result

    temp_store_key = "tmpinterstore:%s" % "+".join(words)

    if len(words) > 1:
        if not util.redis.exists(temp_store_key):
            # Intersect every word set into a temporary key ...
            util.redis.sinterstore(temp_store_key, words)
            # ... that auto-expires after one day.
            util.redis.expire(temp_store_key, 86400)

        # Pinyin search.
        if util.pinyin_match:
            pinyin_words = [mk_sets_key(name, w)
                            for w in split_pinyin(text)]
            pinyin_words += condition_keys

            temp_sunion_key = "tmpsunionstore:%s" % "+".join(words)
            temp_pinyin_store_key = "tmpinterstore:%s" % "+".join(pinyin_words)

            # Intersection of the pinyin sets.
            util.redis.sinterstore(temp_pinyin_store_key, pinyin_words)
            # Union of the Chinese-word and pinyin search results.
            util.redis.sunionstore(temp_sunion_key,
                                   [temp_store_key, temp_pinyin_store_key])
            # Both temporaries auto-expire after one day.
            util.redis.expire(temp_pinyin_store_key, 86400)
            util.redis.expire(temp_sunion_key, 86400)

            temp_store_key = temp_sunion_key
    else:
        temp_store_key = words[0]

    # Pull the requested page of ids, ordered by score, descending.
    ids = util.redis.sort(temp_store_key,
                          start=offset,
                          num=limit,
                          by=mk_score_key(name, "*"),
                          desc=True)

    result = util.hmget(name, ids, sort_field=sort_field)
    logging.debug("%s:\"%s\" | Time spend:%ss" % (name, text, time.time() - started))
    return result