Example #1
def calculate_BM25(keywords, lst):  #calculates the BM25 weights

    results = []

    stopWords = set(stopwords.words('portuguese'))
    for i in range(len(keywords)):
        keywords[i] = keywords[i].lower()

    new_keywords = [word for word in keywords if word not in stopWords]
    #print(new_keywords)

    corpus = create_corpus(lst)
    #print(corpus)

    bm25 = BM25.BM25(corpus)
    average_idf = sum(float(val) for val in bm25.idf.values()) / len(
        bm25.idf)  #average idf
    #print(average_idf)

    scores = bm25.get_scores(new_keywords, average_idf)  #bm25 scores
    #print(scores)

    for i in range(len(scores)):
        if scores[i] != 0:  # keep only documents with a non-zero score
            results.append([scores[i], get_title(lst[i]), get_party(lst[i])])

    results.sort(key=lambda document: document[0],
                 reverse=True)  # most relevant first

    return results, new_keywords
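The pattern above relies on the gensim 3.x module gensim.summarization.bm25, where a BM25 object exposes an idf dict and get_scores() still accepts an average_idf argument (the argument was dropped in gensim 3.8, and the module was removed in 4.x). A minimal self-contained sketch of the same idiom, with a toy corpus standing in for create_corpus():

# Sketch of the gensim BM25 + average_idf idiom, assuming gensim <= 3.7.x.
from gensim.summarization import bm25 as gensim_bm25

corpus = [["tax", "health"], ["tax", "economy"], ["science"]]  # tokenized docs
model = gensim_bm25.BM25(corpus)
average_idf = sum(float(val) for val in model.idf.values()) / len(model.idf)
scores = model.get_scores(["tax"], average_idf)  # one score per document
print(max(range(len(scores)), key=scores.__getitem__))  # best-matching doc index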
Example #2
def create_bm25():
    """Function to create or update BM25 object"""
    # Run arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--task",
                        default=4,
                        type=int,
                        help="Task where: \
                            -task 1 : classification subcat \
                            -task 2 : classification cat \
                            -task 3 : ner \
                            -task 4 : qa")
    parser.add_argument('--register_model', action='store_true', help="")
    parser.add_argument('--download_train', action='store_true', help="")
    args = parser.parse_args()

    # Load data
    cl = pr.Clean(task=args.task, download_train=args.download_train)
    data = cl.dt.load('fn_clean', dir='data_dir')

    # Split tokenized data
    toks = data.question_clean.apply(cl.transform_by_task).to_list()

    # Create BM25 Object
    bm = bm25.BM25(toks)

    # Dump objects
    with open(cl.dt.get_path('fn_rank', 'model_dir'), 'wb') as fp:
        pickle.dump(bm, fp)
        pickle.dump(data, fp)
    logger.warning('[INFO] Created and stored BM25 object.')

    # Upload
    if args.register_model:
        cl.dt.upload('model_dir', destination='model')
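Since the two objects are pickled sequentially into one file, a matching load has to unpickle them in the same order. A sketch of that counterpart, assuming the same cl helper and path as above:

# Sketch: objects come back in dump order from the same file handle.
import pickle

with open(cl.dt.get_path('fn_rank', 'model_dir'), 'rb') as fp:
    bm = pickle.load(fp)    # the BM25 object, dumped first
    data = pickle.load(fp)  # the data, dumped second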
Example #3
	def train(self, knownDocuments):
		self.corpus = []
		self.filenames = []
		for f in knownDocuments:
			self.corpus.append(tokenization(f))
			self.filenames.append(f)
		dictionary = corpora.Dictionary(self.corpus)
		doc_vectors = [dictionary.doc2bow(text) for text in self.corpus]

		self.bm25Model = bm25.BM25(self.corpus)
		self.average_idf = sum(map(lambda k: float(self.bm25Model.idf[k]), self.bm25Model.idf.keys())) / len(self.bm25Model.idf.keys())

	def analyze(self, unknownDocument):
		url_k_words = "articles/sanchongmen_ch5.txt"
		with open(url_k_words, "rb") as f:
			sentence = f.read()
		# Based on TF-IDF to get key words
		# tags = jieba.analyse.extract_tags(sentence, withWeight=True, topK=20, allowPOS=())
		# Based on TextRank to get top K words
		tags = jieba.analyse.textrank(sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
		query = tags
		scores = self.bm25Model.get_scores(query)
		# scores.sort(reverse=True)
		idx = scores.index(max(scores))
		fname = self.filenames[idx]
		return fname
	def displayName(self):
		return "BM25 Driver for Chinese"
Example #4
def get_ranks_baseline(data, tables):
    texts, id_to_index = get_table_words(tables)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    bm25_obj = bm25.BM25(corpus)
    ranks, num_samples = [], len(data)
    for count, one_data in enumerate(data):
        id, query = one_data['table_id'], one_data['question_tokens']
        query_doc = dictionary.doc2bow(query.split())
        scores = bm25_obj.get_scores(query_doc)
        score_tuples = [(score, i) for i, score in enumerate(scores)]
        score_tuples.sort(reverse=True)
        target_index = id_to_index[id]
        rank = -1
        for index, tup in enumerate(score_tuples):
            cur_index = tup[1]
            if cur_index == target_index:
                rank = index + 1
                break
        if rank != -1:
            ranks.append(rank)
        else:
            print('Invalid baseline input')
        if count % 100 == 0:
            print('Done ' + str(count) + ' out of ' + str(num_samples) + ' inputs')
    return ranks
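Note that gensim's BM25 simply counts each hashable item in a document as a term, so indexing doc2bow output (as above) makes the "terms" (token_id, count) tuples, and a query tuple only matches a document tuple when both the id and the count agree. A sketch of the variant that indexes raw token lists instead, assuming the same get_table_words() helper:

# Variant sketch: index token lists directly, so query tokens match
# document tokens regardless of per-document counts.
def get_ranks_baseline_tokens(data, tables):
    texts, id_to_index = get_table_words(tables)
    bm25_obj = bm25.BM25(texts)  # documents as lists of tokens
    ranks = []
    for one_data in data:
        scores = bm25_obj.get_scores(one_data['question_tokens'].split())
        order = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
        ranks.append(order.index(id_to_index[one_data['table_id']]) + 1)
    return ranks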
Example #5
def get_fitness_answer(input_seq):
    # match the question via BM25 similarity
    input_seq = get_final_input(input_seq)

    bm25Model = bm25.BM25(question_list)
    average_idf = sum(
        map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(
            bm25Model.idf.keys())
    scores = bm25Model.get_scores(input_seq, average_idf)

    sorted_scores = list(set(scores))
    sorted_scores.sort()

    answer = []
    question = []

    if sorted_scores[-1] > 0:
        one = scores.index(sorted_scores[-1])
        answer.append(get_key(question_answer_direct, question_list[one]))
        question.append(get_key(qa_dict, answer_list[one]))

        if len(sorted_scores) > 1 and sorted_scores[-2] > 0:
            two = scores.index(sorted_scores[-2])
            answer.append(get_key(question_answer_direct, question_list[two]))
            question.append(get_key(qa_dict, answer_list[two]))

            if len(sorted_scores) > 2 and sorted_scores[-3] > 0:
                three = scores.index(sorted_scores[-3])
                answer.append(
                    get_key(question_answer_direct, question_list[three]))
                question.append(get_key(qa_dict, answer_list[three]))

    return answer, question
Example #6
def buildModel(jsonFile, fieldNames, query_str):
    # an iterable cannot be traversed twice, so create two of them
    t1 = jsonutil.iterCutFieldList(jsonFile, fieldNames)
    t2 = jsonutil.iterCutFieldList(jsonFile, fieldNames)

    # build the token-to-id dictionary
    dictionary = corpora.Dictionary(t1)
    dictionary.save(DICTIONARY_PATH)

    # build the bag-of-words corpus: text as lists of (token id, count) pairs
    corpus = [dictionary.doc2bow(text) for text in t2]
    print("bag-of-words documents: %i " % len(corpus))

    bm25Model = bm25.BM25(corpus)


    # print("bm25 idf lens: %i " %len(bm25Model.f))

    average_idf = sum(map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(bm25Model.idf.keys())

    query = jiebautil.cutWords(query_str).split()
    query_bow = dictionary.doc2bow(query)

    scores = bm25Model.get_scores(query_bow, average_idf)
    # i = scores.index(max(scores))

    lineRead = LineReader(jsonFile)
    for i in range(5):
        score = max(scores)
        lineNum = scores.index(score) + 1
        s = lineRead.load(lineNum)
        j = json.loads(s)
        print(jsonutil.recursive_get(j, fieldNames[0]))

        del scores[lineNum-1]
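The same top-5 selection can also be written without mutating scores at all; a sketch using heapq.nlargest, assuming the same LineReader as above:

# Sketch: pick the five best line numbers in one pass, scores untouched.
import heapq

top5 = heapq.nlargest(5, enumerate(scores, start=1), key=lambda t: t[1])
for lineNum, score in top5:
    j = json.loads(lineRead.load(lineNum))
    print(jsonutil.recursive_get(j, fieldNames[0]))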
Example #7
def get_fitness_answer(input_seq):
    # match the question via BM25 similarity
    input_seq = get_final_input(input_seq)

    bm25Model = bm25.BM25(question_list)
    scores = bm25Model.get_scores(input_seq)

    sorted_scores = list(set(scores))
    sorted_scores.sort()

    answer = []
    question = []

    if sorted_scores[-1] > 0:
        one = scores.index(sorted_scores[-1])
        answer.append(get_key(question_answer_direct, question_list[one]))
        question.append(get_key(qa_dict, answer_list[one]))

        if len(sorted_scores) > 1 and sorted_scores[-2] > 0:
            two = scores.index(sorted_scores[-2])
            answer.append(get_key(question_answer_direct, question_list[two]))
            question.append(get_key(qa_dict, answer_list[two]))

            if len(sorted_scores) > 2 and sorted_scores[-3] > 0:
                three = scores.index(sorted_scores[-3])
                answer.append(
                    get_key(question_answer_direct, question_list[three]))
                question.append(get_key(qa_dict, answer_list[three]))

    return answer, question
Example #8
def train():
    '''
        Matching procedure
    :return:
    '''
    # build the matching corpus: 398,872 samples
    sku_names_texts = get_train_datas()
    sku_names_jieba = get_text_jieba(sku_names_texts)
    print(len(sku_names_texts), len(sku_names_jieba))
    print(sku_names_jieba[0])

    # test data: 1,000 samples
    keywords_texts = get_test_datas()
    keywords_jieba = get_text_jieba(keywords_texts)
    print(len(keywords_texts))

    # build the vocabulary
    dictionary = corpora.Dictionary(sku_names_jieba)
    print(len(dictionary))

    # build the BM25 model with gensim
    bm25Model = bm25.BM25(sku_names_jieba)
    # compute the average inverse document frequency, following the gensim source
    average_idf = sum(map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(bm25Model.idf.keys())

    with open("result/bm25_v1_results.txt", 'a', encoding='utf8') as wf:
        for i, item in enumerate(keywords_jieba):
            scores = bm25Model.get_scores(item, average_idf)
            # sorted_scores = sorted(scores, reverse=True)[:10]
            idx = scores.index(max(scores))
            print(i, "||", keywords_texts[i], "||", sku_names_texts[idx])
            wf.write(str(i) + "||" + keywords_texts[i] + "||" + sku_names_texts[idx] + "\n")
Example #9
 def __init__(self,
              corpus_file_pattern=None,
              stop_words_file="stop_words.txt",
              MAX_LEN=300,
              path="./"):
     """
     BM25检索模块,主要是在BM25库基础上封装了预处理部分。
     :param corpus_file_pattern: 检索资料库-文本数据 str
     :param stop_words_file: 停用词表 str
     :param path: 保存的模型目录 str
     """
     os.makedirs(path, exist_ok=True)
     self.model = os.path.join(path, "bm25.m")
     self.sen = os.path.join(path, "sen.pkl")
     self.stop = os.path.join(path, "stop.pkl")
     self.MAX_LEN = MAX_LEN
     if os.path.isfile(self.model) and os.path.isfile(
             self.sen) and os.path.isfile(self.stop):
         self.load()
     else:
         assert corpus_file_pattern is not None, "Can not find model or corpus file."
         if os.path.isfile(stop_words_file):
             self.stop_words = self.load_stop_words(stop_words_file)
         self.sentences, corpus = self.get_corpus(corpus_file_pattern)
         self.bm25 = bm25.BM25(corpus)
         self.dump()
Example #10
def index_bm25():
    lemmas = df['lemmas'].tolist()
    lemmas = [doc.split(' ') for doc in lemmas]
    dictionary = corpora.Dictionary(lemmas)
    corpus = [dictionary.doc2bow(text) for text in lemmas]
    bm25_obj = bm25.BM25(corpus)
    return bm25_obj, dictionary
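A usage sketch for index_bm25; the query string here is hypothetical, and the query must be converted with the same dictionary the corpus was built from:

# Usage sketch (the query text is made up for illustration).
bm25_obj, dictionary = index_bm25()
query_doc = dictionary.doc2bow("some query lemmas".split())
scores = bm25_obj.get_scores(query_doc)
top10 = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:10]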
Example #11
def getScores(dictionary):
    bm25Model = bm25.BM25(corpus)
    # split the query string into terms
    query = query_str.strip().split()
    scores = bm25Model.get_scores(query)
    return scores
Example #12
 def get_most_similarity_article(self):
     """
     透過文章corpus取得average_idf,以便後續計算相關
     """
     dictionary = corpora.Dictionary(self.corpus)
     bm25Model = bm25.BM25(self.corpus)
     average_idf = sum(map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(bm25Model.idf.keys())
     return [bm25Model, average_idf]
Example #13
def doc_process_bm25(raw_docs):
    gen_docs = [[w.lower() for w in tokenizer.tokenize(text) if w not in stop_words]
                for text in raw_docs]
    # dictionary = gensim.corpora.Dictionary(gen_docs)
    # corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    bm25Model = bm25.BM25(gen_docs)
    return bm25Model
Example #14
 def _build(self):
     # extract classes
     self.classes = self.api.classes
     self.class_docs = {
         c.path: self._preprocess_text(c.embedded_text).split(" ")
         for c in self.classes
     }
     self.index = bm25.BM25([self.class_docs[c.path] for c in self.classes])
Example #15
    def init_bm25(self, corpus):
        ids, questions = [], []
        for row in corpus:
            ids.append(row['id'])
            questions.append(row['tokens'])

        self.idx2id = {i: qid for i, qid in enumerate(ids)}
        self.id2idx = {qid: i for i, qid in enumerate(ids)}
        self.bm25 = bm25.BM25(questions)
Example #16
def creat_model(path_model, path_avgidf):
    print("Creating bm25 model...")
    bm25Model = bm25.BM25(corpus)
    average_idf = sum(
        map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(
            bm25Model.idf.keys())
    with open(path_model, 'wb') as fm:
        pickle.dump(bm25Model, fm)
    with open(path_avgidf, 'wb') as fa:
        pickle.dump(average_idf, fa)
    return bm25Model, average_idf
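The matching load step would read the two pickles back from the same paths; a short sketch:

# Sketch: counterpart to creat_model(), reading the model and average idf back.
def load_model(path_model, path_avgidf):
    with open(path_model, 'rb') as fm, open(path_avgidf, 'rb') as fa:
        return pickle.load(fm), pickle.load(fa)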
Example #17
 def __init__(self):
     self.tokenizer = tokenizers.get_class('ltp')()
     self.docdb = retriever.get_class('sqlite')()
     self.corpus = retriever.utils.load_corpus(retriever.DEFAULTS['bm25_corpus_path'])
     self.bm25model = bm25.BM25(self.corpus)
     self.avg_idf = sum(map(lambda k: float(self.bm25model.idf[k]), self.bm25model.idf.keys())) \
         / len(self.bm25model.idf.keys())
     self.doc_titles = self.docdb.get_doc_ids()
     self.idx2title = {idx: self.doc_titles[idx] for idx in range(len(self.doc_titles))}
Example #18
    def train(self):
        if self.fresh:
            print("Re-tokenizing and building the BM25 model...")
            segs = self.initialize()
        else:
            print("Reading existing tokenization from %s and building the BM25 model..." % file_questions_segs)
            segs = read_file(file_questions_segs)
            segs = [eval(x) for x in segs]

        self.model = bm25.BM25(segs)
Example #19
def training_bm25_model(data_set):
    sentence_tmp_list = list()
    for data in data_set:
        sentence_tmp_list.append(list(jieba.cut(data)))
    model = bm25.BM25(sentence_tmp_list)
    return model
Example #20
def build_bm25(datas):
    corpus = []
    docid2index = {}
    for cur_id, cur_data in enumerate(datas):
        corpus.append(cur_data['doc1']['tokens_without_stopwords'])
        docid2index[cur_data['doc1']['docid']] = cur_id * 2
        corpus.append(cur_data['doc2']['tokens_without_stopwords'])
        docid2index[cur_data['doc2']['docid']] = cur_id * 2 + 1
    bm25Model = bm25.BM25(corpus)

    return bm25Model, docid2index
Example #21
def get_fitness_answer(input_seq):
    """Finally, answer the question with BM25"""
    input_seq = get_final_input(input_seq)
    question_list, question_answer_direct = get_bm_data()
    bm25Model = bm25.BM25(question_list)
    scores = bm25Model.get_scores(input_seq)
    max_score = max(scores)
    idx = scores.index(max_score)
    answer = get_key(question_answer_direct, question_list[idx])
    answer = str(answer[0])
    return max_score, answer
Example #22
def BM25(question_input):
    documentation_id = question_input["docid"]
    corpus = []
    temp_fini = []
    # each entry in the document text is one sentence; split it into words
    for sentence in documentation[documentation_id]["text"]:
        corpus.append(sentence.split(" "))

    query_str = question_input["question"]
    query_str_list = query_str.split(" ")
    simply_corpus = simply(corpus)
    bm25Model = bm25.BM25(simply_corpus)
    average_idf = sum(
        map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(
            bm25Model.idf.keys())
    scores = bm25Model.get_scores(query_str_list, average_idf)
    position = scores.index(max(scores))
    aim_sentence = documentation[documentation_id]["text"][position]
    aim_sentence_list = aim_sentence.split(".")
    for a in aim_sentence_list:
        temp_fini.append(a.split(" "))
    tmp_fini_simply = simply(temp_fini)
    bm25Model = bm25.BM25(tmp_fini_simply)
    average_idf = sum(
        map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(
            bm25Model.idf.keys())
    scores = bm25Model.get_scores(query_str_list, average_idf)
    position = scores.index(max(scores))
    aim_sentence = aim_sentence_list[position]
    #    print("aim_sentence",aim_sentence)
    return aim_sentence
Example #23
def bm25_sim(corpus, sent, topk=5):
    '''
    corpus:
        ['*****', '******', '******', ...]
    sent: ['**', '*', '***', '**', ...]
    '''
    model = bm25.BM25(corpus)
    scores = model.get_scores(sent)
    scores = sorted(list(enumerate(scores)), key=lambda k: k[1],
                    reverse=True)[:topk]

    index = [idx[0] for idx in scores]
    return index
Example #24
def create_bm25_model(questions, answers):
    questions_tokens = []
    for q in tqdm(questions, desc="cut"):
        q_tokens = n_grams(q, n)
        questions_tokens.append(q_tokens)

    model = bm25.BM25(questions_tokens)
    average_idf = sum(float(val)
                      for val in model.idf.values()) / len(model.idf)
    data = [model, answers, average_idf]
    save_to_pkl(file=pkl_bm25, data=data)
    return model, answers, average_idf
Example #25
    def __init__(self, corpus):
        """
        Parameters
        ----------
        corpus : list of list of str
            Given corpus.

        """
        self.bm25 = bm25.BM25(corpus)
        self.average_idf = sum(
            map(lambda k: float(self.bm25.idf[k]),
                self.bm25.idf.keys())) / len(self.bm25.idf.keys())
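With this wrapper, a tokenized query is scored against the corpus via the stored model and average idf. A usage sketch; the class name Retriever is hypothetical, since the snippet only shows __init__:

# Usage sketch (Retriever is a stand-in name for the class above).
retriever = Retriever([["hello", "world"], ["foo", "bar"]])
scores = retriever.bm25.get_scores(["hello"], retriever.average_idf)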
Example #26
def train_text():
    # preprocess the test documents
    test_doc = []
    test_datas = pd.read_csv("test_data.csv", encoding="gbk")
    test_titles = test_datas["title"]
    for title in test_titles:
        test_doc.append(title)
    test_doc_list = []
    for doc in test_doc:
        doc_list = [word for word in jieba.cut(doc)]
        test_doc_list.append(doc_list)

    # preprocess the training set, filtering out noisy titles
    all_doc_list = []
    datas = pd.read_csv("train_data.csv")
    train_titles = datas["title"]
    for title in train_titles:
        if 13 < len(title) < 500:
            doc_list = [word for word in jieba.cut(title)]
            all_doc_list.append(doc_list)
        else:
            all_doc_list.append("。")

    # build the dictionary
    dictionary = corpora.Dictionary(all_doc_list)
    print(dictionary.num_pos)
    dictionary.filter_extremes(no_below=25, no_above=0.5, keep_n=12330000)

    bm25Model = bm25.BM25(all_doc_list)
    average_idf = sum(
        map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(
            bm25Model.idf.keys())
    print(average_idf)
    results = []
    for doc_test_list in test_doc_list:
        score = bm25Model.get_scores(doc_test_list, average_idf)
        similiar_sorted = sorted(enumerate(score),
                                 key=lambda item: -item[1])[:21]
        indexs = [str(item[0] + 1) for item in similiar_sorted]
        results.append(" ".join(indexs))
    # write results to file
    with open("answers.txt", "w") as f:
        for item in results:
            item = item.strip().split()
            f.write("source_id" + "\t" + "target_id" + "\n")
            for i in range(1, 21):
                f.write(item[0] + "\t" + item[i] + "\n")
Example #27
def load_bm25():
    data = load_data()
    lst_question = data['question'].apply(
        lambda x: preprocessing(str(x))).tolist()
    # lst_answer = data['answer'].apply(lambda x: preprocessing(str(x))).tolist()
    # lst_qa = lst_question + lst_answer

    texts = [item.split() for item in lst_question]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    bm25_obj = bm25.BM25(corpus)
    params = {"BM25": bm25_obj, "texts": texts, 'dictionary': dictionary}
    return params
Example #28
def BMsort(list1, query_str):  # list1: list of news documents; query_str: the user's search terms
    dic = corpora.Dictionary(list1)
    bm25Model = bm25.BM25(list1)
    average_idf = sum(
        map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(
            bm25Model.idf.keys())
    query = " ".join(jieba.cut(query_str)).strip().split()
    scores = bm25Model.get_scores(query, average_idf)
    return scores
Example #29
def baselines_eval():
    rankings_to_eval = read_query_test_rankings()
    qrels = parse_qrels()
    query_ids = list(qrels.keys())
    query_lookup = get_robust_eval_queries()
    queries = [query_lookup[query_id] for query_id in query_ids]
    k = 10 if len(sys.argv) == 1 else int(sys.argv[1])
    document_lookup = read_cache(name('./doc_lookup.json', ['with_titles']),
                                 get_robust_documents_with_titles)
    document_title_to_id = read_cache('./document_title_to_id.json',
                                      lambda: print('failed'))
    ordered_rankings_to_eval = [[
        document_title_to_id[title] for title in rankings_to_eval[query]
    ] for query in query_ids]
    ordered_qrels = [[document_title_to_id[title] for title in qrels[query]]
                     for query in query_ids]
    document_id_to_title = _.invert(document_title_to_id)
    doc_ids = range(len(document_id_to_title))
    documents = [
        document_lookup[document_id_to_title[doc_id]] for doc_id in doc_ids
    ]
    tokenizer = Tokenizer(
        rules=[handle_caps, fix_html, spec_add_spaces, rm_useless_spaces])
    tokenized_documents = read_cache(
        'tok_docs.json',
        lambda: tokenizer.process_all(clean_documents(documents)))
    tokenized_queries = tokenizer.process_all(clean_documents(queries))
    bm25 = gensim_bm25.BM25(tokenized_documents)
    # with open('./caches/106756_most_common_doc.json', 'r') as fh:
    #   doc_token_set = set(json.load(fh))
    # corpus, token_lookup = tokens_to_indexes(tokenized_documents,
    #                                          None,
    #                                          token_set=doc_token_set)
    # corpus = [[[token_lookup[term], f] for term, f in doc_fs.items()] for doc_fs in bm25.f]
    # tfidf = TfidfModel(corpus)
    # lsi = LsiModel(tfidf, id2word=_.invert(token_lookup), num_topics=300)
    glove_rankings = []
    # lsi_rankings = []
    glove = get_glove_lookup(embedding_dim=300, use_large_embed=True)
    encoded_docs = torch.stack(
        [encode_glove_fs(glove, bm25.idf, doc_fs) for doc_fs in bm25.f])
    encoded_docs = encoded_docs / torch.norm(encoded_docs, dim=1).unsqueeze(1)
    for q, qml_ranking in progressbar(zip(tokenized_queries,
                                          ordered_rankings_to_eval),
                                      max_value=len(tokenized_queries)):
        doc_ids = qml_ranking[:k] if '--rerank' in sys.argv else None
        glove_rankings.append(
            rank_glove(glove, bm25.idf, encoded_docs, q, doc_ids=doc_ids))
        # lsi_rankings.append(rank_lsi(lsi, tfidf, [token_lookup[term] if term in token_lookup else 0 for term in q], doc_ids=doc_ids))
    print('indri:', metrics_at_k(ordered_rankings_to_eval, ordered_qrels, k))
    print('glove:', metrics_at_k(glove_rankings, ordered_qrels, k))
Example #30
File: bm25.py Project: kpsc/nlp
def bm25_sim(corpus, result, topk=5):
    data = []
    with open(corpus, encoding='utf-8') as fcor:
        for line in tqdm(fcor):
            line = json.loads(line.strip())
            data.append(line['query'])

    with open(result, 'w', encoding='utf-8') as fres:
        stime = time.time()
        step = 1
        while len(data) > 5000:
            query = data.pop(0)
            model = bm25.BM25(data)
            scores = model.get_scores(query)
            scores = sorted(list(enumerate(scores)),
                            key=lambda k: k[1],
                            reverse=True)

            index = [scores[0][0]]
            index_remove = [scores[0][0]]
            temp = ''.join(query)
            for i in range(1, 100):
                idx, score = scores[i]
                if abs(score - scores[i - 1][1]) > 1e-4 and temp != ''.join(
                        data[idx]):
                    index.append(idx)
                index_remove.append(idx)

                if len(index) >= topk or score < 15.0:
                    break
            ids = len(index_remove)
            for i in range(ids, 100):
                if scores[i][1] >= scores[ids - 1][1] - 1.0:
                    index_remove.append(scores[i][0])

            if scores[0][1] > 20.0 and len(index) >= 3:
                output = {}
                output['query'] = ' '.join(query)
                output['candidates'] = [' '.join(data[i]) for i in index]
                fres.write(json.dumps(output, ensure_ascii=False) + '\n')

            index_remove = sorted(index_remove, reverse=True)
            for idx in index_remove:
                data.pop(idx)

            if step % 100 == 0:
                fres.flush()
                print('step: %d, spend-time: %.4f' %
                      (step, time.time() - stime))
                stime = time.time()
            step += 1