Example #1
def get_similarity(query, documents):
    docs = query + documents
    docs = [word_token(d, lemma=True) for d in docs]
    tokenized_corpus = [doc.split(' ') for doc in docs]
    # print(tokenized_corpus)
    bm25 = BM25Okapi(tokenized_corpus[1:])
    bm25plus = BM25Plus(tokenized_corpus[1:])
    bm25L = BM25L(tokenized_corpus[1:])

    query = tokenized_corpus[0]
    # print(query)
    bm25_scores = bm25.get_scores(query)
    bm25plus_scores = bm25plus.get_scores(query)
    bm25L_scores = bm25L.get_scores(query)

    bm25_scores = [(i, v) for i, v in enumerate(bm25_scores)]
    bm25plus_scores = [(i, v) for i, v in enumerate(bm25plus_scores)]
    bm25L_scores = [(i, v) for i, v in enumerate(bm25L_scores)]

    bm25_scores.sort(key=lambda x: x[1], reverse=True)
    bm25plus_scores.sort(key=lambda x: x[1], reverse=True)
    bm25L_scores.sort(key=lambda x: x[1], reverse=True)

    # print(bm25_scores)
    # print(bm25plus_scores)
    # print(bm25L_scores)

    return bm25_scores, bm25plus_scores, bm25L_scores
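A minimal, self-contained sketch of the same three-variant comparison, calling rank_bm25 directly on a toy corpus (the corpus and query strings are made up for illustration):

from rank_bm25 import BM25Okapi, BM25L, BM25Plus

corpus = ["the cat sat on the mat", "dogs chase cats", "the mat is red"]
tokenized_corpus = [doc.split(" ") for doc in corpus]
tokenized_query = "cat on mat".split(" ")

# each variant scores every corpus document against the query
for model in (BM25Okapi(tokenized_corpus), BM25L(tokenized_corpus), BM25Plus(tokenized_corpus)):
    scores = model.get_scores(tokenized_query)
    ranking = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
    print(type(model).__name__, ranking)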
Example #2
def bm25_classifier(query, descriptions, labels):
    """
    Computes BM25 scores of a given query in relation to all selected and preprocessed datasets and
    selects all datasets that exeed the threshold mean+3*sd.
    input: query and list of lables,
    output: list of labels that fit the query
    """
    preprocessed_descriptions = []
    for description in descriptions:
        preprocessed_descriptions.append(
            preprocessing.preprocess(str(description)))
    tokenized_corpus = [doc.split(" ") for doc in preprocessed_descriptions]
    bm25_modell = BM25Plus(tokenized_corpus)
    tokenized_query = query.split(" ")
    scores = bm25_modell.get_scores(tokenized_query)
    mean_scores = mean(scores)
    standard_deviation_scores = stdev(scores)
    selected = []
    for i in range(len(descriptions)):
        label = labels[i]
        score = scores[i]
        if score > (mean_scores + 4 * standard_deviation_scores):
            selected.append(label)
    return selected
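The selection step above is a simple outlier test on the BM25 score distribution; a standalone sketch of just that cutoff (the `mean`/`stdev` names are assumed to come from Python's statistics module, and the scores are illustrative):

from statistics import mean, stdev

scores = [1.2, 0.8, 9.7, 1.1, 0.9]            # e.g. BM25 scores for five documents
threshold = mean(scores) + 4 * stdev(scores)  # same cutoff as in bm25_classifier
selected = [i for i, s in enumerate(scores) if s > threshold]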
Example #3
    def get_sparse_embedding(self):
        # Load a cached BM25 index if one exists, otherwise build and pickle it.
        pickle_name = "bm25_pororo.bin"
        emd_path = os.path.join(self.data_path, pickle_name)
        if os.path.isfile(emd_path):
            with open(emd_path, "rb") as file:
                self.bm25 = pickle.load(file)
            print("Embedding pickle loaded.")
        else:
            print("Build passage embedding")
            self.bm25 = BM25Plus(self.tokenized_contexts)
            with open(emd_path, "wb") as file:
                pickle.dump(self.bm25, file)
            print("Embedding pickle saved.")
    def get_sentence_rank(self, query, top50_rank_list):
        self.query = query
        self.top50_rank_list = top50_rank_list
        top50_files_sent = []
        # for filename, rank in self.top50_rank_list:
        #     filename = filename.strip('.txt')
        #     sent_filename = filename + '_1.txt'
        #     top50_files_sent.append(sent_filename)
        #     f = open(sent_filename, 'r')
        #     document = f.read()
        #     f.close()
        #     sentences = nltk.sent_tokenize(document)  # this gives us a list of sentences
        #     # now loop over each sentence and tokenize it separately
        #     sentence_terms = ''
        #     for sentence in sentences:
        #         sentence_terms = self.tokenize(sentence)
        #     self.sentence_corpus.append(sentence_terms)

        for id, rank in self.top50_rank_list:
            filename = id.replace('Extracted Docs', 'Unprocessed_Docs')
            f = open(filename, 'r')
            document = f.read()
            f.close()
            sentences = nltk.sent_tokenize(document)  # this gives us a list of sentences
            # tokenize each sentence separately and add each one to the sentence-level corpus
            for sentence in sentences:
                self.sentence_corpus.append(self.tokenize(sentence))
        if (len(self.sentence_corpus) == 0):
            self.sentence_corpus = ['a']
        bm25 = BM25Plus(self.sentence_corpus)

        # query1 = "what debts did qintex group leave ?"
        # rank_bm25 expects a tokenized query, not a raw string
        tokenized_query = self.tokenize(self.query)

        sent_scores = bm25.get_scores(tokenized_query)
        # print(sent_scores)
        # sent_scores_sorted = np.argsort(sent_scores)[::-1][:self.no_of_docs_reqd]
        top_50_sentences = bm25.get_top_n(tokenized_query,
                                          self.sentence_corpus,
                                          n=self.no_of_docs_reqd)
        clean_sentences = []
        for tokened_sentence in top_50_sentences:
            clean_sentence = " ".join(tokened_sentence)
            clean_sentences.append(clean_sentence)
            # print(clean_sentence)
        return clean_sentences
Example #5
    def get_embedding_BM25(self):

        pickle_name = f"BM25_embedding.bin"
        emd_path = os.path.join(self.data_path, pickle_name)
        if os.path.isfile(emd_path):
            with open(emd_path, "rb") as file:
                self.BM25 = pickle.load(file)
            print("BM25 Embedding pickle load.")
        else:
            print("Build passage BM25_embedding")
            tokenized_contexts = [self.tokenizer(i) for i in self.contexts]
            self.BM25 = BM25Plus(tokenized_contexts)
            with open(emd_path, "wb") as file:
                pickle.dump(self.BM25, file)
            print("BM25 Embedding pickle saved.")
Example #6
def process_document(tokenized_corpus: list[list[str]], tokenized_document: list[str],
                     buffer: list[dict], queries: set[str]) -> list[dict]:
    prev_word = ''
    word_ind = 1
    words_total = len(tokenized_document)

    for word in tokenized_document:
        if word != chunk_keyword and '’' not in word and len(word) >= max_word_len:
            query = chunk_keyword + ' ' + prev_word + ' ' + word
            # query = chunk_keyword + ' ' + word

            tokenized_query = tokenize_query(query)

            orders = []

            for ind, bm25 in enumerate([
                    BM25L(tokenized_corpus),
                    BM25Okapi(tokenized_corpus),
                    BM25Plus(tokenized_corpus)
            ]):
                documents_order = get_documents_order(tokenized_query, bm25, ind)
                orders.append(documents_order)

            if all_different(orders) and prev_word not in queries and word not in queries:
                query_entry = {
                    'query': query,
                    'orders': list(zip(['BM25L', 'BM25Okapi', 'BM25Plus'], orders))
                }

                buffer.append(query_entry)
                queries.add(prev_word)
                queries.add(word)

            prev_word = word

        if word_ind % 100 == 0:
            print(f'{word_ind}/{words_total} words processed')

        word_ind += 1

    return buffer
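Note that process_document rebuilds all three BM25 variants from scratch for every candidate word, and a query is only recorded when the three variants produce different document orderings (all_different) and neither of its words has been used in an earlier query.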
 def get_doc_rank(self, query, scores):
     self.query = query
     self.scores = scores
     for id in self.scores:
         f = open(id, 'r')
         document = f.read()
         f.close()
         doc_terms = self.tokenize(document)
         self.doc_corpus.append(doc_terms)
     if (len(self.doc_corpus) == 0):
         self.doc_corpus = ['a']
     bm25 = BM25Plus(self.doc_corpus)
     # query1 = "what debts did qintex group leave ?"
     # rank_bm25 expects a tokenized query, not a raw string
     tokenized_query = self.tokenize(self.query)
     doc_scores = bm25.get_scores(tokenized_query)
     # print(doc_scores)
     doc_scores_sorted = np.argsort(doc_scores)[::-1]
     # top_n = bm25.get_top_n(tokenized_query, self.doc_corpus, n=1)
     # print(doc_scores_sorted)
     self.write_to_dict(doc_scores_sorted.tolist())
     self.top50_rank_list = self.top50_rank()
     return self.top50_rank_list
@application.route('/search/', methods=['POST'])
def search():
    query = request.form['query'].strip()
    docs = engine.get_top_k_docs(query, k=50)

    return jsonify(docs)


@application.route('/save_relevance/', methods=['POST'])
def save_relevance():
    query = request.form['query'].strip()
    doc_id = request.form['doc_id']
    rel_score = request.form['rel_score']

    engine.store_relevance_judgements(query, doc_id, rel_score)
    return ('', 204)


if __name__ == '__main__':
    print('Initializing Search Engine...')

    corpus = get_corpus('data/corpus1.pkl') + get_corpus('data/corpus2.pkl')
    tokenized_corpus = get_tokenized_corpus(
        'data/tokenized_corpus1.pkl') + get_tokenized_corpus(
            'data/tokenized_corpus2.pkl')
    model = BM25Plus(tokenized_corpus)
    engine = SearchEngine(model, corpus, 'data/relevance_feedback.txt')

    print('Done!')

    application.run()
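A hypothetical client call against the /search/ endpoint above, assuming the app is running on Flask's default local port (the query text is made up):

import requests

resp = requests.post("http://127.0.0.1:5000/search/", data={"query": "financial crisis"})
top_docs = resp.json()  # the top-50 documents returned by engine.get_top_k_docs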
Example #9
def get_resp():

    if request.method == 'POST':

        sent = request.form["sent"]
        query = request.form["query"]
        weight1 = request.form["weight1"]
        weight2 = request.form["weight2"]
        n = request.form["n"]

        ######
        if weight1 == "":
            weight1 = 0.5
        else:
            weight1 = float(weight1)

        if weight2 == "":
            weight2 = 0.5
        else:
            weight2 = float(weight2)

        if n == "":
            n = 3
        else:
            n = int(n)

        ######
        sentences = sent.split(',')

        candidate_embeddings = module.signatures['response_encoder'](
            input=tf.constant(sentences),
            context=tf.constant(sentences))['outputs']

        query_embedding = module.signatures['question_encoder'](tf.constant(
            [query]))['outputs'][0]
        similarities = angular_similarity([query_embedding],
                                          candidate_embeddings)
        score = similarities[0]  ###########  Ranker 1 - USEQA Angular #########

        #### BM25+ ####

        preprocessed_query = preprocess(query)
        preprocessed_sentences = [preprocess(sent) for sent in sentences]

        tokenized_corpus = [doc.split(" ") for doc in preprocessed_sentences]
        ranker = BM25Plus(tokenized_corpus)

        tokenized_query = preprocessed_query.split(" ")

        r2 = ranker.get_scores(tokenized_query)

        w1 = weight1
        w2 = weight2

        print("######################")
        print("weight one : ", w1)
        print("weight two : ", w2)
        print("####################")

        weighted_final = w1 * score + w2 * r2

        zipped = list(zip(weighted_final, sentences))

        resp1 = return_top_n(zipped, n)

        final = {"Query": query, "result": resp1}

        return (final)

    return '''<form method="POST">
# ----------------------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------------------

# ----------------------------------------------------------------------------------------------------------------------
from rank_bm25 import BM25Okapi, BM25L, BM25Plus
from Functions import *
from QEModels import *

corpus = DOCTEXT_list
tokenized_corpus = [doc.split(" ") for doc in corpus]

# Retrieval MODELS  ---------------------------------------------------------------------------------------------------
bm25 = BM25Okapi(tokenized_corpus)
bm25l = BM25L(tokenized_corpus)
bm25plus = BM25Plus(tokenized_corpus)

# RANKING --------------------------------------------------------------------------------------------------------------
# Original Query  - - - - - - - - -- - - - - -- - - -- - -- - -- - -- - - -- - -- - - - --- - - - - - -- - - -- - - -- -
runLines = []
with open("robust-BM25-run.txt", "w") as frun:
    for i in range(len(QUERYTEXT_list)):
        query = QUERYTEXT_list[i]
        tokenized_query = query.split(" ")
        print(tokenized_query)

        #  BM25
        doc_scores_bm25 = bm25.get_scores(tokenized_query)
        bm25TopN = bm25.get_top_n(tokenized_query, corpus, n=1000)

        for j in bm25TopN:
Example #11
print("字典造好")
# tokenized_corpus = [doc.split(" ") for doc in respList]
# bm25 = BM25Okapi(tokenized_corpus)
print("训练好")
mesIdList = test["message_id"].unique()
midL = []; ridL = []

for mid in mesIdList:
    midL+=[mid for i in range(10)]
    query = getText(mid)
    tokenized_query = query.split(" ")


    corpus = tsv[tsv["message_id"]==mid]["response"].drop_duplicates(keep="first").tolist()
    tokenized_corpus = [str(doc).split(" ") for doc in corpus]
    bm25 = BM25Plus(tokenized_corpus)
    res = bm25.get_top_n(tokenized_query, corpus, n=10)
    res1 = list(map(lambda x:getId(x),res))
    if len(res1)<10:
        res2 = random.sample(list(respIdList),10-len(res1))
    #
    #     remainid = test["response_id"].drop_duplicates(keep="first").values
    #     # print(remainid)
    #     remaincorpus = list(map(lambda x: getText(x), remainid))
    #     tokenized_corpus = [doc.split(" ") for doc in remaincorpus]
    #     bm25 = BM25L(tokenized_corpus)
    #     res = bm25.get_top_n(tokenized_query, remaincorpus, n=10-len(corpusid))
    #
    #     res2 = list(map(lambda x: text2id[x], res))
        res1 = list(res1) + list(res2)
    print(len(res1))
Example #12
def corpus_index():
    cache_dict = open_cache()
    corpus = list(cache_dict.values())
    tokenized_corpus = [remove_stopwords(str(doc).split(" ")) for doc in corpus]
    bm25plus = BM25Plus(tokenized_corpus)
    return corpus, bm25plus, cache_dict
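A hypothetical use of corpus_index(), reusing the remove_stopwords helper from the snippet above for the query (the query string is made up):

corpus, bm25plus, cache_dict = corpus_index()
tokenized_query = remove_stopwords("example search terms".split(" "))
top_matches = bm25plus.get_top_n(tokenized_query, corpus, n=5)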