Code example #1
    def relevance_finder(query, df, flag):
        """Score each row of df against the query with BM25 and return the rows
        sorted by relevance. flag == 'q' scores the questions, anything else the reviews."""
        df_new = df.reset_index()
        column = 'tokenised question' if flag == 'q' else 'tokenised reviews'

        bm = BM25(df_new[column])
        tf = bm.get_tf_for_query(query)
        idf = bm.get_idf_for_query(query)
        df_new['relevance_score'] = bm.get_bm25_scores(query, tf, idf)
        df_new.sort_values(['relevance_score'], ascending=False, inplace=True)
        return df_new
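
A minimal usage sketch for relevance_finder above. The data, the token-list query, and the assumption that this project's BM25 accepts a pandas Series of token lists are all illustrative, not taken from the original project:

    import pandas as pd

    df = pd.DataFrame({
        'tokenised question': [['how', 'to', 'reset', 'password'], ['refund', 'policy']],
        'tokenised reviews':  [['great', 'support'], ['slow', 'refund', 'process']],
    })
    ranked = relevance_finder(['refund'], df, flag='q')
    print(ranked[['relevance_score']])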
Code example #2
File: chatbot.py Project: mkl04/P07
def do_bm25(query, df, best_n=3):
    """Return the best_n lines from df.text that best match the query under BM25."""
    query = normalize_terms(word_tokenize(query))
    text_lines_total = df.text.values

    text_lines_tokens = []
    for line in text_lines_total:
        line = re.sub(r'\W', ' ', line)   # replace every non-word character with a space
        line = re.sub(r'\s+', ' ', line)  # collapse runs of whitespace into a single space
        text_lines_tokens.append(word_tokenize(line))

    news = [normalize_terms(sentence) for sentence in text_lines_tokens]
    bm25 = BM25(news)
    best_indexes = bm25.ranked(query, best_n)

    return [text_lines_total[ind] for ind in best_indexes]
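
A hypothetical call to do_bm25 above, assuming a DataFrame with a text column and that normalize_terms and word_tokenize are importable from the surrounding project as in the snippet:

    import pandas as pd

    df = pd.DataFrame({'text': ['The central bank raised rates.',
                                'Inflation slowed last month.',
                                'A new phone was released today.']})
    print(do_bm25('interest rates', df, best_n=1))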
Code example #3
    def relevant_docs_from_posting(self, query):
        """
        This function loads the posting list and count the amount of relevant documents per term.
        :param query: query
        :return: dictionary of relevant documents.
        """

        # -------------------- methods ------------------------------------
        if self.wordnet_toggle:
            query = wn.expand_query(query, self._parser.stop_words)
        if self.spelling_corr_toggle:
            query = sp.correct_spelling(query)
        # -----------------------------------------------------------------
        N = self._parser.num_of_docs
        total_len = self._parser.total_doc_length
        inverted_idx = self.indexer.inverted_idx
        # cosine_sim = CosineSimCalculator(inverted_idx, query, N)
        # cosine_sim.create_wiq_dict()
        # relevant_docs = cosine_sim.calc_similarity()

        bm25 = BM25(inverted_idx, query, N, total_len)
        bm25.create_wiq_dict()
        relevant_docs = bm25.calc_bm25()

        for doc in relevant_docs:
            if doc in self._parser.retweet_dict:
                relevant_docs[doc] += self._parser.retweet_dict[doc]

        return relevant_docs
Code example #4
def idf():
    word = request.args.get('word')
    bm25 = BM25()
    result = dict()
    result['word'] = word
    result['idf'] = bm25.idf(word)
    return jsonify(result)
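
For reference, a bm25.idf(word) call like the one above usually returns the standard BM25 IDF. A sketch of that formula, assuming the model tracks the corpus size N and the document frequency n of the word (names here are illustrative, not this project's API):

    import math

    def bm25_idf(N, n):
        # N: number of documents in the corpus, n: documents containing the term.
        # The 0.5 terms smooth rare words; some implementations add 1 inside the
        # log to keep the value non-negative.
        return math.log((N - n + 0.5) / (n + 0.5))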
Code example #5
    def search(self, query, user_resource=None):
        self.index_search_result = dict()
        self.content_search_result = dict()
        tokenizer = Tokenizer()
        query_tokens = tokenizer.processItem(query)
        self.index_search_result, self.content_search_result = self.db.get_content_by_index(
            query_tokens)

        combined_result = combine_index_content_result(
            self.index_search_result,
            self.content_search_result
        )
        bm25 = BM25(query_tokens)

        # BM25 relevance score for the combined result
        score = bm25.get_relevance_score(combined_result)

        # PageRank score
        if user_resource is None:
            pr_score = self.pr.get_score_for_search(self.content_search_result)
        else:
            pr_score = PageRank.filter_score_from_pr_score(
                self.content_search_result, user_resource["pr_score"])

        combined_score = combine_score(score, pr_score)

        def get_score(content):
            print(content['url'], ": BM25 : ", score[content['url']], "PR: ",
                  pr_score[content['url']])
            return combined_score[content['url']]

        return sorted(self.content_search_result, key=get_score, reverse=True)
Code example #6
File: get_top100.py Project: AntoineLjn/marco-polo
def train_bm25(queries: List[Query], collection: Corpus) -> List[Query]:
    """Attach the top TOP_N BM25 answers to every query and return the updated list."""
    bm25 = BM25(collection)
    queries_list: List[Query] = []
    for query in queries:
        top_answers = bm25.top_n(query, n=TOP_N)
        query.update_answers(top_answers, n=TOP_N)
        queries_list.append(query)
    return queries_list
Code example #7
    def __init__(self, docs):
        self.docs = docs
        self.bm25 = BM25(docs)
        self.D = len(docs)
        self.d = 0.85
        self.weight = []
        self.weight_sum = []
        self.vertex = []
        self.max_iter = 200
        self.min_diff = 0.001
        self.top = []
Code example #8
File: summary.py Project: hello-lan/mlnlp_notes
    def __init__(self, docs):
        self.d = 0.85
        self.max_iter = 200
        self.min_diff = 0.001
        self.docs = docs
        self.bm25 = BM25(docs)
        self.D = len(docs)
        self.weight = []
        self.weight_sum = []
        self.vertex = defaultdict(lambda: 1)
        self.top = {}
        self.solve()
Code example #9
def bm25(data_dir, dataset_type, tokenizer=default_tokenizer):
    """Fit BM25 on the training answers, caching the fitted model next to train.csv."""
    train_path = os.path.join(data_dir, 'train.csv')
    if os.path.isfile(train_path + ".bm25"):
        with open(train_path + ".bm25", 'rb') as f:
            return pickle.load(f)
    train_dataset_reader, _ = getReadersByDatasetType(dataset_type)
    answers = [a for (q, a) in train_dataset_reader.conversations(train_path)]
    bm25 = BM25(tokenizer)
    bm25.fit(answers)
    with open(train_path + ".bm25", 'wb') as f:
        pickle.dump(bm25, f)
    return bm25
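
A hypothetical call to the cached loader above (the directory and dataset type are illustrative only): the first call fits the model and pickles it next to train.csv, later calls load the .bm25 file instead of refitting:

    retriever = bm25('data/', 'my_dataset_type')        # fits and caches on the first call
    retriever_again = bm25('data/', 'my_dataset_type')  # loads data/train.csv.bm25 from disk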
Code example #10
    def search(self, query, user_resource=None):
        self.index_search_result = dict()
        self.content_search_result = dict()
        query_tokens = query.true_tokens
        print(query.token_weights)
        self.index_search_result, self.content_search_result = self.db.get_content_by_index(
            query_tokens, query.token_weights)
        print(len(self.index_search_result), len(self.content_search_result))

        combined_result = combine_index_content_result(
            self.index_search_result,
            self.content_search_result
        )

        bm25 = BM25(query_tokens)

        # BM25 relevance score for the combined result, combined with a PageRank score
        if user_resource is None:
            score = bm25.get_relevance_score(combined_result)
            pr_score = self.pr.get_score_for_search(self.content_search_result)
            combined_score = combine_score(score, pr_score)
        else:
            user = current_user
            score = bm25.get_relevance_score(combined_result, user.tags)
            pr_score = PageRank.filter_score_from_pr_score(
                self.content_search_result, user_resource["pr_score"])
            combined_score = combine_score(score, pr_score, pr=user.pr, bm25=user.bm25)

        def get_score(content):
            print(content['url'], ": BM25 : ", score[content['url']], "PR: ",
                  pr_score[content['url']])
            return combined_score[content['url']]

        return sorted([content[0] for content in self.content_search_result],
                      key=get_score, reverse=True)
Code example #11
File: similary.py Project: hello-lan/mlnlp_notes
 def __init__(self, text):
     self.sentences = self.split_text(text)
     self.docs = self.sentences2docs(self.sentences)
     self.bm25 = BM25(self.docs)
Code example #12
GAMMA = 0.15

list_queries = utils.load_queries(utils.PARSED_QUERIES)
inverted_index = utils.load_inverted_index(
    os.path.join(utils.INDEX_DIR, "stem_False_stop_False_inverted_index.txt"))
# args = {debug: False, isstemmed: False, isstopped: False}

parser = argparse.ArgumentParser(description="Parser for JM Smoothing")

parser.add_argument("-d", "--debug", action="store_true")
parser.add_argument("-stem", "--isstemmed", action="store_true")
parser.add_argument("-stop", "--isstopped", action="store_true")

args = parser.parse_args()

baseline_run = BM25(args, inverted_index, utils.load_corpus_stats(),
                    list_queries)
baseline_run.compute_scores()
# bm25_scores maps query_id -> [[doc_name, score], ...]
results = baseline_run.bm25_scores
query_mapping = utils.load_query_map()


def get_content(doc_names):
    doc_contents = []
    for doc in doc_names:
        with open(os.path.join(utils.CORPUS_DIR, "{}.txt".format(doc)),
                  "r") as f:
            doc_contents.append(f.read())
    return doc_contents

Code example #13
eval_path = './eval/eval.txt'

# number of related terms to return
k1 = 5
k2 = 15

if __name__ == '__main__':
    start = time.time()
    print('Starting')
    # build the queries
    print('Building queries (with query expansion) from ' + query_path)
    query_list = build_query(query_path, w2v_path, vocab_path, k1)
    print('Queries built')
    # BM25 model
    print('Building the BM25 model')
    bm = BM25()
    print('BM25 model built')
    # load the inverted index
    print('Loading the inverted index from ' + invert_table_path)
    bm.build(invert_table_path)
    print('Inverted index loaded')
    # run the queries
    print('Running the queries')
    res = start_query(bm, query_list, k2)
    print('Saving query results to ' + res_doc_path)
    get_doc_cont(res, res_doc_path, doc_path)
    # compute P@10
    print('Saving evaluation files to ' + res_path)
    eval_res(res, res_path)
    # evaluate
    print('Starting evaluation')
Code example #14
          search_key=""
          for i in q.split():
             if i not in Retrieved_Stopword: 
                search_key+=i+" "
                

          search_key=search_key.rstrip()                                          #Removing last space
          search_key_length=len(search_key.split())
          if search_key_length>1 and search_key not in Final_Document: 
           Final_Document.append(search_key)



temp=""
adding_final_document=""
bm25 = BM25(Final_Document[:1000])

split_final_document=""
position=0
for  index, score in bm25.ranked(query, 25):
      #print('{} ->> {} ->>  {}'.format(position, ''.join(Final_Document[index]), score ))  ##Printing Ranked Document
      temp+=Final_Document[index]
      temp=''.join(temp+" ")
      adding_final_document+=temp        # Adding sentence
      position=position+1





Code example #15
pprint.pprint(dictOf)
print('')
print('query :' + q)
print('+----------------------------------+')

print('')
print('TF-IDF weighting')
tfidf = TfIdf().transform(q=q, document=document)
print("Average weight: " + str(tfidf.weight_average()))
pprint.pprint(tfidf.get_weight())
print("+---------------------------------+")

print('')
print('W-IDF weighting')
widf = WIdf().transform(q=q, document=document)
print("Average weight: " + str(widf.weight_average()))
pprint.pprint(widf.get_weight())
print("+---------------------------------+")

print('')
print('TFRF weighting')
tfrf = TFRF().transform(q=q, document=document)
print("Average weight: " + str(tfrf.weight_average()))
pprint.pprint(tfrf.get_weight())
print("+---------------------------------+")

print('')
print('BM25 weighting')
bm25 = BM25().transform(q=q, document=document)
pprint.pprint(bm25.weigth())
print("+---------------------------------+")
Code example #16
File: sample.py Project: ranarag/python-bm25
def normalize_terms(terms):
    # - remove stopwords
    # - remove numerals
    # - stemming
    return [remove_diacritics(term).lower() for term in terms]


def remove_diacritics(text, encoding='utf8'):
    """Remove diacritics from bytestring or unicode, returning an unicode string"""
    nfkd_form = unicodedata.normalize('NFKD', to_unicode(text, encoding))
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode(encoding)


def to_unicode(text, encoding='utf8'):
    """Convert a string (bytestring in `encoding` or unicode), to unicode."""
    if isinstance(text, six.text_type):
        return text
    return text.decode(encoding)


# nltk.download('mac_morpho')
# `news` is a list containing lists of tokens
news = [normalize_terms(sentence) for sentence in mac_morpho.sents()]
print(repr(news[0]))

# Using only the first 1000 sentences as an example; processing all 51397 takes a while
bm25 = BM25(news[:1000])
query = normalize_terms(nltk.word_tokenize('inflacao'))
for position, index in enumerate(bm25.ranked(query, 5)):
    print('{} - {}'.format(position, ' '.join(news[index])))
Code example #17
    def __init__(self, corpus, b=0.75, k1=1.2):
        self.corpus = corpus
        logger.debug(f"Instantiating BM25Search with k1={k1} b={b}")
        self.vectorizer = BM25(b=b, k1=k1)
        self.vectorizer.fit(corpus)
Code example #18
def search(request):
    key = request.GET.get('key')
    alogorithm_type = request.GET.get('alogorithm_type')
    print('we get alogorithm_type', alogorithm_type)
    if alogorithm_type != '2':
        alogorithm_type = '1'
    key_name = '{}_{}'.format(key, alogorithm_type)
    print('alogorithm_type', alogorithm_type)
    alogorithm_type = int(alogorithm_type)
    if alogorithm_type == 2:
        cache.delete('{}_{}'.format(key, 1))
        print('this is bm25')
    else:
        cache.delete('{}_{}'.format(key, 2))
        print('this is tfidf')
    print(key_name)
    papers = cache.get(key_name)
    print(papers, 'cache')
    if papers is None:
        if alogorithm_type == 2:
            print('invoke BM25 algorithm')
            start = time.time()
            papers = BM25(key)
            end = time.time()
            print('BM25 spend', end - start)
        else:
            print('invoke TFIDF algorithm')
            start = time.time()
            papers = TFIDF(key)
            end = time.time()
            print('TFIDF spend', end - start)
    # 127.0.0.1: 8000 / api / search?key = design & alogorithm_type_type = 1 & order = 1 & descend = 1&year=2015-2020&author=Zelalem Mekuria&venue=ccf
    # alogorithm_type: 1 = tfidf, 2 = bm25; order: 1 = year, 2 = citation;
    # descend: 1 = descending, 2 = ascending; the remaining parameters filter the results
    # sort
    order_by_date = request.GET.get('order')
    descend = request.GET.get('descend')
    if descend == '1':
        descend = True
    else:
        descend = False
    if order_by_date == '1':
        papers = sorted(papers, key=lambda x: x.year, reverse=descend)

    elif order_by_date == '2':
        papers = sorted(papers, key=lambda x: x.n_citation, reverse=descend)
    # filter
    year = request.GET.get('year')
    print('this is year', year)
    if year is not None and year != '':
        begin, end = year.split('-')
        # if begin==end or begin=='0000':

        temp = []
        for paper in papers:
            # localhost:8000/api/test?key=design&alogorithm=1&order=1&descend=1&year=2013-2014
            # localhost:8000/api/test?key=design&alogorithm=1&order=1&descend=1
            try:
                begin_date = datetime(year=int(begin),
                                      month=1,
                                      day=1,
                                      tzinfo=pytz.utc)
                end_date = datetime(year=int(end),
                                    month=1,
                                    day=1,
                                    tzinfo=pytz.utc)
                if paper.year <= end_date and paper.year >= begin_date:
                    temp.append(paper)
            except Exception:
                temp = []
                break
        papers = temp
    author = request.GET.get('author')
    if author is not None and author != '':
        temp = []
        for paper in papers:
            exist = paper.authors.filter(name=author).exists()
            print('author', author)
            print('exist', exist)
            if exist:
                temp.append(paper)
        papers = temp
    venue = request.GET.get('venue')
    if venue is not None and venue != '':
        temp = []
        for paper in papers:
            if paper.venue == venue:
                temp.append(paper)
        papers = temp
    #
    # History.objects.create()
    # print('key', key)
    # print('alogorithm_type', alogorithm_type)
    serializer = PaperSerializer(papers, many=True)
    return Response(serializer.data)
Code example #19
def search():
    word = request.args.get('q')
    bm25 = BM25()
    results = bm25.search(word)
    return jsonify(results)
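
A hypothetical request against the route above, assuming it is registered at /search on a local Flask dev server (host, port, and path are illustrative):

    import requests

    resp = requests.get('http://127.0.0.1:5000/search', params={'q': 'bm25'})
    print(resp.json())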
Code example #20
File: rule_based_QA.py Project: JianxingMa/LMMQA
    def __init__(self):
        # switches for train / dev / test mode
        train = 0
        dev = 0
        test = 1

        # switch for loading data from a pkl file vs. reprocessing it
        load_processed_doc = 1
        load_doc_from_pkl = 1

        # switch for testing BM25 accuracy
        test_BM25 = 0

        self.data = Data()
        self.config = Config()
        self.fileLoader = FileLoader(self.config, self.data)
        self.bdp = BasicDataProcessor(self.config, self.data)
        self.bm25 = BM25(self.config, self.data)

        # NER tags that are not used; they will be merged with the 'O' tag
        self.other = [
            'SET', "MISC", 'EMAIL', 'URL', 'TITLE', 'IDEOLOGY',
            'CRIMINAL_CHARGE'
        ]

        self.fileLoader.load_doc()

        # load doc data
        if load_processed_doc:
            if load_doc_from_pkl:
                with open(self.config.doc_processed_path, 'rb') as f:
                    self.data.doc_processed = pickle.load(f)
            else:
                self.data.doc_processed = self.bdp.process_docs(
                    self.data.doc_texts)
                with open(self.config.doc_processed_path, 'wb') as f:
                    pickle.dump(self.data.doc_processed, f)

        # load train data
        if train:
            self.fileLoader.load_training_data()
            if test_BM25:
                self.bm25.test_training_BM25_accuracy(10)
                return

            # predict answer
            # self.predict_with_bm25_pars_sents(0)
            self.predict_with_bm25_sents(0)

        # load dev data
        if dev:
            self.fileLoader.load_dev_data()
            if test_BM25:
                self.bm25.test_BM25_par_on_dev()
                return

            # predict answer
            self.predict_with_bm25_pars_sents(1)
            # self.predict_with_bm25_sents(1)

        # load test data
        if test:
            self.fileLoader.load_test_data()

            # predict answer
            # self.predict_with_bm25_pars_sents(2)
            self.predict_with_bm25_sents(2)
Code example #21
File: main.py Project: Ziqih/cs510-lab
from os import listdir
from helpers import *
from bm25 import BM25
from bm25plus import BM25plus
from rank_bm25 import BM25Okapi
import matplotlib.pyplot as plt


path = 'input/'
dirs = listdir(path)
print(f"{len(dirs)} documents in total")
documents = []
for fname in dirs:
    with open(path+fname, "r") as f:
        documents.append(f.read())
bm25 = BM25(documents, k1=1.2, b=0.75)
scores = bm25.get_scores("best apps daily activity exercise diabetes")
scored = [(score, i) for i, score in enumerate(scores)]
scored.sort(key=lambda x: x[0], reverse=True)
best_doc_index = scored[0][1]
with open('output.txt', 'w') as f:
    f.write(documents[best_doc_index])

with open("topic.txt", "r") as f:
    query_list = []
    for line in f:
        query_list.append(line.strip())
print(f"{len(query_list)} queries in total")