def __bm25_ranking(self, lista):
    if self.ranking in ['bm25', 'BM25']:
        import nltk
        from nltk.corpus import stopwords
        wnl = nltk.WordNetLemmatizer()
        corpus_text = ''
        corpus_venue = ''
        # build the text corpus from the table-specific tsvector column
        if self.venue is not None and self.tab in ['inproceedings']:
            corpus_text = [x['ts_i'] for x in lista]
            corpus_text = [x.replace("'", ' ').replace(":", ' ') for x in corpus_text]
            corpus_text = [nltk.word_tokenize(x) for x in corpus_text]
        else:
            corpus_text = [x['ts_tab'] for x in lista]
            corpus_text = [x.replace("'", ' ').replace(":", ' ') for x in corpus_text]
            corpus_text = [nltk.word_tokenize(x) for x in corpus_text]
        # build a separate venue corpus when a venue filter is given
        if self.venue is not None and self.tab in ['inproceedings']:
            corpus_venue = [x['ts_p'] for x in lista]
            corpus_venue = [x.replace("'", ' ').replace(":", ' ') for x in corpus_venue]
            corpus_venue = [nltk.word_tokenize(x) for x in corpus_venue]
        elif self.venue is not None:
            corpus_venue = [x['ts_venue'] for x in lista]
            corpus_venue = [x.replace("'", ' ').replace(":", ' ') for x in corpus_venue]
            corpus_venue = [nltk.word_tokenize(x) for x in corpus_venue]
        bm25_text = BM25Okapi(corpus_text)
        bm25_venue = ''
        if self.venue is not None:
            bm25_venue = BM25Okapi(corpus_venue)
        # score the search phrase (lemmatized, stopwords removed)
        query = nltk.word_tokenize(self.phrase)
        tokens = [wnl.lemmatize(x) for x in query if x not in stopwords.words('english')]
        text_ranks = list(bm25_text.get_scores(tokens))
        venue_ranks = ''
        if bm25_venue != '':
            # add the venue score to the text score
            venue = nltk.word_tokenize(self.venue)
            tokens_venue = [wnl.lemmatize(x) for x in venue if x not in stopwords.words('english')]
            venue_ranks = list(bm25_venue.get_scores(tokens_venue))
            text_ranks = [text_ranks[cont] + venue_ranks[cont] for cont in range(len(text_ranks))]
        new_lista = []
        for c, x in enumerate(lista):
            x['ranking'] = text_ranks[c]
            new_lista.append(x)
        return new_lista
    else:
        return lista
def set_archives_dataset(self, archives_dataset):
    self.title_corpus = []
    self.abstract_corpus = []
    self.raw_publications = []
    self.profie_id_to_indices = {}
    start_index = 0
    counter = 0
    for profile_id, publications in archives_dataset.items():
        for publication in publications:
            if self.use_abstract and self._is_valid_field(publication['content'], 'abstract'):
                tokenized_abstract = publication['content']['abstract'].lower().split(' ')
                self.abstract_corpus.append(tokenized_abstract)
                self.raw_publications.append(publication)
                counter += 1
            elif self.use_title and self._is_valid_field(publication['content'], 'title'):
                tokenized_title = publication['content']['title'].lower().split(' ')
                self.title_corpus.append(tokenized_title)
                self.raw_publications.append(publication)
                counter += 1
        self.profie_id_to_indices[profile_id] = (start_index, counter)
        start_index = counter
    if self.use_title:
        self.bm25_titles = BM25Okapi(self.title_corpus)
    if self.use_abstract:
        self.bm25_abstracts = BM25Okapi(self.abstract_corpus)
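# Hedged companion sketch (not part of the original snippet): one plausible way the
# (start_index, end_index) ranges stored above could be used to score a query abstract
# against each profile. The method name `score_profiles` is an assumption, as is the
# restriction to the abstract index (self.bm25_abstracts); get_scores is the rank_bm25 API.
def score_profiles(self, query_abstract):
    tokenized_query = query_abstract.lower().split(' ')
    publication_scores = self.bm25_abstracts.get_scores(tokenized_query)
    profile_scores = {}
    for profile_id, (start, end) in self.profie_id_to_indices.items():
        # a profile with no indexed publications gets a score of 0
        profile_scores[profile_id] = max(publication_scores[start:end], default=0.0)
    return profile_scores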
def classify_duplicate(filename):
    filename_data = pd.read_csv(filename, header=0)
    data = []
    for i in range(0, len(filename_data)):
        data.append(filename_data.iloc[i, 0] + ' ' + filename_data.iloc[i, 1])
    # initialize the TfidfVectorizer on the concatenated question pairs
    tfidf_vect = TfidfVectorizer(stop_words="english", smooth_idf=True).fit(data)
    # generate the tf-idf matrix for both questions
    dtm_q1 = tfidf_vect.transform(filename_data["q1"])
    dtm_q2 = tfidf_vect.transform(filename_data["q2"])
    # build the BM25 index over q1 once, outside the loop
    tokenized_corpus = [doc.split(" ") for doc in filename_data["q1"]]
    bm25 = BM25Okapi(tokenized_corpus)
    data_1 = []
    for i in range(0, len(filename_data)):
        cosine_sim = cosine_similarity(dtm_q1[i], dtm_q2[i])[0][0]
        doc_scores = bm25.get_scores(filename_data.iloc[i, 1].split(" "))[i]
        data_1.append([cosine_sim, doc_scores])
    metrics = ["roc_auc"]
    binary_y = np.where(filename_data["is_duplicate"] == 0, 1, 0)
    # initialize a linear SVM model and cross-validate on the two similarity features
    clf_svm = svm.LinearSVC()
    cv_svm = cross_validate(clf_svm, data_1, binary_y, scoring=metrics, cv=5)
    auc = cv_svm['test_roc_auc'].mean()
    return auc
def do_recommendation(texts):
    hotelReview_ds = pd.read_csv('hotelReview_ds.csv')
    rating_matrix = pd.read_csv('rating_matrix.csv', index_col=0)
    business_info_ds = pd.read_csv('business_info_ds.csv')
    tokenized_review = pickle.load(open("tokenized_review.bin", "rb"))
    review_dataset = pickle.load(open("review_data.bin", "rb"))
    bm25 = BM25Okapi(tokenized_review)
    tokenized_query = texts.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    bm25.get_top_n(tokenized_query, review_dataset, n=9)  # top reviews (result not used below)
    # find reviewers similar to the query via the 9 highest-scoring reviews
    query_bm25 = hotelReview_ds.loc[doc_scores.argsort()[-9:][::-1]]['reviewer_id']
    # load rating vectors for the similar users
    similar_users = rating_matrix[rating_matrix.index.isin(query_bm25)]
    print(rating_matrix.index)
    # average the ratings across the similar users
    similar_users = similar_users.mean(axis=0)
    # convert to a dataframe so it is easy to sort and filter
    similar_users_df = pd.DataFrame(similar_users, columns=['mean'])
    # order the dataframe
    similar_users_df_ordered = similar_users_df.sort_values(by=['mean'], ascending=False)
    # grab the top n hotels
    top_n_hotels = similar_users_df_ordered.head(9)
    top_n_hotels_indices = top_n_hotels.index.tolist()
    # look up these hotels in the business dataframe to find their information
    hotel_info = business_info_ds[business_info_ds['id'].isin(top_n_hotels_indices)]
    return hotel_info
def process():
    query = request.form.get("data")
    selected_category = request.form.get("category").lower()
    selected_sort = request.form.get("sort")
    search_query = f.spellCheck(query)
    df = pd.read_csv("./data/news_data.csv")
    doc_set = set()
    if selected_category == 'all':
        file = open("./data/index_dict.pkl", "rb")
        index_dict = pickle.load(file)
        doc_set = f.linearMergePosition(search_query, index_dict)
    else:
        file = open("./data/categorical_index_dict.pkl", "rb")
        category_index_dict = pickle.load(file)
        doc_set = f.linearMergePosition(search_query, category_index_dict[selected_category])
    if doc_set is not None:
        result_df = df.loc[df['Doc_ID'].isin([doc_id for doc_id in doc_set])].copy()
        df_ = pd.read_csv("./data/full_data_and_cleaned_data.csv")
        tokenized_corpus = [df_['cleaned_data'][doc_id - 1].split(" ") for doc_id in doc_set]
        bm25 = BM25Okapi(tokenized_corpus)
        tokenized_query = f.preprocess_words(search_query).split(" ")
        doc_scores = bm25.get_scores(tokenized_query)
        result_df["Scores"] = doc_scores
        # result_df.dropna(inplace=True)
        if selected_sort == "Relevance":
            result_df.sort_values('Scores', inplace=True, ascending=False)
        elif selected_sort == "Newest":
            result_df['Date'] = pd.to_datetime(result_df.date, yearfirst=True)
            result_df.sort_values('Date', inplace=True, ascending=False)
        else:
            result_df['Date'] = pd.to_datetime(result_df.date, yearfirst=True)
            result_df.sort_values('Date', inplace=True, ascending=True)
        all_links = []
        all_links.append([search_query, query, '0', '0'])
        for headline, link, description, date in zip(result_df['headline'].values,
                                                     result_df['link'].values,
                                                     result_df['short_description'].values,
                                                     result_df['date'].values):
            all_links.append([headline, link, str(description), date])
        return jsonify(all_links)
    else:
        all_links = []
        all_links.append([search_query, query, '0', '0'])
        return jsonify(all_links)
def get_10_closest_from_corpus(infile, queries):
    # load docs
    collected_articles = []
    with open(infile) as fin:
        for line in tqdm(fin):
            json_object = json.loads(line.strip())
            doc = json_object["target_text"]
            doc_id = json_object["target_title"]
            collected_articles.append(Article(doc_id, doc_id, doc, word_tokenize(doc, "german")))
    # compute bm25
    corpus = [art.word_list for art in collected_articles]
    bm25 = BM25Okapi(corpus)
    print("corpus indexed")
    closest = {query: [] for query in queries}
    for query in tqdm(queries):
        # get_scores expects an iterable of tokens, so each query must already be
        # tokenized (e.g. a tuple of tokens, which also works as a dict key)
        doc_scores = bm25.get_scores(query)
        tenth_best_score = sorted(doc_scores, reverse=True)[9]
        for idx, score in enumerate(doc_scores):
            if score >= tenth_best_score:
                closest[query].append((score, collected_articles[idx]))
        if len(closest[query]) < 10:
            print("Not enough closest queries")
            raise RuntimeError
    return closest
def _init_traindf_bm25_model(self):
    train_df = pd.read_csv(self.interface_config.train_single_turn_file_path, sep='\t')
    train_df = train_df.drop_duplicates(['topic_id', 'question_id']).reset_index(drop=True).fillna('no_q')
    added_tokens = []
    added_cnames = ['initial_request', 'answer', 'topic_desc']
    for qid in self.question_bank['question_id'].values:
        words = []
        for cname in added_cnames:
            irs = train_df[train_df['question_id'] == qid][cname].unique()
            # irs = all_df[all_df['question_id'] == qid][cname].unique()
            for ir in irs:
                ws = stem_tokenize(ir)
                words.extend(ws)
        words = list(set(words))
        added_tokens.append(words)
    self.question_bank['tokens_from_train'] = added_tokens
    self.question_bank['all_tokens'] = self.question_bank['tokenized_question_list'] + self.question_bank['tokens_from_train']
    self.question_bank['all_token_str'] = self.question_bank['all_tokens'].map(lambda x: ' '.join(x))
    # add train_df initial_request tokens
    # bm25_corpus = question_bank['tokenized_question_list'].tolist()
    bm25_corpus = self.question_bank['all_tokens'].tolist()
    bm25 = BM25Okapi(bm25_corpus)
    return bm25, bm25_corpus
def classify_duplicate(filename):
    data = pd.read_csv(filename)
    # fit tf-idf on all questions, then transform each side of the pair
    tfidf_vect = TfidfVectorizer(stop_words="english")
    docs = data.q1.values.tolist() + data.q2.values.tolist()
    tfidf_vect.fit(docs)
    q1_dtm = tfidf_vect.transform(data['q1'])
    q2_dtm = tfidf_vect.transform(data['q2'])
    # build the BM25 index over q1 once, outside the loop
    bm25 = BM25Okapi([x.split(" ") for x in data["q1"].values.tolist()])
    scores = []
    for i in range(0, len(data)):
        sim_score = cosine_similarity(q1_dtm[i], q2_dtm[i])[0][0]
        tokenized_query = data.q2[i].split(" ")
        bm25_score = bm25.get_scores(tokenized_query)[i]
        scores.append([sim_score, bm25_score])
    clf_SVM = svm.LinearSVC()
    metrics = ["roc_auc"]
    cv_SVM = cross_validate(clf_SVM, scores, data['is_duplicate'],
                            scoring=metrics, cv=5,
                            return_train_score=True)
    return cv_SVM['test_roc_auc'].mean()
def main():
    # `df` is expected to be defined at module level before main() is called
    nlp = spacy.load("en_core_web_sm")
    # TF-IDF weighting; typical text mining method
    # term frequency-inverse document frequency
    text_list = df.text.str.lower().values
    tok_text = []  # for our tokenised corpus
    # Tokenising using SpaCy:
    for doc in tqdm(nlp.pipe(text_list, disable=["tagger", "parser", "ner"])):
        tok = [t.text for t in doc if t.is_alpha]
        tok_text.append(tok)
    bm25 = BM25Okapi(tok_text)
    query = "Flood Defence"
    tokenized_query = query.lower().split(" ")
    import time
    t0 = time.time()
    results = bm25.get_top_n(tokenized_query, df.text.values, n=3)
    t1 = time.time()
    print(f'Searched 50,000 records in {round(t1 - t0, 3)} seconds \n')
    for i in results:
        print(i)
def get_similarity(query, documents):
    # the first entry of the tokenized corpus is the query; the rest are the documents
    docs = query + documents
    docs = [word_token(d, lemma=True) for d in docs]
    tokenized_corpus = [doc.split(' ') for doc in docs]
    bm25 = BM25Okapi(tokenized_corpus[1:])
    bm25plus = BM25Plus(tokenized_corpus[1:])
    bm25L = BM25L(tokenized_corpus[1:])
    query = tokenized_corpus[0]
    bm25_scores = bm25.get_scores(query)
    bm25plus_scores = bm25plus.get_scores(query)
    bm25L_scores = bm25L.get_scores(query)
    # pair each score with its document index and sort by score, descending
    bm25_scores = [(i, v) for i, v in enumerate(bm25_scores)]
    bm25plus_scores = [(i, v) for i, v in enumerate(bm25plus_scores)]
    bm25L_scores = [(i, v) for i, v in enumerate(bm25L_scores)]
    bm25_scores.sort(key=lambda x: x[1], reverse=True)
    bm25plus_scores.sort(key=lambda x: x[1], reverse=True)
    bm25L_scores.sort(key=lambda x: x[1], reverse=True)
    return bm25_scores, bm25plus_scores, bm25L_scores
def arg_tfidf_ranking(query, documents):
    tokenized_query = [token.text for token in analyzer(query)]
    tokenized_docs = GetTokenizedDocuments(analyzer, documents)
    bm25 = BM25Okapi(tokenized_docs)
    doc_scores = bm25.get_scores(tokenized_query)
    for index, doc in enumerate(documents):
        doc['tfidf_score'] = doc_scores[index]
def createmodel(tok_text=[]):
    # ft_model = FastText.load(join(data_Path, '_fasttext.model'))
    weighted_doc_vects = []
    bm25 = BM25Okapi(tok_text)
    for i, dd in tqdm(enumerate(tok_text)):
        doc_vector = []
        for word in dd:
            # vector = ft_model[word]
            vector = getembeddings(word)
            # BM25 term weight for this word in document i
            weight = (bm25.idf[word] * ((bm25.k1 + 1.0) * bm25.doc_freqs[i][word])) / (
                bm25.k1 * (1.0 - bm25.b + bm25.b * (bm25.doc_len[i] / bm25.avgdl)) + bm25.doc_freqs[i][word])
            weighted_vector = vector * weight
            doc_vector.append(weighted_vector)
        doc_vector_mean = np.mean(doc_vector, axis=0)
        weighted_doc_vects.append(doc_vector_mean)
    # save the results to disk
    pickle.dump(weighted_doc_vects, open(join(data_Path, "weighted_doc_vects.p"), "wb"))
    # create a matrix from our document vectors
    data = np.vstack(weighted_doc_vects)
    # initialize a new index, using an HNSW index on cosine similarity
    index = nmslib.init(method='hnsw', space='cosinesimil')
    index.addDataPointBatch(data)
    index.createIndex({'post': 2}, print_progress=True)
    index.saveIndex(join(data_Path, '_NMSLIB.index'), save_data=True)
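# Hedged companion sketch (not in the original): querying the NMSLIB index saved by
# createmodel above. `getembeddings`, `join` and `data_Path` are taken from that snippet;
# building the query vector as a plain, unweighted mean of word embeddings is an
# assumption and differs from the BM25-weighted document vectors.
def querymodel(query_tokens, k=10):
    index = nmslib.init(method='hnsw', space='cosinesimil')
    index.loadIndex(join(data_Path, '_NMSLIB.index'), load_data=True)
    query_vector = np.mean([getembeddings(w) for w in query_tokens], axis=0)
    ids, distances = index.knnQuery(query_vector, k=k)
    return list(zip(ids, distances))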
def compute_bm25(tokenized_query, story_id, paragraphs, n):
    tokenized_paragraphs = [paragraph.split(" ") for paragraph in paragraphs]
    bm25 = BM25Okapi(tokenized_paragraphs)
    best_p = bm25.get_top_n(tokenized_query, paragraphs, n=n)
    best_i = [p.split(" ")[0] for p in best_p]
    return best_i
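# Hedged usage sketch for compute_bm25 above. The paragraph strings and query are made-up
# illustrations; the only convention taken from the function is that each paragraph starts
# with its own id token, which is what gets returned.
example_paragraphs = [
    "p1 the fox jumped over the fence",
    "p2 the dog slept in the garden",
]
top_ids = compute_bm25("the fox".split(" "), story_id=None, paragraphs=example_paragraphs, n=1)
# expected: ["p1"], since only the first paragraph mentions "fox"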
def __init__(self, config, tokenizer, stemmer, isStemming):
    self.config = config
    self.tokenizer = tokenizer
    self.stemmer = stemmer
    self.isStemming = isStemming
    self.ann = load_cache(config.ANN_FILE)
    self.cn2eng_dic = load_cache(config.CN2ENG)
    self.corpus_idf = load_cache(config.CORPUS_IDF)
    self.corpus = [c.strip() for c in self.corpus_idf]
    self.tokenized_corpus = self.tokenize_corpus(self.corpus, self.tokenizer)
    self.queries = self.load_txt(config.QUERIES)
    self.tokenized_queries = self.tokenize_corpus(self.queries, self.tokenizer)
    self.mis_queries = self.load_txt(config.MIS_QUERIES)
    self.tokenized_mis_queries = self.tokenize_corpus(self.mis_queries, self.tokenizer)
    self.tokenized_suggested_queries = copy.deepcopy(self.tokenized_mis_queries)
    self.WORD_FREQUENCY = self.generate_word_freq(config.SPELLING_SUGGESTOR.WORD_FREQUENCY)
    self.STOP_WORD = self.load_txt(config.STOP_WORD)
    self.db_file = config.DATABASE_FILE
    self.BASIC_TABLE = config.BASIC_TABLE
    self.db = self.load_db()
    self.ranker = BM25Okapi(self.tokenized_corpus, k1=1.2, b=0.75)
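# Hedged companion sketch (not in the original class): ranking the corpus with the BM25
# ranker built in __init__ above. The method name `rank` and the top-k convention are
# assumptions; get_scores is the rank_bm25 API.
def rank(self, tokenized_query, top_k=10):
    scores = self.ranker.get_scores(tokenized_query)
    order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    return [(self.corpus[i], float(scores[i])) for i in order]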
def __init__(self, content):
    self.content = content
    tokenized_content = [doc["abstract"].lower().split() for doc in content]
    self.bm25 = BM25Okapi(tokenized_content)
    print("Initiating the search engine")
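# Hedged companion sketch (not in the original snippet): a query method matching the
# constructor above. The name `search` and the result count are assumptions; it reuses
# the same lower/whitespace tokenization and get_top_n from rank_bm25.
def search(self, query, n=5):
    tokenized_query = query.lower().split()
    return self.bm25.get_top_n(tokenized_query, self.content, n=n)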
def _init_bm25_model(self):
    self.question_bank['tokenized_question_list'] = self.question_bank['question'].map(stem_tokenize)
    self.question_bank['tokenized_question_str'] = self.question_bank['tokenized_question_list'].map(lambda x: ' '.join(x))
    bm25_corpus = self.question_bank['tokenized_question_list'].tolist()
    bm25 = BM25Okapi(bm25_corpus)
    return bm25, bm25_corpus
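# Hedged companion sketch (not in the original): ranking question-bank rows with the
# model returned by _init_bm25_model. The helper name and arguments are assumptions;
# stem_tokenize is the tokenizer used above and the column names come from these snippets.
def rank_question_bank(bm25, question_bank, user_query, top_k=5):
    scores = bm25.get_scores(stem_tokenize(user_query))
    order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    return question_bank.iloc[order][['question_id', 'question']]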
def load(conf: Configuration, force: Optional[bool] = False, persist: Optional[bool] = True) -> "BM25OkapiRanker":
    model_path = conf.path_models + 'vsm_bm25okapi/' + conf.get_desc() + '/'
    if force or (not os.path.exists(model_path)) \
            or (not os.path.isfile(model_path + 'bm25okapi.pickle')) \
            or (not os.path.isfile(model_path + 'bm25okapi_index_mapping.pickle')):
        utils.mk_dir_if_not_exists(model_path)
        dataset = BM25OkapiRanker.extractor.load_dataset(conf=conf)
        bow_corpus = [(Ranker.get_text(conf, data), data['filename']) for (index, data) in dataset.iterrows()]
        bow_corpus, names = map(list, zip(*bow_corpus))
        index_mapping = BM25OkapiRanker.build_index_mapping(names)
        bm25 = BM25Okapi(bow_corpus)
        logging.info('BM25OkapiRanker : initialized')
        bm25_ranker = BM25OkapiRanker(model=bm25, index_mapping=index_mapping, conf=conf)
        bm25_ranker.persist(model_path)
        return bm25_ranker
    else:
        with open(model_path + 'bm25okapi.pickle', mode='rb') as file:
            bm25 = pickle.load(file)
            logging.info('BM25OkapiRanker : loading bm25okapi.pickle from {}'.format(model_path))
        with open(model_path + 'bm25okapi_index_mapping.pickle', mode='rb') as file:
            index_mapping = pickle.load(file)
            logging.info('BM25OkapiRanker : loading bm25_index_mapping.pickle from {}'.format(model_path))
        logging.info('BM25OkapiRanker : initialized')
        return BM25OkapiRanker(model=bm25, index_mapping=index_mapping, conf=conf)
def search(query):
    tokenized_corpus = [doc.split(" ") for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
    query = query.split(" ")
    # also score newline-stripped copies of the query terms
    subquery = []
    subquery += query
    for item in subquery:
        query.append(item.replace('\n', ''))
    scores = bm25.get_scores(query=query)
    for score in scores:
        if score > 0:
            print(score)
        if score > 5:
            return True
    # fall back to substring and fuzzy matching against the corpus
    for item in query:
        for word in corpus:
            if item in exceptions:
                return False
            if len(item) < 2:
                continue
            if item.lower().find(word) != -1:
                return True
            if similarity(item, word) > 0.85:
                return True
    return False
def init_okapi25():
    # nested helper: `self` and `data_col` are taken from the enclosing scope
    corpus = list(self.df_r[data_col].apply(lambda x: x.split()))
    indexed = BM25Okapi(corpus)
    pandarallel.initialize()
    bm25 = self.df_r[data_col].parallel_apply(lambda x: indexed.get_scores(x.split()))
    # stack the per-row score arrays into a 2-D matrix before sorting
    return np.argsort(np.vstack(bm25), axis=1)
def get_10_closest_from_corpus(infile, queries):
    # load docs
    collected_articles = []
    with open(infile) as fin:
        for line in tqdm(fin):
            json_object = json.loads(line.strip())
            # example line:
            # [{"id": [0, 0],
            #   "question": "Der halluzinogene Pilz <Query> \"\" wurde erstmals in einem tropischen Regenwald in der Region Uxpanapa in Veracruz im Südosten Mexikos entdeckt.",
            #   "document": "page does not exist", "document_id": "Psilocybe naematoliformis"}]
            doc = json_object[0]["document"]
            doc_id = json_object[0]["document_id"]
            collected_articles.append(Article(doc_id, doc_id, doc, word_tokenize(doc, "german")))
    # compute bm25
    corpus = [art.word_list for art in collected_articles]
    bm25 = BM25Okapi(corpus)
    print("corpus indexed")
    closest = {query: [] for query in queries}
    for query in tqdm(queries):
        # get_scores expects an iterable of tokens, so each query must already be tokenized
        doc_scores = bm25.get_scores(query)
        tenth_best_score = sorted(doc_scores, reverse=True)[9]
        for idx, score in enumerate(doc_scores):
            if score >= tenth_best_score:
                closest[query].append((score, collected_articles[idx]))
        if len(closest[query]) < 10:
            print("Not enough closest queries")
            raise RuntimeError
    return closest
def compute_BM25(corpus_df: pd.DataFrame, query_df: pd.DataFrame, data_col: str,
                 f_name: str, reindex: bool = False) -> np.array:
    pandarallel.initialize()
    base_path = "/lfs/1/sahaana/enrichment/data/Okapi25Queries"
    corpus = list(corpus_df[data_col].parallel_apply(lambda x: x.split()))
    indexed = BM25Okapi(corpus)
    bm25 = query_df[data_col].parallel_apply(lambda x: indexed.get_scores(x.split()))
    bm25 = np.vstack(bm25)
    np.save(f"{base_path}/{f_name}.npy", bm25)
    final = np.argsort(bm25, axis=1)
    if not reindex:
        np.save(f"{base_path}/{f_name}_argsort.npy", final)
        print(f"Saved {f_name}")
        return final
    else:
        # map argsort positions back to the corpus dataframe's own index values
        corpus_indexes = np.array(corpus_df.index)
        query_index = np.array(query_df.index)
        final = corpus_indexes[final]
        np.save(f"{base_path}/{f_name}_argsort.npy", final)
        np.save(f"{base_path}/{f_name}_QIDs.npy", query_index)
        print(f"Saved {f_name}")
        return query_index, bm25, final
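# Hedged usage sketch for compute_BM25 above. The DataFrames are made-up illustrations;
# the only requirements taken from the function are a shared text column (data_col) and,
# when reindex=True, meaningful DataFrame indexes. Note that the call also writes .npy
# files under the hard-coded base_path inside the function.
corpus_df = pd.DataFrame({"text": ["deep learning for search", "okapi bm25 ranking"]}, index=[101, 102])
query_df = pd.DataFrame({"text": ["bm25 ranking"]}, index=[7])
qids, raw_scores, neighbors = compute_BM25(corpus_df, query_df, data_col="text", f_name="demo", reindex=True)
# neighbors[0] lists corpus index labels (101, 102) from least to most relevant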
def build_bm25_model(self, documents: List[Preprocessed_Document]):
    tokenized_document_strings: List[List[str]] = []
    for document in documents:
        search_string = document.title_preprocessed + " " + document.body_preprocessed
        tokenized_document_strings.append(search_string.lower().split())
    self._BM25_model = BM25Okapi(tokenized_document_strings)
    self._indexed_documents = documents
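# Hedged companion sketch (not in the original class): querying the model built in
# build_bm25_model above. The method name `search` is an assumption; it reuses the same
# lower/split preprocessing and maps scores back to the indexed documents.
def search(self, query: str, top_k: int = 10) -> List[Preprocessed_Document]:
    scores = self._BM25_model.get_scores(query.lower().split())
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    return [self._indexed_documents[i] for i in ranked[:top_k]]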
def get_bot_response():
    what_the_user_said = request.args.get('msg')
    # train_path = 'message_without_id_train.csv'
    train_path_res = 'response_without_id_train.csv'
    #### The following (commented-out) code uses two BM25 models: match the query to a
    #### message first, then match that message to a response.
    # file = open(train_path)
    # read_csv = csv.reader(file)
    # corpus1 = []
    # for row in read_csv:
    #     corpus1.append(row[0])
    # tokenized_corpus = [doc.split(" ") for doc in corpus1]
    # bm25 = BM25Okapi(tokenized_corpus)
    # tokenized_query = what_the_user_said.split(" ")
    # doc_scores = bm25.get_scores(tokenized_query)
    # result = bm25.get_top_n(tokenized_query, corpus1, n=1)
    # query = result[0]
    # message = pd.read_csv('merged_train.csv')
    # message['re_id'] = message['response_id'] + ' ' + message['response']
    # message_id = message[['message_id']].drop_duplicates(subset="message_id")['message_id'].to_list()
    # corpus = []
    # for i in message_id:
    #     response = message[message['message_id'] == i]['re_id'].to_list()
    #     corpus.append(response)
    # q = []
    # with open('query_train.txt') as file:
    #     for i in file:
    #         q.append(i.strip())
    # corpus_index = q.index(query)
    # tokenized_corpus1 = [doc.split(" ") for doc in corpus[corpus_index]]
    # BM25 = BM25Okapi(tokenized_corpus1)
    # tokenized_query1 = what_the_user_said.split(" ")
    # doc_scores1 = BM25.get_scores(tokenized_query1)
    # result1 = BM25.get_top_n(tokenized_query1, corpus[corpus_index], n=10)
    # if len(result1) != 0:
    #     return result1[0].split(' ', 1)[1]
    file = open(train_path_res)
    read_csv = csv.reader(file)
    corpus = []
    for row in read_csv:
        corpus.append(row[0])
    tokenized_corpus = [doc.split(" ") for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = what_the_user_said.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    result = bm25.get_top_n(tokenized_query, corpus, n=1)
    if len(result) != 0:
        return result[0]
    else:
        return "I don't know"
def corpus_index():
    cache_dict = open_cache()
    corpus = list(cache_dict.values())
    tokenized_corpus = [remove_stopwords(str(doc).split(" ")) for doc in corpus]
    bm25plus = BM25Okapi(tokenized_corpus)
    return corpus, bm25plus, cache_dict
def run_bm25(self):
    tokenized_corpus = [passage.text.split(" ") for passage in self.passages]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_topic = self.topic.split(" ")
    self.bm25_scores = bm25.get_scores(tokenized_topic)
    assert len(self.passages) == len(self.bm25_scores)
def search(query):
    topicPlusContent, contents = getTopicContent(selectMainFromData())
    tokenized_corpus = getTokenizedCorpus(topicPlusContent)
    tokenized_query = getNouns(query)
    print(tokenized_query)
    bm25 = BM25Okapi(tokenized_corpus)
    return bm25.get_top_n(tokenized_query, contents, n=1)[0]
def BM25Search(corpus, searchquery, ntopsentences):
    # corpus = f.readlines()
    tokenized_corpus = [doc.split(" ") for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = searchquery.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    return (doc_scores, bm25.get_top_n(tokenized_query, corpus, n=ntopsentences))
def _build_index(self, corpus: Iterable[str]) -> Union[BM25Okapi, BM25Sklearn]:
    if self.use_sklearn:
        bm25 = BM25Sklearn()
        bm25.fit(corpus)
        return bm25
    else:
        tokenized_corpus = [self._tokenize(doc) for doc in corpus]
        return BM25Okapi(tokenized_corpus)
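# Hedged companion sketch (not in the original class): scoring a query against the index
# returned by _build_index. Only the rank_bm25 (BM25Okapi) path is shown because the
# query-side API of the custom BM25Sklearn class is not visible in this snippet; the
# method name `_score_okapi` is an assumption.
def _score_okapi(self, index: BM25Okapi, query: str):
    return index.get_scores(self._tokenize(query))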
def genRankerAndTable(self):
    tokenized = [
        self.changeToBasicForm(self.filterStopWords(word_tokenize(page)))
        for page in self.contents.values()
    ]
    self.bm25 = BM25Okapi(tokenized)
    for t in tokenized:
        self.similarities.append(self.bm25.get_scores(t))
def __init__(self, corpus, tokenizer_fn):
    """
    :param corpus: corpus of documents.
    :param tokenizer_fn: tokenizer function to extract tokens from the documents and the queries.
    """
    self.tokenizer_fn = tokenizer_fn
    tokenized_corpus = [tokenizer_fn(doc) for doc in corpus]
    self.bm25 = BM25Okapi(tokenized_corpus)
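# Hedged companion sketch (not in the original class): a scoring method that reuses the
# tokenizer_fn stored in __init__ above, so queries and documents are tokenized the same
# way. The method name `get_scores` as a wrapper is an assumption; the underlying call is
# the rank_bm25 API.
def get_scores(self, query):
    return self.bm25.get_scores(self.tokenizer_fn(query))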