Example no. 1
    def __bm25_ranking(self, lista):

        if self.ranking in ['bm25','BM25']:

            import nltk
            from nltk.corpus import stopwords
            wnl = nltk.WordNetLemmatizer()

            corpus_text = ''
            corpus_venue = ''

            if self.venue is not None and self.tab in ['inproceedings']:
                corpus_text = [x['ts_i'] for x in lista]
                corpus_text = [x.replace("'", ' ').replace(":", ' ') for x in corpus_text]
                corpus_text = [nltk.word_tokenize(x) for x in corpus_text]
            else:
                corpus_text = [x['ts_tab'] for x in lista]
                corpus_text = [x.replace("'", ' ').replace(":", ' ') for x in corpus_text]
                corpus_text = [nltk.word_tokenize(x) for x in corpus_text]

            if self.venue is not None and self.tab in ['inproceedings']:
                corpus_venue = [x['ts_p'] for x in lista]
                corpus_venue = [x.replace("'", ' ').replace(":", ' ') for x in corpus_venue]
                corpus_venue = [nltk.word_tokenize(x) for x in corpus_venue]

            elif self.venue is not None:
                corpus_venue = [x['ts_venue'] for x in lista]
                corpus_venue = [x.replace("'", ' ').replace(":", ' ') for x in corpus_venue]
                corpus_venue = [nltk.word_tokenize(x) for x in corpus_venue]


            bm25_text = BM25Okapi(corpus_text)

            bm25_venue = ''
            if self.venue is not None:
                bm25_venue = BM25Okapi(corpus_venue)

            query = nltk.word_tokenize(self.phrase)
            tokens = [wnl.lemmatize(x) for x in query if x not in stopwords.words('english') ]

            text_ranks = list(bm25_text.get_scores(tokens))

            venue_ranks = ''
            if bm25_venue != '':
                venue = nltk.word_tokenize(self.venue)
                tokens_venue = [wnl.lemmatize(x) for x in venue if x not in stopwords.words('english') ]
                venue_ranks = list(bm25_venue.get_scores(tokens_venue))

                text_ranks = [text_ranks[cont] + venue_ranks[cont] for cont in range(len(text_ranks))]

            new_lista = []
            for c, x in enumerate(lista):
                x['ranking'] = text_ranks[c]
                new_lista.append(x)

            return new_lista
        else:
            return lista
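Every snippet in this collection relies on the same small rank_bm25 surface: build a BM25Okapi index from a pre-tokenized corpus, then call get_scores or get_top_n with a tokenized query. A minimal, self-contained sketch of that shared pattern (the toy corpus and query below are illustrative, not taken from any example here):

from rank_bm25 import BM25Okapi

corpus = ["the quick brown fox", "jumps over the lazy dog", "bm25 ranks whole documents"]
tokenized_corpus = [doc.lower().split(" ") for doc in corpus]  # one token list per document
bm25 = BM25Okapi(tokenized_corpus)

tokenized_query = "lazy fox".split(" ")
scores = bm25.get_scores(tokenized_query)                  # one score per corpus document
top_docs = bm25.get_top_n(tokenized_query, corpus, n=2)    # original documents, best first
print(scores, top_docs)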
Example no. 2
    def set_archives_dataset(self, archives_dataset):
        self.title_corpus = []
        self.abstract_corpus = []
        self.raw_publications = []
        self.profie_id_to_indices = {}
        start_index = 0
        counter = 0
        for profile_id, publications in archives_dataset.items():
            for publication in publications:
                if self.use_abstract and self._is_valid_field(
                        publication['content'], 'abstract'):
                    tokenized_abstract = publication['content'][
                        'abstract'].lower().split(' ')
                    self.abstract_corpus.append(tokenized_abstract)
                    self.raw_publications.append(publication)
                    counter += 1
                elif self.use_title and self._is_valid_field(
                        publication['content'], 'title'):
                    tokenized_title = publication['content']['title'].lower(
                    ).split(' ')
                    self.title_corpus.append(tokenized_title)
                    self.raw_publications.append(publication)
                    counter += 1
            self.profie_id_to_indices[profile_id] = (start_index, counter)
            start_index = counter

        if self.use_title:
            self.bm25_titles = BM25Okapi(self.title_corpus)
        if self.use_abstract:
            self.bm25_abstracts = BM25Okapi(self.abstract_corpus)
Example no. 3
def classify_duplicate(filename):
    filename_data = pd.read_csv(filename, header=0)
    data = []

    for i in range(0, len(filename_data)):
        data.append(filename_data.iloc[i, 0] + ' ' + filename_data.iloc[i, 1])
    #print(data)
    # initialize the TfidfVectorizer
    tfidf_vect = TfidfVectorizer(stop_words="english",
                                 smooth_idf=True).fit(data)
    # generate tfidf matrix for both the questions
    dtm_q1 = tfidf_vect.transform(filename_data["q1"])
    dtm_q2 = tfidf_vect.transform(filename_data["q2"])
    data_1 = []
    # build the BM25 index over q1 once rather than on every loop iteration
    tokenized_corpus = [doc.split(" ") for doc in filename_data["q1"]]
    bm25 = BM25Okapi(tokenized_corpus)
    for i in range(len(filename_data)):
        cosine_sim = cosine_similarity(dtm_q1[i], dtm_q2[i])[0, 0]
        doc_scores = bm25.get_scores(filename_data.iloc[i, 1].split(" "))[i]
        data_1.append([cosine_sim, doc_scores])
    #print(data_1)
    metrics = ["roc_auc"]
    binary_y = np.where(filename_data["is_duplicate"] == 0, 1, 0)
    # initiate a linear SVM model
    clf_svm = svm.LinearSVC()
    cv_svm = cross_validate(clf_svm, data_1, binary_y, scoring=metrics, cv=5)

    auc = cv_svm['test_roc_auc'].mean()
    return auc
Example no. 4
def do_recommendation(texts):
    hotelReview_ds = pd.read_csv('hotelReview_ds.csv')
    rating_matrix = pd.read_csv('rating_matrix.csv', index_col=0)
    business_info_ds = pd.read_csv('business_info_ds.csv')
    tokenzied_review = pickle.load(open("tokenized_review.bin", "rb"))
    review_dataset = pickle.load(open("review_data.bin", "rb"))

    bm25 = BM25Okapi(tokenzied_review)

    tokenized_query = texts.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    bm25.get_top_n(tokenized_query, review_dataset, n=9)

    # find similar users related to that query request
    query_bm25 = hotelReview_ds.loc[doc_scores.argsort()[-9:]
                                    [::-1]]['reviewer_id']
    # load vectors for similar users
    similar_users = rating_matrix[rating_matrix.index.isin(query_bm25)]
    print(rating_matrix.index)
    # calc avg ratings across the similar users
    similar_users = similar_users.mean(axis=0)
    # convert to dataframe so its easy to sort and filter
    similar_users_df = pd.DataFrame(similar_users, columns=['mean'])

    # order the dataframe
    similar_users_df_ordered = similar_users_df.sort_values(by=['mean'],
                                                            ascending=False)
    # grab the top n hotels
    top_n_hotels = similar_users_df_ordered.head(9)
    top_n_hotels_indices = top_n_hotels.index.tolist()
    # look up these hotels in the other dataframe to find their information
    hotel_info = business_info_ds[business_info_ds['id'].isin(
        top_n_hotels_indices)]
    return hotel_info
Example no. 5
def process():

    query = request.form.get("data")
    selected_category = request.form.get("category").lower()
    selected_sort = request.form.get("sort")

    search_query = f.spellCheck(query)

    df = pd.read_csv("./data/news_data.csv")

    doc_set = set()
    if selected_category == 'all':
        with open("./data/index_dict.pkl", "rb") as file:
            index_dict = pickle.load(file)
        doc_set = f.linearMergePosition(search_query, index_dict)
    else:
        with open("./data/categorical_index_dict.pkl", "rb") as file:
            category_index_dict = pickle.load(file)
        doc_set = f.linearMergePosition(search_query,
                                        category_index_dict[selected_category])

    if doc_set is not None:
        result_df = df.loc[df['Doc_ID'].isin(doc_set)].copy()

        df_ = pd.read_csv("./data/full_data_and_cleaned_data.csv")

        tokenized_corpus = [
            df_['cleaned_data'][doc_id - 1].split(" ") for doc_id in doc_set
        ]
        bm25 = BM25Okapi(tokenized_corpus)
        tokenized_query = f.preprocess_words(search_query).split(" ")
        doc_scores = bm25.get_scores(tokenized_query)

        result_df["Scores"] = doc_scores
        #result_df.dropna(inplace=True)

        if selected_sort == "Relevance":
            result_df.sort_values('Scores', inplace=True, ascending=False)
        elif selected_sort == "Newest":
            result_df['Date'] = pd.to_datetime(result_df.date, yearfirst=True)
            result_df.sort_values('Date', inplace=True, ascending=False)
        else:
            result_df['Date'] = pd.to_datetime(result_df.date, yearfirst=True)
            result_df.sort_values('Date', inplace=True, ascending=True)

        all_links = []

        all_links.append([search_query, query, '0', '0'])

        for headline, link, description, date in zip(
                result_df['headline'].values, result_df['link'].values,
                result_df['short_description'].values,
                result_df['date'].values):
            all_links.append([headline, link, str(description), date])
        return jsonify(all_links)
    else:
        all_links = []
        all_links.append([search_query, query, '0', '0'])
        return jsonify(all_links)
Example no. 6
def get_10_closest_from_corpus(infile, queries):
    # load docs
    collected_articles = []
    with open(infile) as fin:
        for line in tqdm(fin):
            json_object = json.loads(line.strip())
            doc = json_object["target_text"]
            doc_id = json_object["target_title"]
            collected_articles.append(
                Article(doc_id, doc_id, doc, word_tokenize(doc, "german")))

    # compute bm25
    corpus = [art.word_list for art in collected_articles]
    bm25 = BM25Okapi(corpus)
    print("corpus indexed")

    closest = {query: [] for query in queries}
    for query in tqdm(queries):
        doc_scores = bm25.get_scores(query)
        tenth_best_score = sorted(doc_scores, reverse=True)[9]
        for idx, score in enumerate(doc_scores):
            if score >= tenth_best_score:
                closest[query].append((score, collected_articles[idx]))
        if len(closest[query]) < 10:
            print("Not enough closest queries")
            raise RuntimeError
    return closest
Example no. 7
    def _init_traindf_bm25_model(self):
        train_df = pd.read_csv(self.interface_config.train_single_turn_file_path, sep='\t')
        train_df = train_df.drop_duplicates(['topic_id', 'question_id']).reset_index(drop=True).fillna('no_q')

        added_tokens = []
        added_cnames = ['initial_request', 'answer', 'topic_desc']
        for qid in self.question_bank['question_id'].values:
            words = []
            for cname in added_cnames:
                irs = train_df[train_df['question_id'] == qid][cname].unique()
                # irs = all_df[all_df['question_id'] == qid][cname].unique()
                for ir in irs:
                    ws = stem_tokenize(ir)
                    words.extend(ws)
            words = list(set(words))
            added_tokens.append(words)
        self.question_bank['tokens_from_train'] = added_tokens
        self.question_bank['all_tokens'] = self.question_bank['tokenized_question_list'] + self.question_bank['tokens_from_train']
        self.question_bank['all_token_str'] = self.question_bank['all_tokens'].map(lambda x: ' '.join(x))

        # add train_df initial_request tokens
        # bm25_corpus = question_bank['tokenized_question_list'].tolist()
        bm25_corpus = self.question_bank['all_tokens'].tolist()
        bm25 = BM25Okapi(bm25_corpus)

        return bm25, bm25_corpus
Example no. 8
def classify_duplicate(filename):
    data = pd.read_csv(filename)

    tfidf_vect = TfidfVectorizer(stop_words="english")

    docs = data.q1.values.tolist() + data.q2.values.tolist()
    tfidf_vect.fit(docs)
    q1_dtm = tfidf_vect.transform(data['q1'])
    q2_dtm = tfidf_vect.transform(data['q2'])

    scores = []
    # build the BM25 index over q1 once rather than on every loop iteration
    bm25 = BM25Okapi([x.split(" ") for x in data["q1"].values.tolist()])
    for i in range(len(data)):
        sim_score = cosine_similarity(q1_dtm[i], q2_dtm[i])[0, 0]
        tokenized_query = data.q2[i].split(" ")
        bm25_score = bm25.get_scores(tokenized_query)[i]
        scores.append([sim_score, bm25_score])

    clf_SVM = svm.LinearSVC()
    metrics = ["roc_auc"]
    cv_SVM = cross_validate(clf_SVM, scores,data['is_duplicate'], \
                                scoring=metrics, cv=5, \
                                return_train_score=True)

    return cv_SVM['test_roc_auc'].mean()
Example no. 9
def main():
    nlp = spacy.load("en_core_web_sm")

    # TF-IDF weighting; typical text mining method
    # term frequency-inverse document frequency

    # df is assumed to be a module-level DataFrame with a 'text' column
    text_list = df.text.str.lower().values
    tok_text = []  # for our tokenised corpus

    #Tokenising using SpaCy:
    for doc in tqdm(nlp.pipe(text_list, disable=["tagger", "parser", "ner"])):
        tok = [t.text for t in doc if t.is_alpha]
        tok_text.append(tok)

    bm25 = BM25Okapi(tok_text)

    query = "Flood Defence"
    tokenized_query = query.lower().split(" ")
    import time
    t0 = time.time()
    results = bm25.get_top_n(tokenized_query, df.text.values, n=3)
    t1 = time.time()
    print(f'Searched 50,000 records in {round(t1-t0,3) } seconds \n')
    for i in results:
        print(i)
Example no. 10
def get_similarity(query, documents):
    # `query` is expected to be a single-element list so that the query string
    # ends up at position 0 of the tokenized corpus
    docs = query + documents
    docs = [word_token(d, lemma=True) for d in docs]
    tokenized_corpus = [doc.split(' ') for doc in docs]
    # print(tokenized_corpus)
    bm25 = BM25Okapi(tokenized_corpus[1:])
    bm25plus = BM25Plus(tokenized_corpus[1:])
    bm25L = BM25L(tokenized_corpus[1:])

    query = tokenized_corpus[0]
    # print(query)
    bm25_scores = bm25.get_scores(query)
    bm25plus_scores = bm25plus.get_scores(query)
    bm25L_scores = bm25L.get_scores(query)

    bm25_scores = [(i, v) for i, v in enumerate(bm25_scores)]
    bm25plus_scores = [(i, v) for i, v in enumerate(bm25plus_scores)]
    bm25L_scores = [(i, v) for i, v in enumerate(bm25L_scores)]

    bm25_scores.sort(key=lambda x: x[1], reverse=True)
    bm25plus_scores.sort(key=lambda x: x[1], reverse=True)
    bm25L_scores.sort(key=lambda x: x[1], reverse=True)

    # print(bm25_scores)
    # print(bm25plus_scores)
    # print(bm25L_scores)

    return bm25_scores, bm25plus_scores, bm25L_scores
Example no. 11
def arg_tfidf_ranking(query, documents):
    tokenized_query = [token.text for token in analyzer(query)]
    tokenized_docs = GetTokenizedDocuments(analyzer, documents)
    bm25 = BM25Okapi(tokenized_docs)
    doc_scores = bm25.get_scores(tokenized_query)
    # note: despite the key name, these are BM25 scores
    for index, doc in enumerate(documents):
        doc['tfidf_score'] = doc_scores[index]
Example no. 12
def createmodel(tok_text=[]):
    # ft_model = FastText.load(join(data_Path, '_fasttext.model'))
    weighted_doc_vects = []
    bm25 = BM25Okapi(tok_text)
    for i, dd in tqdm(enumerate(tok_text)):
        doc_vector = []
        for word in dd:
            # vector = ft_model[word]
            vector = getembeddings(word)
            weight = (bm25.idf[word] * ((bm25.k1 + 1.0) * bm25.doc_freqs[i][word])) / (
                    bm25.k1 * (1.0 - bm25.b + bm25.b * (bm25.doc_len[i] / bm25.avgdl)) + bm25.doc_freqs[i][word])
            weighted_vector = vector * weight
            doc_vector.append(weighted_vector)
        doc_vector_mean = np.mean(doc_vector, axis=0)
        weighted_doc_vects.append(doc_vector_mean)

    # save the results to disc once, after all documents have been processed
    pickle.dump(weighted_doc_vects, open(join(data_Path, "weighted_doc_vects.p"), "wb"))

    # create a matrix from our document vectors
    data = np.vstack(weighted_doc_vects)

    # initialize a new index, using a HNSW index on Cosine Similarity
    index = nmslib.init(method='hnsw', space='cosinesimil')
    index.addDataPointBatch(data)
    index.createIndex({'post': 2}, print_progress=True)
    index.saveIndex(join(data_Path, '_NMSLIB.index'), save_data=True)
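For reference, the per-word weight computed in the loop above reproduces the standard Okapi BM25 term contribution using rank_bm25's internal statistics (idf, doc_freqs, doc_len, avgdl) and the parameters k1 and b:

weight(t, d) = IDF(t) * ((k1 + 1) * f(t, d)) / (k1 * (1 - b + b * |d| / avgdl) + f(t, d))

where f(t, d) is the frequency of term t in document d, |d| is the length of d, and avgdl is the average document length of the corpus.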
Example no. 13
def compute_bm25(tokenized_query, story_id, paragraphs, n):
    tokenized_paragraphs = [paragraph.split(" ") \
                            for paragraph in paragraphs]
    bm25 = BM25Okapi(tokenized_paragraphs)
    best_p = bm25.get_top_n(tokenized_query, paragraphs, n=n)
    # each paragraph is assumed to begin with its identifier token
    best_i = [p.split(" ")[0] for p in best_p]
    return best_i
Example no. 14
    def __init__(self, config, tokenizer, stemmer, isStemming):
        self.config = config
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.isStemming = isStemming

        self.ann = load_cache(config.ANN_FILE)
        self.cn2eng_dic = load_cache(config.CN2ENG)

        self.corpus_idf = load_cache(config.CORPUS_IDF)
        self.corpus = [c.strip() for c in self.corpus_idf]
        self.tokenized_corpus = self.tokenize_corpus(self.corpus,
                                                     self.tokenizer)

        self.queries = self.load_txt(config.QUERIES)
        self.tokenized_queries = self.tokenize_corpus(self.queries,
                                                      self.tokenizer)

        self.mis_queries = self.load_txt(config.MIS_QUERIES)
        self.tokenized_mis_queries = self.tokenize_corpus(
            self.mis_queries, self.tokenizer)
        self.tokenized_suggested_queries = copy.deepcopy(
            self.tokenized_mis_queries)

        self.WORD_FREQUENCY = self.generate_word_freq(
            config.SPELLING_SUGGESTOR.WORD_FREQUENCY)
        self.STOP_WORD = self.load_txt(config.STOP_WORD)

        self.db_file = config.DATABASE_FILE
        self.BASIC_TABLE = config.BASIC_TABLE
        self.db = self.load_db()

        self.ranker = BM25Okapi(self.tokenized_corpus, k1=1.2, b=0.75)
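The snippet above is the only one in this collection that passes explicit k1 and b values to the constructor. rank_bm25's BM25Okapi accepts these as keyword arguments (its documented defaults are k1=1.5 and b=0.75, worth confirming against the installed version); a minimal sketch of tuning them, with an illustrative corpus:

from rank_bm25 import BM25Okapi

tokenized_corpus = [["okapi", "bm25", "ranking"], ["document", "length", "normalization"]]
# lower k1 saturates term-frequency influence sooner; lower b weakens length normalization
bm25_default = BM25Okapi(tokenized_corpus)                # library defaults (k1=1.5, b=0.75)
bm25_tuned = BM25Okapi(tokenized_corpus, k1=1.2, b=0.75)  # values used in the snippet above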
Example no. 15
    def __init__(self, content):

        self.content = content
        tokenized_content = [doc["abstract"].lower().split() for doc in content]
        self.bm25 = BM25Okapi(tokenized_content)

        print("Initiating the search engine")
Example no. 16
    def _init_bm25_model(self):
        self.question_bank['tokenized_question_list'] = self.question_bank['question'].map(stem_tokenize)
        self.question_bank['tokenized_question_str'] = self.question_bank['tokenized_question_list'].map(lambda x: ' '.join(x))

        bm25_corpus = self.question_bank['tokenized_question_list'].tolist()
        bm25 = BM25Okapi(bm25_corpus)
        return bm25, bm25_corpus
Example no. 17
 def load(conf: Configuration, force: Optional[bool] = False,
          persist: Optional[bool] = True) -> "BM25OkapiRanker":
     model_path = conf.path_models + 'vsm_bm25okapi/' + conf.get_desc() + '/'
     if force or (not os.path.exists(model_path)) \
             or (not os.path.isfile(model_path + 'bm25okapi.pickle')) \
             or (not os.path.isfile(model_path + 'bm25okapi_index_mapping.pickle')):
         utils.mk_dir_if_not_exists(model_path)
         dataset = BM25OkapiRanker.extractor.load_dataset(conf=conf)
         bow_corpus = [(Ranker.get_text(conf, data), data['filename']) for (index, data) in dataset.iterrows()]
         bow_corpus, names = map(list, zip(*bow_corpus))
         index_mapping = BM25OkapiRanker.build_index_mapping(names)
         bm25 = BM25Okapi(bow_corpus)
         logging.info('BM25OkapiRanker : initialized')
         bm25_ranker = BM25OkapiRanker(model=bm25, index_mapping=index_mapping, conf=conf)
         bm25_ranker.persist(model_path)
         return bm25_ranker
     else:
         with open(model_path + 'bm25okapi.pickle', mode='rb') as file:
             bm25 = pickle.load(file)
             logging.info('BM25OkapiRanker : loading bm25okapi.pickle from {}'.format(model_path))
         with open(model_path + 'bm25okapi_index_mapping.pickle', mode='rb') as file:
             index_mapping = pickle.load(file)
             logging.info('BM25OkapiRanker : loading bm25okapi_index_mapping.pickle from {}'.format(model_path))
         logging.info('BM25OkapiRanker : initialized')
         return BM25OkapiRanker(model=bm25, index_mapping=index_mapping, conf=conf)
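The load method above calls bm25_ranker.persist(model_path), which is not shown anywhere in this collection. A plausible sketch of that counterpart, under the assumption that the ranker stores its constructor arguments as self.model and self.index_mapping and that pickle is already imported as in load (this is a guess at the shape of the method, not the project's actual code):

 def persist(self, model_path: str) -> None:
     # assumed counterpart to load(): write the two pickles that load() reads back
     with open(model_path + 'bm25okapi.pickle', mode='wb') as file:
         pickle.dump(self.model, file)
     with open(model_path + 'bm25okapi_index_mapping.pickle', mode='wb') as file:
         pickle.dump(self.index_mapping, file)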
Example no. 18
def search(query):
    tokenized_corpus = [doc.split(" ") for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
    query = query.split(" ")
    subquery = []
    subquery += query

    for item in subquery:
        query.append(item.replace('\n', ''))

    scores = (bm25.get_scores(query=query))
    for score in scores:
        if score > 0:
            print(score)

        if score > 5:
            return True

    for item in query:
        for word in corpus:
            if item in exceptions:
                return False
            if len(item) < 2:
                continue
            if item.lower().find(word) != -1:
                return True
            if similarity(item, word) > 0.85:
                return True

    return False
Example no. 19
 def init_okapi25(self, data_col):
     corpus = list(self.df_r[data_col].apply(lambda x: x.split()))
     indexed = BM25Okapi(corpus)
     pandarallel.initialize()
     bm25 = self.df_r[data_col].parallel_apply(
         lambda x: indexed.get_scores(x.split()))
     # stack the per-row score arrays before argsort, as in compute_BM25 below
     return np.argsort(np.vstack(bm25), axis=1)
Example no. 20
def get_10_closest_from_corpus(infile, queries):
    # load docs
    collected_articles = []
    with open(infile) as fin:
        for line in tqdm(fin):
            json_object = json.loads(line.strip())
            # [{"id": [0, 0],
            #   "question": "Der halluzinogene Pilz <Query> \"\" wurde erstmals in einem tropischen Regenwald in der
            #   Region Uxpanapa in Veracruz im Südosten Mexikos entdeckt.",
            #   "document": "page does not exist", "document_id": "Psilocybe naematoliformis"}]
            doc = json_object[0]["document"]
            doc_id = json_object[0]["document_id"]
            collected_articles.append(
                Article(doc_id, doc_id, doc, word_tokenize(doc, "german")))

    # compute bm25
    corpus = [art.word_list for art in collected_articles]
    bm25 = BM25Okapi(corpus)
    print("corpus indexed")

    closest = {query: [] for query in queries}
    for query in tqdm(queries):
        doc_scores = bm25.get_scores(query)
        tenth_best_score = sorted(doc_scores, reverse=True)[9]
        for idx, score in enumerate(doc_scores):
            if score >= tenth_best_score:
                closest[query].append((score, collected_articles[idx]))
        if len(closest[query]) < 10:
            print("Not enough closest queries")
            raise RuntimeError
    return closest
Example no. 21
def compute_BM25(corpus_df: pd.DataFrame, query_df: pd.DataFrame,
                 data_col: str, f_name: str, reindex: bool = False) -> np.ndarray:
    pandarallel.initialize()
    base_path = "/lfs/1/sahaana/enrichment/data/Okapi25Queries"
    corpus = list(corpus_df[data_col].parallel_apply(lambda x: x.split()))
    indexed = BM25Okapi(corpus)
    bm25 = query_df[data_col].parallel_apply(
        lambda x: indexed.get_scores(x.split()))
    bm25 = np.vstack(bm25)
    np.save(f"{base_path}/{f_name}.npy", bm25)
    final = np.argsort(bm25, axis=1)

    if not reindex:
        np.save(f"{base_path}/{f_name}_argsort.npy", final)
        print(f"Saved {f_name}")
        return final
    else:
        corpus_indexes = np.array(corpus_df.index)
        query_index = np.array(query_df.index)

        final = corpus_indexes[final]
        np.save(f"{base_path}/{f_name}_argsort.npy", final)
        np.save(f"{base_path}/{f_name}_QIDs.npy", query_index)
        print(f"Saved {f_name}")
        return query_index, bm25, final
Example no. 22
 def build_bm25_model(self, documents: List[Preprocessed_Document]):
     tokenized_document_strings: List[List[str]] = []
     for document in documents:
         search_string = document.title_preprocessed + " " + document.body_preprocessed
         tokenized_document_strings.append(search_string.lower().split())
     self._BM25_model = BM25Okapi(tokenized_document_strings)
     self._indexed_documents = documents
Example no. 23
def get_bot_response():

    what_the_user_said = request.args.get('msg')
    #train_path = 'message_without_id_train.csv'
    train_path_res = 'response_without_id_train.csv'

    #### The commented-out block below uses two BM25 models: it first matches the query to a message, then matches that message to a response.
    # file = open(train_path)
    # read_csv = csv.reader(file)
    # corpus1 = []
    # for row in read_csv:
    #     corpus1.append(row[0])
    # tokenized_corpus = [doc.split(" ") for doc in corpus1]
    # bm25 = BM25Okapi(tokenized_corpus)
    # tokenized_query = what_the_user_said.split(" ")
    # doc_scores = bm25.get_scores(tokenized_query)
    # result = bm25.get_top_n(tokenized_query, corpus1, n=1)

    # query = result[0]

    # message = pd.read_csv('merged_train.csv')
    # message['re_id']=message['response_id']+' '+ message['response']
    # message_id = message[['message_id']].drop_duplicates(subset="message_id")['message_id'].to_list()
    # corpus = []
    # for i in message_id:
    #     response = message[message['message_id']==i]['re_id'].to_list()
    #     corpus.append(response)

    # q = []
    # with open('query_train.txt') as file:
    #     for i in file:
    #         q.append(i.strip())

    # corpus_index = q.index(query)

    # tokenized_corpus1 = [doc.split(" ") for doc in corpus[corpus_index]]
    # BM25 = BM25Okapi(tokenized_corpus1)
    # tokenized_query1 = what_the_user_said.split(" ")
    # doc_scores1 = BM25.get_scores(tokenized_query1)
    # result1 = BM25.get_top_n(tokenized_query1, corpus[corpus_index], n=10)

    # if len(result1) !=0:
    #     return result1[0].split(' ', 1)[1]

    corpus = []
    with open(train_path_res) as file:
        read_csv = csv.reader(file)
        for row in read_csv:
            corpus.append(row[0])
    tokenized_corpus = [doc.split(" ") for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = what_the_user_said.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    result = bm25.get_top_n(tokenized_query, corpus, n=1)

    if len(result) != 0:
        return result[0]
    else:
        return "I don't know"
Example no. 24
def corpus_index():
    cache_dict = open_cache()
    corpus = list(cache_dict.values())
    tokenized_corpus = [
        remove_stopwords(str(doc).split(" ")) for doc in corpus
    ]
    bm25plus = BM25Okapi(tokenized_corpus)
    return corpus, bm25plus, cache_dict
Example no. 25
 def run_bm25(self):
     tokenized_corpus = [
         passage.text.split(" ") for passage in self.passages
     ]
     bm25 = BM25Okapi(tokenized_corpus)
     tokenized_topic = self.topic.split(" ")
     self.bm25_scores = bm25.get_scores(tokenized_topic)
     assert len(self.passages) == len(self.bm25_scores)
Example no. 26
def search(query):
    topicPlusContent, contents = getTopicContent(selectMainFromData())
    tokenized_corpus = getTokenizedCorpus(topicPlusContent)
    tokenized_query = getNouns(query)
    print(tokenized_query)
    bm25 = BM25Okapi(tokenized_corpus)

    return bm25.get_top_n(tokenized_query, contents, n=1)[0]
Example no. 27
def BM25Search(corpus, searchquery, ntopsentences):

    # corpus = f.readlines()
    tokenized_corpus = [doc.split(" ") for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = searchquery.split(" ")
    doc_scores = bm25.get_scores(tokenized_query)
    return (doc_scores, bm25.get_top_n(tokenized_query, corpus, n=ntopsentences))
Example no. 28
 def _build_index(self, corpus: Iterable[str]) -> Union[BM25Okapi, BM25Sklearn]:
     if self.use_sklearn:
         bm25 = BM25Sklearn()
         bm25.fit(corpus)
         return bm25
     else:
         tokenized_corpus = [self._tokenize(doc) for doc in corpus]
         return BM25Okapi(tokenized_corpus)
Example no. 29
 def genRankerAndTable(self):
     tokenized = [
         self.changeToBasicForm(self.filterStopWords(word_tokenize(page)))
         for page in [self.contents[key] for key in self.contents]
     ]
     self.bm25 = BM25Okapi(tokenized)
     for t in tokenized:
         self.similarities.append(self.bm25.get_scores(t))
Example no. 30
 def __init__(self, corpus, tokenizer_fn):
     """
     :param corpus: corpus of documents.
     :param tokenizer_fn: tokenizer function to extract tokens from the documents and the queries.
     """
     self.tokenizer_fn = tokenizer_fn
     tokenized_corpus = [tokenizer_fn(doc) for doc in corpus]
     self.bm25 = BM25Okapi(tokenized_corpus)
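Only the constructor of this wrapper is shown; a short usage sketch, assuming the surrounding class exposes the bm25 and tokenizer_fn attributes stored above (the class name BM25Searcher is hypothetical, not from the source):

# hypothetical class name for the wrapper whose __init__ is shown above
searcher = BM25Searcher(corpus=["first document text", "second document text"],
                        tokenizer_fn=lambda text: text.lower().split())
query_tokens = searcher.tokenizer_fn("second text")
scores = searcher.bm25.get_scores(query_tokens)  # one BM25 score per corpus document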