def search(self, question: str):
    """Look up the passages of ``self.wiki`` that best match ``question``.

    The question is embedded with ``self.bi_encoder`` and compared against
    ``self.corpus_embeddings_for_bi_encoder`` via ``util.semantic_search``
    (at most ten hits are requested). The question and the elapsed search
    time are printed.

    Returns:
        A list of dicts with the keys ``title``, ``paragraph``, ``url`` and
        ``index`` (the positional row of the hit, cast to ``int``), in the
        order the hits were returned.
    """
    encoder = self.bi_encoder
    articles = self.wiki
    passage_vectors = self.corpus_embeddings_for_bi_encoder

    # Time only the encoding and the nearest-neighbour lookup.
    started = time.time()
    encoded = encoder.encode(question, convert_to_tensor=True)
    ranked = util.semantic_search(encoded, passage_vectors, top_k=10)[0]
    finished = time.time()

    print("Input question:", question)
    print("Results (after {:.3f} seconds):".format(finished - started))

    def as_record(match):
        # ``corpus_id`` is used as a positional row index into ``articles``.
        position = match['corpus_id']
        row = articles.iloc[position]
        return {
            "title": row['title'],
            "paragraph": row['paragraph'],
            "url": row['url'],
            "index": int(position),
        }

    return [as_record(match) for match in ranked[0:10]]
def retrieval(self, query):
    """Return up to five entries judged most relevant to ``query``.

    Two passes are made: ``self.text_encoder`` + ``util.semantic_search``
    select 100 candidates, then ``self.pair_encoder`` scores every
    (query, entry) pair. Only entries among the five best cross-scores whose
    cross-score exceeds 1e-3 are returned, best first.
    """
    # First pass: cheap embedding similarity over all passages.
    query_vector = self.text_encoder.encode(query, convert_to_tensor=True)
    candidates = util.semantic_search(
        query_vector, self.passage_embeddings, top_k=100)[0]

    # Second pass: score each candidate together with the query.
    pairs = [[query, self.entries[candidate['corpus_id']]]
             for candidate in candidates]
    rerank_scores = self.pair_encoder.predict(pairs)
    for position, rerank_score in enumerate(rerank_scores):
        candidates[position]['cross-score'] = rerank_score

    # Keep the best few, dropping those with a negligible cross-score.
    best_first = sorted(
        candidates, key=lambda candidate: candidate['cross-score'],
        reverse=True)
    return [
        self.entries[candidate['corpus_id']]
        for candidate in best_first[:5]
        if candidate['cross-score'] > 1e-3
    ]
def search_n_articles(self, id_article, n=10):
    """Get the ids of the articles most similar to a given article.

    The query text is the article's ``motcol`` and ``resume`` fields joined
    by a space; it is embedded with ``self.model`` and compared against
    ``self.corpus_embeddings`` with ``util.semantic_search``.

    Parameters:
    -----------
    id_article: int, the id of the original article
    n: int, the maximum number of hits requested from the search
        (defaults to 10, the library's own default ``top_k``). Fewer ids may
        be returned because hits whose ``resume`` is ``";"`` are skipped.

    Returns:
    List(int), list of ids of the most semantically similar articles

    Raises:
    IndexError, if no article with ``id_article`` exists
    """
    # TODO: preprocess the imported text
    df_publis = self.data_utils.original_dataframe
    paper = df_publis[df_publis.id == id_article]
    if paper.empty:
        # The original failed here with an opaque "index 0 is out of bounds";
        # keep the exception type but say what went wrong.
        raise IndexError(f"no article with id {id_article!r}")

    text = paper.motcol + " " + paper.resume
    query_embedding = self.model.encode(text.values[0],
                                        convert_to_tensor=True)
    search_articles = util.semantic_search(query_embedding,
                                           self.corpus_embeddings,
                                           top_k=n)[0]

    matchs = []
    for hit in search_articles:
        related_paper = self.data_corpus.iloc[hit['corpus_id'], :]
        # Map the corpus row back to the original dataframe through ``motcol``.
        candidates = df_publis[df_publis.motcol == related_paper['motcol']]
        if candidates.empty:
            continue
        # A resume of ";" is skipped (presumably a placeholder for "no
        # abstract" -- confirm against the data preparation step).
        if candidates.resume.values[0] != ";":
            matchs.append(candidates.id.values[0])
    return matchs
def get_top_k(self, query: str, k=5) -> List[Dict]:
    """Get k most similar to query sentences

    You need to call load_embeddings or calc_embeddings first to use this method

    Args:
        query (str): text for which you want to find similar sentences
        k (int, optional): number of sentences to find. Defaults to 5.

    Returns:
        List[Dict]: List with dictionaries of the following structure:
        {
            ts: timestamp of message (converted with ``str``),
            score: cosine similarity score (converted with ``str``),
            text: message text
        }
        Hits whose score is (numerically) 1 are left out, so fewer than
        ``k`` items may be returned.

    Raises:
        ValueError: if the embeddings have not been initialized
    """
    if self.embeddings is None:
        raise ValueError(
            "embeddings are not initialized. Call load_embeddings or calc_embeddings first"
        )

    query_embedding = self.model.encode([query],
                                        convert_to_tensor=True,
                                        show_progress_bar=False)
    hits = util.semantic_search(query_embedding, self.embeddings, top_k=k)[0]

    # Skip exact matches (presumably the query itself being part of the
    # corpus -- confirm). Cosine similarity is computed in floating point, so
    # an identical sentence can score slightly off 1.0; compare with a
    # tolerance instead of ``!= 1``.
    exact_match_tolerance = 1e-5
    return [
        {
            "ts": str(self.text_df[hit["corpus_id"]][0]),
            "score": str(hit["score"]),
            "text": self.text_df[hit["corpus_id"]][1],
        }
        for hit in hits
        if abs(hit["score"] - 1.0) > exact_match_tolerance
    ]
def sample(self, paper_id, abstract, title):
    """Given paper_text ( = paper_abstract+paper_title), pick the most
    similar paper that is not the query paper itself.

    Args:
        paper_id (str): the arxiv id of the paper which is treated as the starting point
        abstract (str): abstract of paper
        title (str) : title of paper

    Returns:
        str: id (from ``self.corpus_ids``) of the best-ranked hit whose id
        differs from ``paper_id``

    Raises:
        IndexError: if the search returns no paper other than ``paper_id``
    """
    paper_text = self.clean_text(abstract + ' ' + title)

    # get the vector for query paper
    query_embedding = self.model.encode(paper_text, convert_to_tensor=True)

    # retrieve top similar papers
    search_hits = util.semantic_search(query_embedding,
                                       self.corpus_embeddings)[0]

    # Walk the ranking instead of hard-coding positions 0 and 1: this copes
    # with a single hit and with the query paper appearing more than once.
    # Ids are compared in string form because the result is returned as
    # ``str`` and ``self.corpus_ids`` may not hold strings.
    wanted = str(paper_id)
    for hit in search_hits:
        candidate = str(self.corpus_ids[hit['corpus_id']])
        if candidate != wanted:
            return candidate

    raise IndexError(
        f"no paper other than {paper_id!r} found among the search hits")
def recommend_by_title_similarity(df: pd.DataFrame, user_id=None,
                                  n_titles: int = 5, top_n: int = 10):
    """Function produces embeddings for book titles and performs
    semantic similarity search using a list of book titles
    passed as a query.

    The title embeddings are cached in the module-level ``title_embeddings``
    variable, so the model is only loaded on the first call.

    :param df: DataFrame with all available data
    :param user_id: User ID; when ``None`` a user is chosen by ``select_user``
    :param n_titles: Number of titles to select from user history
    :param top_n: Number of similar titles to search for
    :return: List of recommended book titles (the original returned a
        ``map`` object that had already been consumed by printing, so
        callers always received an empty iterator)
    """
    global title_embeddings

    # ``is None`` rather than ``or`` so that a falsy id such as 0 is honoured.
    user = user_id if user_id is not None else select_user(df)
    query = get_user_data(df, user, titles_only=True)
    query = query[:n_titles]

    print(f'Top-{n_titles} books with highest ratings:')
    for title in query:
        print(title)

    # All unique content IDs and titles
    df = df[['Content_ID', 'Book-Title']].drop_duplicates().reset_index(drop=True).copy()

    # If the function was called for the 1st time,
    # we have to convert all titles into embeddings.
    if title_embeddings is None:
        # Pretrained NLP model (only needed when nothing is cached yet)
        model = SentenceTransformer('paraphrase-distilroberta-base-v1')
        title_embeddings = model.encode(df['Book-Title'].to_list(),
                                        convert_to_tensor=True)
        print(f'Book titles converted to embeddings. Shape: {title_embeddings.shape}')
        query_embeddings = model.encode(query, convert_to_tensor=True)
        print(f'Query titles converted to embeddings. Shape: {query_embeddings.shape}')

    # For repeated calls when embeddings are already available
    else:
        indexes = df[df['Book-Title'].isin(query)].index
        query_embeddings = title_embeddings[indexes]
        print(f'Embedding for query titles selected. Shape: {query_embeddings.shape}')

    # For every title in the original query extract indexes of similar titles
    result = util.semantic_search(query_embeddings, title_embeddings, top_k=top_n)
    similar_indexes = [
        similar['corpus_id']
        for hits in result
        for similar in hits
        # Do not recommend identical titles
        # and title with low similarity score.
        if 0.75 < similar['score'] < 0.95
    ]

    # Replace indexes by respective titles. Materialise the list so that
    # printing it below does not exhaust what is returned.
    recommendations = [df.loc[idx, 'Book-Title'] for idx in similar_indexes]

    print('Semantically similar titles:')
    for title in recommendations:
        print(title)

    return recommendations
def fetch_similar(
    queries: list,
    top_k: int = 5,
    threshold: float = 0.75,
) -> list[dict]:
    """Find knowledge-base phrases semantically similar to each query.

    Every query is embedded with ``embedder`` and searched against
    ``corpus_embeddings``; each hit scoring at least ``threshold`` yields one
    result record built from ``get_template()``.

    Args:
        queries: phrases to look up.
        top_k: maximum number of hits requested per query.
        threshold: minimum score for a hit to be kept. Integers
            (e.g. ``1``) are accepted as well as floats.

    Returns:
        A list of template dicts with ``score`` (rounded to 5 digits),
        ``phrase``, ``kb_phrase``, ``subindustry`` and ``method`` filled in.
        Empty when ``queries`` is empty or nothing passes the threshold.

    Raises:
        ValueError: if an argument has the wrong type.
    """
    if not isinstance(queries, list):
        raise ValueError(f'Expected type list for queries; got {type(queries)}')
    elif not isinstance(top_k, int):
        raise ValueError(f'Expected type int for top_k; got {type(top_k)}')
    elif isinstance(threshold, bool) or not isinstance(threshold, (int, float)):
        # ``bool`` is an ``int`` subclass but never a meaningful threshold.
        raise ValueError(f'Expected type float for threshold_; got {type(threshold)}')

    if not queries:
        return []

    query_embeddings = embedder.encode(queries, convert_to_tensor=True)
    hits = util.semantic_search(query_embeddings, corpus_embeddings, top_k=top_k)
    if not hits:
        return []

    res_all = []
    # ``hits`` holds one list of hits per query, in query order.
    for phrase, phrase_hits in zip(queries, hits):
        for hit in phrase_hits:
            score = hit['score']
            if score < threshold:
                continue
            kb_phrase = corpus[hit['corpus_id']]
            res = get_template()
            res['score'] = round(score, 5)
            res['phrase'] = phrase
            res['kb_phrase'] = kb_phrase
            res['subindustry'] = sub_ind
            res['method'] = 'semantic'
            res_all.append(res)

    return res_all
def test_semantic_search(self):
    """Check util.semantic_search against a direct cosine-similarity top-k."""
    query_count, k = 20, 10

    corpus_vectors = torch.tensor(np.random.randn(1000, 100))
    query_vectors = torch.tensor(np.random.randn(query_count, 100))

    # Deliberately small chunk sizes so that the chunked code path is used.
    results = util.semantic_search(query_vectors,
                                   corpus_vectors,
                                   top_k=k,
                                   query_chunk_size=5,
                                   corpus_chunk_size=17)
    assert len(results) == query_count
    assert len(results[0]) == k

    # Sanity check: compare with the top-k of the full similarity matrix.
    expected_scores, expected_ids = util.pytorch_cos_sim(
        query_vectors, corpus_vectors).topk(k)
    expected_scores = expected_scores.cpu().tolist()
    expected_ids = expected_ids.cpu().tolist()

    for query_pos in range(query_count):
        found_row = results[query_pos]
        for rank in range(k):
            found = found_row[rank]
            assert found['corpus_id'] == expected_ids[query_pos][rank]
            assert np.abs(found['score'] -
                          expected_scores[query_pos][rank]) < 0.001
def search(self, question: str, corpus: str, top_k: int) -> pd.DataFrame:
    """Semantic search with cross-encoder re-ranking.

    ``question`` is embedded with ``self.encoder`` and searched against the
    embeddings stored for ``corpus``; the ``top_k`` hits are then scored
    with ``self.cross_encoder``.

    Returns:
        A DataFrame built from the hit dicts (search fields plus
        ``cross-score`` and ``snippet``, the matched text with newlines
        replaced by spaces), sorted by ``cross-score`` descending.

    Raises:
        AssertionError: if ``corpus`` is not a key of ``self.corpus_dict``.
    """
    assert (
        corpus in self.corpus_dict
    ), "Corpus not found, please fit the corpus first using the .fit() call"

    encoded = self.encoder.encode(question, convert_to_tensor=True)
    # A single question is searched, so the last per-query list is the only one.
    candidates = util.semantic_search(encoded,
                                      self.corpus_embeddings_dict[corpus],
                                      top_k=top_k)[-1]

    # now, score all retrieved passages with the cross_encoder
    documents = self.corpus_dict[corpus]
    pairs = [[question, documents[candidate["corpus_id"]]]
             for candidate in candidates]
    rerank_scores = self.cross_encoder.predict(pairs)

    for position, rerank_score in enumerate(rerank_scores):
        candidate = candidates[position]
        candidate["cross-score"] = rerank_score
        candidate["snippet"] = documents[candidate["corpus_id"]].replace(
            "\n", " ")

    # sort results by the cross-encoder scores
    candidates.sort(key=lambda candidate: candidate["cross-score"],
                    reverse=True)
    return pd.DataFrame(candidates)
def get_top_k(self, query: str, k=5) -> List[Dict]:
    r"""Get k most similar to query sentences

    You need to call load_embeddings or calc_embeddings first to use this method

    Args:
        query (str): text for which you want to find similar sentences
        k (int, optional): number of sentences to find. Defaults to 5.
            A value larger than the number of stored embeddings is reduced
            to that number and a warning is issued.

    Returns:
        List[Dict]: List with dictionaries of the following structure:
        {
            ts: timestamp of message (converted with ``str``),
            score: cosine similarity score (converted with ``str``),
            text: message text
        }

    Raises:
        ValueError: if the embeddings have not been initialized

    Example 1: calculate embeddings, save them and get top 5 sentences ::

        >>> df = pd.read_csv("data/prepared/edu_courses.tsv", sep="\t")
        >>> engine = SemanticEngine(text_df=df)
        >>> engine.calc_embeddings(df.text.tolist())
        >>> engine.save_embeddings("data/embeddings/edu_courses.pkl")
        >>> query = "посоветуйте каких-нибудь курсов по pytorch"
        >>> result = engine.get_top_k(query, k=5)
        >>> for res in result:
        ...     print(res["ts"], res["text"], res["score"], sep="\n")

    Example 2: load embeddings from file, and get top 5 sentences ::

        >>> df = pd.read_csv("data/prepared/edu_courses.tsv", sep="\t")
        >>> engine = SemanticEngine(text_df=df)
        >>> engine.load_embeddings("data/embeddings/edu_courses.pkl")
        >>> query = "посоветуйте каких-нибудь курсов по pytorch"
        >>> result = engine.get_top_k(query, k=5)
        >>> for res in result:
        ...     print(res["ts"], res["text"], res["score"], sep="\n")
    """
    if self.embeddings is None:
        raise ValueError(
            "embeddings are not initialized. Call `load_embeddings` or `calc_embeddings` first"
        )

    corpus_size = len(self.embeddings)
    if k > corpus_size:
        # Single-line message: the former triple-quoted f-string carried the
        # source file's newlines and indentation into the warning text.
        # stacklevel=2 attributes the warning to the caller.
        warnings.warn(
            f"`k` with value of {k} is bigger than number of sentences "
            f"with value of {corpus_size}. Value of k is set to {corpus_size}",
            stacklevel=2,
        )
        k = corpus_size

    query_embedding = self.model.encode([query],
                                        convert_to_tensor=True,
                                        show_progress_bar=False)
    hits = util.semantic_search(query_embedding, self.embeddings, top_k=k)[0]

    return [
        {
            "ts": str(self.text_df[hit["corpus_id"]][0]),
            "score": str(hit["score"]),
            "text": self.text_df[hit["corpus_id"]][1],
        }
        for hit in hits
    ]
def search_papers(title, abstract):
    """Print the titles of the papers most similar to the given one.

    The query text is ``title`` and ``abstract`` joined by a space; up to 25
    hits from ``corpus_embeddings`` are printed with their scores.
    """
    paper_text = title + ' ' + abstract
    paper_vector = model.encode(paper_text, convert_to_tensor=True)

    # TODO: top_k = number of similar papers to show
    neighbours = util.semantic_search(paper_vector, corpus_embeddings,
                                      top_k=25)[0]

    print("Paper:", title)
    print("Most similar papers:")
    for neighbour in neighbours:
        similar = papers[neighbour['corpus_id']]
        print("{:.2f}\t{}".format(neighbour['score'], similar['title']))
def search(query, k=3):
    """Print the names of the ``k`` images best matching ``query``.

    ``query`` is encoded with ``model`` (per the original comment it may be
    an image or a text string) and compared against ``img_emb`` with
    ``util.semantic_search``, which ranks by cosine similarity.
    """
    encoded = model.encode([query],
                           convert_to_tensor=True,
                           show_progress_bar=False)

    # Only one query is encoded, so take its list of hits.
    matches = util.semantic_search(encoded, img_emb, top_k=k)[0]

    print("Query:")
    for match in matches:
        image_name = img_names[match['corpus_id']]
        print(image_name)
def search_papers(title, abstract):
    """Print score, title, venue and year of the papers most similar to
    the given one.

    The query text is ``title`` and ``abstract`` joined by a space; the
    number of hits is the default of ``util.semantic_search``.
    """
    paper_text = title + ' ' + abstract
    paper_vector = model.encode(paper_text, convert_to_tensor=True)

    # One query was searched; keep its hits only.
    neighbours = util.semantic_search(paper_vector, corpus_embeddings)[0]

    print("\n\nPaper:", title)
    print("Most similar papers:")
    for neighbour in neighbours:
        similar = papers[neighbour['corpus_id']]
        print("{:.2f}\t{}\t{} {}".format(neighbour['score'],
                                         similar['title'],
                                         similar['venue'],
                                         similar['year']))
async def main(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP entry point: rank the stored questions against a ``query``.

    ``query`` is read from the URL parameters or, failing that, from the
    JSON request body. The stored questions (with embeddings) are fetched
    over GraphQL, filtered with ``filterQuestions`` and ranked with
    ``util.semantic_search``.

    Returns:
        A JSON array of the matching questions (without their ``embedding``
        field, with a float ``score`` added), or a plain-text notice when no
        query was supplied.
    """
    query = req.params.get('query')
    if not query:
        try:
            req_body = req.get_json()
        except ValueError:
            # Body is not JSON: fall through to the "no query" response.
            pass
        else:
            query = req_body.get('query')

    if not query:
        # Answer before touching GraphQL or the model: nothing to search for.
        return func.HttpResponse(
            "This HTTP triggered function executed successfully. Pass a name in the query string or in the request body for a personalized response.",
            status_code=200
        )

    client = GraphqlClient(endpoint="http://localhost:4000/graphql")
    questions_query = """
        query {
            questions {
                id
                name
                text
                embedding
            }
        }
    """
    result = list(filter(
        filterQuestions,
        client.execute(query=questions_query)["data"]["questions"]))
    corpus_embeddings = [torch.FloatTensor(o["embedding"]) for o in result]

    embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    searched = util.semantic_search(query_embedding, corpus_embeddings)[0]

    ret = []
    for o in searched:
        # Copy without the (large) embedding instead of mutating the source.
        item = {key: value
                for key, value in result[o["corpus_id"]].items()
                if key != "embedding"}
        # float() handles both plain Python floats and NumPy scalars;
        # ``.astype`` only exists on the latter.
        item["score"] = float(o["score"])
        ret.append(item)
    return func.HttpResponse(json.dumps(ret))
def get_closest(self, conjecture_embedding, statements_embedding, score_function=util.cos_sim):
    """Rank the statement titles by similarity to a conjecture.

    ``statements_embedding`` maps titles to embeddings. Its values are
    searched with ``util.semantic_search`` (up to 30000 hits) and the keys
    of the hits for the first query are returned in ranked order.
    """
    # Keys and values are listed separately; both follow the mapping's order,
    # so a hit's ``corpus_id`` indexes straight into ``titles``.
    titles = list(statements_embedding.keys())
    vectors = list(statements_embedding.values())

    ranked = util.semantic_search(
        conjecture_embedding,
        vectors,
        score_function=score_function,
        top_k=30000,
    )[0]
    return [titles[match["corpus_id"]] for match in ranked]
def get_top_k(self, query, top_k):
    """Return the ``top_k`` rows of ``self.corpus_df`` closest to ``query``.

    Returns:
        A list of dicts with the keys ``ts`` and ``text`` (taken from the
        matched row) and ``score`` (taken from the hit); empty when the
        search returns nothing.
    """
    encoded = self.model.encode(query, convert_to_tensor=True)
    per_query = util.semantic_search(query_embeddings=encoded,
                                     corpus_embeddings=self.embeddings,
                                     top_k=top_k)
    if not per_query:
        return []

    def to_record(match):
        # ``corpus_id`` is used as a positional row index.
        row = self.corpus_df.iloc[match['corpus_id']]
        return {
            'ts': row['ts'],
            'text': row['text'],
            'score': match['score'],
        }

    return [to_record(match) for match in per_query[0]]
def handleQuestion(question):
    """Search ``embeddings`` for ``question`` and return the hits as JSON.

    The question is also written into the ``question`` column of the global
    ``db`` frame. The hits (``corpus_id``, ``score``) are inner-joined with
    ``db`` on its index and serialised as a JSON array of records.
    """
    db["question"] = question

    start = time.time()
    q_embedding = model.encode(question,
                               convert_to_tensor=True,
                               show_progress_bar=True)
    s_results = util.semantic_search(q_embedding, embeddings, top_k=top_k)
    results = pd.DataFrame(s_results[0], columns=["corpus_id", "score"])

    # sort/score results with the cross-encoder:
    # cross_inp = db.iloc[results.corpus_id][["question","Overview"]].to_numpy().tolist()
    # cross_sco=cross_encoder.predict(cross_inp, show_progress_bar=True)
    # results['cross_score']=cross_sco
    # results = results.sort_values("cross_score", ascending=True)
    end = time.time()

    # NOTE(review): this sets a plain attribute, not a column, so the duration
    # is not part of the merged frame or of the returned JSON.
    results.duration = end - start

    merged = pd.merge(results, db, "inner",
                      left_on="corpus_id", right_index=True)
    # "records" is the valid orient name; "record" is rejected by to_json.
    return merged.to_json(orient="records")
def _get_relevant_comments_helper(comments, query, query_embedding, corpus_embeddings):
    """Return up to ten hits for ``query``, re-ranked by a cross-encoder.

    Candidates come from ``util.semantic_search`` (top 10); each one gets a
    ``cross-score`` from the ``cross-encoder/ms-marco-TinyBERT-L-6`` model
    and the hit dicts are returned sorted by that score, best first.
    """
    candidates = util.semantic_search(query_embedding, corpus_embeddings,
                                      top_k=10)[0]

    reranker = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')
    pairs = [[query, comments[candidate['corpus_id']]]
             for candidate in candidates]
    for position, rerank_score in enumerate(reranker.predict(pairs)):
        candidates[position]['cross-score'] = rerank_score

    candidates.sort(key=lambda candidate: candidate['cross-score'],
                    reverse=True)
    return candidates[:10]
def fluid_search(self, query, considered_candidates=50, selected_candidates=5, second_pass=True):
    """Return the filenames of the entries best matching ``query``.

    ``considered_candidates`` hits are retrieved by embedding similarity;
    at most ``selected_candidates`` of them are returned. With
    ``second_pass`` the hits are re-scored by ``self.pair_encoder`` first
    and those with a cross-score of 1e-3 or less are dropped.

    Returns:
        A list of filenames, or a one-element list holding an error message
        when ``self.essence_ready`` is false after ``self.load_essence()``.
    """
    self.load_essence()
    if self.essence_ready == False:
        return ['The essence is not present at the required location.']
    self.sync_cache()

    limit = min(selected_candidates, considered_candidates)
    query_vector = self.text_encoder.encode(query, convert_to_tensor=True)
    candidates = util.semantic_search(query_vector,
                                      torch.Tensor(self.entry_embeddings),
                                      top_k=considered_candidates)[0]

    if not second_pass:
        return [self.entry_filenames[candidate['corpus_id']]
                for candidate in candidates[:limit]]

    pairs = [[query, self.entry_contents[candidate['corpus_id']]]
             for candidate in candidates]
    rerank_scores = self.pair_encoder.predict(pairs)
    for position, rerank_score in enumerate(rerank_scores):
        candidates[position]['cross-score'] = rerank_score

    best_first = sorted(candidates,
                        key=lambda candidate: candidate['cross-score'],
                        reverse=True)
    return [
        self.entry_filenames[candidate['corpus_id']]
        for candidate in best_first[:limit]
        if candidate['cross-score'] > 1e-3
    ]
def retrieve_top_k_similar_issues(model, issues, embeddings, description, top_k):
    """Find the ``top_k`` issues whose embeddings are closest to ``description``.

    Progress and timing messages are printed.

    Returns:
        A list of dicts with ``id`` (int, from ``bug_id``), ``description``
        (str, from ``full_description``) and ``similarity`` (float score), in
        ranked order.
    """
    print('Retrieving top-{} similar issues...'.format(top_k))
    started = time.time()

    encoded = model.encode(description, convert_to_tensor=True)
    matches = util.semantic_search(encoded, embeddings, top_k=top_k)[0]

    def to_issue(match):
        # ``corpus_id`` is used as a positional row index into ``issues``.
        row = issues.iloc[match['corpus_id']]
        return {
            'id': int(row['bug_id']),
            'description': str(row['full_description']),
            'similarity': float(match['score']),
        }

    similar_issues = [to_issue(match) for match in matches]

    elapsed = time.time() - started
    print('Retrieved top-{} similar issues in {} seconds'.format(
        top_k, round(elapsed, 5)))
    return similar_issues
def recommend_by_similar_users(user_id: int, df: pd.DataFrame,
                               embeddings: pd.DataFrame, top_k: int = 11) -> list:
    """Function searches 'embeddings' for vectors most similar
    to the 'user_id', selects books that were read and were not
    explicitly disliked by similar users, which are not present
    in the 'user_id' history.

    :param user_id: Integer ID for the user
    :param df: DataFrame with all available data
    :param embeddings: DataFrame of embeddings for all users
    :param top_k: Maximum number of similar user vectors to search
        (including the query user itself, which is removed afterwards)
    :return: List of recommended content IDs
    """
    # Array representing the query user
    query = embeddings.loc[user_id, :].values

    # Information about the query user
    authors, read_books = get_user_data(df, user_id)
    print(f'User {user_id} read authors:', authors)

    # Search for top_k vectors most similar to the query
    # (returns a pd.DataFrame with 2 columns: 'corpus_id' and 'score')
    similar_users = pd.DataFrame(
        util.semantic_search(query, embeddings.values, top_k=top_k)[0]
    )

    # Add users IDs finding them by row indexes
    similar_users['User-ID'] = embeddings.iloc[similar_users['corpus_id'], :].index
    print(f'Similar users:\n{similar_users}')

    # Remove the query user (self-match) by ID rather than by position:
    # with tied scores the self-match is not guaranteed to be the first row,
    # and dropping row 0 blindly would then discard a real neighbour.
    not_self = similar_users['User-ID'] != user_id
    similar_users = similar_users.loc[not_self, 'User-ID'].head(max(top_k - 1, 0))

    # Select books from similar users histories except disliked books,
    # one-off accidental readings and books present in current user's logs
    recommendations = select_unread_books(df, similar_users, read_books)
    display_recommendations(recommendations)

    return recommendations
def search(query):
    """Print the best passages for ``query`` from both retrieval stages.

    The bi-encoder retrieves ``top_k`` candidate passages, the cross-encoder
    re-scores them, and the three best hits according to each score are
    printed.
    """
    print("Input question:", query)

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Put the query on the device the corpus lives on instead of calling
    # ``.cuda()`` unconditionally, which fails on machines without a GPU.
    corpus_device = getattr(corpus_embeddings, "device", None)
    if corpus_device is not None:
        question_embedding = question_embedding.to(corpus_device)
    hits = util.semantic_search(question_embedding, corpus_embeddings,
                                top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    ##### Re-Ranking #####
    # Now, score all retrieved passages with the cross_encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Attach the cross-encoder score to each hit
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Output of top-3 hits from bi-encoder
    print("\n-------------------------\n")
    print("Top-3 Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    for hit in hits[0:3]:
        print("\t{:.3f}\t{}".format(
            hit['score'], passages[hit['corpus_id']].replace("\n", " ")))

    # Output of top-3 hits from re-ranker
    print("\n-------------------------\n")
    print("Top-3 Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[0:3]:
        print("\t{:.3f}\t{}".format(
            hit['cross-score'],
            passages[hit['corpus_id']].replace("\n", " ")))
def search(self, embds, index, q_embedding, top_k_hits=5):
    '''
    Query a FAISS index, print the hits and report recall against the
    exact search.

    embds: corpus embeddings used for the exact (reference) search
    index: the faiss index used for the search
    q_embedding: embedding of the query
    top_k_hits: number of hits to output

    Returns the list of ``url`` values of the printed hits.
    '''
    # FAISS works with inner product (dot product). When we normalize vectors
    # to unit length, inner product is equal to cosine similarity
    unit_query = q_embedding / np.linalg.norm(q_embedding)
    q_embedding = np.expand_dims(unit_query, axis=0)

    # Search in FAISS. It returns a matrix with distances and corpus ids.
    distances, corpus_ids = index.search(q_embedding, top_k_hits)
    hits = sorted(
        ({'corpus_id': corpus_id, 'score': score}
         for corpus_id, score in zip(corpus_ids[0], distances[0])),
        key=lambda hit: hit['score'],
        reverse=True)

    top_urls = []
    for rank, hit in enumerate(hits[0:top_k_hits], start=1):
        id_num = self.df.index[hit['corpus_id']]
        print("{}\t{:.3f}\t{}".format(rank, hit['score'], id_num))
        item = self.df.iloc[hit['corpus_id']]
        print(f"{item['descrption']}\t{item['brand']}\t{item['price']}\n{item['url']}")
        top_urls.append(item['url'])
        print()

    # Compare the approximate result with the exact search over ``embds``.
    correct_hits = util.semantic_search(q_embedding, embds,
                                        top_k=top_k_hits)[0]
    correct_hits_ids = {hit['corpus_id'] for hit in correct_hits}
    ann_corpus_ids = {hit['corpus_id'] for hit in hits}
    if len(ann_corpus_ids) != len(correct_hits_ids):
        print("Approximate Nearest Neighbor returned a different number of results than expected")

    shared = ann_corpus_ids.intersection(correct_hits_ids)
    recall = len(shared) / len(correct_hits_ids)
    print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))
    return top_urls
# Store the assigned category back into the database
def returnCategory(policy_id, category_id):
    """Insert a (policy_id, category_id) row into ``policy_category``.

    Returns whatever ``DB.execution`` returns.
    """
    # NOTE(review): the statement is built by string interpolation. Switch to
    # a parameterized query if DB.execution supports bound parameters and the
    # ids can ever come from untrusted input.
    sqlstr = "insert into policy_category (policy_id, category_id) VALUES (%s, %s)" % (
        policy_id, category_id)
    return DB.execution(DB.create, sqlstr)


# Run the loop once the import has finished
policy = findPolicy()
category = findCategory()

# The sentence encoder is only needed for pairs that fail the first
# similarity check; create it once, on first use, instead of once per pair.
embedder = None

# Compare every category against every policy statement
for j in category["data"]:
    m = [j["name"]]
    # Embedding of the category name, computed at most once per category.
    query_embedding = None
    for i in policy["data"]:
        n = [i["content"].decode(encoding='utf-8', errors='ignore')]  # policy statement
        result = xs.cossim(m, n)
        if result > 0.4:
            returnCategory(i["id"], j["id"])
        else:
            # Fall back to sentence-embedding similarity.
            if embedder is None:
                embedder = SBert()
            if query_embedding is None:
                query_embedding = embedder.encode(m)
            corpus_embeddings = embedder.encode(n)
            hits = semantic_search(query_embedding, corpus_embeddings)
            hits = hits[0]
            for hit in hits:
                if hit['score'] > 0.4:
                    returnCategory(i["id"], j["id"])
# Convert each returned value into a score via 1 - d**2 / 2 (for unit-length
# vectors this maps a Euclidean distance d to cosine similarity -- assumes
# ``scores`` holds such distances; confirm against the index configuration).
hits = [{'corpus_id': corpus_id, 'score': 1 - ((distance**2) / 2)}
        for corpus_id, distance in zip(corpus_ids, scores)]

end_time = time.time()

print("Input question:", inp_question)
print("Results (after {:.3f} seconds):".format(end_time - start_time))
for hit in hits[0:top_k_hits]:
    print("\t{:.3f}\t{}".format(hit['score'],
                                corpus_sentences[hit['corpus_id']]))

# Approximate Nearest Neighbor (ANN) is not exact, it might miss entries with
# high cosine similarity. Compute the recall of ANN compared to the exact
# results.
correct_hits = util.semantic_search(question_embedding,
                                    corpus_embeddings,
                                    top_k=top_k_hits)[0]
correct_hits_ids = {hit['corpus_id'] for hit in correct_hits}

# Compute recall
ann_corpus_ids = set(corpus_ids)
if len(ann_corpus_ids) != len(correct_hits_ids):
    print(
        "Approximate Nearest Neighbor returned a different number of results than expected"
    )

recall = len(ann_corpus_ids & correct_hits_ids) / len(correct_hits_ids)
print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(
    top_k_hits, recall * 100))
# Download the corpus on first use, load its paragraphs, embed them and run
# an interactive query loop.
corpus_filepath = 'wiki-programmming-20210101.jsonl.gz'
if not os.path.exists(corpus_filepath):
    util.http_get(
        'https://sbert.net/datasets/wiki-programmming-20210101.jsonl.gz',
        corpus_filepath)

# Explicit encoding: without it text mode falls back to the platform default,
# which breaks on non-ASCII paragraphs where that default is not UTF-8.
with gzip.open(corpus_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        data = json.loads(line.strip())
        title = data['title']
        for p in data['paragraphs']:
            if len(p) > 100:  # Only take paragraphs with more than 100 chars
                docs.append((title, p))

paragraph_emb = model.encode([d[1] for d in docs], convert_to_tensor=True)

print("Available Wikipedia Articles:")
print(", ".join(sorted({d[0] for d in docs})))

# Example for semantic search (loops until the process is interrupted)
while True:
    query = input("Query: ")
    query_emb = model.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_emb, paragraph_emb, top_k=3)[0]

    for hit in hits:
        doc = docs[hit['corpus_id']]
        print("{:.2f}\t{}\t\t{}".format(hit['score'], doc[0], doc[1]))

    print("\n=================\n")
all_sentences.append(date) for ind, sentence in enumerate(all_sentences): tot_sentences += 1 sentence_id = tot_sentences doc_id_to_sentences.get(doc_id).update({sentence_id: sentence}) sentence_id_to_doc_id.update({sentence_id: doc_id}) print(tot_sentences) print("ENCODING SENTENCES") corpus_embeddings = embedder.encode(all_sentences, convert_to_tensor=True, show_progress_bar=True) query = "man" # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity top_k = 5 print("QUERYING") query_embedding = embedder.encode(query, convert_to_tensor=True, show_progress_bar=True) print("RUNNING SEM SEARCH") cos_scores = util.semantic_search(query_embedding, corpus_embeddings)[0] print("DONE") # print(cos_scores) # for score in cos_scores: # print(lookup[score["corpus_id"]], score["score"])
def deploy(question):
    """Answer ``question`` from the passage corpus and render the result.

    The bi-encoder retrieves ``returns`` candidate passages; the QA model
    extracts an answer span from each one. Every passage is rendered with
    Streamlit with its answer highlighted, followed by a summary table.
    Nothing is returned.
    """
    tokenizer, model, bi_encoder = neuralqa()

    top_k = returns  # Number of passages we want to retrieve with the bi-encoder
    question_embedding = bi_encoder.encode(question, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings,
                                top_k=top_k)[0]

    matches = [passages[hit['corpus_id']] for hit in hits]
    ids = [passage_id[hit['corpus_id']] for hit in hits]
    scores = [hit['score'] for hit in hits]

    answers = []
    for match in matches:
        inputs = tokenizer.encode_plus(question,
                                       match,
                                       add_special_tokens=True,
                                       return_tensors="pt")
        input_ids = inputs["input_ids"].tolist()[0]

        answer_start_scores, answer_end_scores = model(**inputs)

        # Most likely beginning / end of the answer (argmax of the scores)
        answer_start = torch.argmax(answer_start_scores)
        answer_end = torch.argmax(answer_end_scores) + 1

        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(
                input_ids[answer_start:answer_end]))
        answers.append(answer)

    # generate result df
    df_results = pd.DataFrame({
        'PIMS_ID': ids,
        'answer': answers,
        'context': matches,
        "scores": scores
    })

    st.header("Retrieved Answers:")
    for index, row in df_results.iterrows():
        context = row['context']
        # An empty answer (end position before start) must not be highlighted:
        # str.replace('', x) would insert the markup between every character.
        if row['answer']:
            green = "<span class='highlight turquoise'>" + row[
                'answer'] + "<span class='bold'>Answer</span></span>"
            context = context.replace(row['answer'], green)
        st.markdown("<div>" + context + "</div>", unsafe_allow_html=True)
        st.write("")
        st.write("Relevance:", round(row['scores'], 2), "PIMS_ID:",
                 row['PIMS_ID'])
        st.write(
            "____________________________________________________________________"
        )

    df_results.set_index('PIMS_ID', inplace=True)
    st.header("Summary:")
    st.table(df_results)
top_k = min(5, len(s3)) #top_k = min(5, len(corpus)) for query in queries: query_embedding = model.encode(query, convert_to_tensor=True) # We use cosine-similarity and torch.topk to find the highest 5 scores cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings2)[0] top_results = torch.topk(cos_scores, k=top_k) print("\n\n======================\n\n") print("Query:", query) print("\nTop 5 most similar sentences in corpus:") # for score, idx in zip(top_results[0], top_results[1]): # print(s3[idx], "(Score: {:.4f})".format(score)) hits = util.semantic_search(query_embedding, corpus_embeddings2, top_k=5) hits = hits[0] #Get the hits for the first query for hit in hits: print(s3[hit['corpus_id']], "(Score: {:.4f})".format(hit['score'])) # Paraphrase Mining - finding texts with similar meaning for large colections of sentences 10000+ largecorpus =corpus_test['sentence_A'].unique() paraphrases = util.paraphrase_mining(model,largecorpus) df = pd.DataFrame.from_records(paraphrases) df[1] = [largecorpus[idx] for idx in df[1]] df[2] = [largecorpus[idx] for idx in df[2]] # df.to_csv("Paraphrase_Mining_pl.csv",index=False,header=["score","sentence1","sentence2"])
corpus_sentences = cache_data['sentences'][0:max_corpus_size] corpus_embeddings = cache_data['embeddings'][0:max_corpus_size] ############################### print("Corpus loaded with {} sentences / embeddings".format( len(corpus_sentences))) while True: inp_question = input("Please enter a question: ") print("Input question:", inp_question) #First, retrieve candidates using cosine similarity search start_time = time.time() question_embedding = model.encode(inp_question, convert_to_tensor=True) hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=num_candidates) hits = hits[0] #Get the hits for the first query print("Cosine-Similarity search took {:.3f} seconds".format(time.time() - start_time)) print("Top 5 hits with cosine-similarity:") for hit in hits[0:5]: print("\t{:.3f}\t{}".format(hit['score'], corpus_sentences[hit['corpus_id']])) #Now, do the re-ranking with the cross-encoder start_time = time.time() sentence_pairs = [[inp_question, corpus_sentences[hit['corpus_id']]] for hit in hits] ce_scores = cross_encoder_model.predict(sentence_pairs)