Esempio n. 1
0
    def search(self, question: str, top_k: int = 10):
        """Return the passages most similar to ``question`` using the bi-encoder.

        Args:
            question: Free-text query to embed and search for.
            top_k: Maximum number of hits to retrieve. Defaults to 10, the
                value that used to be hard-coded.

        Returns:
            A list of dicts in the order returned by ``util.semantic_search``,
            each with the keys ``title``, ``paragraph``, ``url`` and ``index``
            (the integer row position of the passage in ``self.wiki``).
        """
        # Encode the query with the bi-encoder and retrieve candidate passages.
        start_time = time.time()
        question_embedding = self.bi_encoder.encode(question,
                                                    convert_to_tensor=True)
        # semantic_search returns one hit list per query; there is one query.
        hits = util.semantic_search(question_embedding,
                                    self.corpus_embeddings_for_bi_encoder,
                                    top_k=top_k)[0]
        end_time = time.time()

        print("Input question:", question)
        print("Results (after {:.3f} seconds):".format(end_time - start_time))

        elements = []
        for hit in hits:
            index = hit['corpus_id']
            article = self.wiki.iloc[index]
            elements.append({
                "title": article['title'],
                "paragraph": article['paragraph'],
                "url": article['url'],
                # Cast to a built-in int so the value is not a numpy/tensor scalar.
                "index": int(index),
            })
        return elements
Esempio n. 2
0
    def retrieval(self, query):
        """Return up to five entries for ``query`` after pair-encoder re-ranking."""
        # Stage 1: embed the query and fetch the 100 nearest passages.
        query_vector = self.text_encoder.encode(query, convert_to_tensor=True)
        candidates = util.semantic_search(query_vector,
                                          self.passage_embeddings,
                                          top_k=100)[0]

        # Stage 2: score every (query, passage) pair with the pair encoder.
        pairs = [[query, self.entries[cand['corpus_id']]]
                 for cand in candidates]
        pair_scores = self.pair_encoder.predict(pairs)
        for position, pair_score in enumerate(pair_scores):
            candidates[position]['cross-score'] = pair_score

        # Keep the five best-scoring candidates, dropping near-zero scores.
        ranked = sorted(candidates,
                        key=lambda cand: cand['cross-score'],
                        reverse=True)
        return [
            self.entries[cand['corpus_id']] for cand in ranked[:5]
            if cand['cross-score'] > 1e-3
        ]
Esempio n. 3
0
    def search_n_articles(self, id_article, n=10):
        """Return ids of the articles most similar to a given article.

        The query text is the article's ``motcol`` and ``resume`` fields joined
        by a space; ranking is delegated to ``util.semantic_search`` against
        ``self.corpus_embeddings``.

        Parameters:
        -----------
        id_article: int, the id of the original article
        n: int, the maximum number of hits requested from the search
            (defaults to 10, the value that was previously used implicitly)
        Returns: List(int), list of ids of the most semantically similar
            articles; hits whose ``resume`` equals ";" are dropped, so fewer
            than ``n`` ids may be returned
        """
        # TODO: pre-process the imported text
        df_publis = self.data_utils.original_dataframe
        paper = df_publis[df_publis.id == id_article]
        text = paper.motcol + " " + paper.resume

        query_embedding = self.model.encode(text.values[0],
                                            convert_to_tensor=True)

        search_articles = util.semantic_search(query_embedding,
                                               self.corpus_embeddings,
                                               top_k=n)[0]
        matchs = []
        for hit in search_articles:
            related_paper = self.data_corpus.iloc[hit['corpus_id'], :]
            # Map the corpus row back to the original dataframe via ``motcol``;
            # filter once and reuse the result for both lookups below.
            candidates = df_publis[df_publis.motcol == related_paper['motcol']]
            if candidates.resume.values[0] != ";":
                matchs.append(candidates.id.values[0])

        return matchs
    def get_top_k(self, query: str, k=5) -> List[Dict]:
        """Find the ``k`` stored sentences closest to ``query``.

        ``load_embeddings`` or ``calc_embeddings`` must have been called first.

        Args:
            query (str): text to search similar sentences for
            k (int, optional): number of hits to request. Defaults to 5.

        Returns:
            List[Dict]: one dictionary per retained hit with the keys
            {
                ts: first field of the matching ``text_df`` entry, as a string
                score: similarity score, as a string
                text: second field of the matching ``text_df`` entry
            }
            Hits whose score is exactly 1 are left out.

        Raises:
            ValueError: if the embeddings have not been initialised.
        """
        if self.embeddings is None:
            raise ValueError(
                "embeddings are not initialized. Call load_embeddings or calc_embeddings first"
            )
        encoded_query = self.model.encode([query],
                                          convert_to_tensor=True,
                                          show_progress_bar=False)
        # Only one query was encoded, so only the first hit list is relevant.
        matches = util.semantic_search(encoded_query,
                                       self.embeddings,
                                       top_k=k)[0]
        result = []
        for match in matches:
            if match["score"] != 1:
                entry = self.text_df[match["corpus_id"]]
                result.append({
                    "ts": str(entry[0]),
                    "score": str(match["score"]),
                    "text": entry[1],
                })
        return result
Esempio n. 5
0
    def sample(self, paper_id, abstract, title):
        """Pick the corpus paper closest to the given paper.

        The abstract and title are joined with a space, cleaned with
        ``self.clean_text`` and embedded. The best search hit is returned
        unless it is the query paper itself, in which case the second-best
        hit is returned. The choice is deterministic.

        Args:
            paper_id (str): id of the paper used as the starting point
            abstract (str): abstract of the paper
            title (str)   : title of the paper

        Returns:
            str: id of the selected paper, taken from ``self.corpus_ids``.
        """
        cleaned_text = self.clean_text(abstract + ' ' + title)

        # Embed the query paper and rank the corpus against it.
        query_vector = self.model.encode(cleaned_text, convert_to_tensor=True)
        ranked = util.semantic_search(query_vector, self.corpus_embeddings)[0]

        best_id = self.corpus_ids[ranked[0]['corpus_id']]
        if best_id != paper_id:
            return str(best_id)

        # The top hit is the query paper itself: fall back to the runner-up.
        return str(self.corpus_ids[ranked[1]['corpus_id']])
def recommend_by_title_similarity(df: pd.DataFrame,
                                  user_id=None,
                                  n_titles: int = 5,
                                  top_n: int = 10):
    """Recommend book titles that are semantically similar to a user's titles.

    Embeddings for all unique titles are computed on the first call and kept
    in the module-level ``title_embeddings``; later calls reuse them.

    :param df: DataFrame with all available data
    :param user_id: User ID; when ``None`` a user is picked via ``select_user``
    :param n_titles: Number of titles to select from user history
    :param top_n: Number of similar titles to search for
    :return: List of recommended book titles (a list rather than a one-shot
        ``map`` so the caller can still iterate it after it was printed here)
    """
    global title_embeddings

    # Compare against None so that a falsy but valid ID (e.g. 0) is honoured.
    user = user_id if user_id is not None else select_user(df)

    query = get_user_data(df, user, titles_only=True)
    print(f'Top-{n_titles} books with highest ratings:')
    query = query[:n_titles]
    for title in query:
        print(title)

    # All unique content IDs and titles
    df = df[['Content_ID', 'Book-Title']].drop_duplicates().reset_index(drop=True).copy()

    # If the function was called for the 1st time,
    # we have to convert all titles into embeddings.
    if title_embeddings is None:
        # Pretrained NLP model; only needed when embeddings must be computed.
        model = SentenceTransformer('paraphrase-distilroberta-base-v1')
        title_embeddings = model.encode(df['Book-Title'].to_list(), convert_to_tensor=True)
        print(f'Book titles converted to embeddings. Shape: {title_embeddings.shape}')
        query_embeddings = model.encode(query, convert_to_tensor=True)
        print(f'Query titles converted to embeddings. Shape: {query_embeddings.shape}')

    # For repeated calls when embeddings are already available
    else:
        indexes = df[df['Book-Title'].isin(query)].index
        query_embeddings = title_embeddings[indexes]
        print(f'Embedding for query titles selected. Shape: {query_embeddings.shape}')

    # For every title in the original query extract indexes of similar titles
    recommended_ids = []
    result = util.semantic_search(query_embeddings, title_embeddings, top_k=top_n)
    for hits in result:
        for similar in hits:
            # Do not recommend identical titles
            # and title with low similarity score.
            if 0.75 < similar['score'] < 0.95:
                recommended_ids.append(similar['corpus_id'])

    # Replace indexes by respective titles. Materialised as a list: a lazy
    # ``map`` would be consumed by the print loop and returned empty.
    recommendations = [df.loc[idx, 'Book-Title'] for idx in recommended_ids]
    print('Semantically similar titles:')
    for title in recommendations:
        print(title)

    return recommendations
Esempio n. 7
0
def fetch_similar(queries: list,
                  top_k: int = 5,
                  threshold: float = 0.75) -> list[dict]:
    """Find knowledge-base phrases that are semantically close to each query.

    Args:
        queries: Phrases to look up; must be a ``list``.
        top_k: Maximum number of corpus hits to retrieve per query.
        threshold: Minimum similarity score (inclusive) for a hit to be kept.
            Both ``int`` and ``float`` values are accepted.

    Returns:
        A list of result dicts built from ``get_template()``, one per retained
        hit, with ``score`` (rounded to 5 decimals), ``phrase``, ``kb_phrase``,
        ``subindustry`` and ``method`` filled in. Empty when ``queries`` is
        empty or nothing reaches ``threshold``.

    Raises:
        ValueError: if an argument has an unexpected type.
    """
    if not isinstance(queries, list):
        raise ValueError(f'Expected type list for queries; got {type(queries)}')
    if not isinstance(top_k, int):
        raise ValueError(f'Expected type int for top_k; got {type(top_k)}')
    # Integers are valid thresholds too (e.g. ``threshold=1``).
    if not isinstance(threshold, (int, float)):
        raise ValueError(f'Expected type float for threshold; got {type(threshold)}')

    if not queries:
        return []

    query_embeddings = embedder.encode(queries, convert_to_tensor=True)
    # One list of hits per query, in the same order as ``queries``.
    hits = util.semantic_search(query_embeddings, corpus_embeddings, top_k=top_k)

    res_all = []
    for phrase, phrase_hits in zip(queries, hits):
        for hit in phrase_hits:
            score = hit['score']
            if score >= threshold:
                res = get_template()
                res['score'] = round(score, 5)
                res['phrase'] = phrase
                res['kb_phrase'] = corpus[hit['corpus_id']]
                # NOTE(review): ``sub_ind`` is read from module scope - confirm
                # it is defined before this function is called.
                res['subindustry'] = sub_ind
                res['method'] = 'semantic'
                res_all.append(res)

    return res_all
Esempio n. 8
0
    def test_semantic_search(self):
        """Check util.semantic_search against a brute-force cosine top-k."""
        query_count = 20
        k = 10

        corpus_vectors = torch.tensor(np.random.randn(1000, 100))
        query_vectors = torch.tensor(np.random.randn(query_count, 100))

        # Small chunk sizes force the chunked code path to be exercised.
        hits = util.semantic_search(query_vectors,
                                    corpus_vectors,
                                    top_k=k,
                                    query_chunk_size=5,
                                    corpus_chunk_size=17)
        assert len(hits) == query_count
        assert len(hits[0]) == k

        # Reference ranking computed directly from the full similarity matrix.
        similarity = util.pytorch_cos_sim(query_vectors, corpus_vectors)
        expected_scores, expected_ids = similarity.topk(k)
        expected_scores = expected_scores.cpu().tolist()
        expected_ids = expected_ids.cpu().tolist()

        for qid in range(query_count):
            for rank in range(k):
                hit = hits[qid][rank]
                assert hit['corpus_id'] == expected_ids[qid][rank]
                assert np.abs(hit['score'] -
                              expected_scores[qid][rank]) < 0.001
Esempio n. 9
0
    def search(self, question: str, corpus: str, top_k: int) -> pd.DataFrame:
        """Retrieve ``top_k`` passages from a fitted corpus and re-rank them.

        Candidates come from ``util.semantic_search``; each one is then scored
        by the cross-encoder. The returned frame has one row per candidate,
        sorted by descending ``cross-score``, and also carries a ``snippet``
        column with the passage text (newlines replaced by spaces).
        """
        assert (
            corpus in self.corpus_dict
        ), "Corpus not found, please fit the corpus first using the .fit() call"
        documents = self.corpus_dict[corpus]

        query_vector = self.encoder.encode(question, convert_to_tensor=True)
        candidates = util.semantic_search(query_vector,
                                          self.corpus_embeddings_dict[corpus],
                                          top_k=top_k).pop()

        # Score every (question, passage) pair with the cross-encoder.
        pairs = [[question, documents[cand["corpus_id"]]]
                 for cand in candidates]
        relevance = self.cross_encoder.predict(pairs)

        # Attach the score and a single-line snippet to each candidate.
        for position, value in enumerate(relevance):
            candidate = candidates[position]
            candidate["cross-score"] = value
            candidate["snippet"] = documents[candidate["corpus_id"]].replace(
                "\n", " ")

        ranked = sorted(candidates,
                        key=lambda cand: cand["cross-score"],
                        reverse=True)
        return pd.DataFrame(ranked)
Esempio n. 10
0
    def get_top_k(self, query: str, k=5) -> List[Dict]:
        r"""Find the ``k`` stored sentences closest to ``query``.

        ``load_embeddings`` or ``calc_embeddings`` must have been called first.
        When ``k`` exceeds the number of stored embeddings a warning is issued
        and ``k`` is reduced to that number.

        Args:
            query (str): text to search similar sentences for
            k (int, optional): number of sentences to find. Defaults to 5.

        Returns:
            List[Dict]: one dictionary per hit with the keys
            {
                ts: first field of the matching ``text_df`` entry, as a string
                score: similarity score, as a string
                text: second field of the matching ``text_df`` entry
            }

        Raises:
            ValueError: if the embeddings have not been initialised.

        Example 1: calculate embeddings, save them and get the top 5 sentences ::
            >>> df = pd.read_csv("data/prepared/edu_courses.tsv", sep="\t")
            >>> engine = SemanticEngine(text_df=df)
            >>> engine.calc_embeddings(df.text.tolist())
            >>> engine.save_embeddings("data/embeddings/edu_courses.pkl")
            >>> query = "please recommend some courses on pytorch"
            >>> result = engine.get_top_k(query, k=5)
            >>> for res in result:
            ...     print(res["ts"], res["text"], res["score"], sep="\n")

        Example 2: load embeddings from a file and get the top 5 sentences ::
            >>> df = pd.read_csv("data/prepared/edu_courses.tsv", sep="\t")
            >>> engine = SemanticEngine(text_df=df)
            >>> engine.load_embeddings("data/embeddings/edu_courses.pkl")
            >>> query = "please recommend some courses on pytorch"
            >>> result = engine.get_top_k(query, k=5)
            >>> for res in result:
            ...     print(res["ts"], res["text"], res["score"], sep="\n")
        """
        if self.embeddings is None:
            raise ValueError(
                "embeddings are not initialized. Call `load_embeddings` or `calc_embeddings` first"
            )
        if k > len(self.embeddings):
            warnings.warn(f"""`k` with value of {k} is bigger then number of 
                sentences with value of {len(self.embeddings)}.
                Value of k is set to {len(self.embeddings)}
                """)
            k = len(self.embeddings)

        encoded_query = self.model.encode([query],
                                          convert_to_tensor=True,
                                          show_progress_bar=False)
        # Only one query was encoded, so only the first hit list is relevant.
        matches = util.semantic_search(encoded_query,
                                       self.embeddings,
                                       top_k=k)[0]
        result = []
        for match in matches:
            entry = self.text_df[match["corpus_id"]]
            result.append({
                "ts": str(entry[0]),
                "score": str(match["score"]),
                "text": entry[1],
            })
        return result
Esempio n. 11
0
def search_papers(title, abstract, top_k=25):
    """Print the papers most similar to the given title and abstract.

    Args:
        title: Title of the query paper.
        abstract: Abstract of the query paper.
        top_k: Number of similar papers to show. Defaults to 25, the value
            that used to be hard-coded.
    """
    query_embedding = model.encode(title + ' ' + abstract,
                                   convert_to_tensor=True)

    # One query was encoded, so take the first (only) list of hits.
    search_hits = util.semantic_search(query_embedding,
                                       corpus_embeddings,
                                       top_k=top_k)[0]

    print("Paper:", title)
    print("Most similar papers:")
    for hit in search_hits:
        related_paper = papers[hit['corpus_id']]
        print("{:.2f}\t{}".format(hit['score'], related_paper['title']))
Esempio n. 12
0
def search(query, k=3):
    """Print the names of the ``k`` images ranked closest to ``query``."""
    # Per the original note, the query may be either an image or a text string.
    encoded_query = model.encode([query],
                                 convert_to_tensor=True,
                                 show_progress_bar=False)

    # Rank all image embeddings against the query and keep the best ``k``;
    # a single query was encoded, so the first hit list is the one we want.
    ranked = util.semantic_search(encoded_query, img_emb, top_k=k)
    best = ranked[0]

    print("Query:")
    for match in best:
        image_name = img_names[match['corpus_id']]
        print(image_name)
def search_papers(title, abstract):
    """Print score, title, venue and year of the papers closest to the query."""
    combined_text = title + ' ' + abstract
    encoded_query = model.encode(combined_text, convert_to_tensor=True)

    # One query in, so the first hit list holds all results.
    ranked = util.semantic_search(encoded_query, corpus_embeddings)[0]

    print("\n\nPaper:", title)
    print("Most similar papers:")
    for match in ranked:
        paper = papers[match['corpus_id']]
        line = "{:.2f}\t{}\t{} {}".format(match['score'], paper['title'],
                                          paper['venue'], paper['year'])
        print(line)
Esempio n. 14
0
async def main(req: func.HttpRequest) -> func.HttpResponse:
    """Rank the stored questions by semantic similarity to the ``query`` parameter.

    ``query`` is read from the query string or, failing that, from the JSON
    body. Without a query the function answers with a usage message and does
    no further work. Otherwise it loads the questions from the GraphQL
    endpoint, filters them with ``filterQuestions``, and returns a JSON list
    of those questions (without their ``embedding`` field) with a ``score``.
    """
    query = req.params.get('query')
    if not query:
        try:
            req_body = req.get_json()
        except ValueError:
            # No usable JSON body: fall through to the usage message below.
            pass
        else:
            query = req_body.get('query')

    if not query:
        return func.HttpResponse(
            "This HTTP triggered function executed successfully. Pass a name in the query string or in the request body for a personalized response.",
            status_code=200
        )

    # Only hit the GraphQL service once we know there is something to search.
    client = GraphqlClient(endpoint="http://localhost:4000/graphql")

    graphql_query = """
        query {
            questions {
                id
                name
                text
                embedding
            }
        }
        """

    result = list(filter(filterQuestions, client.execute(
        query=graphql_query)["data"]["questions"]))
    if not result:
        # An empty corpus cannot be stacked into a tensor; nothing can match.
        return func.HttpResponse(json.dumps([]))

    corpus_embeddings = [torch.FloatTensor(
        o["embedding"]) for o in result]

    embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    searched = util.semantic_search(query_embedding, corpus_embeddings)[0]

    ret = []
    for hit in searched:
        # Copy everything except the (large) embedding into the response item.
        item = {key: value
                for key, value in result[hit["corpus_id"]].items()
                if key != "embedding"}
        # float() works for both Python floats and numpy scalars, whereas
        # ``.astype`` only exists on the latter.
        item["score"] = float(hit["score"])
        ret.append(item)

    return func.HttpResponse(json.dumps(ret))
Esempio n. 15
0
    def get_closest(self,
                    conjecture_embedding,
                    statements_embedding,
                    score_function=util.cos_sim):
        """Rank statement titles by similarity to a conjecture embedding.

        ``statements_embedding`` maps a title to its embedding. The keys of
        the (at most 30000) best hits for the first query are returned in the
        order produced by ``util.semantic_search``.
        """
        titles = list(statements_embedding.keys())
        vectors = list(statements_embedding.values())

        ranked = util.semantic_search(
            conjecture_embedding,
            vectors,
            score_function=score_function,
            top_k=30000,
        )

        # ``corpus_id`` is a position in ``vectors``, hence also in ``titles``.
        return [titles[hit["corpus_id"]] for hit in ranked[0]]
Esempio n. 16
0
    def get_top_k(self, query, top_k):
        """Return ``ts``, ``text`` and ``score`` for the ``top_k`` closest rows.

        Rows are looked up by position in ``self.corpus_df``. An empty list is
        returned when the search yields no result lists at all.
        """
        encoded_query = self.model.encode(query, convert_to_tensor=True)
        ranked = util.semantic_search(query_embeddings=encoded_query,
                                      corpus_embeddings=self.embeddings,
                                      top_k=top_k)
        if not ranked:
            return []

        def as_record(hit):
            # Combine the stored row with the similarity score of the hit.
            row = self.corpus_df.iloc[hit['corpus_id']]
            return {
                'ts': row['ts'],
                'text': row['text'],
                'score': hit['score'],
            }

        return [as_record(hit) for hit in ranked[0]]
Esempio n. 17
0
 def handleQuestion(question):
     """Return the best matches for ``question`` as a JSON array of records.

     The question is embedded with the module-level ``model`` and searched in
     ``embeddings``; the hits (``corpus_id``, ``score``) are inner-joined with
     ``db`` on its index and serialised with ``orient="records"``.

     Side effect: the module-level ``db`` gets its ``question`` column
     overwritten with ``question``, so that value appears in every record.
     """
     db["question"] = question
     q_embedding = model.encode(question,
                                convert_to_tensor=True,
                                show_progress_bar=True)
     s_results = util.semantic_search(q_embedding, embeddings, top_k=top_k)
     results = pd.DataFrame(s_results[0], columns=["corpus_id", "score"])
     # Optional re-ranking with the cross-encoder (currently disabled):
     # cross_inp = db.iloc[results.corpus_id][["question","Overview"]].to_numpy().tolist()
     # cross_sco=cross_encoder.predict(cross_inp, show_progress_bar=True)
     # results['cross_score']=cross_sco
     # results = results.sort_values("cross_score", ascending=True)

     merged = pd.merge(results,
                       db,
                       "inner",
                       left_on="corpus_id",
                       right_index=True)
     # "records" is the valid orient name; the previous "record" is rejected
     # by pandas with a ValueError.
     return merged.to_json(orient="records")
Esempio n. 18
0
def _get_relevant_comments_helper(comments, query, query_embedding,
                                  corpus_embeddings):
    """Return up to ten search hits for ``query``, re-ranked by a cross-encoder.

    Each returned hit dict carries the original ``corpus_id`` and ``score``
    plus a ``cross-score`` entry; hits are sorted by descending
    ``cross-score``.
    """
    candidates = util.semantic_search(query_embedding,
                                      corpus_embeddings,
                                      top_k=10)[0]

    reranker = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

    # Score every (query, comment) pair and store the score on its hit.
    pairs = [[query, comments[cand['corpus_id']]] for cand in candidates]
    pair_scores = reranker.predict(pairs)
    for position, value in enumerate(pair_scores):
        candidates[position]['cross-score'] = value

    ranked = sorted(candidates,
                    key=lambda cand: cand['cross-score'],
                    reverse=True)
    return ranked[:10]
Esempio n. 19
0
    def fluid_search(self, query, considered_candidates=50, selected_candidates=5, second_pass=True):
        """Return the filenames of the entries that best match ``query``.

        ``considered_candidates`` entries are retrieved by embedding search.
        With ``second_pass`` they are re-scored by the pair encoder and only
        those scoring above 1e-3 are kept. At most ``selected_candidates``
        filenames are returned (never more than were considered). If the
        essence is not ready, a single-element list with an error message is
        returned instead.
        """
        self.load_essence()

        # Deliberately an equality test: a value such as None must not be
        # treated as "not ready" here.
        if self.essence_ready == False:
            return ['The essence is not present at the required location.']

        self.sync_cache()
        keep = min(selected_candidates, considered_candidates)
        query_vector = self.text_encoder.encode(query, convert_to_tensor=True)
        corpus_tensor = torch.Tensor(self.entry_embeddings)
        candidates = util.semantic_search(query_vector, corpus_tensor, top_k=considered_candidates)[0]

        if not second_pass:
            return [self.entry_filenames[cand['corpus_id']] for cand in candidates[:keep]]

        # Second pass: re-score each (query, entry content) pair.
        pairs = [[query, self.entry_contents[cand['corpus_id']]] for cand in candidates]
        pair_scores = self.pair_encoder.predict(pairs)
        for position, value in enumerate(pair_scores):
            candidates[position]['cross-score'] = value

        ranked = sorted(candidates, key=lambda cand: cand['cross-score'], reverse=True)
        return [
            self.entry_filenames[cand['corpus_id']]
            for cand in ranked[:keep]
            if cand['cross-score'] > 1e-3
        ]
def retrieve_top_k_similar_issues(model, issues, embeddings, description,
                                  top_k):
    """Return the ``top_k`` issues most similar to ``description``.

    ``issues`` is indexed by position (``iloc``) with the ``corpus_id`` of
    each hit. Every result is a dict with the keys ``id``, ``description``
    and ``similarity``, converted to built-in ``int``/``str``/``float``.
    Progress and elapsed time are printed.
    """
    print('Retrieving top-{} similar issues...'.format(top_k))
    started = time.time()

    encoded = model.encode(description, convert_to_tensor=True)
    ranked = util.semantic_search(encoded, embeddings, top_k=top_k)[0]

    def as_issue(hit):
        # Look the hit up by row position and normalise the value types.
        row = issues.iloc[hit['corpus_id']]
        return {
            'id': int(row['bug_id']),
            'description': str(row['full_description']),
            'similarity': float(hit['score'])
        }

    similar_issues = [as_issue(hit) for hit in ranked]

    elapsed = time.time() - started
    print('Retrieved top-{} similar issues in {} seconds'.format(
        top_k, round(elapsed, 5)))
    return similar_issues
def recommend_by_similar_users(user_id: int,
                               df: pd.DataFrame,
                               embeddings: pd.DataFrame,
                               top_k: int = 11) -> list:
    """Recommend books taken from the histories of the most similar users.

    The row of ``embeddings`` labelled ``user_id`` is used as the query
    against all rows of ``embeddings``. The first hit is discarded as the
    assumed self-match, and the remaining users are passed to
    ``select_unread_books`` together with the books the query user has read.

    :param user_id: Integer ID for the user
    :param df: DataFrame with all available data
    :param embeddings: DataFrame of embeddings for all users, indexed by user ID
    :param top_k: Maximum number of similar user vectors to search
        (including the self-match)
    :return: Whatever ``select_unread_books`` returns for the similar users
    """
    # Vector of the query user and what that user has already read.
    query = embeddings.loc[user_id, :].values
    authors, read_books = get_user_data(df, user_id)
    print(f'User {user_id} read authors:', authors)

    # Hits for the single query, as a frame with 'corpus_id' and 'score'.
    hits = util.semantic_search(query, embeddings.values, top_k=top_k)
    similar_users = pd.DataFrame(hits[0])

    # 'corpus_id' is a row position in ``embeddings``; translate it to the ID.
    matched_rows = embeddings.iloc[similar_users['corpus_id'], :]
    similar_users['User-ID'] = matched_rows.index
    print(f'Similar users:\n{similar_users}')

    # Skip the first row: it is expected to be the query user itself
    # (self-match with similarity score 1.0).
    neighbours = similar_users['User-ID'].iloc[1:]

    recommendations = select_unread_books(df, neighbours, read_books)
    display_recommendations(recommendations)

    return recommendations
Esempio n. 22
0
def search(query):
    """Print the top-3 bi-encoder hits and the top-3 cross-encoder hits.

    The query embedding is moved to the GPU with ``.cuda()`` before the
    search, so a CUDA device is required.
    """
    print("Input question:", query)

    # --- Retrieval: embed the query and fetch ``top_k`` candidate passages.
    encoded_query = bi_encoder.encode(query, convert_to_tensor=True).cuda()
    candidates = util.semantic_search(encoded_query,
                                      corpus_embeddings,
                                      top_k=top_k)[0]

    # --- Re-ranking: score each (query, passage) pair with the cross-encoder.
    pairs = [[query, passages[cand['corpus_id']]] for cand in candidates]
    pair_scores = cross_encoder.predict(pairs)
    for position, value in enumerate(pair_scores):
        candidates[position]['cross-score'] = value

    def show(ranked, score_key):
        # Print the three best hits with newlines flattened to spaces.
        for cand in ranked[0:3]:
            print("\t{:.3f}\t{}".format(
                cand[score_key], passages[cand['corpus_id']].replace("\n", " ")))

    print("\n-------------------------\n")
    print("Top-3 Bi-Encoder Retrieval hits")
    by_score = sorted(candidates, key=lambda cand: cand['score'], reverse=True)
    show(by_score, 'score')

    print("\n-------------------------\n")
    print("Top-3 Cross-Encoder Re-ranker hits")
    # Sort the score-ordered list so ties keep the bi-encoder order.
    by_cross_score = sorted(by_score,
                            key=lambda cand: cand['cross-score'],
                            reverse=True)
    show(by_cross_score, 'cross-score')
Esempio n. 23
0
    def search(self, embds, index, q_embedding, top_k_hits=5):
        '''
        Query an index, print the hits and the recall against an exact search.

        embds: corpus embeddings used for the exact reference search
        index: the faiss index used for the search
        q_embedding: embedding of the query
        top_k_hits: number of hits to output

        Returns the list of ``url`` values of the printed hits.
        '''
        # The vector is scaled to unit length before the lookup; per the
        # original note, FAISS works with inner product, which then equals
        # cosine similarity.
        unit_query = q_embedding / np.linalg.norm(q_embedding)
        q_embedding = np.expand_dims(unit_query, axis=0)

        # The index returns one row of distances and one row of corpus ids.
        distances, corpus_ids = index.search(q_embedding, top_k_hits)

        hits = []
        for corpus_id, score in zip(corpus_ids[0], distances[0]):
            hits.append({'corpus_id': corpus_id, 'score': score})
        hits = sorted(hits, key=lambda hit: hit['score'], reverse=True)

        top_urls = []
        for rank, hit in enumerate(hits[0:top_k_hits], start=1):
            id_num = self.df.index[hit['corpus_id']]
            print("{}\t{:.3f}\t{}".format(rank, hit['score'], id_num))
            item = self.df.iloc[hit['corpus_id']]
            print(f"{item['descrption']}\t{item['brand']}\t{item['price']}\n{item['url']}")
            top_urls.append(item['url'])
            print()

        # Exact search over all embeddings, used as ground truth for recall.
        correct_hits = util.semantic_search(q_embedding, embds, top_k=top_k_hits)[0]
        correct_hits_ids = {hit['corpus_id'] for hit in correct_hits}
        ann_corpus_ids = {hit['corpus_id'] for hit in hits}
        if len(ann_corpus_ids) != len(correct_hits_ids):
            print("Approximate Nearest Neighbor returned a different number of results than expected")

        overlap = ann_corpus_ids.intersection(correct_hits_ids)
        recall = len(overlap) / len(correct_hits_ids)
        print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))

        return top_urls
# Store the category assigned to a policy back into the database
def returnCategory(policy_id, category_id):
    """Insert a (policy_id, category_id) row into ``policy_category``."""
    # NOTE(review): the statement is built by string interpolation; if the
    # DB helper supports bound parameters they would be safer - confirm.
    values = (policy_id, category_id)
    sqlstr = "insert into policy_category (policy_id, category_id) VALUES (%s, %s)" % values
    return DB.execution(DB.create, sqlstr)


# Run this loop once the import has finished
policy = findPolicy()
category = findCategory()

# The SBert encoder is only needed for the fallback comparison below, so it
# is created on first use and then reused instead of being rebuilt for every
# policy statement.
embedder = None

# Compare every category against every policy statement, one by one
for j in category["data"]:
    m = [j["name"]]
    # Embedding of the category name; computed at most once per category.
    query_embedding = None

    for i in policy["data"]:
        n = [i["content"].decode(encoding='utf-8', errors='ignore')]  # policy statement
        result = xs.cossim(m, n)

        if result > 0.4:
            returnCategory(i["id"], j["id"])
        else:
            # First similarity was too low: fall back to an embedding search.
            if embedder is None:
                embedder = SBert()
            if query_embedding is None:
                query_embedding = embedder.encode(m)
            corpus_embeddings = embedder.encode(n)
            hits = semantic_search(query_embedding, corpus_embeddings)
            hits = hits[0]
            for hit in hits:
                if hit['score'] > 0.4:
                    returnCategory(i["id"], j["id"])
    # NOTE(review): this fragment's enclosing definition is not visible here;
    # `corpus_ids`, `scores`, `start_time`, `inp_question`, `top_k_hits`,
    # `corpus_sentences`, `question_embedding` and `corpus_embeddings` are
    # expected to be defined before this point.
    hits = []
    for id, score in zip(corpus_ids, scores):
        # Presumably `score` is a Euclidean distance between unit-length
        # vectors, for which 1 - d^2 / 2 equals the cosine similarity - TODO confirm.
        hits.append({'corpus_id': id, 'score': 1 - ((score**2) / 2)})

    end_time = time.time()

    print("Input question:", inp_question)
    print("Results (after {:.3f} seconds):".format(end_time - start_time))
    for hit in hits[0:top_k_hits]:
        print("\t{:.3f}\t{}".format(hit['score'],
                                    corpus_sentences[hit['corpus_id']]))

    # Approximate Nearest Neighbor (ANN) is not exact, it might miss entries with high cosine similarity
    # Here, we compute the recall of ANN compared to the exact results
    correct_hits = util.semantic_search(question_embedding,
                                        corpus_embeddings,
                                        top_k=top_k_hits)[0]
    correct_hits_ids = set([hit['corpus_id'] for hit in correct_hits])

    # Compute recall: share of the exact top-k ids that the ANN search also returned
    ann_corpus_ids = set(corpus_ids)
    if len(ann_corpus_ids) != len(correct_hits_ids):
        print(
            "Approximate Nearest Neighbor returned a different number of results than expected"
        )

    recall = len(
        ann_corpus_ids.intersection(correct_hits_ids)) / len(correct_hits_ids)
    print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(
        top_k_hits, recall * 100))
# Download the corpus on first use. The triple "m" in the file name matches
# the name used in the download URL and must be kept.
corpus_filepath = 'wiki-programmming-20210101.jsonl.gz'
if not os.path.exists(corpus_filepath):
    util.http_get(
        'https://sbert.net/datasets/wiki-programmming-20210101.jsonl.gz',
        corpus_filepath)

# Read with an explicit encoding so decoding does not depend on the
# platform's default locale.
with gzip.open(corpus_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        data = json.loads(line.strip())
        title = data['title']
        for p in data['paragraphs']:
            if len(p) > 100:  # Only take paragraphs longer than 100 chars
                docs.append((title, p))

# Embed the paragraph text (second element of each (title, paragraph) pair).
paragraph_emb = model.encode([d[1] for d in docs], convert_to_tensor=True)

print("Available Wikipedia Articles:")
print(", ".join(sorted({d[0] for d in docs})))

# Example for semantic search: prompt for queries until interrupted
while True:
    query = input("Query: ")
    query_emb = model.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_emb, paragraph_emb, top_k=3)[0]

    for hit in hits:
        doc = docs[hit['corpus_id']]
        print("{:.2f}\t{}\t\t{}".format(hit['score'], doc[0], doc[1]))

    print("\n=================\n")
Esempio n. 27
0
    # NOTE(review): the enclosing block of this fragment is not visible here;
    # `all_sentences`, `date`, `tot_sentences`, `doc_id`,
    # `doc_id_to_sentences` and `sentence_id_to_doc_id` are expected to exist.
    all_sentences.append(date)

    # Give every sentence a running, 1-step-incremented id and record it in
    # both lookup tables (doc -> sentences and sentence -> doc).
    for ind, sentence in enumerate(all_sentences):
        tot_sentences += 1
        sentence_id = tot_sentences

        doc_id_to_sentences.get(doc_id).update({sentence_id: sentence})
        sentence_id_to_doc_id.update({sentence_id: doc_id})

print(tot_sentences)

print("ENCODING SENTENCES")
corpus_embeddings = embedder.encode(all_sentences,
                                    convert_to_tensor=True,
                                    show_progress_bar=True)

query = "man"

# Find the closest 5 sentences of the corpus for the query
top_k = 5
print("QUERYING")
query_embedding = embedder.encode(query,
                                  convert_to_tensor=True,
                                  show_progress_bar=True)

print("RUNNING SEM SEARCH")
# Pass top_k explicitly; otherwise the library default is used and the
# setting above has no effect.
cos_scores = util.semantic_search(query_embedding,
                                  corpus_embeddings,
                                  top_k=top_k)[0]
print("DONE")
# print(cos_scores)
# for score in cos_scores:
#     print(lookup[score["corpus_id"]], score["score"])
Esempio n. 28
0
def _extract_answer(tokenizer, model, question, passage):
    """Return the answer span the QA model selects in *passage* for *question*.

    The span runs from the highest-scoring start token to the highest-scoring
    end token (inclusive). The result is an empty string when the predicted
    end lies before the predicted start.
    """
    inputs = tokenizer.encode_plus(question,
                                   passage,
                                   add_special_tokens=True,
                                   return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
    # Index instead of tuple-unpacking: recent transformers releases return a
    # ModelOutput, and iterating that yields field names rather than tensors.
    # Integer indexing works for both ModelOutput and plain tuples.
    answer_start_scores, answer_end_scores = outputs[0], outputs[1]

    # Most likely beginning of the answer, and one past its most likely end.
    answer_start = int(torch.argmax(answer_start_scores))
    answer_end = int(torch.argmax(answer_end_scores)) + 1

    return tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))


def deploy(question):
    """Retrieve passages for *question*, extract answers and render them.

    Uses the bi-encoder from ``neuralqa()`` to fetch the ``returns`` most
    similar passages out of ``corpus_embeddings``, runs the QA model on each
    passage, and writes the highlighted contexts plus a summary table through
    the ``st`` module (presumably Streamlit -- confirm against the imports).

    Args:
        question: The question text to search for and answer.

    Returns:
        None. All output goes through ``st``.
    """
    tokenizer, model, bi_encoder = neuralqa()
    top_k = returns  # Number of passages we want to retrieve with the bi-encoder
    question_embedding = bi_encoder.encode(question, convert_to_tensor=True)

    # One hit list per query is returned; only a single question was passed.
    hits = util.semantic_search(question_embedding,
                                corpus_embeddings,
                                top_k=top_k)[0]

    matches = [passages[hit['corpus_id']] for hit in hits]
    ids = [passage_id[hit['corpus_id']] for hit in hits]
    scores = [hit['score'] for hit in hits]
    answers = [
        _extract_answer(tokenizer, model, question, match)
        for match in matches
    ]

    # generate result df
    df_results = pd.DataFrame({
        'PIMS_ID': ids,
        'answer': answers,
        'context': matches,
        "scores": scores
    })

    st.header("Retrieved Answers:")
    for _, row in df_results.iterrows():
        answer = row['answer']
        context = row['context']
        # Highlight only a non-empty answer: str.replace('', x) would insert
        # the markup between every character of the context.
        if answer:
            green = ("<span class='highlight turquoise'>" + answer +
                     "<span class='bold'>Answer</span></span>")
            context = context.replace(answer, green)
        st.markdown("<div>" + context + "</div>", unsafe_allow_html=True)
        st.write("")
        st.write("Relevance:", round(row['scores'], 2), "PIMS_ID:",
                 row['PIMS_ID'])
        st.write(
            "____________________________________________________________________"
        )

    df_results.set_index('PIMS_ID', inplace=True)
    st.header("Summary:")
    st.table(df_results)
Esempio n. 29
0
# Never request more hits than the corpus has sentences.
top_k = min(5, len(s3))
for query in queries:
    query_embedding = model.encode(query, convert_to_tensor=True)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    # semantic_search scores the query against the corpus embeddings and
    # returns the best top_k hits, so no separate cosine-similarity / topk
    # pass is needed here.
    hits = util.semantic_search(query_embedding,
                                corpus_embeddings2,
                                top_k=top_k)
    hits = hits[0]  # Get the hits for the first query
    for hit in hits:
        print(s3[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))


# Paraphrase Mining - finding texts with similar meaning for large collections of sentences (10000+)
largecorpus = corpus_test['sentence_A'].unique()
paraphrases = util.paraphrase_mining(model, largecorpus)

# Each record's first field becomes column 0; columns 1 and 2 hold indices
# into largecorpus and are replaced with the sentences they point to.
df = pd.DataFrame.from_records(paraphrases)
df[1] = [largecorpus[idx] for idx in df[1]]
df[2] = [largecorpus[idx] for idx in df[2]]
# df.to_csv("Paraphrase_Mining_pl.csv",index=False,header=["score","sentence1","sentence2"])

Esempio n. 30
0
        corpus_sentences = cache_data['sentences'][0:max_corpus_size]
        corpus_embeddings = cache_data['embeddings'][0:max_corpus_size]

###############################
print("Corpus loaded with {} sentences / embeddings".format(
    len(corpus_sentences)))

while True:
    inp_question = input("Please enter a question: ")
    print("Input question:", inp_question)

    #First, retrieve candidates using cosine similarity search
    start_time = time.time()
    question_embedding = model.encode(inp_question, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding,
                                corpus_embeddings,
                                top_k=num_candidates)
    hits = hits[0]  #Get the hits for the first query

    print("Cosine-Similarity search took {:.3f} seconds".format(time.time() -
                                                                start_time))
    print("Top 5 hits with cosine-similarity:")
    for hit in hits[0:5]:
        print("\t{:.3f}\t{}".format(hit['score'],
                                    corpus_sentences[hit['corpus_id']]))

    #Now, do the re-ranking with the cross-encoder
    start_time = time.time()
    sentence_pairs = [[inp_question, corpus_sentences[hit['corpus_id']]]
                      for hit in hits]
    ce_scores = cross_encoder_model.predict(sentence_pairs)