def main():
    cwd = Path(__file__).parent

    if not os.path.exists(os.path.join(cwd, "post_dataset.pkl")):
        print("Downloading sample data of TopDup")
        urllib.request.urlretrieve(
            "https://storage.googleapis.com/topdup/dataset/post_dataset.pkl",
            os.path.join(cwd, "post_dataset.pkl"),
        )

    print("Preprocessing documents")
    processor = ViPreProcessor()
    data = pickle.load(open(os.path.join(cwd, "post_dataset.pkl"), "rb"))
    docs = list()
    for d in tqdm(data[:1000]):
        content, meta = data_prep(jsonpickle.loads(d))
        doc = processor.clean({"text": content})
        for m in meta.keys():
            if isinstance(meta[m], list):  # serialize list
                meta[m] = "|".join(meta[m])
        doc["meta"] = meta
        docs.append(doc)

    print("Ingesting data to SQLite database")
    db_path = os.path.join(cwd, "topdup.db")
    if os.path.exists(db_path):
        os.remove(db_path)
    with sqlite3.connect(db_path):
        document_store = FAISSDocumentStore(sql_url=f"sqlite:///{db_path}")
        document_store.write_documents(docs)
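# --- Minimal sketch of the imports and entry point main() above relies on. ---
# The standard-library and third-party imports below are standard; ViPreProcessor,
# FAISSDocumentStore and data_prep are project-local, and their exact module paths
# are not shown in this snippet, so the commented placeholders are assumptions.
import os
import pickle
import sqlite3
import urllib.request
from pathlib import Path

import jsonpickle
from tqdm import tqdm

# from <project preprocessor module> import ViPreProcessor    # assumed project import
# from <project document store module> import FAISSDocumentStore  # assumed project import
# from <project utils module> import data_prep                # assumed project import

if __name__ == "__main__":
    main()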
def test_not_implemented_splitby_in_split(srsb):
    """Test that an invalid 'split_by' value raises NotImplementedError,
    for either value of 'split_respect_sentence_boundary'.
    """
    processor = ViPreProcessor(
        split_by="dummy_string", split_respect_sentence_boundary=srsb
    )
    assert_raises(lambda: processor.split({"text": "dummy"}), NotImplementedError)
def test_basic_cleaning_2():
    """Test that the output of clean is a dict containing the key 'text'"""
    processor = ViPreProcessor()
    text = (
        "Cho đến thời điểm này, có thể nói, "
        + "Klopp là một trong những đối thủ lớn nhất của Mourinho."
    )
    out = processor.clean({"text": text})
    assert isinstance(out, dict) and "text" in out
def batch_retrieve(
    self,
    query_docs,
    top_k_candidates=10,
    processe_query_docs=False,
    index=None,
    filters=None,
):
    """Retrieve the most similar stored document for each query document.

    Args:
        query_docs: Query documents as raw text strings.
        top_k_candidates (int, optional): Number of candidates fetched per query. Defaults to 10.
        processe_query_docs (bool, optional): Clean the query texts with ViPreProcessor first. Defaults to False.
        index (optional): Document store index to search. Defaults to None.
        filters (optional): Metadata filters applied to the candidate search. Defaults to None.

    Returns:
        A list of dicts, each holding the query document, its best match, and the similarity score.
    """
    if not self.document_store.is_synchronized():
        raise ValueError(
            "The FAISS index and the database have not been synchronized yet."
            " Please call the update_embeddings method first."
        )

    if processe_query_docs:
        processor = ViPreProcessor()
        query_docs = [
            processor.clean({"text": query_doc})["text"] for query_doc in query_docs
        ]

    _, _, candidate_id_matrix = self.get_candidates(
        query_docs=query_docs, top_k=top_k_candidates, index=index, filters=filters
    )

    retrieve_results = []
    for idx, query_doc in enumerate(tqdm(query_docs, desc="Retrieving..... ")):
        candidate_ids = [
            candidate_id
            for candidate_id in candidate_id_matrix[idx]
            if candidate_id >= 0
        ]
        retrieve_result, score = self._calc_scores_for_candidates(
            query_doc=query_doc, candidate_ids=candidate_ids
        )
        retrieve_results.append(
            {
                "query_doc": query_doc,
                "retrieve_result": retrieve_result,
                "similarity_score": round(score[0], 5),
            }
        )
    return retrieve_results
def test_basic_cleaning_1():
    """Conduct basic cleaning and compare result"""
    processor = ViPreProcessor()
    text = (
        "Cho đến thời điểm này, có thể nói, "
        + "Klopp là một trong những đối thủ lớn nhất của Mourinho."
    )
    expected = (
        "cho đến thời_điểm này, có_thể nói, "
        + "klopp là một trong những đối_thủ lớn nhất của mourinho."
    )
    out = processor.clean({"text": text})
    assert expected == out["text"]
def test_invalid_split_length():
    """Test that an invalid 'split_length' argument raises ValueError"""
    processor = ViPreProcessor(split_length=None)
    assert_raises(lambda: processor.split({"text": "dummy"}), ValueError)
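# --- Hedged sketch of the test scaffolding assumed by the split tests above. ---
# Neither the 'srsb' argument nor the 'assert_raises' helper is defined in these
# snippets; one plausible pytest-based wiring is sketched below. It is an
# assumption, not the repository's actual conftest.
import pytest


@pytest.fixture(params=[True, False])
def srsb(request):
    # Supplies both values of split_respect_sentence_boundary to the test.
    return request.param


def assert_raises(func, exception_type):
    # Calls func with no arguments and asserts it raises exception_type.
    with pytest.raises(exception_type):
        func()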
def batch_retrieve(
    self,
    query_docs: List[Document],
    top_k_results: int = 10,
    process_query_texts: bool = False,
    index: str = None,
    filters=None,
) -> List[Dict[str, Any]]:
    """Retrieve the top-k most similar stored documents for each document in a batch.

    Args:
        query_docs (List[Document]): Query documents whose text is compared against the store.
        top_k_results (int, optional): Number of ranked results returned per query. Defaults to 10.
        process_query_texts (bool, optional): Clean the query texts with ViPreProcessor first. Defaults to False.
        index (str, optional): Document store index to search. Defaults to None.
        filters (optional): Metadata filters applied to the candidate search. Defaults to None.

    Returns:
        List[Dict[str, Any]]: One dict per (query, rank) pair with the query document id,
        the similar document id, and its similarity score.
    """
    if not self.document_store.is_synchronized():
        raise ValueError(
            "The FAISS index and the database have not been synchronized yet."
            " Try calling the update_embeddings method first."
        )

    query_texts = [doc.text for doc in query_docs]
    if process_query_texts:
        processor = ViPreProcessor()
        query_texts = [
            processor.clean({"text": query_text})["text"]
            for query_text in query_texts
        ]

    # Search a larger candidate space (10 * top_k_results) before re-ranking
    _, _, candidate_id_matrix = self.get_candidates(
        query_texts=query_texts,
        top_k=10 * top_k_results,
        index=index,
        filters=filters,
    )

    retrieve_results = []
    for idx, query_text in enumerate(tqdm(query_texts, desc="Retrieving..... ")):
        candidate_ids = [
            candidate_id
            for candidate_id in candidate_id_matrix[idx]
            if candidate_id >= 0
        ]
        reranked_candidates = self._calc_scores_for_candidates(
            query_text=query_text,
            candidate_ids=candidate_ids,
            top_k_results=top_k_results,
        )
        for rank, reranked_candidate in enumerate(reranked_candidates):
            retrieve_results.append(
                {
                    "document_id": query_docs[idx].id,
                    f"sim_document_id_rank_{str(rank).zfill(2)}": reranked_candidate[0],
                    f"sim_score_rank_{str(rank).zfill(2)}": round(
                        reranked_candidate[1][0], 5
                    ),
                }
            )
    return retrieve_results
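# --- Hedged usage sketch for the typed batch_retrieve above. ---
# Assumes a Retriever wired up as in the setup snippet at the end of this file,
# with a document store whose embeddings are already up to date. The Document
# constructor call (Document(text=...)) is an assumption about the store's
# Document class, not something shown in these snippets.
query_docs = [
    Document(text="Klopp là một trong những đối thủ lớn nhất của Mourinho."),
]
results = retriever.batch_retrieve(
    query_docs=query_docs,
    top_k_results=5,
    process_query_texts=True,  # clean the raw text with ViPreProcessor first
)
for row in results:
    # Each row pairs the query's document_id with one ranked similar document.
    print(row["document_id"], row)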
def sequential_retrieve(
    self,
    query_docs,
    meta_docs=None,
    top_k_candidates=10,
    processe_query_docs=True,
    index=None,
    filters=None,
):
    """Retrieve similar documents one query at a time, writing each query back to the store.

    Args:
        query_docs: Query documents as raw text strings.
        meta_docs (optional): Per-query metadata dicts stored alongside each query document.
        top_k_candidates (int, optional): Number of candidates fetched per query. Defaults to 10.
        processe_query_docs (bool, optional): Clean the query texts with ViPreProcessor first. Defaults to True.
        index (optional): Document store index to search. Defaults to None.
        filters (optional): Metadata filters applied to the candidate search. Defaults to None.

    Returns:
        A list of dicts, each holding the query document, its best match, and the similarity score.
    """
    if not self.document_store.is_synchronized():
        raise ValueError(
            "The FAISS index and the database have not been synchronized yet."
            " Please call the update_embeddings method first."
        )

    retrieve_results = []
    for idx, query_doc in enumerate(
        tqdm(query_docs, desc="Sequential retrieving..... ")
    ):
        if processe_query_docs:
            processor = ViPreProcessor()
            doc = processor.clean({"text": query_doc})
        else:
            doc = {"text": query_doc}

        query_emb, _, candidate_id_matrix = self.get_candidates(
            query_docs=[doc["text"]],
            top_k=top_k_candidates,
            index=index,
            filters=filters,
        )
        candidate_ids = [
            candidate_id
            for candidate_id in candidate_id_matrix[0]
            if candidate_id >= 0
        ]
        retrieve_result, score = self._calc_scores_for_candidates(
            query_doc=query_doc, candidate_ids=candidate_ids
        )
        retrieve_results.append(
            {
                "query_doc": query_doc,
                "retrieve_result": retrieve_result,
                "similarity_score": round(score[0], 5),
            }
        )

        # Write the processed query, its embedding, and serialized metadata back to the store
        doc["embedding"] = query_emb[0]
        if meta_docs and idx < len(meta_docs):
            meta = meta_docs[idx]
        else:
            meta = {}
        for m in meta.keys():
            if isinstance(meta[m], list):
                meta[m] = "|".join(meta[m])
        doc["meta"] = meta
        self.document_store.write_documents([doc])

    return retrieve_results
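# --- Hedged usage sketch for sequential_retrieve above. ---
# Here query_docs are raw text strings and meta_docs carries per-query metadata
# (list values are serialized with "|" before being written back to the store).
# The retriever instance is assumed to be configured as in the setup snippet below;
# the example texts and metadata are illustrative only.
texts = ["Klopp là một trong những đối thủ lớn nhất của Mourinho."]
metas = [{"url": "https://example.com/post", "tags": ["bong-da", "klopp"]}]
results = retriever.sequential_retrieve(
    query_docs=texts,
    meta_docs=metas,
    top_k_candidates=10,
)
print(results[0]["retrieve_result"], results[0]["similarity_score"])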
FROM (
    SELECT value AS url, document_id
    FROM meta m
    WHERE lower(name) IN ('href', 'url')
) AS url_table
INNER JOIN "document" d ON url_table.document_id = d.id
WHERE CAST(levenshtein('{0}', url) AS DECIMAL)
    / CAST(length(url) AS DECIMAL) < {1}
ORDER BY levenshtein('{0}', url)
LIMIT 1
"""

# Default methods
preprocessor = ViPreProcessor(split_by="sentence")
document_store = FAISSDocumentStore(
    sql_url=POSTGRES_URI, vector_dim=CAND_DIM, index_buffer_size=5000
)
retriever = Retriever(
    document_store=document_store,
    candidate_vectorizer=TfidfDocVectorizer(CAND_DIM),
    retriever_vectorizer=TfidfDocVectorizer(RTRV_DIM),
)
retriever.train_candidate_vectorizer(retrain=False, save_path=CAND_PATH)

remote_doc_store = FAISSDocumentStore(sql_url=POSTGRES_URI, vector_dim=CAND_DIM)
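# --- Hedged sketch: how the Levenshtein URL-lookup query above might be executed. ---
# Only the tail of the query string is visible in this fragment, so its variable
# name is unknown; 'URL_LOOKUP_SQL' below is a hypothetical placeholder, and running
# it via SQLAlchemy against the same POSTGRES_URI is also an assumption. The two
# placeholders are the probe URL ({0}) and the maximum normalized edit distance ({1});
# levenshtein() requires PostgreSQL's fuzzystrmatch extension.
from sqlalchemy import create_engine, text

engine = create_engine(POSTGRES_URI)
probe_url = "https://example.com/some-post"
with engine.connect() as conn:
    # Returns at most one row: the stored URL closest to probe_url within the threshold.
    row = conn.execute(text(URL_LOOKUP_SQL.format(probe_url, 0.2))).fetchone()
    print(row)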