Code example #1
0
def main():
    """Download a sample TopDup dataset, preprocess the posts, and ingest
    them into a FAISS-backed SQLite document store.
    """
    cwd = Path(__file__).parent
    dataset_path = os.path.join(cwd, "post_dataset.pkl")

    if not os.path.exists(dataset_path):
        print("Downloading sample data of TopDup")
        urllib.request.urlretrieve(
            "https://storage.googleapis.com/topdup/dataset/post_dataset.pkl",
            dataset_path,
        )

    print("Preprocessing documents")
    processor = ViPreProcessor()
    # NOTE(review): unpickling a downloaded file executes arbitrary code if
    # the remote object is tampered with -- acceptable only for this trusted
    # sample bucket.
    with open(dataset_path, "rb") as fh:  # was a leaked, never-closed handle
        data = pickle.load(fh)

    docs = []
    for d in tqdm(data[:1000]):
        content, meta = data_prep(jsonpickle.loads(d))
        doc = processor.clean({"text": content})
        for m in meta:
            if isinstance(meta[m], list):  # serialize list values for storage
                meta[m] = "|".join(meta[m])
        doc["meta"] = meta
        docs.append(doc)

    print("Ingesting data to SQLite database")
    db_path = os.path.join(cwd, "topdup.db")
    if os.path.exists(db_path):
        os.remove(db_path)
    # Pre-create the SQLite file, and actually close the connection: the
    # original `with sqlite3.connect(...)` only commits on exit -- sqlite3
    # connections are NOT closed by the context manager.
    conn = sqlite3.connect(db_path)
    try:
        document_store = FAISSDocumentStore(sql_url=f"sqlite:///{db_path}")
        document_store.write_documents(docs)
    finally:
        conn.close()
Code example #2
0
File: test_vi_proc.py  Project: henryho8012/topdup2
def test_not_implemented_splitby_in_split(srsb):
    """Ensure split() raises NotImplementedError for an unsupported
    'split_by' value, whatever 'split_respect_sentence_boundary' is.
    """
    proc = ViPreProcessor(
        split_by="dummy_string",
        split_respect_sentence_boundary=srsb,
    )

    def _invoke():
        return proc.split({"text": "dummy"})

    assert_raises(_invoke, NotImplementedError)
Code example #3
0
File: test_vi_proc.py  Project: sitloboi2012/topdup
def test_basic_cleaning_2():
    """Verify that clean() returns a dict which contains the key 'text'."""
    processor = ViPreProcessor()

    sample = (
        'Cho đến thời điểm này, có thể nói, '
        'Klopp là một trong những đối thủ lớn nhất của Mourinho.'
    )

    result = processor.clean({'text': sample})
    assert isinstance(result, dict)
    assert 'text' in result
Code example #4
0
    def batch_retrieve(
        self,
        query_docs,
        top_k_candidates=10,
        processe_query_docs=False,
        index=None,
        filters=None,
    ):
        """Retrieve the most similar stored document for each query in a batch.

        Args:
            query_docs: Batch of raw query texts.
            top_k_candidates (int, optional): Candidates fetched per query.
                Defaults to 10.
            processe_query_docs (bool, optional): When True, clean every query
                with ViPreProcessor before searching. Defaults to False.
            index (optional): Forwarded to get_candidates. Defaults to None.
            filters (optional): Forwarded to get_candidates. Defaults to None.

        Returns:
            list[dict]: One dict per query with keys 'query_doc',
            'retrieve_result' and 'similarity_score'.

        Raises:
            ValueError: If the FAISS index and the database are out of sync.
        """
        if not self.document_store.is_synchronized():
            raise ValueError(
                "Faiss_index and database haven't been synchronized yet."
                " Please, call update_embeddings methods first!")

        if processe_query_docs:
            cleaner = ViPreProcessor()
            query_docs = [
                cleaner.clean({"text": q})["text"] for q in query_docs
            ]

        _, _, candidate_id_matrix = self.get_candidates(
            query_docs=query_docs,
            top_k=top_k_candidates,
            index=index,
            filters=filters,
        )

        results = []
        for row, query in zip(candidate_id_matrix,
                              tqdm(query_docs, desc="Retrieving.....  ")):
            # Negative ids mark empty candidate slots -- keep real hits only.
            valid_ids = [cid for cid in row if cid >= 0]

            best_match, score = self._calc_scores_for_candidates(
                query_doc=query, candidate_ids=valid_ids)

            results.append({
                "query_doc": query,
                "retrieve_result": best_match,
                "similarity_score": round(score[0], 5),
            })

        return results
Code example #5
0
File: test_vi_proc.py  Project: sitloboi2012/topdup
def test_basic_cleaning_1():
    """Run basic cleaning on a sample sentence and compare the exact output."""
    processor = ViPreProcessor()

    raw = (
        'Cho đến thời điểm này, có thể nói, '
        'Klopp là một trong những đối thủ lớn nhất của Mourinho.'
    )
    want = (
        'cho đến thời_điểm này, có_thể nói, '
        'klopp là một trong những đối_thủ lớn nhất của mourinho.'
    )

    got = processor.clean({'text': raw})
    assert got['text'] == want
Code example #6
0
File: test_vi_proc.py  Project: henryho8012/topdup2
def test_basic_cleaning_1():
    """Clean a sample sentence and check the exact normalized result."""
    sample = (
        "Cho đến thời điểm này, có thể nói, "
        "Klopp là một trong những đối thủ lớn nhất của Mourinho."
    )
    wanted = (
        "cho đến thời_điểm này, có_thể nói, "
        "klopp là một trong những đối_thủ lớn nhất của mourinho."
    )

    cleaned = ViPreProcessor().clean({"text": sample})
    assert cleaned["text"] == wanted
Code example #7
0
File: test_vi_proc.py  Project: henryho8012/topdup2
def test_invalid_split_length():
    """Check that split() rejects a None 'split_length' with ValueError."""
    proc = ViPreProcessor(split_length=None)

    def _invoke():
        return proc.split({"text": "dummy"})

    assert_raises(_invoke, ValueError)
Code example #8
0
    def batch_retrieve(
        self,
        query_docs: List[Document],
        top_k_results: int = 10,
        process_query_texts: bool = False,
        index: str = None,
        filters=None,
    ) -> List[Dict[str, Any]]:
        """Retrieves batch of most k similar docs of given batch of documents

        Args:
            query_docs (List[Document]): Query documents; each must expose a
                ``text`` attribute and an ``id``.
            top_k_results (int, optional): Reranked results kept per query.
                Defaults to 10.
            process_query_texts (bool, optional): When True, clean each query
                text with ViPreProcessor before the candidate search.
                Defaults to False.
            index (str, optional): Index name forwarded to get_candidates.
                Defaults to None.
            filters (optional): Filters forwarded to get_candidates.
                Defaults to None.

        Returns:
            List[Dict[str, Any]]: One dict per (query, reranked candidate)
            pair, mapping 'document_id' to the query doc's id plus
            rank-suffixed 'sim_document_id_rank_NN' / 'sim_score_rank_NN'
            keys for the candidate.

        Raises:
            ValueError: If the FAISS index and the database are out of sync.
        """
        if not self.document_store.is_synchronized():
            raise ValueError(
                "faiss_index and database haven't been synchronized yet."
                " Try to call update_embeddings methods first.")

        query_texts = [doc.text for doc in query_docs]

        if process_query_texts:
            processor = ViPreProcessor()
            query_texts = [
                processor.clean({"text": query_text})["text"]
                for query_text in query_texts
            ]

        _, _, candidate_id_matrix = self.get_candidates(
            query_texts=query_texts,
            top_k=10 * top_k_results,
            index=index,
            filters=filters,
        )  # create large candidates search space 10*top_k_results

        retrieve_results = []

        for idx, query_text in enumerate(
                tqdm(query_texts, desc="Retrieving.....  ")):
            # Negative ids presumably pad missing candidates (FAISS
            # convention) -- keep only real hits.  TODO(review): confirm.
            candidate_ids = [
                candidate_id for candidate_id in candidate_id_matrix[idx]
                if candidate_id >= 0
            ]

            # Rerank the coarse candidates down to top_k_results.
            reranked_candidates = self._calc_scores_for_candidates(
                query_text=query_text,
                candidate_ids=candidate_ids,
                top_k_results=top_k_results,
            )

            # One flat result dict per candidate; rank is zero-padded so
            # keys sort lexicographically (rank_00, rank_01, ...).
            for rank, reranked_candidate in enumerate(reranked_candidates):
                retrieve_results.append({
                    "document_id":
                    query_docs[idx].id,
                    f"sim_document_id_rank_{str(rank).zfill(2)}":
                    reranked_candidate[0],
                    f"sim_score_rank_{str(rank).zfill(2)}":
                    round(reranked_candidate[1][0], 5),
                })

        return retrieve_results
Code example #9
0
    def sequential_retrieve(
        self,
        query_docs,
        meta_docs=None,
        top_k_candidates=10,
        processe_query_docs=True,
        index=None,
        filters=None,
    ):
        """Retrieve similar documents for each query in turn, then write the
        query itself (with its embedding and metadata) into the document store.

        Args:
            query_docs: Iterable of raw query texts.
            meta_docs (optional): Per-query metadata dicts aligned by index
                with ``query_docs``. Defaults to None.
            top_k_candidates (int, optional): Candidates fetched per query.
                Defaults to 10.
            processe_query_docs (bool, optional): When True, clean each query
                with ViPreProcessor first. Defaults to True.
            index (optional): Forwarded to get_candidates. Defaults to None.
            filters (optional): Forwarded to get_candidates. Defaults to None.

        Returns:
            list[dict]: One dict per query with keys 'query_doc',
            'retrieve_result' and 'similarity_score'.

        Raises:
            ValueError: If the FAISS index and the database are out of sync.
        """
        if not self.document_store.is_synchronized():
            raise ValueError(
                "Faiss_index and database haven't been synchronized yet."
                " Please, call update_embeddings methods first!")

        # Hoisted out of the loop: the original rebuilt a ViPreProcessor on
        # every iteration, repeating loop-invariant construction work.
        processor = ViPreProcessor() if processe_query_docs else None

        retrieve_results = []
        for idx, query_doc in enumerate(
                tqdm(query_docs, desc="Sequential retrieving.....  ")):

            if processor is not None:
                doc = processor.clean({"text": query_doc})
            else:
                doc = {"text": query_doc}

            query_emb, _, candidate_id_matrix = self.get_candidates(
                query_docs=[doc["text"]],
                top_k=top_k_candidates,
                index=index,
                filters=filters,
            )
            # Negative ids presumably pad missing candidates (FAISS
            # convention) -- keep only real hits.  TODO(review): confirm.
            candidate_ids = [
                candidate_id for candidate_id in candidate_id_matrix[0]
                if candidate_id >= 0
            ]

            retrieve_result, score = self._calc_scores_for_candidates(
                query_doc=query_doc, candidate_ids=candidate_ids)
            retrieve_results.append({
                "query_doc": query_doc,
                "retrieve_result": retrieve_result,
                "similarity_score": round(score[0], 5),
            })

            doc["embedding"] = query_emb[0]

            meta = meta_docs[idx] if meta_docs and idx < len(meta_docs) else {}

            # Serialize list-valued metadata for storage.
            # NOTE(review): this mutates the caller's meta dict in place.
            for m in meta:
                if isinstance(meta[m], list):
                    meta[m] = "|".join(meta[m])

            doc["meta"] = meta

            # Ingest the (possibly cleaned) query itself into the store so
            # later queries can match against it.
            self.document_store.write_documents([doc])

        return retrieve_results
Code example #10
0
File: main.py  Project: linhdb-2149/topdup
    FROM (
        SELECT value AS url
            ,document_id
        FROM meta m
        WHERE lower(name) IN (
                'href'
                ,'url'
                )
        ) AS url_table
    INNER JOIN "document" d ON url_table.document_id = d.id
    WHERE CAST(levenshtein('{0}', url) AS DECIMAL) / CAST(length(url) AS DECIMAL) < {1}
    ORDER BY levenshtein('{0}', url) LIMIT 1
"""

# Default methods: module-level singletons shared across the service.

# Vietnamese preprocessor configured to split documents by sentence.
preprocessor = ViPreProcessor(split_by="sentence")

# FAISS-backed document store persisting text/metadata in Postgres.
# NOTE(review): POSTGRES_URI / CAND_DIM / RTRV_DIM / CAND_PATH are defined
# elsewhere in this module -- confirm their values before reuse.
document_store = FAISSDocumentStore(sql_url=POSTGRES_URI,
                                    vector_dim=CAND_DIM,
                                    index_buffer_size=5000)

# Retriever pairing a candidate (CAND_DIM) and a retriever (RTRV_DIM)
# TF-IDF vectorizer over the shared document store.
retriever = Retriever(
    document_store=document_store,
    candidate_vectorizer=TfidfDocVectorizer(CAND_DIM),
    retriever_vectorizer=TfidfDocVectorizer(RTRV_DIM),
)
# retrain=False -- presumably loads a previously saved vectorizer from
# CAND_PATH rather than fitting anew; verify against the implementation.
retriever.train_candidate_vectorizer(retrain=False, save_path=CAND_PATH)

# Second store handle (no index_buffer_size override) used for remote access.
remote_doc_store = FAISSDocumentStore(sql_url=POSTGRES_URI,
                                      vector_dim=CAND_DIM)