Example #1
def test_faiss_update_docs(document_store, index_buffer_size):
    # adjust buffer size
    document_store.index_buffer_size = index_buffer_size

    # initial write
    document_store.write_documents(DOCUMENTS)

    # do the update
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
        embed_title=True,
        remove_sep_tok_from_untitled_passages=True)

    document_store.update_embeddings(retriever=retriever)
    documents_indexed = document_store.get_all_documents()

    # test if correct vectors are associated with docs
    for i, doc in enumerate(documents_indexed):
        original_doc = [d for d in DOCUMENTS if d["text"] == doc.text][0]
        updated_embedding = retriever.embed_passages(
            [Document.from_dict(original_doc)])
        stored_emb = document_store.faiss_index.reconstruct(
            int(doc.meta["vector_id"]))
        # compare original input vec with stored one (ignore extra dim added by hnsw)
        assert np.allclose(updated_embedding, stored_emb, rtol=0.01)

    # test document correctness
    check_data_correctness(documents_indexed, DOCUMENTS)
Example #2
def initFinder():
    """
    Function to initialize retriever, reader and finder
    Parameters
    ----------
    Returns
    -------
        finder (object): Haystack finder
    """
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
        embed_title=True,
        max_seq_len=256,
        batch_size=16,
        remove_sep_tok_from_untitled_passages=True)
    # Important:
    # Now that we have DPR initialized, we need to call update_embeddings() to iterate over all
    # previously indexed documents and update their embedding representation.
    # While this can be a time-consuming operation (depending on corpus size), it only needs to be done once.
    # At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
    document_store.update_embeddings(retriever)
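    # Illustration only (not part of the original snippet): once the embeddings are in
    # place, query-time retrieval just embeds the incoming question and compares it to
    # the stored document embeddings, e.g.
    #   candidate_docs = retriever.retrieve(query="Who wrote Faust?", top_k=5)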
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=False)
    return Finder(reader, retriever)
Example #3
def get_retriever(retriever_type, document_store):

    if retriever_type == "dpr":
        retriever = DensePassageRetriever(
            document_store=document_store,
            query_embedding_model=
            "facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            use_gpu=False,
            embed_title=True)
    elif retriever_type == "tfidf":
        retriever = TfidfRetriever(document_store=document_store)
        retriever.fit()
    elif retriever_type == "embedding":
        retriever = EmbeddingRetriever(document_store=document_store,
                                       embedding_model="deepset/sentence_bert",
                                       use_gpu=False)
    elif retriever_type == "elasticsearch":
        retriever = ElasticsearchRetriever(document_store=document_store)
    elif retriever_type == "es_filter_only":
        retriever = ElasticsearchFilterOnlyRetriever(
            document_store=document_store)
    else:
        raise Exception(f"No retriever fixture for '{retriever_type}'")

    return retriever
Example #4
def dpr_retriever(faiss_document_store):
    return DensePassageRetriever(
        document_store=faiss_document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
        embed_title=True,
        use_fast_tokenizers=True)
Example #5
def get_dense_passage_retriever(document_store,
                                dpr_model_path,
                                use_gpu=True,
                                batch_size=16,
                                do_lower_case=True):
    return DensePassageRetriever(document_store=document_store,
                                 embedding_model=dpr_model_path,
                                 use_gpu=use_gpu,
                                 batch_size=batch_size,
                                 do_lower_case=do_lower_case)
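# Hypothetical usage of the helper above (the model path is a placeholder, not taken
# from the source):
#   retriever = get_dense_passage_retriever(document_store, "saved_models/dpr", use_gpu=False)
#   document_store.update_embeddings(retriever)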
Example #6
def test_dpr_passage_encoder():
    from haystack.retriever.dense import DensePassageRetriever

    passage = ["Let's encode this one"]
    retriever = DensePassageRetriever(document_store=None,
                                      embedding_model="dpr-bert-base-nq",
                                      gpu=False)
    emb = retriever.embed_passages(passage)[0]
    assert (emb.shape[0] == 768)
    assert abs(emb[0] - 0.52872) < 0.001
Example #7
def get_retriever(retriever_name, doc_store):
    if retriever_name == "elastic":
        return ElasticsearchRetriever(doc_store)
    if retriever_name == "tfidf":
        return TfidfRetriever(doc_store)
    if retriever_name == "dpr":
        return DensePassageRetriever(document_store=doc_store,
                                      query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                      passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                      use_gpu=True)
Example #8
def test_join_document_pipeline(document_store_with_docs, reader):
    es = ElasticsearchRetriever(document_store=document_store_with_docs)
    dpr = DensePassageRetriever(
        document_store=document_store_with_docs,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
    )
    document_store_with_docs.update_embeddings(dpr)

    query = "Where does Carla live?"

    # test merge without weights
    join_node = JoinDocuments(join_mode="merge")
    p = Pipeline()
    p.add_node(component=es, name="R1", inputs=["Query"])
    p.add_node(component=dpr, name="R2", inputs=["Query"])
    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
    results = p.run(query=query)
    assert len(results["documents"]) == 3

    # test merge with weights
    join_node = JoinDocuments(join_mode="merge",
                              weights=[1000, 1],
                              top_k_join=2)
    p = Pipeline()
    p.add_node(component=es, name="R1", inputs=["Query"])
    p.add_node(component=dpr, name="R2", inputs=["Query"])
    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
    results = p.run(query=query)
    assert math.isclose(results["documents"][0].score,
                        0.5350644373470798,
                        rel_tol=0.0001)
    assert len(results["documents"]) == 2

    # test concatenate
    join_node = JoinDocuments(join_mode="concatenate")
    p = Pipeline()
    p.add_node(component=es, name="R1", inputs=["Query"])
    p.add_node(component=dpr, name="R2", inputs=["Query"])
    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
    results = p.run(query=query)
    assert len(results["documents"]) == 3

    # test join_node with reader
    join_node = JoinDocuments()
    p = Pipeline()
    p.add_node(component=es, name="R1", inputs=["Query"])
    p.add_node(component=dpr, name="R2", inputs=["Query"])
    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
    p.add_node(component=reader, name="Reader", inputs=["Join"])
    results = p.run(query=query)
    # check whether the correct answer is within the top 2 predictions
    assert results["answers"][0]["answer"] == "Berlin" or results["answers"][
        1]["answer"] == "Berlin"
Example #9
def get_retriever(document_store,
                  query_model="facebook/dpr-question_encoder-single-nq-base",
                  passage_model="facebook/dpr-ctx_encoder-single-nq-base",
                  batch_size=128):
    return DensePassageRetriever(document_store=document_store,
                                 query_embedding_model=query_model,
                                 passage_embedding_model=passage_model,
                                 max_seq_len_query=64,
                                 max_seq_len_passage=256,
                                 use_gpu=torch.cuda.is_available(),
                                 batch_size=batch_size,
                                 embed_title=True)
Example #10
def test_dpr_inmemory_retrieval(document_store):

    documents = [
        Document(
            text="""Aaron Aaron ( or ; ""Ahärôn"") is a prophet, high priest, and the brother of Moses in the Abrahamic religions. Knowledge of Aaron, along with his brother Moses, comes exclusively from religious texts, such as the Bible and Quran. The Hebrew Bible relates that, unlike Moses, who grew up in the Egyptian royal court, Aaron and his elder sister Miriam remained with their kinsmen in the eastern border-land of Egypt (Goshen). When Moses first confronted the Egyptian king about the Israelites, Aaron served as his brother's spokesman (""prophet"") to the Pharaoh. Part of the Law (Torah) that Moses received from""",
            meta={"name": "0"}
        ),
        Document(
            text="""Democratic Republic of the Congo to the south. Angola's capital, Luanda, lies on the Atlantic coast in the northwest of the country. Angola, although located in a tropical zone, has a climate that is not characterized for this region, due to the confluence of three factors: As a result, Angola's climate is characterized by two seasons: rainfall from October to April and drought, known as ""Cacimbo"", from May to August, drier, as the name implies, and with lower temperatures. On the other hand, while the coastline has high rainfall rates, decreasing from North to South and from to , with""",
        ),
        Document(
            text="""Schopenhauer, describing him as an ultimately shallow thinker: ""Schopenhauer has quite a crude mind ... where real depth starts, his comes to an end."" His friend Bertrand Russell had a low opinion on the philosopher, and attacked him in his famous ""History of Western Philosophy"" for hypocritically praising asceticism yet not acting upon it. On the opposite isle of Russell on the foundations of mathematics, the Dutch mathematician L. E. J. Brouwer incorporated the ideas of Kant and Schopenhauer in intuitionism, where mathematics is considered a purely mental activity, instead of an analytic activity wherein objective properties of reality are""",
            meta={"name": "1"}
        ),
        Document(
            text="""The Dothraki vocabulary was created by David J. Peterson well in advance of the adaptation. HBO hired the Language Creatio""",
            meta={"name": "2"}
        ),
        Document(
            text="""The title of the episode refers to the Great Sept of Baelor, the main religious building in King's Landing, where the episode's pivotal scene takes place. In the world created by George R. R. Martin""",
            meta={}
        )
    ]

    document_store.delete_all_documents(index="test_dpr")
    document_store.write_documents(documents, index="test_dpr")
    retriever = DensePassageRetriever(document_store=document_store,
                                      query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                      passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                      use_gpu=True, embed_title=True,
                                      remove_sep_tok_from_untitled_passages=True)
    document_store.update_embeddings(retriever=retriever, index="test_dpr")
    time.sleep(2)

    docs_with_emb = document_store.get_all_documents(index="test_dpr")

    # FAISSDocumentStore doesn't return embeddings, so these tests only work with ElasticsearchDocumentStore
    if isinstance(document_store, ElasticsearchDocumentStore):
        assert (len(docs_with_emb[0].embedding) == 768)
        assert (abs(docs_with_emb[0].embedding[0] - (-0.30634)) < 0.001)
        assert (abs(docs_with_emb[1].embedding[0] - (-0.37449)) < 0.001)
        assert (abs(docs_with_emb[2].embedding[0] - (-0.24695)) < 0.001)
        assert (abs(docs_with_emb[3].embedding[0] - (-0.08017)) < 0.001)
        assert (abs(docs_with_emb[4].embedding[0] - (-0.01534)) < 0.001)
    res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?", index="test_dpr")
    assert res[0].meta["name"] == "1"

    # clean up
    document_store.delete_all_documents(index="test_dpr")
Example #11
def get_retriever(retriever_name, doc_store):
    if retriever_name == "elastic":
        return ElasticsearchRetriever(doc_store)
    if retriever_name == "tfidf":
        return TfidfRetriever(doc_store)
    if retriever_name == "dpr":
        return DensePassageRetriever(
            document_store=doc_store,
            query_embedding_model=
            "facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            use_gpu=True,
            use_fast_tokenizers=False)
    if retriever_name == "sentence_transformers":
        return EmbeddingRetriever(document_store=doc_store,
                                  embedding_model="nq-distilbert-base-v1",
                                  use_gpu=True,
                                  model_format="sentence_transformers")
Example #12
def Find_answer(text_file_path, data_folder_path, symbol, question):
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

    with open(text_file_path, 'r', encoding='utf-8') as f:
        data = f.read()
    for i, line in enumerate(data.split(symbol)):
        with open(f'{data_folder_path}/data{i+1}.txt', 'w') as f:
            print(f'writing file no.{i+1}')
            f.write(line)

    test_dicts = convert_files_to_dicts(dir_path=data_folder_path,
                                        clean_func=clean_wiki_text,
                                        split_paragraphs=True)
    document_store.write_documents(test_dicts)
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True)

    document_store.update_embeddings(retriever)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True,
                        context_window_size=300)

    pipe = ExtractiveQAPipeline(reader, retriever)

    prediction = pipe.run(query=question, top_k_retriever=10, top_k_reader=3)

    doc_with_ans = []
    for i in range(len(prediction['answers'])):
        if prediction['answers'][i]['context'] not in doc_with_ans:
            doc_with_ans.append(prediction['answers'][i]['context'])

    answer = ' '.join(doc_with_ans)

    return answer
Example #13
def test_dpr_inmemory_retrieval():
    document_store = InMemoryDocumentStore()

    documents = [
        {
            'name':
            '0',
            'text':
            """Aaron Aaron ( or ; ""Ahärôn"") is a prophet, high priest, and the brother of Moses in the Abrahamic religions. Knowledge of Aaron, along with his brother Moses, comes exclusively from religious texts, such as the Bible and Quran. The Hebrew Bible relates that, unlike Moses, who grew up in the Egyptian royal court, Aaron and his elder sister Miriam remained with their kinsmen in the eastern border-land of Egypt (Goshen). When Moses first confronted the Egyptian king about the Israelites, Aaron served as his brother's spokesman (""prophet"") to the Pharaoh. Part of the Law (Torah) that Moses received from"""
        },
        {
            'name':
            '1',
            'text':
            """Schopenhauer, describing him as an ultimately shallow thinker: ""Schopenhauer has quite a crude mind ... where real depth starts, his comes to an end."" His friend Bertrand Russell had a low opinion on the philosopher, and attacked him in his famous ""History of Western Philosophy"" for hypocritically praising asceticism yet not acting upon it. On the opposite isle of Russell on the foundations of mathematics, the Dutch mathematician L. E. J. Brouwer incorporated the ideas of Kant and Schopenhauer in intuitionism, where mathematics is considered a purely mental activity, instead of an analytic activity wherein objective properties of reality are"""
        },
        {
            'name':
            '2',
            'text':
            """Democratic Republic of the Congo to the south. Angola's capital, Luanda, lies on the Atlantic coast in the northwest of the country. Angola, although located in a tropical zone, has a climate that is not characterized for this region, due to the confluence of three factors: As a result, Angola's climate is characterized by two seasons: rainfall from October to April and drought, known as ""Cacimbo"", from May to August, drier, as the name implies, and with lower temperatures. On the other hand, while the coastline has high rainfall rates, decreasing from North to South and from to , with"""
        },
    ]

    retriever = DensePassageRetriever(document_store=document_store,
                                      embedding_model="dpr-bert-base-nq",
                                      use_gpu=False)

    embedded = []
    for doc in documents:
        embedding = retriever.embed_passages([doc['text']])[0]
        doc['embedding'] = embedding
        embedded.append(doc)

        assert (embedding.shape[0] == 768)
        assert abs(embedding[0] - 0.52872) < 0.001

    document_store.write_documents(embedded)

    res = retriever.retrieve(query="Which philosopher attacked Schopenhauer?")
    assert res[0].text == documents[1]["text"]
Example #14
def processDump(year):
    directory = year
    get_haystack_format(os.path.join(directory, 'dump'),
                        os.path.join(directory, 'dump.jsonl'))
    print("Wrote non-empty docs to dump")
    url = "sqlite:///" + os.path.join(directory, 'faiss_document_store.db')
    document_store = FAISSDocumentStore(sql_url=url,
                                        progress_bar=True,
                                        faiss_index_factory_str="Flat")
    i = 0
    docs = []
    with open(os.path.join(directory, 'dump.jsonl')) as f:
        for line in tqdm(f):
            i += 1
            docs.append(json.loads(line))
            if i == 1000:
                document_store.write_documents(docs)
                docs = []
                i = 0
    print("Done creating documents")

    document_store.save(os.path.join(directory, "wiki_dump.faiss"))
    print("Saved document store")
    document_store = FAISSDocumentStore.load(os.path.join(
        directory, "wiki_dump.faiss"),
                                             sql_url=url,
                                             index='document')
    print("Starting DPR")
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=True,
        batch_size=64,
        embed_title=True)
    print("Updating embeddings...")
    document_store.update_embeddings(retriever)
    document_store.save(os.path.join(directory, "wiki_dump_embeddings.faiss"))
Example #15
def set_embeded():
    """Update document embeddings in the Elasticsearch index."""
    index = request.form['index']
    document_store = ElasticsearchDocumentStore(
        host=app.config["host"],
        port=app.config["port"],
        username=app.config["username"],
        password=app.config["password"],
        index=index,
        embedding_field="embedding",
        embedding_dim=768)
    retriever = DensePassageRetriever(document_store=document_store,
                                      embedding_model="dpr-bert-base-nq",
                                      do_lower_case=True,
                                      use_gpu=False)
    # Now update the embeddings in the Elasticsearch document store using the retriever
    document_store.update_embeddings(retriever)
    return json.dumps({
        'status': 'Success',
        'message':
        'Successfully updated embeddings in the Elasticsearch document store',
        'result': []
    })
Example #16
    def get_current_retriever(cls, document_store):
        max_round = cls.get_current_round()
        if max_round != -1:
            old_modelDir = f"training/saved_models/dpr{max_round}"
            return DensePassageRetriever.load(document_store=document_store,
                                              load_dir=old_modelDir,
                                              max_seq_len_query=64,
                                              max_seq_len_passage=256,
                                              batch_size=16,
                                              use_gpu=False,
                                              embed_title=True,
                                              use_fast_tokenizers=True)

        return DensePassageRetriever(
            document_store=document_store,
            query_embedding_model=
            "facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            max_seq_len_query=64,
            max_seq_len_passage=256,
            batch_size=16,
            use_gpu=False,
            embed_title=True,
            use_fast_tokenizers=True)
Example #17
    #print("Saved document store")

    # create document store from the documents
    # I think this is more optimized for DPR?

    # faiss_index_factory_str is needed for saving and loading to work properly
    document_store = FAISSDocumentStore.load(
        "wiki_dump.faiss",
        sql_url='sqlite:///faiss_document_store.db',
        index='document')
    # unsure if we should tune these parameters based on Nikhil's GPU
    print("Starting DPR")
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=True,
        batch_size=256,
        embed_title=True)
    # apparently this is time consuming
    print("Updating embeddings...")
    document_store.update_embeddings(retriever)
    document_store.save("wiki_dump_embeddings.faiss")
    # example:
    # retrieved_doc = retriever.retrieve(query="Why did the revenue increase?")

    # tune these parameters too
    print("Running QA Reader")
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True,
                        no_ans_boost=-10)
Example #18
def tutorial7_rag_generator():
    # Add documents from which you want to generate answers
    # Download a CSV containing some sample document data
    temp = requests.get("https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv")
    open('small_generator_dataset.csv', 'wb').write(temp.content)

    # Get dataframe with columns "title", and "text"
    df = pd.read_csv("small_generator_dataset.csv", sep=',')
    # Minimal cleaning
    df.fillna(value="", inplace=True)

    print(df.head())

    titles = list(df["title"].values)
    texts = list(df["text"].values)

    # Convert to Haystack Document format
    documents: List[Document] = []
    for title, text in zip(titles, texts):
        documents.append(
            Document(
                text=text,
                meta={
                    "name": title or ""
                }
            )
        )


    # Initialize FAISS document store for the documents and a corresponding index for the embeddings
    # Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding
    document_store = FAISSDocumentStore(
        faiss_index_factory_str="Flat",
        return_embedding=True
    )

    # Initialize DPR Retriever to encode documents, encode question and query documents
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=True,
        embed_title=True,
    )

    # Initialize RAG Generator
    generator = RAGenerator(
        model_name_or_path="facebook/rag-token-nq",
        use_gpu=True,
        top_k_answers=1,
        max_length=200,
        min_length=2,
        embed_title=True,
        num_beams=2,
    )

    # Delete existing documents in documents store
    document_store.delete_all_documents()
    # Write documents to document store
    document_store.write_documents(documents)
    # Add documents embeddings to index
    document_store.update_embeddings(
        retriever=retriever
    )

    # Now ask your questions
    # We have some sample questions
    QUESTIONS = [
        "who got the first nobel prize in physics",
        "when is the next deadpool movie being released",
        "which mode is used for short wave broadcast service",
        "who is the owner of reading football club",
        "when is the next scandal episode coming out",
        "when is the last time the philadelphia won the superbowl",
        "what is the most current adobe flash player version",
        "how many episodes are there in dragon ball z",
        "what is the first step in the evolution of the eye",
        "where is gall bladder situated in human body",
        "what is the main mineral in lithium batteries",
        "who is the president of usa right now",
        "where do the greasers live in the outsiders",
        "panda is a national animal of which country",
        "what is the name of manchester united stadium",
    ]

    # Now generate answer for question
    for question in QUESTIONS:
        # Retrieve related documents from retriever
        retriever_results = retriever.retrieve(
            query=question
        )

        # Now generate answer from question and retrieved documents
        predicted_result = generator.predict(
            query=question,
            documents=retriever_results,
            top_k=1
        )

        # Print your answer
        answers = predicted_result["answers"]
        print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')
Example #19
def tutorial6_better_retrieval_via_dpr():
    # OPTION 1: FAISS is a library for efficient similarity search on a cluster of dense vectors.
    # The FAISSDocumentStore uses a SQL (SQLite in-memory by default) document store under the hood
    # to store the document text and other meta data. The vector embeddings of the text are
    # indexed on a FAISS Index that later is queried for searching answers.
    # The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for
    # faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor.
    # For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
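    # Sketch of the alternative mentioned in the comment above (not part of the original
    # tutorial): trade a bit of accuracy for faster search by switching the index factory string.
    #   document_store = FAISSDocumentStore(faiss_index_factory_str="HNSW")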

    # OPTION 2: Milvus is an open-source database library that is also optimized for vector similarity searches like FAISS.
    # Like FAISS it has both a "Flat" and "HNSW" mode but it outperforms FAISS when it comes to dynamic data management.
    # It does require a little more setup, however, as it is run through Docker and requires the setup of some config files.
    # See https://milvus.io/docs/v1.0.0/milvus_docker-cpu.md
    # launch_milvus()
    # document_store = MilvusDocumentStore()

    # ## Preprocessing of documents
    # Let's first get some documents that we want to query
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)

    # Now, let's write the docs to our DB.
    document_store.write_documents(dicts)

    ### Retriever
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=2,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True)

    # Important:
    # Now that we have DPR initialized, we need to call update_embeddings() to iterate over all
    # previously indexed documents and update their embedding representation.
    # While this can be a time-consuming operation (depending on corpus size), it only needs to be done once.
    # At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
    document_store.update_embeddings(retriever)

    ### Reader
    # Load a  local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)

    ### Pipeline
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
Example #20
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts containing documents that can be indexed to our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir,
                               clean_func=clean_wiki_text,
                               split_paragraphs=True)

# Now, let's write the docs to our DB.
document_store.write_documents(dicts)

### Retriever
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=2,
    use_gpu=True,
    embed_title=True,
    use_fast_tokenizers=True)

# Important:
# Now that we have DPR initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.
# While this can be a time-consuming operation (depending on corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
document_store.update_embeddings(retriever)

### Reader
# Load a  local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models)
Example #21
def tutorial9_dpr_training():
    # Training Your Own "Dense Passage Retrieval" Model

    # Here are some imports that we'll need

    from haystack.retriever.dense import DensePassageRetriever
    from haystack.preprocessor.utils import fetch_archive_from_http
    from haystack.document_store.memory import InMemoryDocumentStore

    # Download original DPR data
    # WARNING: the train set is 7.4GB and the dev set is 800MB

    doc_dir = "data/dpr_training/"

    s3_url_train = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-train.json.gz"
    s3_url_dev = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-dev.json.gz"

    fetch_archive_from_http(s3_url_train, output_dir=doc_dir + "train/")
    fetch_archive_from_http(s3_url_dev, output_dir=doc_dir + "dev/")

    ## Option 1: Training DPR from Scratch

    # Here are the variables to specify our training data, the models that we use to initialize DPR
    # and the directory where we'll be saving the model

    doc_dir = "data/dpr_training/"

    train_filename = "train/biencoder-nq-train.json"
    dev_filename = "dev/biencoder-nq-dev.json"

    query_model = "bert-base-uncased"
    passage_model = "bert-base-uncased"

    save_dir = "../saved_models/dpr"

    # ## Option 2: Finetuning DPR
    #
    # # Here are the variables you might want to use instead of the set above
    # # in order to perform pretraining
    #
    # doc_dir = "PATH_TO_YOUR_DATA_DIR"
    # train_filename = "TRAIN_FILENAME"
    # dev_filename = "DEV_FILENAME"
    #
    # query_model = "facebook/dpr-question_encoder-single-nq-base"
    # passage_model = "facebook/dpr-ctx_encoder-single-nq-base"
    #
    # save_dir = "../saved_models/dpr"

    ## Initialize DPR model

    retriever = DensePassageRetriever(
        document_store=InMemoryDocumentStore(),
        query_embedding_model=query_model,
        passage_embedding_model=passage_model,
        max_seq_len_query=64,
        max_seq_len_passage=256
    )

    # Start training our model and save it when it is finished

    retriever.train(
        data_dir=doc_dir,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=dev_filename,
        n_epochs=1,
        batch_size=4,
        grad_acc_steps=4,
        save_dir=save_dir,
        evaluate_every=3000,
        embed_title=True,
        num_positives=1,
        num_hard_negatives=1
    )

    ## Loading

    reloaded_retriever = DensePassageRetriever.load(load_dir=save_dir, document_store=None)
Example #22
class DPRTrainingTester:
    """
    To run with an in memory sqlite database
    """
    document_store = FAISSDocumentStore(
        faiss_index_factory_str="Flat"
    )

    retreiver = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        # use_gpu=True,
        use_gpu=False,
        embed_title=True,
        use_fast_tokenizers=True
    )

    # Loads the test document into the document store
    def loadDocumentsFromFile(self, knowledgeFilePath):
        converter = TextConverter(
            remove_numeric_tables=False,
            valid_languages=["en"])
        processor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=False,
            clean_header_footer=True,
            split_by="passage",
            split_length=1,
            split_respect_sentence_boundary=False,
            split_overlap=0
        )
        self.trainingFile = knowledgeFilePath
        loadedFile = converter.convert(knowledgeFilePath)
        documents = processor.process(loadedFile)
        for i in range(0, len(documents)):
            docMetadata = documents[i]['meta']
            docMetadata['name'] = knowledgeFilePath
            docMetadata['documentID'] = knowledgeFilePath \
                + str(docMetadata['_split_id'])

        self.document_store.write_documents(documents)
        backagain = self.document_store.get_all_documents()

        print("Number of documents loaded", end=": ")
        print(self.document_store.get_document_count())

    def __init__(self, knowledgeFilePath):
        print("Started DPR Training tester!")

        # load test document into database
        print("Loading documents from " + knowledgeFilePath)
        self.document_store.delete_all_documents()
        self.loadDocumentsFromFile(knowledgeFilePath)

        # update dpr embeddings based on initial retreiver
        print("Performing initial embeddings update")
        self.document_store.update_embeddings(self.retreiver)

        # generate a new dprTrainingSet to populate
        self.trainingSet = DPRTrainingSet(self.document_store, 0)

    # return document store's id for the response marked correct
    def get_correct_id(self, responses, correctNum):
        # correctRespons = responses[correctNum].to_dict()
        return responses[correctNum].id

    # return list of document store ids for alternative responses
    def get_incorrect_ids(self, responses, correctNum):
        ids = []
        for i in range(0, len(responses)):
            if i == correctNum:
                continue
            ids.append(responses[i].id)
        return ids

    def askQuestion(self):
        print("------------------------------")
        question = input("Enter new question (DONE to finish): ")

        if question == "":
            return

        if question == 'DONE':
            self.generateTraining()
            return

        k = 10
        responses = self.retreiver.retrieve(question, top_k=k)

        print()
        for i in range(0, k):
            print(i, end=": ")
            print(responses[i].text)
            # print(responses[i])
            print()

        print()
        correctNum = input("Select correct response (X if none correct): ")

        if correctNum == "":
            return

        if correctNum == 'X':
            return

        print()
        print("------------------------------")

        self.trainingSet.addItem(
            question=question,
            posID=self.get_correct_id(responses, int(correctNum)),
            negIDs=self.get_incorrect_ids(responses, int(correctNum))
        )

    # file where all the training stuff is
    doc_dir = "data/"

    def generateTraining(self):
        self.trainingSet.addInBatchNegatives()

        self.trainingSet.generateJSON(self.trainingFile + "SET.json")
        print("New training set saved to: " + self.trainingFile + "SET.json")

        exit(0)

    def loop(self):
        self.askQuestion()
Example #23
class ManagerTester:
    """
    To run with a separate Postgres database
    """
    # docker run  -p 5432:5432 -e POSTGRES_PASSWORD=haystack -d postgres
    # document_store = FAISSDocumentStore(
    #     faiss_index_factory_str="Flat",
    #     sql_url="postgresql://*****:*****@localhost:5432"
    # )

    """
    To run with an in memory sqlite database
    """
    document_store = FAISSDocumentStore(
        faiss_index_factory_str="Flat"
    )

    retreiver = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        # use_gpu=True,
        use_gpu=False,
        embed_title=True,
        use_fast_tokenizers=True
    )

    # Loads the test document into the document store
    def loadDocumentsFromFile(self, knowledgeFilePath):
        converter = TextConverter(
            remove_numeric_tables=False,
            valid_languages=["en"])
        processor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by="passage",
            split_length=1,
            split_respect_sentence_boundary=False,
            split_overlap=0
        )
        loadedFile = converter.convert(knowledgeFilePath)
        documents = processor.process(loadedFile)
        for i in range(0, len(documents)):
            docMetadata = documents[i]['meta']
            docMetadata['name'] = knowledgeFilePath
            docMetadata['documentID'] = knowledgeFilePath \
                + str(docMetadata['_split_id'])

        self.document_store.write_documents(documents)
        backagain = self.document_store.get_all_documents()

        # for i in range(0,len(backagain)):
        #     print(i)
        #     print(":\n")
        #     print(backagain[i])
        #     print("---------------")

        print("Number of documents loaded", end=": ")
        print(self.document_store.get_document_count())

    def __init__(self, knowledgeFilePath):
        print("Started DPR Training tester!")

        # load test document into database
        print("Loading documents from " + knowledgeFilePath)
        self.document_store.delete_all_documents()
        self.loadDocumentsFromFile(knowledgeFilePath)

        # update dpr embeddings based on initial retreiver
        print("Performing initial embeddings update")
        self.document_store.update_embeddings(self.retreiver)

        self.trainingManager = DPRTrainingManager(self.document_store, 0)

    # return document store's id for the response marked correct
    def get_correct_id(self, responses, correctNum):
        # correctRespons = responses[correctNum].to_dict()
        return responses[correctNum].id

    # return list of document store ids for alternative responses
    def get_incorrect_ids(self, responses, correctNum):
        ids = []
        for i in range(0, len(responses)):
            if i == correctNum:
                continue
            ids.append(responses[i].id)
        return ids

    def askQuestion(self):
        print("------------------------------")
        question = input("Enter new question (T to run training): ")

        if question == 'T':
            self.train()
            return

        k = 5
        responses = self.retreiver.retrieve(question, top_k=k)

        print()
        for i in range(0, k):
            print(i, end=": ")
            print(responses[i].text)
            print()

        print()
        correctNum = input("Select correct response (X if none correct): ")

        if correctNum == 'X':
            return

        print()
        print("------------------------------")

        self.trainingManager.addItem(
            question=question,
            posID=self.get_correct_id(responses, int(correctNum)),
            negIDs=self.get_incorrect_ids(responses, int(correctNum))
        )

    def train(self):

        newModel = self.trainingManager.train()

        self.retreiver = DensePassageRetriever.load(
            document_store=self.document_store,
            load_dir=newModel,
            max_seq_len_query=64,
            max_seq_len_passage=256,
            batch_size=16,
            # use_gpu=True,
            use_gpu=False,
            embed_title=True,
            use_fast_tokenizers=True
        )

        self.document_store.update_embeddings(self.retreiver)

    def loop(self):
        self.askQuestion()
Example #24

document_store = ElasticsearchDocumentStore(
    host=DB_HOST,
    port=DB_PORT,
    username=DB_USER,
    password=DB_PW,
    index=DB_INDEX,
    scheme=ES_CONN_SCHEME,
    ca_certs=False,
    verify_certs=False,
    text_field=TEXT_FIELD_NAME,
    search_fields=SEARCH_FIELD_NAME,
    embedding_dim=EMBEDDING_DIM,
    embedding_field=EMBEDDING_FIELD_NAME,
    excluded_meta_data=EXCLUDE_META_DATA_FIELDS,  # type: ignore
    faq_question_field=FAQ_QUESTION_FIELD_NAME,
)

retriever = DensePassageRetriever(
    document_store=document_store,
    embedding_model=EMBEDDING_MODEL_PATH,
    do_lower_case=True,
    use_gpu=USE_GPU
)

@router.post("/update-embeddings")
def upload_file_to_document_store():
    document_store.update_embeddings(retriever)
    return "Successfully updated embeddings."
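# Hypothetical client call for the endpoint above (host, port, and the router's mount
# path are assumptions, not taken from the source):
#   import requests
#   requests.post("http://localhost:8000/update-embeddings")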
Example #25
# convert files to dicts containing documents that can be indexed to our datastore
got_dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)

# Initialize DocumentStore and index documents
launch_es()
document_store = ElasticsearchDocumentStore()
document_store.delete_all_documents()
document_store.write_documents(got_dicts)

# Initialize Sparse retriever
es_retriever = ElasticsearchRetriever(document_store=document_store)

# Initialize dense retriever
dpr_retriever = DensePassageRetriever(document_store)
document_store.update_embeddings(dpr_retriever,
                                 update_existing_embeddings=False)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

######################
# Prebuilt Pipelines #
######################

# Extractive QA Pipeline
########################

p_extractive_premade = ExtractiveQAPipeline(reader=reader,
                                            retriever=es_retriever)
res = p_extractive_premade.run(query="Who is the father of Arya Stark?",
                               top_k_retriever=10,
                               top_k_reader=5)
Example #26
# Let's first get some documents that we want to query
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts containing documents that can be indexed to our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir,
                               clean_func=clean_wiki_text,
                               split_paragraphs=True)

# Now, let's write the docs to our DB.
document_store.write_documents(dicts[:16])

### Retriever
retriever = DensePassageRetriever(document_store=document_store,
                                  embedding_model="dpr-bert-base-nq",
                                  do_lower_case=True,
                                  use_gpu=True)
# Important:
# Now that we have DPR initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.
# While this can be a time-consuming operation (depending on corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
document_store.update_embeddings(retriever)

### Reader
# Load a  local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                    use_gpu=True)

### Finder
Example #27
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts containing documents that can be indexed to our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir,
                               clean_func=clean_wiki_text,
                               split_paragraphs=True)

# Now, let's write the docs to our DB.
document_store.write_documents(dicts)

### Retriever
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
    remove_sep_tok_from_untitled_passages=True)

# Important:
# Now that we have DPR initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.
# While this can be a time-consuming operation (depending on corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
document_store.update_embeddings(retriever)

### Reader
# Load a  local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                    use_gpu=True)
Example #28
def tutorial11_pipelines():
    #Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    got_dicts = convert_files_to_dicts(dir_path=doc_dir,
                                       clean_func=clean_wiki_text,
                                       split_paragraphs=True)

    # Initialize DocumentStore and index documents
    launch_es()
    document_store = ElasticsearchDocumentStore()
    document_store.delete_all_documents()
    document_store.write_documents(got_dicts)

    # Initialize Sparse retriever
    es_retriever = ElasticsearchRetriever(document_store=document_store)

    # Initialize dense retriever
    dpr_retriever = DensePassageRetriever(document_store)
    document_store.update_embeddings(dpr_retriever,
                                     update_existing_embeddings=False)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

    ######################
    # Prebuilt Pipelines #
    ######################

    # Extractive QA Pipeline
    ########################

    p_extractive_premade = ExtractiveQAPipeline(reader=reader,
                                                retriever=es_retriever)
    res = p_extractive_premade.run(query="Who is the father of Arya Stark?",
                                   top_k_retriever=10,
                                   top_k_reader=5)
    print_answers(res, details="minimal")

    # Document Search Pipeline
    ##########################

    p_retrieval = DocumentSearchPipeline(es_retriever)
    res = p_retrieval.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10)
    print_documents(res, max_text_len=200)

    # Generator Pipeline
    ##########################

    # We set this to True so that the document store returns document embeddings
    # with each document; the Generator needs these embeddings
    document_store.return_embedding = True

    # Initialize generator
    rag_generator = RAGenerator()

    # Generative QA
    p_generator = GenerativeQAPipeline(generator=rag_generator,
                                       retriever=dpr_retriever)
    res = p_generator.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10)
    print_answers(res, details="minimal")

    # We are setting this to False so that in later pipelines,
    # we get a cleaner printout
    document_store.return_embedding = False

    ##############################
    # Creating Pipeline Diagrams #
    ##############################

    p_extractive_premade.draw("pipeline_extractive_premade.png")
    p_retrieval.draw("pipeline_retrieval.png")
    p_generator.draw("pipeline_generator.png")

    ####################
    # Custom Pipelines #
    ####################

    # Extractive QA Pipeline
    ########################

    # Custom built extractive QA pipeline
    p_extractive = Pipeline()
    p_extractive.add_node(component=es_retriever,
                          name="Retriever",
                          inputs=["Query"])
    p_extractive.add_node(component=reader,
                          name="Reader",
                          inputs=["Retriever"])

    # Now we can run it
    res = p_extractive.run(query="Who is the father of Arya Stark?",
                           top_k_retriever=10,
                           top_k_reader=5)
    print_answers(res, details="minimal")
    p_extractive.draw("pipeline_extractive.png")

    # Ensembled Retriever Pipeline
    ##############################

    # Create ensembled pipeline
    p_ensemble = Pipeline()
    p_ensemble.add_node(component=es_retriever,
                        name="ESRetriever",
                        inputs=["Query"])
    p_ensemble.add_node(component=dpr_retriever,
                        name="DPRRetriever",
                        inputs=["Query"])
    p_ensemble.add_node(component=JoinDocuments(join_mode="concatenate"),
                        name="JoinResults",
                        inputs=["ESRetriever", "DPRRetriever"])
    p_ensemble.add_node(component=reader,
                        name="Reader",
                        inputs=["JoinResults"])
    p_ensemble.draw("pipeline_ensemble.png")

    # Run pipeline
    res = p_ensemble.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=5  #This is top_k per retriever
    )
    print_answers(res, details="minimal")

    # Query Classification Pipeline
    ###############################

    # Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.
    # Though this looks very similar to the ensembled pipeline shown above,
    # the key difference is that only one of the retrievers is run for each request.
    # By contrast both retrievers are always run in the ensembled approach.

    class QueryClassifier():
        outgoing_edges = 2

        def run(self, **kwargs):
            if "?" in kwargs["query"]:
                return (kwargs, "output_2")
            else:
                return (kwargs, "output_1")

    # Here we build the pipeline
    p_classifier = Pipeline()
    p_classifier.add_node(component=QueryClassifier(),
                          name="QueryClassifier",
                          inputs=["Query"])
    p_classifier.add_node(component=es_retriever,
                          name="ESRetriever",
                          inputs=["QueryClassifier.output_1"])
    p_classifier.add_node(component=dpr_retriever,
                          name="DPRRetriever",
                          inputs=["QueryClassifier.output_2"])
    p_classifier.add_node(component=reader,
                          name="QAReader",
                          inputs=["ESRetriever", "DPRRetriever"])
    p_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = p_classifier.run(query="Who is the father of Arya Stark?",
                             top_k_retriever=10)
    print("DPR Results" + "\n" + "=" * 15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = p_classifier.run(query="Arya Stark father", top_k_retriever=10)
    print("ES Results" + "\n" + "=" * 15)
    print_answers(res_2)
Example #29
def tutorial14_query_classifier():

    #Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    got_dicts = convert_files_to_dicts(
        dir_path=doc_dir,
        clean_func=clean_wiki_text,
        split_paragraphs=True
    )

    # Initialize DocumentStore and index documents
    launch_es()
    document_store = ElasticsearchDocumentStore()
    document_store.delete_all_documents()
    document_store.write_documents(got_dicts)

    # Initialize Sparse retriever
    es_retriever = ElasticsearchRetriever(document_store=document_store)

    # Initialize dense retriever
    dpr_retriever = DensePassageRetriever(document_store)
    document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")


    # Here we build the pipeline
    sklearn_keyword_classifier = Pipeline()
    sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"])
    sklearn_keyword_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
    sklearn_keyword_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
    sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
    sklearn_keyword_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = sklearn_keyword_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = sklearn_keyword_classifier.run(
        query="arya stark father",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_2)

    # Run only the dense retriever on the full sentence query
    res_3 = sklearn_keyword_classifier.run(
        query="which country was jon snow filmed ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_3)

    # Run only the sparse retriever on a keyword based query
    res_4 = sklearn_keyword_classifier.run(
        query="jon snow country",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_4)

    # Run only the dense retriever on the full sentence query
    res_5 = sklearn_keyword_classifier.run(
        query="who are the younger brothers of arya stark ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_5)

    # Run only the sparse retriever on a keyword based query
    res_6 = sklearn_keyword_classifier.run(
        query="arya stark younger brothers",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_6)

    # Here we build the pipeline
    transformer_keyword_classifier = Pipeline()
    transformer_keyword_classifier.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"])
    transformer_keyword_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
    transformer_keyword_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
    transformer_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
    transformer_keyword_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = transformer_keyword_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = transformer_keyword_classifier.run(
        query="arya stark father",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_2)

    # Run only the dense retriever on the full sentence query
    res_3 = transformer_keyword_classifier.run(
        query="which country was jon snow filmed ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_3)

    # Run only the sparse retriever on a keyword based query
    res_4 = transformer_keyword_classifier.run(
        query="jon snow country",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_4)

    # Run only the dense retriever on the full sentence query
    res_5 = transformer_keyword_classifier.run(
        query="who are the younger brothers of arya stark ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_5)

    # Run only the sparse retriever on a keyword based query
    res_6 = transformer_keyword_classifier.run(
        query="arya stark younger brothers",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_6)

    # Here we build the pipeline
    transformer_question_classifier = Pipeline()
    transformer_question_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
    transformer_question_classifier.add_node(component=TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier"), name="QueryClassifier", inputs=["DPRRetriever"])
    transformer_question_classifier.add_node(component=reader, name="QAReader", inputs=["QueryClassifier.output_1"])
    transformer_question_classifier.draw("question_classifier.png")

    # Run only the QA reader on the question query
    res_1 = transformer_question_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_1)

    # Show only DPR results
    res_2 = transformer_question_classifier.run(
        query="Arya Stark was the daughter of a Lord.",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    res_2

    # Here we create the keyword vs question/statement query classifier

    queries = ["arya stark father","jon snow country",
               "who is the father of arya stark","which country was jon snow filmed?"]

    keyword_classifier = TransformersQueryClassifier()

    for query in queries:
        result = keyword_classifier.run(query=query)
        if result[1] == "output_1":
            category = "question/statement"
        else:
            category = "keyword"

        print(f"Query: {query}, raw_output: {result}, class: {category}")

    # Here we create the question vs statement query classifier

    queries = ["Lord Eddard was the father of Arya Stark.","Jon Snow was filmed in United Kingdom.",
               "who is the father of arya stark?","Which country was jon snow filmed in?"]

    question_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier")

    for query in queries:
        result = question_classifier.run(query=query)
        if result[1] == "output_1":
            category = "question"
        else:
            category = "statement"

        print(f"Query: {query}, raw_output: {result}, class: {category}")