Beispiel #1
0
def test_generator_pipeline(document_store, retriever, rag_generator):
    document_store.write_documents(DOCS_WITH_EMBEDDINGS)
    query = "What is capital of the Germany?"
    pipeline = GenerativeQAPipeline(retriever=retriever, generator=rag_generator)
    output = pipeline.run(query=query, top_k_generator=2, top_k_retriever=1)
    answers = output["answers"]
    assert len(answers) == 2
    assert "berlin" in answers[0]["answer"]
Beispiel #2
0
def test_lfqa_pipeline(document_store, retriever, eli5_generator):
    # reuse existing DOCS but regenerate embeddings with retribert
    docs: List[Document] = []
    for idx, d in enumerate(DOCS_WITH_EMBEDDINGS):
        docs.append(Document(d.text, str(idx)))
    document_store.write_documents(docs)
    document_store.update_embeddings(retriever)
    query = "Tell me about Berlin?"
    pipeline = GenerativeQAPipeline(retriever=retriever,
                                    generator=eli5_generator)
    output = pipeline.run(query=query, top_k_generator=1, top_k_retriever=1)
    answers = output["answers"]
    assert len(answers) == 1
    assert "Germany" in answers[0]
Beispiel #3
0
def test_lfqa_pipeline_unknown_converter(document_store, retriever):
    # reuse existing DOCS but regenerate embeddings with retribert
    docs: List[Document] = []
    for idx, d in enumerate(DOCS_WITH_EMBEDDINGS):
        docs.append(Document(d.text, str(idx)))
    document_store.write_documents(docs)
    document_store.update_embeddings(retriever)
    seq2seq = Seq2SeqGenerator(
        model_name_or_path="patrickvonplaten/t5-tiny-random")
    query = "Tell me about Berlin?"
    pipeline = GenerativeQAPipeline(retriever=retriever, generator=seq2seq)

    # raises exception as we don't have converter for "patrickvonplaten/t5-tiny-random" in Seq2SeqGenerator
    with pytest.raises(Exception):
        output = pipeline.run(query=query,
                              top_k_generator=1,
                              top_k_retriever=1)
Beispiel #4
0
def test_lfqa_pipeline_invalid_converter(document_store, retriever):
    # reuse existing DOCS but regenerate embeddings with retribert
    docs: List[Document] = []
    for idx, d in enumerate(DOCS_WITH_EMBEDDINGS):
        docs.append(Document(d.text, str(idx)))
    document_store.write_documents(docs)
    document_store.update_embeddings(retriever)

    class _InvalidConverter:
        def __call__(self, some_invalid_para: str,
                     another_invalid_param: str) -> None:
            pass

    seq2seq = Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5",
                               input_converter=_InvalidConverter())
    query = "This query will fail due to InvalidConverter used"
    pipeline = GenerativeQAPipeline(retriever=retriever, generator=seq2seq)

    # raises exception as we are using invalid method signature in _InvalidConverter
    with pytest.raises(Exception):
        output = pipeline.run(query=query,
                              top_k_generator=1,
                              top_k_retriever=1)
Beispiel #5
0
def test_generator_pipeline_with_translator(document_store, retriever,
                                            rag_generator, en_to_de_translator,
                                            de_to_en_translator):
    document_store.write_documents(DOCS_WITH_EMBEDDINGS)
    query = "Was ist die Hauptstadt der Bundesrepublik Deutschland?"
    base_pipeline = GenerativeQAPipeline(retriever=retriever,
                                         generator=rag_generator)
    pipeline = TranslationWrapperPipeline(
        input_translator=de_to_en_translator,
        output_translator=en_to_de_translator,
        pipeline=base_pipeline)
    output = pipeline.run(query=query, top_k_generator=2, top_k_retriever=1)
    answers = output["answers"]
    assert len(answers) == 2
    assert "berlin" in answers[0]["answer"]
Beispiel #6
0
res = p_retrieval.run(query="Who is the father of Arya Stark?",
                      top_k_retriever=10)
print_documents(res, max_text_len=200)

# Generator Pipeline
##########################

# We set this to True so that the document store returns document embeddings
# with each document, this is needed by the Generator
document_store.return_embedding = True

# Initialize generator
rag_generator = RAGenerator()

# Generative QA
p_generator = GenerativeQAPipeline(generator=rag_generator,
                                   retriever=dpr_retriever)
res = p_generator.run(query="Who is the father of Arya Stark?",
                      top_k_retriever=10)
print_answers(res, details="minimal")

# We are setting this to False so that in later pipelines,
# we get a cleaner printout
document_store.return_embedding = False

##############################
# Creating Pipeline Diagrams #
##############################

p_extractive_premade.draw("pipeline_extractive_premade.png")
p_retrieval.draw("pipeline_retrieval.png")
p_generator.draw("pipeline_generator.png")
def tutorial7_rag_generator():
    # Add documents from which you want generate answers
    # Download a csv containing some sample documents data
    # Here some sample documents data
    temp = requests.get("https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv")
    open('small_generator_dataset.csv', 'wb').write(temp.content)

    # Get dataframe with columns "title", and "text"
    df = pd.read_csv("small_generator_dataset.csv", sep=',')
    # Minimal cleaning
    df.fillna(value="", inplace=True)

    print(df.head())

    titles = list(df["title"].values)
    texts = list(df["text"].values)

    # Create to haystack document format
    documents: List[Document] = []
    for title, text in zip(titles, texts):
        documents.append(
            Document(
                text=text,
                meta={
                    "name": title or ""
                }
            )
        )


    # Initialize FAISS document store to documents and corresponding index for embeddings
    # Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding
    document_store = FAISSDocumentStore(
        faiss_index_factory_str="Flat",
        return_embedding=True
    )

    # Initialize DPR Retriever to encode documents, encode question and query documents
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=True,
        embed_title=True,
    )

    # Initialize RAG Generator
    generator = RAGenerator(
        model_name_or_path="facebook/rag-token-nq",
        use_gpu=True,
        top_k=1,
        max_length=200,
        min_length=2,
        embed_title=True,
        num_beams=2,
    )

    # Delete existing documents in documents store
    document_store.delete_all_documents()
    # Write documents to document store
    document_store.write_documents(documents)
    # Add documents embeddings to index
    document_store.update_embeddings(
        retriever=retriever
    )

    # Now ask your questions
    # We have some sample questions
    QUESTIONS = [
        "who got the first nobel prize in physics",
        "when is the next deadpool movie being released",
        "which mode is used for short wave broadcast service",
        "who is the owner of reading football club",
        "when is the next scandal episode coming out",
        "when is the last time the philadelphia won the superbowl",
        "what is the most current adobe flash player version",
        "how many episodes are there in dragon ball z",
        "what is the first step in the evolution of the eye",
        "where is gall bladder situated in human body",
        "what is the main mineral in lithium batteries",
        "who is the president of usa right now",
        "where do the greasers live in the outsiders",
        "panda is a national animal of which country",
        "what is the name of manchester united stadium",
    ]

    # Now generate answer for question
    for question in QUESTIONS:
        # Retrieve related documents from retriever
        retriever_results = retriever.retrieve(
            query=question
        )

        # Now generate answer from question and retrieved documents
        predicted_result = generator.predict(
            query=question,
            documents=retriever_results,
            top_k=1
        )

        # Print you answer
        answers = predicted_result["answers"]
        print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')

    # Or alternatively use the Pipeline class
    from haystack.pipeline import GenerativeQAPipeline
    pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)
    for question in QUESTIONS:
        res = pipe.run(query=question, top_k_generator=1, top_k_retriever=5)
        print(res)
Beispiel #8
0
def tutorial12_lfqa():
    """
    Document Store:
    FAISS is a library for efficient similarity search on a cluster of dense vectors.
    The `FAISSDocumentStore` uses a SQL(SQLite in-memory be default) database under-the-hood
    to store the document text and other meta data. The vector embeddings of the text are
    indexed on a FAISS Index that later is queried for searching answers.
    The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for
    faster search at the expense of some accuracy. Just set the faiss_index_factor_str argument in the constructor.
    For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    """

    from haystack.document_store.faiss import FAISSDocumentStore

    document_store = FAISSDocumentStore(vector_dim=128,
                                        faiss_index_factory_str="Flat")
    """
    Cleaning & indexing documents:
    Similarly to the previous tutorials, we download, convert and index some Game of Thrones articles to our DocumentStore
    """

    # Let's first get some files that we want to use
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert files to dicts
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)

    # Now, let's write the dicts containing documents to our DB.
    document_store.write_documents(dicts)
    """
    Initalize Retriever and Reader/Generator:
    We use a `RetribertRetriever` and we invoke `update_embeddings` to index the embeddings of documents in the `FAISSDocumentStore`
    """

    from haystack.retriever.dense import EmbeddingRetriever

    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model="yjernite/retribert-base-uncased",
        model_format="retribert")

    document_store.update_embeddings(retriever)
    """Before we blindly use the `RetribertRetriever` let's empirically test it to make sure a simple search indeed finds the relevant documents."""

    from haystack.utils import print_answers, print_documents
    from haystack.pipeline import DocumentSearchPipeline

    p_retrieval = DocumentSearchPipeline(retriever)
    res = p_retrieval.run(query="Tell me something about Arya Stark?",
                          top_k_retriever=5)
    print_documents(res, max_text_len=512)
    """
    Similar to previous Tutorials we now initalize our reader/generator.
    Here we use a `Seq2SeqGenerator` with the *yjernite/bart_eli5* model (see: https://huggingface.co/yjernite/bart_eli5)
    """

    generator = Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5")
    """
    Pipeline:
    With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
    Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `GenerativeQAPipeline` that combines a retriever and a reader/generator to answer our questions.
    You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    """

    from haystack.pipeline import GenerativeQAPipeline
    pipe = GenerativeQAPipeline(generator, retriever)
    """Voilà! Ask a question!"""

    query_1 = "Why did Arya Stark's character get portrayed in a television adaptation?"
    result_1 = pipe.run(query=query_1, top_k_retriever=1)
    print(f"Query: {query_1}")
    print(f"Answer: {result_1['answers'][0]}")
    print()

    query_2 = "What kind of character does Arya Stark play?"
    result_2 = pipe.run(query=query_2, top_k_retriever=1)
    print(f"Query: {query_2}")
    print(f"Answer: {result_2['answers'][0]}")
    print()
    pipe.run(query=query_2, top_k_retriever=1)
Beispiel #9
0
def tutorial11_pipelines():
    #Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    got_dicts = convert_files_to_dicts(dir_path=doc_dir,
                                       clean_func=clean_wiki_text,
                                       split_paragraphs=True)

    # Initialize DocumentStore and index documents
    launch_es()
    document_store = ElasticsearchDocumentStore()
    document_store.delete_all_documents()
    document_store.write_documents(got_dicts)

    # Initialize Sparse retriever
    es_retriever = ElasticsearchRetriever(document_store=document_store)

    # Initialize dense retriever
    dpr_retriever = DensePassageRetriever(document_store)
    document_store.update_embeddings(dpr_retriever,
                                     update_existing_embeddings=False)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

    ######################
    # Prebuilt Pipelines #
    ######################

    # Extractive QA Pipeline
    ########################

    p_extractive_premade = ExtractiveQAPipeline(reader=reader,
                                                retriever=es_retriever)
    res = p_extractive_premade.run(query="Who is the father of Arya Stark?",
                                   top_k_retriever=10,
                                   top_k_reader=5)
    print_answers(res, details="minimal")

    # Document Search Pipeline
    ##########################

    p_retrieval = DocumentSearchPipeline(es_retriever)
    res = p_retrieval.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10)
    print_documents(res, max_text_len=200)

    # Generator Pipeline
    ##########################

    # We set this to True so that the document store returns document embeddings
    # with each document, this is needed by the Generator
    document_store.return_embedding = True

    # Initialize generator
    rag_generator = RAGenerator()

    # Generative QA
    p_generator = GenerativeQAPipeline(generator=rag_generator,
                                       retriever=dpr_retriever)
    res = p_generator.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10)
    print_answers(res, details="minimal")

    # We are setting this to False so that in later pipelines,
    # we get a cleaner printout
    document_store.return_embedding = False

    ##############################
    # Creating Pipeline Diagrams #
    ##############################

    p_extractive_premade.draw("pipeline_extractive_premade.png")
    p_retrieval.draw("pipeline_retrieval.png")
    p_generator.draw("pipeline_generator.png")

    ####################
    # Custom Pipelines #
    ####################

    # Extractive QA Pipeline
    ########################

    # Custom built extractive QA pipeline
    p_extractive = Pipeline()
    p_extractive.add_node(component=es_retriever,
                          name="Retriever",
                          inputs=["Query"])
    p_extractive.add_node(component=reader,
                          name="Reader",
                          inputs=["Retriever"])

    # Now we can run it
    res = p_extractive.run(query="Who is the father of Arya Stark?",
                           top_k_retriever=10,
                           top_k_reader=5)
    print_answers(res, details="minimal")
    p_extractive.draw("pipeline_extractive.png")

    # Ensembled Retriever Pipeline
    ##############################

    # Create ensembled pipeline
    p_ensemble = Pipeline()
    p_ensemble.add_node(component=es_retriever,
                        name="ESRetriever",
                        inputs=["Query"])
    p_ensemble.add_node(component=dpr_retriever,
                        name="DPRRetriever",
                        inputs=["Query"])
    p_ensemble.add_node(component=JoinDocuments(join_mode="concatenate"),
                        name="JoinResults",
                        inputs=["ESRetriever", "DPRRetriever"])
    p_ensemble.add_node(component=reader,
                        name="Reader",
                        inputs=["JoinResults"])
    p_ensemble.draw("pipeline_ensemble.png")

    # Run pipeline
    res = p_ensemble.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=5  #This is top_k per retriever
    )
    print_answers(res, details="minimal")

    # Query Classification Pipeline
    ###############################

    # Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.
    # Though this looks very similar to the ensembled pipeline shown above,
    # the key difference is that only one of the retrievers is run for each request.
    # By contrast both retrievers are always run in the ensembled approach.

    class QueryClassifier():
        outgoing_edges = 2

        def run(self, **kwargs):
            if "?" in kwargs["query"]:
                return (kwargs, "output_2")
            else:
                return (kwargs, "output_1")

    # Here we build the pipeline
    p_classifier = Pipeline()
    p_classifier.add_node(component=QueryClassifier(),
                          name="QueryClassifier",
                          inputs=["Query"])
    p_classifier.add_node(component=es_retriever,
                          name="ESRetriever",
                          inputs=["QueryClassifier.output_1"])
    p_classifier.add_node(component=dpr_retriever,
                          name="DPRRetriever",
                          inputs=["QueryClassifier.output_2"])
    p_classifier.add_node(component=reader,
                          name="QAReader",
                          inputs=["ESRetriever", "DPRRetriever"])
    p_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = p_classifier.run(query="Who is the father of Arya Stark?",
                             top_k_retriever=10)
    print("DPR Results" + "\n" + "=" * 15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = p_classifier.run(query="Arya Stark father", top_k_retriever=10)
    print("ES Results" + "\n" + "=" * 15)
    print_answers(res_2)