Example #1
    def __init__(self):
        self.finder = Finder(
            reader=Reader(model_name_or_path=MODEL_PATH,
                          tokenizer=MODEL_PATH,
                          use_gpu=0),
            retriever=ElasticsearchRetriever(
                document_store=ElasticsearchDocumentStore(refresh_type='false')))

def set_finder(user_id_key):
    if user_settings[user_id_key]["model"] == "roberta":
        model_path = "deepset/roberta-base-squad2"  # model hosted on Hugging Face's model hub
    elif user_settings[user_id_key]["model"] == "bert":
        model_path = "deepset/bert-large-uncased-whole-word-masking-squad2"
    elif user_settings[user_id_key]["model"] == "distilbert":
        model_path = "distilbert-base-uncased-distilled-squad"
    else:
        model_path = "illuin/camembert-base-fquad"

    retriever = ElasticsearchRetriever(document_store=user_doc_store[user_id_key])

    if user_settings[user_id_key]["gpu"] == "on":
        try:
            reader = TransformersReader(
                model_name_or_path=model_path, tokenizer=model_path, use_gpu=0
            )
        except Exception as e:
            print(e)
            print("GPU not available. Inferencing on CPU")
            reader = TransformersReader(
                model_name_or_path=model_path, tokenizer=model_path, use_gpu=-1
            )

    else:
        reader = TransformersReader(
            model_name_or_path=model_path, tokenizer=model_path, use_gpu=-1
        )

    finder = Finder(reader, retriever)

    return finder
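
# Usage sketch (illustrative only, not part of the original snippet): assuming user_settings,
# user_doc_store and a user id key are already populated, the returned Finder answers questions
# via get_answers(), as in the other examples below.
# finder = set_finder("some_user_id")
# prediction = finder.get_answers(question="What is Haystack?", top_k_retriever=10, top_k_reader=3)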
Example #3
def test_eval_finder(document_store: BaseDocumentStore, reader):
    retriever = ElasticsearchRetriever(document_store=document_store)
    finder = Finder(reader=reader, retriever=retriever)

    # add eval data (SQUAD format)
    document_store.delete_all_documents(index="test_eval_document")
    document_store.delete_all_documents(index="test_feedback")
    document_store.add_eval_data(filename="samples/squad/tiny.json", doc_index="test_eval_document", label_index="test_feedback")
    assert document_store.get_document_count(index="test_eval_document") == 2

    # eval finder
    results = finder.eval(label_index="test_feedback", doc_index="test_eval_document", top_k_retriever=1, top_k_reader=5)
    assert results["retriever_recall"] == 1.0
    assert results["retriever_map"] == 1.0
    assert abs(results["reader_topk_f1"] - 0.66666) < 0.001
    assert abs(results["reader_topk_em"] - 0.5) < 0.001
    assert abs(results["reader_topk_accuracy"] - 1) < 0.001
    assert results["reader_top1_f1"] <= results["reader_topk_f1"]
    assert results["reader_top1_em"] <= results["reader_topk_em"]
    assert results["reader_top1_accuracy"] <= results["reader_topk_accuracy"]

    # batch eval finder
    results_batch = finder.eval_batch(label_index="test_feedback", doc_index="test_eval_document", top_k_retriever=1,
                          top_k_reader=5)
    assert results_batch["retriever_recall"] == 1.0
    assert results_batch["retriever_map"] == 1.0
    assert results_batch["reader_top1_f1"] == results["reader_top1_f1"]
    assert results_batch["reader_top1_em"] == results["reader_top1_em"]
    assert results_batch["reader_topk_accuracy"] == results["reader_topk_accuracy"]

    # clean up
    document_store.delete_all_documents(index="test_eval_document")
    document_store.delete_all_documents(index="test_feedback")
Example #4
def get_retriever(retriever_type, document_store):

    if retriever_type == "dpr":
        retriever = DensePassageRetriever(
            document_store=document_store,
            query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            use_gpu=False,
            embed_title=True)
    elif retriever_type == "tfidf":
        retriever = TfidfRetriever(document_store=document_store)
        retriever.fit()
    elif retriever_type == "embedding":
        retriever = EmbeddingRetriever(document_store=document_store,
                                       embedding_model="deepset/sentence_bert",
                                       use_gpu=False)
    elif retriever_type == "elasticsearch":
        retriever = ElasticsearchRetriever(document_store=document_store)
    elif retriever_type == "es_filter_only":
        retriever = ElasticsearchFilterOnlyRetriever(
            document_store=document_store)
    else:
        raise Exception(f"No retriever fixture for '{retriever_type}'")

    return retriever
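
# Usage sketch (illustrative only): this fixture-style factory is typically called with one of the
# supported names and an existing document store, e.g.
# retriever = get_retriever("elasticsearch", document_store)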
Example #5
def get_test_client_and_override_dependencies(reader, document_store_with_docs):
    from rest_api.application import app
    from rest_api.controller import search

    search.document_store = document_store_with_docs
    search.retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
    search.FINDERS = {1: Finder(reader=reader, retriever=search.retriever)}

    return TestClient(app)
Example #6
    def setup(self):
        print("SETTING UP PIPELINE")
        self.document_store = ElasticsearchDocumentStore(
            similarity="dot_product", host="elasticsearch", username="", password="", index="document")
        self.document_store_faiss = FAISSDocumentStore(
            index="document",
            faiss_index_factory_str="Flat",
            return_embedding=True,
            sql_url=f"postgresql://{config('POSTGRES_USER')}:{config('POSTGRES_PASSWORD')}@{config('POSTGRES_HOST')}:{config('POSTGRES_PORT')}/faiss"
        )
        processor, converter = self.write_as4_docs()
        table_data = self.write_table_docs(converter, processor)

        es_retriever = ElasticsearchRetriever(
            document_store=self.document_store)
        print("SETTING UP DPR")
        dpr_retriever = DPRTrainingManager.get_current_retriever(
            self.document_store_faiss)
        print("SETTING UP EMBEDDINGS")
        embedding_retriever = EmbeddingRetriever(
            document_store=self.document_store_faiss,
            embedding_model="deepset/sentence_bert"
        )
        query_classifier = QueryClassifier()
        print("SETTING UP TABLE")
        table_retriever = TableRetriever(table_data)
        print("SETUP RETRIEVERS")
        self.question_generator = FurtherQuestionGenerator()
        print("UPDATING EMBEDDINGS")
        self.document_store_faiss.update_embeddings(dpr_retriever)
        print("UPDATED EMBEDDINGS")
        self.dpr_node = ContinualDPRNode(
            dpr_retriever, self.document_store_faiss)
        result = Result()
        self.trainer = DPRTrainingManager(
            self.document_store_faiss, self.dpr_node)
        print("SETUP COMPONENTS")
        pipeline = Pipeline()
        pipeline.add_node(component=es_retriever,
                          name="ESRetriever", inputs=["Query"])
        pipeline.add_node(component=self.dpr_node,
                          name="DPRRetriever", inputs=["Query"])
        pipeline.add_node(component=embedding_retriever,
                          name="EmbeddingRetriever", inputs=["Query"])
        pipeline.add_node(component=JoinDocuments(join_mode="merge"), name="JoinResults", inputs=[
                          "DPRRetriever", "EmbeddingRetriever", "ESRetriever"])
        pipeline.add_node(component=query_classifier,
                          name="QueryClassifier", inputs=["JoinResults"])
        pipeline.add_node(component=self.question_generator,
                          name="QnGenerator", inputs=["QueryClassifier.output_1"])
        pipeline.add_node(component=table_retriever, name="TableRetriever", inputs=[
                          "QueryClassifier.output_2"])
        pipeline.add_node(component=result, name="Result", inputs=[
                          "QnGenerator", "TableRetriever"])
        self.pipeline = pipeline
        print("SETUP PIPELINE")
def test_elasticsearch_custom_query(elasticsearch_fixture):
    client = Elasticsearch()
    client.indices.delete(index='haystack_test_custom', ignore=[404])
    document_store = ElasticsearchDocumentStore(index="haystack_test_custom", text_field="custom_text_field",
                                                embedding_field="custom_embedding_field")
    documents = [
        {"text": "test_1", "meta": {"year": "2019"}},
        {"text": "test_2", "meta": {"year": "2020"}},
        {"text": "test_3", "meta": {"year": "2021"}},
        {"text": "test_4", "meta": {"year": "2021"}},
        {"text": "test_5", "meta": {"year": "2021"}},
    ]
    document_store.write_documents(documents)

    # test custom "terms" query
    retriever = ElasticsearchRetriever(
        document_store=document_store,
        custom_query="""
            {
                "size": 10, 
                "query": {
                    "bool": {
                        "should": [{
                            "multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}],
                            "filter": [{"terms": {"year": ${years}}}]}}}"""
    )
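    # The ${query} and ${years} placeholders in custom_query are filled in at request time from
    # the query string and the "years" entry of the filters dict passed to retriever.run() below.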
    results = retriever.run(query="test", filters={"years": ["2020", "2021"]})[0]["documents"]
    assert len(results) == 4

    # test custom "term" query
    retriever = ElasticsearchRetriever(
        document_store=document_store,
        custom_query="""
                {
                    "size": 10, 
                    "query": {
                        "bool": {
                            "should": [{
                                "multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}],
                                "filter": [{"term": {"year": ${years}}}]}}}"""
    )
    results = retriever.run(query="test", filters={"years": "2021"})[0]["documents"]
    assert len(results) == 3
Example #8
def get_retriever(retriever_name, doc_store):
    if retriever_name == "elastic":
        return ElasticsearchRetriever(doc_store)
    if retriever_name == "tfidf":
        return TfidfRetriever(doc_store)
    if retriever_name == "dpr":
        return DensePassageRetriever(document_store=doc_store,
                                      query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                      passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                      use_gpu=True)
Example #9
def test_join_document_pipeline(document_store_with_docs, reader):
    es = ElasticsearchRetriever(document_store=document_store_with_docs)
    dpr = DensePassageRetriever(
        document_store=document_store_with_docs,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
    )
    document_store_with_docs.update_embeddings(dpr)

    query = "Where does Carla live?"

    # test merge without weights
    join_node = JoinDocuments(join_mode="merge")
    p = Pipeline()
    p.add_node(component=es, name="R1", inputs=["Query"])
    p.add_node(component=dpr, name="R2", inputs=["Query"])
    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
    results = p.run(query=query)
    assert len(results["documents"]) == 3

    # test merge with weights
    join_node = JoinDocuments(join_mode="merge",
                              weights=[1000, 1],
                              top_k_join=2)
    p = Pipeline()
    p.add_node(component=es, name="R1", inputs=["Query"])
    p.add_node(component=dpr, name="R2", inputs=["Query"])
    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
    results = p.run(query=query)
    assert math.isclose(results["documents"][0].score,
                        0.5350644373470798,
                        rel_tol=0.0001)
    assert len(results["documents"]) == 2

    # test concatenate
    join_node = JoinDocuments(join_mode="concatenate")
    p = Pipeline()
    p.add_node(component=es, name="R1", inputs=["Query"])
    p.add_node(component=dpr, name="R2", inputs=["Query"])
    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
    results = p.run(query=query)
    assert len(results["documents"]) == 3

    # test join_node with reader
    join_node = JoinDocuments()
    p = Pipeline()
    p.add_node(component=es, name="R1", inputs=["Query"])
    p.add_node(component=dpr, name="R2", inputs=["Query"])
    p.add_node(component=join_node, name="Join", inputs=["R1", "R2"])
    p.add_node(component=reader, name="Reader", inputs=["Join"])
    results = p.run(query=query)
    # check whether the correct answer is within the top 2 predictions
    assert results["answers"][0]["answer"] == "Berlin" or results["answers"][1]["answer"] == "Berlin"
Example #10
    def __init__(self):
        self.document_store = ElasticsearchDocumentStore(
            host="localhost", username="", password="", index="document")
        self.retriever = ElasticsearchRetriever(document_store=self.document_store)
        self.reader = FARMReader(
            model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
        self.finder = Finder(self.reader, self.retriever)
        print('Ready')
Example #11
def process(document_store):

    logger = logging.getLogger(__name__)
    # # Connect to Elasticsearch
    # document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
    #
    # # write the docs to the DB.
    # document_store.write_documents(file)
    # ## Initialize Retriever, Reader, & Finder
    #
    # ### Retriever
    #
    # Retrievers help narrow down the scope for the Reader to smaller units of text where a given question
    # could be answered.
    #
    # They use simple but fast algorithms.
    # Elasticsearch's default BM25 algorithm is used here.
    retriever = ElasticsearchRetriever(document_store=document_store)

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based
    # on powerful, but slower deep learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and Transformers.
    # With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).
    # **Here:** a medium sized RoBERTa QA model using a Reader based on
    #           FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)
    # **Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or
    #                            "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=False,
                        context_window_size=500)
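
    # Alternative (sketch, mirroring the alternatives listed above): a TransformersReader can be
    # swapped in for the FARMReader; use_gpu=-1 runs inference on the CPU.
    # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad",
    #                             tokenizer="distilbert-base-uncased", use_gpu=-1)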

    # Some default pipes that can be chosen from
    # Extractive QA
    # qa_pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

    # Document Search
    # doc_pipe = DocumentSearchPipeline(retriever=retriever)

    # Generative QA
    # doc_pipe = GenerativeQAPipeline(generator=rag_generator, retriever=retriever)

    # FAQ based QA
    # doc_pipe = FAQPipeline(retriever=retriever)

    # p = FAQPipeline(retriever=retriever)

    p = ExtractiveQAPipeline(reader, retriever)
    query_Handler.pipe = p
Example #12
def test_elasticsearch_retrieval_filters(document_store_with_docs):
    retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
    res = retriever.retrieve(query="Who lives in Berlin?", filters={"name": ["filename1"]})
    assert res[0].text == "My name is Carla and I live in Berlin"
    assert len(res) == 1
    assert res[0].meta["name"] == "filename1"

    res = retriever.retrieve(query="Who lives in Berlin?", filters={"name":["filename1"], "meta_field": ["not_existing_value"]})
    assert len(res) == 0

    res = retriever.retrieve(query="Who lives in Berlin?", filters={"name":["filename1"], "not_existing_field": ["not_existing_value"]})
    assert len(res) == 0

    retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
    res = retriever.retrieve(query="Who lives in Berlin?", filters={"name":["filename1"], "meta_field": ["test1","test2"]})
    assert res[0].text == "My name is Carla and I live in Berlin"
    assert len(res) == 1
    assert res[0].meta["name"] == "filename1"

    retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
    res = retriever.retrieve(query="Who lives in Berlin?", filters={"name":["filename1"], "meta_field":["test2"]})
    assert len(res) == 0
Example #13
def qna():
    """Return the n answers."""

    question = request.form['question']
    # index is the target Elasticsearch index the query is sent to.
    index = request.form['index']

    # select the trained or the pretrained model
    mode = request.form['mode']

    # initialize the Haystack Elasticsearch document store
    document_store = ElasticsearchDocumentStore(
        host=app.config["host"],
        username=app.config["username"],
        password=app.config["password"],
        index=index)

    if mode == 'trained':
        # based on the search mode, use the fine-tuned model
        reader = FARMReader(model_name_or_path=app.config["train_model"],
                            use_gpu=False)
    else:
        # based on the search mode, use the pretrained model
        reader = FARMReader(
            model_name_or_path="distilbert-base-uncased-distilled-squad",
            use_gpu=False)

    # initialize the ElasticsearchRetriever
    retriever = ElasticsearchRetriever(document_store=document_store)
    # Finder sticks the reader and retriever together
    # in a pipeline to answer our actual questions.
    finder = Finder(reader, retriever)

    # predict n answers
    n = int(request.form['n'])
    prediction = finder.get_answers(question=question,
                                    top_k_retriever=10,
                                    top_k_reader=n)
    answer = []
    for res in prediction['answers']:
        answer.append(res['answer'])

    return json.dumps({
        'status': 'success',
        'message': 'Processed successfully',
        'result': answer
    })
Example #14
def test_eval_elastic_retriever(document_store: BaseDocumentStore, open_domain):
    retriever = ElasticsearchRetriever(document_store=document_store)

    # add eval data (SQUAD format)
    document_store.delete_all_documents(index="test_eval_document")
    document_store.delete_all_documents(index="test_feedback")
    document_store.add_eval_data(filename="samples/squad/tiny.json", doc_index="test_eval_document", label_index="test_feedback")
    assert document_store.get_document_count(index="test_eval_document") == 2

    # eval retriever
    results = retriever.eval(top_k=1, label_index="test_feedback", doc_index="test_eval_document", open_domain=open_domain)
    assert results["recall"] == 1.0
    assert results["map"] == 1.0

    # clean up
    document_store.delete_all_documents(index="test_eval_document")
    document_store.delete_all_documents(index="test_feedback")
Example #15
def get_retriever(retriever_name, doc_store):
    if retriever_name == "elastic":
        return ElasticsearchRetriever(doc_store)
    if retriever_name == "tfidf":
        return TfidfRetriever(doc_store)
    if retriever_name == "dpr":
        return DensePassageRetriever(
            document_store=doc_store,
            query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            use_gpu=True,
            use_fast_tokenizers=False)
    if retriever_name == "sentence_transformers":
        return EmbeddingRetriever(document_store=doc_store,
                                  embedding_model="nq-distilbert-base-v1",
                                  use_gpu=True,
                                  model_format="sentence_transformers")
Example #16
    def __init__(self, id, add_sample_data=False):
        Model.__init__(self, id)

        doc_store = ElasticsearchDocumentStore(host=DB_HOST,
                                               port=DB_PORT,
                                               index=self.id)
        retriever = ElasticsearchRetriever(document_store=doc_store)

        reader = FARMReader(
            model_name_or_path=READER_MODEL_PATH,
            batch_size=BATCHSIZE,
            use_gpu=False,
            num_processes=MAX_PROCESSES,
        )
        self.finder = Finder(reader, retriever)

        if add_sample_data:
            add_sample_data_doc_qa(self)

        reader.save(directory=READER_MODEL_PATH)
        print("saved")
Example #17
def launch_and_index_es(documents_dicts: List):
    es = Elasticsearch(['http://localhost:9200/'], verify_certs=True)
    if not es.ping():
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'
        ],
                                shell=True)
        if status.returncode:
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance, "
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        sleep(7)

    es.indices.delete(index='document', ignore=[400, 404])
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")
    document_store.write_documents(documents_dicts)
    retriever = ElasticsearchRetriever(document_store=document_store)
    return retriever
Example #18
def create_app(config_name):
    app = Flask(__name__)
    app.config.from_object(config[config_name])
    host = config[config_name].ELASTIC_URL
    port = config[config_name].ELASTIC_PORT
    index = config[config_name].ELASTIC_INDEX

    doc_store = ElasticsearchDocumentStore(host=host,
                                           username='',
                                           password='',
                                           index=index)

    retriever = ElasticsearchRetriever(document_store=doc_store)
    model_name = "deepset/roberta-base-squad2"
    reader = FARMReader(model_name_or_path=model_name,
                        num_processes=0,
                        use_gpu=False)
    app.finder = Finder(reader, retriever)

    from app.main import main as main_blueprint
    app.register_blueprint(main_blueprint)

    return app
Example #19
def tutorial11_pipelines():
    # Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    got_dicts = convert_files_to_dicts(dir_path=doc_dir,
                                       clean_func=clean_wiki_text,
                                       split_paragraphs=True)

    # Initialize DocumentStore and index documents
    launch_es()
    document_store = ElasticsearchDocumentStore()
    document_store.delete_all_documents()
    document_store.write_documents(got_dicts)

    # Initialize Sparse retriever
    es_retriever = ElasticsearchRetriever(document_store=document_store)

    # Initialize dense retriever
    dpr_retriever = DensePassageRetriever(document_store)
    document_store.update_embeddings(dpr_retriever,
                                     update_existing_embeddings=False)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

    ######################
    # Prebuilt Pipelines #
    ######################

    # Extractive QA Pipeline
    ########################

    p_extractive_premade = ExtractiveQAPipeline(reader=reader,
                                                retriever=es_retriever)
    res = p_extractive_premade.run(query="Who is the father of Arya Stark?",
                                   top_k_retriever=10,
                                   top_k_reader=5)
    print_answers(res, details="minimal")

    # Document Search Pipeline
    ##########################

    p_retrieval = DocumentSearchPipeline(es_retriever)
    res = p_retrieval.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10)
    print_documents(res, max_text_len=200)

    # Generator Pipeline
    ##########################

    # We set this to True so that the document store returns document embeddings
    # with each document, this is needed by the Generator
    document_store.return_embedding = True

    # Initialize generator
    rag_generator = RAGenerator()

    # Generative QA
    p_generator = GenerativeQAPipeline(generator=rag_generator,
                                       retriever=dpr_retriever)
    res = p_generator.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10)
    print_answers(res, details="minimal")

    # We are setting this to False so that in later pipelines,
    # we get a cleaner printout
    document_store.return_embedding = False

    ##############################
    # Creating Pipeline Diagrams #
    ##############################

    p_extractive_premade.draw("pipeline_extractive_premade.png")
    p_retrieval.draw("pipeline_retrieval.png")
    p_generator.draw("pipeline_generator.png")

    ####################
    # Custom Pipelines #
    ####################

    # Extractive QA Pipeline
    ########################

    # Custom built extractive QA pipeline
    p_extractive = Pipeline()
    p_extractive.add_node(component=es_retriever,
                          name="Retriever",
                          inputs=["Query"])
    p_extractive.add_node(component=reader,
                          name="Reader",
                          inputs=["Retriever"])

    # Now we can run it
    res = p_extractive.run(query="Who is the father of Arya Stark?",
                           top_k_retriever=10,
                           top_k_reader=5)
    print_answers(res, details="minimal")
    p_extractive.draw("pipeline_extractive.png")

    # Ensembled Retriever Pipeline
    ##############################

    # Create ensembled pipeline
    p_ensemble = Pipeline()
    p_ensemble.add_node(component=es_retriever,
                        name="ESRetriever",
                        inputs=["Query"])
    p_ensemble.add_node(component=dpr_retriever,
                        name="DPRRetriever",
                        inputs=["Query"])
    p_ensemble.add_node(component=JoinDocuments(join_mode="concatenate"),
                        name="JoinResults",
                        inputs=["ESRetriever", "DPRRetriever"])
    p_ensemble.add_node(component=reader,
                        name="Reader",
                        inputs=["JoinResults"])
    p_ensemble.draw("pipeline_ensemble.png")

    # Run pipeline
    res = p_ensemble.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=5  # This is top_k per retriever
    )
    print_answers(res, details="minimal")

    # Query Classification Pipeline
    ###############################

    # Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.
    # Though this looks very similar to the ensembled pipeline shown above,
    # the key difference is that only one of the retrievers is run for each request.
    # By contrast both retrievers are always run in the ensembled approach.

    class QueryClassifier():
        outgoing_edges = 2

        def run(self, **kwargs):
            if "?" in kwargs["query"]:
                return (kwargs, "output_2")
            else:
                return (kwargs, "output_1")

    # Here we build the pipeline
    p_classifier = Pipeline()
    p_classifier.add_node(component=QueryClassifier(),
                          name="QueryClassifier",
                          inputs=["Query"])
    p_classifier.add_node(component=es_retriever,
                          name="ESRetriever",
                          inputs=["QueryClassifier.output_1"])
    p_classifier.add_node(component=dpr_retriever,
                          name="DPRRetriever",
                          inputs=["QueryClassifier.output_2"])
    p_classifier.add_node(component=reader,
                          name="QAReader",
                          inputs=["ESRetriever", "DPRRetriever"])
    p_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = p_classifier.run(query="Who is the father of Arya Stark?",
                             top_k_retriever=10)
    print("DPR Results" + "\n" + "=" * 15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = p_classifier.run(query="Arya Stark father", top_k_retriever=10)
    print("ES Results" + "\n" + "=" * 15)
    print_answers(res_2)
Example #20
# dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
# # # print(dicts[:1])
# # # print(len(dicts))

from haystack.database.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="document")
# document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document",
#                                             embedding_field="embedding", embedding_dim=768)

# document_store.write_documents(dicts)

from haystack.retriever.sparse import ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)

# from haystack.retriever.dense import DensePassageRetriever
# retriever = DensePassageRetriever(document_store=document_store,
#                                   embedding_model="dpr-bert-base-nq",
#                                   do_lower_case=True, use_gpu=True)
# # document_store.update_embeddings(retriever)

# from haystack.retriever.dense import EmbeddingRetriever
# retriever = EmbeddingRetriever(document_store=document_store,
#                                embedding_model="deepset/sentence_bert",
#                                model_format="farm")

# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
# reader = TransformersReader(model="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
Example #21
from fastapi import FastAPI
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.reader.farm import FARMReader
from haystack.pipeline import ExtractiveQAPipeline

# initialize doc store, retriever and reader components
DOC_STORE = ElasticsearchDocumentStore(host='localhost',
                                       username='',
                                       password='',
                                       index='aurelius')
RETRIEVER = ElasticsearchRetriever(DOC_STORE)
READER = FARMReader(model_name_or_path='deepset/bert-base-cased-squad2',
                    context_window_size=1500,
                    use_gpu=True)
# initialize pipeline
PIPELINE = ExtractiveQAPipeline(reader=READER, retriever=RETRIEVER)
# initialize API
APP = FastAPI()


@APP.get('/query')
async def get_query(q: str, retriever_limit: int = 10, reader_limit: int = 3):
    """Makes query to doc store via Haystack pipeline.

    :param q: Query string representing the question being asked.
    :type q: str
    """
    # get answers
    return PIPELINE.run(query=q,
                        top_k_retriever=retriever_limit,
                        top_k_reader=reader_limit)
Example #22
def get_elastic_search_retriever(document_store):
    return ElasticsearchRetriever(document_store=document_store)
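
# Usage sketch (illustrative only): the returned retriever can then be queried directly, e.g.
# retriever = get_elastic_search_retriever(document_store)
# docs = retriever.retrieve(query="Who lives in Berlin?", top_k=10)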
Example #23
def tutorial1_basic_qa_pipeline():
    logger = logging.getLogger(__name__)

    LAUNCH_ELASTICSEARCH = True

    # ## Document Store
    #
    # Haystack finds answers to queries within the documents stored in a `DocumentStore`. The current implementations of
    # `DocumentStore` include `ElasticsearchDocumentStore`, `FAISSDocumentStore`, `SQLDocumentStore`, and `InMemoryDocumentStore`.
    #
    # **Here:** We recommend Elasticsearch as it comes preloaded with features like full-text queries, BM25 retrieval,
    # and vector storage for text embeddings.
    # **Alternatives:** If you are unable to set up an Elasticsearch instance, then follow Tutorial 3
    # for using SQL/InMemory document stores.
    # **Hint**:
    # This tutorial creates a new document store instance with Wikipedia articles on Game of Thrones. However, you can
    # configure Haystack to work with your existing document stores.
    #
    # Start an Elasticsearch server
    # You can start Elasticsearch on your local machine using Docker. If Docker is not readily available in
    # your environment (e.g., in Colab notebooks), then you can manually download and execute Elasticsearch from source.

    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'
        ],
                                shell=True)
        if status.returncode:
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance, "
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # Connect to Elasticsearch
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")

    # ## Preprocessing of documents
    #
    # Haystack provides a customizable pipeline for:
    # - converting files into texts
    # - cleaning texts
    # - splitting texts
    # - writing them to a Document Store

    # In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and
    # write them to Elasticsearch.

    # Let's first fetch some documents that we want to query
    # Here: 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
    # It must take a str as input, and return a str.
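    #
    # A minimal illustrative cleaning function (hypothetical footer marker, not part of the dataset)
    # that could be passed as clean_func above:
    # def remove_footer(text: str) -> str:
    #     return text.replace("-- End of article --", "")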

    # Now, let's write the docs to our DB.
    if LAUNCH_ELASTICSEARCH:
        document_store.write_documents(dicts)
    else:
        logger.warning(
            "Since we already have a running ES instance we should not index the same documents again. \n"
            "If you still want to do this call: document_store.write_documents(dicts) manually "
        )

    # ## Initialize Retriever, Reader, & Finder
    #
    # ### Retriever
    #
    # Retrievers help narrow down the scope for the Reader to smaller units of text where a given question
    # could be answered.
    #
    # They use simple but fast algorithms.
    # **Here:** We use Elasticsearch's default BM25 algorithm
    # **Alternatives:**
    # - Customize the `ElasticsearchRetriever` with custom queries (e.g. boosting) and filters
    # - Use `EmbeddingRetriever` to find candidate documents based on the similarity of
    #   embeddings (e.g. created via Sentence-BERT)
    # - Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging

    retriever = ElasticsearchRetriever(document_store=document_store)

    # Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick prototypes
    # with a SQLite document store.
    #
    # from haystack.retriever.tfidf import TfidfRetriever
    # retriever = TfidfRetriever(document_store=document_store)
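    #
    # Alternative: an EmbeddingRetriever that matches query and document embeddings
    # (e.g. created via Sentence-BERT), as used in other examples in this collection (sketch only):
    #
    # from haystack.retriever.dense import EmbeddingRetriever
    # retriever = EmbeddingRetriever(document_store=document_store,
    #                                embedding_model="deepset/sentence_bert",
    #                                use_gpu=False)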

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based
    # on powerful, but slower deep learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and Transformers.
    # With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).
    # **Here:** a medium sized RoBERTa QA model using a Reader based on
    #           FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)
    # **Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or
    #                            "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
    # **Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean
    #           the model prefers "no answer possible"
    #
    # #### FARMReader

    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)
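
    # Following the hint above (sketch, values are illustrative): no_ans_boost and return_no_answer
    # let the reader prefer and return "no answer possible" predictions.
    # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
    #                     use_gpu=True, no_ans_boost=0.5, return_no_answer=True)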

    # #### TransformersReader

    # Alternative:
    # reader = TransformersReader(
    #    model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # ### Pipeline
    #
    # With a Haystack `Pipeline`, you can stick your building blocks together into a search pipeline.
    # Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    # To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
    # You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
Example #24
def tutorial14_query_classifier():

    # Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    got_dicts = convert_files_to_dicts(
        dir_path=doc_dir,
        clean_func=clean_wiki_text,
        split_paragraphs=True
    )

    # Initialize DocumentStore and index documents
    launch_es()
    document_store = ElasticsearchDocumentStore()
    document_store.delete_all_documents()
    document_store.write_documents(got_dicts)

    # Initialize Sparse retriever
    es_retriever = ElasticsearchRetriever(document_store=document_store)

    # Initialize dense retriever
    dpr_retriever = DensePassageRetriever(document_store)
    document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")


    # Here we build the pipeline
    sklearn_keyword_classifier = Pipeline()
    sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"])
    sklearn_keyword_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
    sklearn_keyword_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
    sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
    sklearn_keyword_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = sklearn_keyword_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = sklearn_keyword_classifier.run(
        query="arya stark father",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_2)

    # Run only the dense retriever on the full sentence query
    res_3 = sklearn_keyword_classifier.run(
        query="which country was jon snow filmed ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_3)

    # Run only the sparse retriever on a keyword based query
    res_4 = sklearn_keyword_classifier.run(
        query="jon snow country",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_4)

    # Run only the dense retriever on the full sentence query
    res_5 = sklearn_keyword_classifier.run(
        query="who are the younger brothers of arya stark ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_5)

    # Run only the sparse retriever on a keyword based query
    res_6 = sklearn_keyword_classifier.run(
        query="arya stark younger brothers",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_6)

    # Here we build the pipeline
    transformer_keyword_classifier = Pipeline()
    transformer_keyword_classifier.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"])
    transformer_keyword_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
    transformer_keyword_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
    transformer_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
    transformer_keyword_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = transformer_keyword_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = transformer_keyword_classifier.run(
        query="arya stark father",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_2)

    # Run only the dense retriever on the full sentence query
    res_3 = transformer_keyword_classifier.run(
        query="which country was jon snow filmed ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_3)

    # Run only the sparse retriever on a keyword based query
    res_4 = transformer_keyword_classifier.run(
        query="jon snow country",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_4)

    # Run only the dense retriever on the full sentence query
    res_5 = transformer_keyword_classifier.run(
        query="who are the younger brothers of arya stark ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_5)

    # Run only the sparse retriever on a keyword based query
    res_6 = transformer_keyword_classifier.run(
        query="arya stark younger brothers",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_6)

    # Here we build the pipeline
    transformer_question_classifier = Pipeline()
    transformer_question_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
    transformer_question_classifier.add_node(component=TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier"), name="QueryClassifier", inputs=["DPRRetriever"])
    transformer_question_classifier.add_node(component=reader, name="QAReader", inputs=["QueryClassifier.output_1"])
    transformer_question_classifier.draw("question_classifier.png")

    # Run only the QA reader on the question query
    res_1 = transformer_question_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_1)

    # Show only DPR results
    res_2 = transformer_question_classifier.run(
        query="Arya Stark was the daughter of a Lord.",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print(res_2)

    # Here we create the keyword vs question/statement query classifier

    queries = ["arya stark father","jon snow country",
               "who is the father of arya stark","which country was jon snow filmed?"]

    keyword_classifier = TransformersQueryClassifier()

    for query in queries:
        result = keyword_classifier.run(query=query)
        if result[1] == "output_1":
            category = "question/statement"
        else:
            category = "keyword"

        print(f"Query: {query}, raw_output: {result}, class: {category}")

    # Here we create the question vs statement query classifier

    queries = ["Lord Eddard was the father of Arya Stark.","Jon Snow was filmed in United Kingdom.",
               "who is the father of arya stark?","Which country was jon snow filmed in?"]

    question_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier")

    for query in queries:
        result = question_classifier.run(query=query)
        if result[1] == "output_1":
            category = "question"
        else:
            category = "statement"

        print(f"Query: {query}, raw_output: {result}, class: {category}")
Example #25
def tutorial5_evaluation():

    ##############################################
    # Settings
    ##############################################
    # Choose from Evaluation style from ['retriever_closed', 'reader_closed', 'retriever_reader_open']
    # 'retriever_closed' - evaluates only the retriever, based on whether the gold_label document is retrieved.
    # 'reader_closed' - evaluates only the reader in a closed domain fashion i.e. the reader is given one query
    #     and one document and metrics are calculated on whether the right position in this text is selected by
    #     the model as the answer span (i.e. SQuAD style)
    # 'retriever_reader_open' - evaluates retriever and reader in open domain fashion i.e. a document is considered
    #     correctly retrieved if it contains the answer string within it. The reader is evaluated based purely on the
    #     predicted string, regardless of which document this came from and the position of the extracted span.
    style = "retriever_reader_open"

    # make sure these indices do not collide with existing ones; they will be wiped clean before data is inserted
    doc_index = "tutorial5_docs"
    label_index = "tutorial5_labels"

    ##############################################
    # Code
    ##############################################
    launch_es()
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Download evaluation data, which is a subset of Natural Questions development set containing 50 documents
    doc_dir = "../data/nq"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Connect to Elasticsearch
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document",
                                                create_index=False,
                                                embedding_field="emb",
                                                embedding_dim=768,
                                                excluded_meta_data=["emb"])

    # Add evaluation data to Elasticsearch document store
    # We first delete the custom tutorial indices to not have duplicate elements
    # and also split our documents into shorter passages using the PreProcessor
    preprocessor = PreProcessor(split_by="word",
                                split_length=500,
                                split_overlap=0,
                                split_respect_sentence_boundary=False,
                                clean_empty_lines=False,
                                clean_whitespace=False)
    document_store.delete_all_documents(index=doc_index)
    document_store.delete_all_documents(index=label_index)
    document_store.add_eval_data(filename="../data/nq/nq_dev_subset_v2.json",
                                 doc_index=doc_index,
                                 label_index=label_index,
                                 preprocessor=preprocessor)

    # Let's prepare the labels that we need for the retriever and the reader
    labels = document_store.get_all_labels_aggregated(index=label_index)

    # Initialize Retriever
    retriever = ElasticsearchRetriever(document_store=document_store)

    # Alternative: Evaluate DensePassageRetriever
    # Note, that DPR works best when you index short passages < 512 tokens as only those tokens will be used for the embedding.
    # Here, for nq_dev_subset_v2.json we have avg. num of tokens = 5220(!).
    # DPR still outperforms Elastic's BM25 by a small margin here.
    # retriever = DensePassageRetriever(document_store=document_store,
    #                                   query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    #                                   passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    #                                   use_gpu=True,
    #                                   embed_title=True,
    #                                   remove_sep_tok_from_untitled_passages=True)
    # document_store.update_embeddings(retriever, index=doc_index)

    # Initialize Reader
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        top_k=4,
                        return_no_answer=True)

    # Here we initialize the nodes that perform evaluation
    eval_retriever = EvalDocuments()
    eval_reader = EvalAnswers(
        sas_model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    )

    ## Evaluate Retriever on its own in closed domain fashion
    if style == "retriever_closed":
        retriever_eval_results = retriever.eval(top_k=10,
                                                label_index=label_index,
                                                doc_index=doc_index)
        ## Retriever Recall is the proportion of questions for which the correct document containing the answer is
        ## among the retrieved documents
        print("Retriever Recall:", retriever_eval_results["recall"])
        ## Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank
        print("Retriever Mean Avg Precision:", retriever_eval_results["map"])

    # Evaluate Reader on its own in closed domain fashion (i.e. SQuAD style)
    elif style == "reader_closed":
        reader_eval_results = reader.eval(document_store=document_store,
                                          device=device,
                                          label_index=label_index,
                                          doc_index=doc_index)
        # Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch
        #reader_eval_results = reader.eval_on_file("../data/nq", "nq_dev_subset_v2.json", device=device)

        ## Reader Top-N-Accuracy is the proportion of predicted answers that match their corresponding correct answer
        print("Reader Top-N-Accuracy:", reader_eval_results["top_n_accuracy"])
        ## Reader Exact Match is the proportion of questions where the predicted answer is exactly the same as the correct answer
        print("Reader Exact Match:", reader_eval_results["EM"])
        ## Reader F1-Score is the average overlap between the predicted answers and the correct answers
        print("Reader F1-Score:", reader_eval_results["f1"])

    # Evaluate combination of Reader and Retriever in open domain fashion
    elif style == "retriever_reader_open":

        # Here is the pipeline definition
        p = Pipeline()
        p.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
        p.add_node(component=eval_retriever,
                   name="EvalDocuments",
                   inputs=["ESRetriever"])
        p.add_node(component=reader, name="QAReader", inputs=["EvalDocuments"])
        p.add_node(component=eval_reader,
                   name="EvalAnswers",
                   inputs=["QAReader"])
        results = []

        for l in labels:
            res = p.run(
                query=l.question,
                top_k_retriever=10,
                labels=l,
                top_k_reader=10,
                index=doc_index,
            )
            results.append(res)

        eval_retriever.print()
        print()
        retriever.print_time()
        print()
        eval_reader.print(mode="reader")
        print()
        reader.print_time()
        print()
        eval_reader.print(mode="pipeline")
    else:
        raise ValueError(
            f'style={style} is not a valid option. Choose from retriever_closed, reader_closed, retriever_reader_open'
        )
Example #26
def main():

    launch_es()

    document_store = ElasticsearchDocumentStore()
    es_retriever = ElasticsearchRetriever(document_store=document_store)
    eval_retriever = EvalRetriever(open_domain=open_domain)
    reader = FARMReader("deepset/roberta-base-squad2",
                        top_k_per_candidate=4,
                        num_processes=1,
                        return_no_answer=True)
    eval_reader = EvalReader(debug=True, open_domain=open_domain)

    # Download evaluation data, which is a subset of Natural Questions development set containing 50 documents
    doc_dir = "../data/nq"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Add evaluation data to Elasticsearch document store
    # We first delete the custom tutorial indices to not have duplicate elements
    preprocessor = PreProcessor(split_length=500,
                                split_overlap=0,
                                split_respect_sentence_boundary=False,
                                clean_empty_lines=False,
                                clean_whitespace=False)
    document_store.delete_all_documents(index=doc_index)
    document_store.delete_all_documents(index=label_index)
    document_store.add_eval_data(filename="../data/nq/nq_dev_subset_v2.json",
                                 doc_index=doc_index,
                                 label_index=label_index,
                                 preprocessor=preprocessor)
    labels = document_store.get_all_labels_aggregated(index=label_index)
    q_to_l_dict = {l.question: {"retriever": l, "reader": l} for l in labels}

    # Here is the pipeline definition
    p = Pipeline()
    p.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
    p.add_node(component=eval_retriever,
               name="EvalRetriever",
               inputs=["ESRetriever"])
    p.add_node(component=reader, name="QAReader", inputs=["EvalRetriever"])
    p.add_node(component=eval_reader, name="EvalReader", inputs=["QAReader"])

    results = []
    for i, (q, l) in enumerate(q_to_l_dict.items()):
        res = p.run(
            query=q,
            top_k_retriever=top_k_retriever,
            labels=l,
            top_k_reader=10,
            index=doc_index,
            # skip_incorrect_retrieval=True
        )
        results.append(res)

    eval_retriever.print()
    print()
    es_retriever.print_time()
    print()
    eval_reader.print(mode="reader")
    print()
    reader.print_time()
    print()
    eval_reader.print(mode="pipeline")
Example #27
def test_elasticsearch_retrieval(document_store_with_docs):
    retriever = ElasticsearchRetriever(document_store=document_store_with_docs)
    res = retriever.retrieve(query="Who lives in Berlin?")
    assert res[0].text == "My name is Carla and I live in Berlin"
    assert len(res) == 3
    assert res[0].meta["name"] == "filename1"

    def setRetriever(self, documentStore):
        return ElasticsearchRetriever(document_store=documentStore)