def test_get_document_count_only_documents_without_embedding_arg():
    documents = [
        {
            "text": "text1",
            "id": "1",
            "embedding": np.random.rand(768).astype(np.float32),
            "meta_field_for_count": "a"
        },
        {
            "text": "text2",
            "id": "2",
            "embedding": np.random.rand(768).astype(np.float64),
            "meta_field_for_count": "b"
        },
        {
            "text": "text3",
            "id": "3",
            "embedding": np.random.rand(768).astype(np.float32).tolist()
        },
        {
            "text": "text4",
            "id": "4",
            "meta_field_for_count": "b"
        },
        {
            "text": "text5",
            "id": "5",
            "meta_field_for_count": "b"
        },
        {
            "text": "text6",
            "id": "6",
            "meta_field_for_count": "c"
        },
        {
            "text": "text7",
            "id": "7",
            "embedding": np.random.rand(768).astype(np.float64),
            "meta_field_for_count": "c"
        },
    ]

    _index: str = "haystack_test_count"
    document_store = ElasticsearchDocumentStore(index=_index)
    document_store.delete_documents(index=_index)

    document_store.write_documents(documents)

    assert document_store.get_document_count() == 7
    assert document_store.get_document_count(
        only_documents_without_embedding=True) == 3
    assert document_store.get_document_count(
        only_documents_without_embedding=True,
        filters={"meta_field_for_count": ["c"]}) == 1
    assert document_store.get_document_count(
        only_documents_without_embedding=True,
        filters={"meta_field_for_count": ["b"]}) == 2
Example #2
def test_elasticsearch_custom_fields(elasticsearch_fixture):
    client = Elasticsearch()
    client.indices.delete(index='haystack_test_custom', ignore=[404])
    document_store = ElasticsearchDocumentStore(index="haystack_test_custom", text_field="custom_text_field",
                                                embedding_field="custom_embedding_field")

    doc_to_write = {"custom_text_field": "test", "custom_embedding_field": np.random.rand(768).astype(np.float32)}
    document_store.write_documents([doc_to_write])
    documents = document_store.get_all_documents(return_embedding=True)
    assert len(documents) == 1
    assert documents[0].text == "test"
    np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding)
Example #3
def update_elastic_embeddings(document_store: ElasticsearchDocumentStore,
                              retriever: BaseRetriever,
                              update_existing=False):
    index = document_store.index

    result = document_store.get_all_documents_generator(index)
    for document_batch in get_batches_from_generator(result, 10_000):
        if len(document_batch) == 0:
            break
        if not update_existing:
            # take only documents with no embeddings
            document_batch = [d for d in document_batch if d.embedding is None]
        if len(document_batch) == 0:
            continue
        embeddings = retriever.embed_passages(document_batch)  # type: ignore
        assert len(document_batch) == len(embeddings)
        print(f"updating {len(document_batch)} embeddings")

        doc_updates = []
        for doc, emb in zip(document_batch, embeddings):
            update = {
                "_op_type": "update",
                "_index": index,
                "_id": doc.id,
                "doc": {
                    document_store.embedding_field: emb.tolist()
                },
            }
            doc_updates.append(update)

        bulk(document_store.client,
             doc_updates,
             request_timeout=300,
             refresh=document_store.refresh_type)
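
# Usage sketch (illustrative, not from the original snippet; the index name and
# the retriever's default DPR models are assumptions): backfill embeddings only
# for documents that do not have one yet.
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.dense import DensePassageRetriever

document_store = ElasticsearchDocumentStore(index="document")
retriever = DensePassageRetriever(document_store=document_store)
update_elastic_embeddings(document_store, retriever, update_existing=False)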
Example #4
    def __init__(self):
        self.finder = Finder(reader=Reader(model_name_or_path=MODEL_PATH,
                                           tokenizer=MODEL_PATH,
                                           use_gpu=0),
                             retriever=ElasticsearchRetriever(
                                 document_store=ElasticsearchDocumentStore(
                                     refresh_type='false')))
Example #5
def get_elastic_document_store():
    def is_first_run():
        # first run = the elasticsearch:7.9.2 image has not been pulled yet
        existing_images = os.popen('docker images').read()
        return not ('elasticsearch' in existing_images and '7.9.2' in existing_images)

    def create_image_and_volume():
        os.popen('mkdir -m777 -p elasticsearch/data')
        os.popen(
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" -v $PWD/elasticsearch/data:/usr/share/elasticsearch/data --name elasticsearch elasticsearch:7.9.2'
        )

    if is_first_run():
        create_image_and_volume()

    print('starting elastic docker')
    if 'elasticsearch' not in os.popen('docker ps').read():
        os.popen('docker start elasticsearch')  # container created above with --name elasticsearch
        time.sleep(25)
        print(
            os.popen(
                """curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_cluster/settings -d '{ "transient": { "cluster.routing.allocation.disk.threshold_enabled": false } }'"""
            ))
        print(
            os.popen(
                """curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'"""
            ))
        time.sleep(5)
    elastic_ds = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="document",
                                            return_embedding=True)
    return elastic_ds
Example #6
def get_document_store(document_store_type, embedding_field="embedding"):
    if document_store_type == "sql":
        document_store = SQLDocumentStore(url="sqlite://", index="haystack_test")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore(
            return_embedding=True, embedding_field=embedding_field, index="haystack_test"
        )
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(
            index="haystack_test", return_embedding=True, embedding_field=embedding_field
        )
    elif document_store_type == "faiss":
        document_store = FAISSDocumentStore(
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    elif document_store_type == "milvus":
        document_store = MilvusDocumentStore(
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")

    return document_store
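
# Usage sketch (illustrative, not from the original snippet): each call returns
# a freshly cleaned "haystack_test" store, so callers can index right away.
document_store = get_document_store("elasticsearch")
document_store.write_documents([{"text": "hello haystack", "id": "1"}])
assert document_store.get_document_count() == 1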
Example #7
def get_document_store(document_store_type, similarity='dot_product'):
    """TODO: This method is taken from test/conftest.py but should perhaps live within Haystack,
    e.g. as a DocumentStore class method that takes a string for the type of store."""
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
        assert document_store.get_document_count() == 0
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="eval_document",
                                                    similarity=similarity,
                                                    timeout=3000)
    elif document_store_type in ("milvus_flat", "milvus_hnsw"):
        if document_store_type == "milvus_flat":
            index_type = IndexType.FLAT
            index_param = None
            search_param = None
        elif document_store_type == "milvus_hnsw":
            index_type = IndexType.HNSW
            index_param = {"M": 64, "efConstruction": 80}
            search_param = {"ef": 20}
        document_store = MilvusDocumentStore(similarity=similarity,
                                             index_type=index_type,
                                             index_param=index_param,
                                             search_param=search_param)
        assert document_store.get_document_count(index="eval_document") == 0
    elif document_store_type in ("faiss_flat", "faiss_hnsw"):
        if document_store_type == "faiss_flat":
            index_type = "Flat"
        elif document_store_type == "faiss_hnsw":
            index_type = "HNSW"
        status = subprocess.run(['docker rm -f haystack-postgres'], shell=True)
        time.sleep(1)
        status = subprocess.run([
            'docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres'
        ],
                                shell=True)
        time.sleep(6)
        status = subprocess.run([
            'docker exec haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'
        ],
                                shell=True)
        time.sleep(1)
        document_store = FAISSDocumentStore(
            # credentials match the POSTGRES_PASSWORD set in the docker run above
            sql_url="postgresql://postgres:password@localhost:5432/haystack",
            faiss_index_factory_str=index_type,
            similarity=similarity)
        assert document_store.get_document_count() == 0

    else:
        raise Exception(
            f"No document store fixture for '{document_store_type}'")
    return document_store
Example #8
    def setup(self):
        print("SETTING UP PIPELINE")
        self.document_store = ElasticsearchDocumentStore(
            similarity="dot_product", host="elasticsearch", username="", password="", index="document")
        self.document_store_faiss = FAISSDocumentStore(
            index="document",
            faiss_index_factory_str="Flat",
            return_embedding=True,
            sql_url=f"postgresql://{config('POSTGRES_USER')}:{config('POSTGRES_PASSWORD')}@{config('POSTGRES_HOST')}:{config('POSTGRES_PORT')}/faiss"
        )
        processor, converter = self.write_as4_docs()
        table_data = self.write_table_docs(converter, processor)

        es_retriever = ElasticsearchRetriever(
            document_store=self.document_store)
        print("SETTING UP DPR")
        dpr_retriever = DPRTrainingManager.get_current_retriever(
            self.document_store_faiss)
        print("SETTING UP EMBEDDINGS")
        embedding_retriever = EmbeddingRetriever(
            document_store=self.document_store_faiss,
            embedding_model="deepset/sentence_bert"
        )
        query_classifier = QueryClassifier()
        print("SETTING UP TABLE")
        table_retriever = TableRetriever(table_data)
        print("SETUP RETRIEVERS")
        self.question_generator = FurtherQuestionGenerator()
        print("UPDATING EMBEDDINGS")
        self.document_store_faiss.update_embeddings(dpr_retriever)
        print("UPDATED EMBEDDINGS")
        self.dpr_node = ContinualDPRNode(
            dpr_retriever, self.document_store_faiss)
        result = Result()
        self.trainer = DPRTrainingManager(
            self.document_store_faiss, self.dpr_node)
        print("SETUP COMPONENTS")
        pipeline = Pipeline()
        pipeline.add_node(component=es_retriever,
                          name="ESRetriever", inputs=["Query"])
        pipeline.add_node(component=self.dpr_node,
                          name="DPRRetriever", inputs=["Query"])
        pipeline.add_node(component=embedding_retriever,
                          name="EmbeddingRetriever", inputs=["Query"])
        pipeline.add_node(component=JoinDocuments(join_mode="merge"), name="JoinResults", inputs=[
                          "DPRRetriever", "EmbeddingRetriever", "ESRetriever"])
        pipeline.add_node(component=query_classifier,
                          name="QueryClassifier", inputs=["JoinResults"])
        pipeline.add_node(component=self.question_generator,
                          name="QnGenerator", inputs=["QueryClassifier.output_1"])
        pipeline.add_node(component=table_retriever, name="TableRetriever", inputs=[
                          "QueryClassifier.output_2"])
        pipeline.add_node(component=result, name="Result", inputs=[
                          "QnGenerator", "TableRetriever"])
        self.pipeline = pipeline
        print("SETUP PIPELINE")
Example #9
def test_elasticsearch_custom_query(elasticsearch_fixture):
    client = Elasticsearch()
    client.indices.delete(index='haystack_test_custom', ignore=[404])
    document_store = ElasticsearchDocumentStore(index="haystack_test_custom", text_field="custom_text_field",
                                                embedding_field="custom_embedding_field")
    documents = [
        {"text": "test_1", "meta": {"year": "2019"}},
        {"text": "test_2", "meta": {"year": "2020"}},
        {"text": "test_3", "meta": {"year": "2021"}},
        {"text": "test_4", "meta": {"year": "2021"}},
        {"text": "test_5", "meta": {"year": "2021"}},
    ]
    document_store.write_documents(documents)

    # test custom "terms" query
    retriever = ElasticsearchRetriever(
        document_store=document_store,
        custom_query="""
            {
                "size": 10, 
                "query": {
                    "bool": {
                        "should": [{
                            "multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}],
                            "filter": [{"terms": {"year": ${years}}}]}}}"""
    )
    results = retriever.run(query="test", filters={"years": ["2020", "2021"]})[0]["documents"]
    assert len(results) == 4

    # test custom "term" query
    retriever = ElasticsearchRetriever(
        document_store=document_store,
        custom_query="""
                {
                    "size": 10, 
                    "query": {
                        "bool": {
                            "should": [{
                                "multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}],
                                "filter": [{"term": {"year": ${years}}}]}}}"""
    )
    results = retriever.run(query="test", filters={"years": "2021"})[0]["documents"]
    assert len(results) == 3
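
    # For comparison (illustrative, not part of the original test): the same
    # year filter can be expressed with the default query and the built-in
    # filters argument, without a custom_query template.
    retriever = ElasticsearchRetriever(document_store=document_store)
    results = retriever.retrieve(query="test", filters={"year": ["2020", "2021"]}, top_k=10)
    assert len(results) == 4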
Example #10
def get_document_store(document_store_type, es_similarity='cosine'):
    """TODO: This method is taken from test/conftest.py but should perhaps live within Haystack,
    e.g. as a DocumentStore class method that takes a string for the type of store."""
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="eval_document",
                                                    similarity=es_similarity)
    elif document_store_type in ("faiss_flat", "faiss_hnsw"):
        if document_store_type == "faiss_flat":
            index_type = "Flat"
        elif document_store_type == "faiss_hnsw":
            index_type = "HNSW"

        #TEMP FIX for issue with deleting docs
        # status = subprocess.run(
        #     ['docker rm -f haystack-postgres'],
        #     shell=True)
        # time.sleep(3)
        # try:
        #     document_store = FAISSDocumentStore(sql_url="postgresql://*****:*****@localhost:5432/haystack",
        #                                         faiss_index_factory_str=index_type)
        # except:
        # Launch a postgres instance & create empty DB
        # logger.info("Didn't find Postgres. Start a new instance...")
        status = subprocess.run(['docker rm -f haystack-postgres'], shell=True)
        time.sleep(1)
        status = subprocess.run([
            'docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres'
        ],
                                shell=True)
        time.sleep(3)
        status = subprocess.run([
            'docker exec haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'
        ],
                                shell=True)
        time.sleep(1)
        document_store = FAISSDocumentStore(
            # credentials match the POSTGRES_PASSWORD set in the docker run above
            sql_url="postgresql://postgres:password@localhost:5432/haystack",
            faiss_index_factory_str=index_type)

    else:
        raise Exception(
            f"No document store fixture for '{document_store_type}'")
    assert document_store.get_document_count() == 0
    return document_store
Example #11
def get_document_store(document_store_type,
                       embedding_dim=768,
                       embedding_field="embedding"):
    if document_store_type == "sql":
        document_store = SQLDocumentStore(url="sqlite://",
                                          index="haystack_test")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore(return_embedding=True,
                                               embedding_dim=embedding_dim,
                                               embedding_field=embedding_field,
                                               index="haystack_test")
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(
            index="haystack_test",
            return_embedding=True,
            embedding_dim=embedding_dim,
            embedding_field=embedding_field)
    elif document_store_type == "faiss":
        document_store = FAISSDocumentStore(
            vector_dim=embedding_dim,
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    elif document_store_type == "milvus":
        document_store = MilvusDocumentStore(
            vector_dim=embedding_dim,
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        _, collections = document_store.milvus_server.list_collections()
        for collection in collections:
            if collection.startswith("haystack_test"):
                document_store.milvus_server.drop_collection(collection)
        return document_store
    elif document_store_type == "weaviate":
        document_store = WeaviateDocumentStore(
            weaviate_url="http://localhost:8080", index="Haystacktest")
        document_store.weaviate_client.schema.delete_all()
        document_store._create_schema_and_index_if_not_exist()
        return document_store
    else:
        raise Exception(
            f"No document store fixture for '{document_store_type}'")

    return document_store
Example #12
def test_init_elastic_client():
    # defaults
    _ = ElasticsearchDocumentStore()

    # list of hosts + single port
    _ = ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=9200)

    # list of hosts + list of ports (wrong)
    with pytest.raises(Exception):
        _ = ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"],
                                       port=[9200])

    # list of hosts + matching list of ports
    _ = ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"],
                                   port=[9200, 9200])

    # only api_key
    with pytest.raises(Exception):
        _ = ElasticsearchDocumentStore(host=["localhost"],
                                       port=[9200],
                                       api_key="test")

    # api_key + api_key_id
    _ = ElasticsearchDocumentStore(host=["localhost"],
                                   port=[9200],
                                   api_key="test",
                                   api_key_id="test")
Example #13
    def launch_elasticsearch(self, launch: bool = False, name: str = "hera"):
        if launch:
            logging.info("Starting Elasticsearch ...")
            status = subprocess.run([
                f'docker run -d -p 9200:9200 --name {name} -e "discovery.type=single-node" elasticsearch:7.6.2'
            ],
                                    shell=True)
            time.sleep(30)
        else:
            logging.info("Restarting Elasticsearch ...")
            try:
                status = subprocess.run([f'docker stop {name}'], shell=True)
            except Exception:
                raise Exception("No running containers")
            finally:
                status = subprocess.run([f'docker start {name}'], shell=True)
                time.sleep(30)

        index = "document"
        document_store = ElasticsearchDocumentStore(host="localhost",
                                                    username="",
                                                    password="",
                                                    index=index)

        dicts = convert_files_to_dicts(dir_path=self.data_path,
                                       clean_func=self.clean_website_text,
                                       split_paragraphs=True)
        try:
            document_store.delete_all_documents(index=index)
        except Exception:
            pass
        finally:
            document_store.write_documents(dicts)
        return status
Example #14
def update_document():
    """Return the URL of the indexed document."""
    if request.files:
        # index is the target document where queries need to sent.
        index = request.form['index']
        # uploaded document for target source
        doc = request.files["doc"]

        file_path = os.path.join(app.config["input"], doc.filename)

        # saving the file to the input directory
        doc.save(file_path)
        # initialize the Haystack Elasticsearch document store
        document_store = ElasticsearchDocumentStore(
            host=app.config["host"],
            port=app.config["port"],
            username=app.config["username"],
            password=app.config["password"],
            index=index)
        # convert the uploaded files into dicts and write them to the Elasticsearch index
        dicts = convert_files_to_dicts(app.config["input"],
                                       clean_func=clean_wiki_text,
                                       split_paragraphs=False)
        document_store.write_documents(dicts)
        os.remove(file_path)
        return json.dumps({
            'status': 'Success',
            'message': 'document available at http://' + app.config["host"] +
                       ':' + app.config["port"] + '/' + index + '/_search',
            'result': []
        })
    else:
        return json.dumps({
            'status': 'Failed',
            'message': 'No file uploaded',
            'result': []
        })
Example #15
def launch_and_index_es(documents_dicts: List):
    es = Elasticsearch(['http://localhost:9200/'], verify_certs=True)
    if not es.ping():
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'
        ],
                                shell=True)
        if status.returncode:
        raise Exception(
            "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance "
            "then set LAUNCH_ELASTICSEARCH in the script to False.")
        sleep(7)

    es.indices.delete(index='document', ignore=[400, 404])
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")
    document_store.write_documents(documents_dicts)
    retriever = ElasticsearchRetriever(document_store=document_store)
    return retriever
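
# Usage sketch (illustrative, not from the original snippet): index a few dicts
# and run a sparse BM25 query against them.
docs = [{"text": "Haystack is a framework for question answering.", "meta": {"name": "intro"}}]
retriever = launch_and_index_es(docs)
results = retriever.retrieve(query="What is Haystack?", top_k=5)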
Example #16
def set_embeded():
    """Update dense embeddings for all documents in the index."""
    index = request.form['index']
    document_store = ElasticsearchDocumentStore(
        host=app.config["host"],
        port=app.config["port"],
        username=app.config["username"],
        password=app.config["password"],
        index=index,
        embedding_field="embedding",
        embedding_dim=768)
    retriever = DensePassageRetriever(document_store=document_store,
                                      embedding_model="dpr-bert-base-nq",
                                      do_lower_case=True,
                                      use_gpu=False)
    # now compute embeddings with the retriever and store them in Elasticsearch
    document_store.update_embeddings(retriever)
    return json.dumps({
        'status': 'Success',
        'message':
        'Successfully updated embeddings in the Elasticsearch document store',
        'result': []
    })
Example #17
def qna():
    """Return the n answers."""

    question = request.form['question']
    # index is the target document where queries need to sent.
    index = request.form['index']

    # to select train or untrained model
    mode = request.form['mode']

    # initialize the Haystack Elasticsearch document store
    document_store = ElasticsearchDocumentStore(
        host=app.config["host"],
        username=app.config["username"],
        password=app.config["password"],
        index=index)

    if mode == 'trained':
        # base on the search mode train_model
        reader = FARMReader(model_name_or_path=app.config["train_model"],
                            use_gpu=False)
    else:
        # base on the search mode pre_train
        reader = FARMReader(
            model_name_or_path="distilbert-base-uncased-distilled-squad",
            use_gpu=False)

    # initialize the ElasticsearchRetriever
    retriever = ElasticsearchRetriever(document_store=document_store)
    # Finder sticks together reader and retriever
    # in a pipeline to answer our actual questions.
    finder = Finder(reader, retriever)

    # predict n answers
    n = int(request.form['n'])
    prediction = finder.get_answers(question=question,
                                    top_k_retriever=10,
                                    top_k_reader=n)
    answer = []
    for res in prediction['answers']:
        answer.append(res['answer'])

    return json.dumps({
        'status': 'success',
        'message': 'Processed successfully',
        'result': answer
    })
Example #18
    def __init__(self, id, add_sample_data=False):
        Model.__init__(self, id)

        doc_store = ElasticsearchDocumentStore(
            host=DB_HOST,
            port=DB_PORT,
            index=self.id,
            embedding_field="question_emb",
            embedding_dim=768,
            excluded_meta_data=["question_emb"])
        retriever = EmbeddingRetriever(document_store=doc_store,
                                       embedding_model="deepset/sentence_bert",
                                       use_gpu=False)

        self.finder = Finder(reader=None, retriever=retriever)

        if add_sample_data:
            add_sample_data_faq_qa(self)
Example #19
def get_document_store(document_store_type, faiss_document_store, inmemory_document_store):
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = inmemory_document_store
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="haystack_test", return_embedding=False)
    elif document_store_type == "faiss":
        document_store = faiss_document_store
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")

    return document_store
Example #20
    def __init__(self, id, add_sample_data=False):
        Model.__init__(self, id)

        doc_store = ElasticsearchDocumentStore(host=DB_HOST,
                                               port=DB_PORT,
                                               index=self.id)
        retriever = ElasticsearchRetriever(document_store=doc_store)

        reader = FARMReader(
            model_name_or_path=READER_MODEL_PATH,
            batch_size=BATCHSIZE,
            use_gpu=False,
            num_processes=MAX_PROCESSES,
        )
        self.finder = Finder(reader, retriever)

        if add_sample_data:
            add_sample_data_doc_qa(self)

        reader.save(directory=READER_MODEL_PATH)
        print("saved")
Example #21
def create_app(config_name):
    app = Flask(__name__)
    app.config.from_object(config[config_name])
    host = config[config_name].ELASTIC_URL
    port = config[config_name].ELASTIC_PORT
    index = config[config_name].ELASTIC_INDEX

    doc_store = ElasticsearchDocumentStore(host=host,
                                           username='',
                                           password='',
                                           index=index)

    retriever = ElasticsearchRetriever(document_store=doc_store)
    model_name = "deepset/roberta-base-squad2"
    reader = FARMReader(model_name_or_path=model_name,
                        num_processes=0,
                        use_gpu=False)
    app.finder = Finder(reader, retriever)

    from app.main import main as main_blueprint
    app.register_blueprint(main_blueprint)

    return app
Example #22
import pickle

from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from tqdm import tqdm

document_store = ElasticsearchDocumentStore(refresh_type='false')


def concat_sents(sent):
    """Greedily merge sentences into chunks of roughly 200 words."""
    i = 0
    ss = ''
    for s in sent:
        i += len(str(s).split())
        ss += s
        if i > 200:
            i = 0
            yield ss
            ss = ''
    if ss:
        # don't silently drop a trailing chunk of fewer than 200 words
        yield ss


with open('../data/arxiv-processed-pickle', 'rb') as f:
    dic = pickle.load(f)
for (sents, name) in tqdm(dic):
    to_insert = []
    for s in concat_sents(sents):
        to_insert.append({'text': s, 'meta': {'name': name}})
    document_store.write_documents(to_insert)
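
# Note: with refresh_type='false', newly written documents may not be visible
# to searches until the index is refreshed explicitly, e.g. via the standard
# elasticsearch-py call:
# document_store.client.indices.refresh(index=document_store.index)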
Example #23
from transformers import AutoTokenizer, AutoModel

from definitions import from_root_dir
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from pulp.cy.dense.recv import TransformersEmbeddingRetriever


def to_meta_dict(meta: dict) -> dict:
    # use the abstract (if present) as the document text and drop it from meta
    abstract = meta.pop('abstract', None)
    return {'text': abstract, 'meta': meta}


store = ElasticsearchDocumentStore(refresh_type='false', index='meta')
retriever = TransformersEmbeddingRetriever(
    document_store=store,
    embedding_model=AutoModel.from_pretrained(
        from_root_dir('models/scibert_scivocab_uncased')),
    tokenizer=AutoTokenizer.from_pretrained(
        from_root_dir('models/scibert_scivocab_uncased')))
import pickle

with open(from_root_dir('data/arxiv-metadata_pickle'), 'rb') as f:
    metadata_list = pickle.load(f)

store.write_documents(to_meta_dict(m) for m in metadata_list)
store.update_embeddings(retriever, index='meta')
# retriever.embed(['It is shown that, within a Ginzburg-Landau (GL) formalism, the\nsuperconducting fluctuation is insulating at zero temperature even if the\nfluctuation dynamics is metallic (dissipative). Based on this fact, the low\ntemperature behavior of the $H_{c2}$-line and the resistivity curves near a\nzero temperature transition are discussed. In particular, it is pointed out\nthat the neglect of quantum fluctuations in data analysis of the dc resistivity\nmay lead to an under-estimation of the $H_{c2}$ values near zero temperature.\n'])
Example #24
if LAUNCH_ELASTICSEARCH:
    logging.info("Starting Elasticsearch ...")
    status = subprocess.run([
        'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'
    ],
                            shell=True)
    if status.returncode:
        raise Exception(
            "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance "
            "then set LAUNCH_ELASTICSEARCH in the script to False.")
    time.sleep(15)

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="document")

# ## Preprocessing of documents
#
# Haystack provides a customizable pipeline for:
# - converting files into texts
# - cleaning texts
# - splitting texts
# - writing them to a Document Store

# In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and add
# them to Elasticsearch.

# Let's first fetch some documents that we want to query
# Here: 517 Wikipedia articles for Game of Thrones
Example #25
import json
import glob
import os
import pprint
import re
from bs4 import BeautifulSoup
import logging

from haystack.preprocessor.utils import convert_files_to_dicts
from haystack.utils import print_answers
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever

DB_HOST = os.getenv("DB_HOST", "localhost")
document_store = ElasticsearchDocumentStore(host=DB_HOST,
                                            username="",
                                            password="",
                                            index="document")
document_store.delete_all_documents(index='document')


# Elsevier dataset with full text
def nf2020toDict():
    paths = "./data/HACKXML0000000004/**/*.xml"
    target_tags = ['simple-para', 'para']

    docs = []
    for path in glob.glob(paths, recursive=True)[1:]:
        with open(path, 'r') as f:
            data = BeautifulSoup(f.read(), "xml")
        temp = {}
        temp["meta"] = {}
Example #26
    reader_name = "deepset/roberta-base-squad2"
    top_k_retriever = 7
    top_k_reader = 1
    conversational = 'True'

    # Use transformer reader
    reader = FARMReader(model_name_or_path=reader_name, use_gpu=True)

    print('Fetching documents for book ' + book_title)
    document_fetcher_func = top_50_wiki_results_2
    num_docs = document_fetcher_func(book_title)

    print('Fetched ' + str(num_docs) + ' documents for book ' + book_title)

    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="default")
    document_store.delete_all_documents(index="default")
    #document_store = InMemoryDocumentStore()

    doc_dir = root + "/documents"
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)

    # Add documents to the document store
    document_store.write_documents(dicts)

    # Use ElasticsearchRetriever
    retriever = ElasticsearchRetriever(document_store=document_store)
    #retriever = TfidfRetriever(document_store=document_store)
Example #27
        prepared_segment["text"] = segment
        prepared_segments_part.append(prepared_segment)
        if (i + 1) % SEGMENT_BATCH_SIZE == 0:
            prepared_segments.append(prepared_segments_part)
            prepared_segments_part = []
    if prepared_segments_part:
        prepared_segments.append(prepared_segments_part)
    return prepared_segments


from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

# init haystack ES client with custom mappings
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="document",
                                            custom_mapping=mapping)


# ingest full document (no splitting to segments)
def prepare_ingest_full_document(sector, date, title, link, page_content):
    document = {}
    document["title"] = title
    document["sector"] = sector.strip()
    document["given_date"] = date
    try:
        document["date"] = str(datetime.strptime(date, "%d %b %Y").date())
    except ValueError:
        pass
    document["link"] = link
Example #28
from haystack.retriever.sparse import ElasticsearchRetriever, ElasticsearchFilterOnlyRetriever
from haystack.retriever.dense import EmbeddingRetriever

logger = logging.getLogger(__name__)
router = APIRouter()

# Init global components: DocumentStore, Retriever, Reader, Finder
document_store = ElasticsearchDocumentStore(
    host=DB_HOST,
    port=DB_PORT,
    username=DB_USER,
    password=DB_PW,
    index=DB_INDEX,
    scheme=ES_CONN_SCHEME,
    ca_certs=False,
    verify_certs=False,
    text_field=TEXT_FIELD_NAME,
    name_field=NAME_FIELD_NAME,
    search_fields=SEARCH_FIELD_NAME,
    embedding_dim=EMBEDDING_DIM,
    embedding_field=EMBEDDING_FIELD_NAME,
    excluded_meta_data=EXCLUDE_META_DATA_FIELDS,  # type: ignore
    faq_question_field=FAQ_QUESTION_FIELD_NAME,
)


if RETRIEVER_TYPE == "EmbeddingRetriever":
    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model=EMBEDDING_MODEL_PATH,
        model_format=EMBEDDING_MODEL_FORMAT,
Example #29
def tutorial1_basic_qa_pipeline():
    logger = logging.getLogger(__name__)

    LAUNCH_ELASTICSEARCH = True

    # ## Document Store
    #
    # Haystack finds answers to queries within the documents stored in a `DocumentStore`. The current implementations of
    # `DocumentStore` include `ElasticsearchDocumentStore`, `FAISSDocumentStore`, `SQLDocumentStore`, and `InMemoryDocumentStore`.
    #
    # **Here:** We recommend Elasticsearch as it comes preloaded with features like full-text queries, BM25 retrieval,
    # and vector storage for text embeddings.
    # **Alternatives:** If you are unable to set up an Elasticsearch instance, then follow Tutorial 3
    # on using SQL/InMemory document stores.
    # **Hint**:
    # This tutorial creates a new document store instance with Wikipedia articles on Game of Thrones. However, you can
    # configure Haystack to work with your existing document stores.
    #
    # Start an Elasticsearch server
    # You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in
    # your environment (e.g., in Colab notebooks), then you can manually download and execute Elasticsearch from source.

    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'
        ],
                                shell=True)
        if status.returncode:
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance "
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # Connect to Elasticsearch
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")

    # ## Preprocessing of documents
    #
    # Haystack provides a customizable pipeline for:
    # - converting files into texts
    # - cleaning texts
    # - splitting texts
    # - writing them to a Document Store

    # In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and add
    # them to Elasticsearch.

    # Let's first fetch some documents that we want to query
    # Here: 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
    # It must take a str as input, and return a str.

    # Now, let's write the docs to our DB.
    if LAUNCH_ELASTICSEARCH:
        document_store.write_documents(dicts)
    else:
        logger.warning(
            "Since we already have a running ES instance, we should not index the same documents again.\n"
            "If you still want to do this, call document_store.write_documents(dicts) manually."
        )

    # ## Initialize Retriever, Reader & Finder
    #
    # ### Retriever
    #
    # Retrievers help narrow down the scope for the Reader to smaller units of text where a given question
    # could be answered.
    #
    # They use simple but fast algorithms.
    # **Here:** We use Elasticsearch's default BM25 algorithm
    # **Alternatives:**
    # - Customize the `ElasticsearchRetriever` with custom queries (e.g. boosting) and filters
    # - Use `EmbeddingRetriever` to find candidate documents based on the similarity of
    #   embeddings (e.g. created via Sentence-BERT)
    # - Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging

    retriever = ElasticsearchRetriever(document_store=document_store)

    # Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick prototypes
    # with a SQLite document store.
    #
    # from haystack.retriever.tfidf import TfidfRetriever
    # retriever = TfidfRetriever(document_store=document_store)
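
    # Another alternative: a dense EmbeddingRetriever (sketch; the Sentence-BERT
    # model name below is an assumption, mirroring other examples on this page).
    #
    # from haystack.retriever.dense import EmbeddingRetriever
    # retriever = EmbeddingRetriever(document_store=document_store,
    #                                embedding_model="deepset/sentence_bert")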

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. Readers are
    # based on powerful but slower deep learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and Transformers.
    # With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).
    # **Here:** a medium-sized RoBERTa QA model using a Reader based on
    #           FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)
    # **Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or
    #                            "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
    # **Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean
    #           the model prefers "no answer possible"
    #
    # #### FARMReader

    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)

    # #### TransformersReader

    # Alternative:
    # reader = TransformersReader(
    #    model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # ### Pipeline
    #
    # With a Haystack `Pipeline` you can combine your building blocks into a search pipeline.
    # Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    # To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
    # You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
Example #30
    ES_CONN_SCHEME, TEXT_FIELD_NAME, SEARCH_FIELD_NAME, EMBEDDING_DIM,
    EMBEDDING_FIELD_NAME, EXCLUDE_META_DATA_FIELDS, FAQ_QUESTION_FIELD_NAME,
    CREATE_INDEX, VECTOR_SIMILARITY_METRIC, UPDATE_EXISTING_DOCUMENTS)

router = APIRouter()

document_store = ElasticsearchDocumentStore(
    host=DB_HOST,
    port=DB_PORT,
    username=DB_USER,
    password=DB_PW,
    index=DB_INDEX,
    label_index=DB_INDEX_FEEDBACK,
    scheme=ES_CONN_SCHEME,
    ca_certs=False,
    verify_certs=False,
    text_field=TEXT_FIELD_NAME,
    search_fields=SEARCH_FIELD_NAME,
    faq_question_field=FAQ_QUESTION_FIELD_NAME,
    embedding_dim=EMBEDDING_DIM,
    embedding_field=EMBEDDING_FIELD_NAME,
    excluded_meta_data=EXCLUDE_META_DATA_FIELDS,  # type: ignore
    create_index=CREATE_INDEX,
    update_existing_documents=UPDATE_EXISTING_DOCUMENTS,
    similarity=VECTOR_SIMILARITY_METRIC)


class FAQQAFeedback(BaseModel):
    question: str = Field(
        ..., description="The question input by the user, i.e., the query.")
    is_correct_answer: bool = Field(