Code Example #1
# Imports as used in the other examples on this page; read_json_data, create_data_dicts and
# extract_info_from_predictions are project-specific helpers assumed to be defined elsewhere.
from haystack import Finder
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.reader.farm import FARMReader
from haystack.retriever.sparse import ElasticsearchRetriever


class QAPipeline:
    def __init__(self):
        self.document_store = ElasticsearchDocumentStore(host="localhost",
                                                         username="",
                                                         password="",
                                                         index="document")
        self.retriever = ElasticsearchRetriever(
            document_store=self.document_store)
        self.reader = FARMReader(
            model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
        self.finder = Finder(self.reader, self.retriever)
        print('Ready')

    def add_to_datastore_from_remote(self, data_url):
        return {'status': 'Not Implemented'}

    def add_to_datastore_local(self, data_path):
        json_data = read_json_data(data_path)
        es_data = create_data_dicts(json_data)
        self.document_store.write_documents(es_data)
        return {'status': 'Added To Datastore'}

    def answer(self, question, top_k_options=10, top_k_answers=3):
        prediction = self.finder.get_answers(question=question,
                                             top_k_retriever=top_k_options,
                                             top_k_reader=top_k_answers)
        results = extract_info_from_predictions(prediction)
        return results
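
A minimal usage sketch for the class above (an illustration, not part of the original source): it assumes an Elasticsearch instance on localhost:9200 and that the helper functions read_json_data, create_data_dicts and extract_info_from_predictions exist in the project.

# Hypothetical driver code; the file path and the question are placeholders.
if __name__ == "__main__":
    pipeline = QAPipeline()
    pipeline.add_to_datastore_local("data/faq.json")
    results = pipeline.answer("What is a document store?",
                              top_k_options=10, top_k_answers=3)
    print(results)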
Code Example #2
def main():
    POPULATE_DOCUMENT_STORE = True

    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                                index="document",
                                                text_field="text",
                                                embedding_field="question_emb",
                                                embedding_dim="768",
                                                excluded_meta_data=["question_emb"])

    retriever = EmbeddingRetriever(
        document_store=document_store, embedding_model=os.getcwd() +
        "\\kbQA\\bert-german-model",
        gpu=True, model_format="transformers")

    if POPULATE_DOCUMENT_STORE:
        doc_dir = os.getcwd() + "\\kbQA\\data\\Skripte\\Securplus\\txt"
        dicts = convert_files_to_dicts(
            dir_path=doc_dir, clean_func=clean_text, split_paragraphs=True)

        with open("Output.txt", "w") as text_file:
            text = ""
            for doc in dicts:
                text = text + "\n" + doc["text"]
            text_file.write(text)
        df = pd.DataFrame.from_dict(dicts)

        # Careful here! At this point we do not create embeddings for the questions but for
        # the texts, i.e. the answers. The variable names are therefore somewhat confusingly chosen.
        # dummy_questions is simply an increasing number starting at one. It is needed because
        # otherwise exceptions are thrown during search.
        # The tutorial seems to assume an FAQ setting in which question and answer are predefined,
        # so embeddings can be created for the predefined questions and, based on those alone,
        # the k best candidates are returned. We, in contrast, create embeddings for every single text.
        # todo: Since we create embeddings for every text, we may have to perform sentence segmentation,
        #       because the longer the texts get, the less precise the embeddings become. Per-sentence
        #       embeddings are considerably more exact.
        questions = list(df["text"].values)
        df["question_emb"] = retriever.create_embedding(texts=questions)
        dummy_questions = [f"{no}" for no, x in enumerate(questions, start=1)]
        df["question"] = dummy_questions
        print(df.head())

        docs_to_index = df.to_dict(orient="records")
        document_store.write_documents(docs_to_index)

    # question = "Wie viele haben Angst um ihren Job?"
    question = "welche leistungen sind ausgeschlossen?"
    # here again: lowercasing is strictly required!
    question = question.lower()

    # We currently cannot use a Reader because readers apparently require QA fine-tuning.
    # The Retriever fetches the best matches based on the embeddings.
    # get_answers() cannot be used without a reader.
    finder = Finder(reader=None, retriever=retriever)
    prediction = finder.get_answers_via_similar_questions(
        question, top_k_retriever=5)
    print_answers(prediction, details="all")
Code Example #3
def test_elasticsearch_write_read(elasticsearch_fixture):
    document_store = ElasticsearchDocumentStore()
    documents = convert_files_to_dicts(dir_path="samples/docs")
    document_store.write_documents(documents)
    sleep(2)  # wait for documents to be available for query
    documents = document_store.get_all_documents()
    assert len(documents) == 2
    assert documents[0].id
    assert documents[0].text
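
The elasticsearch_fixture used by this test is not shown. A plausible sketch, assuming pytest and a local Docker daemon, would start the same single-node container used elsewhere on this page:

# Hypothetical fixture; the sleep duration mirrors the other examples.
import subprocess
import time

import pytest


@pytest.fixture(scope="session")
def elasticsearch_fixture():
    subprocess.run(
        'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2',
        shell=True)
    time.sleep(15)  # give Elasticsearch time to start accepting connections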
Code Example #4
File: test_db.py  Project: zlapp/haystack
def test_elasticsearch_write_read(elasticsearch_fixture):
    document_store = ElasticsearchDocumentStore()
    write_documents_to_db(document_store=document_store,
                          document_dir="samples/docs")
    sleep(2)  # wait for documents to be available for query
    documents = document_store.get_all_documents()
    print(documents)
    assert len(documents) == 2
    assert documents[0].id
    assert documents[0].text
Code Example #5
 def __init__(self):
     self.document_store = ElasticsearchDocumentStore(host="localhost",
                                                      username="",
                                                      password="",
                                                      index="document")
     self.retriever = ElasticsearchRetriever(
         document_store=self.document_store)
     self.reader = FARMReader(
         model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
     self.finder = Finder(self.reader, self.retriever)
     print('Ready')
Code Example #6
File: run.py  Project: ZapAutomation/DocumentAnalysis
def get_results(txt_files_location, use_gpu, questions_list, results_location):

    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")
    for dirpath, dirnames, files in os.walk(txt_files_location):
        for dirname in dirnames:
            for dirpath, dirname, files in os.walk(
                    os.path.join(txt_files_location, dirname)):
                for file_name in files:
                    document_store.client.indices.delete(index='document',
                                                         ignore=[400, 404])

                    doc_dir = dirpath

                    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                                   clean_func=clean_wiki_text,
                                                   split_paragraphs=True)

                    document_store.write_documents(dicts)

                    retriever = ElasticsearchRetriever(
                        document_store=document_store)

                    reader = FARMReader(
                        model_name_or_path=
                        "elgeish/cs224n-squad2.0-albert-xxlarge-v1",
                        use_gpu=use_gpu)

                    finder = Finder(reader, retriever)

                    sys.stdout = open(
                        os.path.join(results_location,
                                     file_name[:-4] + "_results.txt"), "a+")

                    for i, question in enumerate(questions_list):

                        prediction = finder.get_answers(question=question,
                                                        top_k_retriever=10,
                                                        top_k_reader=1)

                        print("\n\n\nQuestion " + str(i + 1) + ":\n")
                        print(question + "\n")
                        print_answers(prediction, details="minimal")

                    sys.stdout.close()

    document_store.client.transport.close()
Code Example #7
def init():
    ### Model values for Reader and Document Store
    global document_store, retriever, reader, finder
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
    retriever = ElasticsearchRetriever(document_store=document_store)
    reader = FARMReader(model_name_or_path='deepset/roberta-base-squad2-covid', use_gpu=False)
    finder = Finder(reader, retriever)
Code Example #8
def test_elasticsearch_custom_fields(elasticsearch_fixture):
    client = Elasticsearch()
    client.indices.delete(index='haystack_test_custom', ignore=[404])
    document_store = ElasticsearchDocumentStore(
        index="haystack_test_custom",
        text_field="custom_text_field",
        embedding_field="custom_embedding_field")

    doc_to_write = {
        "custom_text_field": "test",
        "custom_embedding_field": np.random.rand(768).astype(np.float32)
    }
    document_store.write_documents([doc_to_write])
    documents = document_store.get_all_documents()
    assert len(documents) == 1
    assert documents[0].text == "test"
    np.testing.assert_array_equal(doc_to_write["custom_embedding_field"],
                                  documents[0].embedding)
Code Example #9
def get_elastic_search_document_store(es_host='localhost',
                                      es_port=9200,
                                      es_index_name='wikipedia',
                                      search_fields=['text']):
    return ElasticsearchDocumentStore(host=es_host,
                                      port=es_port,
                                      username="",
                                      password="",
                                      index=es_index_name,
                                      search_fields=search_fields)
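
A short usage sketch for this factory (illustrative, not from the original project), assuming the "wikipedia" index is already populated:

# Hypothetical usage; the query string is a placeholder.
from haystack.retriever.sparse import ElasticsearchRetriever

document_store = get_elastic_search_document_store()
retriever = ElasticsearchRetriever(document_store=document_store)
for doc in retriever.retrieve("capital of Germany", top_k=3):
    print(doc.text[:200])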
Code Example #10
def document_store(request, test_docs_xs, elasticsearch_fixture):
    if request.param == "sql":
        if os.path.exists("qa_test.db"):
            os.remove("qa_test.db")
        document_store = SQLDocumentStore(url="sqlite:///qa_test.db")

    if request.param == "memory":
        document_store = InMemoryDocumentStore()

    if request.param == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="haystack_test")

    return document_store
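
The parametrization for this fixture is not shown. A sketch of how it is typically declared, assuming pytest, with the params mirroring the branches above:

# Hypothetical decorator; request.param then selects the backend inside the fixture body above.
import pytest


@pytest.fixture(params=["sql", "memory", "elasticsearch"])
def document_store(request, test_docs_xs, elasticsearch_fixture):
    ...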
Code Example #11
    def qa(self, question, text_field):
        document_store = ElasticsearchDocumentStore(host=ES_HOST,
                                                    username=ES_USERNAME,
                                                    password=ES_PASSWORD,
                                                    index=self.ELASTIC_INDEX,
                                                    text_field=text_field)
        retriever = TfidfRetriever(document_store=document_store)

        reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                            use_gpu=False)

        finder = Finder(reader, retriever)
        prediction = finder.get_answers(question=question,
                                        top_k_retriever=1,
                                        top_k_reader=5)

        return prediction
Code Example #12
def get_document_store(document_store_type):
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="haystack_test")
    elif document_store_type == "faiss":
        if os.path.exists("haystack_test_faiss.db"):
            os.remove("haystack_test_faiss.db")
        document_store = FAISSDocumentStore(sql_url="sqlite:///haystack_test_faiss.db")
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")

    return document_store
Code Example #13
File: farm.py  Project: vchulski/haystack
    def eval(self,
             document_store: ElasticsearchDocumentStore,
             device: str,
             label_index: str = "feedback",
             doc_index: str = "eval_document",
             label_origin: str = "gold_label"):
        """
        Performs evaluation on evaluation documents in Elasticsearch DocumentStore.

        Returns a dict containing the following metrics:
            - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers
            - "f1": Average overlap between predicted answers and their corresponding correct answers
            - "top_n_recall": Proportion of predicted answers that overlap with correct answer

        :param document_store: The ElasticsearchDocumentStore containing the evaluation documents
        :type document_store: ElasticsearchDocumentStore
        :param device: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
        :type device: str
        :param label_index: Elasticsearch index where labeled questions are stored
        :type label_index: str
        :param doc_index: Elasticsearch index where documents that are used for evaluation are stored
        :type doc_index: str
        :param label_origin: Value of the "origin" field that marks gold labels in the label index
        :type label_origin: str
        """

        # extract all questions for evaluation
        filter = {"origin": label_origin}
        questions = document_store.get_all_documents_in_index(
            index=label_index, filters=filter)

        # mapping from doc_id to questions
        doc_questions_dict = {}
        id = 0
        for question in questions:
            doc_id = question["_source"]["doc_id"]
            if doc_id not in doc_questions_dict:
                doc_questions_dict[doc_id] = [{
                    "id": id,
                    "question": question["_source"]["question"],
                    "answers": question["_source"]["answers"],
                    "is_impossible": False if question["_source"]["answers"] else True
                }]
            else:
                doc_questions_dict[doc_id].append({
                    "id": id,
                    "question": question["_source"]["question"],
                    "answers": question["_source"]["answers"],
                    "is_impossible": False if question["_source"]["answers"] else True
                })
            id += 1

        # extract eval documents and convert data back to SQuAD-like format
        documents = document_store.get_all_documents_in_index(index=doc_index)
        dicts = []
        for document in documents:
            doc_id = document["_source"]["doc_id"]
            text = document["_source"]["text"]
            questions = doc_questions_dict[doc_id]
            dicts.append({"qas": questions, "context": text})

        # Create DataLoader that can be passed to the Evaluator
        indices = range(len(dicts))
        dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(
            dicts, indices=indices)
        data_loader = NamedDataLoader(dataset=dataset,
                                      batch_size=self.inferencer.batch_size,
                                      tensor_names=tensor_names)

        evaluator = Evaluator(data_loader=data_loader,
                              tasks=self.inferencer.processor.tasks,
                              device=device)

        eval_results = evaluator.eval(self.inferencer.model)
        results = {
            "EM": eval_results[0]["EM"],
            "f1": eval_results[0]["f1"],
            "top_n_recall": eval_results[0]["top_n_recall"]
        }
        return results
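
A sketch of how this eval() method might be called (an illustration based only on the signature above), assuming evaluation documents and labels were previously written to the "eval_document" and "feedback" indices:

# Hypothetical call; the device string depends on the available hardware.
document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                            password="", index="document")
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
metrics = reader.eval(document_store=document_store,
                      device="cpu",
                      label_index="feedback",
                      doc_index="eval_document")
print(metrics)  # {"EM": ..., "f1": ..., "top_n_recall": ...}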
Code Example #14
def main():
    HOST = 'localhost'
    PORT = 9200
    INDEX_NAME = 'wikipedia_en'

    from haystack import Finder
    from haystack.indexing.cleaning import clean_wiki_text
    from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
    from haystack.reader.farm import FARMReader
    from haystack.reader.transformers import TransformersReader
    from haystack.utils import print_answers
    from haystack.database.elasticsearch import ElasticsearchDocumentStore
    document_store = ElasticsearchDocumentStore(host=HOST,
                                                port=PORT,
                                                username="",
                                                password="",
                                                index=INDEX_NAME)

    # clear existing index (optional)
    # if document_store.client.indices.exists(index=document_store.index):
    #     print('clear existing index')
    #     document_store.client.indices.delete(index=document_store.index)

    # Get all dirs in wikipedia folder
    from os import listdir
    from os.path import isfile, join
    import json
    from tqdm import tqdm

    wikidata_path = "wikipedia"
    onlydirs = [
        f for f in listdir(wikidata_path) if not isfile(join(wikidata_path, f))
    ]

    dicts = []
    bulk_size = 5000

    pbar = tqdm(onlydirs)
    for directory in pbar:
        subdirs = [
            f for f in listdir(join(wikidata_path, directory))
            if not isfile(join(wikidata_path, directory))
        ]
        pbar.set_description(f"Processing wikipedia folder {directory}")

        for file in subdirs:
            f = open(join(wikidata_path, directory, file), "r")

            # Each text file contains json structures separated by EOL
            articles = f.read().split("\n")

            for article in articles:
                if len(article) == 0: continue

                # Article in json format
                json_formatted_article = json.loads(article)

                # Rename keys
                document = {
                    "id": json_formatted_article["id"],
                    "name": json_formatted_article["title"],
                    "url": json_formatted_article["url"],
                    "text": json_formatted_article["text"]
                }

                # Add document to bulk
                dicts.append(document)

                if len(dicts) >= bulk_size:
                    # Index bulk
                    try:
                        document_store.write_documents(dicts)
                        dicts.clear()
                    except:
                        print("Bulk not indexed")

    if len(dicts) > 0:
        print('final round')
        document_store.write_documents(dicts)

    print('finished')
Code Example #15
def main(data_dir, bulk_size, paragraph=False):
    document_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="wikipedia" + ("_paragraph" if paragraph else ""))
    if document_store.client.indices.exists(index=document_store.index):
        logger.info(f'{"wikipedia" + ("_paragraph" if paragraph else "")}')
        logger.warning(
            f"Index {document_store.index} already exists, deleting the index."
        )
        document_store.client.indices.delete(index=document_store.index)

    # Get all dirs in wikipedia folder
    only_dirs = [f for f in listdir(data_dir) if not isfile(join(data_dir, f))]

    dicts = []
    counts = dict(documents=0, paragraphs=0)
    progress_bar = tqdm(only_dirs)
    for directory in progress_bar:
        sub_dirs = [
            f for f in listdir(join(data_dir, directory))
            if not isfile(join(data_dir, directory))
        ]
        progress_bar.set_description(
            f"Processing wikipedia folder {directory}")

        for file in sub_dirs:
            f = open(join(data_dir, directory, file), "r")

            # Each text file contains json structures separated by EOL
            articles = f.read().split("\n")

            for article in articles:
                if len(article) == 0:
                    continue

                # Article in json format
                json_formatted_article = json.loads(article)

                base_document = {
                    "id": json_formatted_article["id"],
                    "name": json_formatted_article["title"],
                    "url": json_formatted_article["url"],
                }
                counts["documents"] += 1
                if paragraph:
                    """
                    - Paragraphs are separated by two new-line characters.
                    - The first paragraph is always the title --> remove!
                    - Some paragraphs only contain whitespace --> ignore
                    """
                    paragraphs = [
                        p.strip() for pid, p in enumerate(
                            json_formatted_article["text"].split("\n\n"))
                        if pid > 0 and p.strip()
                    ]
                    counts["paragraphs"] += len(paragraphs)
                    for pid, p in enumerate(paragraphs):
                        document = {
                            **base_document, "paragraph_id": pid,
                            "text": p
                        }

                        # Add document to bulk
                        dicts.append(document)

                else:
                    # Rename keys
                    document = {
                        **base_document, "text": json_formatted_article["text"]
                    }

                    # Add document to bulk
                    dicts.append(document)

                if len(dicts) >= bulk_size:
                    # Index bulk
                    try:
                        document_store.write_documents(dicts)
                    except:
                        logger.warning("Bulk not indexed")

                    # Empty bulk
                    dicts = []

    # index the last partial batch
    if dicts:
        try:
            document_store.write_documents(dicts)
        except:
            logger.warning("Bulk not indexed")

    logger.info("==" * 100)
    logger.info("Indexing done.")
    logger.info(f"# documents: {counts['documents']}")
    if paragraph and counts['documents']:
        logger.info(
            f"# paragraphs: {counts['paragraphs']}, "
            f"{counts['paragraphs'] / counts['documents']:.2f} per document")
Code Example #16
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

# doc_dir = "data/article_txt_got"
# # # s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
# # # fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
# dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
# # # print(dicts[:1])
# # # print(len(dicts))

from haystack.database.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="document")
# document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document",
#                                             embedding_field="embedding", embedding_dim=768)

# document_store.write_documents(dicts)

from haystack.retriever.sparse import ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)

# from haystack.retriever.dense import DensePassageRetriever
# retriever = DensePassageRetriever(document_store=document_store,
#                                   embedding_model="dpr-bert-base-nq",
#                                   do_lower_case=True, use_gpu=True)
# # document_store.update_embeddings(retriever)
Code Example #17
File: kbQA_embeddings.py  Project: F4r1n/haystack
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # Init the DocumentStore
    #
    # * specify the name of our `text_field` in Elasticsearch that we want to return as an answer
    # * specify the name of our `embedding_field` in Elasticsearch where we'll store the embedding of our question and that is used later for calculating our similarity to the incoming user question
    # * set `excluded_meta_data=["question_emb"]` so that we don't return the huge embedding vectors in our search results

    document_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="document",
        text_field="text",
        embedding_field="question_emb",
        embedding_dim=768,
        excluded_meta_data=["question_emb"])

    # Create a Retriever using embeddings
    # Instead of retrieving via Elasticsearch's plain BM25, we want to use vector similarity of the questions (user question vs. FAQ ones).
    # We can use the `EmbeddingRetriever` for this purpose and specify a model that we use for the embeddings.
    #
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   gpu=True)

    if POPULATE_DOCUMENT_STORE:
        # set path to the directory containing the text files
Code Example #18
from config import DB_HOST, DB_USER, DB_PW, DB_INDEX
from config import READER_MODEL_PATH
from fastapi import FastAPI
from pydantic import BaseModel

# haystack imports needed below (import paths as used in the other examples on this page)
from haystack import Finder
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.reader.farm import FARMReader
from haystack.retriever.sparse import ElasticsearchRetriever

import logging

logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(name)s %(threadName)s : %(message)s')

app = FastAPI()
### Model values for Reader and Document Store
global document_store, retriever, reader, finder
document_store = ElasticsearchDocumentStore(host=DB_HOST,
                                            username=DB_USER,
                                            password=DB_PW,
                                            index=DB_INDEX)
retriever = ElasticsearchRetriever(document_store=document_store)
reader = FARMReader(model_name_or_path=READER_MODEL_PATH, use_gpu=False)
finder = Finder(reader, retriever)

## API


class Item(BaseModel):
    query: str


@app.get("/greet")
async def greet():
    return {"message": "Hi there!!! I am working"}
Code Example #19
def main():
    # fetch model files if not present. not hosted in git repo
    # model_exists = os.path.isfile(
    #     './kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin')
    # if not model_exists:
    #     logging.info("Starting model download (about 700MB) ...")
    #     urllib.request.urlretrieve(
    #         "https://cdn.huggingface.co/mrm8488/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin",
    #         "./kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin")
    #     logging.info("model successfully downloaded")
    # start Elasticsearch
    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run(
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" --name "MLQA2" elasticsearch:7.6.2',
            shell=True
        )
        if status.returncode:
            raise Exception("Failed to launch Elasticsearch. If you want to "
                            "connect to an existing Elasticsearch instance"
                            "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # 512 dimensions because that is what the sentence transformer returns
    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index="document",
                                                embedding_dim=512,
                                                embedding_field="embedding")

    # load docs in database
    if LAUNCH_ELASTICSEARCH or POPULATE_DOCUMENT_STORE:
        dicts = convert_files_to_dicts(
            dir_path=data_path, clean_func=clean_text, split_paragraphs=True)

        logging.info("files to dicts done.")
        # write dicts containing the texts to the database
        document_store.write_documents(dicts)
        logging.info("documents to store written.")

        retriever = EmbeddingRetriever(document_store=document_store,
                                       embedding_model=retriever_model_name_full,
                                       model_format=retriever_model_type,
                                       gpu=False)
        # generate embeddings for each text and add it to the databse entry
        document_store.update_embeddings(retriever)
        logging.info("embeddings to documents in store written.")

    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=retriever_model_name_full,
                                   model_format=retriever_model_type,
                                   gpu=False)

    # the reader won't be used for retrieval because results take longer and the quality is worse;
    # the commented-out initialization below is kept for reference
    # reader = TransformersReader(model="./kbQA/" + reader_model_name,
    #                             tokenizer="./kbQA/" + reader_model_name,
    #                             use_gpu=-1)
    finder = Finder(retriever=retriever, reader=None)

    if TEST:
        try:
            with open("./kbQA/Test.json", encoding="utf-8") as file:
                times = []
                results = []
                failed = []
                # each line has multiple paragraphs and embeddings, read file line
                # by line
                for line in enumerate(file):
                    # load the json string of the current line as a Python object
                    data = json.loads(line[1])
                    q = data["question"]
                    # fetch results from db
                    start_time = time.process_time()
                    candidate_docs = finder.retriever.retrieve(
                        query=q, filters=None, top_k=5)
                    end_time = time.process_time()
                    times.append(end_time-start_time)
                    answered = False
                    for doc in candidate_docs:
                        if data["answer"] in doc.text:
                            answered = True
                            results.append(True)
                            break
                    if not answered:
                        answers = []
                        for doc in candidate_docs:
                            answers.append(doc.text)
                        failed.append(
                            {"q": q, "correct": data["answer"], "a": answers})
                total = 0
                for zeit in times:
                    total = total + zeit
                logging.info("Average time per request: %f",
                             total / len(times))
                logging.info("Questions answered correctly: %d/%d (%f)",
                             len(results), len(times), len(results)/len(times))
                logging.info("Failed questions:")
                for fail in failed:
                    logging.info("Question: %s", fail["q"])
                    logging.info("Correct Answer: %s", fail["correct"])
                    for answer in fail["a"]:
                        logging.info(answer)

        except Exception as e:
            traceback.print_exc()
            logging.error(f"exception: {e}")
    else:
        # loop until Keyboard-Interrupt event ctrl+c or "!q" input
        while True:
            try:
                # read input from the console
                q = input("Enter:").strip()
                # input "!q" to stop execution
                if q == "!q":
                    exit(0)
                # fetch results from db
                candidate_docs = finder.retriever.retrieve(
                    query=q, filters=None, top_k=5)
                for doc in candidate_docs:
                    logging.info("doc id: %s", doc.id)
                    logging.info("doc meta name: %s", doc.meta["name"])
                    logging.info("doc text: %s", doc.text)
                    logging.info("doc query score: %s", doc.query_score)
                    logging.info("")
                # not used
                # prediction = finder.get_answers(
                #     question=q, top_k_retriever=10, top_k_reader=5)
                # print_answers(prediction, details="medium")
            except Exception as e:
                traceback.print_exc()
                logging.error(f"exception: {e}")
Code Example #20
    status = subprocess.run(
        ['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True
    )
    if status.returncode:
        raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
                        "then set LAUNCH_ELASTICSEARCH in the script to False.")
    time.sleep(30)

# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents
doc_dir = "../data/nq"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document",
                                            create_index=False, embedding_field="emb",
                                            embedding_dim=768, excluded_meta_data=["emb"])


# Add evaluation data to Elasticsearch database
# We first delete the custom tutorial indices to not have duplicate elements
document_store.delete_all_documents(index=doc_index)
document_store.delete_all_documents(index=label_index)
document_store.add_eval_data(filename="../data/nq/nq_dev_subset_v2.json", doc_index=doc_index, label_index=label_index)


# Initialize Retriever
retriever = ElasticsearchRetriever(document_store=document_store)

# Alternative: Evaluate DensePassageRetriever
# Note, that DPR works best when you index short passages < 512 tokens as only those tokens will be used for the embedding.
Code Example #21
from pprint import pprint
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever

if __name__ == '__main__':

    document_store = ElasticsearchDocumentStore(
        host="192.168.8.106",
        username="",
        password="",
        index="drqa_wiki",
    )

    retriever = ElasticsearchRetriever(document_store=document_store)
    while True:
        q = input("utter question: ")
        documents = retriever.retrieve(q, top_k=3)
        pprint([d.text for d in documents])
Code Example #22
if LAUNCH_ELASTICSEARCH:
    logging.info("Starting Elasticsearch ...")
    status = subprocess.run([
        'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'
    ],
                            shell=True)
    if status.returncode:
        raise Exception(
            "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
            "then set LAUNCH_ELASTICSEARCH in the script to False.")
    time.sleep(15)

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="document",
                                            embedding_dim=768,
                                            embedding_field="embedding")

# ## Cleaning & indexing documents
# Let's first get some documents that we want to query
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts containing documents that can be indexed to our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir,
                               clean_func=clean_wiki_text,
                               split_paragraphs=True)

# Now, let's write the docs to our DB.
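
The snippet ends just before the write step; under the same setup it would presumably continue with:

# Write the converted dicts to the "document" index defined above.
document_store.write_documents(dicts)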
Code Example #23
from haystack import Finder
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.reader.farm import FARMReader
from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.utils import print_answers

document_store = ElasticsearchDocumentStore(
    host="192.168.8.106",
    username="",
    password="",
    index="drqa_wiki",
    # embedding_dim=768,
    # embedding_field="embedding",
)
retriever = ElasticsearchRetriever(document_store=document_store)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                    use_gpu=False)
finder = Finder(reader, retriever)

prediction = finder.get_answers(question="What is the capital of Germany?",
                                top_k_retriever=10,
                                top_k_reader=5)

# prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5)
# prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_reader=5)

print_answers(prediction, details="minimal")
Code Example #24
    logging.info("Starting Elasticsearch ...")
    status = subprocess.run(
        ['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True
    )
    if status.returncode:
        raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
                        "then set LAUNCH_ELASTICSEARCH in the script to False.")
    time.sleep(30)

# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents
doc_dir = "../data/nq"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset.json.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document", create_index=False)
# Add evaluation data to Elasticsearch database
if LAUNCH_ELASTICSEARCH:
    document_store.add_eval_data("../data/nq/nq_dev_subset.json")
else:
    logger.warning("Since we already have a running ES instance we should not index the same documents again."
                   "If you still want to do this call: 'document_store.add_eval_data('../data/nq/nq_dev_subset.json')' manually ")

# Initialize Retriever
retriever = ElasticsearchRetriever(document_store=document_store)

# Initialize Reader
reader = FARMReader("deepset/roberta-base-squad2")

# Initialize Finder which sticks together Reader and Retriever
finder = Finder(reader, retriever)
Code Example #25
from setup import enable_elastic_search, base_corpus
from haystack.reader.farm import FARMReader
from haystack.retriever.dense import DensePassageRetriever
from haystack.database.elasticsearch import ElasticsearchDocumentStore

from haystack import Finder
from typing import Dict, List

import re

enable_elastic_search()

document_store_dense = ElasticsearchDocumentStore(host="localhost",
                                                  username="",
                                                  password="",
                                                  index="document",
                                                  embedding_field="embedding",
                                                  embedding_dim=768)
documet_store_sparse = ElasticsearchDocumentStore(host="localhost",
                                                  username="",
                                                  password="",
                                                  index="document")

document_store_dense.write_documents(base_corpus())
dense_retriever = DensePassageRetriever(document_store=document_store_dense,
                                        embedding_model="dpr-bert-base-nq",
                                        do_lower_case=True,
                                        use_gpu=True)
document_store_dense.update_embeddings(dense_retriever)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
Code Example #26
File: model.py  Project: VictorMadu/COVID-QA
    DEFAULT_TOP_K_READER,
    DEFAULT_TOP_K_RETRIEVER,
)
from backend.controller.autocomplete import addQuestionToAutocomplete

logger = logging.getLogger(__name__)

router = APIRouter()

document_store = ElasticsearchDocumentStore(
    host=DB_HOST,
    username=DB_USER,
    password=DB_PW,
    index=DB_INDEX,
    scheme=ES_CONN_SCHEME,
    ca_certs=False,
    verify_certs=False,
    text_field=TEXT_FIELD_NAME,
    search_fields=SEARCH_FIELD_NAME,
    embedding_dim=EMBEDDING_DIM,
    embedding_field=EMBEDDING_FIELD_NAME,
    excluded_meta_data=EXCLUDE_META_DATA_FIELDS,
)

# multilingual baseline retriever (=BM25)
retriever = ElasticsearchRetriever(document_store=document_store,
                                   embedding_model=None,
                                   gpu=USE_GPU)

# english_retriever
english_retriever = ElasticsearchRetriever(
    document_store=document_store,
Code Example #27
File: MLQA_api.py  Project: F4r1n/haystack
        # of the sentence and bridge anaphora resolution
        sent_segs = [f"{topic}: {sent}" for sent in sent_segs]
        out.extend(sent_segs)

    text = "\n".join(out)
    return text


if __name__ == '__main__':
    while True:
        try:
            # 512 dimensions because that is what the sentence transformer returns
            document_store = ElasticsearchDocumentStore(
                host="elasticsearch",
                username="",
                password="",
                index="document",
                embedding_dim=512,
                embedding_field="embedding")
            break
        except:
            time.sleep(15)

    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=retriever_model_name_full,
                                   model_format=retriever_model_type,
                                   gpu=False)
    if document_store.get_document_count() < 1:
        dicts = convert_files_to_dicts(dir_path=data_path,
                                       clean_func=clean_text,
                                       split_paragraphs=True)
Code Example #28
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

from flask import Flask, jsonify, request

from haystack import Finder
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever

document_store = ElasticsearchDocumentStore(host="elasticsearch",
                                            username="",
                                            password="",
                                            index="arxiv-qa")


def filter_answers(results: dict, details: str = "all"):
    answers = results["answers"]
    if details != "all":
        if details == "minimal":
            keys_to_keep = set(["answer", "context"])
        elif details == "medium":
            keys_to_keep = set(["answer", "context", "score"])
        else:
            keys_to_keep = answers.keys()

        # filter the results
        filtered_answers = []
Code Example #29
File: data_ingestion.py  Project: zzzbit/COVID-QA
        question_embedding = retriever.create_embedding(
            r["_source"]["question"])

        body = {"doc": {"question_emb": question_embedding}}
        document_store.client.update(index=document_store.index,
                                     id=r["_id"],
                                     body=body)


if __name__ == "__main__":

    document_store = ElasticsearchDocumentStore(
        host="localhost",
        username="",
        password="",
        index="document",
        text_field="answer",
        embedding_field="question_emb",
        embedding_dim=768,
        excluded_meta_data=["question_emb"],
    )

    MODEL = "deepset/sentence_bert"
    GPU = False

    retriever = ElasticsearchRetriever(document_store=document_store,
                                       embedding_model=MODEL,
                                       gpu=GPU,
                                       emb_extraction_layer=-2,
                                       pooling_strategy="reduce_mean")

    # index new docs