Example no. 1
    def setup(self):
        print("SETTING UP PIPELINE")
        self.document_store = ElasticsearchDocumentStore(
            similarity="dot_product", host="elasticsearch", username="", password="", index="document")
        self.document_store_faiss = FAISSDocumentStore(
            index="document",
            faiss_index_factory_str="Flat",
            return_embedding=True,
            sql_url=f"postgresql://{config('POSTGRES_USER')}:{config('POSTGRES_PASSWORD')}@{config('POSTGRES_HOST')}:{config('POSTGRES_PORT')}/faiss"
        )
        processor, converter = self.write_as4_docs()
        table_data = self.write_table_docs(converter, processor)

        es_retriever = ElasticsearchRetriever(
            document_store=self.document_store)
        print("SETTING UP DPR")
        dpr_retriever = DPRTrainingManager.get_current_retriever(
            self.document_store_faiss)
        print("SETTING UP EMBEDDINGS")
        embedding_retriever = EmbeddingRetriever(
            document_store=self.document_store_faiss,
            embedding_model="deepset/sentence_bert"
        )
        query_classifier = QueryClassifier()
        print("SETTING UP TABLE")
        table_retriever = TableRetriever(table_data)
        print("SETUP RETRIEVERS")
        self.question_generator = FurtherQuestionGenerator()
        print("UPDATING EMBEDDINGS")
        self.document_store_faiss.update_embeddings(dpr_retriever)
        print("UPDATED EMBEDDINGS")
        self.dpr_node = ContinualDPRNode(
            dpr_retriever, self.document_store_faiss)
        result = Result()
        self.trainer = DPRTrainingManager(
            self.document_store_faiss, self.dpr_node)
        print("SETUP COMPONENTS")
        pipeline = Pipeline()
        pipeline.add_node(component=es_retriever,
                          name="ESRetriever", inputs=["Query"])
        pipeline.add_node(component=self.dpr_node,
                          name="DPRRetriever", inputs=["Query"])
        pipeline.add_node(component=embedding_retriever,
                          name="EmbeddingRetriever", inputs=["Query"])
        pipeline.add_node(component=JoinDocuments(join_mode="merge"), name="JoinResults", inputs=[
                          "DPRRetriever", "EmbeddingRetriever", "ESRetriever"])
        pipeline.add_node(component=query_classifier,
                          name="QueryClassifier", inputs=["JoinResults"])
        pipeline.add_node(component=self.question_generator,
                          name="QnGenerator", inputs=["QueryClassifier.output_1"])
        pipeline.add_node(component=table_retriever, name="TableRetriever", inputs=[
                          "QueryClassifier.output_2"])
        pipeline.add_node(component=result, name="Result", inputs=[
                          "QnGenerator", "TableRetriever"])
        self.pipeline = pipeline
        print("SETUP PIPELINE")
Example no. 2
def faiss_document_store():
    if os.path.exists("haystack_test_faiss.db"):
        os.remove("haystack_test_faiss.db")
    document_store = FAISSDocumentStore(
        sql_url="sqlite:///haystack_test_faiss.db", return_embedding=True)
    yield document_store
    document_store.faiss_index.reset()
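
This generator function is a pytest-style fixture (the @pytest.fixture decorator was presumably stripped from the snippet); a minimal sketch of a test consuming it, with a made-up document:

def test_faiss_store_write_and_count(faiss_document_store):
    # hypothetical document; the real tests use a module-level DOCUMENTS constant
    docs = [{"text": "FAISS indexes dense vectors.", "meta": {"name": "doc_1"}}]
    faiss_document_store.write_documents(docs)
    assert faiss_document_store.get_document_count() == 1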
Example no. 3
    def load(self):
        if(self.finder and self.finder2):
            return
        if(not self.document_store2):
            self.document_store2 = FAISSDocumentStore.load(
                sql_url=sqlUrlFAQ, faiss_file_path='faiss2')  # save before load in preprocess
            self.initSql(url=sqlUrlFAQ, document_store=self.document_store2)
        # else:  # reset session
        #     # self.document_store2.session.close()
        #     super(
        #         FAISSDocumentStore, self.document_store2).__init__(url=sqlUrlFAQ)
        if(not self.retriever2):
            self.retriever2 = EmbeddingRetriever(document_store=self.document_store2,
                                                 embedding_model="sentence_bert-saved", use_gpu=False)
        if(not self.finder2):
            self.finder2 = Finder(reader=None, retriever=self.retriever2)

        if(not self.document_store):
            self.document_store = SQLDocumentStore(url=sqlUrl)
            # FAISSDocumentStore.load(faiss_file_path='faiss1', sql_url=sqlUrl)
            self.initSql(url=sqlUrl, document_store=self.document_store)
        # else:  # reset session
        #     # self.document_store.session.close()
        #     super(
        #         FAISSDocumentStore, self.document_store).__init__(url=sqlUrl)
        # self.retriever = EmbeddingRetriever(  # reduce load by sharing the same retriever and set the store on the fly??
        #     document_store=self.document_store, embedding_model="sentence_bert-saved", use_gpu=False) if not self.retriever else self.retriever
        if(not self.retriever):
            self.retriever = TfidfRetriever(document_store=self.document_store)
        self.reader = FARMReader(model_name_or_path=modelDir,
                                 use_gpu=False, no_ans_boost=0) if not self.reader else self.reader
        # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
        self.finder = Finder(
            self.reader, self.retriever) if not self.finder else self.finder
Example no. 4
def get_document_store(document_store_type, embedding_field="embedding"):
    if document_store_type == "sql":
        document_store = SQLDocumentStore(url="sqlite://", index="haystack_test")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore(
            return_embedding=True, embedding_field=embedding_field, index="haystack_test"
        )
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(
            index="haystack_test", return_embedding=True, embedding_field=embedding_field
        )
    elif document_store_type == "faiss":
        document_store = FAISSDocumentStore(
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    elif document_store_type == "milvus":
        document_store = MilvusDocumentStore(
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")

    return document_store
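
In the Haystack test suite such a factory is typically consumed through a parametrized pytest fixture; a rough sketch under that assumption (the backend list is illustrative):

import pytest

@pytest.fixture(params=["memory", "sql", "faiss"])
def document_store(request):
    # build a fresh store for each parametrized backend
    return get_document_store(request.param)

def test_store_starts_empty(document_store):
    assert document_store.get_document_count() == 0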
Example no. 5
def get_document_store(document_store_type, similarity='dot_product'):
    """ TODO This method is taken from test/conftest.py but maybe should be within Haystack.
    Perhaps a class method of DocStore that just takes string for type of DocStore"""
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
        assert document_store.get_document_count() == 0
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="eval_document",
                                                    similarity=similarity,
                                                    timeout=3000)
    elif document_store_type in ("milvus_flat", "milvus_hnsw"):
        if document_store_type == "milvus_flat":
            index_type = IndexType.FLAT
            index_param = None
            search_param = None
        elif document_store_type == "milvus_hnsw":
            index_type = IndexType.HNSW
            index_param = {"M": 64, "efConstruction": 80}
            search_param = {"ef": 20}
        document_store = MilvusDocumentStore(similarity=similarity,
                                             index_type=index_type,
                                             index_param=index_param,
                                             search_param=search_param)
        assert document_store.get_document_count(index="eval_document") == 0
    elif document_store_type in ("faiss_flat", "faiss_hnsw"):
        if document_store_type == "faiss_flat":
            index_type = "Flat"
        elif document_store_type == "faiss_hnsw":
            index_type = "HNSW"
        status = subprocess.run(['docker rm -f haystack-postgres'], shell=True)
        time.sleep(1)
        status = subprocess.run([
            'docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres'
        ],
                                shell=True)
        time.sleep(6)
        status = subprocess.run([
            'docker exec haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'
        ],
                                shell=True)
        time.sleep(1)
        document_store = FAISSDocumentStore(
            sql_url="postgresql://*****:*****@localhost:5432/haystack",
            faiss_index_factory_str=index_type,
            similarity=similarity)
        assert document_store.get_document_count() == 0

    else:
        raise Exception(
            f"No document store fixture for '{document_store_type}'")
    return document_store
Example no. 6
def test_faiss_index_save_and_load(tmp_path):
    document_store = FAISSDocumentStore(
        sql_url=f"sqlite:////{tmp_path/'haystack_test.db'}",
        index="haystack_test",
    )
    document_store.write_documents(DOCUMENTS)

    # test saving the index
    document_store.save(tmp_path / "haystack_test_faiss")

    # clear existing faiss_index
    document_store.faiss_indexes[document_store.index].reset()

    # test faiss index is cleared
    assert document_store.faiss_indexes[document_store.index].ntotal == 0

    # test loading the index
    new_document_store = FAISSDocumentStore.load(
        sql_url=f"sqlite:////{tmp_path/'haystack_test.db'}",
        faiss_file_path=tmp_path / "haystack_test_faiss",
        index=document_store.index)

    # check faiss index is restored
    assert new_document_store.faiss_indexes[
        document_store.index].ntotal == len(DOCUMENTS)
    # check if documents are restored
    assert len(new_document_store.get_all_documents()) == len(DOCUMENTS)
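
These FAISS tests rely on a module-level DOCUMENTS constant that is not shown in the snippets; in the Haystack test suite it is a small list of dicts with text, metadata and a precomputed embedding. A hypothetical minimal version (the texts and the number of entries are assumptions):

import numpy as np

DOCUMENTS = [
    {"text": "Paris is the capital of France.", "meta": {"name": "doc_1"},
     "embedding": np.random.rand(768).astype(np.float32)},
    {"text": "Berlin is the capital of Germany.", "meta": {"name": "doc_2"},
     "embedding": np.random.rand(768).astype(np.float32)},
]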
Example no. 7
def get_document_store(document_store_type,
                       embedding_dim=768,
                       embedding_field="embedding"):
    if document_store_type == "sql":
        document_store = SQLDocumentStore(url="sqlite://",
                                          index="haystack_test")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore(return_embedding=True,
                                               embedding_dim=embedding_dim,
                                               embedding_field=embedding_field,
                                               index="haystack_test")
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(
            index="haystack_test",
            return_embedding=True,
            embedding_dim=embedding_dim,
            embedding_field=embedding_field)
    elif document_store_type == "faiss":
        document_store = FAISSDocumentStore(
            vector_dim=embedding_dim,
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    elif document_store_type == "milvus":
        document_store = MilvusDocumentStore(
            vector_dim=embedding_dim,
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        _, collections = document_store.milvus_server.list_collections()
        for collection in collections:
            if collection.startswith("haystack_test"):
                document_store.milvus_server.drop_collection(collection)
        return document_store
    elif document_store_type == "weaviate":
        document_store = WeaviateDocumentStore(
            weaviate_url="http://localhost:8080", index="Haystacktest")
        document_store.weaviate_client.schema.delete_all()
        document_store._create_schema_and_index_if_not_exist()
        return document_store
    else:
        raise Exception(
            f"No document store fixture for '{document_store_type}'")

    return document_store
Example no. 8
def get_document_store(document_store_type, es_similarity='cosine'):
    """ TODO This method is taken from test/conftest.py but maybe should be within Haystack.
    Perhaps a class method of DocStore that just takes string for type of DocStore"""
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="eval_document",
                                                    similarity=es_similarity)
    elif document_store_type in ("faiss_flat", "faiss_hnsw"):
        if document_store_type == "faiss_flat":
            index_type = "Flat"
        elif document_store_type == "faiss_hnsw":
            index_type = "HNSW"

        #TEMP FIX for issue with deleting docs
        # status = subprocess.run(
        #     ['docker rm -f haystack-postgres'],
        #     shell=True)
        # time.sleep(3)
        # try:
        #     document_store = FAISSDocumentStore(sql_url="postgresql://*****:*****@localhost:5432/haystack",
        #                                         faiss_index_factory_str=index_type)
        # except:
        # Launch a postgres instance & create empty DB
        # logger.info("Didn't find Postgres. Start a new instance...")
        status = subprocess.run(['docker rm -f haystack-postgres'], shell=True)
        time.sleep(1)
        status = subprocess.run([
            'docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres'
        ],
                                shell=True)
        time.sleep(3)
        status = subprocess.run([
            'docker exec -it haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'
        ],
                                shell=True)
        time.sleep(1)
        document_store = FAISSDocumentStore(
            sql_url="postgresql://*****:*****@localhost:5432/haystack",
            faiss_index_factory_str=index_type)

    else:
        raise Exception(
            f"No document store fixture for '{document_store_type}'")
    assert document_store.get_document_count() == 0
    return document_store
Example no. 9
def test_faiss_retrieving(index_factory):
    document_store = FAISSDocumentStore(
        sql_url="sqlite:///haystack_test_faiss.db",
        faiss_index_factory_str=index_factory)
    document_store.delete_all_documents(index="document")
    if "ivf" in index_factory.lower():
        document_store.train_index(DOCUMENTS)
    document_store.write_documents(DOCUMENTS)
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model="deepset/sentence_bert",
                                   use_gpu=False)
    result = retriever.retrieve(query="How to test this?")
    assert len(result) == len(DOCUMENTS)
    assert type(result[0]) == Document
Example no. 10
def test_faiss_passing_index_from_outside():
    d = 768
    nlist = 2
    quantizer = faiss.IndexFlatIP(d)
    faiss_index = faiss.IndexIVFFlat(quantizer, d, nlist,
                                     faiss.METRIC_INNER_PRODUCT)
    faiss_index.nprobe = 2
    document_store = FAISSDocumentStore(
        sql_url="sqlite:///haystack_test_faiss.db", faiss_index=faiss_index)

    document_store.delete_all_documents(index="document")
    # as it is an IVF index we need to train it before adding docs
    document_store.train_index(DOCUMENTS)

    document_store.write_documents(documents=DOCUMENTS, index="document")
    documents_indexed = document_store.get_all_documents(index="document")

    # test document correctness
    check_data_correctness(documents_indexed, DOCUMENTS)
Example no. 11
def test_faiss_passing_index_from_outside():
    d = 768
    nlist = 2
    quantizer = faiss.IndexFlatIP(d)
    faiss_index = faiss.IndexIVFFlat(quantizer, d, nlist,
                                     faiss.METRIC_INNER_PRODUCT)
    faiss_index.set_direct_map_type(faiss.DirectMap.Hashtable)
    faiss_index.nprobe = 2
    document_store = FAISSDocumentStore(
        sql_url="sqlite:///haystack_test_faiss.db", faiss_index=faiss_index)

    document_store.delete_all_documents(index="document")
    # as it is an IVF index we need to train it before adding docs
    document_store.train_index(DOCUMENTS)

    document_store.write_documents(documents=DOCUMENTS, index="document")
    documents_indexed = document_store.get_all_documents(index="document")

    # test if vectors ids are associated with docs
    for doc in documents_indexed:
        assert 0 <= int(doc.meta["vector_id"]) <= 7
Example no. 12
def test_faiss_index_save_and_load(document_store):
    document_store.write_documents(DOCUMENTS)

    # test saving the index
    document_store.save("haystack_test_faiss")

    # clear existing faiss_index
    document_store.faiss_index.reset()

    # test faiss index is cleared
    assert document_store.faiss_index.ntotal == 0

    # test loading the index
    new_document_store = FAISSDocumentStore.load(
        sql_url="sqlite://", faiss_file_path="haystack_test_faiss")

    # check faiss index is restored
    assert new_document_store.faiss_index.ntotal == len(DOCUMENTS)
Example no. 13
def get_document_store(document_store_type):
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="haystack_test")
    elif document_store_type == "faiss":
        if os.path.exists("haystack_test_faiss.db"):
            os.remove("haystack_test_faiss.db")
        document_store = FAISSDocumentStore(
            sql_url="sqlite:///haystack_test_faiss.db")
    else:
        raise Exception(
            f"No document store fixture for '{document_store_type}'")

    return document_store
Example no. 14
def Find_answer(text_file_path, data_folder_path, symbol, question):
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

    with open(text_file_path, 'r', encoding='utf-8') as f:
        data = f.read()
    for i, line in enumerate(data.split(symbol)):
        with open(f'{data_folder_path}/data{i+1}.txt', 'w') as f:
            print(f'writing file no.{i+1}')
            f.write(line)

    test_dicts = convert_files_to_dicts(dir_path=data_folder_path,
                                        clean_func=clean_wiki_text,
                                        split_paragraphs=True)
    document_store.write_documents(test_dicts)
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True)

    document_store.update_embeddings(retriever)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True,
                        context_window_size=300)

    pipe = ExtractiveQAPipeline(reader, retriever)

    prediction = pipe.run(query=question, top_k_retriever=10, top_k_reader=3)

    doc_with_ans = []
    for i in range(len(prediction['answers'])):
        if prediction['answers'][i]['context'] not in doc_with_ans:
            doc_with_ans.append(prediction['answers'][i]['context'])

    answer = ' '.join(doc_with_ans)

    return answer
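
A hypothetical call, just to show the expected arguments (the paths, delimiter and question below are made up):

answer = Find_answer(
    text_file_path="wiki_dump.txt",     # source text file to split
    data_folder_path="data_chunks",     # folder that receives the split files
    symbol="\n\n",                      # delimiter used to split the source text
    question="Who wrote the article?")
print(answer)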
Example no. 15
                  stdout=PIPE, stderr=STDOUT,
                  preexec_fn=lambda: os.setuid(1)
                  )
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
document_store_ES = ElasticsearchDocumentStore(
    similarity="dot_product", host="localhost", username="", password="")

from typing import List
import requests
import pandas as pd
from haystack import Document
from haystack.document_store.faiss import FAISSDocumentStore

# FAISS DocumentStore used for DPR and embedding retriever
document_store_FAISS = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
    return_embedding=True
)

import pprint
pp = pprint.PrettyPrinter(indent=2)

"""## Document Preprocessing"""

from haystack.reader.farm import FARMReader
import haystack

converter = haystack.file_converter.txt.TextConverter(
    remove_numeric_tables=False,
    valid_languages=["en"])

as4 = converter.convert(file_path="/content/as4-winterBarley.txt")
Example no. 16
import time
from elasticsearch import Elasticsearch

es_server = Popen(['/home/dr_lunars/elasticsearch-7.0.0/bin/elasticsearch'],stdout=PIPE, stderr=STDOUT)

time.sleep(30)

es = Elasticsearch("http://localhost:9200", timeout=300, max_retries=10, retry_on_timeout=True)

daily_score = 0

# %%
# DPR

from haystack.document_store.faiss import FAISSDocumentStore
document_store = FAISSDocumentStore.load(faiss_file_path="my_faiss", sql_url="sqlite:///my_doc_store.db", index="document")

from dpr_inference import DPR

model_path = '/home/dr_lunars/models/question_encoder-optimized-quantized.onnx'
tokenizer_path = "kykim/bert-kor-base"

dpr = DPR(
    model_path=model_path,
    tokenizer_path=tokenizer_path,
    document_store=document_store
)

# %%
# Reader
Example no. 17
# Minimal cleaning
df.fillna(value="", inplace=True)

print(df.head())

titles = list(df["title"].values)
texts = list(df["text"].values)

# Convert to the Haystack Document format
documents: List[Document] = []
for title, text in zip(titles, texts):
    documents.append(Document(text=text, meta={"name": title or ""}))

# Initialize FAISS document store for the documents and the corresponding embedding index
# Set `return_embedding` to `True`, so the generator doesn't have to re-embed the documents
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat",
                                    return_embedding=True)

# Initialize DPR Retriever to encode documents, encode question and query documents
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=False,
    embed_title=True,
)

# Initialize RAG Generator
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=False,
    top_k_answers=1,
Example no. 18
def get_faiss_document_store():
    return FAISSDocumentStore(
        faiss_index_factory_str=hyperparams.faiss_index_factory_str,
        sql_url=sql_url)
Example no. 19
def tutorial7_rag_generator():
    # Add documents from which you want to generate answers
    # Download a CSV containing some sample document data
    temp = requests.get("https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv")
    open('small_generator_dataset.csv', 'wb').write(temp.content)

    # Get dataframe with columns "title", and "text"
    df = pd.read_csv("small_generator_dataset.csv", sep=',')
    # Minimal cleaning
    df.fillna(value="", inplace=True)

    print(df.head())

    titles = list(df["title"].values)
    texts = list(df["text"].values)

    # Convert to the Haystack Document format
    documents: List[Document] = []
    for title, text in zip(titles, texts):
        documents.append(
            Document(
                text=text,
                meta={
                    "name": title or ""
                }
            )
        )


    # Initialize FAISS document store for the documents and the corresponding embedding index
    # Set `return_embedding` to `True`, so the generator doesn't have to re-embed the documents
    document_store = FAISSDocumentStore(
        faiss_index_factory_str="Flat",
        return_embedding=True
    )

    # Initialize DPR Retriever to encode documents, encode question and query documents
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=True,
        embed_title=True,
    )

    # Initialize RAG Generator
    generator = RAGenerator(
        model_name_or_path="facebook/rag-token-nq",
        use_gpu=True,
        top_k_answers=1,
        max_length=200,
        min_length=2,
        embed_title=True,
        num_beams=2,
    )

    # Delete existing documents in documents store
    document_store.delete_all_documents()
    # Write documents to document store
    document_store.write_documents(documents)
    # Add documents embeddings to index
    document_store.update_embeddings(
        retriever=retriever
    )

    # Now ask your questions
    # We have some sample questions
    QUESTIONS = [
        "who got the first nobel prize in physics",
        "when is the next deadpool movie being released",
        "which mode is used for short wave broadcast service",
        "who is the owner of reading football club",
        "when is the next scandal episode coming out",
        "when is the last time the philadelphia won the superbowl",
        "what is the most current adobe flash player version",
        "how many episodes are there in dragon ball z",
        "what is the first step in the evolution of the eye",
        "where is gall bladder situated in human body",
        "what is the main mineral in lithium batteries",
        "who is the president of usa right now",
        "where do the greasers live in the outsiders",
        "panda is a national animal of which country",
        "what is the name of manchester united stadium",
    ]

    # Now generate an answer for each question
    for question in QUESTIONS:
        # Retrieve related documents from retriever
        retriever_results = retriever.retrieve(
            query=question
        )

        # Now generate answer from question and retrieved documents
        predicted_result = generator.predict(
            query=question,
            documents=retriever_results,
            top_k=1
        )

        # Print the answer
        answers = predicted_result["answers"]
        print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')
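
As with the other tutorial-style snippets here, this function is presumably invoked from a main guard; a one-line sketch:

if __name__ == "__main__":
    tutorial7_rag_generator()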
Example no. 20
}).to_dict(orient='records')
# %%
# clean data
# preprocessing from haystack
preprocessor = PreProcessor(clean_empty_lines=True,
                            clean_whitespace=True,
                            clean_header_footer=False,
                            split_by="word",
                            split_length=100,
                            split_respect_sentence_boundary=True,
                            split_overlap=10)
nested_docs = [preprocessor.process(d) for d in all_dicts]
docs = [d for x in nested_docs for d in x]
# %%
# start FAISS document store and store docs
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

document_store.write_documents(docs)

# %%
# initialise storage
from haystack.retriever.dense import DensePassageRetriever
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=False,
    embed_title=True,
Example no. 21
def tutorial12_lfqa():
    """
    Document Store:
    FAISS is a library for efficient similarity search on a cluster of dense vectors.
    The `FAISSDocumentStore` uses a SQL database (SQLite in-memory by default) under the hood
    to store the document text and other metadata. The vector embeddings of the text are
    indexed on a FAISS index that is later queried when searching for answers.
    The default flavour of FAISSDocumentStore is "Flat", but it can also be set to "HNSW" for
    faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor.
    For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    """

    from haystack.document_store.faiss import FAISSDocumentStore

    document_store = FAISSDocumentStore(vector_dim=128,
                                        faiss_index_factory_str="Flat")
    """
    Cleaning & indexing documents:
    Similarly to the previous tutorials, we download, convert and index some Game of Thrones articles to our DocumentStore
    """

    # Let's first get some files that we want to use
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert files to dicts
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)

    # Now, let's write the dicts containing documents to our DB.
    document_store.write_documents(dicts)
    """
    Initialize Retriever and Reader/Generator:
    We use a `RetribertRetriever` and we invoke `update_embeddings` to index the embeddings of documents in the `FAISSDocumentStore`
    """

    from haystack.retriever.dense import EmbeddingRetriever

    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model="yjernite/retribert-base-uncased",
        model_format="retribert")

    document_store.update_embeddings(retriever)
    """Before we blindly use the `RetribertRetriever` let's empirically test it to make sure a simple search indeed finds the relevant documents."""

    from haystack.utils import print_answers, print_documents
    from haystack.pipeline import DocumentSearchPipeline

    p_retrieval = DocumentSearchPipeline(retriever)
    res = p_retrieval.run(query="Tell me something about Arya Stark?",
                          top_k_retriever=5)
    print_documents(res, max_text_len=512)
    """
    Similar to previous tutorials, we now initialize our reader/generator.
    Here we use a `Seq2SeqGenerator` with the *yjernite/bart_eli5* model (see: https://huggingface.co/yjernite/bart_eli5)
    """

    generator = Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5")
    """
    Pipeline:
    With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
    Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `GenerativeQAPipeline` that combines a retriever and a reader/generator to answer our questions.
    You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    """

    from haystack.pipeline import GenerativeQAPipeline
    pipe = GenerativeQAPipeline(generator, retriever)
    """Voilà! Ask a question!"""

    query_1 = "Why did Arya Stark's character get portrayed in a television adaptation?"
    result_1 = pipe.run(query=query_1, top_k_retriever=1)
    print(f"Query: {query_1}")
    print(f"Answer: {result_1['answers'][0]}")
    print()

    query_2 = "What kind of character does Arya Stark play?"
    result_2 = pipe.run(query=query_2, top_k_retriever=1)
    print(f"Query: {query_2}")
    print(f"Answer: {result_2['answers'][0]}")
    print()
    pipe.run(query=query_2, top_k_retriever=1)
Example no. 22
class DPRTrainingTester:
    """
    To run with an in memory sqlite database
    """
    document_store = FAISSDocumentStore(
        faiss_index_factory_str="Flat"
    )

    retreiver = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        # use_gpu=True,
        use_gpu=False,
        embed_title=True,
        use_fast_tokenizers=True
    )

    # Loads the test document into the document store
    def loadDocumentsFromFile(self, knowledgeFilePath):
        converter = TextConverter(
            remove_numeric_tables=False,
            valid_languages=["en"])
        processor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=False,
            clean_header_footer=True,
            split_by="passage",
            split_length=1,
            split_respect_sentence_boundary=False,
            split_overlap=0
        )
        self.trainingFile = knowledgeFilePath
        loadedFile = converter.convert(knowledgeFilePath)
        documents = processor.process(loadedFile)
        for i in range(0, len(documents)):
            docMetadata = documents[i]['meta']
            docMetadata['name'] = knowledgeFilePath
            docMetadata['documentID'] = knowledgeFilePath \
                + str(docMetadata['_split_id'])

        self.document_store.write_documents(documents)
        backagain = self.document_store.get_all_documents()

        print("Number of documents loaded", end=": ")
        print(self.document_store.get_document_count())

    def __init__(self, knowledgeFilePath):
        print("Started DPR Training tester!")

        # load test document into database
        print("Loading documents from " + knowledgeFilePath)
        self.document_store.delete_all_documents()
        self.loadDocumentsFromFile(knowledgeFilePath)

        # update dpr embeddings based on initial retreiver
        print("Performing initial embeddings update")
        self.document_store.update_embeddings(self.retreiver)

        # generate a new dprTrainingSet to populate
        self.trainingSet = DPRTrainingSet(self.document_store, 0)

    # return document store's id for the response marked correct
    def get_correct_id(self, responses, correctNum):
        # correctRespons = responses[correctNum].to_dict()
        return responses[correctNum].id

    # return list of document store ids for alternative responses
    def get_incorrect_ids(self, responses, correctNum):
        ids = []
        for i in range(0, len(responses)):
            if i == correctNum:
                continue
            ids.append(responses[i].id)
        return ids

    def askQuestion(self):
        print("------------------------------")
        question = input("Enter new question (DONE to finish): ")

        if question == "":
            return

        if question == 'DONE':
            self.generateTraining()
            return

        k = 10
        responses = self.retreiver.retrieve(question, top_k=k)

        print()
        for i in range(0, k):
            print(i, end=": ")
            print(responses[i].text)
            # print(responses[i])
            print()

        print()
        correctNum = input("Select correct response (X if none correct): ")

        if correctNum == "":
            return

        if correctNum == 'X':
            return

        print()
        print("------------------------------")

        self.trainingSet.addItem(
            question=question,
            posID=self.get_correct_id(responses, int(correctNum)),
            negIDs=self.get_incorrect_ids(responses, int(correctNum))
        )

    # file where all the training stuff is
    doc_dir = "data/"

    def generateTraining(self):
        self.trainingSet.addInBatchNegatives()

        self.trainingSet.generateJSON(self.trainingFile + "SET.json")
        print("New training set saved to: " + self.trainingFile + "SET.json")

        exit(0)

    def loop(self):
        self.askQuestion()
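
A hypothetical driver for this tester (the knowledge file path is made up); loop() keeps prompting until the user types DONE, at which point generateTraining() writes the training set and exits:

if __name__ == "__main__":
    tester = DPRTrainingTester("knowledgeBase/text/sample.txt")  # hypothetical knowledge file
    while True:
        tester.loop()  # ask a question and record the correct/incorrect responses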
Example no. 23
class MLPipeline:
    def __init__(self):
        self.pipeline = None
        self.document_store = None
        self.document_store_faiss = None
        self.question_generator = None
        self.doc_lock = RLock()

    def write_as4_docs(self):
        converter = file_converter.txt.TextConverter(
            remove_numeric_tables=False,
            valid_languages=["en"])

        processor = preprocessor.preprocessor.PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=False,
            clean_header_footer=True,
            split_by="passage",
            split_length=1,
            split_respect_sentence_boundary=False,
            split_overlap=0
        )

        self.document_store.delete_all_documents(index="document")
        self.document_store_faiss.delete_all_documents(index="document")

        for file in [file for file in os.listdir("knowledgeBase/text") if ".txt" in file]:
            doc = converter.convert(file_path="knowledgeBase/text/" + file)
            doc_processed = processor.process(doc)

            for i in range(len(doc_processed)):
                doc_processed[i]["meta"]["index"] = -1
                doc_processed[i]["meta"]["table"] = False
                doc_processed[i]["meta"]["name"] = file[:-4]

            self.document_store.write_documents(
                doc_processed, index="document")
            self.document_store_faiss.write_documents(
                doc_processed, index="document")

        backagain = self.document_store_faiss.get_all_documents()
        for i in range(0, len(backagain)):
            print(i)
            print(":\n")
            print(backagain[i])
            print("---------------")

        return (processor, converter)

    def write_table_docs(self, converter, processor):
        data = []
        docs = []

        for file in [file for file in os.listdir("knowledgeBase/tables") if ".csv" in file]:
            with open("knowledgeBase/tables/" + file, mode='r') as infile:
                reader = csv.reader(infile)
                new_dict = {row[0]: row[1:] for row in reader}
                data.append(new_dict)
            infile.close()
            with open("knowledgeBase/tables/" + file[:-4] + ".txt", mode='r') as infile:
                docs.append(infile.read())
            infile.close()

        with open('knowledgeBase/table_text.txt', 'w') as outfile:
            for item in docs:
                outfile.write("%s\n\n" % item)
        outfile.close()

        # Construct FAISS DocumentStore for table content

        tables = converter.convert(file_path="knowledgeBase/table_text.txt")

        tableDocs = processor.process(tables)
        for i in range(len(tableDocs)):
            tableDocs[i]["meta"]["index"] = i
            tableDocs[i]["meta"]["table"] = True
            tableDocs[i]["meta"]["name"] = "  "

        self.document_store.write_documents(tableDocs, index="document")
        self.document_store_faiss.write_documents(tableDocs, index="document")
        return data

    def re_process_documents(self):
        with self.doc_lock:
            self.setup()

    def setup(self):
        print("SETTING UP PIPELINE")
        self.document_store = ElasticsearchDocumentStore(
            similarity="dot_product", host="elasticsearch", username="", password="", index="document")
        self.document_store_faiss = FAISSDocumentStore(
            index="document",
            faiss_index_factory_str="Flat",
            return_embedding=True,
            sql_url=f"postgresql://{config('POSTGRES_USER')}:{config('POSTGRES_PASSWORD')}@{config('POSTGRES_HOST')}:{config('POSTGRES_PORT')}/faiss"
        )
        processor, converter = self.write_as4_docs()
        table_data = self.write_table_docs(converter, processor)

        es_retriever = ElasticsearchRetriever(
            document_store=self.document_store)
        print("SETTING UP DPR")
        dpr_retriever = DPRTrainingManager.get_current_retriever(
            self.document_store_faiss)
        print("SETTING UP EMBEDDINGS")
        embedding_retriever = EmbeddingRetriever(
            document_store=self.document_store_faiss,
            embedding_model="deepset/sentence_bert"
        )
        query_classifier = QueryClassifier()
        print("SETTING UP TABLE")
        table_retriever = TableRetriever(table_data)
        print("SETUP RETRIEVERS")
        self.question_generator = FurtherQuestionGenerator()
        print("UPDATING EMBEDDINGS")
        self.document_store_faiss.update_embeddings(dpr_retriever)
        print("UPDATED EMBEDDINGS")
        self.dpr_node = ContinualDPRNode(
            dpr_retriever, self.document_store_faiss)
        result = Result()
        self.trainer = DPRTrainingManager(
            self.document_store_faiss, self.dpr_node)
        print("SETUP COMPONENTS")
        pipeline = Pipeline()
        pipeline.add_node(component=es_retriever,
                          name="ESRetriever", inputs=["Query"])
        pipeline.add_node(component=self.dpr_node,
                          name="DPRRetriever", inputs=["Query"])
        pipeline.add_node(component=embedding_retriever,
                          name="EmbeddingRetriever", inputs=["Query"])
        pipeline.add_node(component=JoinDocuments(join_mode="merge"), name="JoinResults", inputs=[
                          "DPRRetriever", "EmbeddingRetriever", "ESRetriever"])
        pipeline.add_node(component=query_classifier,
                          name="QueryClassifier", inputs=["JoinResults"])
        pipeline.add_node(component=self.question_generator,
                          name="QnGenerator", inputs=["QueryClassifier.output_1"])
        pipeline.add_node(component=table_retriever, name="TableRetriever", inputs=[
                          "QueryClassifier.output_2"])
        pipeline.add_node(component=result, name="Result", inputs=[
                          "QnGenerator", "TableRetriever"])
        self.pipeline = pipeline
        print("SETUP PIPELINE")

    def answer(self, question, history={}):
        with self.doc_lock:
            if self.pipeline is None:
                return ""

            print(f"USING HISTORY: {history}")
            self.question_generator.history = history
            responses = self.pipeline.run(
                query=self.question_generator.question_parsing(question), top_k_retriever=20)
            if type(responses) is list:
                return responses[0]
            else:
                return responses

    def report(self, question):
        print("Question reported:")
        if self.trainer is None:
            print("Trainer is missing!")
            return []
        return self.trainer.processQuestion(question)

    def processTrainingAction(self, question, choices, correct_num):
        if self.trainer is None:
            print("Trainer is missing!")
            return 0
        return self.trainer.processTrainingAction(question, choices, correct_num)
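
A minimal sketch of how this class might be driven, assuming the Elasticsearch and Postgres services referenced in setup() are reachable; the question string is made up:

if __name__ == "__main__":
    ml = MLPipeline()
    ml.setup()  # build the document stores, retrievers and the query pipeline
    print(ml.answer("How do I configure the system?", history={}))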
Example no. 24
def create_store():
    # FAISS is a library for efficient similarity search on a cluster of dense vectors.
    # The FAISSDocumentStore uses a SQL (SQLite in-memory by default) document store under the hood
    # to store the document text and other meta data. The vector embeddings of the text are
    # indexed on a FAISS Index that later is queried for searching answers.
    document_store = FAISSDocumentStore()
Example no. 25
def tutorial6_better_retrieval_via_dpr():
    # FAISS is a library for efficient similarity search on a cluster of dense vectors.
    # The FAISSDocumentStore uses a SQL (SQLite in-memory by default) document store under the hood
    # to store the document text and other meta data. The vector embeddings of the text are
    # indexed on a FAISS Index that later is queried for searching answers.
    # The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for
    # faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor.
    # For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

    # ## Preprocessing of documents
    # Let's first get some documents that we want to query
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)

    # Now, let's write the docs to our DB.
    document_store.write_documents(dicts)

    ### Retriever
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=2,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True)

    # Important:
    # Now that we have the DPR retriever initialized, we need to call update_embeddings() to iterate over all
    # previously indexed documents and update their embedding representation.
    # While this can be a time-consuming operation (depending on corpus size), it only needs to be done once.
    # At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
    document_store.update_embeddings(retriever)

    ### Reader
    # Load a  local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)

    ### Pipeline
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
Example no. 26
class ManagerTester:
    """
    To run with a separate postgres database
    """
    # docker run  -p 5432:5432 -e POSTGRES_PASSWORD=haystack -d postgres
    # document_store = FAISSDocumentStore(
    #     faiss_index_factory_str="Flat",
    #     sql_url="postgresql://*****:*****@localhost:5432"
    # )

    """
    To run with an in memory sqlite database
    """
    document_store = FAISSDocumentStore(
        faiss_index_factory_str="Flat"
    )

    retreiver = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        # use_gpu=True,
        use_gpu=False,
        embed_title=True,
        use_fast_tokenizers=True
    )

    # Loads the test document into the document store
    def loadDocumentsFromFile(self, knowledgeFilePath):
        converter = TextConverter(
            remove_numeric_tables=False,
            valid_languages=["en"])
        processor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by="passage",
            split_length=1,
            split_respect_sentence_boundary=False,
            split_overlap=0
        )
        loadedFile = converter.convert(knowledgeFilePath)
        documents = processor.process(loadedFile)
        for i in range(0, len(documents)):
            docMetadata = documents[i]['meta']
            docMetadata['name'] = knowledgeFilePath
            docMetadata['documentID'] = knowledgeFilePath \
                + str(docMetadata['_split_id'])

        self.document_store.write_documents(documents)
        backagain = self.document_store.get_all_documents()

        # for i in range(0,len(backagain)):
        #     print(i)
        #     print(":\n")
        #     print(backagain[i])
        #     print("---------------")

        print("Number of documents loaded", end=": ")
        print(self.document_store.get_document_count())

    def __init__(self, knowledgeFilePath):
        print("Started DPR Training tester!")

        # load test document into database
        print("Loading documents from " + knowledgeFilePath)
        self.document_store.delete_all_documents()
        self.loadDocumentsFromFile(knowledgeFilePath)

        # update dpr embeddings based on initial retreiver
        print("Performing initial embeddings update")
        self.document_store.update_embeddings(self.retreiver)

        self.trainingManager = DPRTrainingManager(self.document_store, 0)

    # return document store's id for the response marked correct
    def get_correct_id(self, responses, correctNum):
        # correctRespons = responses[correctNum].to_dict()
        return responses[correctNum].id

    # return list of document store ids for alternative responses
    def get_incorrect_ids(self, responses, correctNum):
        ids = []
        for i in range(0, len(responses)):
            if i == correctNum:
                continue
            ids.append(responses[i].id)
        return ids

    def askQuestion(self):
        print("------------------------------")
        question = input("Enter new question (T to run training): ")

        if question == 'T':
            self.train()
            return

        k = 5
        responses = self.retreiver.retrieve(question, top_k=k)

        print()
        for i in range(0, k):
            print(i, end=": ")
            print(responses[i].text)
            print()

        print()
        correctNum = input("Select correct response (X if none correct): ")

        if correctNum == 'X':
            return

        print()
        print("------------------------------")

        self.trainingManager.addItem(
            question=question,
            posID=self.get_correct_id(responses, int(correctNum)),
            negIDs=self.get_incorrect_ids(responses, int(correctNum))
        )

    def train(self):

        newModel = self.trainingManager.train()

        self.retreiver = DensePassageRetriever.load(
            document_store=self.document_store,
            load_dir=newModel,
            max_seq_len_query=64,
            max_seq_len_passage=256,
            batch_size=16,
            # use_gpu=True,
            use_gpu=False,
            embed_title=True,
            use_fast_tokenizers=True
        )

        self.document_store.update_embeddings(self.retreiver)

    def loop(self):
        self.askQuestion()
Example no. 27
config = None
with open(configFile) as file:
  config = yaml.safe_load(file)

sqlUrlFAQ = config["sqlUrlFAQ"]


model = AdaptiveModel.convert_from_transformers(
    "deepset/sentence_bert", device="cpu", task_type="embeddings")
processor = Processor.convert_from_transformers(
    "deepset/sentence_bert", task_type="embeddings", max_seq_len=384, doc_stride=128)
model.save("sentence_bert-saved")
processor.save("sentence_bert-saved")


document_store = FAISSDocumentStore(sql_url=sqlUrlFAQ)


# from haystack.retriever.dense import DensePassageRetriever
# retriever = DensePassageRetriever(document_store=document_store,
#                                   query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
#                                   passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
#                                   max_seq_len_query=64,
#                                   max_seq_len_passage=256,
#                                   batch_size=16,
#                                   use_gpu=True,
#                                   embed_title=True,
#                                   use_fast_tokenizers=True)


# Get dataframe with columns "question", "answer" and some custom metadata
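
A sketch of how the saved model and the FAQ document store might be used next, in the spirit of the commented-out retriever above; the FAQ entries below are made-up placeholders and the real dataframe handling is elided:

from haystack.retriever.dense import EmbeddingRetriever

retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence_bert-saved",  # the model directory saved above
    use_gpu=False)

# hypothetical FAQ documents; the real data comes from the dataframe mentioned above
faq_docs = [{"text": "How do I reset my password?",
             "meta": {"answer": "Use the reset link in the login form."}}]
document_store.write_documents(faq_docs)
document_store.update_embeddings(retriever)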
Example no. 28
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.utils import print_answers
from haystack.retriever.dense import DensePassageRetriever


# FAISS is a library for efficient similarity search on a cluster of dense vectors.
# The FAISSDocumentStore uses a SQL (SQLite in-memory by default) document store under the hood
# to store the document text and other meta data. The vector embeddings of the text are
# indexed on a FAISS Index that later is queried for searching answers.
# The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for
# faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor.
# For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

# ## Preprocessing of documents
# Let's first get some documents that we want to query
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts containing documents that can be indexed to our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# Now, let's write the docs to our DB.
document_store.write_documents(dicts)

### Retriever
retriever = DensePassageRetriever(document_store=document_store,
from haystack import Finder
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.utils import print_answers
from haystack.retriever.dense import DensePassageRetriever

# FAISS is a library for efficient similarity search on a cluster of dense vectors.
# The FAISSDocumentStore uses a SQL (SQLite in-memory by default) document store under the hood
# to store the document text and other meta data. The vector embeddings of the text are
# indexed on a FAISS Index that later is queried for searching answers.
document_store = FAISSDocumentStore()

# ## Preprocessing of documents
# Let's first get some documents that we want to query
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts containing documents that can be indexed to our datastore
dicts = convert_files_to_dicts(dir_path=doc_dir,
                               clean_func=clean_wiki_text,
                               split_paragraphs=True)

# Now, let's write the docs to our DB.
document_store.write_documents(dicts)

### Retriever
retriever = DensePassageRetriever(
    document_store=document_store,