Esempio n. 1
0
def test_tfidf_retriever():
    """TfidfRetriever should return the godzilla document as its single top hit.

    Three small documents are written to an in-memory store, the retriever
    is fitted, and the result for the query "godzilla" is compared against
    the expected ``Document``.
    """
    from haystack.retriever.sparse import TfidfRetriever
    from haystack.database.memory import InMemoryDocumentStore

    corpus = [
        {"name": "testing the finder 1", "text": "godzilla says hello"},
        {"name": "testing the finder 2", "text": "optimus prime says bye"},
        {"name": "testing the finder 3", "text": "alien says arghh"},
    ]

    store = InMemoryDocumentStore()
    store.write_documents(corpus)

    tfidf = TfidfRetriever(store)
    tfidf.fit()

    hits = tfidf.retrieve("godzilla", top_k=1)
    expected = [
        Document(
            id="0",
            text="godzilla says hello",
            external_source_id=None,
            question=None,
            query_score=None,
            meta={},
        )
    ]
    assert hits == expected
Esempio n. 2
0
def test_tfidf_retriever():
    """TfidfRetriever should return the godzilla document with id, text and meta intact.

    The first document is written with an explicit id; the test checks that
    the top hit for "godzilla" carries that id, the original text, and the
    ``name`` field inside ``meta``.
    """
    from haystack.retriever.sparse import TfidfRetriever
    from haystack.database.memory import InMemoryDocumentStore

    godzilla_id = "26f84672c6d7aaeb8e2cd53e9c62d62d"
    corpus = [
        {
            "id": "26f84672c6d7aaeb8e2cd53e9c62d62d",
            "name": "testing the finder 1",
            "text": "godzilla says hello",
        },
        {"name": "testing the finder 2", "text": "optimus prime says bye"},
        {"name": "testing the finder 3", "text": "alien says arghh"},
    ]

    store = InMemoryDocumentStore()
    store.write_documents(corpus)

    tfidf = TfidfRetriever(store)
    tfidf.fit()

    top_hit = tfidf.retrieve("godzilla", top_k=1)[0]
    assert top_hit.id == godzilla_id
    assert top_hit.text == "godzilla says hello"
    assert top_hit.meta == {"name": "testing the finder 1"}
Esempio n. 3
0
def feed_documents_to_model(model_name="deepset/roberta-base-squad2-covid"):
    """Load the articles into a document store and return a Finder over them.

    Parameters
    ----------
    model_name : str
        Passed to ``FARMReader`` as ``model_name_or_path``.
        Defaults to ``"deepset/roberta-base-squad2-covid"``.

    Returns
    -------
    finder
        A ``Finder`` built from a ``FARMReader`` (CPU only) and a
        ``TfidfRetriever`` on an in-memory document store.
    """
    store = InMemoryDocumentStore()

    # Fetch the articles and turn them into the dict format the store accepts.
    raw_articles = ret.get_data(MANIFEST, ARTICLES_FOLDER, [])
    article_dicts = process_documents(raw_articles)
    store.write_documents(article_dicts)

    # The retriever narrows the corpus down to candidate documents;
    # TF-IDF was chosen by the original author as the fast option for development.
    tfidf = TfidfRetriever(document_store=store)

    # The reader wraps the pre-trained transformer named by ``model_name``.
    farm_reader = FARMReader(model_name_or_path=model_name, use_gpu=False)

    return Finder(farm_reader, tfidf)
Esempio n. 4
0
def get_retriever(retriever_type, document_store):
    """Build the retriever selected by ``retriever_type`` on ``document_store``.

    Recognised values are ``"dpr"``, ``"tfidf"``, ``"embedding"``,
    ``"elasticsearch"`` and ``"es_filter_only"``. The TF-IDF retriever is
    fitted before it is returned.

    Raises:
        Exception: if ``retriever_type`` is none of the recognised values.
    """
    if retriever_type == "dpr":
        return DensePassageRetriever(
            document_store=document_store,
            query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            use_gpu=False,
            embed_title=True,
        )

    if retriever_type == "tfidf":
        tfidf = TfidfRetriever(document_store=document_store)
        tfidf.fit()
        return tfidf

    if retriever_type == "embedding":
        return EmbeddingRetriever(
            document_store=document_store,
            embedding_model="deepset/sentence_bert",
            use_gpu=False,
        )

    if retriever_type == "elasticsearch":
        return ElasticsearchRetriever(document_store=document_store)

    if retriever_type == "es_filter_only":
        return ElasticsearchFilterOnlyRetriever(document_store=document_store)

    raise Exception(f"No retriever fixture for '{retriever_type}'")
Esempio n. 5
0
def test_finder_get_answers():
    """Finder built on a SQLite store, TF-IDF retriever and Transformers reader returns a prediction."""
    corpus = [
        {
            "name": "testing the finder 1",
            "text": "testing the finder with pyhton unit test 1",
            "meta": {"test": "test"},
        },
        {
            "name": "testing the finder 2",
            "text": "testing the finder with pyhton unit test 2",
            "meta": {"test": "test"},
        },
        {
            "name": "testing the finder 3",
            "text": "testing the finder with pyhton unit test 3",
            "meta": {"test": "test"},
        },
    ]

    store = SQLDocumentStore(url="sqlite:///qa_test.db")
    store.write_documents(corpus)

    tfidf = TfidfRetriever(document_store=store)
    qa_reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1,
    )

    result = Finder(qa_reader, tfidf).get_answers(
        question="testing finder",
        top_k_retriever=10,
        top_k_reader=5,
    )
    assert result is not None
Esempio n. 6
0
    def load(self):
        """Create whichever stores, retrievers, reader and finders are still unset.

        Returns immediately when both ``self.finder`` and ``self.finder2`` are
        already set. Otherwise each attribute is built only if it is falsy:
        the ``*2`` attributes use a FAISS store with an embedding retriever
        and a reader-less finder; the others use a SQL store with a TF-IDF
        retriever and a FARM reader.
        """
        if self.finder and self.finder2:
            return

        # --- second pipeline: FAISS store + embedding retriever, no reader ---
        if not self.document_store2:
            # Original note: "save before load in preprocess".
            # NOTE(review): presumably the 'faiss2' index file must already exist - confirm.
            self.document_store2 = FAISSDocumentStore.load(
                sql_url=sqlUrlFAQ, faiss_file_path='faiss2')
            self.initSql(url=sqlUrlFAQ, document_store=self.document_store2)
        if not self.retriever2:
            self.retriever2 = EmbeddingRetriever(
                document_store=self.document_store2,
                embedding_model="sentence_bert-saved",
                use_gpu=False)
        if not self.finder2:
            self.finder2 = Finder(reader=None, retriever=self.retriever2)

        # --- first pipeline: SQL store + TF-IDF retriever + FARM reader ---
        if not self.document_store:
            # A FAISS store loaded from 'faiss1' was tried here previously.
            self.document_store = SQLDocumentStore(url=sqlUrl)
            self.initSql(url=sqlUrl, document_store=self.document_store)
        if not self.retriever:
            # An EmbeddingRetriever sharing "sentence_bert-saved" was considered
            # here earlier; TF-IDF is what is actually used.
            self.retriever = TfidfRetriever(document_store=self.document_store)
        if not self.reader:
            self.reader = FARMReader(model_name_or_path=modelDir,
                                     use_gpu=False, no_ans_boost=0)
        if not self.finder:
            self.finder = Finder(self.reader, self.retriever)
Esempio n. 7
0
def test_finder_get_answers_with_in_memory_store():
    """Finder built on an in-memory store, TF-IDF retriever and Transformers reader returns a prediction."""
    from haystack.database.memory import InMemoryDocumentStore

    corpus = [
        {
            "text": "testing the finder with pyhton unit test 1",
            "meta": {"name": "testing the finder 1", "url": "url"},
        },
        {
            "text": "testing the finder with pyhton unit test 2",
            "meta": {"name": "testing the finder 2", "url": "url"},
        },
        {
            "text": "testing the finder with pyhton unit test 3",
            "meta": {"name": "testing the finder 3", "url": "url"},
        },
    ]

    store = InMemoryDocumentStore()
    store.write_documents(corpus)

    tfidf = TfidfRetriever(document_store=store)
    qa_reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1,
    )

    result = Finder(qa_reader, tfidf).get_answers(
        question="testing finder",
        top_k_retriever=10,
        top_k_reader=5,
    )
    assert result is not None
Esempio n. 8
0
def test_finder_get_answers_single_result(reader, document_store_with_docs):
    """With ``top_k_retriever=1`` and ``top_k_reader=1`` exactly one answer comes back."""
    query = "testing finder"
    tfidf = TfidfRetriever(document_store=document_store_with_docs)
    result = Finder(reader, tfidf).get_answers(
        question=query,
        top_k_retriever=1,
        top_k_reader=1,
    )
    assert result is not None
    answers = result["answers"]
    assert len(answers) == 1
Esempio n. 9
0
def get_retriever(retriever_name, doc_store):
    """Return the retriever selected by ``retriever_name`` on top of ``doc_store``.

    Recognised names are ``"elastic"``, ``"tfidf"`` and ``"dpr"``. Any other
    name yields ``None``.
    """
    if retriever_name == "elastic":
        chosen = ElasticsearchRetriever(doc_store)
    elif retriever_name == "tfidf":
        chosen = TfidfRetriever(doc_store)
    elif retriever_name == "dpr":
        chosen = DensePassageRetriever(
            document_store=doc_store,
            query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            use_gpu=True,
        )
    else:
        # Unknown names are not an error here; the caller receives None.
        chosen = None
    return chosen
Esempio n. 10
0
def test_finder_offsets(reader, document_store_with_docs):
    """The top answer's offsets are 11 and 16 and slice the answer text out of its context."""
    tfidf = TfidfRetriever(document_store=document_store_with_docs)
    result = Finder(reader, tfidf).get_answers(
        question="Who lives in Berlin?",
        top_k_retriever=10,
        top_k_reader=5,
    )

    top_answer = result["answers"][0]
    assert top_answer["offset_start"] == 11
    assert top_answer["offset_end"] == 16

    begin, stop = top_answer["offset_start"], top_answer["offset_end"]
    assert top_answer["context"][begin:stop] == top_answer["answer"]
Esempio n. 11
0
def test_finder_offsets(reader, document_store_with_docs):
    """The top answer for "Who lives in Berlin?" starts at offset 11."""
    tfidf = TfidfRetriever(document_store=document_store_with_docs)
    result = Finder(reader, tfidf).get_answers(
        question="Who lives in Berlin?",
        top_k_retriever=10,
        top_k_reader=5,
    )

    top_answer = result["answers"][0]
    assert top_answer["offset_start"] == 11
    # TODO enable again when FARM is upgraded incl. the new offset calc
    #    assert top_answer["offset_end"] == 16

    # Both offsets are still read, so a missing key fails the test.
    begin = top_answer["offset_start"]
    stop = top_answer["offset_end"]
Esempio n. 12
0
def test_finder_get_answers(reader, document_store_with_docs):
    """Finder returns three answers for the Berlin question, with "Carla" on top."""
    question = "Who lives in Berlin?"
    tfidf = TfidfRetriever(document_store=document_store_with_docs)
    result = Finder(reader, tfidf).get_answers(
        question=question,
        top_k_retriever=10,
        top_k_reader=3,
    )

    assert result is not None
    assert result["question"] == "Who lives in Berlin?"

    top_answer = result["answers"][0]
    assert top_answer["answer"] == "Carla"
    assert top_answer["probability"] <= 1
    assert top_answer["probability"] >= 0
    assert top_answer["meta"]["meta_field"] == "test1"
    assert top_answer["context"] == "My name is Carla and I live in Berlin"

    assert len(result["answers"]) == 3
Esempio n. 13
0
def get_retriever(retriever_name, doc_store):
    """Return the retriever selected by ``retriever_name`` on top of ``doc_store``.

    Recognised names are ``"elastic"``, ``"tfidf"``, ``"dpr"`` and
    ``"sentence_transformers"``. Any other name yields ``None``.
    """
    if retriever_name == "elastic":
        chosen = ElasticsearchRetriever(doc_store)
    elif retriever_name == "tfidf":
        chosen = TfidfRetriever(doc_store)
    elif retriever_name == "dpr":
        chosen = DensePassageRetriever(
            document_store=doc_store,
            query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            use_gpu=True,
            use_fast_tokenizers=False,
        )
    elif retriever_name == "sentence_transformers":
        chosen = EmbeddingRetriever(
            document_store=doc_store,
            embedding_model="nq-distilbert-base-v1",
            use_gpu=True,
            model_format="sentence_transformers",
        )
    else:
        # Unknown names are not an error here; the caller receives None.
        chosen = None
    return chosen
Esempio n. 14
0
def retriever():
    """Return a TfidfRetriever over the document store produced by ``read_corpus()``."""
    return TfidfRetriever(document_store=read_corpus())
Esempio n. 15
0
def tutorial3_basic_qa_pipeline_without_elasticsearch():
    """Run an extractive QA pipeline over Game of Thrones articles without Elasticsearch.

    Steps: download the article archive, convert the files to dicts, write
    them to an ``InMemoryDocumentStore``, combine a ``TfidfRetriever`` and a
    ``FARMReader`` in an ``ExtractiveQAPipeline``, ask one question and print
    the answers.
    """
    # ## Document store
    #
    # Documents are kept in memory. A SQLite-backed store is the alternative:
    #   document_store = SQLDocumentStore(url="sqlite:///qa.db")
    store = InMemoryDocumentStore()

    # ## Preprocessing of documents
    #
    # Haystack provides a customizable pipeline for converting files into
    # texts, cleaning them, splitting them, and writing them to a Document
    # Store. Here we download Wikipedia articles on Game of Thrones
    # (described upstream as 517 articles), apply a basic cleaning function,
    # and write them to the document store created above.
    corpus_dir = "data/article_txt_got"
    archive_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=archive_url, output_dir=corpus_dir)

    # Convert the files to dicts that can be written to the store.
    # ``clean_func`` is optional; it is applied to each doc (e.g. to remove
    # footers) and must take a str and return a str.
    docs = convert_files_to_dicts(
        dir_path=corpus_dir,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )
    store.write_documents(docs)

    # ## Retriever
    #
    # Retrievers narrow the scope for the Reader down to smaller units of
    # text where a given question could be answered. With
    # InMemoryDocumentStore or SQLDocumentStore you can use the
    # TfidfRetriever (described upstream as an in-memory retriever based on
    # Pandas dataframes). For more retrievers, see tutorial 1.
    tfidf = TfidfRetriever(document_store=store)

    # ## Reader
    #
    # A Reader scans the texts returned by the retriever in detail and
    # extracts the k best answers. Readers are based on powerful but slower
    # deep learning models. Haystack supports Readers based on FARM and on
    # Transformers; both can load a local model or one from Hugging Face's
    # model hub (https://huggingface.co/models).
    #
    # Here:          a medium sized RoBERTa QA model via FARMReader
    #                (https://huggingface.co/deepset/roberta-base-squad2)
    # Other readers: TransformersReader (uses the Transformers `pipeline`), e.g.
    #                TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad",
    #                                   tokenizer="distilbert-base-uncased", use_gpu=-1)
    # Other models:  "distilbert-base-uncased-distilled-squad" (fast) or
    #                "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
    # Hint:          no_ans_boost tunes how readily the model returns
    #                "no answer possible"; higher values favour it.
    farm_reader = FARMReader(
        model_name_or_path="deepset/roberta-base-squad2",
        use_gpu=True,
    )

    # ## Pipeline
    #
    # A Haystack `Pipeline` sticks the building blocks together into a search
    # pipeline. Under the hood, Pipelines are Directed Acyclic Graphs (DAGs)
    # that can be customized. `ExtractiveQAPipeline` is a predefined one that
    # combines a retriever and a reader.
    # See https://haystack.deepset.ai/docs/latest/pipelinesmd for more.
    from haystack.pipeline import ExtractiveQAPipeline
    qa_pipeline = ExtractiveQAPipeline(farm_reader, tfidf)

    # Ask a question. Other questions to try:
    #   "Who created the Dothraki vocabulary?"
    #   "Who is the sister of Sansa?"
    result = qa_pipeline.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10,
        top_k_reader=5,
    )

    print_answers(result, details="minimal")
Esempio n. 16
0
    with open('data.json') as f:
        data = json.load(f)
    for article in data:
        filename = article['url'].split('/')[-1]
        with open('articles/%s.txt' % filename, 'w') as f:
            print(article['text'], file=f)
    

# Convert files to dicts
# A cleaning function can optionally be supplied to convert_files_to_dicts; it is
# applied to each doc (e.g. to remove footers) and must take a str and return a str.
# NOTE(review): the call in write_storage() below does not pass one.

# Directory read by write_storage(); the article files above are written under 'articles/'.
doc_dir = 'articles'

def write_storage():
    """Convert the files under ``doc_dir`` to dicts (split by paragraph) and write them to ``document_store``."""
    document_store.write_documents(
        convert_files_to_dicts(dir_path=doc_dir, split_paragraphs=True)
    )

# Write the documents at module load, before the retriever is constructed.
write_storage()

from haystack.retriever.sparse import TfidfRetriever  # ElasticsearchRetriever is the commented-out alternative
# retriever = ElasticsearchRetriever(document_store=document_store)
retriever = TfidfRetriever(document_store=document_store)

# Reader runs on CPU (use_gpu=False) with the deepset/roberta-base-squad2 model.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)

# Module-level finder used by get_prediction() below.
finder = Finder(reader, retriever)

def get_prediction(question, num_retreive, num_read):
    """Ask the module-level ``finder`` a question.

    ``num_retreive`` is forwarded as ``top_k_retriever`` and ``num_read`` as
    ``top_k_reader``; the result of ``finder.get_answers`` is returned as-is.
    """
    answers = finder.get_answers(
        question=question,
        top_k_retriever=num_retreive,
        top_k_reader=num_read,
    )
    return answers
Esempio n. 17
0
def tfidf_retriever(inmemory_document_store):
    """Wrap ``inmemory_document_store`` in a TfidfRetriever and return it."""
    retriever = TfidfRetriever(document_store=inmemory_document_store)
    return retriever
Esempio n. 18
0
from haystack.tokenizer import tokenizer
from haystack.utils import print_answers
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.retriever.sparse import TfidfRetriever

# Script: build a TF-IDF + FARM finder over the documents in doc_dir_ja and ask one question.
# NOTE(review): convert_files_to_dicts, clean_wiki_text, FARMReader and Finder are not
# imported in the visible part of this file - confirm they are imported elsewhere.

print("===============DocumentStore=================")
document_store_tfidf = InMemoryDocumentStore()
doc_dir_ja = "data/article_txt_got_ja_0"
# Convert the files to dicts, cleaning each with clean_wiki_text and splitting into paragraphs.
dicts_ja = convert_files_to_dicts(dir_path=doc_dir_ja,
                                  clean_func=clean_wiki_text,
                                  split_paragraphs=True)
# Show the first three converted documents as a sanity check.
print(dicts_ja[0:3])
document_store_tfidf.write_documents(dicts_ja)

print("===============Retriever&Reader================")
retriever_tfidf = TfidfRetriever(document_store=document_store_tfidf)
reader_farm = FARMReader(model_name_or_path="cl-tohoku/bert-base-japanese",
                         use_gpu=True)
finder_tfidf_farm = Finder(reader_farm, retriever_tfidf)

print("===================question========================")
question = "脚本家は誰?"
# Tokenizer loaded from the local ./model_sentence_piece vocab and model files.
# NOTE(review): presumably a SentencePiece-based tokenizer - confirm.
tokenization = tokenizer.FullTokenizer(
    "./model_sentence_piece/vocab.txt",
    model_file="./model_sentence_piece/wiki-ja.model",
    do_lower_case=True)
# The question is passed through space_separation() before it is given to the finder.
question_tokenize_bySM = tokenization.space_separation(question)
prediction = finder_tfidf_farm.get_answers(question=question_tokenize_bySM,
                                           top_k_retriever=10,
                                           top_k_reader=2)