Code Example #1
File: haystack_api.py Project: Accuro-Lab/Data-ML
def feed_documents_to_model(model_name="deepset/roberta-base-squad2-covid"):
    """Feeds documents to model and returns a model ready to make predictions

    Parameters
    ----------
    model_name : str
        The name or path of the model to load from Hugging Face.
        By default, uses a RoBERTa model pre-trained on SQuAD 2.0
        and COVID articles.

    Returns
    -------
    finder
        the Finder object to use for predictions
    """

    # Initialize an in-memory Document Store
    document_store = InMemoryDocumentStore()
    # Load articles and format them as dictionaries
    articles = ret.get_data(MANIFEST, ARTICLES_FOLDER, [])
    dicts_textContent = process_documents(articles)
    # Store the dictionaries with article content in the Document Store
    document_store.write_documents(dicts_textContent)
    # The Retriever narrows the search down to a subset of relevant documents;
    # many techniques are possible. For dev purposes, TfidfRetriever is fast.
    retriever = TfidfRetriever(document_store=document_store)
    # The Reader provides an interface to pre-trained transformer models;
    # by default we use the RoBERTa model named above.
    reader = FARMReader(model_name_or_path=model_name, use_gpu=False)
    # The Finder combines the Reader and Retriever to produce predictions
    finder = Finder(reader, retriever)

    return finder
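
A minimal usage sketch for the returned Finder, using the get_answers API that appears in Code Example #16; the question string here is purely illustrative, not taken from the original project:

from haystack.utils import print_answers

finder = feed_documents_to_model()
# the question text is a hypothetical example
prediction = finder.get_answers(question="What are the symptoms of COVID-19?",
                                top_k_retriever=10, top_k_reader=5)
print_answers(prediction, details="minimal")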
Code Example #2
File: conftest.py Project: vivek22122014/haystack
def get_document_store(document_store_type, embedding_field="embedding"):
    if document_store_type == "sql":
        document_store = SQLDocumentStore(url="sqlite://", index="haystack_test")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore(
            return_embedding=True, embedding_field=embedding_field, index="haystack_test"
        )
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(
            index="haystack_test", return_embedding=True, embedding_field=embedding_field
        )
    elif document_store_type == "faiss":
        document_store = FAISSDocumentStore(
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    elif document_store_type == "milvus":
        document_store = MilvusDocumentStore(
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    else:
        raise Exception(f"No document store fixture for '{document_store_type}'")

    return document_store
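
A sketch of how a conftest helper like this is typically consumed; the fixture below is illustrative and assumes pytest, it is not taken from the project:

import pytest

@pytest.fixture(params=["sql", "memory", "elasticsearch", "faiss", "milvus"])
def document_store(request):
    # parametrize over the backends handled above so each test runs once per store
    return get_document_store(request.param)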
Code Example #3
def get_document_store(document_store_type, similarity='dot_product'):
    """ TODO This method is taken from test/conftest.py but maybe should be within Haystack.
    Perhaps a class method of DocStore that just takes string for type of DocStore"""
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
        assert document_store.get_document_count() == 0
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="eval_document",
                                                    similarity=similarity,
                                                    timeout=3000)
    elif document_store_type in ("milvus_flat", "milvus_hnsw"):
        if document_store_type == "milvus_flat":
            index_type = IndexType.FLAT
            index_param = None
            search_param = None
        elif document_store_type == "milvus_hnsw":
            index_type = IndexType.HNSW
            index_param = {"M": 64, "efConstruction": 80}
            search_param = {"ef": 20}
        document_store = MilvusDocumentStore(similarity=similarity,
                                             index_type=index_type,
                                             index_param=index_param,
                                             search_param=search_param)
        assert document_store.get_document_count(index="eval_document") == 0
    elif document_store_type in ("faiss_flat", "faiss_hnsw"):
        if document_store_type == "faiss_flat":
            index_type = "Flat"
        elif document_store_type == "faiss_hnsw":
            index_type = "HNSW"
        status = subprocess.run(['docker rm -f haystack-postgres'], shell=True)
        time.sleep(1)
        status = subprocess.run(
            ['docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres'],
            shell=True)
        time.sleep(6)
        status = subprocess.run(
            ['docker exec haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'],
            shell=True)
        time.sleep(1)
        document_store = FAISSDocumentStore(
            sql_url="postgresql://*****:*****@localhost:5432/haystack",
            faiss_index_factory_str=index_type,
            similarity=similarity)
        assert document_store.get_document_count() == 0

    else:
        raise Exception(
            f"No document store fixture for '{document_store_type}'")
    return document_store
Code Example #4
def test_tfidf_retriever():
    from haystack.retriever.sparse import TfidfRetriever

    test_docs = [
        {"id": "26f84672c6d7aaeb8e2cd53e9c62d62d", "name": "testing the finder 1", "text": "godzilla says hello"},
        {"name": "testing the finder 2", "text": "optimus prime says bye"},
        {"name": "testing the finder 3", "text": "alien says arghh"}
    ]

    from haystack.document_store.memory import InMemoryDocumentStore
    document_store = InMemoryDocumentStore()
    document_store.write_documents(test_docs)

    retriever = TfidfRetriever(document_store)
    retriever.fit()
    doc = retriever.retrieve("godzilla", top_k=1)[0]
    assert doc.id == "26f84672c6d7aaeb8e2cd53e9c62d62d"
    assert doc.text == 'godzilla says hello'
    assert doc.meta == {"name": "testing the finder 1"}
Code Example #5
def get_document_store(document_store_type,
                       embedding_dim=768,
                       embedding_field="embedding"):
    if document_store_type == "sql":
        document_store = SQLDocumentStore(url="sqlite://",
                                          index="haystack_test")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore(return_embedding=True,
                                               embedding_dim=embedding_dim,
                                               embedding_field=embedding_field,
                                               index="haystack_test")
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(
            index="haystack_test",
            return_embedding=True,
            embedding_dim=embedding_dim,
            embedding_field=embedding_field)
    elif document_store_type == "faiss":
        document_store = FAISSDocumentStore(
            vector_dim=embedding_dim,
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        return document_store
    elif document_store_type == "milvus":
        document_store = MilvusDocumentStore(
            vector_dim=embedding_dim,
            sql_url="sqlite://",
            return_embedding=True,
            embedding_field=embedding_field,
            index="haystack_test",
        )
        _, collections = document_store.milvus_server.list_collections()
        for collection in collections:
            if collection.startswith("haystack_test"):
                document_store.milvus_server.drop_collection(collection)
        return document_store
    elif document_store_type == "weaviate":
        document_store = WeaviateDocumentStore(
            weaviate_url="http://localhost:8080", index="Haystacktest")
        document_store.weaviate_client.schema.delete_all()
        document_store._create_schema_and_index_if_not_exist()
        return document_store
    else:
        raise Exception(
            f"No document store fixture for '{document_store_type}'")

    return document_store
Code Example #6
def get_document_store(document_store_type, es_similarity='cosine'):
    """ TODO This method is taken from test/conftest.py but maybe should be within Haystack.
    Perhaps a class method of DocStore that just takes string for type of DocStore"""
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="eval_document",
                                                    similarity=es_similarity)
    elif document_store_type in ("faiss_flat", "faiss_hnsw"):
        if document_store_type == "faiss_flat":
            index_type = "Flat"
        elif document_store_type == "faiss_hnsw":
            index_type = "HNSW"

        # TEMP FIX for issue with deleting docs
        # status = subprocess.run(
        #     ['docker rm -f haystack-postgres'],
        #     shell=True)
        # time.sleep(3)
        # try:
        #     document_store = FAISSDocumentStore(sql_url="postgresql://*****:*****@localhost:5432/haystack",
        #                                         faiss_index_factory_str=index_type)
        # except:
        # Launch a postgres instance & create empty DB
        # logger.info("Didn't find Postgres. Start a new instance...")
        status = subprocess.run(['docker rm -f haystack-postgres'], shell=True)
        time.sleep(1)
        status = subprocess.run(
            ['docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres'],
            shell=True)
        time.sleep(3)
        status = subprocess.run(
            ['docker exec -it haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'],
            shell=True)
        time.sleep(1)
        document_store = FAISSDocumentStore(
            sql_url="postgresql://*****:*****@localhost:5432/haystack",
            faiss_index_factory_str=index_type)

    else:
        raise Exception(
            f"No document store fixture for '{document_store_type}'")
    assert document_store.get_document_count() == 0
    return document_store
Code Example #7
File: conftest.py Project: tcapilla/haystack
def get_document_store(document_store_type):
    if document_store_type == "sql":
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="haystack_test")
    elif document_store_type == "faiss":
        if os.path.exists("haystack_test_faiss.db"):
            os.remove("haystack_test_faiss.db")
        document_store = FAISSDocumentStore(
            sql_url="sqlite:///haystack_test_faiss.db")
    else:
        raise Exception(
            f"No document store fixture for '{document_store_type}'")

    return document_store
Code Example #8
def read_corpus():
    document_store = InMemoryDocumentStore()
    doc_dir = "Quran"
    dicts = convert_files_to_dicts(dir_path=doc_dir, split_paragraphs=True)
    document_store.write_documents(dicts)
    return document_store
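
A hypothetical follow-up that pairs the returned store with a sparse retriever, mirroring the TfidfRetriever usage elsewhere on this page; the query string is illustrative:

document_store = read_corpus()
retriever = TfidfRetriever(document_store=document_store)
retriever.fit()  # (re)build the tf-idf matrix, as in Code Example #4
# retrieve the three most relevant paragraphs for an illustrative query
top_docs = retriever.retrieve("patience", top_k=3)
for doc in top_docs:
    print(doc.text[:100])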
Code Example #9
def tutorial9_dpr_training():
    # Training Your Own "Dense Passage Retrieval" Model

    # Here are some imports that we'll need

    from haystack.retriever.dense import DensePassageRetriever
    from haystack.preprocessor.utils import fetch_archive_from_http
    from haystack.document_store.memory import InMemoryDocumentStore

    # Download original DPR data
    # WARNING: the train set is 7.4GB and the dev set is 800MB

    doc_dir = "data/dpr_training/"

    s3_url_train = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-train.json.gz"
    s3_url_dev = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-dev.json.gz"

    fetch_archive_from_http(s3_url_train, output_dir=doc_dir + "train/")
    fetch_archive_from_http(s3_url_dev, output_dir=doc_dir + "dev/")

    ## Option 1: Training DPR from Scratch

    # Here are the variables to specify our training data, the models that we use to initialize DPR
    # and the directory where we'll be saving the model

    doc_dir = "data/dpr_training/"

    train_filename = "train/biencoder-nq-train.json"
    dev_filename = "dev/biencoder-nq-dev.json"

    query_model = "bert-base-uncased"
    passage_model = "bert-base-uncased"

    save_dir = "../saved_models/dpr"

    # ## Option 2: Finetuning DPR
    #
    # # Here are the variables you might want to use instead of the set above
    # # in order to perform fine-tuning
    #
    # doc_dir = "PATH_TO_YOUR_DATA_DIR"
    # train_filename = "TRAIN_FILENAME"
    # dev_filename = "DEV_FILENAME"
    #
    # query_model = "facebook/dpr-question_encoder-single-nq-base"
    # passage_model = "facebook/dpr-ctx_encoder-single-nq-base"
    #
    # save_dir = "../saved_models/dpr"

    ## Initialize DPR model

    retriever = DensePassageRetriever(
        document_store=InMemoryDocumentStore(),
        query_embedding_model=query_model,
        passage_embedding_model=passage_model,
        max_seq_len_query=64,
        max_seq_len_passage=256
    )

    # Start training our model and save it when it is finished

    retriever.train(
        data_dir=doc_dir,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=dev_filename,
        n_epochs=1,
        batch_size=4,
        grad_acc_steps=4,
        save_dir=save_dir,
        evaluate_every=3000,
        embed_title=True,
        num_positives=1,
        num_hard_negatives=1
    )

    ## Loading

    reloaded_retriever = DensePassageRetriever.load(load_dir=save_dir, document_store=None)
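
    A sketch of putting the reloaded retriever to work, assuming the update_embeddings method that document stores of this Haystack generation expose; the toy document is illustrative:

    document_store = InMemoryDocumentStore()
    document_store.write_documents([{"text": "Berlin is the capital of Germany."}])  # toy document

    reloaded_retriever = DensePassageRetriever.load(load_dir=save_dir, document_store=document_store)
    # assumes the document store's update_embeddings API; embeds all stored docs with the reloaded model
    document_store.update_embeddings(retriever=reloaded_retriever)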
Code Example #10
def tutorial3_basic_qa_pipeline_without_elasticsearch():
    # In-Memory Document Store
    document_store = InMemoryDocumentStore()

    # or, alternatively, SQLite Document Store
    # document_store = SQLDocumentStore(url="sqlite:///qa.db")

    # ## Preprocessing of documents
    #
    # Haystack provides a customizable pipeline for:
    # - converting files into texts
    # - cleaning texts
    # - splitting texts
    # - writing them to a Document Store

    # In this tutorial, we download Wikipedia articles on Game of Thrones, apply a basic cleaning function, and index
    # them in our document store.
    # Let's first get some documents that we want to query
    # Here: 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed into our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
    # It must take a str as input, and return a str.

    # Now, let's write the docs to our DB.
    document_store.write_documents(dicts)

    # ## Initialize Retriever, Reader & Pipeline
    #
    # ### Retriever
    #
    # Retrievers help narrow down the scope for the Reader to smaller units of text where
    # a given question could be answered.
    #
    # With InMemoryDocumentStore or SQLDocumentStore, you can use the TfidfRetriever. For more
    # retrievers, please refer to Tutorial 1.

    # An in-memory TfidfRetriever based on Pandas dataframes
    retriever = TfidfRetriever(document_store=document_store)

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. Readers are based
    # on powerful but slower deep learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and Transformers.
    # With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).

    # **Here:**                   a medium sized RoBERTa QA model using a Reader based on
    #                             FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):**  TransformersReader (leveraging the `pipeline` of the Transformers package)
    # **Alternatives (Models):**  e.g. "distilbert-base-uncased-distilled-squad" (fast) or
    #                             "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
    # **Hint:**                   You can adjust the model to return "no answer possible" with the no_ans_boost.
    #                             Higher values mean the model prefers "no answer possible".

    # #### FARMReader
    #
    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)

    # #### TransformersReader
    # Alternative:
    # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # ### Pipeline
    #
    # With a Haystack `Pipeline` you can stick your building blocks together into a search pipeline.
    # Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    # To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
    # You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
Code Example #11
def get_retriever_for_training(
        query_model="facebook/dpr-question_encoder-single-nq-base",
        passage_model="facebook/dpr-ctx_encoder-single-nq-base"):
    return get_retriever(InMemoryDocumentStore(),
                         query_model=query_model,
                         passage_model=passage_model)
Code Example #12
File: conftest.py Project: pranav0904/haystack
def inmemory_document_store():
    return InMemoryDocumentStore(return_embedding=True)
Code Example #13
# You can use an `InMemoryDocumentStore` or a `SQLDocumentStore` (with SQLite) as the document store.
#
# If you are interested in the more feature-rich Elasticsearch, please refer to Tutorial 1.

from haystack import Finder
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.document_store.sql import SQLDocumentStore
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.retriever.sparse import TfidfRetriever
from haystack.utils import print_answers

# In-Memory Document Store
document_store = InMemoryDocumentStore()

# or, alternatively, SQLite Document Store
# document_store = SQLDocumentStore(url="sqlite:///qa.db")

# ## Preprocessing of documents
#
# Haystack provides a customizable pipeline for:
# - converting files into texts
# - cleaning texts
# - splitting texts
# - writing them to a Document Store

# In this tutorial, we download Wikipedia articles on Game of Thrones, apply a basic cleaning function, and index
# them in our document store.
# Let's first get some documents that we want to query
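
The snippet is truncated at this point; based on the fuller version of the same tutorial in Code Example #10, it would presumably continue like this:

# Here: 517 Wikipedia articles for Game of Thrones
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# convert files to dicts and write them to the document store
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
document_store.write_documents(dicts)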
Code Example #14
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.retriever.sparse import TfidfRetriever
from haystack.reader.farm import FARMReader
from haystack.pipeline import ExtractiveQAPipeline
from collections import Counter
from wordcloud import WordCloud
import pandas as pd  # needed for pd.read_csv below

# load and transform data and put it in a document store
data = pd.read_csv('QA-WordClouds/Womens Clothing E-Commerce Reviews.csv')

# convert dataframe to docs
docs = [{"text": str(text)} for text in data['Review Text']]

print('done')

doc_store = InMemoryDocumentStore()
doc_store.write_documents(docs)
# get haystack pipe with reader and retriever
# get retriever
retriever = TfidfRetriever(document_store=doc_store)
# model for question answering:
model_name = 'distilbert-base-cased-distilled-squad'
reader = FARMReader(model_name_or_path=model_name,
                    progress_bar=False,
                    return_no_answer=False)

# finally the pipe
pipe = ExtractiveQAPipeline(reader, retriever)

# ask questions and get results from the pipe
question = 'How are the colors?'
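
The excerpt ends before the question is actually submitted; a plausible continuation, following the ExtractiveQAPipeline usage in Code Example #10 (the top_k values are illustrative):

from haystack.utils import print_answers

prediction = pipe.run(query=question, top_k_retriever=10, top_k_reader=5)
print_answers(prediction, details="minimal")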
Code Example #15
File: all_section.py Project: shira07tech01/haystack
from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.tokenizer import tokenizer
from haystack.utils import print_answers
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.retriever.sparse import TfidfRetriever

print("===============DocumentStore=================")
document_store_tfidf = InMemoryDocumentStore()
doc_dir_ja = "data/article_txt_got_ja_0"
dicts_ja = convert_files_to_dicts(dir_path=doc_dir_ja,
                                  clean_func=clean_wiki_text,
                                  split_paragraphs=True)
print(dicts_ja[0:3])
document_store_tfidf.write_documents(dicts_ja)

print("===============Retriever&Reader================")
retriever_tfidf = TfidfRetriever(document_store=document_store_tfidf)
reader_farm = FARMReader(model_name_or_path="cl-tohoku/bert-base-japanese",
                         use_gpu=True)
finder_tfidf_farm = Finder(reader_farm, retriever_tfidf)

print("===================question========================")
question = "脚本家は誰?"  # Japanese for "Who is the screenwriter?"
tokenization = tokenizer.FullTokenizer(
    "./model_sentence_piece/vocab.txt",
    model_file="./model_sentence_piece/wiki-ja.model",
    do_lower_case=True)
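
A sketch of how the question might then be put to the Finder, following the get_answers call in Code Example #16 (the top_k values are illustrative):

prediction = finder_tfidf_farm.get_answers(question=question,
                                           top_k_retriever=10,
                                           top_k_reader=5)
print_answers(prediction, details="minimal")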
Code Example #16
retriever = ElasticsearchRetriever(document_store=document_store)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)

finder = Finder(reader, retriever)

prediction = finder.get_answers(question="Who is the education minister", top_k_retriever=10, top_k_reader=5)

print_answers(prediction, details="minimal")

"""Question answering without Elastic search"""

from haystack.document_store.memory import InMemoryDocumentStore
document_store = InMemoryDocumentStore()

doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)


print(dicts[:3])
document_store.write_documents(dicts)

from haystack.retriever.sparse import TfidfRetriever
retriever = TfidfRetriever(document_store=document_store)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
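
The listing stops here; mirroring the Elasticsearch branch at the top of this example, the in-memory variant would presumably finish the same way:

finder = Finder(reader, retriever)

prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
print_answers(prediction, details="minimal")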