Example #1
    def launch_elasticsearch(self, launch: bool = False, name: str = "hera"):
        if launch:
            logging.info("Starting Elasticsearch ...")
            status = subprocess.run([
                f'docker run -d -p 9200:9200 --name "{name}" -e "discovery.type=single-node" elasticsearch:7.6.2'
            ],
                                    shell=True)
            time.sleep(30)
        else:
            logging.info("Starting Elasticsearch ...")
            try:
                status = subprocess.run([f'docker stop {name}'], shell=True)
            except Exception:
                raise RuntimeError("No running containers")
            finally:
                status = subprocess.run([f'docker start {name}'], shell=True)
                time.sleep(30)

        index = "document"
        document_store = ElasticsearchDocumentStore(host="localhost",
                                                    username="",
                                                    password="",
                                                    index=index)

        dicts = convert_files_to_dicts(dir_path=self.data_path,
                                       clean_func=self.clean_website_text,
                                       split_paragraphs=True)
        try:
            document_store.delete_all_documents(index=index)
        except:
            pass
        finally:
            document_store.write_documents(dicts)
        return status
Example #2
def pre_process(user_id_key):
    uploads_dir = "users/" + str(user_id_key) + "/uploads/"
    try:
        es_result = es.search(
            index="user_" + str(user_id_key), body={"query": {"match_all": {}}}
        )
        no_docs = len(es_result["hits"]["hits"])

    except Exception as e:
        print(e)
        print("\n no documents in es")
        no_docs = 0  # nothing indexed yet, so there is nothing to compare against

    processed = convert_files_to_dicts(
        dir_path=uploads_dir, clean_func=clean_wiki_text, split_paragraphs=True
    )

    for doc in range(len(processed)):
        try:
            # print("\n Checking for duplicate docs ..")
            add_doc = True
            for each_doc in range(no_docs):
                doc_text = es_result["hits"]["hits"][each_doc]["_source"]["text"]
                doc_name = es_result["hits"]["hits"][each_doc]["_source"]["name"]
                doc_id = es_result["hits"]["hits"][each_doc]["_id"]
                if (
                    processed[doc]["meta"]["name"] == "context_file.txt"
                    and doc_name == "context_file.txt"
                ):
                    # print("Deleting context file to update with new changes ..")
                    es.delete(
                        index="user_" + str(user_id_key), doc_type="_doc", id=doc_id
                    )

                if processed[doc]["text"] == doc_text:
                    # print("\n There is a duplicate, So this document is not added ..")
                    add_doc = False
                    os.remove(uploads_dir + str(processed[doc]["meta"]["name"]))
                    break
            if add_doc:
                # print("\n No duplicates found, so adding this to es..")
                processed_lst = [processed[doc]]
                user_doc_store[user_id_key].write_documents(processed_lst)
                os.remove(uploads_dir + str(processed[doc]["meta"]["name"]))

        except Exception as e:
            print(e)
            # print("\n no documents in es")
            processed_lst = [processed[doc]]
            user_doc_store[user_id_key].write_documents(processed_lst)
            os.remove(uploads_dir + str(processed[doc]["meta"]["name"]))
Example #3
def Find_answer(text_file_path, data_folder_path, symbol, question):
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

    with open(text_file_path, 'r', encoding='utf-8') as f:
        data = f.read()
    for i, line in enumerate(data.split(symbol)):
        with open(f'{data_folder_path}/data{i+1}.txt', 'w', encoding='utf-8') as f:
            print(f'writing file no.{i+1}')
            f.write(line)

    test_dicts = convert_files_to_dicts(dir_path=data_folder_path,
                                        clean_func=clean_wiki_text,
                                        split_paragraphs=True)
    document_store.write_documents(test_dicts)
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True)

    document_store.update_embeddings(retriever)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True,
                        context_window_size=300)

    pipe = ExtractiveQAPipeline(reader, retriever)

    prediction = pipe.run(query=question, top_k_retriever=10, top_k_reader=3)

    doc_with_ans = []
    for i in range(len(prediction['answers'])):
        if prediction['answers'][i]['context'] not in doc_with_ans:
            doc_with_ans.append(prediction['answers'][i]['context'])

    answer = ' '.join(doc_with_ans)

    return answer
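
# Hedged usage sketch (the file path, folder, separator symbol and question below
# are illustrative assumptions, not part of the original snippet):
# answer = Find_answer(text_file_path="corpus.txt",
#                      data_folder_path="data_chunks",
#                      symbol="###",
#                      question="What is the document about?")
# print(answer)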
Example #4
def update_document():
    """Return a the url of the index document."""
    if request.files:
        # index is the target index to which queries will be sent.
        index = request.form['index']
        # uploaded document for target source
        doc = request.files["doc"]

        file_path = os.path.join(app.config["input"], doc.filename)

        # saving the file to the input directory
        doc.save(file_path)
        #initialization of the Haystack Elasticsearch document storage
        document_store = ElasticsearchDocumentStore(
            host=app.config["host"],
            port=app.config["port"],
            username=app.config["username"],
            password=app.config["password"],
            index=index)
        # convert the uploaded files into dicts and write them to the Elasticsearch document store
        dicts = convert_files_to_dicts(app.config["input"],
                                       clean_func=clean_wiki_text,
                                       split_paragraphs=False)
        document_store.write_documents(dicts)
        os.remove(file_path)
        return json.dumps({
            'status':
            'Success',
            'message':
            'document available at http://' + app.config["host"] + ':' +
            app.config["port"] + '/' + index + '/_search',
            'result': []
        })
    else:
        return json.dumps({
            'status': 'Failed',
            'message': 'No file uploaded',
            'result': []
        })
Example #5
def tutorial3_basic_qa_pipeline_without_elasticsearch():
    # In-Memory Document Store
    document_store = InMemoryDocumentStore()

    # or, alternatively, SQLite Document Store
    # document_store = SQLDocumentStore(url="sqlite:///qa.db")

    # ## Preprocessing of documents
    #
    # Haystack provides a customizable pipeline for:
    # - converting files into texts
    # - cleaning texts
    # - splitting texts
    # - writing them to a Document Store

    # In this tutorial, we download Wikipedia articles on Game of Thrones, apply a basic cleaning function, and index
    # them in our document store.
    # Let's first get some documents that we want to query
    # Here: 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
    # It must take a str as input, and return a str.
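    # Minimal sketch of such a cleaning function (the footer string below is an
    # illustrative assumption, not part of the original tutorial):
    # def remove_footer(text: str) -> str:
    #     return text.replace("All rights reserved.", "").strip()
    #
    # dicts = convert_files_to_dicts(dir_path=doc_dir,
    #                                clean_func=remove_footer,
    #                                split_paragraphs=True)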

    # Now, let's write the docs to our DB.
    document_store.write_documents(dicts)

    # ## Initialize Retriever, Reader & Finder
    #
    # ### Retriever
    #
    # Retrievers help narrow down the scope for the Reader to smaller units of text where
    # a given question could be answered.
    #
    # With InMemoryDocumentStore or SQLDocumentStore, you can use the TfidfRetriever. For more
    # retrievers, please refer to Tutorial 1.

    # An in-memory TfidfRetriever based on Pandas dataframes
    retriever = TfidfRetriever(document_store=document_store)

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based
    # on powerful, but slower deep learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and Transformers.
    # With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).

    # **Here:**                   a medium sized RoBERTa QA model using a Reader based on
    #                             FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):**  TransformersReader (leveraging the `pipeline` of the Transformers package)
    # **Alternatives (Models):**  e.g. "distilbert-base-uncased-distilled-squad" (fast) or
    #                             "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
    # **Hint:**                   You can adjust the model to return "no answer possible" with the no_ans_boost.
    #                             Higher values mean the model prefers "no answer possible".

    # #### FARMReader
    #
    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)
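
    # Sketch of the no_ans_boost hint above (the value 0.5 is an illustrative assumption):
    # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
    #                     use_gpu=True, no_ans_boost=0.5)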

    # #### TransformersReader
    # Alternative:
    # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # ### Pipeline
    #
    # With a Haystack `Pipeline` you can stick your building blocks together into a search pipeline.
    # Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    # To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
    # You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
Example #6
def test_convert_files_to_dicts(xpdf_fixture):
    documents = utils.convert_files_to_dicts(dir_path="samples",
                                             clean_func=clean_wiki_text,
                                             split_paragraphs=True)
    assert documents and len(documents) > 0
Example #7
def tutorial1_basic_qa_pipeline():
    logger = logging.getLogger(__name__)

    LAUNCH_ELASTICSEARCH = True

    # ## Document Store
    #
    # Haystack finds answers to queries within the documents stored in a `DocumentStore`. The current implementations of
    # `DocumentStore` include `ElasticsearchDocumentStore`, `FAISSDocumentStore`, `SQLDocumentStore`, and `InMemoryDocumentStore`.
    #
    # **Here:** We recommend Elasticsearch as it comes preloaded with features like full-text queries, BM25 retrieval,
    # and vector storage for text embeddings.
    # **Alternatives:** If you are unable to set up an Elasticsearch instance, then follow Tutorial 3
    # for using SQL/InMemory document stores.
    # **Hint**:
    # This tutorial creates a new document store instance with Wikipedia articles on Game of Thrones. However, you can
    # configure Haystack to work with your existing document stores.
    #
    # Start an Elasticsearch server
    # You can start Elasticsearch on your local machine using Docker. If Docker is not readily available in
    # your environment (e.g., in Colab notebooks), then you can manually download and run Elasticsearch from source.

    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.run([
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'
        ],
                                shell=True)
        if status.returncode:
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # Connect to Elasticsearch
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")

    # ## Preprocessing of documents
    #
    # Haystack provides a customizable pipeline for:
    # - converting files into texts
    # - cleaning texts
    # - splitting texts
    # - writing them to a Document Store

    # In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and index
    # them in Elasticsearch.

    # Let's first fetch some documents that we want to query
    # Here: 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
    # It must take a str as input, and return a str.

    # Now, let's write the docs to our DB.
    if LAUNCH_ELASTICSEARCH:
        document_store.write_documents(dicts)
    else:
        logger.warning(
            "Since we already have a running ES instance we should not index the same documents again. \n"
            "If you still want to do this call: document_store.write_documents(dicts) manually "
        )

    # ## Initialize Retriever, Reader & Finder
    #
    # ### Retriever
    #
    # Retrievers help narrow down the scope for the Reader to smaller units of text where a given question
    # could be answered.
    #
    # They use simple but fast algorithms.
    # **Here:** We use Elasticsearch's default BM25 algorithm
    # **Alternatives:**
    # - Customize the `ElasticsearchRetriever` with custom queries (e.g. boosting) and filters
    # - Use `EmbeddingRetriever` to find candidate documents based on the similarity of
    #   embeddings (e.g. created via Sentence-BERT); a sketch follows below
    # - Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging

    retriever = ElasticsearchRetriever(document_store=document_store)

    # Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick prototypes
    # with a SQLite document store.
    #
    # from haystack.retriever.tfidf import TfidfRetriever
    # retriever = TfidfRetriever(document_store=document_store)
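
    # Alternative sketch for the `EmbeddingRetriever` mentioned above (the embedding
    # model name is an assumption; any Sentence-BERT style model should work similarly):
    # from haystack.retriever.dense import EmbeddingRetriever
    # retriever = EmbeddingRetriever(document_store=document_store,
    #                                embedding_model="deepset/sentence_bert")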

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based
    # on powerful, but slower deep learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and Transformers.
    # With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).
    # **Here:** a medium sized RoBERTa QA model using a Reader based on
    #           FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)
    # **Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or
    #                            "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
    # **Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean
    #           the model prefers "no answer possible"
    #
    # #### FARMReader

    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)

    # #### TransformersReader

    # Alternative:
    # reader = TransformersReader(
    #    model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # ### Pipeline
    #
    # With a Haystack `Pipeline` you can stick your building blocks together into a search pipeline.
    # Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    # To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
    # You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
Example #8
    temp = {}
    if isinstance(row['abstract'], str):
        temp['text'] = row['abstract']
        temp['meta'] = {}
        temp['meta']['title'] = row['title']
        temp['meta']['doi'] = row['doi'] if isinstance(row['doi'], str) else ''
        #temp['meta']['References'] = ', '.join(row['References']) if isinstance(row['References'], list) else ''
        temp['meta']['keywords'] = ' '.join(row['keywords']) if isinstance(
            row['keywords'], list) and len(row['keywords']) > 0 else ''
        other_nfpaper_docs_dicts.append(temp)

logging.info("Indexing Elesier articals only with abstracts")
document_store.write_documents(other_nfpaper_docs_dicts)

# Compiled informatics PDF docs from the NF Registry website
pdf_dicts = convert_files_to_dicts(dir_path='./data/nf_info_pdfs/',
                                   split_paragraphs=True)
for p in pdf_dicts:
    p['text'] = re.sub(r'For more information on.*', '',
                       ' '.join(p['text'].split('\n')))
pdf_docs = [
    p for p in pdf_dicts if not p['text'].startswith(
        'Help end NF by joining the confidential NF Registry')
    and len(p['text']) > 50
]

logging.info("Indexing NF Registry PDF files")
document_store.write_documents(pdf_docs)

# PMC Articles
pmc_papers = pd.read_csv("./data/pmc_papers.csv")
pmc_papers['authors'] = pmc_papers['authors'].map(
Example #9
def tutorial6_better_retrieval_via_dpr():
    # OPTION 1: FAISS is a library for efficient similarity search on a cluster of dense vectors.
    # The FAISSDocumentStore uses a SQL (SQLite in-memory by default) document store under the hood
    # to store the document text and other metadata. The vector embeddings of the text are
    # indexed on a FAISS index that is later queried for searching answers.
    # The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for
    # faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor.
    # For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
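    # Sketch of the "HNSW" alternative described above:
    # document_store = FAISSDocumentStore(faiss_index_factory_str="HNSW")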

    # OPTION 2: Milvus is an open-source database library that is also optimized for vector similarity searches like FAISS.
    # Like FAISS it has both a "Flat" and "HNSW" mode but it outperforms FAISS when it comes to dynamic data management.
    # It does require a little more setup, however, as it is run through Docker and requires the setup of some config files.
    # See https://milvus.io/docs/v1.0.0/milvus_docker-cpu.md
    # launch_milvus()
    # document_store = MilvusDocumentStore()

    # ## Preprocessing of documents
    # Let's first get some documents that we want to query
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)

    # Now, let's write the docs to our DB.
    document_store.write_documents(dicts)

    ### Retriever
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=2,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True)

    # Important:
    # Now that we have initialized the DPR retriever, we need to call update_embeddings() to iterate over all
    # previously indexed documents and update their embedding representation.
    # While this can be a time-consuming operation (depending on corpus size), it only needs to be done once.
    # At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
    document_store.update_embeddings(retriever)

    ### Reader
    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                        use_gpu=True)

    ### Pipeline
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10,
                          top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
Example #10
from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.tokenizer import tokenizer
from haystack.utils import print_answers
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.retriever.sparse import TfidfRetriever

print("===============DocumentStore=================")
document_store_tfidf = InMemoryDocumentStore()
doc_dir_ja = "data/article_txt_got_ja_0"
dicts_ja = convert_files_to_dicts(dir_path=doc_dir_ja,
                                  clean_func=clean_wiki_text,
                                  split_paragraphs=True)
print(dicts_ja[0:3])
document_store_tfidf.write_documents(dicts_ja)

print("===============Retriever&Reader================")
retriever_tfidf = TfidfRetriever(document_store=document_store_tfidf)
reader_farm = FARMReader(model_name_or_path="cl-tohoku/bert-base-japanese",
                         use_gpu=True)
finder_tfidf_farm = Finder(reader_farm, retriever_tfidf)

print("===================question========================")
question = "脚本家は誰?"
tokenization = tokenizer.FullTokenizer(
    "./model_sentence_piece/vocab.txt",
    model_file="./model_sentence_piece/wiki-ja.model",
    do_lower_case=True)
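
# Hedged sketch of how the Finder above might be queried (top_k values are
# illustrative assumptions; this call is not part of the original snippet):
# prediction = finder_tfidf_farm.get_answers(question=question,
#                                            top_k_retriever=10,
#                                            top_k_reader=3)
# print_answers(prediction, details="minimal")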
Example #11
converter = TextConverter(remove_numeric_tables=True,
                          valid_languages=["en"])
doc_txt = converter.convert(
    file_path="data/preprocessing_tutorial/classics.txt", meta=None)

converter = PDFToTextConverter(remove_numeric_tables=True,
                               valid_languages=["en"])
doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf",
                            meta=None)

converter = DocxToTextConverter(remove_numeric_tables=True,
                                valid_languages=["en"])
doc_docx = converter.convert(
    file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)

# Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.

all_docs = convert_files_to_dicts(dir_path="data/preprocessing_tutorial")
"""

## PreProcessor

The PreProcessor class is designed to help you clean text and split text into sensible units.
File splitting can have a very significant impact on the system's performance.
Have a look at the [Preprocessing](https://haystack.deepset.ai/docs/latest/preprocessingmd)
and [Optimization](https://haystack.deepset.ai/docs/latest/optimizationmd) pages on our website for more details.
"""

# This is a default usage of the PreProcessor.
# Here, it performs cleaning of consecutive whitespaces
# and splits a single large document into smaller documents.
# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
# Note how the single document passed into the PreProcessor gets split into 5 smaller documents
Example #12
def tutorial14_query_classifier():

    #Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    got_dicts = convert_files_to_dicts(
        dir_path=doc_dir,
        clean_func=clean_wiki_text,
        split_paragraphs=True
    )

    # Initialize DocumentStore and index documents
    launch_es()
    document_store = ElasticsearchDocumentStore()
    document_store.delete_all_documents()
    document_store.write_documents(got_dicts)

    # Initialize Sparse retriever
    es_retriever = ElasticsearchRetriever(document_store=document_store)

    # Initialize dense retriever
    dpr_retriever = DensePassageRetriever(document_store)
    document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")


    # Here we build the pipeline
    sklearn_keyword_classifier = Pipeline()
    sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"])
    sklearn_keyword_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
    sklearn_keyword_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
    sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
    sklearn_keyword_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = sklearn_keyword_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = sklearn_keyword_classifier.run(
        query="arya stark father",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_2)

    # Run only the dense retriever on the full sentence query
    res_3 = sklearn_keyword_classifier.run(
        query="which country was jon snow filmed ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_3)

    # Run only the sparse retriever on a keyword based query
    res_4 = sklearn_keyword_classifier.run(
        query="jon snow country",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_4)

    # Run only the dense retriever on the full sentence query
    res_5 = sklearn_keyword_classifier.run(
        query="who are the younger brothers of arya stark ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_5)

    # Run only the sparse retriever on a keyword based query
    res_6 = sklearn_keyword_classifier.run(
        query="arya stark younger brothers",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_6)

    # Here we build the pipeline
    transformer_keyword_classifier = Pipeline()
    transformer_keyword_classifier.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"])
    transformer_keyword_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
    transformer_keyword_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
    transformer_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
    transformer_keyword_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = transformer_keyword_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = transformer_keyword_classifier.run(
        query="arya stark father",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_2)

    # Run only the dense retriever on the full sentence query
    res_3 = transformer_keyword_classifier.run(
        query="which country was jon snow filmed ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_3)

    # Run only the sparse retriever on a keyword based query
    res_4 = transformer_keyword_classifier.run(
        query="jon snow country",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_4)

    # Run only the dense retriever on the full sentence query
    res_5 = transformer_keyword_classifier.run(
        query="who are the younger brothers of arya stark ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_5)

    # Run only the sparse retriever on a keyword based query
    res_6 = transformer_keyword_classifier.run(
        query="arya stark younger brothers",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    print_answers(res_6)

    # Here we build the pipeline
    transformer_question_classifier = Pipeline()
    transformer_question_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
    transformer_question_classifier.add_node(component=TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier"), name="QueryClassifier", inputs=["DPRRetriever"])
    transformer_question_classifier.add_node(component=reader, name="QAReader", inputs=["QueryClassifier.output_1"])
    transformer_question_classifier.draw("question_classifier.png")

    # Run only the QA reader on the question query
    res_1 = transformer_question_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "="*15)
    print_answers(res_1)

    # Show only DPR results
    res_2 = transformer_question_classifier.run(
        query="Arya Stark was the daughter of a Lord.",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "="*15)
    res_2

    # Here we create the keyword vs question/statement query classifier

    queries = ["arya stark father","jon snow country",
               "who is the father of arya stark","which country was jon snow filmed?"]

    keyword_classifier = TransformersQueryClassifier()

    for query in queries:
        result = keyword_classifier.run(query=query)
        if result[1] == "output_1":
            category = "question/statement"
        else:
            category = "keyword"

        print(f"Query: {query}, raw_output: {result}, class: {category}")

    # Here we create the question vs statement query classifier

    queries = ["Lord Eddard was the father of Arya Stark.","Jon Snow was filmed in United Kingdom.",
               "who is the father of arya stark?","Which country was jon snow filmed in?"]

    question_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier")

    for query in queries:
        result = question_classifier.run(query=query)
        if result[1] == "output_1":
            category = "question"
        else:
            category = "statement"

        print(f"Query: {query}, raw_output: {result}, class: {category}")
Example #13
def tutorial12_lfqa():
    """
    Document Store:
    FAISS is a library for efficient similarity search on a cluster of dense vectors.
    The `FAISSDocumentStore` uses a SQL (SQLite in-memory by default) database under the hood
    to store the document text and other metadata. The vector embeddings of the text are
    indexed on a FAISS index that is later queried for searching answers.
    The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for
    faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor.
    For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    """

    from haystack.document_store.faiss import FAISSDocumentStore

    document_store = FAISSDocumentStore(vector_dim=128,
                                        faiss_index_factory_str="Flat")
    """
    Cleaning & indexing documents:
    Similarly to the previous tutorials, we download, convert and index some Game of Thrones articles to our DocumentStore
    """

    # Let's first get some files that we want to use
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert files to dicts
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)

    # Now, let's write the dicts containing documents to our DB.
    document_store.write_documents(dicts)
    """
    Initialize Retriever and Reader/Generator:
    We use an `EmbeddingRetriever` with a Retribert model and invoke `update_embeddings` to index the embeddings of documents in the `FAISSDocumentStore`
    """

    from haystack.retriever.dense import EmbeddingRetriever

    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model="yjernite/retribert-base-uncased",
        model_format="retribert")

    document_store.update_embeddings(retriever)
    """Before we blindly use the `RetribertRetriever` let's empirically test it to make sure a simple search indeed finds the relevant documents."""

    from haystack.utils import print_answers, print_documents
    from haystack.pipeline import DocumentSearchPipeline

    p_retrieval = DocumentSearchPipeline(retriever)
    res = p_retrieval.run(query="Tell me something about Arya Stark?",
                          top_k_retriever=5)
    print_documents(res, max_text_len=512)
    """
    Similar to the previous tutorials, we now initialize our reader/generator.
    Here we use a `Seq2SeqGenerator` with the *yjernite/bart_eli5* model (see: https://huggingface.co/yjernite/bart_eli5)
    """

    generator = Seq2SeqGenerator(model_name_or_path="yjernite/bart_eli5")
    """
    Pipeline:
    With a Haystack `Pipeline` you can stick your building blocks together into a search pipeline.
    Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
    To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `GenerativeQAPipeline` that combines a retriever and a reader/generator to answer our questions.
    You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    """

    from haystack.pipeline import GenerativeQAPipeline
    pipe = GenerativeQAPipeline(generator, retriever)
    """Voilà! Ask a question!"""

    query_1 = "Why did Arya Stark's character get portrayed in a television adaptation?"
    result_1 = pipe.run(query=query_1, top_k_retriever=1)
    print(f"Query: {query_1}")
    print(f"Answer: {result_1['answers'][0]}")
    print()

    query_2 = "What kind of character does Arya Stark play?"
    result_2 = pipe.run(query=query_2, top_k_retriever=1)
    print(f"Query: {query_2}")
    print(f"Answer: {result_2['answers'][0]}")
    print()
    pipe.run(query=query_2, top_k_retriever=1)
Example #14
def tutorial11_pipelines():
    #Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    got_dicts = convert_files_to_dicts(dir_path=doc_dir,
                                       clean_func=clean_wiki_text,
                                       split_paragraphs=True)

    # Initialize DocumentStore and index documents
    launch_es()
    document_store = ElasticsearchDocumentStore()
    document_store.delete_all_documents()
    document_store.write_documents(got_dicts)

    # Initialize Sparse retriever
    es_retriever = ElasticsearchRetriever(document_store=document_store)

    # Initialize dense retriever
    dpr_retriever = DensePassageRetriever(document_store)
    document_store.update_embeddings(dpr_retriever,
                                     update_existing_embeddings=False)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

    ######################
    # Prebuilt Pipelines #
    ######################

    # Extractive QA Pipeline
    ########################

    p_extractive_premade = ExtractiveQAPipeline(reader=reader,
                                                retriever=es_retriever)
    res = p_extractive_premade.run(query="Who is the father of Arya Stark?",
                                   top_k_retriever=10,
                                   top_k_reader=5)
    print_answers(res, details="minimal")

    # Document Search Pipeline
    ##########################

    p_retrieval = DocumentSearchPipeline(es_retriever)
    res = p_retrieval.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10)
    print_documents(res, max_text_len=200)

    # Generator Pipeline
    ##########################

    # We set this to True so that the document store returns document embeddings
    # with each document; this is needed by the Generator
    document_store.return_embedding = True

    # Initialize generator
    rag_generator = RAGenerator()

    # Generative QA
    p_generator = GenerativeQAPipeline(generator=rag_generator,
                                       retriever=dpr_retriever)
    res = p_generator.run(query="Who is the father of Arya Stark?",
                          top_k_retriever=10)
    print_answers(res, details="minimal")

    # We are setting this to False so that in later pipelines,
    # we get a cleaner printout
    document_store.return_embedding = False

    ##############################
    # Creating Pipeline Diagrams #
    ##############################

    p_extractive_premade.draw("pipeline_extractive_premade.png")
    p_retrieval.draw("pipeline_retrieval.png")
    p_generator.draw("pipeline_generator.png")

    ####################
    # Custom Pipelines #
    ####################

    # Extractive QA Pipeline
    ########################

    # Custom built extractive QA pipeline
    p_extractive = Pipeline()
    p_extractive.add_node(component=es_retriever,
                          name="Retriever",
                          inputs=["Query"])
    p_extractive.add_node(component=reader,
                          name="Reader",
                          inputs=["Retriever"])

    # Now we can run it
    res = p_extractive.run(query="Who is the father of Arya Stark?",
                           top_k_retriever=10,
                           top_k_reader=5)
    print_answers(res, details="minimal")
    p_extractive.draw("pipeline_extractive.png")

    # Ensembled Retriever Pipeline
    ##############################

    # Create ensembled pipeline
    p_ensemble = Pipeline()
    p_ensemble.add_node(component=es_retriever,
                        name="ESRetriever",
                        inputs=["Query"])
    p_ensemble.add_node(component=dpr_retriever,
                        name="DPRRetriever",
                        inputs=["Query"])
    p_ensemble.add_node(component=JoinDocuments(join_mode="concatenate"),
                        name="JoinResults",
                        inputs=["ESRetriever", "DPRRetriever"])
    p_ensemble.add_node(component=reader,
                        name="Reader",
                        inputs=["JoinResults"])
    p_ensemble.draw("pipeline_ensemble.png")

    # Run pipeline
    res = p_ensemble.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=5  #This is top_k per retriever
    )
    print_answers(res, details="minimal")

    # Query Classification Pipeline
    ###############################

    # Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run.
    # Though this looks very similar to the ensembled pipeline shown above,
    # the key difference is that only one of the retrievers is run for each request.
    # By contrast both retrievers are always run in the ensembled approach.

    class QueryClassifier():
        outgoing_edges = 2

        def run(self, **kwargs):
            if "?" in kwargs["query"]:
                return (kwargs, "output_2")
            else:
                return (kwargs, "output_1")

    # Here we build the pipeline
    p_classifier = Pipeline()
    p_classifier.add_node(component=QueryClassifier(),
                          name="QueryClassifier",
                          inputs=["Query"])
    p_classifier.add_node(component=es_retriever,
                          name="ESRetriever",
                          inputs=["QueryClassifier.output_1"])
    p_classifier.add_node(component=dpr_retriever,
                          name="DPRRetriever",
                          inputs=["QueryClassifier.output_2"])
    p_classifier.add_node(component=reader,
                          name="QAReader",
                          inputs=["ESRetriever", "DPRRetriever"])
    p_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = p_classifier.run(query="Who is the father of Arya Stark?",
                             top_k_retriever=10)
    print("DPR Results" + "\n" + "=" * 15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = p_classifier.run(query="Arya Stark father", top_k_retriever=10)
    print("ES Results" + "\n" + "=" * 15)
    print_answers(res_2)
Example #15
# try:
# or we should configure cache_dir in from_pretrained(); don't rely on the cache, save the model explicitly with save()
# model = TransformersReader(model_name_or_path=modelName, use_gpu=-1)
# model = FARMReader(model_name_or_path=modelName,
#                                use_gpu=False, no_ans_boost=0)
# except:
#   pass;

document_store = SQLDocumentStore(url=sqlUrl)
# document_store = FAISSDocumentStore(sql_url=sqlUrl)

# convert files to dicts containing documents that can be indexed to our datastore
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
# It must take a str as input, and return a str.
dicts = convert_files_to_dicts(dir_path=doc_dir,
                               clean_func=clean_wiki_text,
                               split_paragraphs=False)
# dicts = tika_convert_files_to_dicts(dir_path=doc_dir,clean_func=clean_wiki_text, split_paragraphs=False)

# We now have a list of dictionaries that we can write to our document store.
# If your texts come from a different source (e.g. a DB), you can of course skip convert_files_to_dicts() and create the dictionaries yourself.
# The default format here is: {"name": "<some-document-name>", "text": "<the-actual-text>"}
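# Minimal sketch of such a dict built by hand (name and text are illustrative assumptions):
# manual_dicts = [{"name": "my_doc.txt", "text": "Some text to index."}]
# document_store.write_documents(manual_dicts)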

# Let's have a look at the first 3 entries:
# print(dicts[:3])
# Now, let's write the docs to our DB.
document_store.delete_all_documents()
document_store.write_documents(dicts)

# retriever = EmbeddingRetriever(
#     document_store=document_store, embedding_model="sentence_bert-saved", use_gpu=False)
Example #16
def tutorial8_preprocessing():
    # This fetches some sample files to work with

    doc_dir = "data/preprocessing_tutorial"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
    """
    ## Converters
    
    Haystack's converter classes are designed to help you turn files on your computer into the documents
    that can be processed by the Haystack pipeline.
    There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.
    """

    # Here are some examples of how you would use file converters

    converter = TextConverter(remove_numeric_tables=True,
                              valid_languages=["en"])
    doc_txt = converter.convert(
        file_path="data/preprocessing_tutorial/classics.txt", meta=None)

    converter = PDFToTextConverter(remove_numeric_tables=True,
                                   valid_languages=["en"])
    doc_pdf = converter.convert(
        file_path="data/preprocessing_tutorial/bert.pdf", meta=None)

    converter = DocxToTextConverter(remove_numeric_tables=True,
                                    valid_languages=["en"])
    doc_docx = converter.convert(
        file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)

    # Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.

    all_docs = convert_files_to_dicts(dir_path="data/preprocessing_tutorial")
    """
    
    ## PreProcessor
    
    The PreProcessor class is designed to help you clean text and split text into sensible units.
    File splitting can have a very significant impact on the system's performance.
    Have a look at the [Preprocessing](https://haystack.deepset.ai/docs/latest/preprocessingmd)
    and [Optimization](https://haystack.deepset.ai/docs/latest/optimizationmd) pages on our website for more details.
    """

    # This is a default usage of the PreProcessor.
    # Here, it performs cleaning of consecutive whitespaces
    # and splits a single large document into smaller documents.
    # Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
    # Note how the single document passed into the PreProcessor gets split into 5 smaller documents

    preprocessor = PreProcessor(clean_empty_lines=True,
                                clean_whitespace=True,
                                clean_header_footer=False,
                                split_by="word",
                                split_length=1000,
                                split_respect_sentence_boundary=True)
    docs_default = preprocessor.process(doc_txt)
    print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}")
    """
    ## Cleaning
    
    - `clean_empty_lines` will normalize 3 or more consecutive empty lines to two empty lines
    - `clean_whitespace` will remove any whitespace at the beginning or end of each line in the text
    - `clean_header_footer` will remove any long header or footer texts that are repeated on each page
    
    ## Splitting
    By default, the PreProcessor will respect sentence boundaries, meaning that documents will not start or end
    midway through a sentence.
    This will help reduce the possibility of answer phrases being split between two documents.
    This feature can be turned off by setting `split_respect_sentence_boundary=False`.
    """

    # Not respecting sentence boundary vs respecting sentence boundary

    preprocessor_nrsb = PreProcessor(split_respect_sentence_boundary=False)
    docs_nrsb = preprocessor_nrsb.process(doc_txt)

    print("RESPECTING SENTENCE BOUNDARY")
    end_text = docs_default[0]["text"][-50:]
    print("End of document: \"..." + end_text + "\"")
    print()
    print("NOT RESPECTING SENTENCE BOUNDARY")
    end_text_nrsb = docs_nrsb[0]["text"][-50:]
    print("End of document: \"..." + end_text_nrsb + "\"")
    """
    A commonly used strategy to split long documents, especially in the field of Question Answering,
    is the sliding window approach. If `split_length=10` and `split_overlap=3`, your documents will look like this:
    
    - doc1 = words[0:10]
    - doc2 = words[7:17]
    - doc3 = words[14:24]
    - ...
    
    You can use this strategy by following the code below.
    """

    # Sliding window approach

    preprocessor_sliding_window = PreProcessor(
        split_overlap=3,
        split_length=10,
        split_respect_sentence_boundary=False)
    docs_sliding_window = preprocessor_sliding_window.process(doc_txt)

    doc1 = docs_sliding_window[0]["text"][:200]
    doc2 = docs_sliding_window[1]["text"][:100]
    doc3 = docs_sliding_window[2]["text"][:100]

    print("Document 1: \"" + doc1 + "...\"")
    print("Document 2: \"" + doc2 + "...\"")
    print("Document 3: \"" + doc3 + "...\"")
Example #17
def read_corpus():
    document_store = InMemoryDocumentStore()
    doc_dir = "Quran"
    dicts = convert_files_to_dicts(dir_path=doc_dir, split_paragraphs=True)
    document_store.write_documents(dicts)
    return document_store
Example #18
def add_sample_data_doc_qa(model: DocQAModel):
    dicts = convert_files_to_dicts(dir_path="data/rc",
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)
    model.finder.retriever.document_store.write_documents(dicts)