def test_extractive_qa_answers_single_result(reader, retriever_with_docs):
    """Check that asking for a single reader result yields exactly one answer."""
    qa_pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
    result = qa_pipeline.run(query="testing finder", top_k_retriever=1, top_k_reader=1)

    assert result is not None
    answers = result["answers"]
    assert len(answers) == 1
def test_extractive_qa_offsets(reader, retriever_with_docs):
    """Check the top answer's offsets and that they slice the answer out of its context."""
    qa_pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
    result = qa_pipeline.run(query="Who lives in Berlin?", top_k_retriever=10, top_k_reader=5)

    # All assertions concern the highest-ranked answer.
    best = result["answers"][0]
    assert best["offset_start"] == 11
    assert best["offset_end"] == 16

    # The offsets must be consistent with the answer text itself.
    begin, stop = best["offset_start"], best["offset_end"]
    assert best["context"][begin:stop] == best["answer"]
def test_extractive_qa_answers(reader, retriever_with_docs):
    """Check the shape and content of the prediction for a known question."""
    question = "Who lives in Berlin?"
    qa_pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
    result = qa_pipeline.run(query=question, top_k_retriever=10, top_k_reader=3)

    assert result is not None
    assert result["query"] == question

    # Details of the highest-ranked answer.
    best = result["answers"][0]
    assert best["answer"] == "Carla"
    assert 0 <= best["probability"] <= 1
    assert best["meta"]["meta_field"] == "test1"
    assert best["context"] == "My name is Carla and I live in Berlin"

    # top_k_reader=3 was requested, so three answers are expected.
    assert len(result["answers"]) == 3
def Find_answer(text_file_path, data_folder_path, symbol, question):
    """Answer *question* from a text file via a DPR + FARM extractive QA pipeline.

    The file at ``text_file_path`` is split on ``symbol`` and every piece is
    written to ``data_folder_path`` as ``data<N>.txt`` (N starting at 1). The
    folder is then converted to document dicts, written to a fresh FAISS
    document store, embedded with a ``DensePassageRetriever`` and queried
    through an ``ExtractiveQAPipeline``.

    Note that the document store, retriever and reader are all created on
    every call.

    Args:
        text_file_path: Path of the UTF-8 text file to read.
        data_folder_path: Folder that receives the split files; it is created
            if it does not exist yet.
        symbol: Separator the text is split on. It must be non-empty, because
            ``str.split`` raises ``ValueError`` for an empty separator.
        question: Query passed to the pipeline.

    Returns:
        The distinct answer contexts, in the order the pipeline returned them,
        joined by single spaces.
    """
    import os

    # Create the output folder up front so the first write cannot fail on it.
    os.makedirs(data_folder_path, exist_ok=True)

    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

    with open(text_file_path, 'r', encoding='utf-8') as source_file:
        data = source_file.read()

    for file_no, chunk in enumerate(data.split(symbol), start=1):
        print(f'writing file no.{file_no}')
        # Write with the same encoding the source was read with; the platform
        # default may not be able to represent every character.
        with open(f'{data_folder_path}/data{file_no}.txt', 'w', encoding='utf-8') as chunk_file:
            chunk_file.write(chunk)

    # NOTE(review): the whole folder is handed to convert_files_to_dicts, so
    # files left over from an earlier run are presumably indexed too - confirm.
    test_dicts = convert_files_to_dicts(dir_path=data_folder_path, clean_func=clean_wiki_text, split_paragraphs=True)
    document_store.write_documents(test_dicts)

    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True,
    )
    # The documents were written before the retriever existed, so their
    # embeddings have to be computed now.
    document_store.update_embeddings(retriever)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, context_window_size=300)
    pipe = ExtractiveQAPipeline(reader, retriever)
    prediction = pipe.run(query=question, top_k_retriever=10, top_k_reader=3)

    # dict.fromkeys drops duplicate contexts while keeping first-seen order.
    doc_with_ans = list(dict.fromkeys(ans['context'] for ans in prediction['answers']))
    return ' '.join(doc_with_ans)
def tutorial3_basic_qa_pipeline_without_elasticsearch():
    """Run extractive QA over Game of Thrones articles without Elasticsearch.

    Uses an in-memory document store, a TF-IDF retriever and a FARM reader,
    then prints the answers to a sample question.
    """
    # --- Document store -------------------------------------------------
    # In-memory store; alternatively, a SQLite-backed store can be used:
    # store = SQLDocumentStore(url="sqlite:///qa.db")
    store = InMemoryDocumentStore()

    # --- Preprocessing of documents -------------------------------------
    # Haystack provides a customizable pipeline for converting files into
    # texts, cleaning them, splitting them and writing them to a document
    # store. Here we download 517 Wikipedia articles on Game of Thrones,
    # apply a basic cleaning function and index them in the document store.
    data_dir = "data/article_txt_got"
    archive_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=archive_url, output_dir=data_dir)

    # Convert the files to dicts that can be indexed. The optional cleaning
    # function (e.g. to remove footers) must take a str and return a str.
    docs = convert_files_to_dicts(dir_path=data_dir, clean_func=clean_wiki_text, split_paragraphs=True)
    store.write_documents(docs)

    # --- Retriever -------------------------------------------------------
    # Retrievers narrow the scope for the reader down to smaller units of
    # text where the question could be answered. With InMemoryDocumentStore
    # or SQLDocumentStore you can use the TfidfRetriever, an in-memory
    # retriever based on Pandas dataframes. See tutorial 1 for more options.
    tfidf_retriever = TfidfRetriever(document_store=store)

    # --- Reader ----------------------------------------------------------
    # A reader scans the retrieved texts in detail and extracts the k best
    # answers. Readers are based on powerful, but slower deep learning
    # models. Haystack supports readers built on FARM and Transformers; both
    # can load a local model or one from Hugging Face's model hub
    # (https://huggingface.co/models).
    #
    # Here: a medium sized RoBERTa QA model via FARM
    # (https://huggingface.co/deepset/roberta-base-squad2).
    # Alternative models: "distilbert-base-uncased-distilled-squad" (fast) or
    # "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy).
    # Hint: no_ans_boost adjusts how readily the model returns "no answer
    # possible"; higher values make it prefer "no answer possible".
    farm_reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
    # Alternative reader (uses the `pipeline` of the Transformers package):
    # farm_reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # --- Pipeline --------------------------------------------------------
    # A Haystack `Pipeline` sticks building blocks together into a search
    # pipeline; under the hood it is a Directed Acyclic Graph (DAG) you can
    # customize. `ExtractiveQAPipeline` is a predefined one that combines a
    # retriever and a reader. More on pipelines in the
    # [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    qa_pipeline = ExtractiveQAPipeline(farm_reader, tfidf_retriever)

    # --- Ask a question --------------------------------------------------
    result = qa_pipeline.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
    # Other questions to try:
    # result = qa_pipeline.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # result = qa_pipeline.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(result, details="minimal")
def tutorial1_basic_qa_pipeline():
    """Run the basic extractive QA tutorial against Elasticsearch.

    Optionally launches Elasticsearch in Docker, indexes Wikipedia articles
    on Game of Thrones, builds an ``ExtractiveQAPipeline`` from an
    ``ElasticsearchRetriever`` and a ``FARMReader``, and prints the answers
    to a sample question.

    Raises:
        RuntimeError: If the Docker command cannot be started or exits with
            a non-zero return code.
    """
    logger = logging.getLogger(__name__)

    LAUNCH_ELASTICSEARCH = True

    # ## Document Store
    #
    # Haystack finds answers to queries within the documents stored in a
    # `DocumentStore`. The current implementations include
    # `ElasticsearchDocumentStore`, `FAISSDocumentStore`, `SQLDocumentStore`
    # and `InMemoryDocumentStore`.
    #
    # **Here:** We recommend Elasticsearch as it comes preloaded with
    # features like full-text queries, BM25 retrieval, and vector storage for
    # text embeddings.
    # **Alternatives:** If you are unable to set up an Elasticsearch
    # instance, follow Tutorial 3 for using SQL/InMemory document stores.
    # **Hint:** This tutorial creates a new document store instance with
    # Wikipedia articles on Game of Thrones. However, you can configure
    # Haystack to work with your existing document stores.
    #
    # Start an Elasticsearch server
    # You can start Elasticsearch on your local machine using Docker. If
    # Docker is not readily available in your environment (e.g. in Colab
    # notebooks), you can manually download and execute Elasticsearch from
    # source.
    if LAUNCH_ELASTICSEARCH:
        logger.info("Starting Elasticsearch ...")
        launch_error = (
            "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance "
            "then set LAUNCH_ELASTICSEARCH in the script to False."
        )
        try:
            # Argument list without a shell: nothing is parsed by /bin/sh.
            status = subprocess.run([
                "docker", "run", "-d",
                "-p", "9200:9200",
                "-e", "discovery.type=single-node",
                "elasticsearch:7.9.2",
            ])
        except OSError as err:
            # Without a shell a missing `docker` executable raises instead of
            # returning a non-zero code; report it with the same message.
            raise RuntimeError(launch_error) from err
        if status.returncode:
            raise RuntimeError(launch_error)
        # Fixed wait after starting the container, before connecting to it.
        time.sleep(15)

    # Connect to Elasticsearch
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

    # ## Preprocessing of documents
    #
    # Haystack provides a customizable pipeline for:
    #  - converting files into texts
    #  - cleaning texts
    #  - splitting texts
    #  - writing them to a Document Store
    # In this tutorial, we download Wikipedia articles about Game of Thrones,
    # apply a basic cleaning function, and add them to Elasticsearch.

    # Let's first fetch some documents that we want to query
    # Here: 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert files to dicts containing documents that can be indexed to our
    # datastore. You can optionally supply a cleaning function that is
    # applied to each doc (e.g. to remove footers). It must take a str as
    # input, and return a str.
    dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

    # Now, let's write the docs to our DB.
    if LAUNCH_ELASTICSEARCH:
        document_store.write_documents(dicts)
    else:
        logger.warning(
            "Since we already have a running ES instance we should not index the same documents again. \n"
            "If you still want to do this call: document_store.write_documents(dicts) manually "
        )

    # ## Initialize Retriever, Reader, & Pipeline
    #
    # ### Retriever
    #
    # Retrievers help narrow down the scope for the Reader to smaller units
    # of text where a given question could be answered. They use some simple
    # but fast algorithm.
    #
    # **Here:** We use Elasticsearch's default BM25 algorithm
    # **Alternatives:**
    #  - Customize the `ElasticsearchRetriever` with custom queries
    #    (e.g. boosting) and filters
    #  - Use `EmbeddingRetriever` to find candidate documents based on the
    #    similarity of embeddings (e.g. created via Sentence-BERT)
    #  - Use `TfidfRetriever` in combination with a SQL or InMemory Document
    #    store for simple prototyping and debugging
    retriever = ElasticsearchRetriever(document_store=document_store)

    # Alternative: An in-memory TfidfRetriever based on Pandas dataframes for
    # building quick prototypes with SQLite document store.
    #
    # from haystack.retriever.tfidf import TfidfRetriever
    # retriever = TfidfRetriever(document_store=document_store)

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts
    # the k best answers. They are based on powerful, but slower deep
    # learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and
    # Transformers. With both you can either load a local model or one from
    # Hugging Face's model hub (https://huggingface.co/models).
    # **Here:** a medium sized RoBERTa QA model using a Reader based on
    #           FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):** TransformersReader (leveraging the
    #           `pipeline` of the Transformers package)
    # **Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad"
    #           (fast) or "deepset/bert-large-uncased-whole-word-masking-squad2"
    #           (good accuracy)
    # **Hint:** You can adjust the model to return "no answer possible" with
    #           the no_ans_boost. Higher values mean the model prefers
    #           "no answer possible"
    #
    # #### FARMReader
    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

    # #### TransformersReader
    # Alternative:
    # reader = TransformersReader(
    #    model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # ### Pipeline
    #
    # With a Haystack `Pipeline` you can stick together your building blocks
    # to a search pipeline. Under the hood, `Pipelines` are Directed Acyclic
    # Graphs (DAGs) that you can easily customize for your own use cases.
    # To speed things up, Haystack also comes with a few predefined
    # Pipelines. One of them is the `ExtractiveQAPipeline` that combines a
    # retriever and a reader to answer our questions.
    # You can learn more about `Pipelines` in the
    # [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")
def tutorial6_better_retrieval_via_dpr():
    """Run extractive QA with Dense Passage Retrieval on a FAISS document store.

    Indexes the Game of Thrones articles, embeds them with a
    ``DensePassageRetriever`` and prints the answers to a sample question.
    """
    # --- Document store -------------------------------------------------
    # OPTION 1: FAISS is a library for efficient similarity search on a
    # cluster of dense vectors. The FAISSDocumentStore uses a SQL document
    # store (SQLite in-memory by default) under the hood to store the
    # document text and other meta data. The vector embeddings of the text
    # are indexed on a FAISS index that is later queried for answers.
    # The default flavour is "Flat"; "HNSW" gives faster search at the
    # expense of some accuracy. It is selected through the
    # faiss_index_factory_str constructor argument. Guidance on choosing:
    # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
    store = FAISSDocumentStore(faiss_index_factory_str="Flat")

    # OPTION 2: Milvus is an open source database library that is also
    # optimized for vector similarity searches like FAISS. Like FAISS it has
    # both a "Flat" and "HNSW" mode but it outperforms FAISS when it comes to
    # dynamic data management. It does require a little more setup, however,
    # as it is run through Docker and requires some config files.
    # See https://milvus.io/docs/v1.0.0/milvus_docker-cpu.md
    # launch_milvus()
    # store = MilvusDocumentStore()

    # --- Preprocessing of documents -------------------------------------
    # Fetch the documents we want to query.
    data_dir = "data/article_txt_got"
    archive_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=archive_url, output_dir=data_dir)

    # Convert the files to dicts that can be indexed, then write them.
    docs = convert_files_to_dicts(dir_path=data_dir, clean_func=clean_wiki_text, split_paragraphs=True)
    store.write_documents(docs)

    # --- Retriever -------------------------------------------------------
    dense_retriever = DensePassageRetriever(
        document_store=store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=2,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True,
    )

    # Important: now that DPR is initialized, update_embeddings() must be
    # called to iterate over all previously indexed documents and update
    # their embedding representation. This can be time consuming (depending
    # on corpus size) but only needs to be done once. At query time only the
    # query is embedded and compared to the existing document embeddings,
    # which is very fast.
    store.update_embeddings(dense_retriever)

    # --- Reader ----------------------------------------------------------
    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    farm_reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

    # --- Pipeline --------------------------------------------------------
    from haystack.pipeline import ExtractiveQAPipeline
    qa_pipeline = ExtractiveQAPipeline(farm_reader, dense_retriever)

    # --- Ask a question --------------------------------------------------
    result = qa_pipeline.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
    # Other questions to try:
    # result = qa_pipeline.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # result = qa_pipeline.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(result, details="minimal")
# Embed only the documents that do not have an embedding yet.
document_store.update_embeddings(
    dpr_retriever,
    update_existing_embeddings=False,
)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

# ------------------ #
# Prebuilt pipelines #
# ------------------ #

# Extractive QA: retriever + reader, answers are printed.
p_extractive_premade = ExtractiveQAPipeline(reader=reader, retriever=es_retriever)
res = p_extractive_premade.run(
    query="Who is the father of Arya Stark?",
    top_k_retriever=10,
    top_k_reader=5,
)
print_answers(res, details="minimal")

# Document search: retriever only, documents are printed.
p_retrieval = DocumentSearchPipeline(es_retriever)
res = p_retrieval.run(
    query="Who is the father of Arya Stark?",
    top_k_retriever=10,
)
print_documents(res, max_text_len=200)

# Generator Pipeline
##########################

# We set this to True so that the document store returns document embeddings
# Testing with tutorial data
from haystack.pipeline import ExtractiveQAPipeline
from haystack.retriever.sparse import ElasticsearchRetriever

doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# Connecting point to preprocessing of as4
document_store.write_documents(dicts)

# Fast filter to narrow down text - default BM25, can be customised
retriever = ElasticsearchRetriever(document_store=document_store)

# Reader to further scan with Hugging Face models
# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)
reader = FARMReader(model_name_or_path="deepset/bert-large-uncased-whole-word-masking-squad2", use_gpu=True)

# The original Finder is deprecated; a pipeline allows more flexibility.
# prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
# top_k_retriever -> the more documents retrieved, the more the Reader has
# to scan: slower but higher hit rate.
extractive_pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)
# Other options: Document Search, Generative, FAQ

question = input("What would you like to ask? ")
prediction = extractive_pipeline.run(query=question, top_k_retriever=10, top_k_reader=5)

# print_answers(prediction, details="all")  # details: all, medium, minimal
# data format: {query,'answers':[{'answer','score','probability','context','document_id','offset','meta'}]}

# Guard against an empty answer list instead of failing with IndexError.
if prediction['answers']:
    print(prediction['answers'][0]['answer'])
else:
    print("No answer found.")
# Reader model
model_name = 'distilbert-base-cased-distilled-squad'
reader = FARMReader(
    model_name_or_path=model_name,
    progress_bar=False,
    return_no_answer=False,
)

# Combine reader and retriever into the QA pipeline.
pipe = ExtractiveQAPipeline(reader, retriever)

# Query the pipeline; top_k is set per node through `params`.
question = 'How are the colors?'
answers = pipe.run(
    query=question,
    params={
        'Retriever': {'top_k': 100},
        'Reader': {'top_k': 25},
    },
)
print('Got the answers!')

# Collect the answer strings and count how often each one occurs;
# the counts are meant for the word cloud visualization.
results = [answer['answer'] for answer in answers['answers']]
counter = Counter(results)

# wordcloud
cloud = WordCloud()
index="taschenhirn") # Now, let's write the dicts containing documents to our DB. document_store.write_documents(dicts) # initialize sparse retriever: retriever = ElasticsearchRetriever(document_store=document_store) # Alternative: reader = TransformersReader(model_name_or_path="Sahajtomar/GELECTRAQA", tokenizer="Sahajtomar/GELECTRAQA") # initialize pipe pipe = ExtractiveQAPipeline(reader, retriever) # You can configure how many candidates the reader and retriever shall return # The higher top_k_retriever, the better (but also the slower) your answers. #prediction = pipe.run(query="Welche Staaten grenzen an den Bodensee?", top_k_retriever=10, top_k_reader=5) pipe.run(query="Welches ist der größte See Bayerns?", top_k_retriever=5, top_k_reader=2) pipe.run(query="Wie weit erstreckt sich die Arktis?", top_k_retriever=5, top_k_reader=2) pipe.run(query="Wie viele Planeten kreisen um die Sonne?", top_k_retriever=5, top_k_reader=2) # EOF
def tutorial11_pipelines():
    """Walk through prebuilt and custom Haystack pipelines.

    Indexes the Game of Thrones articles into Elasticsearch, then runs,
    prints and draws several pipelines: extractive QA, document search,
    generative QA, a custom extractive pipeline, an ensemble of two
    retrievers, and a pipeline routed by a query classifier.
    """
    question = "Who is the father of Arya Stark?"

    # --- Data: 517 Wikipedia articles for Game of Thrones ---------------
    data_dir = "data/article_txt_got"
    archive_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=archive_url, output_dir=data_dir)

    # Convert the files to dicts that can be indexed to the document store.
    got_docs = convert_files_to_dicts(dir_path=data_dir, clean_func=clean_wiki_text, split_paragraphs=True)

    # --- Document store: start from an empty index ----------------------
    launch_es()
    store = ElasticsearchDocumentStore()
    store.delete_all_documents()
    store.write_documents(got_docs)

    # --- Retrievers and reader ------------------------------------------
    sparse_retriever = ElasticsearchRetriever(document_store=store)

    dense_retriever = DensePassageRetriever(store)
    store.update_embeddings(dense_retriever, update_existing_embeddings=False)

    qa_reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

    # =====================================================================
    # Prebuilt pipelines
    # =====================================================================

    # Extractive QA pipeline
    extractive_premade = ExtractiveQAPipeline(reader=qa_reader, retriever=sparse_retriever)
    result = extractive_premade.run(query=question, top_k_retriever=10, top_k_reader=5)
    print_answers(result, details="minimal")

    # Document search pipeline
    retrieval_pipeline = DocumentSearchPipeline(sparse_retriever)
    result = retrieval_pipeline.run(query=question, top_k_retriever=10)
    print_documents(result, max_text_len=200)

    # Generator pipeline
    # The generator needs the document embeddings, so make the document
    # store return them with each document.
    store.return_embedding = True

    generator = RAGenerator()
    generative_pipeline = GenerativeQAPipeline(generator=generator, retriever=dense_retriever)
    result = generative_pipeline.run(query=question, top_k_retriever=10)
    print_answers(result, details="minimal")

    # Switch embeddings off again so that later pipelines print more cleanly.
    store.return_embedding = False

    # =====================================================================
    # Pipeline diagrams
    # =====================================================================
    extractive_premade.draw("pipeline_extractive_premade.png")
    retrieval_pipeline.draw("pipeline_retrieval.png")
    generative_pipeline.draw("pipeline_generator.png")

    # =====================================================================
    # Custom pipelines
    # =====================================================================

    # Extractive QA pipeline, built node by node
    custom_extractive = Pipeline()
    custom_extractive.add_node(component=sparse_retriever, name="Retriever", inputs=["Query"])
    custom_extractive.add_node(component=qa_reader, name="Reader", inputs=["Retriever"])

    result = custom_extractive.run(query=question, top_k_retriever=10, top_k_reader=5)
    print_answers(result, details="minimal")
    custom_extractive.draw("pipeline_extractive.png")

    # Ensembled retriever pipeline: both retrievers feed a join node.
    ensemble = Pipeline()
    ensemble.add_node(component=sparse_retriever, name="ESRetriever", inputs=["Query"])
    ensemble.add_node(component=dense_retriever, name="DPRRetriever", inputs=["Query"])
    ensemble.add_node(
        component=JoinDocuments(join_mode="concatenate"),
        name="JoinResults",
        inputs=["ESRetriever", "DPRRetriever"],
    )
    ensemble.add_node(component=qa_reader, name="Reader", inputs=["JoinResults"])
    ensemble.draw("pipeline_ensemble.png")

    # top_k_retriever applies per retriever here.
    result = ensemble.run(query=question, top_k_retriever=5)
    print_answers(result, details="minimal")

    # Query classification pipeline
    # Decision nodes route the data so that only certain branches of the
    # `Pipeline` are run. This looks similar to the ensembled pipeline above,
    # but only one of the retrievers is run for each request, whereas the
    # ensembled approach always runs both.
    class QueryClassifier():
        """Route queries containing "?" to output_2, all others to output_1."""

        outgoing_edges = 2

        def run(self, **kwargs):
            edge = "output_2" if "?" in kwargs["query"] else "output_1"
            return (kwargs, edge)

    # output_1 leads to the sparse retriever, output_2 to the dense one;
    # both feed the same reader.
    classifier_pipeline = Pipeline()
    classifier_pipeline.add_node(component=QueryClassifier(), name="QueryClassifier", inputs=["Query"])
    classifier_pipeline.add_node(component=sparse_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"])
    classifier_pipeline.add_node(component=dense_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"])
    classifier_pipeline.add_node(component=qa_reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
    classifier_pipeline.draw("pipeline_classifier.png")

    # A full-sentence question contains "?" and so runs only the dense retriever.
    dense_result = classifier_pipeline.run(query=question, top_k_retriever=10)
    print("DPR Results" + "\n" + "=" * 15)
    print_answers(dense_result)

    # A keyword query has no "?" and so runs only the sparse retriever.
    sparse_result = classifier_pipeline.run(query="Arya Stark father", top_k_retriever=10)
    print("ES Results" + "\n" + "=" * 15)
    print_answers(sparse_result)