document_store.update_embeddings(retriever)
document_store.save("wiki_dump_embeddings.faiss")
# example:
# retrieved_docs = retriever.retrieve(query="Why did the revenue increase?")

# tune these parameters too
print("Running QA Reader")
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, no_ans_boost=-10,
                    context_window_size=500, top_k_per_candidate=3, top_k_per_sample=1,
                    num_processes=1, max_seq_len=256, doc_stride=128)
# example:
# reader.predict(question="Who is the father of Arya Stark?", documents=retrieved_docs, top_k=3)

print("Started pipeline")
p = Pipeline()
p.add_node(component=retriever, name="ESRetriever1", inputs=["Query"])
p.add_node(component=reader, name="QAReader", inputs=["ESRetriever1"])

res = p.run(query="What did Einstein work on?", params={"retriever": {"top_k": 1}})
json.dump(res, open("answer.json", "w"), default=dumper)
exit()
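# Note: `dumper` above is defined elsewhere in the original script. A minimal stand-in
# (an assumption for illustration, not the original helper) that lets json.dump handle
# non-serializable Haystack objects could look like this:
#
# def dumper(obj):
#     # fall back to a dict or string form for objects json can't serialize directly
#     if hasattr(obj, "to_dict"):
#         return obj.to_dict()
#     if hasattr(obj, "__dict__"):
#         return obj.__dict__
#     return str(obj)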
def test_eval_pipeline(document_store: BaseDocumentStore, reader, retriever):
    # add eval data (SQUAD format)
    document_store.add_eval_data(
        filename="samples/squad/tiny.json",
        doc_index="haystack_test_eval_document",
        label_index="haystack_test_feedback",
    )

    labels = document_store.get_all_labels_aggregated(index="haystack_test_feedback")

    eval_retriever = EvalDocuments()
    eval_reader = EvalAnswers(sas_model="sentence-transformers/paraphrase-MiniLM-L3-v2", debug=True)
    eval_reader_cross = EvalAnswers(sas_model="cross-encoder/stsb-TinyBERT-L-4", debug=True)
    eval_reader_vanilla = EvalAnswers()

    assert document_store.get_document_count(index="haystack_test_eval_document") == 2

    p = Pipeline()
    p.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
    p.add_node(component=eval_retriever, name="EvalDocuments", inputs=["ESRetriever"])
    p.add_node(component=reader, name="QAReader", inputs=["EvalDocuments"])
    p.add_node(component=eval_reader, name="EvalAnswers", inputs=["QAReader"])
    p.add_node(component=eval_reader_cross, name="EvalAnswers_cross", inputs=["QAReader"])
    p.add_node(component=eval_reader_vanilla, name="EvalAnswers_vanilla", inputs=["QAReader"])

    for l in labels:
        res = p.run(
            query=l.question,
            top_k_retriever=10,
            labels=l,
            top_k_reader=10,
            index="haystack_test_eval_document",
        )

    assert eval_retriever.recall == 1.0
    assert round(eval_reader.top_k_f1, 4) == 0.8333
    assert eval_reader.top_k_em == 0.5
    assert round(eval_reader.top_k_sas, 3) == 0.800
    assert round(eval_reader_cross.top_k_sas, 3) == 0.671
    assert eval_reader.top_k_em == eval_reader_vanilla.top_k_em
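# For intuition, the sas_model arguments above score semantic answer similarity (SAS)
# between predicted and gold answers with a sentence-transformers model. The sketch below
# only illustrates that idea (cosine similarity of sentence embeddings); it is not the
# internal EvalAnswers implementation, and cross-encoder models are scored differently.
from sentence_transformers import SentenceTransformer, util

def semantic_answer_similarity(prediction: str, gold: str,
                               model_name: str = "sentence-transformers/paraphrase-MiniLM-L3-v2") -> float:
    # embed both answer strings and compare them with cosine similarity
    model = SentenceTransformer(model_name)
    embeddings = model.encode([prediction, gold], convert_to_tensor=True)
    return float(util.cos_sim(embeddings[0], embeddings[1]))

# e.g. semantic_answer_similarity("Eddard Stark", "Ned Stark") is close to 1 for paraphrases
# and near 0 for unrelated answers, which is why SAS is more forgiving than exact match.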
    probability: Optional[float] = None
    context: Optional[str]
    offset_start: int
    offset_end: int
    offset_start_in_doc: Optional[int]
    offset_end_in_doc: Optional[int]
    document_id: Optional[str] = None
    meta: Optional[Dict[str, Any]]


class Response(BaseModel):
    query: str
    answers: List[Answer]


PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=QUERY_PIPELINE_NAME)
logger.info(f"Loaded pipeline nodes: {PIPELINE.graph.nodes.keys()}")
concurrency_limiter = RequestLimiter(CONCURRENT_REQUEST_PER_WORKER)


@router.post("/query", response_model=Response)
def query(request: Request):
    with concurrency_limiter.run():
        result = _process_request(PIPELINE, request)
        return result


def _process_request(pipeline, request) -> Response:
    start_time = time.time()

    filters = {}
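# A minimal client-side sketch for exercising the /query endpoint defined above.
# The Request model is not shown in this fragment, so the payload field names
# ("query", "filters", "top_k_retriever", "top_k_reader") and the host/port are
# assumptions, not a confirmed schema.
import requests

payload = {
    "query": "Who is the father of Arya Stark?",
    "filters": {},            # optional metadata filters (assumed field)
    "top_k_retriever": 10,    # assumed parameter name
    "top_k_reader": 5,        # assumed parameter name
}
response = requests.post("http://localhost:8000/query", json=payload)
response.raise_for_status()
# The Response model above guarantees a "query" string and an "answers" list
for answer in response.json()["answers"]:
    print(answer.get("answer"), answer.get("probability"))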
def tutorial5_evaluation():

    ##############################################
    # Settings
    ##############################################
    # Choose the evaluation style from ['retriever_closed', 'reader_closed', 'retriever_reader_open']
    # 'retriever_closed' - evaluates only the retriever, based on whether the gold_label document is retrieved.
    # 'reader_closed' - evaluates only the reader in a closed domain fashion, i.e. the reader is given one query
    #     and one document and metrics are calculated on whether the right position in this text is selected by
    #     the model as the answer span (i.e. SQuAD style)
    # 'retriever_reader_open' - evaluates retriever and reader in open domain fashion, i.e. a document is considered
    #     correctly retrieved if it contains the answer string within it. The reader is evaluated based purely on the
    #     predicted string, regardless of which document this came from and the position of the extracted span.
    style = "retriever_reader_open"

    # make sure these indices do not collide with existing ones, the indices will be wiped clean before data is inserted
    doc_index = "tutorial5_docs"
    label_index = "tutorial5_labels"

    ##############################################
    # Code
    ##############################################
    launch_es()
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Download evaluation data, which is a subset of Natural Questions development set containing 50 documents
    doc_dir = "../data/nq"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Connect to Elasticsearch
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document",
                                                create_index=False, embedding_field="emb",
                                                embedding_dim=768, excluded_meta_data=["emb"])

    # Add evaluation data to Elasticsearch document store
    # We first delete the custom tutorial indices to not have duplicate elements
    # and also split our documents into shorter passages using the PreProcessor
    preprocessor = PreProcessor(split_by="word", split_length=500, split_overlap=0,
                                split_respect_sentence_boundary=False,
                                clean_empty_lines=False, clean_whitespace=False)
    document_store.delete_all_documents(index=doc_index)
    document_store.delete_all_documents(index=label_index)
    document_store.add_eval_data(filename="../data/nq/nq_dev_subset_v2.json", doc_index=doc_index,
                                 label_index=label_index, preprocessor=preprocessor)

    # Let's prepare the labels that we need for the retriever and the reader
    labels = document_store.get_all_labels_aggregated(index=label_index)

    # Initialize Retriever
    retriever = ElasticsearchRetriever(document_store=document_store)

    # Alternative: Evaluate DensePassageRetriever
    # Note that DPR works best when you index short passages < 512 tokens, as only those tokens will be used for the embedding.
    # Here, for nq_dev_subset_v2.json we have an avg. num of tokens = 5220(!).
    # DPR still outperforms Elastic's BM25 by a small margin here.
    # retriever = DensePassageRetriever(document_store=document_store,
    #                                   query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    #                                   passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    #                                   use_gpu=True,
    #                                   embed_title=True,
    #                                   remove_sep_tok_from_untitled_passages=True)
    # document_store.update_embeddings(retriever, index=doc_index)

    # Initialize Reader
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", top_k=4, return_no_answer=True)

    # Here we initialize the nodes that perform evaluation
    eval_retriever = EvalDocuments()
    eval_reader = EvalAnswers(sas_model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

    ## Evaluate Retriever on its own in closed domain fashion
    if style == "retriever_closed":
        retriever_eval_results = retriever.eval(top_k=10, label_index=label_index, doc_index=doc_index)
        ## Retriever Recall is the proportion of questions for which the correct document containing the answer is
        ## among the retrieved documents
        print("Retriever Recall:", retriever_eval_results["recall"])
        ## Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank
        print("Retriever Mean Avg Precision:", retriever_eval_results["map"])

    # Evaluate Reader on its own in closed domain fashion (i.e. SQuAD style)
    elif style == "reader_closed":
        reader_eval_results = reader.eval(document_store=document_store, device=device,
                                          label_index=label_index, doc_index=doc_index)
        # Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch
        # reader_eval_results = reader.eval_on_file("../data/nq", "nq_dev_subset_v2.json", device=device)

        ## Reader Top-N-Accuracy is the proportion of predicted answers that match with their corresponding correct answer
        print("Reader Top-N-Accuracy:", reader_eval_results["top_n_accuracy"])
        ## Reader Exact Match is the proportion of questions where the predicted answer is exactly the same as the correct answer
        print("Reader Exact Match:", reader_eval_results["EM"])
        ## Reader F1-Score is the average overlap between the predicted answers and the correct answers
        print("Reader F1-Score:", reader_eval_results["f1"])

    # Evaluate combination of Reader and Retriever in open domain fashion
    elif style == "retriever_reader_open":
        # Here is the pipeline definition
        p = Pipeline()
        p.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
        p.add_node(component=eval_retriever, name="EvalDocuments", inputs=["ESRetriever"])
        p.add_node(component=reader, name="QAReader", inputs=["EvalDocuments"])
        p.add_node(component=eval_reader, name="EvalAnswers", inputs=["QAReader"])

        results = []
        for l in labels:
            res = p.run(
                query=l.question,
                top_k_retriever=10,
                labels=l,
                top_k_reader=10,
                index=doc_index,
            )
            results.append(res)

        eval_retriever.print()
        print()
        retriever.print_time()
        print()
        eval_reader.print(mode="reader")
        print()
        reader.print_time()
        print()
        eval_reader.print(mode="pipeline")

    else:
        raise ValueError(
            f'style={style} is not a valid option. Choose from retriever_closed, reader_closed, retriever_reader_open'
        )
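# Rough illustration of the open-domain notion used above: a query counts as a hit if any
# retrieved document contains one of the gold answer strings. This is a simplified sketch
# for intuition only, not the EvalDocuments node's actual implementation, and the label
# attribute names are assumptions about the aggregated label objects.
def open_domain_recall(retriever, labels, top_k: int = 10) -> float:
    hits = 0
    for label in labels:
        docs = retriever.retrieve(query=label.question, top_k=top_k)
        gold_answers = label.multiple_answers if hasattr(label, "multiple_answers") else [label.answer]
        # open-domain: any document containing the answer string counts as correctly retrieved
        if any(ans in doc.text for doc in docs for ans in gold_answers):
            hits += 1
    return hits / len(labels)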
def test_eval_pipeline(document_store: BaseDocumentStore, reader, retriever):
    # add eval data (SQUAD format)
    document_store.add_eval_data(
        filename="samples/squad/tiny.json",
        doc_index="haystack_test_eval_document",
        label_index="haystack_test_feedback",
    )

    labels = document_store.get_all_labels_aggregated(index="haystack_test_feedback")
    q_to_l_dict = {l.question: {"retriever": l, "reader": l} for l in labels}

    eval_retriever = EvalRetriever()
    eval_reader = EvalReader()

    assert document_store.get_document_count(index="haystack_test_eval_document") == 2

    p = Pipeline()
    p.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
    p.add_node(component=eval_retriever, name="EvalRetriever", inputs=["ESRetriever"])
    p.add_node(component=reader, name="QAReader", inputs=["EvalRetriever"])
    p.add_node(component=eval_reader, name="EvalReader", inputs=["QAReader"])

    for q, l in q_to_l_dict.items():
        res = p.run(
            query=q,
            top_k_retriever=10,
            labels=l,
            top_k_reader=10,
            index="haystack_test_eval_document",
        )

    assert eval_retriever.recall == 1.0
    assert round(eval_reader.top_k_f1, 4) == 0.8333
    assert eval_reader.top_k_em == 0.5
def main():
    # Note: open_domain, doc_index, label_index and top_k_retriever are expected to be
    # defined at module level in the original script; they are not set in this fragment.
    launch_es()
    document_store = ElasticsearchDocumentStore()
    es_retriever = ElasticsearchRetriever(document_store=document_store)
    eval_retriever = EvalRetriever(open_domain=open_domain)

    reader = FARMReader("deepset/roberta-base-squad2", top_k_per_candidate=4, num_processes=1, return_no_answer=True)
    eval_reader = EvalReader(debug=True, open_domain=open_domain)

    # Download evaluation data, which is a subset of Natural Questions development set containing 50 documents
    doc_dir = "../data/nq"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Add evaluation data to Elasticsearch document store
    # We first delete the custom tutorial indices to not have duplicate elements
    preprocessor = PreProcessor(split_length=500, split_overlap=0, split_respect_sentence_boundary=False,
                                clean_empty_lines=False, clean_whitespace=False)
    document_store.delete_all_documents(index=doc_index)
    document_store.delete_all_documents(index=label_index)
    document_store.add_eval_data(filename="../data/nq/nq_dev_subset_v2.json", doc_index=doc_index,
                                 label_index=label_index, preprocessor=preprocessor)
    labels = document_store.get_all_labels_aggregated(index=label_index)
    q_to_l_dict = {l.question: {"retriever": l, "reader": l} for l in labels}

    # Here is the pipeline definition
    p = Pipeline()
    p.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
    p.add_node(component=eval_retriever, name="EvalRetriever", inputs=["ESRetriever"])
    p.add_node(component=reader, name="QAReader", inputs=["EvalRetriever"])
    p.add_node(component=eval_reader, name="EvalReader", inputs=["QAReader"])

    results = []
    for i, (q, l) in enumerate(q_to_l_dict.items()):
        res = p.run(
            query=q,
            top_k_retriever=top_k_retriever,
            labels=l,
            top_k_reader=10,
            index=doc_index,
            # skip_incorrect_retrieval=True
        )
        results.append(res)

    eval_retriever.print()
    print()
    es_retriever.print_time()
    print()
    eval_reader.print(mode="reader")
    print()
    reader.print_time()
    print()
    eval_reader.print(mode="pipeline")
def setup(self):
    print("SETTING UP PIPELINE")
    self.document_store = ElasticsearchDocumentStore(
        similarity="dot_product", host="elasticsearch", username="", password="", index="document")
    self.document_store_faiss = FAISSDocumentStore(
        index="document",
        faiss_index_factory_str="Flat",
        return_embedding=True,
        sql_url=f"postgresql://{config('POSTGRES_USER')}:{config('POSTGRES_PASSWORD')}@{config('POSTGRES_HOST')}:{config('POSTGRES_PORT')}/faiss"
    )
    processor, converter = self.write_as4_docs()
    table_data = self.write_table_docs(converter, processor)

    es_retriever = ElasticsearchRetriever(document_store=self.document_store)

    print("SETTING UP DPR")
    dpr_retriever = DPRTrainingManager.get_current_retriever(self.document_store_faiss)

    print("SETTING UP EMBEDDINGS")
    embedding_retriever = EmbeddingRetriever(
        document_store=self.document_store_faiss,
        embedding_model="deepset/sentence_bert"
    )
    query_classifier = QueryClassifier()

    print("SETTING UP TABLE")
    table_retriever = TableRetriever(table_data)

    print("SETUP RETRIEVERS")
    self.question_generator = FurtherQuestionGenerator()

    print("UPDATING EMBEDDINGS")
    self.document_store_faiss.update_embeddings(dpr_retriever)
    print("UPDATED EMBEDDINGS")

    self.dpr_node = ContinualDPRNode(dpr_retriever, self.document_store_faiss)
    result = Result()
    self.trainer = DPRTrainingManager(self.document_store_faiss, self.dpr_node)

    print("SETUP COMPONENTS")
    pipeline = Pipeline()
    pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
    pipeline.add_node(component=self.dpr_node, name="DPRRetriever", inputs=["Query"])
    pipeline.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
    pipeline.add_node(component=JoinDocuments(join_mode="merge"), name="JoinResults",
                      inputs=["DPRRetriever", "EmbeddingRetriever", "ESRetriever"])
    pipeline.add_node(component=query_classifier, name="QueryClassifier", inputs=["JoinResults"])
    pipeline.add_node(component=self.question_generator, name="QnGenerator", inputs=["QueryClassifier.output_1"])
    pipeline.add_node(component=table_retriever, name="TableRetriever", inputs=["QueryClassifier.output_2"])
    pipeline.add_node(component=result, name="Result", inputs=["QnGenerator", "TableRetriever"])
    self.pipeline = pipeline
    print("SETUP PIPELINE")
def tutorial14_query_classifier():
    # Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    got_dicts = convert_files_to_dicts(
        dir_path=doc_dir,
        clean_func=clean_wiki_text,
        split_paragraphs=True
    )

    # Initialize DocumentStore and index documents
    launch_es()
    document_store = ElasticsearchDocumentStore()
    document_store.delete_all_documents()
    document_store.write_documents(got_dicts)

    # Initialize Sparse retriever
    es_retriever = ElasticsearchRetriever(document_store=document_store)

    # Initialize dense retriever
    dpr_retriever = DensePassageRetriever(document_store)
    document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

    # Here we build the pipeline
    sklearn_keyword_classifier = Pipeline()
    sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"])
    sklearn_keyword_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
    sklearn_keyword_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
    sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
    sklearn_keyword_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = sklearn_keyword_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "=" * 15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = sklearn_keyword_classifier.run(
        query="arya stark father",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "=" * 15)
    print_answers(res_2)

    # Run only the dense retriever on the full sentence query
    res_3 = sklearn_keyword_classifier.run(
        query="which country was jon snow filmed ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "=" * 15)
    print_answers(res_3)

    # Run only the sparse retriever on a keyword based query
    res_4 = sklearn_keyword_classifier.run(
        query="jon snow country",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "=" * 15)
    print_answers(res_4)

    # Run only the dense retriever on the full sentence query
    res_5 = sklearn_keyword_classifier.run(
        query="who are the younger brothers of arya stark ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "=" * 15)
    print_answers(res_5)

    # Run only the sparse retriever on a keyword based query
    res_6 = sklearn_keyword_classifier.run(
        query="arya stark younger brothers",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "=" * 15)
    print_answers(res_6)

    # Here we build the pipeline
    transformer_keyword_classifier = Pipeline()
    transformer_keyword_classifier.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"])
    transformer_keyword_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
    transformer_keyword_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
    transformer_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
    transformer_keyword_classifier.draw("pipeline_classifier.png")

    # Run only the dense retriever on the full sentence query
    res_1 = transformer_keyword_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "=" * 15)
    print_answers(res_1)

    # Run only the sparse retriever on a keyword based query
    res_2 = transformer_keyword_classifier.run(
        query="arya stark father",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "=" * 15)
    print_answers(res_2)

    # Run only the dense retriever on the full sentence query
    res_3 = transformer_keyword_classifier.run(
        query="which country was jon snow filmed ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "=" * 15)
    print_answers(res_3)

    # Run only the sparse retriever on a keyword based query
    res_4 = transformer_keyword_classifier.run(
        query="jon snow country",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "=" * 15)
    print_answers(res_4)

    # Run only the dense retriever on the full sentence query
    res_5 = transformer_keyword_classifier.run(
        query="who are the younger brothers of arya stark ?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "=" * 15)
    print_answers(res_5)

    # Run only the sparse retriever on a keyword based query
    res_6 = transformer_keyword_classifier.run(
        query="arya stark younger brothers",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "=" * 15)
    print_answers(res_6)

    # Here we build the pipeline
    transformer_question_classifier = Pipeline()
    transformer_question_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
    transformer_question_classifier.add_node(
        component=TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier"),
        name="QueryClassifier", inputs=["DPRRetriever"])
    transformer_question_classifier.add_node(component=reader, name="QAReader", inputs=["QueryClassifier.output_1"])
    transformer_question_classifier.draw("question_classifier.png")

    # Run the QA reader on the question query
    res_1 = transformer_question_classifier.run(
        query="Who is the father of Arya Stark?",
        top_k_retriever=10
    )
    print("DPR Results" + "\n" + "=" * 15)
    print_answers(res_1)

    # Show only the DPR results for the statement query (the reader is skipped)
    res_2 = transformer_question_classifier.run(
        query="Arya Stark was the daughter of a Lord.",
        top_k_retriever=10
    )
    print("ES Results" + "\n" + "=" * 15)
    print(res_2)

    # Here we create the keyword vs question/statement query classifier
    queries = ["arya stark father", "jon snow country",
               "who is the father of arya stark", "which country was jon snow filmed?"]

    keyword_classifier = TransformersQueryClassifier()

    for query in queries:
        result = keyword_classifier.run(query=query)
        if result[1] == "output_1":
            category = "question/statement"
        else:
            category = "keyword"
        print(f"Query: {query}, raw_output: {result}, class: {category}")

    # Here we create the question vs statement query classifier
    queries = ["Lord Eddard was the father of Arya Stark.", "Jon Snow was filmed in United Kingdom.",
               "who is the father of arya stark?", "Which country was jon snow filmed in?"]

    question_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier")

    for query in queries:
        result = question_classifier.run(query=query)
        if result[1] == "output_1":
            category = "question"
        else:
            category = "statement"
        print(f"Query: {query}, raw_output: {result}, class: {category}")
document_store = FAISSDocumentStore.load(faiss, sql_url=sql_url, index='document')
print("Loaded document store")

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    batch_size=64,
    embed_title=True)
print("Loaded retriever")

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, no_ans_boost=-10,
                    context_window_size=500, top_k_per_candidate=3, top_k_per_sample=1,
                    num_processes=1, max_seq_len=256, doc_stride=128, progress_bar=False)
print("Loaded reader")

p = Pipeline()
p.add_node(component=retriever, name="ESRetriever1", inputs=["Query"])
p.add_node(component=reader, name="QAReader", inputs=["ESRetriever1"])

testing.test(p, dataset, jsonify=True, output_file=output_path)
def tutorial11_pipelines():
    # Download and prepare data - 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # convert files to dicts containing documents that can be indexed to our datastore
    got_dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

    # Initialize DocumentStore and index documents
    launch_es()
    document_store = ElasticsearchDocumentStore()
    document_store.delete_all_documents()
    document_store.write_documents(got_dicts)

    # Initialize Sparse retriever
    es_retriever = ElasticsearchRetriever(document_store=document_store)

    # Initialize dense retriever
    dpr_retriever = DensePassageRetriever(document_store)
    document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)

    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

    ######################
    # Prebuilt Pipelines #
    ######################

    # Extractive QA Pipeline
    ########################

    p_extractive_premade = ExtractiveQAPipeline(reader=reader, retriever=es_retriever)
    res = p_extractive_premade.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
    print_answers(res, details="minimal")

    # Document Search Pipeline
    ##########################

    p_retrieval = DocumentSearchPipeline(es_retriever)
    res = p_retrieval.run(query="Who is the father of Arya Stark?", top_k_retriever=10)
    print_documents(res, max_text_len=200)

    # Generator Pipeline
    ##########################

    # We set this to True so that the document store returns document embeddings
    # with each document, this is needed by the Generator
    document_store.return_embedding = True

    # Initialize generator
    rag_generator = RAGenerator()

    # Generative QA
    p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever)
    res = p_generator.run(query="Who is the father of Arya Stark?", top_k_retriever=10)
    print_answers(res, details="minimal")

    # We are setting this to False so that in later pipelines,
    # we get a cleaner printout
    document_store.return_embedding = False

    ##############################
    # Creating Pipeline Diagrams #
    ##############################

    p_extractive_premade.draw("pipeline_extractive_premade.png")
    p_retrieval.draw("pipeline_retrieval.png")
    p_generator.draw("pipeline_generator.png")

    ####################
    # Custom Pipelines #
    ####################

    # Extractive QA Pipeline
    ########################

    # Custom built extractive QA pipeline
    p_extractive = Pipeline()
    p_extractive.add_node(component=es_retriever, name="Retriever", inputs=["Query"])
    p_extractive.add_node(component=reader, name="Reader", inputs=["Retriever"])

    # Now we can run it
    res = p_extractive.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
    print_answers(res, details="minimal")
    p_extractive.draw("pipeline_extractive.png")

    # Ensembled Retriever Pipeline
    ##############################

    # Create ensembled pipeline
    p_ensemble = Pipeline()
    p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
    p_ensemble.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
    p_ensemble.add_node(component=JoinDocuments(join_mode="concatenate"), name="JoinResults",
                        inputs=["ESRetriever", "DPRRetriever"])
    p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"])
    p_ensemble.draw("pipeline_ensemble.png")

    # Run pipeline
    res = p_ensemble.run(
query="Who is the father of Arya Stark?", top_k_retriever=5 #This is top_k per retriever ) print_answers(res, details="minimal") # Query Classification Pipeline ############################### # Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run. # Though this looks very similar to the ensembled pipeline shown above, # the key difference is that only one of the retrievers is run for each request. # By contrast both retrievers are always run in the ensembled approach. class QueryClassifier(): outgoing_edges = 2 def run(self, **kwargs): if "?" in kwargs["query"]: return (kwargs, "output_2") else: return (kwargs, "output_1") # Here we build the pipeline p_classifier = Pipeline() p_classifier.add_node(component=QueryClassifier(), name="QueryClassifier", inputs=["Query"]) p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"]) p_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"]) p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"]) p_classifier.draw("pipeline_classifier.png") # Run only the dense retriever on the full sentence query res_1 = p_classifier.run(query="Who is the father of Arya Stark?", top_k_retriever=10) print("DPR Results" + "\n" + "=" * 15) print_answers(res_1) # Run only the sparse retriever on a keyword based query res_2 = p_classifier.run(query="Arya Stark father", top_k_retriever=10) print("ES Results" + "\n" + "=" * 15) print_answers(res_2)
docs = [doc[1] for doc in kwargs["result"]] return (docs, "output_1") else: return (kwargs["result"]["answer"], "output_1") result = Result() """## Assembling into Pipeline""" # Current approach from haystack import Pipeline from haystack.pipeline import JoinDocuments # Building new pipeline with multiple retrievers pipeline = Pipeline() pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"]) pipeline.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"]) pipeline.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"]) pipeline.add_node(component=JoinDocuments(join_mode="merge"), name="JoinResults", inputs=[ "DPRRetriever", "EmbeddingRetriever", "ESRetriever"]) pipeline.add_node(component=query_classifier, name="QueryClassifier", inputs=["JoinResults"]) pipeline.add_node(component=question_generator, name="QnGenerator", inputs=["QueryClassifier.output_1"]) pipeline.add_node(component=table_retriever, name="TableRetriever", inputs=[ "QueryClassifier.output_2"]) pipeline.add_node(component=result, name="Result", inputs=[ "QnGenerator", "TableRetriever"])