def test_context_window_size(test_docs_xs):
    """Verify that answer contexts honour the reader's ``context_window_size``.

    A new FARMReader is constructed for every tested window size, because the
    window size is passed to the constructor. Each returned answer is then
    checked against the expected context length.
    """
    # TODO parametrize window_size and farm/transformers reader using pytest
    docs = []
    for raw_doc in test_docs_xs:
        docs.append(Document.from_dict(raw_doc) if isinstance(raw_doc, dict) else raw_doc)

    for window_size in (10, 15, 20):
        farm_reader = FARMReader(
            model_name_or_path="distilbert-base-uncased-distilled-squad",
            num_processes=0,
            use_gpu=False,
            top_k_per_sample=5,
            no_ans_boost=None,
            context_window_size=window_size,
        )
        prediction = farm_reader.predict(
            question="Who lives in Berlin?",
            documents=docs,
            top_k=5,
        )

        for answer in prediction["answers"]:
            answer_len = len(answer["answer"])
            context_len = len(answer["context"])
            # TODO Currently the behaviour of context_window_size in FARMReader
            # and TransformerReader is different
            if answer_len > window_size:
                # An answer longer than the window expands the window to the
                # answer itself.
                assert answer_len == context_len
            else:
                # Odd-length answers yield a window one character shorter than
                # requested due to rounding (see FARM's QACandidate).
                assert context_len in [window_size, window_size - 1]
def main():
    """Command-line entry point: train, evaluate, or interactively query a reader.

    The mode is selected by the docopt commands parsed from the module
    docstring:

    * ``train`` fine-tunes the distilbert SQuAD reader on ``--train_file_name``
      / ``--dev_file_name`` inside ``--data_dir`` and saves it to ``--save_dir``.
    * ``test`` loads the reader from ``--save_dir`` and prints the result of
      evaluating it on ``--eval_file_name``.
    * ``cli`` loads every ``*.txt`` file in ``--data_dir`` as one document and
      answers questions typed on stdin until the user presses CTRL-C (or
      stdin is closed).
    """
    args = docopt(__doc__)
    data_dir = args["--data_dir"]

    if args["train"]:
        reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=False)
        reader.train(
            data_dir=data_dir,
            train_filename=args["--train_file_name"],
            dev_filename=args["--dev_file_name"],
            use_gpu=False,
            n_epochs=1,
            save_dir=args["--save_dir"],
            dev_split=0.05,
        )

    if args["test"]:
        reader = FARMReader(model_name_or_path=args["--save_dir"], use_gpu=False)
        print(reader.eval_on_file(data_dir, args["--eval_file_name"], 'cpu'))

    if args["cli"]:
        reader = FARMReader(model_name_or_path=args["--save_dir"], use_gpu=False)

        # One document per text file. The id is the running file index so that
        # every document gets a distinct id (previously all of them were "0").
        query_doc_list = []
        for doc_index, text_file in enumerate(glob.glob(data_dir + '/*.txt')):
            with open(text_file, "r") as f:
                context = f.read()
            query_doc_list.append(Document(id=str(doc_index), text=context))

        try:
            while True:
                question = input("CTRL C to exit >")
                prediction = reader.predict(question, query_doc_list)
                answers = prediction['answers']
                if not answers:
                    # Nothing to index into; report it instead of crashing.
                    print("answer:>> ", None)
                    print("-------------")
                    continue
                best = answers[0]
                print("answer:>> ", best['answer'])
                print("-----")
                print("context:>> ", best['context'])
                print("-------------")
        except (KeyboardInterrupt, EOFError):
            # The prompt advertises CTRL-C as the way out, so leave quietly
            # rather than with a traceback.
            print()
def test_top_k(test_docs_xs):
    """Check that ``predict`` returns exactly ``top_k`` answers for several values."""
    # TODO parametrize top_k and farm/transformers reader using pytest
    # TODO transformers reader was crashing when tested on this
    docs = []
    for raw_doc in test_docs_xs:
        docs.append(Document.from_dict(raw_doc) if isinstance(raw_doc, dict) else raw_doc)

    farm_reader = FARMReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        num_processes=0,
        use_gpu=False,
        top_k_per_sample=4,
        no_ans_boost=None,
        top_k_per_candidate=4,
    )

    for top_k in (2, 5, 10):
        prediction = farm_reader.predict(
            question="Who lives in Berlin?",
            documents=docs,
            top_k=top_k,
        )
        answer_count = len(prediction["answers"])
        assert answer_count == top_k
def farm_distilbert():
    """Return a FARMReader for the distilled DistilBERT SQuAD model (``use_gpu=False``)."""
    reader_kwargs = {
        "model_name_or_path": "distilbert-base-uncased-distilled-squad",
        "use_gpu": False,
        "top_k_per_sample": 5,
        "num_processes": 0,
    }
    return FARMReader(**reader_kwargs)
def feed_documents_to_model(model_name="deepset/roberta-base-squad2-covid"):
    """Index the articles in memory and return a Finder ready for predictions.

    Parameters
    ----------
    model_name : str
        Model name or path handed to ``FARMReader``. Defaults to
        ``deepset/roberta-base-squad2-covid``.

    Returns
    -------
    finder
        A ``Finder`` combining a FARM reader with a TF-IDF retriever over the
        freshly populated in-memory document store.
    """
    # Fetch the articles and turn them into the dict form the store accepts.
    article_dicts = process_documents(ret.get_data(MANIFEST, ARTICLES_FOLDER, []))

    # Everything lives in an in-memory store.
    store = InMemoryDocumentStore()
    store.write_documents(article_dicts)

    # The retriever narrows the corpus down to the relevant documents;
    # TF-IDF is used here because it is fast for development purposes.
    tfidf_retriever = TfidfRetriever(document_store=store)

    # The reader wraps the pre-trained transformer model.
    farm_reader = FARMReader(model_name_or_path=model_name, use_gpu=False)

    # The finder ties both together and produces the predictions.
    return Finder(farm_reader, tfidf_retriever)
def reader_without_normalized_scores():
    """Return a distilbert FARMReader created with ``use_confidence_scores=False``."""
    reader_kwargs = {
        "model_name_or_path": "distilbert-base-uncased-distilled-squad",
        "use_gpu": False,
        "top_k_per_sample": 5,
        "num_processes": 0,
        "use_confidence_scores": False,
    }
    return FARMReader(**reader_kwargs)
def initFinder():
    """Create the DPR retriever and FARM reader and wrap them in a Finder.

    Uses the module-level ``document_store`` and refreshes its embeddings with
    the newly created retriever before the finder is built.

    Returns
    -------
    finder (object): Haystack finder
    """
    dpr_settings = dict(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
        embed_title=True,
        max_seq_len=256,
        batch_size=16,
        remove_sep_tok_from_untitled_passages=True,
    )
    dpr_retriever = DensePassageRetriever(**dpr_settings)

    # Important: once DPR is initialised, update_embeddings() has to run over
    # all previously indexed documents to refresh their embeddings. This can
    # be slow for a large corpus but is only needed once; at query time only
    # the query is embedded and compared against the stored embeddings.
    document_store.update_embeddings(dpr_retriever)

    farm_reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
    finder = Finder(farm_reader, dpr_retriever)
    return finder
def init():
    """Populate the module-level document store, retriever, reader and finder."""
    global document_store, retriever, reader, finder

    # Elasticsearch connection settings for the document store.
    es_settings = {
        "host": "localhost",
        "username": "",
        "password": "",
        "index": "document",
    }
    document_store = ElasticsearchDocumentStore(**es_settings)
    retriever = ElasticsearchRetriever(document_store=document_store)
    reader = FARMReader(
        model_name_or_path='deepset/roberta-base-squad2-covid',
        use_gpu=False,
    )
    finder = Finder(reader=reader, retriever=retriever)
def load(self):
    """Lazily build both search stacks, creating only the parts still missing.

    Returns immediately when both finders already exist. Otherwise every
    component (stores, retrievers, reader, finders) that is still falsy is
    created; components that already exist are left untouched.
    """
    if self.finder and self.finder2:
        return

    # --- FAQ stack: FAISS store + embedding retriever, no reader -----------
    if not self.document_store2:
        # The FAISS index must have been saved during preprocessing.
        self.document_store2 = FAISSDocumentStore.load(
            sql_url=sqlUrlFAQ,
            faiss_file_path='faiss2',
        )
        self.initSql(url=sqlUrlFAQ, document_store=self.document_store2)

    if not self.retriever2:
        self.retriever2 = EmbeddingRetriever(
            document_store=self.document_store2,
            embedding_model="sentence_bert-saved",
            use_gpu=False,
        )

    if not self.finder2:
        self.finder2 = Finder(reader=None, retriever=self.retriever2)

    # --- Extractive QA stack: SQL store + TF-IDF retriever + FARM reader ---
    if not self.document_store:
        self.document_store = SQLDocumentStore(url=sqlUrl)
        self.initSql(url=sqlUrl, document_store=self.document_store)

    if not self.retriever:
        self.retriever = TfidfRetriever(document_store=self.document_store)

    if not self.reader:
        self.reader = FARMReader(model_name_or_path=modelDir, use_gpu=False, no_ans_boost=0)

    if not self.finder:
        self.finder = Finder(self.reader, self.retriever)
def qna():
    """Return the n answers.

    Reads ``question``, ``index``, ``mode`` and ``n`` from the submitted form,
    runs the question through a freshly built Finder and returns the answer
    strings as a JSON document.
    """
    form = request.form
    question = form['question']
    # Target index the query is sent to.
    index = form['index']
    # Selects the trained or the pre-trained model.
    mode = form['mode']

    # Haystack Elasticsearch document storage.
    document_store = ElasticsearchDocumentStore(
        host=app.config["host"],
        username=app.config["username"],
        password=app.config["password"],
        index=index,
    )

    # Pick the model according to the requested search mode.
    if mode == 'trained':
        model_source = app.config["train_model"]
    else:
        model_source = "distilbert-base-uncased-distilled-squad"
    reader = FARMReader(model_name_or_path=model_source, use_gpu=False)

    retriever = ElasticsearchRetriever(document_store=document_store)

    # The Finder chains reader and retriever into one question-answering pipeline.
    finder = Finder(reader, retriever)

    # Number of answers to predict.
    n = int(form['n'])
    prediction = finder.get_answers(question=question, top_k_retriever=10, top_k_reader=n)

    answer = [res['answer'] for res in prediction['answers']]
    payload = {
        'status': 'success',
        'message': 'Process succesfully',
        'result': answer,
    }
    return json.dumps(payload)
def no_answer_reader(request):
    """Build the roberta-base-squad2 reader selected by ``request.param``.

    ``"farm"`` yields a FARMReader, ``"transformers"`` a TransformersReader;
    any other value results in ``None``.
    """
    reader_kind = request.param
    model_id = "deepset/roberta-base-squad2"

    if reader_kind == "farm":
        return FARMReader(
            model_name_or_path=model_id,
            use_gpu=False,
            top_k_per_sample=5,
            no_ans_boost=0,
            num_processes=0,
        )
    elif reader_kind == "transformers":
        return TransformersReader(
            model=model_id,
            tokenizer=model_id,
            use_gpu=-1,
            n_best_per_passage=5,
        )
    return None
def reader(request):
    """Build the distilbert reader selected by ``request.param``.

    ``"farm"`` yields a FARMReader, ``"transformers"`` a TransformersReader;
    any other value results in ``None``.
    """
    reader_kind = request.param
    model_id = "distilbert-base-uncased-distilled-squad"

    if reader_kind == "farm":
        return FARMReader(
            model_name_or_path=model_id,
            use_gpu=False,
            top_k_per_sample=5,
            num_processes=0,
        )
    elif reader_kind == "transformers":
        return TransformersReader(
            model=model_id,
            tokenizer="distilbert-base-uncased",
            use_gpu=-1,
        )
    return None
def farm_roberta():
    """Return a FARMReader for ``deepset/roberta-base-squad2`` (``use_gpu=False``)."""
    reader_kwargs = {
        "model_name_or_path": "deepset/roberta-base-squad2",
        "use_gpu": False,
        "top_k_per_sample": 5,
        "no_ans_boost": 0,
        "num_processes": 0,
    }
    return FARMReader(**reader_kwargs)
def test_top_k(test_docs_xs):
    """Check that ``predict`` returns exactly ``top_k`` answers for several values."""
    # TODO parametrize top_k and farm/transformers reader using pytest
    # TODO transformers reader was crashing when tested on this
    docs = [
        Document(id=raw["meta"]["name"], text=raw["text"], meta=raw["meta"])
        for raw in test_docs_xs
    ]

    farm_reader = FARMReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        num_processes=0,
        use_gpu=False,
        top_k_per_sample=4,
        no_ans_boost=None,
        top_k_per_candidate=4,
    )

    for top_k in (2, 5, 10):
        prediction = farm_reader.predict(
            question="Who lives in Berlin?",
            documents=docs,
            top_k=top_k,
        )
        answer_count = len(prediction["answers"])
        assert answer_count == top_k
def __init__(self):
    """Wire up the Elasticsearch store, retriever, FARM reader and Finder."""
    es_settings = {
        "host": "localhost",
        "username": "",
        "password": "",
        "index": "document",
    }
    self.document_store = ElasticsearchDocumentStore(**es_settings)
    self.retriever = ElasticsearchRetriever(document_store=self.document_store)
    self.reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
    self.finder = Finder(reader=self.reader, retriever=self.retriever)
    # Signal on stdout that construction has finished.
    print('Ready')
def process(document_store):
    """Build an extractive QA pipeline on ``document_store`` and install it.

    An ``ElasticsearchRetriever`` and a roberta-base-squad2 ``FARMReader`` are
    combined into an ``ExtractiveQAPipeline``, which is then stored on
    ``query_Handler.pipe``. Nothing is returned.
    """
    logger = logging.getLogger(__name__)

    # Retriever: narrows the search down to smaller units of text that might
    # contain the answer, using a simple but fast algorithm (here
    # Elasticsearch's default BM25).
    es_retriever = ElasticsearchRetriever(document_store=document_store)

    # Reader: scans the retrieved texts in detail and extracts the k best
    # answers with a slower but more powerful deep learning model. Here a
    # medium sized RoBERTa QA model (https://huggingface.co/deepset/roberta-base-squad2)
    # is loaded through FARM. Alternatives: TransformersReader, or models such
    # as "distilbert-base-uncased-distilled-squad" (fast) or
    # "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy).
    farm_reader = FARMReader(
        model_name_or_path="deepset/roberta-base-squad2",
        use_gpu=False,
        context_window_size=500,
    )

    # Other predefined pipelines that could be used instead:
    # DocumentSearchPipeline, GenerativeQAPipeline, FAQPipeline.
    query_Handler.pipe = ExtractiveQAPipeline(farm_reader, es_retriever)
def __init__(self, id, add_sample_data=False):
    """Initialise the model with a Finder over its own Elasticsearch index.

    Args:
        id: Identifier forwarded to ``Model.__init__``; ``self.id`` is then
            used as the Elasticsearch index name.
        add_sample_data: When true, ``add_sample_data_doc_qa`` is called on
            this instance after the finder has been created.
    """
    Model.__init__(self, id)

    store = ElasticsearchDocumentStore(host=DB_HOST, port=DB_PORT, index=self.id)
    es_retriever = ElasticsearchRetriever(document_store=store)
    farm_reader = FARMReader(
        model_name_or_path=READER_MODEL_PATH,
        batch_size=BATCHSIZE,
        use_gpu=False,
        num_processes=MAX_PROCESSES,
    )
    self.finder = Finder(farm_reader, es_retriever)

    if add_sample_data:
        add_sample_data_doc_qa(self)

    # NOTE(review): the save is treated as unconditional here; confirm it was
    # not meant to run only when add_sample_data is set.
    farm_reader.save(directory=READER_MODEL_PATH)
    print("saved")
def get_results(txt_files_location, use_gpu, questions_list, results_location):
    """Answer every question for every text file and write per-file result logs.

    Each directory below the top-level sub-directories of
    ``txt_files_location`` is visited. For every file found there, the
    ``document`` index is reset and the file's directory is indexed. All
    questions are then asked, and the printed answers are appended to
    ``<results_location>/<file name without last 4 chars>_results.txt``.

    Args:
        txt_files_location: Root directory whose sub-directories hold the
            text files.
        use_gpu: Forwarded to ``FARMReader``.
        questions_list: Iterable of question strings.
        results_location: Directory receiving the ``*_results.txt`` files.
    """
    # Local import so the block does not rely on a file-level import.
    from contextlib import redirect_stdout

    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document")
    # The reader model is independent of the indexed documents, so it is
    # loaded at most once (on first use) instead of once per file.
    reader = None

    try:
        # Only the immediate sub-directories of the root are used as walk
        # roots. Each one is walked recursively exactly once, with no shadowed
        # loop variables and no re-walks of directories sharing a name.
        _, top_level_dirs, _ = next(os.walk(txt_files_location), (None, [], []))
        for top_level_dir in top_level_dirs:
            walk_root = os.path.join(txt_files_location, top_level_dir)
            for doc_dir, _subdirs, files in os.walk(walk_root):
                for file_name in files:
                    # Start from an empty index; 400/404 are ignored so a
                    # missing index is not an error.
                    document_store.client.indices.delete(index='document',
                                                         ignore=[400, 404])
                    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                                   clean_func=clean_wiki_text,
                                                   split_paragraphs=True)
                    document_store.write_documents(dicts)

                    retriever = ElasticsearchRetriever(
                        document_store=document_store)
                    if reader is None:
                        reader = FARMReader(
                            model_name_or_path="elgeish/cs224n-squad2.0-albert-xxlarge-v1",
                            use_gpu=use_gpu)
                    finder = Finder(reader, retriever)

                    # file_name[:-4] drops a 4-character suffix such as ".txt".
                    results_path = os.path.join(
                        results_location, file_name[:-4] + "_results.txt")
                    # redirect_stdout restores sys.stdout afterwards, and the
                    # with-block closes the file even if a prediction fails.
                    with open(results_path, "a+") as results_file, \
                            redirect_stdout(results_file):
                        for i, question in enumerate(questions_list):
                            prediction = finder.get_answers(question=question,
                                                            top_k_retriever=10,
                                                            top_k_reader=1)
                            print("\n\n\nQuestion " + str(i + 1) + ":\n")
                            print(question + "\n")
                            print_answers(prediction, details="minimal")
    finally:
        # Release the Elasticsearch connection once all work is done.
        document_store.client.transport.close()
def qa(self, question, text_field):
    """Answer ``question`` against this object's Elasticsearch index.

    Args:
        question: The question text passed to the Finder.
        text_field: Forwarded to ``ElasticsearchDocumentStore`` as ``text_field``.

    Returns:
        The prediction returned by ``Finder.get_answers`` (1 retrieved
        document, up to 5 reader answers).
    """
    store = ElasticsearchDocumentStore(
        host=ES_HOST,
        username=ES_USERNAME,
        password=ES_PASSWORD,
        index=self.ELASTIC_INDEX,
        text_field=text_field,
    )
    tfidf_retriever = TfidfRetriever(document_store=store)
    farm_reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
    finder = Finder(farm_reader, tfidf_retriever)
    return finder.get_answers(question=question, top_k_retriever=1, top_k_reader=5)
def Find_answer(text_file_path, data_folder_path, symbol, question):
    """Split a text file into documents, index them and answer ``question``.

    Args:
        text_file_path: UTF-8 text file that is read in full.
        data_folder_path: Folder receiving one ``data<N>.txt`` file per chunk;
            the folder is then indexed as a whole.
        symbol: Separator on which the text is split into chunks.
        question: Query passed to the extractive QA pipeline.

    Returns:
        The distinct answer contexts, in ranking order, joined by single spaces.
    """
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

    with open(text_file_path, 'r', encoding='utf-8') as source_file:
        data = source_file.read()

    # Write every chunk to its own numbered file (numbering starts at 1).
    chunks = data.split(symbol)
    for i, line in enumerate(chunks):
        chunk_path = f'{data_folder_path}/data{i+1}.txt'
        with open(chunk_path, 'w') as chunk_file:
            print(f'writing file no.{i+1}')
            chunk_file.write(line)

    test_dicts = convert_files_to_dicts(
        dir_path=data_folder_path,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )
    document_store.write_documents(test_dicts)

    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=64,
        max_seq_len_passage=256,
        batch_size=16,
        use_gpu=True,
        embed_title=True,
        use_fast_tokenizers=True,
    )
    document_store.update_embeddings(retriever)

    reader = FARMReader(
        model_name_or_path="deepset/roberta-base-squad2",
        use_gpu=True,
        context_window_size=300,
    )
    pipe = ExtractiveQAPipeline(reader, retriever)
    prediction = pipe.run(query=question, top_k_retriever=10, top_k_reader=3)

    # Keep each context once, preserving first-seen order.
    doc_with_ans = []
    for candidate in prediction['answers']:
        context = candidate['context']
        if context in doc_with_ans:
            continue
        doc_with_ans.append(context)

    return ' '.join(doc_with_ans)
def tutorial2_finetune_a_model_on_your_data():
    """Tutorial 2: fine-tune a reader on your own SQuAD-style data.

    Creating training data -- there are two ways:

    1. Annotation: label your data with the annotation tool
       (https://github.com/deepset-ai/haystack#labeling-tool), i.e. highlight
       answers to your questions in a document. The labels can be exported in
       SQuAD format, which is compatible with training in Haystack.
    2. Feedback: for production systems, collect training data from direct
       user feedback via Haystack's REST API and export it for further
       fine-tuning.

    Fine-tuning -- a reader is initialised from a base model and trained on the
    custom dataset. A base model already trained on SQuAD or a similar QA
    dataset is recommended, to benefit from transfer learning.
    """
    # Recommendation: run training on a GPU (both `use_gpu` arguments below
    # are set to True for that reason).
    base_model = "distilbert-base-uncased-distilled-squad"
    reader = FARMReader(model_name_or_path=base_model, use_gpu=True)

    train_data = "data/squad20"
    # train_data = "PATH/TO_YOUR/TRAIN_DATA"
    reader.train(
        data_dir=train_data,
        train_filename="dev-v2.0.json",
        use_gpu=True,
        n_epochs=1,
        save_dir="my_model",
    )

    # Training already saves the model into `save_dir`; this shows how to save
    # a reader manually as well.
    reader.save(directory="my_model")

    # Loading the saved model again at a later point works like this.
    new_reader = FARMReader(model_name_or_path="my_model")
def create_app(config_name):
    """Application factory: build the Flask app and attach a Haystack Finder.

    Args:
        config_name: Key into the ``config`` mapping that selects the settings
            object to load.

    Returns:
        The configured Flask application, with ``app.finder`` set and the main
        blueprint registered.
    """
    settings = config[config_name]

    app = Flask(__name__)
    app.config.from_object(settings)

    # Elasticsearch-backed document store and retriever.
    doc_store = ElasticsearchDocumentStore(
        host=settings.ELASTIC_URL,
        username='',
        password='',
        index=settings.ELASTIC_INDEX,
    )
    port = settings.ELASTIC_PORT
    retriever = ElasticsearchRetriever(document_store=doc_store)

    reader = FARMReader(
        model_name_or_path="deepset/roberta-base-squad2",
        num_processes=0,
        use_gpu=False,
    )
    app.finder = Finder(reader, retriever)

    # Imported here rather than at module level, as in the original code.
    from app.main import main as main_blueprint
    app.register_blueprint(main_blueprint)

    return app
# english_retriever english_retriever = ElasticsearchRetriever( document_store=document_store, embedding_model=EMBEDDING_MODEL_PATH, gpu=USE_GPU, pooling_strategy=EMBEDDING_POOLING_STRATEGY, emb_extraction_layer=EMBEDDING_EXTRACTION_LAYER) if READER_MODEL_PATH: # needed for extractive QA reader = FARMReader( model_name_or_path=str(READER_MODEL_PATH), batch_size=BATCHSIZE, use_gpu=USE_GPU, context_window_size=CONTEXT_WINDOW_SIZE, top_k_per_candidate=TOP_K_PER_CANDIDATE, no_ans_boost=NO_ANS_BOOST, max_processes=MAX_PROCESSES, max_seq_len=MAX_SEQ_LEN, doc_stride=DOC_STRIDE, ) else: # don't need one for pure FAQ matching reader = None FINDERS = { 1: Finder(reader=reader, retriever=retriever), 2: Finder(reader=reader, retriever=english_retriever) }
return document_store @st.cache(allow_output_mutation=True) def retriever(): document_store = read_corpus() retriever = TfidfRetriever(document_store=document_store) return retriever question = st.text_input('Input your question here:') if st.button('Ask'): with st.spinner('Reading all the translations from all over Quran'): retriever = retriever() if not(path.exists('data/mlm-temp')): reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2", use_gpu=False) reader.save(directory='data/mlm-temp') st.info('Downloaded Fresh Model') else: reader = FARMReader(model_name_or_path="data/mlm-temp", use_gpu=False) st.info('Re-Used Model') finder = Finder(reader, retriever) prediction = finder.get_answers(question=question, top_k_retriever=10, top_k_reader=5) keys=['answer','context','meta','probability','score'] print(list( map(prediction.get, ['query']))) print("\n") answer_frame=pd.DataFrame.from_records([list( map(i.get, keys)) for i in prediction['answers']]) answer_frame.columns=['answer','reference','Surah','confidence','score']
# from haystack.retriever.dense import EmbeddingRetriever # retriever = EmbeddingRetriever(document_store=document_store, # embedding_model="deepset/sentence_bert", # model_format="farm") # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False) # reader = TransformersReader(model="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1) # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) # reader = FARMReader(model_name_or_path="twmkn9/albert-base-v2-squad2", use_gpu=True) # reader = FARMReader(model_name_or_path="roberta-large", use_gpu=True) # reader = FARMReader(model_name_or_path="csarron/mobilebert-uncased-squad-v2", use_gpu=True) # reader = FARMReader(model_name_or_path="deepset/xlm-roberta-large-squad2", use_gpu=True) # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False) # reader = FARMReader(model_name_or_path="deepset/xlm-roberta-large-squad2", use_gpu=True) reader = FARMReader(model_name_or_path="twmkn9/albert-base-v2-squad2", use_gpu=True) # reader = FARMReader(model_name_or_path="ktrapeznikov/albert-xlarge-v2-squad-v2", use_gpu=True) finder = Finder(reader, retriever) prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=40, top_k_reader=5) print_answers(prediction, details="minimal") # print("\n\n") # print(prediction)
fetch_archive_from_http(url=s3_url, output_dir=doc_dir) # Connect to Elasticsearch document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document", create_index=False) # Add evaluation data to Elasticsearch database if LAUNCH_ELASTICSEARCH: document_store.add_eval_data("../data/nq/nq_dev_subset.json") else: logger.warning("Since we already have a running ES instance we should not index the same documents again." "If you still want to do this call: 'document_store.add_eval_data('../data/nq/nq_dev_subset.json')' manually ") # Initialize Retriever retriever = ElasticsearchRetriever(document_store=document_store) # Initialize Reader reader = FARMReader("deepset/roberta-base-squad2") # Initialize Finder which sticks together Reader and Retriever finder = Finder(reader, retriever) ## Evaluate Retriever on its own if eval_retriever_only: retriever_eval_results = retriever.eval() ## Retriever Recall is the proportion of questions for which the correct document containing the answer is ## among the correct documents print("Retriever Recall:", retriever_eval_results["recall"]) ## Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank print("Retriever Mean Avg Precision:", retriever_eval_results["map"]) # Evaluate Reader on its own
### Retriever retriever = DensePassageRetriever(document_store=document_store, embedding_model="dpr-bert-base-nq", do_lower_case=True, use_gpu=True) # Important: # Now that after we have the DPR initialized, we need to call update_embeddings() to iterate over all # previously indexed documents and update their embedding representation. # While this can be a time consuming operation (depending on corpus size), it only needs to be done once. # At query time, we only need to embed the query and compare it the existing doc embeddings which is very fast. document_store.update_embeddings(retriever) ### Reader # Load a local model or any of the QA models on # Hugging Face's model hub (https://huggingface.co/models) reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) ### Finder # The Finder sticks together reader and retriever in a pipeline to answer our actual questions. finder = Finder(reader, retriever) ### Voilà! Ask a question! # You can configure how many candidates the reader and retriever shall return # The higher top_k_retriever, the better (but also the slower) your answers. prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5) # prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5) # prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_reader=5)
write_documents_to_db(document_store=document_store, document_dir=doc_dir, clean_func=clean_wiki_text, only_empty_db=True) ## Initalize Reader, Retriever & Finder # A retriever identifies the k most promising chunks of text that might contain the answer for our question # Retrievers use some simple but fast algorithm, here: TF-IDF retriever = TfidfRetriever(document_store=document_store) # A reader scans the text chunks in detail and extracts the k best answers # Reader use more powerful but slower deep learning models # You can select a local model or any of the QA models published on huggingface's model hub (https://huggingface.co/models) # here: a medium sized BERT QA model trained via FARM on Squad 2.0 reader = FARMReader(model_name_or_path="deepset/bert-base-cased-squad2", use_gpu=False) # OR: use alternatively a reader from huggingface's transformers package (https://github.com/huggingface/transformers) # reader = TransformersReader(model="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1) # The Finder sticks together retriever and retriever in a pipeline to answer our actual questions finder = Finder(reader, retriever) ## Voilá! Ask a question! # You can configure how many candidates the reader and retriever shall return # The higher top_k_retriever, the better (but also the slower) your answers. prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5) #prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5)
def tutorial3_basic_qa_pipeline_without_elasticsearch():
    """Tutorial 3: a basic extractive QA pipeline that needs no Elasticsearch.

    The Game of Thrones Wikipedia articles are downloaded, cleaned, split into
    paragraphs and written to an in-memory document store. A TF-IDF retriever
    and a FARM reader are combined into an ``ExtractiveQAPipeline``, and the
    answers to one sample question are printed.
    """
    # In-memory document store; a SQLite store would work as well:
    # document_store = SQLDocumentStore(url="sqlite:///qa.db")
    document_store = InMemoryDocumentStore()

    # ## Preprocessing of documents
    # Haystack offers a customizable pipeline for converting files into texts,
    # cleaning and splitting them, and writing them to a document store.
    # Here: 517 Wikipedia articles for Game of Thrones.
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert the files into dicts that can be indexed. The optional cleaning
    # function is applied to every document (e.g. to remove footers); it must
    # take a str and return a str.
    dicts = convert_files_to_dicts(
        dir_path=doc_dir,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )
    document_store.write_documents(dicts)

    # ## Retriever
    # Retrievers narrow the scope for the reader down to smaller units of text
    # in which a question could be answered. With InMemoryDocumentStore or
    # SQLDocumentStore the TfidfRetriever (based on Pandas dataframes) can be
    # used; tutorial 1 covers further retrievers.
    retriever = TfidfRetriever(document_store=document_store)

    # ## Reader
    # A reader scans the retrieved texts in detail and extracts the k best
    # answers using a powerful but slower deep learning model. Local models
    # and models from Hugging Face's hub (https://huggingface.co/models) can
    # both be loaded. Here: a medium sized RoBERTa QA model
    # (https://huggingface.co/deepset/roberta-base-squad2) through FARM.
    # Alternative models: "distilbert-base-uncased-distilled-squad" (fast) or
    # "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy).
    # Hint: no_ans_boost adjusts how readily "no answer possible" is returned;
    # higher values make the model prefer it.
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

    # Alternative reader:
    # reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # ## Pipeline
    # A Haystack `Pipeline` sticks the building blocks together into a search
    # pipeline. `ExtractiveQAPipeline` is one of the predefined pipelines and
    # combines a retriever with a reader. More on pipelines:
    # https://haystack.deepset.ai/docs/latest/pipelinesmd
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    # Ask a question. Other sample queries:
    # "Who created the Dothraki vocabulary?", "Who is the sister of Sansa?"
    sample_query = "Who is the father of Arya Stark?"
    prediction = pipe.run(query=sample_query, top_k_retriever=10, top_k_reader=5)

    print_answers(prediction, details="minimal")
def tutorial1_basic_qa_pipeline():
    """Run the basic extractive QA tutorial end to end.

    Steps performed, in order:

    1. Optionally start an Elasticsearch container through Docker and wait
       15 seconds before connecting.
    2. Connect an ``ElasticsearchDocumentStore`` to ``localhost``.
    3. Download the Game of Thrones article archive, convert the files to
       dicts and (when Elasticsearch was launched here) index them.
    4. Build an ``ElasticsearchRetriever``, a ``FARMReader`` and combine
       them in an ``ExtractiveQAPipeline``.
    5. Ask one question and print the answers.

    Raises:
        Exception: if the Docker command used to launch Elasticsearch
            exits with a non-zero return code.
    """
    logger = logging.getLogger(__name__)

    # Set to False to reuse an Elasticsearch instance that is already running.
    LAUNCH_ELASTICSEARCH = True

    # ## Document Store
    #
    # Haystack finds answers to queries within the documents stored in a `DocumentStore`. The current
    # implementations of `DocumentStore` include `ElasticsearchDocumentStore`, `FAISSDocumentStore`,
    # `SQLDocumentStore`, and `InMemoryDocumentStore`.
    #
    # **Here:** We recommend Elasticsearch as it comes preloaded with features like full-text queries,
    # BM25 retrieval, and vector storage for text embeddings.
    # **Alternatives:** If you are unable to set up an Elasticsearch instance, then follow Tutorial 3
    # for using SQL/InMemory document stores.
    # **Hint**:
    # This tutorial creates a new document store instance with Wikipedia articles on Game of Thrones.
    # However, you can configure Haystack to work with your existing document stores.
    #
    # Start an Elasticsearch server
    # You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily
    # available in your environment (e.g. in Colab notebooks), then you can manually download and execute
    # Elasticsearch from source.
    if LAUNCH_ELASTICSEARCH:
        logger.info("Starting Elasticsearch ...")
        # With shell=True the command is passed as a single string (the form
        # recommended by the subprocess documentation) so that it is handed to
        # the shell unchanged on every platform.
        status = subprocess.run(
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2',
            shell=True,
        )
        if status.returncode:
            raise Exception(
                "Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance "
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        # Pause before connecting so the freshly started container has time to come up.
        time.sleep(15)

    # Connect to Elasticsearch
    document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

    # ## Preprocessing of documents
    #
    # Haystack provides a customizable pipeline for:
    # - converting files into texts
    # - cleaning texts
    # - splitting texts
    # - writing them to a Document Store
    # In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning
    # function, and add them in Elasticsearch.

    # Let's first fetch some documents that we want to query
    # Here: 517 Wikipedia articles for Game of Thrones
    doc_dir = "data/article_txt_got"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert files to dicts containing documents that can be indexed to our datastore.
    # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers).
    # It must take a str as input, and return a str.
    dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

    # Now, let's write the docs to our DB.
    if LAUNCH_ELASTICSEARCH:
        document_store.write_documents(dicts)
    else:
        logger.warning(
            "Since we already have a running ES instance we should not index the same documents again. \n"
            "If you still want to do this call: document_store.write_documents(dicts) manually "
        )

    # ## Initialize Retriever, Reader, & Pipeline
    #
    # ### Retriever
    #
    # Retrievers help narrowing down the scope for the Reader to smaller units of text where a given
    # question could be answered.
    #
    # They use some simple but fast algorithm.
    # **Here:** We use Elasticsearch's default BM25 algorithm
    # **Alternatives:**
    # - Customize the `ElasticsearchRetriever` with custom queries (e.g. boosting) and filters
    # - Use `EmbeddingRetriever` to find candidate documents based on the similarity of
    #   embeddings (e.g. created via Sentence-BERT)
    # - Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping
    #   and debugging
    retriever = ElasticsearchRetriever(document_store=document_store)

    # Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick-prototypes
    # with SQLite document store.
    #
    # from haystack.retriever.tfidf import TfidfRetriever
    # retriever = TfidfRetriever(document_store=document_store)

    # ### Reader
    #
    # A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are
    # based on powerful, but slower deep learning models.
    #
    # Haystack currently supports Readers based on the frameworks FARM and Transformers.
    # With both you can either load a local model or one from Hugging Face's model hub
    # (https://huggingface.co/models).
    # **Here:** a medium sized RoBERTa QA model using a Reader based on
    #           FARM (https://huggingface.co/deepset/roberta-base-squad2)
    # **Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)
    # **Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or
    #                            "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)
    # **Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost.
    #           Higher values mean the model prefers "no answer possible".
    #
    # #### FARMReader
    # Load a local model or any of the QA models on
    # Hugging Face's model hub (https://huggingface.co/models)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

    # #### TransformersReader
    # Alternative:
    # reader = TransformersReader(
    #    model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

    # ### Pipeline
    #
    # With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
    # Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for
    # your own use cases.
    # To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the
    # `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
    # You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
    from haystack.pipeline import ExtractiveQAPipeline
    pipe = ExtractiveQAPipeline(reader, retriever)

    ## Voilà! Ask a question!
    prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)

    # prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
    # prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)

    print_answers(prediction, details="minimal")