def set_finder(user_id_key):
    """Build a Finder (reader + retriever) for the given user's stored settings.

    Reads the model choice and GPU preference from the module-level
    ``user_settings`` mapping and the user's document store from
    ``user_doc_store``.
    """
    # Hugging Face hub paths keyed by the user's configured model name.
    known_models = {
        "roberta": "deepset/roberta-base-squad2",
        "bert": "deepset/bert-large-uncased-whole-word-masking-squad2",
        "distilbert": "distilbert-base-uncased-distilled-squad",
    }
    settings = user_settings[user_id_key]
    # Any unrecognised selection falls back to the French CamemBERT model.
    model_path = known_models.get(settings["model"], "illuin/camembert-base-fquad")

    retriever = ElasticsearchRetriever(document_store=user_doc_store[user_id_key])

    if settings["gpu"] == "on":
        try:
            # use_gpu=0 selects the first GPU device.
            reader = TransformersReader(
                model_name_or_path=model_path, tokenizer=model_path, use_gpu=0
            )
        except Exception as e:
            print(e)
            print("GPU not available. Inferencing on CPU")
            reader = TransformersReader(
                model_name_or_path=model_path, tokenizer=model_path, use_gpu=-1
            )
    else:
        # use_gpu=-1 forces CPU inference.
        reader = TransformersReader(
            model_name_or_path=model_path, tokenizer=model_path, use_gpu=-1
        )

    return Finder(reader, retriever)
def __init__(self, hugging_face_model_name: str = "distilbert-base-uncased-distilled-squad", tokenizer_name: str = "distilbert-base-uncased", cuda_is_available: bool = True):
    """Set up the QA reader and the ROUGE scorer.

    :param hugging_face_model_name: extractive QA model from the Hugging Face hub.
    :param tokenizer_name: tokenizer matching the chosen model.
    :param cuda_is_available: run on GPU 0 when True, on CPU otherwise.
    """
    # TransformersReader takes a device index: 0 = first GPU, -1 = CPU.
    device = 0 if cuda_is_available else -1
    self.__reader = TransformersReader(
        model=hugging_face_model_name,
        tokenizer=tokenizer_name,
        context_window_size=512,
        use_gpu=device,
    )
    self.__rouge = RougeCalculator(stopwords=False)
def test_finder_get_answers():
    """Smoke test: a Finder backed by a SQLite document store yields a prediction."""
    test_docs = [
        {
            "name": f"testing the finder {i}",
            "text": f"testing the finder with pyhton unit test {i}",
            "meta": {"test": "test"},
        }
        for i in (1, 2, 3)
    ]
    document_store = SQLDocumentStore(url="sqlite:///qa_test.db")
    document_store.write_documents(test_docs)

    retriever = TfidfRetriever(document_store=document_store)
    reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1,
    )
    finder = Finder(reader, retriever)

    prediction = finder.get_answers(
        question="testing finder", top_k_retriever=10, top_k_reader=5
    )
    assert prediction is not None
def transformers_roberta():
    """Return a RoBERTa SQuAD2 reader on CPU, keeping 5 candidates per passage."""
    model = "deepset/roberta-base-squad2"
    return TransformersReader(
        model_name_or_path=model,
        tokenizer=model,
        use_gpu=-1,
        top_k_per_candidate=5,
    )
def test_finder_get_answers_with_in_memory_store():
    """Smoke test: a Finder backed by the in-memory document store yields a prediction."""
    test_docs = [
        {
            "name": f"testing the finder {i}",
            "text": f"testing the finder with pyhton unit test {i}",
            "meta": {"url": "url"},
        }
        for i in (1, 2, 3)
    ]
    from haystack.database.memory import InMemoryDocumentStore

    document_store = InMemoryDocumentStore()
    document_store.write_documents(test_docs)

    retriever = TfidfRetriever(document_store=document_store)
    reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1,
    )
    finder = Finder(reader, retriever)

    prediction = finder.get_answers(
        question="testing finder", top_k_retriever=10, top_k_reader=5
    )
    assert prediction is not None
def no_answer_reader(request):
    """Parametrized fixture: a reader configured so it may return "no answer"."""
    model = "deepset/roberta-base-squad2"
    if request.param == "transformers":
        return TransformersReader(model=model,
                                  tokenizer=model,
                                  use_gpu=-1,
                                  n_best_per_passage=5)
    if request.param == "farm":
        # no_ans_boost=0 leaves the no-answer logit unmodified.
        return FARMReader(model_name_or_path=model,
                          use_gpu=False,
                          top_k_per_sample=5,
                          no_ans_boost=0,
                          num_processes=0)
def reader(request):
    """Parametrized fixture: build the requested reader flavour, CPU only."""
    model = "distilbert-base-uncased-distilled-squad"
    if request.param == "transformers":
        return TransformersReader(model=model,
                                  tokenizer="distilbert-base-uncased",
                                  use_gpu=-1)
    if request.param == "farm":
        return FARMReader(model_name_or_path=model,
                          use_gpu=False,
                          top_k_per_sample=5,
                          num_processes=0)
# write the docs to the elasticsearch database document_store.write_documents(dicts) # ## Initalize Retriever, Reader, & Finder # ### Retriever # Retrievers help narrowing down the scope for the Reader to smaller units # of text where a given question # could be answered. # We use Elasticsearch's default BM25 algorithm retriever = ElasticsearchRetriever(document_store=document_store) # ### Reader # A Reader scans the texts returned by retrievers in detail and extracts # the k best answers. It is based on a powerful, but slower deep learning model. reader = TransformersReader(model="dbmdz/bert-base-german-uncased", tokenizer="dbmdz/bert-base-german-uncased", use_gpu=-1) # ### Finder # The Finder sticks together reader and retriever in a pipeline to answer # our actual questions. finder = Finder(reader, retriever) initend = time.time() questions = [ "worauf sollte man auf Fähren achten?", "wird die verkehrschilderkennung für alle kommen?", "was beinhaltet der Autopilot?", "wie viel verbaucht das Model 3?", "fährt das auto wenn der stecker steckt?", "Welche dimension haben die kleinen Sommerreifen?", "wie viel zoll haben die Sommerreifen?", "Werden UV-Strahlen beim Tesla geblockt?", "Ich habe bei Tesla 500€ pro Rad bezahlt.",
# NOTE(review): this chunk continues an if/elif chain whose opening branches
# (e.g. EmbeddingRetriever / ElasticsearchRetriever selection) are outside this view.
    retriever = ElasticsearchRetriever(document_store=document_store)
elif RETRIEVER_TYPE is None or RETRIEVER_TYPE == "ElasticsearchFilterOnlyRetriever":
    retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store)
else:
    raise ValueError(
        f"Could not load Retriever of type '{RETRIEVER_TYPE}'. "
        f"Please adjust RETRIEVER_TYPE to one of: "
        f"'EmbeddingRetriever', 'ElasticsearchRetriever', 'ElasticsearchFilterOnlyRetriever', None"
        # NOTE(review): the two adjacent f-strings concatenate to "...NoneOR modify..." —
        # a space or newline is probably missing before "OR".
        f"OR modify rest_api/search.py to support your retriever")

if READER_MODEL_PATH:  # for extractive doc-qa
    if READER_TYPE == "TransformersReader":
        # TransformersReader takes a device index: -1 = CPU, otherwise the GPU number.
        use_gpu = -1 if not USE_GPU else GPU_NUMBER
        reader = TransformersReader(
            model_name_or_path=READER_MODEL_PATH,
            use_gpu=use_gpu,
            context_window_size=CONTEXT_WINDOW_SIZE,
            tokenizer=READER_TOKENIZER)  # type: Optional[BaseReader]
    elif READER_TYPE == "FARMReader":
        reader = FARMReader(
            model_name_or_path=READER_MODEL_PATH,
            batch_size=BATCHSIZE,
            use_gpu=USE_GPU,
            context_window_size=CONTEXT_WINDOW_SIZE,
            top_k_per_candidate=TOP_K_PER_CANDIDATE,
            no_ans_boost=NO_ANS_BOOST,
            num_processes=MAX_PROCESSES,
            max_seq_len=MAX_SEQ_LEN,
            doc_stride=DOC_STRIDE,
        )  # type: Optional[BaseReader]
    else:
# Convert the raw files into dicts (one per paragraph) and load them into a DataFrame.
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
df = pd.DataFrame.from_dict(dicts)

# Get embeddings for our questions from the FAQs so queries can be matched against them.
questions = list(df["text"].values)
df["question_emb"] = retriever.create_embedding(texts=questions)

# Convert the DataFrame to a list of dicts and index them in our DocumentStore.
docs_to_index = df.to_dict(orient="records")
# You can optionally supply a cleaning function that is applied to each doc
# (e.g. to remove footers). It must take a str as input, and return a str.
# Now, let's write the docs to our DB.
document_store.write_documents(docs_to_index)

# Init reader (use_gpu=-1 forces CPU) & use Finder to get answers (same as in Tutorial 1).
reader = TransformersReader(
    model="distilbert-base-uncased-distilled-squad",
    tokenizer="distilbert-base-uncased",
    use_gpu=-1)
finder = Finder(reader=reader, retriever=retriever)

prediction = finder.get_answers(question="Who is the father of Arya?", top_k_reader=3, top_k_retriever=5)
print_answers(prediction, details="all")
class Evaluator:
    """Scores question/answer pairs by letting an extractive QA reader predict an
    answer from the pair's context and comparing it to the generated answer with
    a ROUGE-1 measure."""

    def __init__(self, hugging_face_model_name: str = "distilbert-base-uncased-distilled-squad", tokenizer_name: str = "distilbert-base-uncased", cuda_is_available: bool = True):
        """Set up the QA reader and the ROUGE scorer.

        :param hugging_face_model_name: extractive QA model from the Hugging Face hub.
        :param tokenizer_name: tokenizer matching the chosen model.
        :param cuda_is_available: run on GPU 0 when True, on CPU otherwise.
        """
        # TransformersReader takes a device index: 0 = first GPU, -1 = CPU.
        device = 0 if cuda_is_available else -1
        self.__reader = TransformersReader(model=hugging_face_model_name,
                                           tokenizer=tokenizer_name,
                                           context_window_size=512,
                                           use_gpu=device)
        self.__rouge = RougeCalculator(stopwords=False)

    def __evaluate_question_answer_pair(self, question: str, answer: str, context: str, identifier: int, verbose: bool = False) -> float:
        """Predict an answer for `question` from `context` and score it against `answer`.

        :param identifier: document id handed to the reader (also used for logging).
        :return: the ROUGE-1 score of the prediction against the reference answer.
        """
        start = time() if verbose else None
        document = Document(identifier, context)
        predictions = self.__reader.predict(question=question, documents=[document], top_k=1)
        predicted_answer = predictions["answers"][0]["answer"]
        score = self.__compute_f1_measure(answer, predicted_answer)
        if verbose:
            end = time()
            print("Question: {}\nPredicted: {}\nGenerated: {}\nScore: {}\nTook {} seconds.\n_____________\n".format(
                question, predicted_answer, answer, score, end - start))
        return score

    def __compute_f1_measure(self, generated_answer: str, predicted_answer: str) -> float:
        """ROUGE-1 between the generated (reference) and predicted answers.

        A missing prediction (None) is treated as the empty string.
        """
        if predicted_answer is None:
            predicted_answer = ""
        return self.__rouge.rouge_n(summary=generated_answer, references=predicted_answer, n=1)

    def evaluate_question_answer_pairs(self, questions: List[str], answers: List[str], contexts: List[str], verbose: bool = False) -> float:
        """
        :param questions: A list of N questions
        :param answers: A list of N answers. Answer at index i is the answer to the question at index i in questions
        :param contexts: A list of N passages used to generate the questions. Context at index i belongs to question at i
        :param verbose: Print intermediate results.
        :return: The evaluation metric between 0 and 1.
        """
        # BUG FIX: the original chained comparison
        # `len(questions) != len(answers) != len(contexts)` only raised when BOTH
        # adjacent pairs differed, so e.g. len(q) == len(a) != len(c) slipped through.
        if not (len(questions) == len(answers) == len(contexts)):
            raise Exception("Questions, Answers and Context lists must be of equal lengths.")
        length = len(questions)
        total = 0.0
        for counter, (question, answer, context) in enumerate(zip(questions, answers, contexts), start=1):
            total += self.__evaluate_question_answer_pair(question, answer, context, counter, verbose)
            if verbose:
                print("\n> {} % done\n".format((counter / length) * 100))
        # BUG FIX: the original divided by `counter`, which ended at N + 1 after the
        # loop, so the reported average was systematically too low. Guard against
        # empty input to avoid ZeroDivisionError (original returned 0 in that case).
        score = total / length if length else 0.0
        print("\n\n[FINAL SCORE] ========> {}\n\n".format(score))
        return score
print(dicts[:3]) # Connect to Elasticsearch document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="taschenhirn") # Now, let's write the dicts containing documents to our DB. document_store.write_documents(dicts) # initialize sparse retriever: retriever = ElasticsearchRetriever(document_store=document_store) # Alternative: reader = TransformersReader(model_name_or_path="Sahajtomar/GELECTRAQA", tokenizer="Sahajtomar/GELECTRAQA") # initialize pipe pipe = ExtractiveQAPipeline(reader, retriever) # You can configure how many candidates the reader and retriever shall return # The higher top_k_retriever, the better (but also the slower) your answers. #prediction = pipe.run(query="Welche Staaten grenzen an den Bodensee?", top_k_retriever=10, top_k_reader=5) pipe.run(query="Welches ist der größte See Bayerns?", top_k_retriever=5, top_k_reader=2) pipe.run(query="Wie weit erstreckt sich die Arktis?", top_k_retriever=5, top_k_reader=2)
def transformers_distilbert():
    """Return a DistilBERT SQuAD reader running on CPU."""
    return TransformersReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1,
    )
def get_neural_reader(reader_path, use_gpu=True):
    """Load a TransformersReader from `reader_path`, using the same path for the tokenizer.

    :param reader_path: local path or model-hub name of the QA model.
    :param use_gpu: run on GPU 0 when True, on CPU (-1) otherwise.
    """
    device = 0 if use_gpu else -1
    return TransformersReader(model=reader_path, tokenizer=reader_path, use_gpu=device)
# NOTE(review): this chunk continues an if/elif chain whose opening branches
# (e.g. EmbeddingRetriever / ElasticsearchRetriever selection) are outside this view.
    retriever = ElasticsearchRetriever(document_store=document_store)
elif RETRIEVER_TYPE is None or RETRIEVER_TYPE == "ElasticsearchFilterOnlyRetriever":
    retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store)
else:
    raise ValueError(
        f"Could not load Retriever of type '{RETRIEVER_TYPE}'. "
        f"Please adjust RETRIEVER_TYPE to one of: "
        f"'EmbeddingRetriever', 'ElasticsearchRetriever', 'ElasticsearchFilterOnlyRetriever', None"
        # NOTE(review): the two adjacent f-strings concatenate to "...NoneOR modify..." —
        # a space or newline is probably missing before "OR".
        f"OR modify rest_api/search.py to support your retriever")

if READER_MODEL_PATH:  # for extractive doc-qa
    if READER_TYPE == "TransformersReader":
        # TransformersReader takes a device index: -1 = CPU, otherwise the GPU number.
        use_gpu = -1 if not USE_GPU else GPU_NUMBER
        reader = TransformersReader(
            model=str(READER_MODEL_PATH),
            use_gpu=use_gpu,
            context_window_size=CONTEXT_WINDOW_SIZE,
            tokenizer=str(READER_TOKENIZER))  # type: Optional[FARMReader]
    elif READER_TYPE == "FARMReader":
        reader = FARMReader(
            model_name_or_path=str(READER_MODEL_PATH),
            batch_size=BATCHSIZE,
            use_gpu=USE_GPU,
            context_window_size=CONTEXT_WINDOW_SIZE,
            top_k_per_candidate=TOP_K_PER_CANDIDATE,
            no_ans_boost=NO_ANS_BOOST,
            num_processes=MAX_PROCESSES,
            max_seq_len=MAX_SEQ_LEN,
            doc_stride=DOC_STRIDE,
        )  # type: Optional[FARMReader]
    else:
retriever = TfidfRetriever(document_store=documentstore)

if READER_MODEL_PATH:  # for extractive doc-qa
    '''reader = FARMReader(
        model_name_or_path=str(READER_MODEL_PATH),
        batch_size=BATCHSIZE,
        use_gpu=USE_GPU,
        context_window_size=CONTEXT_WINDOW_SIZE,
        top_k_per_candidate=TOP_K_PER_CANDIDATE,
        no_ans_boost=NO_ANS_BOOST,
        num_processes=MAX_PROCESSES,
        max_seq_len=MAX_SEQ_LEN,
        doc_stride=DOC_STRIDE,
    )  # type: Optional[FARMReader]'''
    # NOTE(review): this instantiates TransformersReader with its default model on
    # CPU and ignores READER_MODEL_PATH entirely — confirm that is intentional.
    reader = TransformersReader(use_gpu=-1)
else:
    reader = None  # don't need one for pure FAQ matching

# Single finder exposed to the API under id 1.
FINDERS = {1: Finder(reader=reader, retriever=retriever)}


#############################################
# Data schema for request & response
#############################################
class Question(BaseModel):
    # The questions to answer in one request.
    questions: List[str]
    # Optional metadata filters passed through to the retriever.
    filters: Optional[Dict[str, Optional[str]]] = None
    top_k_reader: int = DEFAULT_TOP_K_READER
    top_k_retriever: int = DEFAULT_TOP_K_RETRIEVER
def main():
    """Entry point: download the reader model if missing, optionally launch
    Elasticsearch, index the documents with embeddings, then either run the
    Test.json retrieval benchmark or an interactive console query loop."""
    # fetch model files if not present. not hosted in git repo
    model_exists = os.path.isfile(
        './kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin')
    if not model_exists:
        logging.info("Starting model download (700MB) ...")
        urllib.request.urlretrieve(
            "https://cdn.huggingface.co/mrm8488/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin",
            "./kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin")
        logging.info("model successfully downloaded")
    # start Elasticsearch in a docker container
    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.call(
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" --name "TeslaNew" elasticsearch:7.6.2',
            shell=True)
        # NOTE(review): subprocess.call returns an int exit code, so
        # `status.returncode` raises AttributeError; this was probably meant to be
        # `if status:` or a switch to subprocess.run(...) — confirm and fix.
        if status.returncode:
            raise Exception(
                "Failed to launch Elasticsearch. If you want to "
                "connect to an existing Elasticsearch instance"
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)
    # 512 dimensions because that is what the sentence transformer returns
    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index="document",
                                                embedding_dim=512,
                                                embedding_field="embedding")
    # load docs in database
    if LAUNCH_ELASTICSEARCH or POPULATE_DOCUMENT_STORE:
        dicts = convert_files_to_dicts(
            dir_path=data_path, clean_func=clean_text, split_paragraphs=True)
        logging.info("files to dicts done.")
        # logging.info("first 10 dicts:", dicts[0:10])
        # write dicts containing the texts to the database
        document_store.write_documents(dicts)
        logging.info("documents to store written.")
        retriever = EmbeddingRetriever(document_store=document_store,
                                       embedding_model=retriever_model_name_full,
                                       model_format=retriever_model_type,
                                       gpu=False)
        # generate embeddings for each text and add it to the databse entry
        document_store.update_embeddings(retriever)
        logging.info("embeddings to documents in store written.")
    # NOTE(review): the retriever is re-created here even when the branch above
    # already built one — redundant but harmless.
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=retriever_model_name_full,
                                   model_format=retriever_model_type,
                                   gpu=False)
    # reader wont be used in the retrieval because results take longer and the
    # quality is worse; still has to be initialized for the Finder
    reader = TransformersReader(model="./kbQA/" + reader_model_name,
                                tokenizer="./kbQA/" + reader_model_name,
                                use_gpu=-1)
    finder = Finder(retriever=retriever, reader=reader)
    if TEST:
        # Benchmark mode: measure retrieval time and hit rate on Test.json.
        try:
            with open("./kbQA/Test.json", encoding="utf-8") as file:
                times = []    # per-question retrieval times (seconds)
                results = []  # one True entry per correctly answered question
                failed = []   # details of questions whose answer was not retrieved
                # each line has multiple paragraphs and embeddings, read file line
                # by line
                # NOTE(review): enumerate yields (index, text) tuples, hence the
                # line[1] below — a plain `for line in file:` would be simpler.
                for line in enumerate(file):
                    # load the json string of the current line as a python object
                    data = json.loads(line[1])
                    q = data["question"]
                    # fetch results from db, timing only the retrieval call
                    start_time = time.process_time()
                    candidate_docs = finder.retriever.retrieve(
                        query=q, filters=None, top_k=5)
                    end_time = time.process_time()
                    times.append(end_time - start_time)
                    # a question counts as answered if the expected answer text
                    # appears verbatim in any retrieved document
                    answered = False
                    for doc in candidate_docs:
                        if data["answer"] in doc.text:
                            answered = True
                            results.append(True)
                            break
                    if not answered:
                        answers = []
                        for doc in candidate_docs:
                            answers.append(doc.text)
                        failed.append({
                            "q": q,
                            "correct": data["answer"],
                            "a": answers
                        })
                # aggregate timing and accuracy statistics
                total = 0
                for zeit in times:
                    total = total + zeit
                logging.info("Average time per request: %f", total / len(times))
                logging.info("Questions answered correctly: %d/%d (%f)",
                             len(results), len(times),
                             len(results) / len(times))
                logging.debug("Failed questions:")
                for fail in failed:
                    logging.debug("Question: %s", fail["q"])
                    logging.debug("Correct Answer: %s", fail["correct"])
                    for answer in fail["a"]:
                        logging.debug(answer)
        except Exception as e:
            traceback.print_exc()
            logging.error(f"exception: {e}")
    else:
        # loop until Keyboard-Interrupt event ctrl+c or "!q" input
        while True:
            try:
                # read input from console
                q = input("Enter:").strip()
                # input "!q" to stop execution
                if q == "!q":
                    exit(0)
                # fetch results from db
                candidate_docs = finder.retriever.retrieve(
                    query=q, filters=None, top_k=5)
                for doc in candidate_docs:
                    logging.info("doc id: %s", doc.id)
                    logging.info("doc meta name: %s", doc.meta["name"])
                    logging.info("doc text: %s", doc.text)
                    logging.info("doc query score: %s", doc.query_score)
                    logging.info("")
                # not used
                # prediction = finder.get_answers(
                #     question=q, top_k_retriever=10, top_k_reader=5)
                # print_answers(prediction, details="medium")
            except Exception as e:
                traceback.print_exc()
                logging.error(f"exception: {e}")
# NOTE(review): this chunk continues an if/elif chain whose opening branches
# (e.g. EmbeddingRetriever / ElasticsearchRetriever selection) are outside this view.
    retriever = ElasticsearchRetriever(document_store=document_store)
elif RETRIEVER_TYPE is None or RETRIEVER_TYPE == "ElasticsearchFilterOnlyRetriever":
    retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store)
else:
    raise ValueError(
        f"Could not load Retriever of type '{RETRIEVER_TYPE}'. "
        f"Please adjust RETRIEVER_TYPE to one of: "
        f"'EmbeddingRetriever', 'ElasticsearchRetriever', 'ElasticsearchFilterOnlyRetriever', None"
        # NOTE(review): the two adjacent f-strings concatenate to "...NoneOR modify..." —
        # a space or newline is probably missing before "OR".
        f"OR modify rest_api/search.py to support your retriever")

if READER_MODEL_PATH:  # for extractive doc-qa
    if READER_TYPE == "TransformersReader":
        # TransformersReader takes a device index: -1 = CPU, otherwise the GPU number.
        use_gpu = -1 if not USE_GPU else GPU_NUMBER
        reader = TransformersReader(
            model_name_or_path=READER_MODEL_PATH,
            use_gpu=use_gpu,
            context_window_size=CONTEXT_WINDOW_SIZE,
            return_no_answers=READER_CAN_HAVE_NO_ANSWER,
            tokenizer=READER_TOKENIZER)  # type: Optional[BaseReader]
    elif READER_TYPE == "FARMReader":
        reader = FARMReader(
            model_name_or_path=READER_MODEL_PATH,
            batch_size=BATCHSIZE,
            use_gpu=USE_GPU,
            context_window_size=CONTEXT_WINDOW_SIZE,
            top_k_per_candidate=TOP_K_PER_CANDIDATE,
            no_ans_boost=NO_ANS_BOOST,
            num_processes=MAX_PROCESSES,
            max_seq_len=MAX_SEQ_LEN,
            doc_stride=DOC_STRIDE,
        )  # type: Optional[BaseReader]
    else:
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever
import os

# data = pd.read_csv('test.txt', sep='\t')

# Elasticsearch-backed document store on localhost, default "document" index.
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
retriever = ElasticsearchRetriever(document_store=document_store)
# Extractive QA reader on CPU (use_gpu=-1) with a 500-character answer context window.
reader = TransformersReader(model_name_or_path='deepset/roberta-base-squad2', tokenizer='deepset/roberta-base-squad2', context_window_size=500, use_gpu=-1)
# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, context_window_size=500)
# NOTE(review): Finder is not among the visible imports — presumably imported
# elsewhere in this file; verify.
finder = Finder(reader, retriever)

if __name__ == '__main__':
    # questions = ["What do we know about Bourin and Uchiyama?"]
    '''
    prediction = finder.get_answers(question="What do we know about symbiotic stars?",
                                    top_k_retriever=10, top_k_reader=3)
    print_answers(prediction, details='minimal')
    '''
    # Interactive loop: answer questions typed at the console.
    # NOTE(review): the get_answers call below is cut off at this chunk boundary.
    while True:
        qes = input('Question: ')
        # print(qes)
        prediction = finder.get_answers(question=qes,