def set_finder(user_id_key):
    if user_settings[user_id_key]["model"] == "roberta":
        model_path = "deepset/roberta-base-squad2"  # model paths reference models hosted on the Hugging Face hub
    elif user_settings[user_id_key]["model"] == "bert":
        model_path = "deepset/bert-large-uncased-whole-word-masking-squad2"
    elif user_settings[user_id_key]["model"] == "distilbert":
        model_path = "distilbert-base-uncased-distilled-squad"
    else:
        model_path = "illuin/camembert-base-fquad"

    retriever = ElasticsearchRetriever(document_store=user_doc_store[user_id_key])

    if user_settings[user_id_key]["gpu"] == "on":
        try:
            reader = TransformersReader(
                model_name_or_path=model_path, tokenizer=model_path, use_gpu=0
            )
        except Exception as e:
            print(e)
            print("GPU not available. Inferencing on CPU")
            reader = TransformersReader(
                model_name_or_path=model_path, tokenizer=model_path, use_gpu=-1
            )

    else:
        reader = TransformersReader(
            model_name_or_path=model_path, tokenizer=model_path, use_gpu=-1
        )

    finder = Finder(reader, retriever)

    return finder
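A minimal usage sketch for set_finder, assuming the module-level user_settings and user_doc_store dicts the function reads from (all values are illustrative; the import path follows the old Haystack 0.x layout used elsewhere on this page):

from haystack.database.elasticsearch import ElasticsearchDocumentStore

# illustrative settings; set_finder reads these module-level dicts
user_settings = {"alice": {"model": "roberta", "gpu": "off"}}
user_doc_store = {"alice": ElasticsearchDocumentStore(host="localhost", index="alice_docs")}

finder = set_finder("alice")
prediction = finder.get_answers(question="What is Haystack?",
                                top_k_retriever=10,
                                top_k_reader=5)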
Example #2
    def __init__(self,
                 hugging_face_model_name: str = "distilbert-base-uncased-distilled-squad",
                 tokenizer_name: str = "distilbert-base-uncased",
                 cuda_is_available: bool = True):
        cuda_is_available = 0 if cuda_is_available else -1

        self.__reader = TransformersReader(model=hugging_face_model_name,
                                           tokenizer=tokenizer_name,
                                           context_window_size=512,
                                           use_gpu=cuda_is_available)

        self.__rouge = RougeCalculator(stopwords=False)
Example #3
def test_finder_get_answers():
    test_docs = [{
        "name": "testing the finder 1",
        "text": "testing the finder with pyhton unit test 1",
        "meta": {
            "test": "test"
        }
    }, {
        "name": "testing the finder 2",
        "text": "testing the finder with pyhton unit test 2",
        "meta": {
            "test": "test"
        }
    }, {
        "name": "testing the finder 3",
        "text": "testing the finder with pyhton unit test 3",
        "meta": {
            "test": "test"
        }
    }]

    document_store = SQLDocumentStore(url="sqlite:///qa_test.db")
    document_store.write_documents(test_docs)
    retriever = TfidfRetriever(document_store=document_store)
    reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1)
    finder = Finder(reader, retriever)
    prediction = finder.get_answers(question="testing finder",
                                    top_k_retriever=10,
                                    top_k_reader=5)
    assert prediction is not None
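Because the test above writes to a local qa_test.db sqlite file, state can leak between runs; a cleanup sketch, assuming pytest (the fixture name is illustrative):

import os

import pytest

@pytest.fixture(autouse=True)
def cleanup_qa_test_db():
    yield  # run the test first
    if os.path.exists("qa_test.db"):
        os.remove("qa_test.db")  # drop the sqlite file created by SQLDocumentStore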
Example #4
def transformers_roberta():
    return TransformersReader(
        model_name_or_path="deepset/roberta-base-squad2",
        tokenizer="deepset/roberta-base-squad2",
        use_gpu=-1,
        top_k_per_candidate=5
    )
Example #5
def test_finder_get_answers_with_in_memory_store():
    test_docs = [{
        "name": "testing the finder 1",
        "text": "testing the finder with pyhton unit test 1",
        'meta': {
            'url': 'url'
        }
    }, {
        "name": "testing the finder 2",
        "text": "testing the finder with pyhton unit test 2",
        'meta': {
            'url': 'url'
        }
    }, {
        "name": "testing the finder 3",
        "text": "testing the finder with pyhton unit test 3",
        'meta': {
            'url': 'url'
        }
    }]

    from haystack.database.memory import InMemoryDocumentStore
    document_store = InMemoryDocumentStore()
    document_store.write_documents(test_docs)

    retriever = TfidfRetriever(document_store=document_store)
    reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1)
    finder = Finder(reader, retriever)
    prediction = finder.get_answers(question="testing finder",
                                    top_k_retriever=10,
                                    top_k_reader=5)
    assert prediction is not None
Example #6
def no_answer_reader(request):
    if request.param == "farm":
        return FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                          use_gpu=False, top_k_per_sample=5, no_ans_boost=0, num_processes=0)
    if request.param == "transformers":
        return TransformersReader(model="deepset/roberta-base-squad2",
                                  tokenizer="deepset/roberta-base-squad2",
                                  use_gpu=-1, n_best_per_passage=5)
Example #7
def reader(request):
    if request.param == "farm":
        return FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad",
                          use_gpu=False, top_k_per_sample=5, num_processes=0)
    if request.param == "transformers":
        return TransformersReader(model="distilbert-base-uncased-distilled-squad",
                                  tokenizer="distilbert-base-uncased",
                                  use_gpu=-1)
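Both fixtures above consume request.param, which implies they are registered with pytest's params argument; a minimal sketch of that wiring (the test name is illustrative):

import pytest

@pytest.fixture(params=["farm", "transformers"])
def reader(request):
    ...  # body as in the fixture above

def test_reader_is_constructed(reader):
    assert reader is not None  # collected once per param value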
Example #8
        # write the docs to the elasticsearch database
        document_store.write_documents(dicts)

    # ## Initialize Retriever, Reader & Finder
    # ### Retriever
    # Retrievers help narrow down the scope for the Reader to smaller units
    # of text where a given question could be answered.
    # We use Elasticsearch's default BM25 algorithm
    retriever = ElasticsearchRetriever(document_store=document_store)
    # ### Reader
    # A Reader scans the texts returned by retrievers in detail and extracts
    # the k best answers. It is based on a powerful, but slower deep learning model.
    reader = TransformersReader(model="dbmdz/bert-base-german-uncased",
                                tokenizer="dbmdz/bert-base-german-uncased",
                                use_gpu=-1)
    # ### Finder
    # The Finder sticks together reader and retriever in a pipeline to answer
    # our actual questions.
    finder = Finder(reader, retriever)
    initend = time.time()
    questions = [
        "worauf sollte man auf Fähren achten?",
        "wird die verkehrschilderkennung für alle kommen?",
        "was beinhaltet der Autopilot?", "wie viel verbaucht das Model 3?",
        "fährt das auto wenn der stecker steckt?",
        "Welche dimension haben die kleinen Sommerreifen?",
        "wie viel zoll haben die Sommerreifen?",
        "Werden UV-Strahlen beim Tesla geblockt?",
        "Ich habe bei Tesla 500€ pro Rad bezahlt.",
Example #9
    retriever = ElasticsearchRetriever(document_store=document_store)
elif RETRIEVER_TYPE is None or RETRIEVER_TYPE == "ElasticsearchFilterOnlyRetriever":
    retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store)
else:
    raise ValueError(
        f"Could not load Retriever of type '{RETRIEVER_TYPE}'. "
        f"Please adjust RETRIEVER_TYPE to one of: "
        f"'EmbeddingRetriever', 'ElasticsearchRetriever', 'ElasticsearchFilterOnlyRetriever', None"
        f"OR modify rest_api/search.py to support your retriever")

if READER_MODEL_PATH:  # for extractive doc-qa
    if READER_TYPE == "TransformersReader":
        use_gpu = -1 if not USE_GPU else GPU_NUMBER
        reader = TransformersReader(
            model_name_or_path=READER_MODEL_PATH,
            use_gpu=use_gpu,
            context_window_size=CONTEXT_WINDOW_SIZE,
            tokenizer=READER_TOKENIZER)  # type: Optional[BaseReader]
    elif READER_TYPE == "FARMReader":
        reader = FARMReader(
            model_name_or_path=READER_MODEL_PATH,
            batch_size=BATCHSIZE,
            use_gpu=USE_GPU,
            context_window_size=CONTEXT_WINDOW_SIZE,
            top_k_per_candidate=TOP_K_PER_CANDIDATE,
            no_ans_boost=NO_ANS_BOOST,
            num_processes=MAX_PROCESSES,
            max_seq_len=MAX_SEQ_LEN,
            doc_stride=DOC_STRIDE,
        )  # type: Optional[BaseReader]
    else:
Example #10
        dicts = convert_files_to_dicts(dir_path=doc_dir,
                                       clean_func=clean_wiki_text,
                                       split_paragraphs=True)

        df = pd.DataFrame.from_dict(dicts)
        # Get embeddings for our questions from the FAQs
        questions = list(df["text"].values)
        df["question_emb"] = retriever.create_embedding(texts=questions)

        # Convert Dataframe to list of dicts and index them in our DocumentStore
        docs_to_index = df.to_dict(orient="records")
        # You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers)
        # It must take a str as input, and return a str.

        # Now, let's write the docs to our DB.
        document_store.write_documents(docs_to_index)

    reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1)

    # Init reader & and use Finder to get answer (same as in Tutorial 1)
    finder = Finder(reader=reader, retriever=retriever)

    prediction = finder.get_answers(question="Who is the father of Arya?",
                                    top_k_reader=3,
                                    top_k_retriever=5)

    print_answers(prediction, details="all")
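Note that the snippet assumes retriever was created beforehand; a plausible initialization for this FAQ-style setup is sketched below (the embedding model and parameter names are assumptions based on early Haystack versions):

from haystack.retriever.dense import EmbeddingRetriever

retriever = EmbeddingRetriever(document_store=document_store,
                               embedding_model="deepset/sentence_bert",  # assumed model
                               use_gpu=False)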
Example #11
class Evaluator:

    def __init__(self,
                 hugging_face_model_name: str = "distilbert-base-uncased-distilled-squad",
                 tokenizer_name: str = "distilbert-base-uncased",
                 cuda_is_available: bool = True):
        cuda_is_available = 0 if cuda_is_available else -1

        self.__reader = TransformersReader(model=hugging_face_model_name,
                                           tokenizer=tokenizer_name,
                                           context_window_size=512,
                                           use_gpu=cuda_is_available)

        self.__rouge = RougeCalculator(stopwords=False)

    def __evaluate_question_answer_pair(self, question: str, answer: str, context: str, identifier: int,
                                        verbose: bool = False) -> float:

        start = None

        if verbose:
            start = time()

        document = Document(identifier, context)

        predictions = self.__reader.predict(question=question, documents=[document], top_k=1)

        predicted_answer = predictions["answers"][0]["answer"]

        score = self.__compute_f1_measure(answer, predicted_answer)

        if verbose:
            end = time()

            print("Question: {}\nPredicted: {}\nGenerated: {}\nScore: {}\nTook {} seconds.\n_____________\n".format(
                question, predicted_answer,
                answer, score,
                end - start))
        return score

    def __compute_f1_measure(self, generated_answer: str, predicted_answer: str) -> float:

        if predicted_answer is None:
            predicted_answer = ""

        rouge_score = self.__rouge.rouge_n(summary=generated_answer, references=predicted_answer, n=1)

        return rouge_score

    def evaluate_question_answer_pairs(self, questions: List[str], answers: List[str], contexts: List[str],
                                       verbose: bool = False) -> float:
        """
        :param questions: A list of N questions
        :param answers: A list of N answers. The answer at index i is the answer to the question at index i in questions
        :param contexts: A list of N passages used to generate the questions. The context at index i belongs to the question at index i
        :param verbose: Print intermediate results.
        :return: The evaluation metric between 0 and 1.
        """

        if not (len(questions) == len(answers) == len(contexts)):
            raise Exception("Questions, Answers and Context lists must be of equal lengths.")

        question_answer_context_triplet = list(zip(questions, answers, contexts))

        score = 0

        counter = 1

        length = len(questions)

        for question, answer, context in question_answer_context_triplet:
            score += self.__evaluate_question_answer_pair(question, answer, context, counter, verbose)

            if verbose:
                print("\n> {} % done\n".format((counter / length) * 100))

            counter += 1

        score = score / length  # counter ends at length + 1, so average over the number of pairs

        print("\n\n[FINAL SCORE] ========> {}\n\n".format(score))

        return score
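A minimal usage sketch for the Evaluator class above (the data is illustrative):

# illustrative question/answer/context triple
evaluator = Evaluator(cuda_is_available=False)
score = evaluator.evaluate_question_answer_pairs(
    questions=["Who wrote Faust?"],
    answers=["Johann Wolfgang von Goethe"],
    contexts=["Faust is a tragic play written by Johann Wolfgang von Goethe."],
    verbose=True)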
Example #12
print(dicts[:3])

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="taschenhirn")

# Now, let's write the dicts containing documents to our DB.
document_store.write_documents(dicts)

# initialize sparse retriever:
retriever = ElasticsearchRetriever(document_store=document_store)

# Alternative:
reader = TransformersReader(model_name_or_path="Sahajtomar/GELECTRAQA",
                            tokenizer="Sahajtomar/GELECTRAQA")

# initialize pipe
pipe = ExtractiveQAPipeline(reader, retriever)

# You can configure how many candidates the reader and retriever shall return
# The higher top_k_retriever, the better (but also the slower) your answers.
#prediction = pipe.run(query="Welche Staaten grenzen an den Bodensee?", top_k_retriever=10, top_k_reader=5)
pipe.run(query="Welches ist der größte See Bayerns?",
         top_k_retriever=5,
         top_k_reader=2)

pipe.run(query="Wie weit erstreckt sich die Arktis?",
         top_k_retriever=5,
         top_k_reader=2)
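The pipe.run calls above discard their return value; to inspect the answers, keep the result and pass it to print_answers, as sketched below:

from haystack.utils import print_answers

prediction = pipe.run(query="Welches ist der größte See Bayerns?",
                      top_k_retriever=5,
                      top_k_reader=2)
print_answers(prediction, details="minimal")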
Example #13
def transformers_distilbert():
    return TransformersReader(
        model_name_or_path="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1)
Example #14
def get_neural_reader(reader_path, use_gpu=True):
    use_gpu = 0 if use_gpu else -1
    return TransformersReader(model=reader_path,
                              tokenizer=reader_path,
                              use_gpu=use_gpu)
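Usage is a one-liner; for example, with a SQuAD2 model hosted on the Hugging Face hub:

reader = get_neural_reader("deepset/roberta-base-squad2", use_gpu=False)  # force CPU inference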
Example #15
    retriever = ElasticsearchRetriever(document_store=document_store)
elif RETRIEVER_TYPE is None or RETRIEVER_TYPE == "ElasticsearchFilterOnlyRetriever":
    retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store)
else:
    raise ValueError(
        f"Could not load Retriever of type '{RETRIEVER_TYPE}'. "
        f"Please adjust RETRIEVER_TYPE to one of: "
        f"'EmbeddingRetriever', 'ElasticsearchRetriever', 'ElasticsearchFilterOnlyRetriever', None"
        f"OR modify rest_api/search.py to support your retriever")

if READER_MODEL_PATH:  # for extractive doc-qa
    if READER_TYPE == "TransformersReader":
        use_gpu = -1 if not USE_GPU else GPU_NUMBER
        reader = TransformersReader(
            model=str(READER_MODEL_PATH),
            use_gpu=use_gpu,
            context_window_size=CONTEXT_WINDOW_SIZE,
            tokenizer=str(READER_TOKENIZER))  # type: Optional[FARMReader]
    elif READER_TYPE == "FARMReader":
        reader = FARMReader(
            model_name_or_path=str(READER_MODEL_PATH),
            batch_size=BATCHSIZE,
            use_gpu=USE_GPU,
            context_window_size=CONTEXT_WINDOW_SIZE,
            top_k_per_candidate=TOP_K_PER_CANDIDATE,
            no_ans_boost=NO_ANS_BOOST,
            num_processes=MAX_PROCESSES,
            max_seq_len=MAX_SEQ_LEN,
            doc_stride=DOC_STRIDE,
        )  # type: Optional[FARMReader]
    else:
Example #16
retriever = TfidfRetriever(document_store=documentstore)

if READER_MODEL_PATH:  # for extractive doc-qa
    '''reader = FARMReader(
        model_name_or_path=str(READER_MODEL_PATH),
        batch_size=BATCHSIZE,
        use_gpu=USE_GPU,
        context_window_size=CONTEXT_WINDOW_SIZE,
        top_k_per_candidate=TOP_K_PER_CANDIDATE,
        no_ans_boost=NO_ANS_BOOST,
        num_processes=MAX_PROCESSES,
        max_seq_len=MAX_SEQ_LEN,
        doc_stride=DOC_STRIDE,
    )  # type: Optional[FARMReader]'''

    reader = TransformersReader(use_gpu=-1)
else:
    reader = None  # don't need one for pure FAQ matching

FINDERS = {1: Finder(reader=reader, retriever=retriever)}


#############################################
# Data schema for request & response
#############################################
class Question(BaseModel):
    questions: List[str]
    filters: Optional[Dict[str, Optional[str]]] = None
    top_k_reader: int = DEFAULT_TOP_K_READER
    top_k_retriever: int = DEFAULT_TOP_K_RETRIEVER
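For reference, a request payload this schema accepts, written as a Python dict (the values are illustrative; unset fields fall back to the defaults above):

# illustrative payload matching the Question schema
payload = {
    "questions": ["Who is the father of Arya?"],
    "filters": {"name": None},
    "top_k_reader": 3,
    "top_k_retriever": 5,
}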
Example #17
def main():
    # fetch model files if not present. not hosted in git repo
    model_exists = os.path.isfile(
        './kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin')
    if not model_exists:
        logging.info("Starting model download (700MB) ...")
        urllib.request.urlretrieve(
            "https://cdn.huggingface.co/mrm8488/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin",
            "./kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin")
        logging.info("model successfully downloaded")
    # start Elasticsearch
    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        status = subprocess.call(
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" --name "TeslaNew" elasticsearch:7.6.2',
            shell=True)
        if status:  # subprocess.call returns the exit code as an int, not a CompletedProcess
            raise Exception(
                "Failed to launch Elasticsearch. If you want to "
                "connect to an existing Elasticsearch instance "
                "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)

    # 512 dimensions because that is what the sentence transformer returns
    document_store = ElasticsearchDocumentStore(host="localhost",
                                                username="",
                                                password="",
                                                index="document",
                                                embedding_dim=512,
                                                embedding_field="embedding")

    # load docs in database
    if LAUNCH_ELASTICSEARCH or POPULATE_DOCUMENT_STORE:
        dicts = convert_files_to_dicts(dir_path=data_path,
                                       clean_func=clean_text,
                                       split_paragraphs=True)

        logging.info("files to dicts done.")
        # logging.info("first 10 dicts:", dicts[0:10])
        # write dicts containing the texts to the database
        document_store.write_documents(dicts)
        logging.info("documents to store written.")

        retriever = EmbeddingRetriever(
            document_store=document_store,
            embedding_model=retriever_model_name_full,
            model_format=retriever_model_type,
            gpu=False)
        # generate embeddings for each text and add them to the database entries
        document_store.update_embeddings(retriever)
        logging.info("embeddings to documents in store written.")

    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=retriever_model_name_full,
                                   model_format=retriever_model_type,
                                   gpu=False)

    # the reader won't be used for retrieval because it takes longer and the
    # quality is worse, but it still has to be initialized
    reader = TransformersReader(model="./kbQA/" + reader_model_name,
                                tokenizer="./kbQA/" + reader_model_name,
                                use_gpu=-1)
    finder = Finder(retriever=retriever, reader=reader)

    if TEST:
        try:
            with open("./kbQA/Test.json", encoding="utf-8") as file:
                times = []
                results = []
                failed = []
                # each line has multiple paragraphs and embeddings; read the
                # file line by line
                for line in file:
                    # parse the JSON string of the current line into a Python object
                    data = json.loads(line)
                    q = data["question"]
                    # fetch results from db
                    start_time = time.process_time()
                    candidate_docs = finder.retriever.retrieve(query=q,
                                                               filters=None,
                                                               top_k=5)
                    end_time = time.process_time()
                    times.append(end_time - start_time)
                    answered = False
                    for doc in candidate_docs:
                        if data["answer"] in doc.text:
                            answered = True
                            results.append(True)
                            break
                    if not answered:
                        answers = []
                        for doc in candidate_docs:
                            answers.append(doc.text)
                        failed.append({
                            "q": q,
                            "correct": data["answer"],
                            "a": answers
                        })
                total = sum(times)
                logging.info("Average time per request: %f",
                             total / len(times))
                logging.info("Questions answered correctly: %d/%d (%f)",
                             len(results), len(times),
                             len(results) / len(times))
                logging.debug("Failed questions:")
                for fail in failed:
                    logging.debug("Question: %s", fail["q"])
                    logging.debug("Correct Answer: %s", fail["correct"])
                    for answer in fail["a"]:
                        logging.debug(answer)

        except Exception as e:
            traceback.print_exc()
            logging.error(f"exception: {e}")
    else:
        # loop until a KeyboardInterrupt (Ctrl+C) or "!q" is entered
        while True:
            try:
                # read input from the console
                q = input("Enter:").strip()
                # input "!q" to stop execution
                if q == "!q":
                    exit(0)
                # fetch results from db
                candidate_docs = finder.retriever.retrieve(query=q,
                                                           filters=None,
                                                           top_k=5)
                for doc in candidate_docs:
                    logging.info("doc id: %s", doc.id)
                    logging.info("doc meta name: %s", doc.meta["name"])
                    logging.info("doc text: %s", doc.text)
                    logging.info("doc query score: %s", doc.query_score)
                    logging.info("")
                # not used
                # prediction = finder.get_answers(
                #     question=q, top_k_retriever=10, top_k_reader=5)
                # print_answers(prediction, details="medium")
            except Exception as e:
                traceback.print_exc()
                logging.error(f"exception: {e}")
Example #18
    retriever = ElasticsearchRetriever(document_store=document_store)
elif RETRIEVER_TYPE is None or RETRIEVER_TYPE == "ElasticsearchFilterOnlyRetriever":
    retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store)
else:
    raise ValueError(
        f"Could not load Retriever of type '{RETRIEVER_TYPE}'. "
        f"Please adjust RETRIEVER_TYPE to one of: "
        f"'EmbeddingRetriever', 'ElasticsearchRetriever', 'ElasticsearchFilterOnlyRetriever', None"
        f"OR modify rest_api/search.py to support your retriever")

if READER_MODEL_PATH:  # for extractive doc-qa
    if READER_TYPE == "TransformersReader":
        use_gpu = -1 if not USE_GPU else GPU_NUMBER
        reader = TransformersReader(
            model_name_or_path=READER_MODEL_PATH,
            use_gpu=use_gpu,
            context_window_size=CONTEXT_WINDOW_SIZE,
            return_no_answers=READER_CAN_HAVE_NO_ANSWER,
            tokenizer=READER_TOKENIZER)  # type: Optional[BaseReader]
    elif READER_TYPE == "FARMReader":
        reader = FARMReader(
            model_name_or_path=READER_MODEL_PATH,
            batch_size=BATCHSIZE,
            use_gpu=USE_GPU,
            context_window_size=CONTEXT_WINDOW_SIZE,
            top_k_per_candidate=TOP_K_PER_CANDIDATE,
            no_ans_boost=NO_ANS_BOOST,
            num_processes=MAX_PROCESSES,
            max_seq_len=MAX_SEQ_LEN,
            doc_stride=DOC_STRIDE,
        )  # type: Optional[BaseReader]
    else:
Example #19
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever

import os

# data = pd.read_csv('test.txt', sep='\t')
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="document")

retriever = ElasticsearchRetriever(document_store=document_store)
reader = TransformersReader(model_name_or_path='deepset/roberta-base-squad2',
                            tokenizer='deepset/roberta-base-squad2',
                            context_window_size=500,
                            use_gpu=-1)
# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, context_window_size=500)
finder = Finder(reader, retriever)

if __name__ == '__main__':
    # questions = ["What do we know about Bourin and Uchiyama?"]
    '''
    prediction = finder.get_answers(question="What do we know about symbiotic stars?",
                                    top_k_retriever=10, top_k_reader=3)
    print_answers(prediction, details='minimal')
    '''
    while True:
        qes = input('Question: ')
        # print(qes)
        prediction = finder.get_answers(question=qes,