Example #1
# Imports below assume an early (0.x) Haystack release; these module paths
# moved in later versions.
import re

from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.reader.farm import FARMReader
from haystack.retriever.dense import DensePassageRetriever

# Two document stores over the same corpus: one holding dense DPR embeddings,
# one for sparse (BM25) retrieval.
document_store_dense = ElasticsearchDocumentStore(host="localhost",
                                                  username="",
                                                  password="",
                                                  index="document",
                                                  embedding_field="embedding",
                                                  embedding_dim=768)
document_store_sparse = ElasticsearchDocumentStore(host="localhost",
                                                   username="",
                                                   password="",
                                                   index="document")

# base_corpus() is assumed to be defined elsewhere in this project; it
# returns the corpus as a list of document dicts.
document_store_dense.write_documents(base_corpus())
dense_retriever = DensePassageRetriever(document_store=document_store_dense,
                                        embedding_model="dpr-bert-base-nq",
                                        do_lower_case=True,
                                        use_gpu=True)
# Compute DPR embeddings for all documents already in the dense store.
document_store_dense.update_embeddings(dense_retriever)
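# A minimal sketch of how the sparse store pairs with a BM25-style retriever
# for comparison against DPR. ElasticsearchRetriever and its import path are
# assumptions for this Haystack release; the retrieve() signature matches the
# one used in Example #3 below.
from haystack.retriever.sparse import ElasticsearchRetriever

document_store_sparse.write_documents(base_corpus())
sparse_retriever = ElasticsearchRetriever(document_store=document_store_sparse)

query = "Who is the father of Arya Stark?"
for doc in dense_retriever.retrieve(query=query, filters=None, top_k=5):
    print("dense:", doc.text[:80])
for doc in sparse_retriever.retrieve(query=query, filters=None, top_k=5):
    print("sparse:", doc.text[:80])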

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                    use_gpu=True)

# Regex building blocks for the rule-based sentence splitter below
# (raw strings avoid Python's invalid-escape-sequence warnings).
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"


def split_into_sentences(text):
    # Rule-based splitter: periods that do not end a sentence (titles such as
    # "Dr.", single-letter initials, acronyms, suffixes, website TLDs) are
    # protected with a <prd> placeholder, real sentence ends are marked with
    # <stop>, and the text is split on those markers.
    text = " " + text + "  "
    text = text.replace("\n", " ")
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
                  "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                  "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    return [s.strip() for s in text.split("<stop>") if s.strip()]
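# A quick check of the splitter: the "Dr." title must not end a sentence,
# while the real boundaries must.
for sentence in split_into_sentences("Dr. Stark lives in Winterfell. "
                                     "He has a daughter. Her name is Arya!"):
    print(sentence)
# -> Dr. Stark lives in Winterfell. / He has a daughter. / Her name is Arya!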

Example #2
# The opening of this snippet mirrors Haystack's introductory DPR tutorial;
# doc_dir (the folder with the text files) and clean_wiki_text (the
# tutorial's cleaning function) are assumed names here.
document_store = ElasticsearchDocumentStore(host="localhost",
                                            username="",
                                            password="",
                                            index="document",
                                            embedding_field="embedding",
                                            embedding_dim=768)

# Convert files to dicts containing one text per paragraph.
dicts = convert_files_to_dicts(dir_path=doc_dir,
                               clean_func=clean_wiki_text,
                               split_paragraphs=True)

# Now, let's write the docs to our DB.
document_store.write_documents(dicts[:16])

### Retriever
retriever = DensePassageRetriever(document_store=document_store,
                                  embedding_model="dpr-bert-base-nq",
                                  do_lower_case=True,
                                  use_gpu=True)
# Important:
# Now that the DPR retriever is initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representations.
# While this can be a time-consuming operation (depending on corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it to the existing doc embeddings, which is very fast.
document_store.update_embeddings(retriever)

### Reader
# Load a local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                    use_gpu=True)

### Finder
# The Finder sticks together reader and retriever in a pipeline to answer our actual questions.
finder = Finder(reader, retriever)

### Voilà! Ask a question!
# You can configure how many candidates the reader and retriever return.
# The higher top_k_retriever is, the better (but also slower) your answers will be.
prediction = finder.get_answers(question="Who is the father of Arya Stark?",
                                top_k_retriever=10, top_k_reader=5)
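# Pretty-print the prediction with Haystack's print_answers helper (the same
# utility used in Example #3 below).
from haystack.utils import print_answers
print_answers(prediction, details="medium")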
Example #3
def main():
    # fetch model files if not present; they are not hosted in the git repo
    # model_exists = os.path.isfile(
    #     './kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin')
    # if not model_exists:
    #     logging.info("Starting model download (about 700MB) ...")
    #     urllib.request.urlretrieve(
    #         "https://cdn.huggingface.co/mrm8488/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin",
    #         "./kbQA/bert-multi-cased-finetuned-xquadv1/pytorch_model.bin")
    #     logging.info("model successfully downloaded")
    # start Elasticsearch
    if LAUNCH_ELASTICSEARCH:
        logging.info("Starting Elasticsearch ...")
        # subprocess.run returns a CompletedProcess with a returncode
        # attribute (subprocess.call returns a bare int)
        status = subprocess.run(
            'docker run -d -p 9200:9200 -e "discovery.type=single-node" --name "MLQA2" elasticsearch:7.6.2',
            shell=True
        )
        if status.returncode:
            raise Exception("Failed to launch Elasticsearch. If you want to "
                            "connect to an existing Elasticsearch instance, "
                            "then set LAUNCH_ELASTICSEARCH in the script to False.")
        time.sleep(15)
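        # A sturdier alternative to the fixed sleep (a sketch, assuming the
        # `requests` package is installed): poll Elasticsearch's root endpoint
        # until it responds instead of hoping 15 seconds is enough.
        # import requests
        # for _ in range(30):
        #     try:
        #         if requests.get("http://localhost:9200").status_code == 200:
        #             break
        #     except requests.exceptions.ConnectionError:
        #         pass
        #     time.sleep(1)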

    # 512 dimensions because that is what the sentence transformer returns
    document_store = ElasticsearchDocumentStore(host="localhost", username="",
                                                password="", index="document",
                                                embedding_dim=512,
                                                embedding_field="embedding")

    # The retriever embeds documents at indexing time and queries at search
    # time; create it once and reuse it for both.
    retriever = EmbeddingRetriever(document_store=document_store,
                                   embedding_model=retriever_model_name_full,
                                   model_format=retriever_model_type,
                                   gpu=False)

    # load docs into the database
    if LAUNCH_ELASTICSEARCH or POPULATE_DOCUMENT_STORE:
        dicts = convert_files_to_dicts(
            dir_path=data_path, clean_func=clean_text, split_paragraphs=True)

        logging.info("files to dicts done.")
        # write dicts containing the texts to the database
        document_store.write_documents(dicts)
        logging.info("documents to store written.")

        # generate an embedding for each text and add it to the database entry
        document_store.update_embeddings(retriever)
        logging.info("embeddings to documents in store written.")

    # The reader is not used during retrieval here: with it, answers take
    # longer and, in this setup, their quality was worse. Uncomment the block
    # below to re-enable it.
    # reader = TransformersReader(model="./kbQA/" + reader_model_name,
    #                             tokenizer="./kbQA/" + reader_model_name,
    #                             use_gpu=-1)
    finder = Finder(retriever=retriever, reader=None)

    if TEST:
        try:
            with open("./kbQA/Test.json", encoding="utf-8") as file:
                times = []
                results = []
                failed = []
                # each line holds one JSON object (a question, its answer,
                # plus paragraphs and embeddings); read the file line by line
                for line in file:
                    # parse the JSON string of the current line into a Python object
                    data = json.loads(line)
                    q = data["question"]
                    # fetch results from db
                    start_time = time.process_time()
                    candidate_docs = finder.retriever.retrieve(
                        query=q, filters=None, top_k=5)
                    end_time = time.process_time()
                    times.append(end_time-start_time)
                    answered = False
                    for doc in candidate_docs:
                        if data["answer"] in doc.text:
                            answered = True
                            results.append(True)
                            break
                    if not answered:
                        answers = []
                        for doc in candidate_docs:
                            answers.append(doc.text)
                        failed.append(
                            {"q": q, "correct": data["answer"], "a": answers})
                total = sum(times)
                logging.info("Average time per request: %f",
                             total / len(times))
                logging.info("Questions answered correctly: %d/%d (%f)",
                             len(results), len(times), len(results)/len(times))
                logging.info("Failed questions:")
                for fail in failed:
                    logging.info("Question: %s", fail["q"])
                    logging.info("Correct Answer: %s", fail["correct"])
                    for answer in fail["a"]:
                        logging.info(answer)

        except Exception as e:
            traceback.print_exc()
            logging.error(f"exception: {e}")
    else:
        # loop until Keyboard-Interrupt event ctrl+c or "!q" input
        while True:
            try:
                # read a query from console input
                q = input("Enter:").strip()
                # input "!q" to stop execution
                if q == "!q":
                    exit(0)
                # fetch results from db
                candidate_docs = finder.retriever.retrieve(
                    query=q, filters=None, top_k=5)
                for doc in candidate_docs:
                    logging.info("doc id: %s", doc.id)
                    logging.info("doc meta name: %s", doc.meta["name"])
                    logging.info("doc text: %s", doc.text)
                    logging.info("doc query score: %s", doc.query_score)
                    logging.info("")
                # not used
                # prediction = finder.get_answers(
                #     question=q, top_k_retriever=10, top_k_reader=5)
                # print_answers(prediction, details="medium")
            except Exception as e:
                traceback.print_exc()
                logging.error(f"exception: {e}")