Example #1
import os

# Imports assume Haystack 0.x; module paths changed in later releases.
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http


def fetch_data_from_repo(
        doc_dir="resources/data/website_data/",
        s3_url="https://github.com/Thabo-5/Chatbot-scraper/raw/master/txt_files.zip",
        doc_store=None):
    """
    Function to download data from s3 bucket/ github
    Parameters
    ----------
        doc_dir (str): path to destination folder
        s3_url (str): path to download zipped data
        doc_store (class): Haystack document store
    Returns
    -------
        document_store (object): Haystack document store object
    """
    document_store = doc_store if doc_store is not None else FAISSDocumentStore()
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Re-read and re-write every downloaded file so that any bytes that are
    # not valid UTF-8 are replaced, leaving clean UTF-8 text on disk.
    for filename in os.listdir(path=doc_dir):
        filepath = os.path.join(doc_dir, filename)
        with open(filepath, 'r', encoding='utf-8', errors='replace') as file:
            text = file.read()
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(text)
    # Convert files to dicts
    dicts = convert_files_to_dicts(dir_path=doc_dir,
                                   clean_func=clean_wiki_text,
                                   split_paragraphs=True)

    # Now, let's write the dicts containing documents to our DB.
    document_store.write_documents(dicts)
    return document_store
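
A minimal usage sketch (not part of the original example): call the function with its defaults and query the populated store through a simple sparse retriever. The TfidfRetriever import path and the Document.text attribute assume Haystack 0.x, and the query string is hypothetical.

document_store = fetch_data_from_repo()

# Build a lightweight tf-idf retriever over the populated store and run a query.
from haystack.retriever.sparse import TfidfRetriever

retriever = TfidfRetriever(document_store=document_store)
for doc in retriever.retrieve(query="What services do you offer?", top_k=3):
    print(doc.text[:100])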
Example #2
import logging
import subprocess
import time

# Imports assume Haystack 0.x; module paths changed in later releases.
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.preprocessor.utils import fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.retriever.sparse import ElasticsearchRetriever

logger = logging.getLogger(__name__)

LAUNCH_ELASTICSEARCH = True

# You can start Elasticsearch on your local machine using Docker. If Docker is not readily available in
# your environment (e.g., in Colab notebooks), you can manually download and run Elasticsearch from source.
if LAUNCH_ELASTICSEARCH:
    logger.info("Starting Elasticsearch ...")
    status = subprocess.run(
        ['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True
    )
    if status.returncode:
        raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance, "
                        "then set LAUNCH_ELASTICSEARCH in the script to False.")
    time.sleep(30)
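    # Optional readiness check (a sketch, not in the original tutorial): poll the
    # Elasticsearch HTTP endpoint so the script proceeds as soon as the node is up,
    # rather than relying only on the fixed sleep above. Assumes `requests` is installed.
    import requests
    for _ in range(30):
        try:
            if requests.get("http://localhost:9200", timeout=1).status_code == 200:
                break
        except requests.exceptions.RequestException:
            time.sleep(1)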

# Download evaluation data, which is a subset of the Natural Questions development set containing 50 documents
doc_dir = "../data/nq"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset.json.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document", create_index=False)
# Add evaluation data to Elasticsearch database
if LAUNCH_ELASTICSEARCH:
    document_store.add_eval_data("../data/nq/nq_dev_subset.json")
else:
    logger.warning("Since we already have a running ES instance, we should not index the same documents again. "
                   "If you still want to do this, call 'document_store.add_eval_data('../data/nq/nq_dev_subset.json')' manually.")

# Initialize Retriever
retriever = ElasticsearchRetriever(document_store=document_store)

# Initialize Reader
reader = FARMReader("deepset/roberta-base-squad2")
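
# With the evaluation data indexed, the retriever can be scored against the labels.
# A sketch following the standard Haystack 0.x evaluation tutorial; the exact
# result keys may differ between versions.
retriever_eval_results = retriever.eval(top_k=10)

# Recall: fraction of questions where a document containing the answer was retrieved.
print("Retriever Recall:", retriever_eval_results["recall"])
# Mean average precision rewards ranking relevant documents higher.
print("Retriever Mean Avg Precision:", retriever_eval_results["map"])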