from haystack.database.sql import SQLDocumentStore
from haystack.indexing.io import write_documents_to_db


def test_db_write_read():
    sql_document_store = SQLDocumentStore()
    write_documents_to_db(document_store=sql_document_store, document_dir="samples/docs")
    documents = sql_document_store.get_all_documents()
    assert len(documents) == 2
    doc = sql_document_store.get_document_by_id("1")
    assert doc.keys() == {"id", "name", "text", "tags"}
def test_db_write_read():
    from haystack.database import db
    db.drop_all()
    db.create_all()
    write_documents_to_db(document_dir="samples/docs")
    documents = db.session.query(Document).order_by(Document.text).all()
    assert len(documents) == 2
    assert documents[0].text == ('A Doc specifically talking about haystack.\n'
                                 'Haystack can be used to scale QA models to large document collections.')
def test_sql_write_read():
    sql_document_store = SQLDocumentStore()
    write_documents_to_db(document_store=sql_document_store, document_dir="samples/docs")
    documents = sql_document_store.get_all_documents()
    assert len(documents) == 2
    doc = sql_document_store.get_document_by_id("1")
    assert doc.id
    assert doc.text
from time import sleep

from haystack.database.elasticsearch import ElasticsearchDocumentStore


def test_elasticsearch_write_read(elasticsearch_fixture):
    document_store = ElasticsearchDocumentStore()
    write_documents_to_db(document_store=document_store, document_dir="samples/docs")
    sleep(2)  # wait for documents to be available for query
    documents = document_store.get_all_documents()
    assert len(documents) == 2
    assert documents[0].id
    assert documents[0].text
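# The Elasticsearch test above depends on an `elasticsearch_fixture` that is not
# shown here. A minimal pytest sketch of such a fixture, assuming Docker is
# available on the test machine; the container name, image tag, and boot wait
# are illustrative, not taken from the original test suite.
import subprocess
import time

import pytest


@pytest.fixture(scope="session")
def elasticsearch_fixture():
    # Start a disposable single-node Elasticsearch container.
    subprocess.run(
        ["docker", "run", "-d", "--name", "haystack_test_es", "-p", "9200:9200",
         "-e", "discovery.type=single-node", "elasticsearch:7.6.1"],
        check=True,
    )
    time.sleep(15)  # give Elasticsearch time to boot before the tests query it
    yield
    # Tear the container down after the test session.
    subprocess.run(["docker", "rm", "-f", "haystack_test_es"], check=False)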
def write_to_db():
    try:
        # TODO: Get DOCS_DIR from config
        write_documents_to_db(document_store=document_store, document_dir=doc_dir,
                              only_empty_db=True, split_paragraphs=True)
        return True
    except Exception:
        return jsonify("Cannot write to DB")
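# `jsonify` suggests this handler lives in a Flask app. A minimal sketch of how
# it might be wired up; the app setup and route path are assumptions, not part
# of the original code.
from flask import Flask, jsonify

app = Flask(__name__)

# Hypothetical route registration for the handler above.
app.add_url_rule("/write_to_db", view_func=write_to_db, methods=["POST"])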
""" Load preprocessed text files into ES """ import logging import subprocess import time from haystack import Finder from haystack.database.elasticsearch import ElasticsearchDocumentStore from haystack.indexing.cleaning import clean_wiki_text from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http from haystack.reader.farm import FARMReader from haystack.reader.transformers import TransformersReader from haystack.utils import print_answers from haystack.retriever.elasticsearch import ElasticsearchRetriever #load data into ES doc_dir = "/home/sebastian/SideProject/QA/wikiextractor/preprocessed/folder_1" document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document") write_documents_to_db( document_store=document_store, document_dir=doc_dir, #clean_func=clean_wiki_text, only_empty_db=False, split_paragraphs=True)
from haystack.database.sql import SQLDocumentStore
from haystack.indexing.cleaning import clean_wiki_text
from haystack.indexing.io import write_documents_to_db, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.retriever.tfidf import TfidfRetriever

# Let's first get some documents that we want to query
# Here: 517 Wikipedia articles for Game of Thrones
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# The documents can be stored in different types of "DocumentStores".
# For dev we suggest a lightweight SQL DB.
# For production we suggest Elasticsearch.
document_store = SQLDocumentStore(url="sqlite:///qa.db")

# Now, let's write the docs to our DB.
# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers).
# It must take a str as input and return a str.
write_documents_to_db(document_store=document_store, document_dir=doc_dir,
                      clean_func=clean_wiki_text, only_empty_db=True)

## Initialize Reader, Retriever & Finder

# A retriever identifies the k most promising chunks of text that might contain the answer to our question.
# Retrievers use simple but fast algorithms, here: TF-IDF.
retriever = TfidfRetriever(document_store=document_store)

# A reader scans the text chunks in detail and extracts the k best answers.
# Readers use more powerful but slower deep learning models.
# You can select a local model or any of the QA models published on Hugging Face's model hub
# (https://huggingface.co/models), here: a medium-sized BERT QA model trained via FARM on SQuAD 2.0.
reader = FARMReader(model_name_or_path="deepset/bert-base-cased-squad2", use_gpu=False)
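# As noted above, a `clean_func` just has to map str to str. A minimal
# illustrative sketch of a custom one; the footer marker is hypothetical.
def clean_footer(text: str) -> str:
    # Drop everything after a known footer marker (hypothetical marker).
    return text.split("== References ==")[0].strip()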
# TODO: Enable CORS
MODELS_DIRS = ["model"]
USE_GPU = False
BATCH_SIZE = 16

## Indexing & cleaning documents
# Init a database (default: sqlite)
from haystack.database import db

db.create_all()

# Let's first get some documents that we want to query
# Here: 517 Wikipedia articles for Game of Thrones
doc_dir = "../data"
# Note: requires changing the function in io.py and adding encoding='utf-8' in our case
write_documents_to_db(document_dir=doc_dir, clean_func=clean_wiki_text)  # , only_empty_db=True

app = FastAPI(title="Haystack API for Taschenhirn", version="0.1")

#############################################
# Load all models in memory
#############################################
model_paths = []
for model_dir in MODELS_DIRS:
    path = Path(model_dir)
    if path.is_dir():
        models = [f for f in path.iterdir() if f.is_dir()]