def test_tfidf_retriever():
    """Check that TfidfRetriever returns the matching document for a query.

    Three short documents are written to an in-memory document store, the
    retriever is fitted, and the query "godzilla" with ``top_k=1`` must
    return exactly the first document.
    """
    from haystack.retriever.tfidf import TfidfRetriever
    from haystack.database.memory import InMemoryDocumentStore

    test_docs = [
        {"name": "testing the finder 1", "text": "godzilla says hello"},
        {"name": "testing the finder 2", "text": "optimus prime says bye"},
        {"name": "testing the finder 3", "text": "alien says arghh"},
    ]

    document_store = InMemoryDocumentStore()
    document_store.write_documents(test_docs)

    retriever = TfidfRetriever(document_store)
    retriever.fit()

    hits = retriever.retrieve("godzilla", top_k=1)

    # NOTE(review): `Document` is not imported inside this function; it is
    # assumed to be available at module level -- confirm.
    expected = [
        Document(
            id='0',
            text='godzilla says hello',
            external_source_id=None,
            question=None,
            query_score=None,
            meta={},
        )
    ]
    assert hits == expected
def test_finder_get_answers_with_in_memory_store():
    """Smoke-test Finder.get_answers on top of an in-memory document store.

    Writes three documents (each carrying a ``meta`` dict) to an
    InMemoryDocumentStore, wires a TfidfRetriever and a TransformersReader
    into a Finder, and only asserts that a prediction object comes back.
    """
    from haystack.database.memory import InMemoryDocumentStore

    test_docs = [
        {
            "name": "testing the finder 1",
            "text": "testing the finder with pyhton unit test 1",
            'meta': {'url': 'url'},
        },
        {
            "name": "testing the finder 2",
            "text": "testing the finder with pyhton unit test 2",
            'meta': {'url': 'url'},
        },
        {
            "name": "testing the finder 3",
            "text": "testing the finder with pyhton unit test 3",
            'meta': {'url': 'url'},
        },
    ]

    document_store = InMemoryDocumentStore()
    document_store.write_documents(test_docs)

    retriever = TfidfRetriever(document_store=document_store)
    # use_gpu=-1 presumably selects CPU execution -- TODO confirm against
    # the TransformersReader documentation.
    reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1,
    )

    prediction = Finder(reader, retriever).get_answers(
        question="testing finder",
        top_k_retriever=10,
        top_k_reader=5,
    )
    assert prediction is not None
def test_finder_get_answers():
    """Smoke-test Finder.get_answers on top of an SQL document store.

    Writes three documents to an SQLDocumentStore opened on
    ``sqlite:///qa_test.db``, wires a TfidfRetriever and a
    TransformersReader into a Finder, and only asserts that a prediction
    object comes back.
    """
    test_docs = [
        {
            "name": "testing the finder 1",
            "text": "testing the finder with pyhton unit test 1",
            "meta": {"test": "test"},
        },
        {
            "name": "testing the finder 2",
            "text": "testing the finder with pyhton unit test 2",
            "meta": {"test": "test"},
        },
        {
            "name": "testing the finder 3",
            "text": "testing the finder with pyhton unit test 3",
            "meta": {"test": "test"},
        },
    ]

    document_store = SQLDocumentStore(url="sqlite:///qa_test.db")
    document_store.write_documents(test_docs)

    retriever = TfidfRetriever(document_store=document_store)
    # use_gpu=-1 presumably selects CPU execution -- TODO confirm against
    # the TransformersReader documentation.
    reader = TransformersReader(
        model="distilbert-base-uncased-distilled-squad",
        tokenizer="distilbert-base-uncased",
        use_gpu=-1,
    )

    prediction = Finder(reader, retriever).get_answers(
        question="testing finder",
        top_k_retriever=10,
        top_k_reader=5,
    )
    assert prediction is not None
def qa(self, question, text_field):
    """Answer ``question`` using documents from the Elasticsearch index.

    Builds a fresh document store, retriever, reader and Finder on every
    call, then returns whatever ``Finder.get_answers`` produces.

    :param question: question passed through to ``Finder.get_answers``.
    :param text_field: forwarded as ``text_field`` to the
        ElasticsearchDocumentStore constructor.
    :return: the prediction returned by ``Finder.get_answers``.
    """
    # Connection settings come from module-level constants; the index name
    # is read from the instance.
    store = ElasticsearchDocumentStore(
        host=ES_HOST,
        username=ES_USERNAME,
        password=ES_PASSWORD,
        index=self.ELASTIC_INDEX,
        text_field=text_field,
    )
    tfidf = TfidfRetriever(document_store=store)
    farm_reader = FARMReader(
        model_name_or_path="deepset/roberta-base-squad2",
        use_gpu=False,
    )
    pipeline = Finder(farm_reader, tfidf)

    # A single retrieved document is handed to the reader, which is asked
    # for up to five answers.
    return pipeline.get_answers(
        question=question,
        top_k_retriever=1,
        top_k_reader=5,
    )
) if EMBEDDING_MODEL_PATH: retriever = EmbeddingRetriever( document_store=document_store, embedding_model=EMBEDDING_MODEL_PATH, model_format=EMBEDDING_MODEL_FORMAT, gpu=USE_GPU ) # type: BaseRetriever else: retriever = ElasticsearchRetriever(document_store=document_store)''' documentstore = SQLDocumentStore(url="sqlite:///qa.db") retriever = TfidfRetriever(document_store=documentstore) if READER_MODEL_PATH: # for extractive doc-qa '''reader = FARMReader( model_name_or_path=str(READER_MODEL_PATH), batch_size=BATCHSIZE, use_gpu=USE_GPU, context_window_size=CONTEXT_WINDOW_SIZE, top_k_per_candidate=TOP_K_PER_CANDIDATE, no_ans_boost=NO_ANS_BOOST, num_processes=MAX_PROCESSES, max_seq_len=MAX_SEQ_LEN, doc_stride=DOC_STRIDE, ) # type: Optional[FARMReader]''' reader = TransformersReader(use_gpu=-1)
## Indexing & cleaning documents
# Init a database (default: SQLite)
# NOTE(review): no database is initialised in the statements below; the
# header above looks stale -- confirm and update.

# Gather every sub-directory found directly under the candidate model
# folders; candidates that are not directories are skipped.
model_paths = []
for model_dir in MODELS_DIRS:
    path = Path(model_dir)
    if not path.is_dir():
        continue
    model_paths.extend(child for child in path.iterdir() if child.is_dir())
# model_paths = [Path('./model')]

# Only logged: start-up continues and FINDERS simply stays empty.
if not model_paths:
    logger.error(
        f"Could not find any model to load. Checked folders: {MODELS_DIRS}")

# One retriever instance is shared by all finders.
retriever = TfidfRetriever()

# Finder IDs start at 1 and follow the discovery order of model_paths.
FINDERS = {}
for idx, model_dir in enumerate(model_paths, start=1):
    reader = FARMReader(
        model_dir=str(model_dir),
        batch_size=BATCH_SIZE,
        use_gpu=USE_GPU,
    )
    FINDERS[idx] = Finder(reader, retriever)
    logger.info(f"Initialized Finder (ID={idx}) with model '{model_dir}'")

logger.info(
    "Open http://127.0.0.1:8000/docs to see Swagger API Documentation.")
logger.info(
    """ Or just try it out directly: curl --request POST --url 'http://127.0.0.1:8000/finders/1/ask' --data '{"question": "Who is the father of Arya Starck?"}'"""
)

#############################################
# --- Service configuration -------------------------------------------------
MODELS_DIRS = ["saved_models", "models", "model"]
USE_GPU = False
BATCH_SIZE = 16
DATABASE_URL = "sqlite:///qa.db"
MODEL_PATHS = ['deepset/bert-base-cased-squad2']

app = FastAPI(title="Haystack API", version="0.1")

# Only logged: start-up continues and FINDERS simply stays empty.
if not MODEL_PATHS:
    logger.error(
        f"No model to load. Please specify one via MODEL_PATHS (e.g. ['deepset/bert-base-cased-squad2']"
    )

# --- Retrieval backend -----------------------------------------------------
# One document store and one retriever are shared by all finders.
datastore = SQLDocumentStore(url=DATABASE_URL)
retriever = TfidfRetriever(datastore=datastore)

# --- One Finder per configured reader model --------------------------------
# Finder IDs start at 1 and follow the order of MODEL_PATHS.
FINDERS = {}
for idx, model_dir in enumerate(MODEL_PATHS, start=1):
    reader = FARMReader(
        model_name_or_path=str(model_dir),
        batch_size=BATCH_SIZE,
        use_gpu=USE_GPU,
    )
    FINDERS[idx] = Finder(reader, retriever)
    logger.info(f"Initialized Finder (ID={idx}) with model '{model_dir}'")

logger.info(
    "Open http://127.0.0.1:8000/docs to see Swagger API Documentation.")
logger.info(
    """ Or just try it out directly: curl --request POST --url 'http://127.0.0.1:8000/finders/1/ask' --data '{"question": "Who is the father of Arya Starck?"}'"""
)