# NOTE: the helper functions used below (get_document_store, get_retriever, get_reader,
# prepare_data, index_to_doc_store, eval_data_from_file, _download_extract_downstream_data,
# populate_reader_json, populate_retriever_json, FAISSDocumentStore) and the module-level
# config (data_dir, filename, doc_index, label_index, reader_models, reader_types,
# n_passages, n_docs_options, retriever_doc_stores, es_similarity, n_queries,
# results_file, index_results_file, query_results_file, ...) are assumed to be
# imported or defined elsewhere in this benchmark suite.
import datetime
import logging
import time
import traceback
from time import perf_counter

import pandas as pd

logger = logging.getLogger(__name__)


def benchmark_reader():
    reader_results = []
    doc_store = get_document_store("elasticsearch")
    docs, labels = eval_data_from_file(data_dir / filename)
    index_to_doc_store(doc_store, docs, None, labels)
    for reader_name in reader_models:
        for reader_type in reader_types:
            try:
                reader = get_reader(reader_name, reader_type)
                results = reader.eval(document_store=doc_store,
                                      doc_index=doc_index,
                                      label_index=label_index,
                                      device="cuda")
                # print(results)
                results["passages_per_second"] = n_passages / results["reader_time"]
                results["reader"] = reader_name
                results["error"] = ""
                reader_results.append(results)
            except Exception as e:
                results = {
                    "EM": 0.,
                    "f1": 0.,
                    "top_n_accuracy": 0.,
                    "top_n": 0,
                    "reader_time": 0.,
                    "passages_per_second": 0.,
                    "seconds_per_query": 0.,
                    "reader": reader_name,
                    "error": str(e),
                }
                reader_results.append(results)
    reader_df = pd.DataFrame.from_records(reader_results)
    reader_df.to_csv("reader_results.csv")
def benchmark_indexing():
    retriever_results = []
    for n_docs in n_docs_options:
        for retriever_name, doc_store_name in retriever_doc_stores:
            doc_store = get_document_store(doc_store_name, es_similarity=es_similarity)
            retriever = get_retriever(retriever_name, doc_store)
            docs, _ = prepare_data(data_dir, filename_gold, filename_negative, n_docs=n_docs)

            tic = perf_counter()
            index_to_doc_store(doc_store, docs, retriever)
            toc = perf_counter()
            indexing_time = toc - tic

            print(indexing_time)
            retriever_results.append({
                "retriever": retriever_name,
                "doc_store": doc_store_name,
                "n_docs": n_docs,
                "indexing_time": indexing_time,
                "docs_per_second": n_docs / indexing_time,
                "date_time": datetime.datetime.now(),
            })
            retriever_df = pd.DataFrame.from_records(retriever_results)
            retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store")
            retriever_df.to_csv("retriever_index_results.csv")

            doc_store.delete_all_documents(index=doc_index)
            doc_store.delete_all_documents(index=label_index)
            time.sleep(10)
            del doc_store
            del retriever
def benchmark_reader(ci=False, update_json=False, save_markdown=False, **kwargs):
    if ci:
        reader_models = reader_models_ci
        max_docs = 100
        # heuristic to estimate num of passages for the reduced num of docs
        n_passages = n_total_passages * (max_docs / n_total_docs)
    else:
        reader_models = reader_models_full
        max_docs = None
        n_passages = n_total_passages

    reader_results = []
    doc_store = get_document_store("elasticsearch")
    # download squad data
    _download_extract_downstream_data(input_file=data_dir / filename)
    docs, labels = eval_data_from_file(data_dir / filename, max_docs)

    index_to_doc_store(doc_store, docs, None, labels)
    for reader_name in reader_models:
        for reader_type in reader_types:
            logger.info(f"##### Start reader run - model: {reader_name}, type: {reader_type} #####")
            try:
                reader = get_reader(reader_name, reader_type)
                results = reader.eval(document_store=doc_store,
                                      doc_index=doc_index,
                                      label_index=label_index,
                                      device="cuda")
                # results = reader.eval_on_file(data_dir, filename, device="cuda")
                print(results)
                results["passages_per_second"] = n_passages / results["reader_time"]
                results["reader"] = reader_name
                results["error"] = ""
                reader_results.append(results)
            except Exception as e:
                results = {
                    "EM": 0.,
                    "f1": 0.,
                    "top_n_accuracy": 0.,
                    "top_n": 0,
                    "reader_time": 0.,
                    "passages_per_second": 0.,
                    "seconds_per_query": 0.,
                    "reader": reader_name,
                    "error": str(e),
                }
                reader_results.append(results)

    reader_df = pd.DataFrame.from_records(reader_results)
    reader_df.to_csv(results_file)
    if save_markdown:
        md_file = results_file.replace(".csv", ".md")
        with open(md_file, "w") as f:
            f.write(str(reader_df.to_markdown()))
    if update_json:
        populate_reader_json()
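# Hedged usage sketch (not part of the original module): benchmark_reader pulls its
# model lists and paths from module-level config (reader_models_ci, reader_models_full,
# reader_types, data_dir, filename, results_file, ...) and needs a reachable
# Elasticsearch instance for get_document_store("elasticsearch"). Under those
# assumptions, a cheap CI-style smoke run that also writes a Markdown summary could
# look like the function below; it is defined here but never called.
def _example_reader_run():
    # ci=True caps the corpus at 100 docs and switches to the reduced model list,
    # so the run finishes quickly; a full benchmark drops ci=True.
    benchmark_reader(ci=True, update_json=False, save_markdown=True)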
def benchmark_querying():
    """Benchmark the time it takes to perform querying. Doc embeddings are loaded from file."""
    retriever_results = []
    for n_docs in n_docs_options:
        for retriever_name, doc_store_name in retriever_doc_stores:
            try:
                logger.info(f"##### Start run: {retriever_name}, {doc_store_name}, {n_docs} docs #####")
                doc_store = get_document_store(doc_store_name, es_similarity=es_similarity)
                retriever = get_retriever(retriever_name, doc_store)
                # For DPR, precomputed embeddings are loaded from file
                add_precomputed = retriever_name in ["dpr"]
                docs, labels = prepare_data(data_dir,
                                            filename_gold,
                                            filename_negative,
                                            n_docs=n_docs,
                                            n_queries=n_queries,
                                            add_precomputed=add_precomputed)
                logger.info("Start indexing...")
                index_to_doc_store(doc_store, docs, retriever, labels)
                logger.info("Start queries...")
                raw_results = retriever.eval()
                results = {
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "n_queries": raw_results["n_questions"],
                    "retrieve_time": raw_results["retrieve_time"],
                    "queries_per_second": raw_results["n_questions"] / raw_results["retrieve_time"],
                    "seconds_per_query": raw_results["retrieve_time"] / raw_results["n_questions"],
                    "recall": raw_results["recall"],
                    "map": raw_results["map"],
                    "top_k": raw_results["top_k"],
                    "date_time": datetime.datetime.now(),
                    "error": None,
                }
                doc_store.delete_all_documents()
                time.sleep(5)
                del doc_store
                del retriever
            except Exception:
                tb = traceback.format_exc()
                results = {
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "n_queries": 0,
                    "retrieve_time": 0.,
                    "queries_per_second": 0.,
                    "seconds_per_query": 0.,
                    "recall": 0.,
                    "map": 0.,
                    "top_k": 0,
                    "date_time": datetime.datetime.now(),
                    "error": str(tb),
                }
            logger.info(results)
            retriever_results.append(results)

            retriever_df = pd.DataFrame.from_records(retriever_results)
            retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store")
            retriever_df.to_csv("retriever_query_results.csv")
def benchmark_indexing(n_docs_options, retriever_doc_stores, data_dir, filename_gold,
                       filename_negative, data_s3_url, embeddings_filenames, embeddings_dir,
                       update_json, save_markdown, **kwargs):
    retriever_results = []
    for n_docs in n_docs_options:
        for retriever_name, doc_store_name in retriever_doc_stores:
            logger.info(f"##### Start indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs #####")
            try:
                doc_store = get_document_store(doc_store_name)
                retriever = get_retriever(retriever_name, doc_store)
                docs, _ = prepare_data(data_dir=data_dir,
                                       filename_gold=filename_gold,
                                       filename_negative=filename_negative,
                                       remote_url=data_s3_url,
                                       embeddings_filenames=embeddings_filenames,
                                       embeddings_dir=embeddings_dir,
                                       n_docs=n_docs)

                tic = perf_counter()
                index_to_doc_store(doc_store, docs, retriever)
                toc = perf_counter()
                indexing_time = toc - tic

                print(indexing_time)
                retriever_results.append({
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "indexing_time": indexing_time,
                    "docs_per_second": n_docs / indexing_time,
                    "date_time": datetime.datetime.now(),
                    "error": None,
                })
                retriever_df = pd.DataFrame.from_records(retriever_results)
                retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store")
                retriever_df.to_csv(index_results_file)

                logger.info("Deleting all docs from this run ...")
                if isinstance(doc_store, FAISSDocumentStore):
                    doc_store.session.close()
                else:
                    doc_store.delete_all_documents(index=doc_index)
                    doc_store.delete_all_documents(index=label_index)
                if save_markdown:
                    md_file = index_results_file.replace(".csv", ".md")
                    with open(md_file, "w") as f:
                        f.write(str(retriever_df.to_markdown()))
                time.sleep(10)
                del doc_store
                del retriever
            except Exception:
                tb = traceback.format_exc()
                logger.error(f"##### The following Error was raised while running indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs #####")
                logger.error(tb)
                retriever_results.append({
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "indexing_time": 0,
                    "docs_per_second": 0,
                    "date_time": datetime.datetime.now(),
                    "error": str(tb),
                })
                logger.info("Deleting all docs from this run ...")
                if isinstance(doc_store, FAISSDocumentStore):
                    doc_store.session.close()
                else:
                    doc_store.delete_all_documents(index=doc_index)
                    doc_store.delete_all_documents(index=label_index)
                time.sleep(10)
                del doc_store
                del retriever

    if update_json:
        populate_retriever_json()
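# Hedged usage sketch (an assumption, not in the original file): benchmark_indexing
# takes n_docs_options as a list of corpus sizes to sweep and retriever_doc_stores as
# (retriever_name, doc_store_name) pairs; any extra keys are absorbed by **kwargs.
# All concrete values below are illustrative placeholders. Defined but never called:
def _example_indexing_run():
    from pathlib import Path
    benchmark_indexing(
        n_docs_options=[1_000, 10_000],
        retriever_doc_stores=[("elastic", "elasticsearch"), ("dpr", "faiss")],
        data_dir=Path("data"),
        filename_gold="gold.json",                      # placeholder filenames
        filename_negative="negative.tsv",
        data_s3_url="https://example.com/benchmarks/",  # placeholder remote location
        embeddings_filenames=["embeddings.pkl"],
        embeddings_dir=Path("embeddings"),
        update_json=False,
        save_markdown=True,
    )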
def benchmark_querying(n_docs_options, retriever_doc_stores, data_dir, data_s3_url,
                       filename_gold, filename_negative, n_queries, embeddings_filenames,
                       embeddings_dir, update_json, save_markdown, **kwargs):
    """Benchmark the time it takes to perform querying. Doc embeddings are loaded from file."""
    retriever_results = []
    for n_docs in n_docs_options:
        for retriever_name, doc_store_name in retriever_doc_stores:
            try:
                logger.info(f"##### Start querying run: {retriever_name}, {doc_store_name}, {n_docs} docs #####")
                if retriever_name in ["elastic", "sentence_transformers"]:
                    similarity = "cosine"
                else:
                    similarity = "dot_product"
                doc_store = get_document_store(doc_store_name, similarity=similarity)
                retriever = get_retriever(retriever_name, doc_store)
                # For DPR, precomputed embeddings are loaded from file
                add_precomputed = retriever_name in ["dpr"]
                docs, labels = prepare_data(data_dir=data_dir,
                                            filename_gold=filename_gold,
                                            filename_negative=filename_negative,
                                            remote_url=data_s3_url,
                                            embeddings_filenames=embeddings_filenames,
                                            embeddings_dir=embeddings_dir,
                                            n_docs=n_docs,
                                            n_queries=n_queries,
                                            add_precomputed=add_precomputed)
                logger.info("Start indexing...")
                index_to_doc_store(doc_store, docs, retriever, labels)
                logger.info("Start queries...")
                raw_results = retriever.eval()
                results = {
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "n_queries": raw_results["n_questions"],
                    "retrieve_time": raw_results["retrieve_time"],
                    "queries_per_second": raw_results["n_questions"] / raw_results["retrieve_time"],
                    "seconds_per_query": raw_results["retrieve_time"] / raw_results["n_questions"],
                    "recall": raw_results["recall"] * 100,
                    "map": raw_results["map"] * 100,
                    "top_k": raw_results["top_k"],
                    "date_time": datetime.datetime.now(),
                    "error": None,
                }
                logger.info("Deleting all docs from this run ...")
                if isinstance(doc_store, FAISSDocumentStore):
                    doc_store.session.close()
                else:
                    doc_store.delete_all_documents(index=doc_index)
                    doc_store.delete_all_documents(index=label_index)
                time.sleep(5)
                del doc_store
                del retriever
            except Exception:
                tb = traceback.format_exc()
                logger.error(f"##### The following Error was raised while running querying run: {retriever_name}, {doc_store_name}, {n_docs} docs #####")
                logger.error(tb)
                results = {
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "n_queries": 0,
                    "retrieve_time": 0.,
                    "queries_per_second": 0.,
                    "seconds_per_query": 0.,
                    "recall": 0.,
                    "map": 0.,
                    "top_k": 0,
                    "date_time": datetime.datetime.now(),
                    "error": str(tb),
                }
                logger.info("Deleting all docs from this run ...")
                if isinstance(doc_store, FAISSDocumentStore):
                    doc_store.session.close()
                else:
                    doc_store.delete_all_documents(index=doc_index)
                    doc_store.delete_all_documents(index=label_index)
                time.sleep(5)
                del doc_store
                del retriever
            logger.info(results)
            retriever_results.append(results)

            retriever_df = pd.DataFrame.from_records(retriever_results)
            retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store")
            retriever_df.to_csv(query_results_file)
            if save_markdown:
                md_file = query_results_file.replace(".csv", ".md")
                with open(md_file, "w") as f:
                    f.write(str(retriever_df.to_markdown()))
    if update_json:
        populate_retriever_json()
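# Design note (an inference, not stated in the original): the similarity branch in
# benchmark_querying matches the doc store's scoring function to how each retriever's
# embeddings were trained: sentence-transformers models are typically trained for
# cosine similarity, while DPR is trained with dot product. A single-run sketch with
# purely illustrative placeholder values; defined but never called:
def _example_querying_run():
    from pathlib import Path
    benchmark_querying(
        n_docs_options=[1_000],
        retriever_doc_stores=[("dpr", "faiss")],        # exercises the dot_product branch
        data_dir=Path("data"),
        data_s3_url="https://example.com/benchmarks/",  # placeholder remote location
        filename_gold="gold.json",                      # placeholder filenames
        filename_negative="negative.tsv",
        n_queries=100,
        embeddings_filenames=["embeddings.pkl"],
        embeddings_dir=Path("embeddings"),
        update_json=False,
        save_markdown=False,
    )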
def benchmark_indexing(n_docs_options, retriever_doc_stores, data_dir, filename_gold,
                       filename_negative, data_s3_url, embeddings_filenames, embeddings_dir,
                       update_json, **kwargs):
    retriever_results = []
    for n_docs in n_docs_options:
        for retriever_name, doc_store_name in retriever_doc_stores:
            logger.info(f"##### Start indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs #####")
            try:
                doc_store = get_document_store(doc_store_name)
                retriever = get_retriever(retriever_name, doc_store)
                docs, _ = prepare_data(data_dir=data_dir,
                                       filename_gold=filename_gold,
                                       filename_negative=filename_negative,
                                       data_s3_url=data_s3_url,
                                       embeddings_filenames=embeddings_filenames,
                                       embeddings_dir=embeddings_dir,
                                       n_docs=n_docs)

                tic = perf_counter()
                index_to_doc_store(doc_store, docs, retriever)
                toc = perf_counter()
                indexing_time = toc - tic

                print(indexing_time)
                retriever_results.append({
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "indexing_time": indexing_time,
                    "docs_per_second": n_docs / indexing_time,
                    "date_time": datetime.datetime.now(),
                    "error": None,
                })
                retriever_df = pd.DataFrame.from_records(retriever_results)
                retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store")
                retriever_df.to_csv(index_results_file)

                doc_store.delete_all_documents(index=doc_index)
                doc_store.delete_all_documents(index=label_index)
                time.sleep(10)
                del doc_store
                del retriever
            except Exception:
                tb = traceback.format_exc()
                logger.error(f"##### The following Error was raised while running indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs #####")
                logger.error(tb)
                retriever_results.append({
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "indexing_time": 0,
                    "docs_per_second": 0,
                    "date_time": datetime.datetime.now(),
                    "error": str(tb),
                })
                doc_store.delete_all_documents(index=doc_index)
                doc_store.delete_all_documents(index=label_index)
                time.sleep(10)
                del doc_store
                del retriever

    if update_json:
        populate_retriever_json()
def benchmark_querying(n_docs_options, retriever_doc_stores, data_dir, data_s3_url,
                       filename_gold, filename_negative, n_queries, embeddings_filenames,
                       embeddings_dir, update_json, **kwargs):
    """Benchmark the time it takes to perform querying. Doc embeddings are loaded from file."""
    retriever_results = []
    for n_docs in n_docs_options:
        for retriever_name, doc_store_name in retriever_doc_stores:
            try:
                logger.info(f"##### Start querying run: {retriever_name}, {doc_store_name}, {n_docs} docs #####")
                doc_store = get_document_store(doc_store_name)
                retriever = get_retriever(retriever_name, doc_store)
                # For DPR, precomputed embeddings are loaded from file
                add_precomputed = retriever_name in ["dpr"]
                docs, labels = prepare_data(data_dir=data_dir,
                                            filename_gold=filename_gold,
                                            filename_negative=filename_negative,
                                            data_s3_url=data_s3_url,
                                            embeddings_filenames=embeddings_filenames,
                                            embeddings_dir=embeddings_dir,
                                            n_docs=n_docs,
                                            n_queries=n_queries,
                                            add_precomputed=add_precomputed)
                logger.info("Start indexing...")
                index_to_doc_store(doc_store, docs, retriever, labels)
                logger.info("Start queries...")
                raw_results = retriever.eval()
                results = {
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "n_queries": raw_results["n_questions"],
                    "retrieve_time": raw_results["retrieve_time"],
                    "queries_per_second": raw_results["n_questions"] / raw_results["retrieve_time"],
                    "seconds_per_query": raw_results["retrieve_time"] / raw_results["n_questions"],
                    "recall": raw_results["recall"],
                    "map": raw_results["map"],
                    "top_k": raw_results["top_k"],
                    "date_time": datetime.datetime.now(),
                    "error": None,
                }
                doc_store.delete_all_documents(index=doc_index)
                doc_store.delete_all_documents(index=label_index)
                time.sleep(5)
                del doc_store
                del retriever
            except Exception:
                tb = traceback.format_exc()
                logger.error(f"##### The following Error was raised while running querying run: {retriever_name}, {doc_store_name}, {n_docs} docs #####")
                logger.error(tb)
                results = {
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "n_queries": 0,
                    "retrieve_time": 0.,
                    "queries_per_second": 0.,
                    "seconds_per_query": 0.,
                    "recall": 0.,
                    "map": 0.,
                    "top_k": 0,
                    "date_time": datetime.datetime.now(),
                    "error": str(tb),
                }
                doc_store.delete_all_documents(index=doc_index)
                doc_store.delete_all_documents(index=label_index)
                time.sleep(5)
                del doc_store
                del retriever
            logger.info(results)
            retriever_results.append(results)

            retriever_df = pd.DataFrame.from_records(retriever_results)
            retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store")
            retriever_df.to_csv(query_results_file)
    if update_json:
        populate_retriever_json()
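# Hedged driver sketch (an assumption, not part of the original file): every benchmark
# entry point above accepts **kwargs, which suggests the intended calling convention is
# one flat params dict splatted into all of them, with each function picking out only
# the keys it needs and ignoring the rest. All values below are illustrative placeholders.
if __name__ == "__main__":
    from pathlib import Path
    params = {
        "n_docs_options": [1_000],
        "retriever_doc_stores": [("elastic", "elasticsearch")],
        "data_dir": Path("data"),
        "data_s3_url": "https://example.com/benchmarks/",  # placeholder remote location
        "filename_gold": "gold.json",                      # placeholder filenames
        "filename_negative": "negative.tsv",
        "n_queries": 100,
        "embeddings_filenames": ["embeddings.pkl"],
        "embeddings_dir": Path("embeddings"),
        "update_json": False,
        "save_markdown": False,
    }
    benchmark_indexing(**params)   # n_queries is simply absorbed by **kwargs here
    benchmark_querying(**params)
    benchmark_reader(ci=True, **params)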