Example #1
def benchmark_reader():
    reader_results = []
    doc_store = get_document_store("elasticsearch")
    docs, labels = eval_data_from_file(data_dir / filename)
    index_to_doc_store(doc_store, docs, None, labels)
    for reader_name in reader_models:
        for reader_type in reader_types:
            try:
                reader = get_reader(reader_name, reader_type)
                results = reader.eval(document_store=doc_store,
                                      doc_index=doc_index,
                                      label_index=label_index,
                                      device="cuda")
                # print(results)
                results["passages_per_second"] = n_passages / results[
                    "reader_time"]
                results["reader"] = reader_name
                results["error"] = ""
                reader_results.append(results)
            except Exception as e:
                results = {
                    'EM': 0.,
                    'f1': 0.,
                    'top_n_accuracy': 0.,
                    'top_n': 0,
                    'reader_time': 0.,
                    "passages_per_second": 0.,
                    "seconds_per_query": 0.,
                    'reader': reader_name,
                    "error": e
                }
                reader_results.append(results)
            reader_df = pd.DataFrame.from_records(reader_results)
            reader_df.to_csv("reader_results.csv")
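The reader benchmark above depends on module-level configuration that is not shown in the snippet. A minimal sketch of the globals it assumes follows; the names are taken from the code itself, while every concrete value (paths, index names, model list, passage count) is an illustrative assumption rather than the original configuration.
# Hypothetical setup for the reader benchmark; all values below are illustrative assumptions.
from pathlib import Path
import pandas as pd

data_dir = Path("data/squad")                     # assumed local data directory
filename = "dev-v2.0.json"                        # assumed SQuAD-style eval file
doc_index = "eval_document"                       # document index in the doc store
label_index = "label"                             # label index in the doc store
n_passages = 12_350                               # assumed passage count for the eval set
reader_types = ["farm"]                           # reader framework(s) to benchmark
reader_models = ["deepset/roberta-base-squad2"]   # reader model(s) to benchmark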
Example #2
def benchmark_indexing():

    retriever_results = []
    for n_docs in n_docs_options:
        for retriever_name, doc_store_name in retriever_doc_stores:
            doc_store = get_document_store(doc_store_name, es_similarity=es_similarity)

            retriever = get_retriever(retriever_name, doc_store)

            docs, _ = prepare_data(data_dir, filename_gold, filename_negative, n_docs=n_docs)

            tic = perf_counter()
            index_to_doc_store(doc_store, docs, retriever)
            toc = perf_counter()
            indexing_time = toc - tic

            print(indexing_time)

            retriever_results.append({
                "retriever": retriever_name,
                "doc_store": doc_store_name,
                "n_docs": n_docs,
                "indexing_time": indexing_time,
                "docs_per_second": n_docs / indexing_time,
                "date_time": datetime.datetime.now()})
            retriever_df = pd.DataFrame.from_records(retriever_results)
            retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store")
            retriever_df.to_csv("retriever_index_results.csv")

            doc_store.delete_all_documents(index=doc_index)
            doc_store.delete_all_documents(index=label_index)
            time.sleep(10)
            del doc_store
            del retriever
Example #3
def benchmark_reader(ci=False,
                     update_json=False,
                     save_markdown=False,
                     **kwargs):
    if ci:
        reader_models = reader_models_ci
        max_docs = 100
        # heuristic to estimate num of passages for the reduced num of docs
        n_passages = n_total_passages * (max_docs / n_total_docs)
    else:
        reader_models = reader_models_full
        max_docs = None
        n_passages = n_total_passages
    reader_results = []
    doc_store = get_document_store("elasticsearch")
    # download squad data
    _download_extract_downstream_data(input_file=data_dir / filename)
    docs, labels = eval_data_from_file(data_dir / filename, max_docs)

    index_to_doc_store(doc_store, docs, None, labels)
    for reader_name in reader_models:
        for reader_type in reader_types:
            logger.info(
                f"##### Start reader run - model:{reader_name}, type: {reader_type} ##### "
            )
            try:
                reader = get_reader(reader_name, reader_type)
                results = reader.eval(document_store=doc_store,
                                      doc_index=doc_index,
                                      label_index=label_index,
                                      device="cuda")
                # results = reader.eval_on_file(data_dir, filename, device="cuda")
                print(results)
                results["passages_per_second"] = n_passages / results[
                    "reader_time"]
                results["reader"] = reader_name
                results["error"] = ""
                reader_results.append(results)
            except Exception as e:
                results = {
                    'EM': 0.,
                    'f1': 0.,
                    'top_n_accuracy': 0.,
                    'top_n': 0,
                    'reader_time': 0.,
                    "passages_per_second": 0.,
                    "seconds_per_query": 0.,
                    'reader': reader_name,
                    "error": e
                }
                reader_results.append(results)
            reader_df = pd.DataFrame.from_records(reader_results)
            reader_df.to_csv(results_file)
            if save_markdown:
                md_file = results_file.replace(".csv", ".md")
                with open(md_file, "w") as f:
                    f.write(str(reader_df.to_markdown()))
    if update_json:
        populate_reader_json()
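The CI branch in Example #3 scales the passage count down proportionally when only max_docs documents are evaluated. A worked instance with illustrative totals (the counts below are assumptions, not values from the snippet):
# Illustrative numbers only: suppose the full eval set has 2,346 docs and 12,350 passages.
n_total_docs = 2346
n_total_passages = 12350
max_docs = 100
n_passages = n_total_passages * (max_docs / n_total_docs)  # ~526 passages for the CI run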
Example #4
def benchmark_querying():
    """ Benchmark the time it takes to perform querying. Doc embeddings are loaded from file."""
    retriever_results = []
    for n_docs in n_docs_options:
        for retriever_name, doc_store_name in retriever_doc_stores:
            try:
                logger.info(f"##### Start run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ")
                doc_store = get_document_store(doc_store_name, es_similarity=es_similarity)
                retriever = get_retriever(retriever_name, doc_store)
                add_precomputed = retriever_name in ["dpr"]
                # For DPR, precomputed embeddings are loaded from file
                docs, labels = prepare_data(data_dir,
                                            filename_gold,
                                            filename_negative,
                                            n_docs=n_docs,
                                            n_queries=n_queries,
                                            add_precomputed=add_precomputed)
                logger.info("Start indexing...")
                index_to_doc_store(doc_store, docs, retriever, labels)
                logger.info("Start queries...")

                raw_results = retriever.eval()
                results = {
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "n_queries": raw_results["n_questions"],
                    "retrieve_time": raw_results["retrieve_time"],
                    "queries_per_second": raw_results["n_questions"] / raw_results["retrieve_time"],
                    "seconds_per_query": raw_results["retrieve_time"]/ raw_results["n_questions"],
                    "recall": raw_results["recall"],
                    "map": raw_results["map"],
                    "top_k": raw_results["top_k"],
                    "date_time": datetime.datetime.now(),
                    "error": None
                }

                doc_store.delete_all_documents()
                time.sleep(5)
                del doc_store
                del retriever
            except Exception as e:
                tb = traceback.format_exc()
                results = {
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "n_queries": 0,
                    "retrieve_time": 0.,
                    "queries_per_second": 0.,
                    "seconds_per_query": 0.,
                    "recall": 0.,
                    "map": 0.,
                    "top_k": 0,
                    "date_time": datetime.datetime.now(),
                    "error": str(tb)
                }
            logger.info(results)
            retriever_results.append(results)

            retriever_df = pd.DataFrame.from_records(retriever_results)
            retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store")
            retriever_df.to_csv("retriever_query_results.csv")
Example #5
def benchmark_indexing(n_docs_options, retriever_doc_stores, data_dir,
                       filename_gold, filename_negative, data_s3_url,
                       embeddings_filenames, embeddings_dir, update_json,
                       save_markdown, **kwargs):

    retriever_results = []
    for n_docs in n_docs_options:
        for retriever_name, doc_store_name in retriever_doc_stores:
            logger.info(
                f"##### Start indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### "
            )
            try:
                doc_store = get_document_store(doc_store_name)
                retriever = get_retriever(retriever_name, doc_store)
                docs, _ = prepare_data(
                    data_dir=data_dir,
                    filename_gold=filename_gold,
                    filename_negative=filename_negative,
                    remote_url=data_s3_url,
                    embeddings_filenames=embeddings_filenames,
                    embeddings_dir=embeddings_dir,
                    n_docs=n_docs)

                tic = perf_counter()
                index_to_doc_store(doc_store, docs, retriever)
                toc = perf_counter()
                indexing_time = toc - tic

                print(indexing_time)

                retriever_results.append({
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "indexing_time": indexing_time,
                    "docs_per_second": n_docs / indexing_time,
                    "date_time": datetime.datetime.now(),
                    "error": None
                })
                retriever_df = pd.DataFrame.from_records(retriever_results)
                retriever_df = retriever_df.sort_values(
                    by="retriever").sort_values(by="doc_store")
                retriever_df.to_csv(index_results_file)
                logger.info("Deleting all docs from this run ...")

                if isinstance(doc_store, FAISSDocumentStore):
                    doc_store.session.close()
                else:
                    doc_store.delete_all_documents(index=doc_index)
                    doc_store.delete_all_documents(index=label_index)

                if save_markdown:
                    md_file = index_results_file.replace(".csv", ".md")
                    with open(md_file, "w") as f:
                        f.write(str(retriever_df.to_markdown()))
                time.sleep(10)
                del doc_store
                del retriever

            except Exception:
                tb = traceback.format_exc()
                logging.error(
                    f"##### The following Error was raised while running indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs #####"
                )
                logging.error(tb)
                retriever_results.append({
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "indexing_time": 0,
                    "docs_per_second": 0,
                    "date_time": datetime.datetime.now(),
                    "error": str(tb)
                })
                logger.info("Deleting all docs from this run ...")
                if isinstance(doc_store, FAISSDocumentStore):
                    doc_store.session.close()
                else:
                    doc_store.delete_all_documents(index=doc_index)
                    doc_store.delete_all_documents(index=label_index)
                time.sleep(10)
                del doc_store
                del retriever
    if update_json:
        populate_retriever_json()
Example #6
def benchmark_querying(n_docs_options, retriever_doc_stores, data_dir,
                       data_s3_url, filename_gold, filename_negative,
                       n_queries, embeddings_filenames, embeddings_dir,
                       update_json, save_markdown, **kwargs):
    """ Benchmark the time it takes to perform querying. Doc embeddings are loaded from file."""
    retriever_results = []

    for n_docs in n_docs_options:
        for retriever_name, doc_store_name in retriever_doc_stores:
            try:
                logger.info(
                    f"##### Start querying run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### "
                )
                if retriever_name in ["elastic", "sentence_transformers"]:
                    similarity = "cosine"
                else:
                    similarity = "dot_product"
                doc_store = get_document_store(doc_store_name,
                                               similarity=similarity)
                retriever = get_retriever(retriever_name, doc_store)
                add_precomputed = retriever_name in ["dpr"]
                # For DPR, precomputed embeddings are loaded from file
                docs, labels = prepare_data(
                    data_dir=data_dir,
                    filename_gold=filename_gold,
                    filename_negative=filename_negative,
                    remote_url=data_s3_url,
                    embeddings_filenames=embeddings_filenames,
                    embeddings_dir=embeddings_dir,
                    n_docs=n_docs,
                    n_queries=n_queries,
                    add_precomputed=add_precomputed)
                logger.info("Start indexing...")
                index_to_doc_store(doc_store, docs, retriever, labels)
                logger.info("Start queries...")

                raw_results = retriever.eval()
                results = {
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "n_queries": raw_results["n_questions"],
                    "retrieve_time": raw_results["retrieve_time"],
                    "queries_per_second": raw_results["n_questions"] / raw_results["retrieve_time"],
                    "seconds_per_query": raw_results["retrieve_time"] / raw_results["n_questions"],
                    "recall": raw_results["recall"] * 100,
                    "map": raw_results["map"] * 100,
                    "top_k": raw_results["top_k"],
                    "date_time": datetime.datetime.now(),
                    "error": None
                }

                logger.info("Deleting all docs from this run ...")
                if isinstance(doc_store, FAISSDocumentStore):
                    doc_store.session.close()
                else:
                    doc_store.delete_all_documents(index=doc_index)
                    doc_store.delete_all_documents(index=label_index)
                time.sleep(5)
                del doc_store
                del retriever
            except Exception:
                tb = traceback.format_exc()
                logging.error(
                    f"##### The following Error was raised while running querying run: {retriever_name}, {doc_store_name}, {n_docs} docs #####"
                )
                logging.error(tb)
                results = {
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "n_queries": 0,
                    "retrieve_time": 0.,
                    "queries_per_second": 0.,
                    "seconds_per_query": 0.,
                    "recall": 0.,
                    "map": 0.,
                    "top_k": 0,
                    "date_time": datetime.datetime.now(),
                    "error": str(tb)
                }
                logger.info("Deleting all docs from this run ...")
                if isinstance(doc_store, FAISSDocumentStore):
                    doc_store.session.close()
                else:
                    doc_store.delete_all_documents(index=doc_index)
                    doc_store.delete_all_documents(index=label_index)
                time.sleep(5)
                del doc_store
                del retriever
            logger.info(results)
            retriever_results.append(results)

            retriever_df = pd.DataFrame.from_records(retriever_results)
            retriever_df = retriever_df.sort_values(
                by="retriever").sort_values(by="doc_store")
            retriever_df.to_csv(query_results_file)
            if save_markdown:
                md_file = query_results_file.replace(".csv", ".md")
                with open(md_file, "w") as f:
                    f.write(str(retriever_df.to_markdown()))
    if update_json:
        populate_retriever_json()
Example #7
def benchmark_indexing(n_docs_options, retriever_doc_stores, data_dir,
                       filename_gold, filename_negative, data_s3_url,
                       embeddings_filenames, embeddings_dir, update_json,
                       **kwargs):

    retriever_results = []
    for n_docs in n_docs_options:
        for retriever_name, doc_store_name in retriever_doc_stores:
            logger.info(
                f"##### Start indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### "
            )
            try:
                doc_store = get_document_store(doc_store_name)
                retriever = get_retriever(retriever_name, doc_store)
                docs, _ = prepare_data(
                    data_dir=data_dir,
                    filename_gold=filename_gold,
                    filename_negative=filename_negative,
                    data_s3_url=data_s3_url,
                    embeddings_filenames=embeddings_filenames,
                    embeddings_dir=embeddings_dir,
                    n_docs=n_docs)

                tic = perf_counter()
                index_to_doc_store(doc_store, docs, retriever)
                toc = perf_counter()
                indexing_time = toc - tic

                print(indexing_time)

                retriever_results.append({
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "indexing_time": indexing_time,
                    "docs_per_second": n_docs / indexing_time,
                    "date_time": datetime.datetime.now(),
                    "error": None
                })
                retriever_df = pd.DataFrame.from_records(retriever_results)
                retriever_df = retriever_df.sort_values(
                    by="retriever").sort_values(by="doc_store")
                retriever_df.to_csv(index_results_file)
                doc_store.delete_all_documents(index=doc_index)
                doc_store.delete_all_documents(index=label_index)
                time.sleep(10)
                del doc_store
                del retriever

            except Exception as e:
                tb = traceback.format_exc()
                logging.error(
                    f"##### The following Error was raised while running indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs #####"
                )
                logging.error(tb)
                retriever_results.append({
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "indexing_time": 0,
                    "docs_per_second": 0,
                    "date_time": datetime.datetime.now(),
                    "error": str(tb)
                })
                doc_store.delete_all_documents(index=doc_index)
                doc_store.delete_all_documents(index=label_index)
                time.sleep(10)
                del doc_store
                del retriever
    if update_json:
        populate_retriever_json()
Example #8
def benchmark_querying(n_docs_options, retriever_doc_stores, data_dir,
                       data_s3_url, filename_gold, filename_negative,
                       n_queries, embeddings_filenames, embeddings_dir,
                       update_json, **kwargs):
    """ Benchmark the time it takes to perform querying. Doc embeddings are loaded from file."""
    retriever_results = []

    for n_docs in n_docs_options:
        for retriever_name, doc_store_name in retriever_doc_stores:
            try:
                logger.info(
                    f"##### Start querying run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### "
                )
                doc_store = get_document_store(doc_store_name)
                retriever = get_retriever(retriever_name, doc_store)
                add_precomputed = retriever_name in ["dpr"]
                # For DPR, precomputed embeddings are loaded from file
                docs, labels = prepare_data(
                    data_dir=data_dir,
                    filename_gold=filename_gold,
                    filename_negative=filename_negative,
                    data_s3_url=data_s3_url,
                    embeddings_filenames=embeddings_filenames,
                    embeddings_dir=embeddings_dir,
                    n_docs=n_docs,
                    n_queries=n_queries,
                    add_precomputed=add_precomputed)
                logger.info("Start indexing...")
                index_to_doc_store(doc_store, docs, retriever, labels)
                logger.info("Start queries...")

                raw_results = retriever.eval()
                results = {
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "n_queries": raw_results["n_questions"],
                    "retrieve_time": raw_results["retrieve_time"],
                    "queries_per_second": raw_results["n_questions"] / raw_results["retrieve_time"],
                    "seconds_per_query": raw_results["retrieve_time"] / raw_results["n_questions"],
                    "recall": raw_results["recall"],
                    "map": raw_results["map"],
                    "top_k": raw_results["top_k"],
                    "date_time": datetime.datetime.now(),
                    "error": None
                }

                doc_store.delete_all_documents(index=doc_index)
                doc_store.delete_all_documents(index=label_index)
                time.sleep(5)
                del doc_store
                del retriever
            except Exception as e:
                tb = traceback.format_exc()
                logging.error(
                    f"##### The following Error was raised while running querying run: {retriever_name}, {doc_store_name}, {n_docs} docs #####"
                )
                logging.error(tb)
                results = {
                    "retriever": retriever_name,
                    "doc_store": doc_store_name,
                    "n_docs": n_docs,
                    "n_queries": 0,
                    "retrieve_time": 0.,
                    "queries_per_second": 0.,
                    "seconds_per_query": 0.,
                    "recall": 0.,
                    "map": 0.,
                    "top_k": 0,
                    "date_time": datetime.datetime.now(),
                    "error": str(tb)
                }
                doc_store.delete_all_documents(index=doc_index)
                doc_store.delete_all_documents(index=label_index)
                time.sleep(5)
                del doc_store
                del retriever
            logger.info(results)
            retriever_results.append(results)

            retriever_df = pd.DataFrame.from_records(retriever_results)
            retriever_df = retriever_df.sort_values(
                by="retriever").sort_values(by="doc_store")
            retriever_df.to_csv(query_results_file)
    if update_json:
        populate_retriever_json()
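The parameterised variants above (Examples #5 through #8) expect their configuration to be passed in by a caller. A minimal, hypothetical driver is sketched below; every value, filename, and URL is an illustrative placeholder, not part of the original benchmark setup.
# Hypothetical entry point; all parameter values below are illustrative assumptions.
if __name__ == "__main__":
    params = {
        "n_docs_options": [1_000, 10_000],
        "retriever_doc_stores": [("elastic", "elasticsearch"), ("dpr", "faiss")],
        "data_dir": "data/retriever",
        "filename_gold": "gold_docs.json",
        "filename_negative": "negative_docs.jsonl",
        "data_s3_url": "https://example-bucket.s3.amazonaws.com/",  # placeholder URL
        "embeddings_filenames": ["passage_embeddings_0.pkl"],
        "embeddings_dir": "embeddings/",
        "n_queries": 100,
        "update_json": False,
        "save_markdown": True,
    }
    benchmark_indexing(**params)
    benchmark_querying(**params)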