Example #1
    def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_index: str = "label",
                      batch_size: Optional[int] = None, preprocessor: Optional[PreProcessor] = None,
                      max_docs: Optional[Union[int, bool]] = None):
        """
        Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
        If a jsonl file and a batch_size are passed to the function, documents are loaded batch-wise
        from disk and indexed batch-wise into the DocumentStore in order to prevent out-of-memory errors.

        :param filename: Name of the file containing evaluation data (json or jsonl)
        :param doc_index: Elasticsearch index where evaluation documents should be stored
        :param label_index: Elasticsearch index where labeled questions should be stored
        :param batch_size: Optional number of documents that are loaded and processed at a time.
                           When set to None (default) all documents are processed at once.
        :param preprocessor: Optional PreProcessor to preprocess evaluation documents.
                             It can be used for splitting documents into passages (and assigning labels to corresponding passages).
                             Currently the PreProcessor does not support split_by='sentence', cleaning, or split_overlap != 0.
                             When set to None (default) preprocessing is disabled.
        :param max_docs: Optional number of documents that will be loaded.
                         When set to None (default) all available eval documents are used.

        """
        # TODO improve support for PreProcessor when adding eval data
        if preprocessor is not None:
            assert preprocessor.split_by != "sentence", \
                "Split by sentence is not supported.\n" \
                "Please set 'split_by' to either 'word' or 'passage' in the supplied PreProcessor."
            assert preprocessor.split_overlap == 0, \
                "Overlapping documents are currently not supported when adding eval data.\n" \
                "Please set 'split_overlap=0' in the supplied PreProcessor."
            assert preprocessor.clean_empty_lines == False, \
                "clean_empty_lines is currently not supported when adding eval data.\n" \
                "Please set 'clean_empty_lines=False' in the supplied PreProcessor."
            assert preprocessor.clean_whitespace == False, \
                "clean_whitespace is currently not supported when adding eval data.\n" \
                "Please set 'clean_whitespace=False' in the supplied PreProcessor."
            assert preprocessor.clean_header_footer == False, \
                "clean_header_footer is currently not supported when adding eval data.\n" \
                "Please set 'clean_header_footer=False' in the supplied PreProcessor."

        file_path = Path(filename)
        if file_path.suffix == ".json":
            if batch_size is None:
                docs, labels = eval_data_from_json(filename, max_docs=max_docs, preprocessor=preprocessor)
                self.write_documents(docs, index=doc_index)
                self.write_labels(labels, index=label_index)
            else:
                jsonl_filename = (file_path.parent / (file_path.stem + '.jsonl')).as_posix()
                logger.info(f"Adding evaluation data batch-wise is not compatible with json-formatted SQuAD files. "
                            f"Converting json to jsonl to: {jsonl_filename}")
                squad_json_to_jsonl(filename, jsonl_filename)
                self.add_eval_data(jsonl_filename, doc_index, label_index, batch_size,
                                   preprocessor=preprocessor, max_docs=max_docs)

        elif file_path.suffix == ".jsonl":
            for docs, labels in eval_data_from_jsonl(filename, batch_size, max_docs=max_docs, preprocessor=preprocessor):
                if docs:
                    self.write_documents(docs, index=doc_index)
                if labels:
                    self.write_labels(labels, index=label_index)

        else:
            logger.error("File needs to be in json or jsonl format.")
Example #2
def prepare_data(data_dir,
                 filename_gold,
                 filename_negative,
                 remote_url,
                 embeddings_filenames,
                 embeddings_dir,
                 n_docs=None,
                 n_queries=None,
                 add_precomputed=False):
    """
    filename_gold points to a SQuAD-format file.
    filename_negative points to a CSV file whose first column is doc_id and whose second column is the document text.
    If add_precomputed is True, this function looks in the embeddings files for precomputed embeddings to add to each Document.
    """

    logging.getLogger("farm").setLevel(logging.INFO)
    download_from_url(remote_url + filename_gold,
                      filepath=data_dir + filename_gold)
    download_from_url(remote_url + filename_negative,
                      filepath=data_dir + filename_negative)
    if add_precomputed:
        for embedding_filename in embeddings_filenames:
            download_from_url(
                remote_url + str(embeddings_dir) + embedding_filename,
                filepath=data_dir + str(embeddings_dir) + embedding_filename)
    logging.getLogger("farm").setLevel(logging.WARN)

    gold_docs, labels = eval_data_from_json(data_dir + filename_gold)

    # Reduce number of docs
    gold_docs = gold_docs[:n_docs]

    # Remove labels whose gold docs have been removed
    doc_ids = [x.id for x in gold_docs]
    labels = [x for x in labels if x.document_id in doc_ids]

    # Filter labels down to n_queries
    selected_queries = list(
        set(f"{x.document_id} | {x.question}" for x in labels))
    selected_queries = selected_queries[:n_queries]
    labels = [
        x for x in labels
        if f"{x.document_id} | {x.question}" in selected_queries
    ]

    n_neg_docs = max(0, n_docs - len(gold_docs))
    neg_docs = prepare_negative_passages(data_dir, filename_negative,
                                         n_neg_docs)
    docs = gold_docs + neg_docs

    if add_precomputed:
        docs = add_precomputed_embeddings(data_dir + embeddings_dir,
                                          embeddings_filenames, docs)

    return docs, labels
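A hedged invocation sketch for prepare_data: every path, URL, and file name below is a placeholder, and the helpers it relies on (download_from_url, eval_data_from_json, prepare_negative_passages, add_precomputed_embeddings) are assumed to be defined in the same module:

# Hypothetical call; replace the placeholder locations with wherever the
# benchmark data is actually hosted and stored.
docs, labels = prepare_data(
    data_dir="data/retriever/",                    # placeholder local directory
    filename_gold="gold_squad.json",               # SQuAD-format gold file
    filename_negative="negative_passages.csv",     # doc_id, text pairs
    remote_url="https://example.com/benchmarks/",  # placeholder host
    embeddings_filenames=["passages_embeddings_0.pkl"],
    embeddings_dir="embeddings/",
    n_docs=10_000,
    n_queries=100,
    add_precomputed=False,
)
print(f"Prepared {len(docs)} documents and {len(labels)} labels")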
Example #3
def benchmark_reader(ci=False,
                     update_json=False,
                     save_markdown=False,
                     **kwargs):
    if ci:
        reader_models = reader_models_ci
    else:
        reader_models = reader_models_full
    reader_results = []
    doc_store = get_document_store("elasticsearch")
    # download squad data
    _download_extract_downstream_data(input_file=data_dir / filename)
    docs, labels = eval_data_from_json(data_dir / filename, max_docs=None)

    index_to_doc_store(doc_store, docs, None, labels)
    for reader_name in reader_models:
        for reader_type in reader_types:
            logger.info(
                f"##### Start reader run - model:{reader_name}, type: {reader_type} ##### "
            )
            try:
                reader = get_reader(reader_name, reader_type)
                results = reader.eval(document_store=doc_store,
                                      doc_index=doc_index,
                                      label_index=label_index,
                                      device="cuda")
                # print(results)
                results["passages_per_second"] = n_total_passages / results[
                    "reader_time"]
                results["reader"] = reader_name
                results["error"] = ""
                reader_results.append(results)
            except Exception as e:
                results = {
                    'EM': 0.,
                    'f1': 0.,
                    'top_n_accuracy': 0.,
                    'top_n': 0,
                    'reader_time': 0.,
                    "passages_per_second": 0.,
                    "seconds_per_query": 0.,
                    'reader': reader_name,
                    "error": e
                }
                reader_results.append(results)
            reader_df = pd.DataFrame.from_records(reader_results)
            reader_df.to_csv(results_file)
            if save_markdown:
                md_file = results_file.replace(".csv", ".md")
                with open(md_file, "w") as f:
                    f.write(str(reader_df.to_markdown()))
    doc_store.delete_all_documents(label_index)
    doc_store.delete_all_documents(doc_index)
    if update_json:
        populate_reader_json()
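This benchmark relies on module-level globals from the surrounding script (reader_models_ci, reader_models_full, reader_types, data_dir, filename, doc_index, label_index, n_total_passages, results_file). Assuming those are defined and an Elasticsearch instance is reachable, a typical call might look like this sketch:

if __name__ == "__main__":
    # Run the smaller CI model set, write results to CSV and Markdown,
    # and skip updating the aggregated JSON.
    benchmark_reader(ci=True, update_json=False, save_markdown=True)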
Example #4
    def add_eval_data(self,
                      filename: str,
                      doc_index: str = "eval_document",
                      label_index: str = "label",
                      batch_size: Optional[int] = None):
        """
        Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
        If a jsonl file and a batch_size are passed to the function, documents are loaded batch-wise
        from disk and indexed batch-wise into the DocumentStore in order to prevent out-of-memory errors.

        :param filename: Name of the file containing evaluation data (json or jsonl)
        :type filename: str
        :param doc_index: Elasticsearch index where evaluation documents should be stored
        :type doc_index: str
        :param label_index: Elasticsearch index where labeled questions should be stored
        :type label_index: str
        :param batch_size: Number of documents that are loaded and processed at a time.
                           Only works with jsonl formatted files. Setting batch_size and
                           using a json formatted file will convert the json to jsonl prior
                           to adding eval data.
        :type batch_size: int
        """
        if filename.endswith(".json"):
            if batch_size is None:
                docs, labels = eval_data_from_json(filename)
                self.write_documents(docs, index=doc_index)
                self.write_labels(labels, index=label_index)
            else:
                jsonl_filename = filename + "l"
                logger.info(
                    f"Adding evaluation data batch-wise is not compatible with json-formatted SQuAD files. "
                    f"Converting json to jsonl to: {jsonl_filename}")
                squad_json_to_jsonl(filename, jsonl_filename)
                self.add_eval_data(jsonl_filename, doc_index, label_index,
                                   batch_size)

        elif filename.endswith(".jsonl"):
            for docs, labels in eval_data_from_jsonl(filename, batch_size):
                if docs:
                    self.write_documents(docs, index=doc_index)
                if labels:
                    self.write_labels(labels, index=label_index)

        else:
            logger.error("File needs to be in json or jsonl format.")