Example #1
from typing import List

from spacy import load

# NOTE: `Dataset`, `Spacy`, the `*AlignerCap` wrappers, `clean_text`,
# `CACHEDOPS`, and `_run_aligners` are assumed to be provided by the
# surrounding project.


def run_workflow(
    jsonl_path: str = None,
    dataset: Dataset = None,
    doc_column: str = None,
    reference_column: str = None,
    summary_columns: List[str] = None,
    bert_aligner_threshold: float = 0.5,
    bert_aligner_top_k: int = 3,
    embedding_aligner_threshold: float = 0.5,
    embedding_aligner_top_k: int = 3,
    processed_dataset_path: str = None,
    n_samples: int = None,
    anonymize: bool = False,
    no_clean: bool = False,
):
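    """Load a summarization dataset, preprocess and spaCy-process its text
    columns, run the BERTScore, static-embedding, and n-gram aligners, and
    save the processed dataset to `processed_dataset_path`.
    """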
    assert (jsonl_path is None) != (dataset is None), \
        "Exactly one of `jsonl_path` and `dataset` must be specified."
    assert processed_dataset_path is not None, \
        "Please specify a path to save the dataset."

    # Load the dataset
    if jsonl_path is not None:
        dataset = Dataset.from_jsonl(jsonl_path)

    if doc_column is None:
        # Assume `doc_column` is called "document"
        doc_column = 'document'
        assert doc_column in dataset.column_names, \
            f"`doc_column={doc_column}` is not a column in dataset."
        print("Assuming `doc_column` is called 'document'.")

    if reference_column is None:
        # Assume `reference_column` is called "summary:reference"
        reference_column = 'summary:reference'
        print("Assuming `reference_column` is called 'summary:reference'.")
        if reference_column not in dataset.column_names:
            print("No reference summary loaded")
            reference_column = None

    if summary_columns is None or len(summary_columns) == 0:
        # Assume `summary_columns` are prefixed by "summary:"
        summary_columns = []
        for col in dataset.column_names:
            if col.startswith("summary:") and col != "summary:reference":
                summary_columns.append(col)
        print(
            f"Reading summary columns from dataset. Found {summary_columns}.")

    if len(summary_columns) == 0 and reference_column is None:
        raise ValueError("At least one summary or reference column is required.")

    # Set visible rows to restrict to the first `n_samples`
    if n_samples:
        dataset.set_visible_rows(list(range(n_samples)))

    # Combine the text columns into one list
    text_columns = [doc_column]
    if reference_column:
        text_columns.append(reference_column)
    text_columns.extend(summary_columns)

    # Preprocess all the text columns (optionally skipping cleaning)
    dataset = dataset.update(
        lambda x: {
            f'preprocessed_{k}': x[k] if no_clean else clean_text(x[k])
            for k in text_columns
        })

    # Run the Spacy pipeline on all preprocessed text columns
    try:
        nlp = load('en_core_web_lg')
    except OSError:
        raise OSError(
            'Missing spaCy model "en_core_web_lg". '
            'Please run "python -m spacy download en_core_web_lg".'
        ) from None

    nlp.add_pipe('sentencizer', before="parser")
    spacy = Spacy(nlp=nlp)
    dataset = spacy(
        dataset,
        [f'preprocessed_{col}' for col in text_columns],
        batch_size=100,
    )

    # Run the three alignment pipelines
    bert_aligner = BertscoreAlignerCap(
        threshold=bert_aligner_threshold,
        top_k=bert_aligner_top_k,
        spacy=spacy,
    )

    embedding_aligner = StaticEmbeddingAlignerCap(
        threshold=embedding_aligner_threshold,
        top_k=embedding_aligner_top_k,
        spacy=spacy,
    )

    ngram_aligner = NGramAlignerCap(spacy=spacy)

    dataset = _run_aligners(
        dataset=dataset,
        aligners=[bert_aligner, embedding_aligner, ngram_aligner],
        doc_column=f'preprocessed_{doc_column}',
        reference_column=f'preprocessed_{reference_column}'
        if reference_column else None,
        summary_columns=[f'preprocessed_{col}' for col in summary_columns],
    )

    # Save the dataset
    if anonymize:
        # Remove certain columns to anonymize and save to disk
        for col in [doc_column, reference_column]:
            if col is not None:
                dataset.remove_column(col)
                dataset.remove_column(f'preprocessed_{col}')
                dataset.remove_column(
                    str(spacy.identifier(columns=[f'preprocessed_{col}'])))
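                # Also drop the cached-operation history entry for the
                # removed spaCy column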
                del dataset.interactions[CACHEDOPS].history[(
                    spacy.identifier, f'preprocessed_{col}')]
        dataset.save_to_disk(f'{processed_dataset_path}.anonymized')
    else:
        # Directly save to disk
        dataset.save_to_disk(processed_dataset_path)

    return dataset
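
A minimal usage sketch, assuming a JSONL file whose columns follow the `document` / `summary:*` naming convention relied on above; the file path, summary column names, output path, and sample count are all placeholders.

# Hypothetical invocation of run_workflow; every argument value below is a
# placeholder chosen to match the column-naming convention assumed above.
processed = run_workflow(
    jsonl_path='data/summaries.jsonl',
    doc_column='document',
    reference_column='summary:reference',
    summary_columns=['summary:bart', 'summary:pegasus'],
    processed_dataset_path='processed/summaries',
    n_samples=100,
)
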
Example #2
from typing import List

# NOTE: `Dataset`, `CachedOperation`, `CACHEDOPS`, and `strings_as_json` are
# assumed to be provided by the surrounding project.


def _run_aligners(
    dataset: Dataset,
    aligners: List[CachedOperation],
    doc_column: str,
    reference_column: str,
    summary_columns: List[str] = None,
):
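    """Run each aligner on the (document, target) and (reference, summary)
    column pairs, then split the combined alignment columns into one column
    per pair.
    """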
    if not summary_columns:
        summary_columns = []

    to_columns = []
    if reference_column is not None:
        to_columns.append(reference_column)
    to_columns.extend(summary_columns)

    for aligner in aligners:

        # Run the aligner on (document, summary) pairs

        dataset = aligner(
            dataset,
            [doc_column] + to_columns,
            # Must use `batch_size = 1`
            batch_size=1,
        )

        if reference_column is not None and len(summary_columns):
            # Run the aligner on (reference, summary) pairs
            dataset = aligner(
                dataset,
                [reference_column] + summary_columns,
                # Must use `batch_size = 1`
                batch_size=1,
            )

        if len(to_columns) > 1:
            # Instead of having one column for (document, summary) comparisons, split
            # off into (1 + |summary_columns|) total columns, one for each comparison

            # Retrieve the (document, summary) column
            doc_summary_column = aligner.retrieve(
                dataset[:],
                [doc_column] + to_columns,
            )[tuple([doc_column] + to_columns)]

            for i, col in enumerate(to_columns):
                # Add as a new column after encoding with the aligner's `encode` method
                dataset.add_column(
                    column=str(aligner.identifier(columns=[doc_column, col])),
                    values=[
                        aligner.encode([row[i]]) for row in doc_summary_column
                    ],
                )

            # Remove the (document, summary) column
            dataset.remove_column(
                str(aligner.identifier(columns=[doc_column] + to_columns)))
            del dataset.interactions[CACHEDOPS].history[(
                aligner.identifier,
                strings_as_json(strings=[doc_column] + to_columns))]

        if reference_column is not None and len(summary_columns) > 1:
            # Instead of having one column for (reference, summary) comparisons, split
            # off into (|summary_columns|) total columns, one for each comparison

            # Retrieve the (reference, summary) column
            reference_summary_column = aligner.retrieve(
                dataset[:],
                [reference_column] + summary_columns,
            )[tuple([reference_column] + summary_columns)]

            for i, col in enumerate(summary_columns):
                # Add as a new column
                dataset.add_column(
                    column=str(
                        aligner.identifier(columns=[reference_column, col])),
                    values=[
                        aligner.encode([row[i]])
                        for row in reference_summary_column
                    ],
                )

            # Remove the (reference, summary) column
            dataset.remove_column(
                str(
                    aligner.identifier(columns=[reference_column] +
                                       summary_columns)))
            del dataset.interactions[CACHEDOPS].history[(
                aligner.identifier,
                strings_as_json(strings=[reference_column] + summary_columns))]

    return dataset
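
A minimal stand-alone sketch of calling `_run_aligners`, mirroring the invocation in Example #1; the dataset, the `spacy` wrapper, the single aligner, and the column names are placeholders.

# Hypothetical direct call with a single aligner; assumes `dataset` and
# `spacy` were prepared as in Example #1 and that the listed columns exist.
aligned = _run_aligners(
    dataset=dataset,
    aligners=[NGramAlignerCap(spacy=spacy)],
    doc_column='preprocessed_document',
    reference_column='preprocessed_summary:reference',
    summary_columns=['preprocessed_summary:bart'],
)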