Example #1
# Imports assumed from the APIs used below (Robustness Gym's Dataset and Spacy
# operation, spaCy's model loader, and typing); `clean_text`, `join_predictions`,
# and `run_workflow` are defined elsewhere in the full script.
from typing import Optional

from robustnessgym import Dataset, Spacy
from spacy import load


def load_dataset(path: str):
    """Load a dataset from a JSONL file or from a dataset directory saved on disk."""
    if path.endswith('.jsonl'):
        return Dataset.from_jsonl(path)
    try:
        return Dataset.load_from_disk(path)
    except NotADirectoryError:
        # The path points to a plain file rather than a saved dataset directory
        return Dataset.from_jsonl(path)
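
# Usage sketch (hypothetical paths): both input forms are handled.
#
#     ds = load_dataset('data/examples.jsonl')      # JSONL export
#     ds = load_dataset('data/saved_rg_dataset')    # directory written by save_to_disk
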
def deanonymize_dataset(
    rg_path: str,
    standardized_dataset: Dataset,
    processed_dataset_path: Optional[str] = None,
    n_samples: Optional[int] = None,
):
    """Take an anonymized dataset and add back the original dataset columns."""
    assert processed_dataset_path is not None, \
        "Please specify a path to save the dataset."

    # Load the dataset
    dataset = Dataset.load_from_disk(rg_path)

    if n_samples:
        # Restrict both datasets to the first `n_samples` rows
        dataset.set_visible_rows(list(range(n_samples)))
        standardized_dataset.set_visible_rows(list(range(n_samples)))

    text_columns = []

    # Add columns from the standardized dataset
    dataset.add_column('document', standardized_dataset['document'])
    text_columns.append('document')

    if 'summary:reference' in standardized_dataset.column_names:
        dataset.add_column('summary:reference',
                           standardized_dataset['summary:reference'])
        text_columns.append('summary:reference')

    # Preprocess all the text columns; `args.no_clean` comes from the module-level
    # argparse namespace defined in the full script
    dataset = dataset.update(
        lambda x: {
            f'preprocessed_{k}': x[k] if args.no_clean else clean_text(x[k])
            for k in text_columns
        })

    # Run the spaCy pipeline on all preprocessed text columns
    try:
        nlp = load('en_core_web_lg')
    except OSError:
        # Fall back to the small English model if the large one is not installed
        nlp = load('en_core_web_sm')

    nlp.add_pipe('sentencizer', before='parser')
    spacy = Spacy(nlp=nlp)
    dataset = spacy(
        dataset,
        [f'preprocessed_{col}' for col in text_columns],
        batch_size=100,
    )

    # Save the processed dataset to disk
    dataset.save_to_disk(processed_dataset_path)

    return dataset
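
# Usage sketch (hypothetical paths). Assumes the anonymized dataset was written with
# `save_to_disk`, the standardized dataset has a 'document' column (and optionally
# 'summary:reference'), and that a module-level `args.no_clean` flag exists, since
# deanonymize_dataset reads it while preprocessing.
#
#     standardized = load_dataset('data/standardized.jsonl')
#     deanonymize_dataset(
#         rg_path='data/anonymized_rg_dataset',
#         standardized_dataset=standardized,
#         processed_dataset_path='data/processed_rg_dataset',
#         n_samples=100,
#     )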


if __name__ == '__main__':
    # `args` is the argparse namespace built by the full script's argument parser
    # (the parser definition is not part of this excerpt)
    if args.join_predictions:
        # Join the predictions with the dataset
        dataset = join_predictions(
            dataset_jsonl=args.dataset_jsonl,
            prediction_jsonls=args.prediction_jsonls,
            save_jsonl_path=args.save_jsonl_path,
        )

    if args.workflow:
        # Run the processing workflow
        dataset = None
        # Check if `args.dataset_rg` was passed in
        if args.dataset_rg:
            # Load the dataset directly
            dataset = Dataset.load_from_disk(args.dataset_rg)

        run_workflow(
            jsonl_path=args.dataset_jsonl,
            dataset=dataset,
            doc_column=args.doc_column,
            reference_column=args.reference_column,
            summary_columns=args.summary_columns,
            bert_aligner_threshold=args.bert_aligner_threshold,
            bert_aligner_top_k=args.bert_aligner_top_k,
            embedding_aligner_threshold=args.embedding_aligner_threshold,
            embedding_aligner_top_k=args.embedding_aligner_top_k,
            processed_dataset_path=args.processed_dataset_path,
            n_samples=args.n_samples if not args.try_it else 10,
            anonymize=args.anonymize,
        )
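
# Hypothetical command-line invocation (flag names inferred from the `args` attributes
# used above; the actual argparse definitions live in the full script, not this excerpt):
#
#     python preprocessing.py --workflow \
#         --dataset_jsonl data/examples.jsonl \
#         --doc_column document \
#         --reference_column summary:reference \
#         --summary_columns summary:model_1 summary:model_2 \
#         --processed_dataset_path data/processed_rg_dataset \
#         --n_samples 100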