def run_workflow(
    jsonl_path: str = None,
    dataset: Dataset = None,
    doc_column: str = None,
    reference_column: str = None,
    summary_columns: List[str] = None,
    bert_aligner_threshold: float = 0.5,
    bert_aligner_top_k: int = 3,
    embedding_aligner_threshold: float = 0.5,
    embedding_aligner_top_k: int = 3,
    processed_dataset_path: str = None,
    n_samples: int = None,
    anonymize: bool = False,
    no_clean: bool = False,
):
    """Preprocess a summarization dataset and cache alignment results.

    Loads a dataset (from `jsonl_path` or an existing `Dataset`), cleans and
    spaCy-processes the document/reference/summary text columns, runs three
    aligners (BERTScore, static-embedding, n-gram) over the text pairs, and
    saves the processed dataset to disk.

    Args:
        jsonl_path: Path to a .jsonl file to load. Mutually exclusive with
            `dataset` — exactly one must be given.
        dataset: An already-loaded `Dataset`. Mutually exclusive with
            `jsonl_path`.
        doc_column: Name of the document column; defaults to 'document'.
        reference_column: Name of the reference-summary column; defaults to
            'summary:reference'. Dropped (set to None) if absent from the
            dataset.
        summary_columns: Model-summary column names; defaults to every column
            prefixed 'summary:' other than 'summary:reference'.
        bert_aligner_threshold: Similarity threshold for the BERTScore aligner.
        bert_aligner_top_k: Top-k matches kept by the BERTScore aligner.
        embedding_aligner_threshold: Threshold for the static-embedding aligner.
        embedding_aligner_top_k: Top-k matches kept by the embedding aligner.
        processed_dataset_path: Required output path for the saved dataset.
        n_samples: If given, restrict processing to the first `n_samples` rows.
        anonymize: If True, strip raw/preprocessed document and reference
            columns before saving (output path gets an '.anonymized' suffix).
        no_clean: If True, skip `clean_text` and copy text columns verbatim.
            (Previously this read a module-level `args.no_clean`, which raised
            NameError when the function was called outside the CLI.)

    Returns:
        The processed `Dataset` (also saved to disk).
    """
    assert (jsonl_path is None) != (dataset is None), \
        "One of `jsonl_path` and `dataset` must be specified."
    assert processed_dataset_path is not None, \
        "Please specify a path to save the dataset."

    # Load the dataset
    if jsonl_path is not None:
        dataset = Dataset.from_jsonl(jsonl_path)

    if doc_column is None:
        # Assume `doc_column` is called "document"
        doc_column = 'document'
        assert doc_column in dataset.column_names, \
            f"`doc_column={doc_column}` is not a column in dataset."
        print("Assuming `doc_column` is called 'document'.")

    if reference_column is None:
        # Assume `reference_column` is called "summary:reference"
        reference_column = 'summary:reference'
        print("Assuming `reference_column` is called 'summary:reference'.")

    if reference_column not in dataset.column_names:
        # No reference available; downstream steps treat None as "absent"
        print("No reference summary loaded")
        reference_column = None

    if summary_columns is None or len(summary_columns) == 0:
        # Assume `summary_columns` are prefixed by "summary:"
        summary_columns = [
            col for col in dataset.column_names
            if col.startswith("summary:") and col != "summary:reference"
        ]
        print(f"Reading summary columns from dataset. Found {summary_columns}.")

    if len(summary_columns) == 0 and reference_column is None:
        raise ValueError("At least one summary is required")

    # Set visible rows to restrict to the first `n_samples`
    if n_samples:
        dataset.set_visible_rows(list(range(n_samples)))

    # Combine the text columns into one list
    text_columns = [doc_column] + \
        ([reference_column] if reference_column else []) + summary_columns

    # Preprocess all the text columns. Bug fix: use the `no_clean` parameter
    # instead of the module-level `args.no_clean` (NameError in library use).
    dataset = dataset.update(
        lambda x: {
            f'preprocessed_{k}': x[k] if no_clean else clean_text(x[k])
            for k in text_columns
        })

    # Run the spaCy pipeline on all preprocessed text columns
    try:
        nlp = load('en_core_web_lg')
    except OSError:
        nlp = None
    if nlp is None:
        raise OSError(
            'Missing spaCy model "en_core_web_lg". Please run "python -m spacy download en_core_web_lg"'
        )
    # Sentence boundaries are needed by the aligners; insert before the parser
    nlp.add_pipe('sentencizer', before="parser")
    spacy = Spacy(nlp=nlp)
    dataset = spacy(
        dataset,
        [f'preprocessed_{col}' for col in text_columns],
        batch_size=100,
    )

    # Build the 3 aligners, all sharing the same cached spaCy operation
    bert_aligner = BertscoreAlignerCap(
        threshold=bert_aligner_threshold,
        top_k=bert_aligner_top_k,
        spacy=spacy,
    )
    embedding_aligner = StaticEmbeddingAlignerCap(
        threshold=embedding_aligner_threshold,
        top_k=embedding_aligner_top_k,
        spacy=spacy,
    )
    ngram_aligner = NGramAlignerCap(spacy=spacy, )

    dataset = _run_aligners(
        dataset=dataset,
        aligners=[bert_aligner, embedding_aligner, ngram_aligner],
        doc_column=f'preprocessed_{doc_column}',
        reference_column=f'preprocessed_{reference_column}'
        if reference_column else None,
        summary_columns=[f'preprocessed_{col}' for col in summary_columns],
    )

    # Save the dataset
    if anonymize:
        # Remove raw text, preprocessed text, and the spaCy cache columns for
        # the document and reference, then drop their cached-op history entries
        for col in [doc_column, reference_column]:
            if col is not None:
                dataset.remove_column(col)
                dataset.remove_column(f'preprocessed_{col}')
                dataset.remove_column(
                    str(spacy.identifier(columns=[f'preprocessed_{col}'])))
                del dataset.interactions[CACHEDOPS].history[(
                    spacy.identifier, f'preprocessed_{col}')]
        dataset.save_to_disk(f'{processed_dataset_path}.anonymized')
    else:
        # Directly save to disk
        dataset.save_to_disk(processed_dataset_path)

    return dataset
def _run_aligners(
    dataset: Dataset,
    aligners: List[CachedOperation],
    doc_column: str,
    reference_column: str,
    summary_columns: List[str] = None,
) -> Dataset:
    """Run each aligner over (document, summary) and (reference, summary) pairs.

    Each aligner is applied as a cached operation over the combined column
    lists, then the single combined result column is split into one column
    per individual comparison pair, and the combined column (plus its
    cached-operation history entry) is removed.

    Args:
        dataset: The dataset holding preprocessed text columns.
        aligners: Cached alignment operations to apply in order.
        doc_column: Preprocessed document column name.
        reference_column: Preprocessed reference column name, or None if the
            dataset has no reference summary.
        summary_columns: Preprocessed model-summary column names.

    Returns:
        The dataset with per-pair aligner columns added.
    """
    if not summary_columns:
        # Normalize None to an empty list so len()/iteration below are safe
        summary_columns = []

    # Columns the document is compared against: reference first (if any),
    # then every model summary
    to_columns = []
    if reference_column is not None:
        to_columns.append(reference_column)
    to_columns.extend(summary_columns)

    for aligner in aligners:
        # Run the aligner on (document, summary) pairs.
        # Must use `batch_size=1` (aligner requirement; larger batches are
        # presumably unsupported — the original comment insists on it).
        dataset = aligner(
            dataset,
            [doc_column] + to_columns,
            # Must use `batch_size = 1`
            batch_size=1,
        )

        if reference_column is not None and len(summary_columns):
            # Run the aligner on (reference, summary) pairs
            dataset = aligner(
                dataset,
                [reference_column] + summary_columns,
                # Must use `batch_size = 1`
                batch_size=1,
            )

        if len(to_columns) > 1:
            # Instead of having one column for (document, summary) comparisons,
            # split off into (1 + |summary_columns|) total columns, one for
            # each comparison

            # Retrieve the (document, summary) column
            doc_summary_column = aligner.retrieve(
                dataset[:],
                [doc_column] + to_columns,
            )[tuple([doc_column] + to_columns)]

            for i, col in enumerate(to_columns):
                # Add as a new column after encoding with the aligner's
                # `encode` method; each row holds the i-th pairwise result
                dataset.add_column(
                    column=str(aligner.identifier(columns=[doc_column, col])),
                    values=[
                        aligner.encode([row[i]]) for row in doc_summary_column
                    ],
                )

            # Remove the (document, summary) column and its cached-op
            # history entry so only the per-pair columns remain
            dataset.remove_column(
                str(aligner.identifier(columns=[doc_column] + to_columns)))
            del dataset.interactions[CACHEDOPS].history[(
                aligner.identifier,
                strings_as_json(strings=[doc_column] + to_columns))]

        if reference_column is not None and len(summary_columns) > 1:
            # Instead of having one column for (reference, summary)
            # comparisons, split off into (|summary_columns|) total columns,
            # one for each comparison

            # Retrieve the (reference, summary) column
            reference_summary_column = aligner.retrieve(
                dataset[:],
                [reference_column] + summary_columns,
            )[tuple([reference_column] + summary_columns)]

            for i, col in enumerate(summary_columns):
                # Add as a new column
                dataset.add_column(column=str(
                    aligner.identifier(columns=[reference_column, col])),
                                   values=[
                                       aligner.encode([row[i]])
                                       for row in reference_summary_column
                                   ])

            # Remove the (reference, summary) column and its cached-op
            # history entry
            dataset.remove_column(
                str(
                    aligner.identifier(columns=[reference_column] +
                                       summary_columns)))
            del dataset.interactions[CACHEDOPS].history[(
                aligner.identifier,
                strings_as_json(strings=[reference_column] + summary_columns))]

    return dataset