# Assumed imports for this excerpt (not shown in the original snippet; adjust
# to the actual module layout):
# from robustnessgym import Dataset, Spacy
# from spacy import load

def load_dataset(path: str):
    """Load a dataset from a .jsonl file or from a directory saved to disk."""
    if path.endswith('.jsonl'):
        return Dataset.from_jsonl(path)
    try:
        return Dataset.load_from_disk(path)
    except NotADirectoryError:
        # `path` pointed at a file, not a saved-dataset directory:
        # fall back to jsonl loading
        return Dataset.from_jsonl(path)
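# Usage sketch (hypothetical paths, for illustration only):
#   ds = load_dataset('data/examples.jsonl')   # loaded via Dataset.from_jsonl
#   ds = load_dataset('data/examples_rg')      # a directory written by save_to_disk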
def deanonymize_dataset(
    rg_path: str,
    standardized_dataset: Dataset,
    processed_dataset_path: str = None,
    n_samples: int = None,
):
    """Take an anonymized dataset and add back the original dataset columns."""
    assert processed_dataset_path is not None, \
        "Please specify a path to save the dataset."

    # Load the anonymized dataset
    dataset = Dataset.load_from_disk(rg_path)

    if n_samples:
        dataset.set_visible_rows(list(range(n_samples)))
        standardized_dataset.set_visible_rows(list(range(n_samples)))

    text_columns = []

    # Add columns from the standardized dataset
    dataset.add_column('document', standardized_dataset['document'])
    text_columns.append('document')

    if 'summary:reference' in standardized_dataset.column_names:
        dataset.add_column(
            'summary:reference',
            standardized_dataset['summary:reference'],
        )
        text_columns.append('summary:reference')

    # Preprocess all the text columns
    # NOTE: relies on the module-level `args` namespace parsed in __main__
    dataset = dataset.update(
        lambda x: {
            f'preprocessed_{k}': x[k] if args.no_clean else clean_text(x[k])
            for k in text_columns
        }
    )

    # Run the spaCy pipeline on all preprocessed text columns,
    # preferring the large English model and falling back to the small one
    try:
        nlp = load('en_core_web_lg')
    except OSError:
        nlp = load('en_core_web_sm')
    nlp.add_pipe('sentencizer', before="parser")
    spacy = Spacy(nlp=nlp)
    dataset = spacy(
        dataset,
        [f'preprocessed_{col}' for col in text_columns],
        batch_size=100,
    )

    # Directly save to disk
    dataset.save_to_disk(processed_dataset_path)

    return dataset
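# Usage sketch (hypothetical paths; assumes `standardized` was built upstream
# with matching rows):
#   deanonymized = deanonymize_dataset(
#       rg_path='out/anonymized_rg',
#       standardized_dataset=standardized,
#       processed_dataset_path='out/processed_rg',
#       n_samples=100,
#   )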
if args.join_predictions:
    # Join the predictions with the dataset
    dataset = join_predictions(
        dataset_jsonl=args.dataset_jsonl,
        prediction_jsonls=args.prediction_jsonls,
        save_jsonl_path=args.save_jsonl_path,
    )

if args.workflow:
    # Run the processing workflow
    dataset = None
    # Check if `args.dataset_rg` was passed in
    if args.dataset_rg:
        # Load the dataset directly
        dataset = Dataset.load_from_disk(args.dataset_rg)

    run_workflow(
        jsonl_path=args.dataset_jsonl,
        dataset=dataset,
        doc_column=args.doc_column,
        reference_column=args.reference_column,
        summary_columns=args.summary_columns,
        bert_aligner_threshold=args.bert_aligner_threshold,
        bert_aligner_top_k=args.bert_aligner_top_k,
        embedding_aligner_threshold=args.embedding_aligner_threshold,
        embedding_aligner_top_k=args.embedding_aligner_top_k,
        processed_dataset_path=args.processed_dataset_path,
        n_samples=args.n_samples if not args.try_it else 10,
        anonymize=args.anonymize,
    )
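# Example invocations (script name and paths hypothetical; flags taken from
# the argparse namespace referenced above):
#   python preprocessing.py --join_predictions \
#       --dataset_jsonl data/test.jsonl \
#       --prediction_jsonls preds/model_a.jsonl \
#       --save_jsonl_path out/joined.jsonl
#   python preprocessing.py --workflow --try_it \
#       --dataset_jsonl out/joined.jsonl \
#       --doc_column document \
#       --reference_column summary:reference \
#       --processed_dataset_path out/processed_rg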