def main(args, model=None) -> SummarizationModule: Path(args.output_dir).mkdir(exist_ok=True) check_output_dir(args, expected_items=3) if model is None: if "summarization" in args.task: model: SummarizationModule = SummarizationModule(args) else: model: SummarizationModule = TranslationModule(args) dataset = Path(args.data_dir).name if ( args.logger_name == "default" or args.fast_dev_run or str(args.output_dir).startswith("/tmp") or str(args.output_dir).startswith("/var") ): from pytorch_lightning.loggers import CSVLogger logger = CSVLogger('chen_logs',name = 'SCHWEIGEN') # don't pollute wandb logs unnecessarily elif args.logger_name == "wandb": from pytorch_lightning.loggers import WandbLogger project = os.environ.get("WANDB_PROJECT", dataset) logger = WandbLogger(name=model.output_dir.name, project=project) elif args.logger_name == "wandb_shared": from pytorch_lightning.loggers import WandbLogger logger = WandbLogger(name=model.output_dir.name, project=f"hf_{dataset}") if args.early_stopping_patience >= 0: es_callback = get_early_stopping_callback(model.val_metric, args.early_stopping_patience) else: es_callback = False lower_is_better = args.val_metric == "loss" trainer: pl.Trainer = generic_train( model, args, logging_callback=Seq2SeqLoggingCallback(), checkpoint_callback=get_checkpoint_callback( args.output_dir, model.val_metric, args.save_top_k, lower_is_better ), early_stopping_callback=es_callback, logger=logger, ) pickle_save(model.hparams, model.output_dir / "hparams.pkl") if not args.do_predict: return model model.hparams.test_checkpoint = "" checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "*.ckpt"), recursive=True))) if checkpoints: model.hparams.test_checkpoint = checkpoints[-1] trainer.resume_from_checkpoint = checkpoints[-1] trainer.logger.log_hyperparams(model.hparams) # test() without a model tests using the best checkpoint automatically trainer.test() return model
def parse_arguments(): """ Parse command line arguments """ parser = argparse.ArgumentParser( description='Framabook text content extraction for Common Voice') parser.add_argument('--minwords', type=int, default=3, help='Minimum number of words to accept a sentence') parser.add_argument('--maxwords', type=int, default=15, help='Maximum number of words to accept a sentence') parser.add_argument('--one', action='store_true', default=False, help='Stop after the first file written.') parser.add_argument('--dry', action='store_true', default=False, help='Dry run, do not write any data file.') parser.add_argument('--abbr', action='store_true', default=False, help='Print abbreviations extracted from abbr tags.') parser.add_argument('--code', action='store_true', default=False, help='Print deleted text from code tags.') parser.add_argument( '--plaintext', action='store_true', default=False, help='Extract plain text. If False, write extracted sentences.') parser.add_argument('inputdir', type=str, help='Input directory OR -1 for URL file') parser.add_argument('outputdir', type=str, help='Output directory') args = parser.parse_args() # if it is different from -1 then the EPUB file is in the DATA folder # if it is -1 we will call the url of the variable EPUB_LINK if args.inputdir != '-1': check_output_dir(args.inputdir) check_output_dir(args.outputdir) return vars(args)
def distill_main(args): Path(args.output_dir).mkdir(exist_ok=True) check_output_dir(args, expected_items=3) model = create_module(args) return ft_main(args, model=model)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) check_output_dir(training_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED), training_args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = BartConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout") for p in extra_model_params: if getattr(training_args, p, None): assert hasattr( config, p ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute" setattr(config, p, getattr(training_args, p)) tokenizer = BartTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = BartForConditionalGeneration.from_pretrained( model_args.model_name_or_path, from_tf=".ckpt" in model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, ) # use task specific params use_task_specific_params(model, data_args.task) # set num_beams for evaluation if data_args.eval_beams is None: data_args.eval_beams = model.config.num_beams # set decoder_start_token_id for MBart if model.config.decoder_start_token_id is None and isinstance( tokenizer, MBartTokenizer): assert (data_args.tgt_lang is not None and data_args.src_lang is not None), "mBart requires --tgt_lang and --src_lang" model.config.decoder_start_token_id = tokenizer.lang_code_to_id[ data_args.tgt_lang] if model_args.freeze_embeds: freeze_embeds(model) if model_args.freeze_encoder: freeze_params(model.get_encoder()) assert_all_frozen(model.get_encoder()) dataset_class = Seq2SeqDataset # Get datasets train_dataset = (dataset_class( tokenizer, type_path="train", data_dir=data_args.data_dir, n_obs=data_args.n_train, max_target_length=data_args.max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_train else None) eval_dataset = (dataset_class( tokenizer, type_path="val", data_dir=data_args.data_dir, n_obs=data_args.n_val, max_target_length=data_args.val_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_eval or training_args.evaluation_strategy != EvaluationStrategy.NO else None) test_dataset = (dataset_class( tokenizer, type_path="test", data_dir=data_args.data_dir, n_obs=data_args.n_test, max_target_length=data_args.test_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_predict else None) # Initialize our Trainer compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer) if training_args.predict_with_generate else None) trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=Seq2SeqDataCollator(tokenizer, data_args, training_args.tpu_num_cores), compute_metrics=compute_metrics_fn, tokenizer=tokenizer, ) all_metrics = {} # Training if training_args.do_train: logger.info("*** Train ***") train_result = trainer.train( model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) metrics = train_result.metrics metrics["train_n_objs"] = data_args.n_train trainer.save_model() # this also saves the tokenizer if trainer.is_world_process_zero(): handle_metrics("train", metrics, training_args.output_dir) all_metrics.update(metrics) # Need to save the state, since Trainer.save_model saves only the tokenizer with the model trainer.state.save_to_json( os.path.join(training_args.output_dir, "trainer_state.json")) # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) tokenizer.save_pretrained(training_args.output_dir) # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(metric_key_prefix="val", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams) metrics["val_n_objs"] = data_args.n_val metrics["val_loss"] = round(metrics["val_loss"], 4) if trainer.is_world_process_zero(): handle_metrics("val", metrics, training_args.output_dir) all_metrics.update(metrics) if training_args.do_predict: logger.info("*** Predict ***") test_output = trainer.predict( test_dataset=test_dataset, metric_key_prefix="test", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams, ) metrics = test_output.metrics metrics["test_n_objs"] = data_args.n_test if trainer.is_world_process_zero(): metrics["test_loss"] = round(metrics["test_loss"], 4) handle_metrics("test", metrics, training_args.output_dir) all_metrics.update(metrics) if training_args.predict_with_generate: test_preds = tokenizer.batch_decode( test_output.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True) test_preds = lmap(str.strip, test_preds) write_txt_file( test_preds, os.path.join(training_args.output_dir, "test_generations.txt")) if trainer.is_world_process_zero(): save_json(all_metrics, os.path.join(training_args.output_dir, "all_results.json")) return all_metrics
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) check_output_dir(training_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED), training_args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout") for p in extra_model_params: if getattr(training_args, p, None): assert hasattr( config, p ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute" setattr(config, p, getattr(training_args, p)) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=".ckpt" in model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, ) # use task specific params use_task_specific_params(model, data_args.task) # set num_beams for evaluation if data_args.eval_beams is None: data_args.eval_beams = model.config.num_beams # set decoder_start_token_id for MBart if model.config.decoder_start_token_id is None and isinstance( tokenizer, MBartTokenizer): assert (data_args.tgt_lang is not None and data_args.src_lang is not None), "mBart requires --tgt_lang and --src_lang" model.config.decoder_start_token_id = tokenizer.lang_code_to_id[ data_args.tgt_lang] if model_args.freeze_embeds: freeze_embeds(model) if model_args.freeze_encoder: freeze_params(model.get_encoder()) assert_all_frozen(model.get_encoder()) dataset_class = Seq2SeqDataset # Get datasets train_dataset = (dataset_class( tokenizer, type_path="train", data_dir=data_args.data_dir, n_obs=data_args.n_train, max_target_length=data_args.max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_train else None) eval_dataset = (dataset_class( tokenizer, type_path="val", data_dir=data_args.data_dir, n_obs=data_args.n_val, max_target_length=data_args.val_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_eval or training_args.evaluation_strategy != EvaluationStrategy.NO else None) test_dataset = (dataset_class( tokenizer, type_path="test", data_dir=data_args.data_dir, n_obs=data_args.n_test, max_target_length=data_args.test_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_predict else None) # Initialize our Trainer compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer) if training_args.predict_with_generate else None) trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=Seq2SeqDataCollator(tokenizer, data_args, training_args.tpu_num_cores), compute_metrics=compute_metrics_fn, tokenizer=tokenizer, ) all_metrics = {} # Training if training_args.do_train: logger.info("*** Train ***") train_result = trainer.train( model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) metrics = train_result.metrics metrics["train_n_objs"] = data_args.n_train trainer.save_model() # this also saves the tokenizer if trainer.is_world_process_zero(): handle_metrics("train", metrics, training_args.output_dir) all_metrics.update(metrics) # Need to save the state, since Trainer.save_model saves only the tokenizer with the model trainer.state.save_to_json( os.path.join(training_args.output_dir, "trainer_state.json")) # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) tokenizer.save_pretrained(training_args.output_dir) if training_args.tune: def eval_func_for_lpot(model): trainer.model = model results = trainer.evaluate( eval_dataset=eval_dataset, metric_key_prefix="val", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams) assert data_args.task.startswith("summarization") or data_args.task.startswith("translation") , \ "data_args.task should startswith summarization or translation" task_metrics_keys = [ 'val_bleu', 'val_rouge1', 'val_rouge2', 'val_rougeL', 'val_rougeLsum' ] for key in task_metrics_keys: if key in results.keys(): logger.info("Finally Eval {}:{}".format(key, results[key])) if 'bleu' in key: acc = results[key] break if 'rouge' in key: acc = sum( [v for k, v in results.items() if "rouge" in k]) / 4 break return acc from lpot.experimental import Quantization, common quantizer = Quantization("./conf.yaml") quantizer.model = common.Model(model) quantizer.calib_dataloader = common.DataLoader( eval_dataset, batch_size=training_args.eval_batch_size, collate_fn=Seq2SeqDataCollator_lpot(tokenizer, data_args, training_args.tpu_num_cores)) quantizer.eval_func = eval_func_for_lpot q_model = quantizer() q_model.save(training_args.tuned_checkpoint) exit(0) if training_args.benchmark: if training_args.int8: from lpot.utils.pytorch import load new_model = load( os.path.abspath( os.path.expanduser(training_args.tuned_checkpoint)), model) else: new_model = model trainer.model = new_model results = trainer.evaluate( eval_dataset=eval_dataset, metric_key_prefix="val", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams, iters=training_args.iters, warmup_iter=training_args.warmup_iter, ) if data_args.task.startswith("summarization"): print('Accuracy: %.4f' % (sum([v for k, v in results.items() if "rouge" in k]) / 4)) if data_args.task.startswith("translation"): print('Accuracy: %.4f' % (results['val_bleu'])) print('Throughput: %.3f samples/sec' % (results["val_samples_per_second"])) print('Latency: %.3f ms' % (1 * 1000 / results["val_samples_per_second"])) print('Batch size = %d' % training_args.per_device_eval_batch_size) exit(0) if training_args.accuracy_only: if training_args.int8: from lpot.utils.pytorch import load new_model = load( os.path.abspath( os.path.expanduser(training_args.tuned_checkpoint)), model) else: new_model = model trainer.model = new_model results = trainer.evaluate( eval_dataset=eval_dataset, metric_key_prefix="val", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams, ) if data_args.task.startswith("summarization"): print('Accuracy: %.4f' % (sum([v for k, v in results.items() if "rouge" in k]) / 4)) if data_args.task.startswith("translation"): print('Accuracy: %.4f' % (results['val_bleu'])) print('Latency: %.3f ms' % (1 * 1000 / results["val_samples_per_second"])) print('Batch size = %d' % training_args.per_device_eval_batch_size) exit(0) # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate( metric_key_prefix="val", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams, ) metrics["val_n_objs"] = data_args.n_val metrics["val_loss"] = round(metrics["val_loss"], 4) if trainer.is_world_process_zero(): handle_metrics("val", metrics, training_args.output_dir) all_metrics.update(metrics) if training_args.do_predict: logger.info("*** Predict ***") test_output = trainer.predict( test_dataset=test_dataset, metric_key_prefix="test", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams, ) metrics = test_output.metrics metrics["test_n_objs"] = data_args.n_test if trainer.is_world_process_zero(): metrics["test_loss"] = round(metrics["test_loss"], 4) handle_metrics("test", metrics, training_args.output_dir) all_metrics.update(metrics) if training_args.predict_with_generate: test_preds = tokenizer.batch_decode( test_output.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True) test_preds = lmap(str.strip, test_preds) write_txt_file( test_preds, os.path.join(training_args.output_dir, "test_generations.txt")) if trainer.is_world_process_zero(): save_json(all_metrics, os.path.join(training_args.output_dir, "all_results.json")) return all_metrics
help='Dry run, do not write any data file.') parser.add_argument('--min-words', type=int, default=3, help='Minimum number of words to accept a sentence') parser.add_argument('--max-words', type=int, default=15, help='Maximum number of words to accept a sentence') parser.add_argument('file', type=str, help='Source XML file') parser.add_argument('output', type=str, help='Output directory') args = parser.parse_args() check_output_dir(args.output) doc = parse(args.file) indent_level = 0 visited = [] is_syceron = False accepted_seance_context = [ re.compile("CompteRendu@Metadonnees@DateSeance"), re.compile( "CompteRendu@Metadonnees@Sommaire@Sommaire1@TitreStruct@Intitule"), re.compile("CompteRendu@Contenu@Quantiemes@Journee"), #re.compile("CompteRendu@Contenu@ouverture_seance@paragraphe@ORATEURS@ORATEUR@NOM"), re.compile(".*@paragraphe@texte$"), ]