def add_model_specific_args(parser, root_dir):
    BaseTransformer.add_model_specific_args(parser, root_dir)
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--task",
        default="",
        type=str,
        required=True,
        help="The GLUE task to run",
    )
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the training files for the GLUE task.",
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    return parser

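# The hooks above and below all follow the same chaining pattern: each
# component registers its flags on a shared parser and returns it. A minimal,
# self-contained sketch of that pattern (the two helper names here are
# illustrative, not taken from the snippets):
import argparse

def add_base_args(parser):
    parser.add_argument("--learning_rate", default=5e-5, type=float)
    return parser

def add_task_args(parser):
    parser.add_argument("--max_seq_length", default=128, type=int)
    return parser

demo_parser = argparse.ArgumentParser()
demo_parser = add_task_args(add_base_args(demo_parser))
demo_args = demo_parser.parse_args([])
print(demo_args.learning_rate, demo_args.max_seq_length)  # 5e-05 128
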
def add_model_specific_args(parser, root_dir):
    BaseTransformer.add_model_specific_args(parser, root_dir)
    # Add BART specific options
    parser.add_argument(
        "--max_source_length",
        default=1024,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_target_length",
        default=56,
        type=int,
        help="The maximum total target sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the dataset files for the CNN/DM summarization task.",
    )
    parser.add_argument(
        "--logging_dir",
        default="tensorboard_logs",
        type=str,
        required=False,
        help="The directory for TensorBoard logs",
    )
    return parser

def add_model_specific_args(parser, root_dir):
    BaseTransformer.add_model_specific_args(parser, root_dir)
    add_generic_args(parser, root_dir)
    parser.add_argument(
        "--max_source_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_target_length",
        default=25,
        type=int,
        help="The maximum total target sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--val_max_target_length",
        default=25,
        type=int,
        help="The maximum total target sequence length for validation. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--test_max_target_length",
        default=25,
        type=int,
        help="The maximum total target sequence length for testing. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--logger_name", type=str, choices=["default", "wandb", "wandb_shared"], default="default")
    parser.add_argument("--n_train", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_val", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_test", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--label_smoothing", type=float, default=0.0, required=False)
    parser.add_argument(
        "--prefix",
        type=str,
        default=None,
        help="Prefix added at the beginning of each text, typically used with T5-based models.",
    )
    parser.add_argument(
        "--early_stopping_patience",
        type=int,
        default=-1,
        required=False,
        help="-1 means never early stop. early_stopping_patience is measured in validation checks, not epochs, "
        "so val_check_interval will affect it.",
    )
    parser.add_argument(
        "--distributed-port", type=int, default=-1, required=False, help="Port number for distributed training."
    )
    parser.add_argument(
        "--model_type",
        choices=["rag_sequence", "rag_token", "bart", "t5"],
        type=str,
        help="RAG model type: sequence or token, if none specified, the type is inferred from the model_name_or_path",
    )
    return parser

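# early_stopping_patience above counts validation checks, not epochs. A hedged
# sketch of how it would typically be forwarded to Lightning's EarlyStopping
# callback ("val_loss" as the monitored metric name is an assumption):
from pytorch_lightning.callbacks import EarlyStopping

def build_early_stopping(args):
    # With val_check_interval=0.25, a patience of 4 spans roughly one epoch.
    if args.early_stopping_patience < 0:  # -1 means never early stop
        return None
    return EarlyStopping(monitor="val_loss", patience=args.early_stopping_patience)
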
def add_model_specific_args(parser, root_dir):
    BaseTransformer.add_model_specific_args(parser, root_dir)
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--task",
        default="",
        type=str,
        required=True,
        help="The GLUE task to run",
    )
    parser.add_argument(
        "--gpus",
        default=0,
        type=int,
        help="The number of GPUs to allocate; defaults to 0, meaning CPU only.",
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    return parser

def add_model_specific_args(parser, root_dir):
    BaseTransformer.add_model_specific_args(parser, root_dir)
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--gpus",
        default=0,
        type=int,
        help="The number of GPUs to allocate; defaults to 0, meaning CPU only.",
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )

    def IntorFloat(string):
        # argparse type converter: values with a decimal point become floats
        # (fraction of an epoch), everything else becomes an int (batch count).
        if "." in string:
            return float(string)
        return int(string)

    parser.add_argument(
        "--val_check_interval",
        default=1.0,
        type=IntorFloat,
        help="How often to check the validation set. Use a float to check within a training epoch, "
        "an int to check every n training batches.",
    )
    return parser

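# A self-contained demonstration of the int-or-float converter above: a value
# with a decimal point is treated as a fraction of an epoch, anything else as
# a batch count (the standalone function name here is illustrative).
import argparse

def int_or_float(string):
    return float(string) if "." in string else int(string)

demo = argparse.ArgumentParser()
demo.add_argument("--val_check_interval", default=1.0, type=int_or_float)
print(demo.parse_args(["--val_check_interval", "0.25"]).val_check_interval)  # 0.25 (quarter epoch)
print(demo.parse_args(["--val_check_interval", "200"]).val_check_interval)   # 200 (every 200 batches)
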
def add_model_specific_args(parser, root_dir):
    BaseTransformer.add_model_specific_args(parser, root_dir)
    add_generic_args(parser, root_dir)
    parser.add_argument(
        "--max_source_length",
        default=1024,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_target_length",
        default=100,
        type=int,
        help="The maximum total target sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--val_max_target_length",
        default=100,  # these defaults are optimized for CNNDM. For xsum, see README.md.
        type=int,
        help="The maximum total target sequence length for validation. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--test_max_target_length",
        default=100,
        type=int,
        help="The maximum total target sequence length for testing. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--freeze_encoder", action="store_true")
    parser.add_argument("--freeze_embeds", action="store_true")
    parser.add_argument("--sortish_sampler", action="store_true", default=False)
    parser.add_argument("--max_tokens_per_batch", type=int, default=None)
    parser.add_argument("--logger_name", type=str, choices=["default", "wandb", "wandb_shared"], default="default")
    parser.add_argument("--n_train", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_val", type=int, default=500, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_test", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument(
        "--task", type=str, default="summarization", required=False, help="Task to run, e.g. summarization or translation."
    )
    parser.add_argument("--label_smoothing", type=float, default=0.0, required=False)
    parser.add_argument("--src_lang", type=str, default="", required=False)
    parser.add_argument("--tgt_lang", type=str, default="", required=False)
    parser.add_argument("--eval_beams", type=int, default=None, required=False)
    parser.add_argument(
        "--val_metric", type=str, default=None, required=False, choices=["bleu", "rouge2", "loss", None]
    )
    parser.add_argument("--eval_max_gen_length", type=int, default=None, help="never generate more than n tokens")
    parser.add_argument("--length_penalty", type=float, default=1.0, help="length penalty applied during beam search")
    parser.add_argument("--save_top_k", type=int, default=1, required=False, help="How many checkpoints to save")
    parser.add_argument(
        "--early_stopping_patience",
        type=int,
        default=-1,
        required=False,
        help="-1 means never early stop. early_stopping_patience is measured in validation checks, not epochs, "
        "so val_check_interval will affect it.",
    )
    return parser

def add_model_specific_args(parser, root_dir):
    # Add NER specific options
    BaseTransformer.add_model_specific_args(parser, root_dir)
    parser.add_argument(
        "--task_type", default="NER", type=str, help="Task type to fine tune in training (e.g. NER, POS, etc)"
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
    )
    parser.add_argument(
        "--gpus",
        default=0,
        type=int,
        help="The number of GPUs to allocate; defaults to 0, meaning CPU only.",
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    return parser

def add_model_specific_args(parser, root_dir):
    BaseTransformer.add_model_specific_args(parser, root_dir)
    add_generic_args(parser, root_dir)
    parser.add_argument(
        "--max_source_length",
        default=1024,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_target_length",
        default=56,
        type=int,
        help="The maximum total target sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--val_max_target_length",
        default=142,  # these defaults are optimized for CNNDM. For xsum, see README.md.
        type=int,
        help="The maximum total target sequence length for validation. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--test_max_target_length",
        default=142,
        type=int,
        help="The maximum total target sequence length for testing. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--data_dir",
        type=str,
        required=True,
        help="The input data dir. Should contain train.source, train.target, val.source, val.target, test.source, test.target",
    )
    parser.add_argument("--freeze_encoder", action="store_true")
    parser.add_argument("--freeze_embeds", action="store_true")
    parser.add_argument("--sortish_sampler", action="store_true", default=False)
    parser.add_argument("--logger_name", type=str, choices=["default", "wandb", "wandb_shared"], default="default")
    parser.add_argument("--n_train", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_val", type=int, default=500, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_test", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument(
        "--task", type=str, default="summarization", required=False, help="Task to run, e.g. summarization or translation."
    )
    parser.add_argument("--label_smoothing", type=float, default=0.0, required=False)
    parser.add_argument("--src_lang", type=str, default="", required=False)
    parser.add_argument("--tgt_lang", type=str, default="", required=False)
    parser.add_argument(
        "--early_stopping_patience",
        type=int,
        default=-1,
        required=False,
        help="-1 means never early stop. early_stopping_patience is measured in validation checks, not epochs, "
        "so val_check_interval will affect it.",
    )
    return parser

def add_model_specific_args(parser, root_dir):
    BaseTransformer.add_model_specific_args(parser, root_dir)
    add_generic_args(parser, root_dir)
    parser.add_argument(
        "--max_source_length",
        default=48,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_target_length",
        default=24,
        type=int,
        help="The maximum total target sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--val_max_target_length",
        default=24,
        type=int,
        help="The maximum total target sequence length for validation. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--test_max_target_length",
        default=24,
        type=int,
        help="The maximum total target sequence length for testing. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--data_dir",
        type=str,
        required=True,
        help="The input data dir. Should contain train.source, train.target, val.source, val.target, test.source, test.target",
    )
    parser.add_argument("--freeze_encoder", action="store_true")
    parser.add_argument("--freeze_embeds", action="store_true")
    parser.add_argument("--sortish_sampler", action="store_true", default=False)
    parser.add_argument("--logger_name", type=str, choices=["default", "wandb", "wandb_shared"], default="default")
    parser.add_argument("--n_train", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_val", type=int, default=500, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_test", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument(
        "--task", type=str, default="summarization", required=False, help="Task to run, e.g. summarization or translation."
    )
    parser.add_argument("--src_lang", type=str, default="", required=False)
    parser.add_argument("--tgt_lang", type=str, default="", required=False)
    parser.add_argument("--atomic", action="store_true")
    return parser

def add_model_specific_args(parser, root_dir):
    BaseTransformer.add_model_specific_args(parser, root_dir)
    # Add BART specific options
    parser.add_argument(
        "--max_source_length",
        default=384,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_target_length",
        default=512,
        type=int,
        help="The maximum total target sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the dataset files for the CNN/DM summarization task.",
    )
    parser.add_argument(
        "--early_stopping_patience",
        type=int,
        default=-1,
        required=False,
        help="-1 means never early stop. early_stopping_patience is measured in validation checks, not epochs, "
        "so val_check_interval will affect it.",
    )
    parser.add_argument(
        "--checkpoint",
        default=None,
        type=str,
        help="The checkpoint to initialize the model from",
    )
    parser.add_argument(
        "--checkpoint_model",
        default=None,
        type=str,
        help="The checkpoint to load model weights from",
    )
    return parser

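# A hedged sketch of how the --checkpoint flag above might be consumed when
# resuming; SummarizationModule is an assumed LightningModule subclass, not
# defined in these snippets.
def load_model(args):
    if args.checkpoint is not None:
        # Lightning's load_from_checkpoint restores weights and hyperparameters.
        return SummarizationModule.load_from_checkpoint(args.checkpoint)
    return SummarizationModule(args)
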
def add_model_specific_args(parser, root_dir):
    BaseTransformer.add_model_specific_args(parser, root_dir)
    add_generic_args(parser, root_dir)
    # fmt: off
    parser.add_argument(
        "--max_source_length",
        default=1024,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_target_length",
        default=56,
        type=int,
        help="The maximum total target sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--val_max_target_length",
        default=142,  # these defaults are optimized for CNNDM. For xsum, see README.md.
        type=int,
        help="The maximum total target sequence length for validation. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--test_max_target_length",
        default=142,
        type=int,
        help="The maximum total target sequence length for testing. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--data_dir",
        type=str,
        required=True,
        help="The input data dir. Should contain train.source, train.target, val.source, val.target, test.source, test.target",
    )
    parser.add_argument("--freeze_encoder", action="store_true")
    parser.add_argument("--freeze_embeds", action="store_true")
    parser.add_argument("--sortish_sampler", action="store_true", default=False)
    parser.add_argument("--logger", type=str, choices=["default", "wandb", "wandb_shared"], default="default")
    # parser.add_argument("--wandb_project", type=str, default="default")
    parser.add_argument("--n_train", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_val", type=int, default=500, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_test", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    # fmt: on
    return parser

def parse_args():
    parser = ArgumentParser()
    # add some script specific args
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument(
        "--output_dir", type=str, default="", help="Directory to write outputs to or load checkpoint from"
    )
    parser.add_argument("--do_train", action="store_true", help="Run training loop")
    parser.add_argument("--do_predict", action="store_true", help="Run test loop")
    # enable all trainer args
    parser = Trainer.add_argparse_args(parser)
    # add the base module args
    parser = BaseTransformer.add_model_specific_args(parser)
    # add the glue module args
    parser = GLUETransformer.add_model_specific_args(parser)
    # cook them all up :)
    args = parser.parse_args()
    return args

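# A sketch of how parse_args() would feed the rest of the script; the
# GLUETransformer constructor call and the fit/test flow below are assumptions
# based on the usual PyTorch Lightning setup, not code from these snippets.
if __name__ == "__main__":
    args = parse_args()
    model = GLUETransformer(args)  # assumed LightningModule subclass
    trainer = Trainer.from_argparse_args(args)
    if args.do_train:
        trainer.fit(model)
    if args.do_predict:
        trainer.test(model)
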
def add_model_specific_args(parser, root_dir):
    BaseTransformer.add_model_specific_args(parser, root_dir)
    add_generic_args(parser, root_dir)
    parser.add_argument(
        "--max_source_length",
        default=1024,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_target_length",
        default=56,
        type=int,
        help="The maximum total target sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--val_max_target_length",
        default=142,  # these defaults are optimized for CNNDM. For xsum, see README.md.
        type=int,
        help="The maximum total target sequence length for validation. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--test_max_target_length",
        default=142,
        type=int,
        help="The maximum total target sequence length for testing. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--freeze_encoder", action="store_true")
    parser.add_argument("--freeze_embeds", action="store_true")
    parser.add_argument("--sortish_sampler", action="store_true", default=False)
    parser.add_argument("--max_tokens_per_batch", type=int, default=None)
    parser.add_argument("--logger_name", type=str, choices=["default", "wandb", "wandb_shared"], default="default")
    parser.add_argument("--n_train", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_val", type=int, default=500, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_test", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument(
        "--task", type=str, default="summarization", required=False, help="Task to run, e.g. summarization or translation."
    )
    parser.add_argument("--label_smoothing", type=float, default=0.0, required=False)
    parser.add_argument("--src_lang", type=str, default="", required=False)
    parser.add_argument("--tgt_lang", type=str, default="", required=False)
    parser.add_argument(
        "--eval_beams",
        type=int,
        default=None,
        required=False,
        help="# beams to use. 0 corresponds to not using beam search.",
    )
    parser.add_argument(
        "--val_metric", type=str, default=None, required=False, choices=["bleu", "rouge2", "loss", None]
    )
    parser.add_argument("--eval_max_gen_length", type=int, default=None, help="never generate more than n tokens")
    parser.add_argument("--save_top_k", type=int, default=1, required=False, help="How many checkpoints to save")
    parser.add_argument(
        "--early_stopping_patience",
        type=int,
        default=-1,
        required=False,
        help="-1 means never early stop. early_stopping_patience is measured in validation checks, not epochs, "
        "so val_check_interval will affect it.",
    )
    parser.add_argument(
        "--json-summary",
        type=str,
        default="results/dllogger.json",
        help="If provided, the json summary will be written to the specified file.",
    )
    parser.add_argument(
        "--distill",
        type=str,
        default=None,
        choices=["sft", None],
        help="string indicating distillation to perform, only sft supported",
    )
    parser.add_argument(
        "--layers",
        type=str,
        default=None,
        help="string indicating which layers to distill for SFT, split by '-' (ex. 0-6-11)",
    )
    parser.add_argument("--do_encoder", action="store_true", default=False, help="if true distills the encoder")
    parser.add_argument("--do_decoder", action="store_true", default=False, help="if true distills the decoder")
    return parser

def add_model_specific_args(parser, root_dir):
    BaseTransformer.add_model_specific_args(parser, root_dir)
    add_generic_args(parser, root_dir)
    parser.add_argument(
        "--max_source_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_target_length",
        default=25,
        type=int,
        help="The maximum total target sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--val_max_target_length",
        default=25,
        type=int,
        help="The maximum total target sequence length for validation. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--test_max_target_length",
        default=25,
        type=int,
        help="The maximum total target sequence length for testing. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--logger_name", type=str, choices=["default", "wandb", "wandb_shared"], default="default")
    parser.add_argument("--n_train", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_val", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_test", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--label_smoothing", type=float, default=0.0, required=False)
    parser.add_argument(
        "--prefix",
        type=str,
        default=None,
        help="Prefix added at the beginning of each text, typically used with T5-based models.",
    )
    parser.add_argument(
        "--early_stopping_patience",
        type=int,
        default=-1,
        required=False,
        help="-1 means never early stop. early_stopping_patience is measured in validation checks, not epochs, "
        "so val_check_interval will affect it.",
    )
    parser.add_argument(
        "--distributed-port", type=int, default=-1, required=False, help="Port number for distributed training."
    )
    parser.add_argument(
        "--model_type",
        choices=["rag_sequence", "rag_token", "bart", "t5"],
        type=str,
        help="RAG model type: sequence or token, if none specified, the type is inferred from the model_name_or_path",
    )
    parser.add_argument(
        "--context_encoder_name",
        default="facebook/dpr-ctx_encoder-multiset-base",
        type=str,
        help="Name of the pre-trained context encoder checkpoint from the DPR",
    )
    parser.add_argument(
        "--csv_path",
        default=str(Path(__file__).parent / "test_run" / "dummy-kb" / "my_knowledge_dataset.csv"),
        type=str,
        help="path of the raw KB csv",
    )
    parser.add_argument("--end2end", action="store_true", help="whether to train the system end2end or not")
    parser.add_argument("--index_gpus", type=int, help="how many GPUs used in re-encoding process")
    parser.add_argument(
        "--shard_dir",
        type=str,
        default=str(Path(__file__).parent / "test_run" / "kb-shards"),
        help="directory used to keep temporary shards during the re-encode process",
    )
    parser.add_argument(
        "--gpu_order",
        type=str,
        help="order of the GPUs used during the fine-tuning. Used to find free GPUs during the re-encode process. "
        "I do not have many GPUs :)",
    )
    parser.add_argument("--indexing_freq", type=int, help="frequency of re-encode process")
    return parser

def add_model_specific_args(parser, root_dir):
    BaseTransformer.add_model_specific_args(parser, root_dir)
    add_generic_args(parser, root_dir)
    parser.add_argument(
        "--max_source_length",
        default=1024,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_target_length",
        default=56,
        type=int,
        help="The maximum total target sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--freeze_encoder", action="store_true")
    parser.add_argument("--freeze_embeds", action="store_true")
    parser.add_argument("--sortish_sampler", action="store_true", default=False)
    parser.add_argument("--max_tokens_per_batch", type=int, default=None)
    parser.add_argument("--logger_name", type=str, choices=["default", "wandb", "wandb_shared"], default="default")
    parser.add_argument("--n_train", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_val", type=int, default=500, required=False, help="# examples. -1 means use all.")
    parser.add_argument("--n_test", type=int, default=-1, required=False, help="# examples. -1 means use all.")
    parser.add_argument(
        "--task", type=str, default="summarization", required=False, help="Task to run, e.g. summarization or translation."
    )
    parser.add_argument("--label_smoothing", type=float, default=0.0, required=False)
    parser.add_argument("--src_lang", type=str, default="", required=False)
    parser.add_argument("--tgt_lang", type=str, default="", required=False)
    parser.add_argument("--eval_beams", type=int, default=None, required=False)
    parser.add_argument(
        "--val_metric", type=str, default=None, required=False, choices=["bleu", "rouge2", "loss", None]
    )
    parser.add_argument("--eval_max_gen_length", type=int, default=None, help="never generate more than n tokens")
    parser.add_argument(
        "--early_stopping_patience",
        type=int,
        default=-1,
        required=False,
        help="-1 means never early stop. early_stopping_patience is measured in validation checks, not epochs, "
        "so val_check_interval will affect it.",
    )

    ####################################
    ##        Decoding Strategy       ##
    ####################################
    parser.add_argument("--decode_method", choices=["greedy", "beam", "nucleus"])
    parser.add_argument(
        "--decode_num_beams", default=5, type=int, required=False, help="Number of beams for beam search decoding."
    )
    parser.add_argument(
        "--decode_p", default=0.9, type=float, required=False, help="Top-p value for nucleus sampling."
    )

    ####################################
    ##  Unlikelihood Loss Parameters  ##
    ####################################
    parser.add_argument("--unlikelihood_training", action="store_true", help="whether to use unlikelihood training")
    parser.add_argument(
        "--unlikelihood_training_mode",
        choices=["cochrane", "newsela", "both"],
        help="which weights to use for unlikelihood training",
    )
    parser.add_argument(
        "--unlikelihood_cochrane_weights_file",
        default="data/logr_weights/bart_freq_normalized_ids.txt",
        type=str,
        required=False,
        help="The file containing logistic regression weights learned on the Cochrane dataset for use in unlikelihood training",
    )
    parser.add_argument(
        "--unlikelihood_newsela_weights_file",
        default="data/logr_weights/bart_freq_newsela_ids.txt",
        type=str,
        required=False,
        help="The file containing logistic regression weights learned on the Newsela dataset for use in unlikelihood training",
    )
    parser.add_argument(
        "--unlikelihood_exclude_tokens",
        default="",
        type=str,
        required=False,
        help="Comma-separated token ids to exclude",
    )
    parser.add_argument(
        "--unlikelihood_num_weights",
        default=100,
        type=int,
        required=False,
        help="The number of weights to use in unlikelihood training; if -1, use all of them",
    )
    parser.add_argument(
        "--unlikelihood_softmax",
        action="store_true",
        help="whether to softmax the token weights in unlikelihood training",
    )
    parser.add_argument(
        "--unlikelihood_temperature",
        default=2,
        type=int,
        help="temperature to use in softmax when normalizing logistic regression weights",
    )
    parser.add_argument(
        "--unlikelihood_selective_penalty",
        action="store_true",
        help="whether to use unlikelihood loss only if argmax is the penalty token",
    )
    parser.add_argument("--unlikelihood_alpha", default=100.0, type=float, required=False, help="")

    ##########################################
    ##  Unlikelihood Loss Token Parameters  ##
    ##########################################
    parser.add_argument(
        "--unlikelihood_training_tokens",
        action="store_true",
        help="whether to use unlikelihood training at token level",
    )
    parser.add_argument("--unlikelihood_tokens_alpha", default=1.0, type=float, required=False, help="")
    return parser

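# The source/target length flags recur in nearly every variant above. An
# illustrative helper (not from the original code) that would factor out that
# duplication; the name and defaults here are assumptions:
def add_length_args(parser, source_default=1024, target_default=56, val_test_default=142):
    length_help = (
        "The maximum total %s sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded."
    )
    parser.add_argument("--max_source_length", default=source_default, type=int, help=length_help % "input")
    parser.add_argument("--max_target_length", default=target_default, type=int, help=length_help % "target")
    parser.add_argument("--val_max_target_length", default=val_test_default, type=int, help=length_help % "target")
    parser.add_argument("--test_max_target_length", default=val_test_default, type=int, help=length_help % "target")
    return parser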