def add_args(parser):
    """Add task-specific arguments to the parser."""
    # fmt: off
    TranslationTask.add_args(parser)
    parser.add_argument(
        '--langs', required=True, metavar='LANG',
        help='comma-separated list of monolingual language, '
             'for example, "en,de,fr". These should match the '
             'langs from pretraining (and be in the same order). '
             'You should always add all pretraining language idx '
             'during finetuning.')
    # BUG FIX: the help text here was copy-pasted from --langs and described
    # the wrong option entirely.
    # NOTE(review): exact semantics inferred from the flag name; confirm
    # against the task implementation that consumes this option.
    parser.add_argument(
        '--extra-lang-symbol', default='', type=str,
        help='comma-separated list of extra language symbols to make '
             'available, in addition to --langs (may be empty).')
    parser.add_argument(
        '--prepend-bos', action='store_true',
        help='prepend bos token to each sentence, which matches '
             'mBART pretraining')
def add_args(parser):
    """Add task-specific arguments to the parser."""
    # fmt: off
    TranslationTask.add_args(parser)
    noise_choices = ['random_delete', 'random_mask', 'no_noise', 'full_mask']
    parser.add_argument('--noise', default='random_delete',
                        choices=noise_choices)
    parser.add_argument('--add-mask-token', action='store_true',
                        help='add a mask token for model compatibility.')
    # Boolean toggles controlling mask/lang token usage.
    parser.add_argument('--use-mask-token', default=False, action='store_true')
    parser.add_argument('--add-lang-token', default=False, action='store_true')
    parser.add_argument('--use-lang-token', default=False, action='store_true')
    langs_help = (
        'comma-separated list of monolingual language, for example, "en,de,fr"'
        'be careful these langs are what you used for pretraining (the same order),'
        'not for finetuning.'
        'you should always add all pretraining language idx during finetuning.'
    )
    parser.add_argument('--langs', default=None, metavar='LANG',
                        help=langs_help)
def add_args(parser):
    """Add task-specific arguments to the parser."""
    # fmt: off
    TranslationTask.add_args(parser)
    parser.add_argument(
        '--remove-eos-from-source', action='store_true',
        help='if set, remove eos from end of source if it\'s present')
    parser.add_argument(
        '--load-dependency', action='store_true',
        help='load the dependency heads')
    parser.add_argument(
        '--dependency-with-input', action='store_true',
        help='if set, target-side\'s dependencies are based on the inputs')
    parser.add_argument(
        '--use-gold-dependency', action='store_true',
        help='use the source\'s gold dependency for inference')
    # nargs='?' lets the flag be given with or without a value; a bare
    # --print-dependency falls back to the 'hard' mode via const.
    parser.add_argument(
        '--print-dependency', nargs='?', const='hard',
        help='if set, uses attention feedback to compute and print dependency')
def add_args(parser):
    """Add task-specific arguments to the parser."""
    TranslationTask.add_args(parser)
    # Only the noising strategy is configurable for this task.
    parser.add_argument(
        '--noise',
        default='no_noise',
        choices=['random_mask', 'no_noise', 'full_mask'],
    )
def add_args(parser):
    """Add task-specific arguments to the parser."""
    # fmt: off
    TranslationTask.add_args(parser)
    moe_methods = ['sMoElp', 'sMoEup', 'hMoElp', 'hMoEup']
    parser.add_argument('--method', default='hMoEup', choices=moe_methods)
    parser.add_argument('--num-experts', default=3, type=int, metavar='N',
                        help='number of experts')
    parser.add_argument('--mean-pool-gating-network', action='store_true',
                        help='use a simple mean-pooling gating network')
    parser.add_argument('--mean-pool-gating-network-dropout', type=float,
                        help='dropout for mean-pooling gating network')
    parser.add_argument('--mean-pool-gating-network-encoder-dim', type=float,
                        help='encoder output dim for mean-pooling gating network')
    parser.add_argument('--gen-expert', type=int, default=0,
                        help='which expert to use for generation')
def add_args(parser):
    """Add morphology-dropout options on top of the base translation task."""
    TranslationTask.add_args(parser)
    # All three options share the same simple (type, default) shape.
    for flag, kind, fallback in [
        ('--morpho-dropout', float, 0.5),
        ('--morpho-dropout-initial', float, None),
        ('--morpho-dropout-end-epoch', int, None),
    ]:
        parser.add_argument(flag, type=kind, default=fallback)
def add_args(parser):
    """Add task-specific arguments to the parser."""
    # fmt: off
    TranslationTask.add_args(parser)
    noise_kinds = [
        'random_delete',
        'random_delete_shuffle',
        'random_mask',
        'no_noise',
        'full_mask',
    ]
    parser.add_argument('--noise', default='random_delete',
                        choices=noise_kinds)
    parser.add_argument('--random-seed', default=1, type=int)
def add_args(parser):
    """Add task-specific arguments to the parser."""
    TranslationTask.add_args(parser)
    langs_help = (
        'comma-separated list of monolingual language, '
        'for example, "en,de,fr". These should match the '
        'langs from pretraining (and be in the same order). '
        'You should always add all pretraining language idx '
        'during finetuning.'
    )
    parser.add_argument('--langs', required=True, metavar='LANG',
                        help=langs_help)
def add_args(parser):
    """Add task-specific arguments to the parser."""
    # fmt: off
    TranslationTask.add_args(parser)
    parser.add_argument('--bert-model', type=str, metavar='DIR',
                        required=True, help='path to the BERT model')
    parser.add_argument('--fine-tuning', action='store_true',
                        help='if set, the BERT model will be tuned')
    # Force right-padded source batches by default — presumably required by
    # the BERT encoder; confirm against the model implementation.
    parser.set_defaults(left_pad_source=False)
def add_args(parser):
    """Add task-specific arguments to the parser."""
    # fmt: off
    TranslationTask.add_args(parser)
    parser.add_argument(
        '--noise',
        default='random_delete',
        choices=['random_delete', 'random_mask', 'no_noise', 'full_mask'])

    def _str2bool(value):
        # BUG FIX: the options below used type=bool, and argparse's
        # bool() treats every non-empty string (including "False") as
        # True. Parse common boolean spellings explicitly; argparse
        # converts the ValueError into a clean usage error.
        if isinstance(value, bool):
            return value
        lowered = value.lower()
        if lowered in ('yes', 'true', 't', '1'):
            return True
        if lowered in ('no', 'false', 'f', '0'):
            return False
        raise ValueError('expected a boolean, got %r' % value)

    # BUG FIX: both help strings were copy-pasted ("the path of the mean
    # file.") and described an unrelated option.
    parser.add_argument('--append_eos_to_target', default=True,
                        type=_str2bool,
                        help="append the eos token to each target sentence")
    parser.add_argument('--append_bos_to_target', default=True,
                        type=_str2bool,
                        help="prepend the bos token to each target sentence")
def initialize(self, context):
    """
    load model and extra files.

    Builds either a real fairseq SequenceGenerator from the checkpoint in
    the serving context, or a FakeGenerator when the shipped config asks
    for a dummy model (BPE-only testing).
    """
    # The serving context carries paths and device info for this worker.
    logger.info(
        f"Will initialize with system_properties: {context.system_properties}"
    )
    model_pt_path, model_file_dir, device = self._handler_initialize(
        context)
    # Generation hyper-parameters shipped alongside the model weights.
    config = json.loads(
        (Path(model_file_dir) / "model_generation.json").read_text())
    self.device = device
    translation_cfg = TranslationConfig()
    self.vocab = TranslationTask.load_dictionary("dict.txt")
    # SentencePiece model used for subword tokenization at inference time.
    self.spm = sentencepiece.SentencePieceProcessor()
    self.spm.Load("sentencepiece.bpe.model")
    logger.info("Loaded sentencepiece.bpe.model")
    if config.get("dummy", False):
        # "dummy" mode skips loading real weights so the BPE pipeline can
        # be exercised without a checkpoint.
        self.sequence_generator = FakeGenerator()
        logger.warning("Will use a FakeGenerator model, only testing BPE")
    else:
        # Same dictionary on both sides: source and target share the vocab.
        task = TranslationTask(translation_cfg, self.vocab, self.vocab)
        [model], cfg = fairseq.checkpoint_utils.load_model_ensemble(
            [model_pt_path], task=task)
        model.eval().to(self.device)
        logger.info(
            f"Loaded model from {model_pt_path} to device {self.device}")
        logger.info(
            f"Will use the following config: {json.dumps(config, indent=4)}"
        )
        self.sequence_generator = SequenceGenerator(
            [model],
            tgt_dict=self.vocab,
            beam_size=config.get("beam_size", 1),
            max_len_a=config.get("max_len_a", 1.3),
            max_len_b=config.get("max_len_b", 5),
            min_len=config.get("min_len", 5),
            max_len=model.max_decoder_positions(),
        )
        if not self.sequence_generator.model.has_incremental:
            logger.warning("Incremental generation is disabled !!!")
    self.taskIO = TaskIO()
    self.initialized = True
def add_args(parser):
    """Add task-specific arguments to the parser."""
    # fmt: off
    TranslationTask.add_args(parser)
    # Training:
    parser.add_argument('--pretrained', default=None, type=str)
    parser.add_argument('--copy-embeddings', action='store_true')
    parser.add_argument('--copy-network', action='store_true')
    # The remaining options all follow the same (flag, default, type)
    # pattern, so register them from a spec table.
    for flag, fallback, kind in [
        ('--shift', 1, int),
        ('--policy', 'eos', str),
        ('--write-threshold', 0.5, float),
        ('--align-index', 1, int),
        ('--pick-alignment', 'index', str),
        ('--path-oracle', 'alignment', str),
        ('--path-oracle-rank', 50, int),
        ('--path-oracle-tol', 0.1, float),
        ('--path-oracle-lookahead', 0, int),
        ('--path-oracle-waitk', 7, int),
        ('--path-oracle-width', 3, int),
    ]:
        parser.add_argument(flag, default=fallback, type=kind)
def add_args(parser):
    """Add task-specific arguments to the parser."""
    # fmt: off
    TranslationTask.add_args(parser)
    parser.add_argument('--mono-data', default=None,
                        help='monolingual data, split by :')
    parser.add_argument('--mono-one-split-each-epoch',
                        action='store_true', default=False,
                        help='use on split of monolingual data at each epoch')
    # Both subsampling ratios share the same shape; register from a table.
    for flag, text in [
        ('--parallel-ratio', 'subsample ratio of parallel data'),
        ('--mono-ratio', 'subsample ratio of mono data'),
    ]:
        parser.add_argument(flag, default=1.0, type=float, help=text)
def add_args(parser):
    """Add task-specific arguments to the parser."""
    # fmt: off
    TranslationTask.add_args(parser)
    parser.add_argument(
        '--noise',
        default='random_delete',
        choices=['random_delete', 'random_mask', 'no_noise', 'full_mask'])
    parser.add_argument('--shape_size', default=224, type=int,
                        help="the shape size of image")
    # NOTE(review): the flag keeps the historical 'startegy' typo for
    # backward compatibility with existing scripts and configs.
    parser.add_argument(
        '--sample-startegy',
        default="sampling_with_tgt_len", type=str,
        choices=["sampling_with_tgt_len", "sampling_with_src_len"],
        help="sampling frames from the video")
    # BUG FIX: added the missing space between the two concatenated help
    # fragments ("timesthe" -> "times the").
    parser.add_argument(
        '--tgtlen-times', default=5, type=int,
        help="The maximum number of frames of the source video is "
             "'tgtlen_times' times the length of the target sentence, "
             "if use, the sample_startegy must be sampling_with_tgt_len")
    parser.add_argument('--mean-img-file', default=None, type=str,
                        help="the path of the mean file.")

    def _str2bool(value):
        # BUG FIX: the two options below used type=bool, and argparse's
        # bool() maps every non-empty string (including "False") to True.
        # Parse boolean spellings explicitly; argparse converts the
        # ValueError into a clean usage error.
        if isinstance(value, bool):
            return value
        lowered = value.lower()
        if lowered in ('yes', 'true', 't', '1'):
            return True
        if lowered in ('no', 'false', 'f', '0'):
            return False
        raise ValueError('expected a boolean, got %r' % value)

    # BUG FIX: both help strings were copy-pasted from --mean-img-file.
    parser.add_argument('--append_eos_to_target', default=True,
                        type=_str2bool,
                        help="append the eos token to each target sentence")
    parser.add_argument('--append_bos_to_target', default=True,
                        type=_str2bool,
                        help="prepend the bos token to each target sentence")
def add_args(parser):
    """Add task-specific arguments to the parser."""
    # fmt: off
    TranslationTask.add_args(parser)
    parser.add_argument(
        '--langs', type=str, metavar='LANG',
        help='comma-separated list of monolingual language, '
             'for example, "en,de,fr". These should match the '
             'langs from pretraining (and be in the same order). '
             'You should always add all pretraining language idx '
             'during finetuning.')
    parser.add_argument(
        '--prepend-bos', action='store_true',
        help='prepend bos token to each sentence, which matches '
             'mBART pretraining')

    def _required_str(flag, text):
        # Every domain-related option is a mandatory string argument.
        parser.add_argument(flag, type=str, required=True, help=text)

    _required_str('--domain-dict',
                  'Path a file that contains a list of all domains (same format as dict.txt)')
    _required_str('--train-domains',
                  'File of same line count as training split where each '
                  'line has some domain from the domain_dict.txt')
    _required_str('--valid-domains',
                  'File of same line count as validation split where each '
                  'line has some domain from the domain_dict.txt')
def from_checkpoint(self, checkpoint, roberta_cache_path=None, inspector=None):
    ''' Initialize model from checkpoint

    Loads the fairseq args/dictionaries/model stored next to `checkpoint`,
    plus the pretrained (RoBERTa-style) embeddings and the state-machine
    rules, and returns a new instance built from them.
    NOTE(review): `self(...)` at the end suggests this is used as an
    alternate constructor (self is really the class) — confirm decorator.
    '''
    # load fairseq task
    parser = options.get_interactive_generation_parser()
    options.add_optimization_args(parser)
    args = options.parse_args_and_arch(parser, input_args=['--data dummy'])

    # Read extra arguments
    # checkpoint may be a ':'-separated ensemble; use the first path's dir.
    model_folder = os.path.dirname(checkpoint.split(':')[0])
    # config with fairseq-preprocess and fairseq-train args
    config_json = f'{model_folder}/config.json'
    assert os.path.isfile(config_json), \
        "Model trained with v0.3.0 or above?"
    with open(config_json) as fid:
        extra_args = json.loads(fid.read())
    prepro_args = extra_args['fairseq_preprocess_args']
    train_args = extra_args['fairseq_train_args']

    # extra args by hand
    args.source_lang = 'en'
    args.target_lang = 'actions'
    args.path = checkpoint
    args.roberta_cache_path = roberta_cache_path
    dim = train_args['--pretrained-embed-dim'][0]
    # Passed as a literal dict string, as fairseq expects for overrides.
    args.model_overrides = \
        "{'pretrained_embed_dim':%s, 'task': 'translation'}" % dim
    assert bool(args.left_pad_source), "Only left pad supported"

    # dictionaries
    src_dict_path = f'{model_folder}/dict.{args.source_lang}.txt'
    tgt_dict_path = f'{model_folder}/dict.{args.target_lang}.txt'
    assert os.path.isfile(src_dict_path), \
        f"Missing {src_dict_path}.\nModel trained with v0.3.0 or above?"\
        "\ncheck scripts/stack-transformer/update_model_to_v0.3.0.sh"
    assert os.path.isfile(tgt_dict_path), \
        f"Missing {tgt_dict_path}.\nModel trained with v0.3.0 or above?"\
        "\ncheck scripts/stack-transformer/update_model_to_v0.3.0.sh"
    src_dict = Dictionary.load(src_dict_path)
    tgt_dict = Dictionary.load(tgt_dict_path)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Override task to ensure compatibility with old models and overide
    # TODO: Task may not be even needed
    task = TranslationTask(args, src_dict, tgt_dict)
    model = load_models(args, task, use_cuda)

    # Load RoBERTa
    embeddings = PretrainedEmbeddings(
        name=prepro_args['--pretrained-embed'][0],
        # --bert-layers is optional in the preprocessing args.
        bert_layers=[int(x) for x in prepro_args['--bert-layers']]
        if '--bert-layers' in prepro_args else None,
        model=load_roberta(name=prepro_args['--pretrained-embed'][0],
                           roberta_cache_path=args.roberta_cache_path,
                           roberta_use_gpu=use_cuda))
    print("Finished loading models")

    # State machine variables
    machine_rules = f'{model_folder}/train.rules.json'
    assert os.path.isfile(machine_rules), f"Missing {machine_rules}"
    machine_type = prepro_args['--machine-type'][0]

    return self(model, machine_rules, machine_type, src_dict, tgt_dict,
                use_cuda, embeddings=embeddings, inspector=inspector)
def build_generator(self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None):
    """Build a NoisyChannelSequenceGenerator combining the direct model,
    an optional reversed channel model P(S|T), and a language model P(T).

    Requires --lm-model and --lm-data; --channel-model is optional.
    """
    if getattr(args, "score_reference", False):
        raise NotImplementedError()
    else:
        from .noisy_channel_sequence_generator import NoisyChannelSequenceGenerator
        use_cuda = torch.cuda.is_available() and not self.args.cpu
        assert self.args.lm_model is not None, '--lm-model required for noisy channel generation!'
        assert self.args.lm_data is not None, '--lm-data required for noisy channel generation to map between LM and bitext vocabs'
        if self.args.channel_model is not None:
            import copy
            # The channel model scores P(S|T), i.e. the reversed language
            # pair, so swap source and target in a copy of the task args.
            ch_args_task = copy.deepcopy(self.args)
            tmp = ch_args_task.source_lang
            ch_args_task.source_lang = ch_args_task.target_lang
            ch_args_task.target_lang = tmp
            ch_args_task._name = 'translation'
            channel_task = TranslationTask.setup_task(ch_args_task)

        # Build a language-modeling task over --lm-data for the LM.
        arg_dict = {}
        arg_dict['task'] = 'language_modeling'
        arg_dict['sample_break_mode'] = 'eos'
        arg_dict['data'] = self.args.lm_data
        arg_dict['output_dictionary_size'] = -1
        lm_args = argparse.Namespace(**arg_dict)
        lm_task = LanguageModelingTask.setup_task(lm_args)
        lm_dict = lm_task.output_dictionary

        if self.args.channel_model is not None:
            # ':'-separated paths form an ensemble of channel models.
            channel_models, _ = checkpoint_utils.load_model_ensemble(
                self.args.channel_model.split(':'), task=channel_task)
            for model in channel_models:
                model.make_generation_fast_(
                    beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
                    need_attn=args.print_alignment,
                )
                if self.args.fp16:
                    model.half()
                if use_cuda:
                    model.cuda()
        else:
            channel_models = None

        lm_models, _ = checkpoint_utils.load_model_ensemble(
            self.args.lm_model.split(':'), task=lm_task)
        for model in lm_models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
                need_attn=args.print_alignment,
            )
            if self.args.fp16:
                model.half()
            if use_cuda:
                model.cuda()

        return NoisyChannelSequenceGenerator(
            combine_method=self.args.combine_method,
            tgt_dict=self.target_dictionary,
            src_dict=self.source_dictionary,
            beam_size=getattr(args, 'beam', 5),
            max_len_a=getattr(args, 'max_len_a', 0),
            max_len_b=getattr(args, 'max_len_b', 200),
            min_len=getattr(args, 'min_len', 1),
            len_penalty=getattr(args, 'lenpen', 1),
            unk_penalty=getattr(args, 'unkpen', 0),
            temperature=getattr(args, 'temperature', 1.),
            match_source_len=getattr(args, 'match_source_len', False),
            no_repeat_ngram_size=getattr(args, 'no_repeat_ngram_size', 0),
            normalize_scores=(not getattr(args, 'unnormalized', False)),
            channel_models=channel_models,
            k2=getattr(self.args, 'k2', 50),
            ch_weight=getattr(self.args, 'ch_wt', 1),
            channel_scoring_type=self.args.channel_scoring_type,
            top_k_vocab=self.args.top_k_vocab,
            lm_models=lm_models,
            lm_dict=lm_dict,
            lm_weight=getattr(self.args, 'lm_wt', 1),
            normalize_lm_scores_by_tgt_len=getattr(
                self.args, 'normalize_lm_scores_by_tgt_len', False),
        )
def add_args(parser):
    """Add task-specific arguments to the parser.

    These options configure noisy-channel decoding: a channel model
    P(S|T), a language model P(T), and how their scores are combined
    with the direct model's.
    """
    TranslationTask.add_args(parser)
    # fmt: off
    parser.add_argument(
        '--channel-model', metavar='FILE',
        help='path to P(S|T) model. P(S|T) and P(T|S) must share source and target dictionaries.')
    parser.add_argument(
        '--combine-method', default='lm_only',
        choices=['lm_only', 'noisy_channel'],
        help="""method for combining direct and channel model scores. lm_only: decode with P(T|S)P(T) noisy_channel: decode with 1/t P(T|S) + 1/s(P(S|T)P(T))""")
    parser.add_argument(
        '--normalize-lm-scores-by-tgt-len', action='store_true', default=False,
        help='normalize lm score by target length instead of source length')
    parser.add_argument(
        '--channel-scoring-type', default='log_norm',
        choices=['unnormalized', 'log_norm', 'k2_separate', 'src_vocab', 'src_vocab_batched'],
        help="Normalize bw scores with log softmax or return bw scores without log softmax")
    parser.add_argument(
        '--top-k-vocab', default=0, type=int,
        help='top k vocab IDs to use with `src_vocab` in channel model scoring')
    parser.add_argument(
        '--k2', default=50, type=int,
        help='the top k2 candidates to rescore with the noisy channel model for each beam')
    parser.add_argument(
        '--ch-wt', default=1, type=float,
        help='weight for the channel model')
    parser.add_argument(
        '--lm-model', metavar='FILE',
        help='path to lm model file, to model P(T). P(T) must share the same vocab as the direct model on the target side')
    parser.add_argument(
        '--lm-data', metavar='FILE',
        help='path to lm model training data for target language, used to properly load LM with correct dictionary')
    parser.add_argument(
        '--lm-wt', default=1, type=float,
        help='the weight of the lm in joint decoding')
def add_args(parser):
    """Add task-specific arguments to the parser."""
    TranslationTask.add_args(parser)
    # NOTE(review): this help string looks copy-pasted from a lazy-loading
    # option and likely does not describe --without-padding; confirm.
    parser.add_argument('--without-padding', action='store_true',
                        help='load the dataset lazily')
def add_args(parser):
    """Add task-specific arguments to the parser."""
    # fmt: off
    TranslationTask.add_args(parser)
    # Forgetting/decay schedule: rho = (t + decay)^{-forget}
    parser.add_argument('--forget-rate', type=float, default=0.9, metavar='D',
                        help='rho = (t + decay)^{-forget}')
    parser.add_argument('--decay-rate', type=float, default=1., metavar='D',
                        help='rho = (t + decay)^{-forget}')
    parser.add_argument('--retrieve-split', type=str, default='train',
                        help='the retrieve pool')
    # Relative optimizer update frequencies for decoder vs. encoder.
    parser.add_argument('--dec-opt-freq', type=int, default=1,
                        help='the relative update freq of decoder')
    parser.add_argument('--enc-opt-freq', type=int, default=1,
                        help='the relative update freq of encoder')
    parser.add_argument('--iw-nsamples', type=int, default=1000,
                        help='number of importance-weighted samples')
    parser.add_argument('--eval-mode', type=str, default='none',
                        choices=['iw', 'entropy', 'gen_sample',
                                 'gen_reconstruction', 'time', 'none',
                                 'from_file', 'gen_interpolation'],
                        help='evaluation modes')
    parser.add_argument('--eval-gen-file', type=str, default=None,
                        help='read in prototypes and edit vectors')
    parser.add_argument('--eval-gen-edit-vec', action='store_true',
                        default=False,
                        help='write edit vectors in the generation file')
    parser.add_argument('--prune-num', type=int, default=-1,
                        help='perform evaluation based on top prune_num templates only')
    # parser.add_argument('--prune-num-offline', type=int, default=-1,
    #                     help='perform evaluation based on top prune_num templates only (offline version)')
    parser.add_argument('--free-bits', type=float, default=0,
                        help='the free bits param to regularize KLt, 0 to disable')
    parser.add_argument('--lambda-t-config', default="1.0", type=str,
                        metavar='CONFIG',
                        help='KLt coefficient '
                             'use fixed weight during training if set to floating point number. '
                             'use piecewise linear function over number of updates to schedule the '
                             'weight with the format: w0:step0,w1:step1,...')
    parser.add_argument('--gen-nz', type=int, default=10,
                        help='number of edit vector samples to draw from the prior')
    parser.add_argument('--gen-np', type=int, default=200,
                        help='number of top prototypes')
    parser.add_argument('--write-loss-path', type=str, default=None,
                        help='write out loss at evaluation time for interpolation exp')
def add_args(parser):
    """Add task-specific arguments to the parser."""
    TranslationTask.add_args(parser)
    # wait-k value applied when evaluating (simultaneous decoding).
    parser.add_argument('--eval-waitk', type=int, default=3)
def add_args(parser):
    """Add task-specific arguments to the parser.

    This task adds no options beyond the base translation task's.
    """
    # Removed a redundant trailing `pass` — the call above already forms
    # the complete body.
    TranslationTask.add_args(parser)
def add_args(parser):
    """Add BART-style denoising and multilingual arguments to the parser."""
    TranslationTask.add_args(parser)
    # bart setting
    parser.add_argument(
        "--mask",
        default=0.0,
        type=float,
        help="fraction of words/subwords that will be masked",
    )
    parser.add_argument(
        "--mask-random",
        default=0.0,
        type=float,
        help="instead of using [MASK], use random token this often",
    )
    parser.add_argument(
        "--insert",
        default=0.0,
        type=float,
        help="insert this percentage of additional random tokens",
    )
    # BUG FIX: the help text was copy-pasted from a sentence-shuffling
    # option; --poisson-lambda parameterizes the span-length distribution
    # used by --mask-length span-poisson (see choices below).
    parser.add_argument(
        "--poisson-lambda",
        default=3.0,
        type=float,
        help="lambda of the Poisson distribution used to sample masked span lengths (with --mask-length span-poisson)",
    )
    parser.add_argument(
        "--mask-length",
        default="span-poisson",
        type=str,
        choices=["subword", "word", "span-poisson"],
        help="mask length to choose",
    )
    parser.add_argument(
        "--replace-length",
        default=1,
        type=int,
        help="when masking N tokens, replace with 0, 1, or N tokens (use -1 for N)",
    )
    # multi-lingual
    parser.add_argument(
        "--multilang-sampling-alpha",
        type=float,
        default=1.0,
        help="smoothing alpha for sample ratios across multiple datasets",
    )
    parser.add_argument(
        "--lang-pairs",
        default="",
        metavar="PAIRS",
        help="comma-separated list of language pairs (in training order): phnen-en,phnfr-fr,phnit-it. Do masking",
    )
    parser.add_argument(
        "--lang-pairs-bitext",
        default="",
        metavar="PAIRS",
        help="comma-separated list of language pairs (in training order): en-de,en-fr,de-fr. No masking",
    )
    parser.add_argument("--add-src-lang-token", default=False, action="store_true")
    parser.add_argument("--add-tgt-lang-token", default=False, action="store_true")
    parser.add_argument(
        "--no-whole-word-mask-langs",
        type=str,
        default="",
        metavar="N",
        help="languages without spacing between words dont support whole word masking",
    )
    parser.add_argument("--use-mask-whole-words", default=False, action="store_true")
epoch=epoch, disable_iterator_cache=not cached, # Set this to False to speed up. However, if set to False, changing max_tokens beyond # first call of this method has no effect. ) return batch_iterator if __name__ == '__main__': args = parse_args() # todo: 返回是tuble 而不是cfgdic hw5_config = get_cfg(args) task_cfg = TranslationConfig( data=hw5_config.get("data_path"), source_lang=hw5_config.get("source_lang"), target_lang=hw5_config.get("target_lang"), train_subset="train", required_seq_len_multiple=8, dataset_impl="mmap", upsample_primary=1, ) task = TranslationTask.setup_task(task_cfg) demo_epoch_obj = load_data_iterator(task, "valid", epoch=1, max_tokens=20, num_workers=1, cached=False) demo_iter = demo_epoch_obj.next_epoch_itr(shuffle=True) sample = next(demo_iter) sample
def main(args):
    """Train a model with two frozen teacher models (MT and NHG).

    Sets up the fairseq task, builds the student model and the two
    teachers, freezes the teachers, then runs the train/validate/save
    loop until the LR floor, max epoch, or max update is reached.
    """
    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(task, ['train', 'valid'])

    # Build model and criterion
    model = task.build_model(args)
    # MT_teach = task.build_model(args, mode="MT")
    # NOTE(review): 'load_pretained' keeps the upstream method's typo.
    MT_teach = TranslationTask.load_pretained_model(
        args.pre_trained_mt, args.mt_src_dict, args.mt_tgt_dict)
    NHG_teach = task.build_model(args, mode="NHG")
    if hasattr(args, "share_decoder_input_output_embed"):
        args.share_decoder_input_output_embed = False
    # print(model)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters())))

    # Build trainer
    if args.fp16:
        trainer = FP16Trainer(args, task, model, criterion)
    else:
        if torch.cuda.get_device_capability(0)[0] >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        trainer = Trainer(args, task, model, criterion, NHG_teach, MT_teach)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    max_positions = trainer.get_model().max_positions()
    epoch_itr = data.EpochBatchIterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences_valid,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    )

    # Load the latest checkpoint if one is available
    os.makedirs(args.save_dir, exist_ok=True)
    load_checkpoint(args, trainer, epoch_itr, teacher_model=True)

    # Teachers are frozen: no gradients flow into them.
    for para in NHG_teach.parameters():
        para.requires_grad = False
    for para in MT_teach.parameters():
        para.requires_grad = False

    # Send a dummy batch to warm the caching allocator
    # dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions)
    # trainer.dummy_train_step(dummy_batch)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)
        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses, valid_state = validate(args, trainer, task, epoch_itr, valid_subsets)
        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])
        # save checkpoint
        # NOTE(review): valid_state is only bound when validation ran this
        # epoch; save_interval must align with validate_interval or this
        # raises NameError.
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0], valid_state)
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path, data_path, spm_model_path=None):
    """Convert a fairseq FSMT checkpoint into a HuggingFace-style dump
    (config.json, pytorch weights, vocab.txt, optional SPM model)."""
    # assumes join dicitionary
    json_indent = 2

    # prep
    assert os.path.exists(fsmt_checkpoint_path)
    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
    print(f"Writing results to {pytorch_dump_folder_path}")

    # NOTE(review): this rewrites the data path inside the ORIGINAL
    # checkpoint file on disk — a side effect on the input file.
    chkpt = torch.load(fsmt_checkpoint_path)
    chkpt['cfg']['task'].data = data_path
    chkpt['cfg']['model'].data = data_path
    torch.save(chkpt, fsmt_checkpoint_path)
    task_args, model_args = chkpt['cfg']['task'], chkpt['cfg']['model']
    task = TranslationTask.setup_task(task_args)
    model = task.build_model(model_args)

    # model config
    fsmt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json")
    model_conf = {
        "architectures": ["FSMTForConditionalGeneration"],
        "model_type": "fsmt",
        "activation_dropout": model_args.activation_dropout,
        "activation_function": "relu",
        "attention_dropout": model_args.attention_dropout,
        "d_model": model_args.decoder_embed_dim,
        "dropout": model_args.dropout,
        "init_std": 0.02,
        "max_position_embeddings": model_args.max_source_positions,
        "num_hidden_layers": model_args.encoder_layers,
        "src_vocab_size": len(task.source_dictionary),
        "tgt_vocab_size": len(task.target_dictionary),
        "langs": [task_args.source_lang, task_args.target_lang],
        "encoder_attention_heads": model_args.encoder_attention_heads,
        "encoder_ffn_dim": model_args.encoder_ffn_embed_dim,
        "encoder_layerdrop": model_args.encoder_layerdrop,
        "encoder_layers": model_args.encoder_layers,
        "decoder_attention_heads": model_args.decoder_attention_heads,
        "decoder_ffn_dim": model_args.decoder_ffn_embed_dim,
        "decoder_layerdrop": model_args.decoder_layerdrop,
        "decoder_layers": model_args.decoder_layers,
        "bos_token_id": 0,
        "pad_token_id": 1,
        "eos_token_id": 2,
        "is_encoder_decoder": True,
        "scale_embedding": not model_args.no_scale_embedding,
        "tie_word_embeddings": model_args.share_all_embeddings,
        "share_decoder_input_output_embed": model_args.share_decoder_input_output_embed
    }

    # good hparam defaults to start with
    model_conf["num_beams"] = 5
    model_conf["early_stopping"] = False
    model_conf["length_penalty"] = 1.0

    print(f"Generating {fsmt_model_config_file}")
    with open(fsmt_model_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))

    # model
    hub_gen = TransformerModel.from_pretrained(
        dirname(fsmt_checkpoint_path),
        checkpoint_file=basename(fsmt_checkpoint_path),
        data_name_or_path=task_args.data)
    model_state_dict = hub_gen.models[0].state_dict()

    # rename keys to start with 'model.'
    model_state_dict = OrderedDict(
        ("model." + k, v) for k, v in model_state_dict.items())

    # remove unneeded keys
    ignore_keys = [
        "model.model",
        "model.encoder.version",
        "model.decoder.version",
        #"model.encoder_embed_tokens.weight",
        #"model.decoder_embed_tokens.weight",
        "model.encoder.embed_positions._float_tensor",
        "model.decoder.embed_positions._float_tensor",
    ]
    for k in ignore_keys:
        model_state_dict.pop(k, None)
    #print(model_state_dict.keys())

    config = FSMTConfig.from_pretrained(pytorch_dump_folder_path)
    model_new = FSMTForConditionalGeneration(config)

    # check that it loads ok
    # NOTE(review): strict=False silently tolerates missing/unexpected keys.
    model_new.load_state_dict(model_state_dict, strict=False)

    # save
    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
    print(f"Generating {pytorch_weights_dump_path}")
    torch.save(model_state_dict, pytorch_weights_dump_path)

    pytorch_vocab_dump_path = os.path.join(pytorch_dump_folder_path, "vocab.txt")
    print(f"Generating {pytorch_vocab_dump_path}")
    # Joined dictionary assumed: source and target vocabs must be identical.
    assert hub_gen.src_dict.indices == hub_gen.tgt_dict.indices
    with open(pytorch_vocab_dump_path, 'w') as f:
        for item in hub_gen.src_dict.indices:
            f.write("%s\n" % item)

    if spm_model_path is not None:
        copyfile(spm_model_path, f"{pytorch_dump_folder_path}/spm_model.spm")

    print("Conversion is done!")