def train_translation_model(data_dir, arch, extra_flags=None, task='translation',
                            run_validation=False, lang_flags=None, extra_valid_flags=None):
    if lang_flags is None:
        lang_flags = [
            '--source-lang', 'in',
            '--target-lang', 'out',
        ]
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            '--task', task,
            data_dir,
            '--save-dir', data_dir,
            '--arch', arch,
            '--optimizer', 'nag',
            '--lr', '0.05',
            '--max-tokens', '500',
            '--max-epoch', '1',
            '--no-progress-bar',
            '--distributed-world-size', '1',
            '--num-workers', '0',
        ] + lang_flags + (extra_flags or []),
    )
    train.main(train_args)

    if run_validation:
        # test validation
        validate_parser = options.get_validation_parser()
        validate_args = options.parse_args_and_arch(
            validate_parser,
            [
                '--task', task,
                data_dir,
                '--path', os.path.join(data_dir, 'checkpoint_last.pt'),
                '--valid-subset', 'valid',
                '--max-tokens', '500',
                '--no-progress-bar',
                '--num-workers', '0',
            ] + lang_flags + (extra_valid_flags or []),
        )
        validate.main(validate_args)
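# Usage sketch (assumption, not from the original source): `data_dir` is a
# placeholder and must already contain a dataset binarized with
# fairseq-preprocess using the 'in'/'out' language pair; 'fconv_iwslt_de_en'
# is one registered fairseq architecture.
def _example_train_translation_model(data_dir):
    train_translation_model(data_dir, 'fconv_iwslt_de_en', run_validation=True)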
def train_masked_lm(data_dir, arch, extra_flags=None):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            '--task', 'masked_lm',
            data_dir,
            '--arch', arch,
            '--optimizer', 'adam',
            '--lr', '0.0001',
            '--criterion', 'masked_lm',
            '--max-sentences', '500',
            '--save-dir', data_dir,
            '--max-epoch', '1',
            '--no-progress-bar',
            '--distributed-world-size', '1',
            '--ddp-backend', 'no_c10d',
            '--num-workers', '0',  # must be a string: argparse rejects int args
        ] + (extra_flags or []),
    )
    train.main(train_args)
def train_roberta_head(data_dir, arch, num_classes=2, extra_flags=None):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            '--task', 'sentence_prediction',
            data_dir,
            '--arch', arch,
            '--encoder-layers', '2',
            '--num-classes', str(num_classes),
            '--optimizer', 'adam',
            '--lr', '0.0001',
            '--criterion', 'sentence_prediction',
            '--max-tokens', '500',
            '--max-positions', '500',
            '--batch-size', '500',
            '--save-dir', data_dir,
            '--max-epoch', '1',
            '--no-progress-bar',
            '--distributed-world-size', '1',
            '--ddp-backend', 'no_c10d',
            '--num-workers', '0',  # must be a string: argparse rejects int args
        ] + (extra_flags or []),
    )
    train.main(train_args)
def train_language_model(data_dir, arch, extra_flags=None, run_validation=False):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            '--task', 'language_modeling',
            data_dir,
            '--arch', arch,
            '--optimizer', 'adam',
            '--lr', '0.0001',
            '--criterion', 'adaptive_loss',
            '--adaptive-softmax-cutoff', '5,10,15',
            '--max-tokens', '500',
            '--tokens-per-sample', '500',
            '--save-dir', data_dir,
            '--max-epoch', '1',
            '--no-progress-bar',
            '--distributed-world-size', '1',
            '--ddp-backend', 'no_c10d',
        ] + (extra_flags or []),
    )
    train.main(train_args)

    if run_validation:
        # test validation
        validate_parser = options.get_validation_parser()
        validate_args = options.parse_args_and_arch(
            validate_parser,
            [
                '--task', 'language_modeling',
                data_dir,
                '--path', os.path.join(data_dir, 'checkpoint_last.pt'),
                '--valid-subset', 'valid',
                '--max-tokens', '500',
                '--no-progress-bar',
            ],
        )
        validate.main(validate_args)
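# Usage sketch (assumption): `data_dir` is a placeholder holding a binarized
# monolingual dataset; 'transformer_lm' is a registered fairseq LM architecture.
# Because the helper selects the adaptive_loss criterion, the chosen
# architecture has to support an adaptive softmax output layer.
def _example_train_language_model(data_dir):
    train_language_model(data_dir, 'transformer_lm', run_validation=True)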
def _run_training(self, cmd: List[str]):
    try:
        from fairseq_cli.train import main

        parser = options.get_training_parser()
        if self.arch.startswith("bart"):
            parser.add_argument("--max-positions", type=int)
        args = options.parse_args_and_arch(parser, input_args=cmd)
        main(args)
    except ImportError:
        # Fall back to the command-line entry point if fairseq_cli cannot be imported.
        cmd.insert(0, "fairseq-train")
        subprocess.run(cmd)
def fairseq_train(input_args):
    """Helper function for training."""
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser, input_args=input_args)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=train.distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            train.distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=train.distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        train.main(args)
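# Usage sketch (assumption): the argument list below mirrors flags already used
# by the helpers in this file; the data directory and architecture are
# placeholders for illustration only.
def _example_fairseq_train(data_dir):
    fairseq_train([
        '--task', 'translation',
        data_dir,
        '--arch', 'fconv_iwslt_de_en',
        '--optimizer', 'nag',
        '--lr', '0.05',
        '--max-tokens', '500',
        '--max-epoch', '1',
        '--save-dir', data_dir,
        '--no-progress-bar',
    ])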
def run(self, url, world_rank, args):
    """Runs the fairseq training.

    We set args for different ray actors for communication, add a
    checkpoint hook, and call the main function of fairseq.
    """
    # Set the init_method and rank of the process for distributed training.
    print("Ray worker at {url} rank {rank}".format(url=url, rank=world_rank))
    self.url = url
    self.world_rank = world_rank
    args.distributed_rank = world_rank
    args.distributed_init_method = url

    # Add a checkpoint hook to make use of new resources.
    self.add_checkpoint_hook(args)

    # Call the original main function of fairseq.
    main(args, init_distributed=(args.distributed_world_size > 1))
def _quantize_language_model(data_dir, arch, extra_flags=None, run_validation=False):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task", "language_modeling",
            data_dir,
            "--arch", arch,
            "--optimizer", "adam",
            "--lr", "0.0001",
            "--criterion", "adaptive_loss",
            "--adaptive-softmax-cutoff", "5,10,15",
            "--max-tokens", "500",
            "--tokens-per-sample", "500",
            "--save-dir", data_dir,
            "--max-epoch", "1",
            "--no-progress-bar",
            "--distributed-world-size", "1",
            "--ddp-backend", "no_c10d",
            "--num-workers", "0",  # must be a string: argparse rejects int args
        ] + (extra_flags or []),
    )
    train.main(train_args)

    # try scalar quantization
    scalar_quant_train_parser = options.get_training_parser()
    scalar_quant_train_args = options.parse_args_and_arch(
        scalar_quant_train_parser,
        [
            "--task", "language_modeling",
            data_dir,
            "--arch", arch,
            "--optimizer", "adam",
            "--lr", "0.0001",
            "--criterion", "adaptive_loss",
            "--adaptive-softmax-cutoff", "5,10,15",
            "--max-tokens", "500",
            "--tokens-per-sample", "500",
            "--save-dir", data_dir,
            "--max-update", "3",
            "--no-progress-bar",
            "--distributed-world-size", "1",
            "--ddp-backend", "no_c10d",
            "--num-workers", "0",
            "--quant-noise-scalar", "0.5",
        ] + (extra_flags or []),
    )
    train.main(scalar_quant_train_args)

    # try iterative PQ quantization
    quantize_parser = options.get_training_parser()
    quantize_args = options.parse_args_and_arch(
        quantize_parser,
        [
            "--task", "language_modeling",
            data_dir,
            "--arch", arch,
            "--optimizer", "adam",
            "--lr", "0.0001",
            "--criterion", "adaptive_loss",
            "--adaptive-softmax-cutoff", "5,10,15",
            "--max-tokens", "50",
            "--tokens-per-sample", "50",
            "--max-update", "6",
            "--no-progress-bar",
            "--distributed-world-size", "1",
            "--ddp-backend", "no_c10d",
            "--num-workers", "0",
            "--restore-file", os.path.join(data_dir, "checkpoint_last.pt"),
            "--reset-optimizer",
            "--quantization-config-path", os.path.join(
                os.path.dirname(__file__), "transformer_quantization_config.yaml"
            ),
        ] + (extra_flags or []),
    )
    train.main(quantize_args)
def train_legacy_masked_language_model(data_dir, arch, extra_args=()):
    train_parser = options.get_training_parser()
    # TODO: langs should be in and out right?
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task", "cross_lingual_lm",
            data_dir,
            "--arch", arch,
            # Optimizer args
            "--optimizer", "adam",
            "--lr-scheduler", "reduce_lr_on_plateau",
            "--lr-shrink", "0.5",
            "--lr", "0.0001",
            "--min-lr", "1e-09",
            # dropout, attention args
            "--dropout", "0.1",
            "--attention-dropout", "0.1",
            # MLM args
            "--criterion", "legacy_masked_lm_loss",
            "--masked-lm-only",
            "--monolingual-langs", "in,out",
            "--num-segment", "5",
            # Transformer args: use a small transformer model for fast training
            "--encoder-layers", "1",
            "--encoder-embed-dim", "32",
            "--encoder-attention-heads", "1",
            "--encoder-ffn-embed-dim", "32",
            # Other training args
            "--max-tokens", "500",
            "--tokens-per-sample", "500",
            "--save-dir", data_dir,
            "--max-epoch", "1",
            "--no-progress-bar",
            "--distributed-world-size", "1",
            "--dataset-impl", "raw",
        ] + list(extra_args),
    )
    train.main(train_args)
def train_translation_model(
    data_dir,
    arch,
    extra_flags=None,
    task="translation",
    run_validation=False,
    lang_flags=None,
    extra_valid_flags=None,
):
    if lang_flags is None:
        lang_flags = [
            "--source-lang", "in",
            "--target-lang", "out",
        ]
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task", task,
            data_dir,
            "--save-dir", data_dir,
            "--arch", arch,
            "--optimizer", "nag",
            "--lr", "0.05",
            "--max-tokens", "500",
            "--max-epoch", "1",
            "--no-progress-bar",
            "--distributed-world-size", "1",
            "--num-workers", "0",
        ] + lang_flags + (extra_flags or []),
    )
    train.main(train_args)

    if run_validation:
        # test validation
        validate_parser = options.get_validation_parser()
        validate_args = options.parse_args_and_arch(
            validate_parser,
            [
                "--task", task,
                data_dir,
                "--path", os.path.join(data_dir, "checkpoint_last.pt"),
                "--valid-subset", "valid",
                "--max-tokens", "500",
                "--no-progress-bar",
                "--num-workers", "0",
            ] + lang_flags + (extra_valid_flags or []),
        )
        validate.main(validate_args)
def train_main():
    # `args` and `device_id` are expected to be defined at module scope,
    # e.g. parsed earlier in the launching script.
    if args.distributed_world_size > 1:
        distributed_main(device_id, args)
    else:
        main(args)
def fairseq_train(
        preprocessed_dir,
        exp_dir,
        ngpus=None,
        max_tokens=2000,
        arch='fconv_iwslt_de_en',
        pretrained_emb_path=None,
        embeddings_dim=None,
        # Transformer (decoder is the same as encoder for now)
        encoder_embed_dim=512,
        encoder_layers=6,
        encoder_attention_heads=8,
        # encoder_decoder_dim_ratio=1,
        # share_embeddings=True,
        max_epoch=50,
        warmup_updates=None,
        lr=0.1,
        min_lr=1e-9,
        dropout=0.2,
        label_smoothing=0.1,
        lr_scheduler='fixed',
        weight_decay=0.0001,
        criterion='label_smoothed_cross_entropy',
        optimizer='nag',
        validations_before_sari_early_stopping=10,
        fp16=False):
    exp_dir = Path(exp_dir)
    with log_stdout(exp_dir / 'fairseq_train.stdout'):
        preprocessed_dir = Path(preprocessed_dir)
        exp_dir.mkdir(exist_ok=True, parents=True)
        # Copy dictionaries to exp_dir for generation
        shutil.copy(preprocessed_dir / 'dict.complex.txt', exp_dir)
        shutil.copy(preprocessed_dir / 'dict.simple.txt', exp_dir)
        train_parser = options.get_training_parser()
        # if share_embeddings:
        #     assert encoder_decoder_dim_ratio == 1
        args = [
            '--task', 'translation',
            preprocessed_dir, '--raw-text',
            '--source-lang', 'complex', '--target-lang', 'simple',
            '--save-dir', os.path.join(exp_dir, 'checkpoints'),
            '--clip-norm', 0.1,
            '--criterion', criterion,
            '--no-epoch-checkpoints',
            '--save-interval-updates', 5000,  # Validate every n updates
            '--validations-before-sari-early-stopping', validations_before_sari_early_stopping,
            '--arch', arch,
            # '--decoder-out-embed-dim', int(embeddings_dim * encoder_decoder_dim_ratio),  # Output dim of decoder
            '--max-tokens', max_tokens,
            '--max-epoch', max_epoch,
            '--lr-scheduler', lr_scheduler,
            '--dropout', dropout,
            '--lr', lr,
            '--lr-shrink', 0.5,  # For reduce lr on plateau scheduler
            '--min-lr', min_lr,
            '--weight-decay', weight_decay,
            '--optimizer', optimizer,
            '--label-smoothing', label_smoothing,
            '--seed', random.randint(1, 1000),
            # '--force-anneal', '200',
            # '--distributed-world-size', '1',
        ]
        if arch == 'transformer':
            args.extend([
                '--encoder-embed-dim', encoder_embed_dim,
                '--encoder-ffn-embed-dim', 4 * encoder_embed_dim,
                '--encoder-layers', encoder_layers,
                '--encoder-attention-heads', encoder_attention_heads,
                '--decoder-layers', encoder_layers,
                '--decoder-attention-heads', encoder_attention_heads,
            ])
        if pretrained_emb_path is not None:
            args.extend(['--encoder-embed-path', pretrained_emb_path])
            args.extend(['--decoder-embed-path', pretrained_emb_path])
        if embeddings_dim is not None:
            args.extend(['--encoder-embed-dim', embeddings_dim])  # Input and output dim of encoder
            args.extend(['--decoder-embed-dim', embeddings_dim])  # Input dim of decoder
        if ngpus is not None:
            args.extend(['--distributed-world-size', ngpus])
        # if share_embeddings:
        #     args.append('--share-input-output-embed')
        if fp16:
            args.append('--fp16')
        if warmup_updates is not None:
            args.extend(['--warmup-updates', warmup_updates])
        args = [str(arg) for arg in args]
        train_args = options.parse_args_and_arch(train_parser, args)
        train.main(train_args)
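# Usage sketch (assumption): the paths below are placeholders; `preprocessed_dir`
# must contain the raw-text data plus dict.complex.txt / dict.simple.txt that the
# wrapper defined directly above copies into the experiment directory.
def _example_simplification_training():
    fairseq_train(
        preprocessed_dir='data/preprocessed',
        exp_dir='experiments/exp1',
        ngpus=1,
        arch='transformer',
        max_epoch=10,
    )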