Example #1
def train_translation_model(data_dir,
                            arch,
                            extra_flags=None,
                            task='translation',
                            run_validation=False,
                            lang_flags=None,
                            extra_valid_flags=None):
    if lang_flags is None:
        lang_flags = [
            '--source-lang',
            'in',
            '--target-lang',
            'out',
        ]
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            '--task',
            task,
            data_dir,
            '--save-dir',
            data_dir,
            '--arch',
            arch,
            '--optimizer',
            'nag',
            '--lr',
            '0.05',
            '--max-tokens',
            '500',
            '--max-epoch',
            '1',
            '--no-progress-bar',
            '--distributed-world-size',
            '1',
            '--num-workers',
            '0',
        ] + lang_flags + (extra_flags or []),
    )
    train.main(train_args)

    if run_validation:
        # test validation
        validate_parser = options.get_validation_parser()
        validate_args = options.parse_args_and_arch(validate_parser, [
            '--task',
            task,
            data_dir,
            '--path',
            os.path.join(data_dir, 'checkpoint_last.pt'),
            '--valid-subset',
            'valid',
            '--max-tokens',
            '500',
            '--no-progress-bar',
            '--num-workers',
            '0',
        ] + lang_flags + (extra_valid_flags or []))
        validate.main(validate_args)
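
For context, helpers like this one are normally driven by a test that first builds a tiny binarized dataset. Below is a minimal usage sketch; create_dummy_data and preprocess_translation_data are assumed helpers modeled on fairseq's test utilities, not part of the snippet above.

import tempfile

def example_translation_run():
    # Train for one epoch on a throwaway dataset in a temporary directory.
    with tempfile.TemporaryDirectory('test_fconv') as data_dir:
        create_dummy_data(data_dir)            # assumed: writes toy parallel 'in'/'out' text
        preprocess_translation_data(data_dir)  # assumed: runs fairseq-preprocess on it
        train_translation_model(data_dir, 'fconv_iwslt_de_en', run_validation=True)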
Example #2
def train_masked_lm(data_dir, arch, extra_flags=None):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            '--task',
            'masked_lm',
            data_dir,
            '--arch',
            arch,
            '--optimizer',
            'adam',
            '--lr',
            '0.0001',
            '--criterion',
            'masked_lm',
            '--max-sentences',
            '500',
            '--save-dir',
            data_dir,
            '--max-epoch',
            '1',
            '--no-progress-bar',
            '--distributed-world-size',
            '1',
            '--ddp-backend',
            'no_c10d',
            '--num-workers',
            '0',
        ] + (extra_flags or []),
    )
    train.main(train_args)
Example #3
def train_roberta_head(data_dir, arch, num_classes=2, extra_flags=None):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            '--task', 'sentence_prediction',
            data_dir,
            '--arch', arch,
            '--encoder-layers', '2',
            '--num-classes', str(num_classes),
            '--optimizer', 'adam',
            '--lr', '0.0001',
            '--criterion', 'sentence_prediction',
            '--max-tokens', '500',
            '--max-positions', '500',
            '--batch-size', '500',
            '--save-dir', data_dir,
            '--max-epoch', '1',
            '--no-progress-bar',
            '--distributed-world-size', '1',
            '--ddp-backend', 'no_c10d',
            '--num-workers', '0',
        ] + (extra_flags or []),
    )
    train.main(train_args)
Example #4
def train_language_model(data_dir,
                         arch,
                         extra_flags=None,
                         run_validation=False):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            '--task',
            'language_modeling',
            data_dir,
            '--arch',
            arch,
            '--optimizer',
            'adam',
            '--lr',
            '0.0001',
            '--criterion',
            'adaptive_loss',
            '--adaptive-softmax-cutoff',
            '5,10,15',
            '--max-tokens',
            '500',
            '--tokens-per-sample',
            '500',
            '--save-dir',
            data_dir,
            '--max-epoch',
            '1',
            '--no-progress-bar',
            '--distributed-world-size',
            '1',
            '--ddp-backend',
            'no_c10d',
        ] + (extra_flags or []),
    )
    train.main(train_args)

    if run_validation:
        # test validation
        validate_parser = options.get_validation_parser()
        validate_args = options.parse_args_and_arch(validate_parser, [
            '--task',
            'language_modeling',
            data_dir,
            '--path',
            os.path.join(data_dir, 'checkpoint_last.pt'),
            '--valid-subset',
            'valid',
            '--max-tokens',
            '500',
            '--no-progress-bar',
        ])
        validate.main(validate_args)
Example #5
 def _run_training(self, cmd: List[str]):
     try:
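         # Prefer fairseq's in-process API when it is importable.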
         from fairseq_cli.train import main
         parser = options.get_training_parser()
         if self.arch.startswith("bart"):
             parser.add_argument("--max-positions", type=int)
         args = options.parse_args_and_arch(parser, input_args=cmd)
         main(args)
     except ImportError:
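         # Fall back to running the fairseq-train CLI in a subprocess.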
         cmd.insert(0, "fairseq-train")
         subprocess.run(cmd)
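
A hedged sketch of a caller for this method; runner stands in for an instance of the (unshown) enclosing class, whose arch attribute the method reads.

# Hypothetical caller: build one CLI-style flag list and let
# _run_training choose the in-process API or the subprocess fallback.
cmd = [
    'data-bin/demo',        # assumed preprocessed data directory
    '--arch', 'bart_base',
    '--max-tokens', '2048',
    '--max-epoch', '1',
]
runner._run_training(cmd)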
Example #6
def fairseq_train(input_args):
    """
    Helper function for training
    """

    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser, input_args=input_args)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=train.distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            train.distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print(
                '| NOTE: you may get better performance with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=train.distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        train.main(args)
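
Since fairseq_train accepts the same flags as the fairseq-train CLI, calling it only requires an argv-style list of strings. A minimal sketch with an assumed data directory:

# Hypothetical invocation; 'data-bin/iwslt14.tokenized.de-en' is a
# placeholder for a directory produced by fairseq-preprocess.
fairseq_train([
    'data-bin/iwslt14.tokenized.de-en',
    '--arch', 'transformer_iwslt_de_en',
    '--optimizer', 'adam',
    '--lr', '0.0005',
    '--max-tokens', '4000',
    '--max-epoch', '1',
])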
Example #7
    def run(self, url, world_rank, args):
        """Runs the fairseq training.

        We set args for different ray actors for communication,
        add a checkpoint hook, and call the main function of fairseq.
        """

        # Set the init_method and rank of the process for distributed training.
        print("Ray worker at {url} rank {rank}".format(url=url,
                                                       rank=world_rank))
        self.url = url
        self.world_rank = world_rank
        args.distributed_rank = world_rank
        args.distributed_init_method = url

        # Add a checkpoint hook to make use of new resources.
        self.add_checkpoint_hook(args)

        # Call the original main function of fairseq.
        main(args, init_distributed=(args.distributed_world_size > 1))
Example #8
def _quantize_language_model(data_dir, arch, extra_flags=None, run_validation=False):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task",
            "language_modeling",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--criterion",
            "adaptive_loss",
            "--adaptive-softmax-cutoff",
            "5,10,15",
            "--max-tokens",
            "500",
            "--tokens-per-sample",
            "500",
            "--save-dir",
            data_dir,
            "--max-epoch",
            "1",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
        ]
        + (extra_flags or []),
    )
    train.main(train_args)

    # try scalar quantization
    scalar_quant_train_parser = options.get_training_parser()
    scalar_quant_train_args = options.parse_args_and_arch(
        scalar_quant_train_parser,
        [
            "--task",
            "language_modeling",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--criterion",
            "adaptive_loss",
            "--adaptive-softmax-cutoff",
            "5,10,15",
            "--max-tokens",
            "500",
            "--tokens-per-sample",
            "500",
            "--save-dir",
            data_dir,
            "--max-update",
            "3",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
            "--quant-noise-scalar",
            "0.5",
        ]
        + (extra_flags or []),
    )
    train.main(scalar_quant_train_args)

    # try iterative PQ quantization
    quantize_parser = options.get_training_parser()
    quantize_args = options.parse_args_and_arch(
        quantize_parser,
        [
            "--task",
            "language_modeling",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--criterion",
            "adaptive_loss",
            "--adaptive-softmax-cutoff",
            "5,10,15",
            "--max-tokens",
            "50",
            "--tokens-per-sample",
            "50",
            "--max-update",
            "6",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
            "--restore-file",
            os.path.join(data_dir, "checkpoint_last.pt"),
            "--reset-optimizer",
            "--quantization-config-path",
            os.path.join(
                os.path.dirname(__file__), "transformer_quantization_config.yaml"
            ),
        ]
        + (extra_flags or []),
    )
    train.main(quantize_args)
Example #9
def train_legacy_masked_language_model(data_dir, arch, extra_args=()):
    train_parser = options.get_training_parser()
    # TODO: langs should be in and out right?
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task",
            "cross_lingual_lm",
            data_dir,
            "--arch",
            arch,
            # Optimizer args
            "--optimizer",
            "adam",
            "--lr-scheduler",
            "reduce_lr_on_plateau",
            "--lr-shrink",
            "0.5",
            "--lr",
            "0.0001",
            "--min-lr",
            "1e-09",
            # dropout, attention args
            "--dropout",
            "0.1",
            "--attention-dropout",
            "0.1",
            # MLM args
            "--criterion",
            "legacy_masked_lm_loss",
            "--masked-lm-only",
            "--monolingual-langs",
            "in,out",
            "--num-segment",
            "5",
            # Transformer args: use a small transformer model for fast training
            "--encoder-layers",
            "1",
            "--encoder-embed-dim",
            "32",
            "--encoder-attention-heads",
            "1",
            "--encoder-ffn-embed-dim",
            "32",
            # Other training args
            "--max-tokens",
            "500",
            "--tokens-per-sample",
            "500",
            "--save-dir",
            data_dir,
            "--max-epoch",
            "1",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--dataset-impl",
            "raw",
        ] + list(extra_args),
    )
    train.main(train_args)
Example #10
def train_translation_model(
    data_dir,
    arch,
    extra_flags=None,
    task="translation",
    run_validation=False,
    lang_flags=None,
    extra_valid_flags=None,
):
    if lang_flags is None:
        lang_flags = [
            "--source-lang",
            "in",
            "--target-lang",
            "out",
        ]
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task",
            task,
            data_dir,
            "--save-dir",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "nag",
            "--lr",
            "0.05",
            "--max-tokens",
            "500",
            "--max-epoch",
            "1",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--num-workers",
            "0",
        ]
        + lang_flags
        + (extra_flags or []),
    )
    train.main(train_args)

    if run_validation:
        # test validation
        validate_parser = options.get_validation_parser()
        validate_args = options.parse_args_and_arch(
            validate_parser,
            [
                "--task",
                task,
                data_dir,
                "--path",
                os.path.join(data_dir, "checkpoint_last.pt"),
                "--valid-subset",
                "valid",
                "--max-tokens",
                "500",
                "--no-progress-bar",
                "--num-workers",
                "0",
            ]
            + lang_flags
            + (extra_valid_flags or []),
        )
        validate.main(validate_args)
Example #11
 def train_main():
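     # Dispatch on world size: distributed multi-GPU vs. a single local process.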
     if args.distributed_world_size > 1:
         distributed_main(device_id, args)
     else:
         main(args)
Example #12
def fairseq_train(
        preprocessed_dir,
        exp_dir,
        ngpus=None,
        max_tokens=2000,
        arch='fconv_iwslt_de_en',
        pretrained_emb_path=None,
        embeddings_dim=None,
        # Transformer (decoder is the same as encoder for now)
        encoder_embed_dim=512,
        encoder_layers=6,
        encoder_attention_heads=8,
        # encoder_decoder_dim_ratio=1,
        # share_embeddings=True,
        max_epoch=50,
        warmup_updates=None,
        lr=0.1,
        min_lr=1e-9,
        dropout=0.2,
        label_smoothing=0.1,
        lr_scheduler='fixed',
        weight_decay=0.0001,
        criterion='label_smoothed_cross_entropy',
        optimizer='nag',
        validations_before_sari_early_stopping=10,
        fp16=False):
    exp_dir = Path(exp_dir)
    with log_stdout(exp_dir / 'fairseq_train.stdout'):
        preprocessed_dir = Path(preprocessed_dir)
        exp_dir.mkdir(exist_ok=True, parents=True)
        # Copy dictionaries to exp_dir for generation
        shutil.copy(preprocessed_dir / 'dict.complex.txt', exp_dir)
        shutil.copy(preprocessed_dir / 'dict.simple.txt', exp_dir)
        train_parser = options.get_training_parser()
        # if share_embeddings:
        #     assert encoder_decoder_dim_ratio == 1
        args = [
            '--task',
            'translation',
            preprocessed_dir,
            '--raw-text',
            '--source-lang',
            'complex',
            '--target-lang',
            'simple',
            '--save-dir',
            os.path.join(exp_dir, 'checkpoints'),
            '--clip-norm',
            0.1,
            '--criterion',
            criterion,
            '--no-epoch-checkpoints',
            '--save-interval-updates',
            5000,  # Validate every n updates
            '--validations-before-sari-early-stopping',
            validations_before_sari_early_stopping,
            '--arch',
            arch,

            # '--decoder-out-embed-dim', int(embeddings_dim * encoder_decoder_dim_ratio),  # Output dim of decoder
            '--max-tokens',
            max_tokens,
            '--max-epoch',
            max_epoch,
            '--lr-scheduler',
            lr_scheduler,
            '--dropout',
            dropout,
            '--lr',
            lr,
            '--lr-shrink',
            0.5,  # For reduce lr on plateau scheduler
            '--min-lr',
            min_lr,
            '--weight-decay',
            weight_decay,
            '--optimizer',
            optimizer,
            '--label-smoothing',
            label_smoothing,
            '--seed',
            random.randint(1, 1000),
            # '--force-anneal', '200',
            # '--distributed-world-size', '1',
        ]
        if arch == 'transformer':
            args.extend([
                '--encoder-embed-dim',
                encoder_embed_dim,
                '--encoder-ffn-embed-dim',
                4 * encoder_embed_dim,
                '--encoder-layers',
                encoder_layers,
                '--encoder-attention-heads',
                encoder_attention_heads,
                '--decoder-layers',
                encoder_layers,
                '--decoder-attention-heads',
                encoder_attention_heads,
            ])
        if pretrained_emb_path is not None:
            args.extend(['--encoder-embed-path', pretrained_emb_path])
            args.extend(['--decoder-embed-path', pretrained_emb_path])
        if embeddings_dim is not None:
            args.extend(['--encoder-embed-dim',
                         embeddings_dim])  # Input and output dim of encoder
            args.extend(['--decoder-embed-dim',
                         embeddings_dim])  # Input dim of decoder
        if ngpus is not None:
            args.extend(['--distributed-world-size', ngpus])
        # if share_embeddings:
        #     args.append('--share-input-output-embed')
        if fp16:
            args.append('--fp16')
        if warmup_updates is not None:
            args.extend(['--warmup-updates', warmup_updates])
        args = [str(arg) for arg in args]
        train_args = options.parse_args_and_arch(train_parser, args)
        train.main(train_args)
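
A usage sketch for this wrapper; both paths are assumptions and must point at an existing fairseq-preprocess output (with dict.complex.txt and dict.simple.txt) and a writable experiment directory.

# Hypothetical call: a short training run of the small transformer
# configuration handled above.
fairseq_train(
    preprocessed_dir='datasets/wikilarge/fairseq_preprocessed',  # assumed path
    exp_dir='experiments/transformer_baseline',                  # assumed path
    arch='transformer',
    max_epoch=1,
    ngpus=1,
)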