Example #1
def train_translation_model(data_dir, arch, extra_flags=None):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            '--task', 'translation',
            data_dir,
            '--save-dir', data_dir,
            '--arch', arch,
            '--optimizer', 'nag',
            '--lr', '0.05',
            '--max-tokens', '500',
            '--max-epoch', '1',
            '--no-progress-bar',
            '--distributed-world-size', '1',
            '--source-lang', 'in',
            '--target-lang', 'out',
        ] + (extra_flags or []),
    )
    train.main(train_args)
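
A minimal usage sketch following the flow of fairseq's binary tests; create_dummy_data and preprocess_translation_data (which build and binarize a toy 'in'/'out' parallel corpus) are assumed to come from fairseq's test utilities, and the architecture name is only an example:

import tempfile
from tests.utils import create_dummy_data, preprocess_translation_data  # assumed fairseq test helpers

with tempfile.TemporaryDirectory("test_fconv") as data_dir:
    create_dummy_data(data_dir)            # write a tiny synthetic parallel corpus
    preprocess_translation_data(data_dir)  # binarize it into data_dir
    train_translation_model(data_dir, "fconv_iwslt_de_en")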
Example #2
def train_language_model(data_dir, arch):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            '--task', 'language_modeling',
            data_dir,
            '--arch', arch,
            '--optimizer', 'nag',
            '--lr', '1.0',
            '--criterion', 'adaptive_loss',
            '--adaptive-softmax-cutoff', '5,10,15',
            '--decoder-layers', '[(850, 3)] * 2 + [(1024,4)]',
            '--decoder-embed-dim', '280',
            '--max-tokens', '500',
            '--tokens-per-sample', '500',
            '--save-dir', data_dir,
            '--max-epoch', '1',
            '--no-progress-bar',
            '--distributed-world-size', '1',
        ],
    )
    train.main(train_args)
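
A similar hedged sketch for the language-modeling helper; create_dummy_data and preprocess_lm_data are assumed test utilities, and 'fconv_lm' is just one architecture compatible with the adaptive_loss settings above:

import tempfile
from tests.utils import create_dummy_data, preprocess_lm_data  # assumed fairseq test helpers

with tempfile.TemporaryDirectory("test_fconv_lm") as data_dir:
    create_dummy_data(data_dir)   # toy monolingual corpus
    preprocess_lm_data(data_dir)  # binarize it for the language_modeling task
    train_language_model(data_dir, "fconv_lm")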
Example #3
def load_dataset_splits(task, splits):
    for split in splits:
        if split == 'train':
            task.load_dataset(split, combine=True)
        else:
            for k in itertools.count():
                split_k = split + (str(k) if k > 0 else '')
                try:
                    task.load_dataset(split_k, combine=False)
                except FileNotFoundError as e:
                    if k > 0:
                        break
                    raise e


if __name__ == '__main__':
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    if args.distributed_port > 0 or args.distributed_init_method is not None:
        from distributed_train import main as distributed_main

        distributed_main(args)
    elif args.distributed_world_size > 1:
        from multiprocessing_train import main as multiprocessing_main

        multiprocessing_main(args)
    else:
        main(args)
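
For reference, the split-probing loop above keeps trying numerically suffixed names until one is missing; what load_dataset_splits(task, ['train', 'valid']) would attempt:

# 'train' -> loaded once with combine=True
# 'valid' -> tries 'valid', 'valid1', 'valid2', ... and stops at the first
#            FileNotFoundError; if 'valid' itself is missing (k == 0), the
#            error is re-raised instead of being swallowed.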
Example #4
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument(
        "--comet-logging",
        action="store_true",
        help="Whether to use Comet.ML for logging",
    )
    args = options.parse_args_and_arch(parser)

    logging = getattr(args, "comet_logging", False)
    config = None
    if logging:
        PROJECT = "machine-translation"
        if not keyring.get_password("comet", PROJECT):
            comet_ml_api_key = getpass("Please enter the comet.ml API key: ")
            keyring.set_password("comet", PROJECT, comet_ml_api_key)
        else:
            comet_ml_api_key = keyring.get_password("comet", PROJECT)

        experiment = Experiment(
            api_key=comet_ml_api_key,
            project_name="machine-translation",
            workspace="machine-translation",
            auto_output_logging=None,
        )
        config = {
            "api_key": comet_ml_api_key,
            "experiment_key": experiment.get_key()
        }
        print("Proceeding with Comet.ML logging...")

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, config, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args, config)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = "tcp://localhost:{port}".format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != "no_c10d":
            print(
                "| NOTE: you may get better performance with: --ddp-backend=no_c10d"
            )
        torch.multiprocessing.spawn(fn=distributed_main,
                                    args=(args, config),
                                    nprocs=args.distributed_world_size)
    else:
        # single GPU training
        main(args, config=config)
    if config:
        experiment.end()
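
The config dict above is passed to the spawned workers so they can attach to the experiment created in cli_main(). A plausible sketch of that reattachment using comet_ml.ExistingExperiment; this is an assumption about how distributed_main consumes config, not code from the original project:

from comet_ml import ExistingExperiment

def resume_comet_experiment(config):
    # Reattach this worker process to the run identified by experiment_key.
    return ExistingExperiment(
        api_key=config["api_key"],
        previous_experiment=config["experiment_key"],
    )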
Example #5
def train_translation_model(
    data_dir,
    arch,
    extra_flags=None,
    task="translation",
    run_validation=False,
    lang_flags=None,
    extra_valid_flags=None,
):
    if lang_flags is None:
        lang_flags = [
            "--source-lang",
            "in",
            "--target-lang",
            "out",
        ]
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task",
            task,
            data_dir,
            "--save-dir",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "nag",
            "--lr",
            "0.05",
            "--max-tokens",
            "500",
            "--max-epoch",
            "1",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--num-workers",
            "0",
        ]
        + lang_flags
        + (extra_flags or []),
    )
    train.main(train_args)

    if run_validation:
        # test validation
        validate_parser = options.get_validation_parser()
        validate_args = options.parse_args_and_arch(
            validate_parser,
            [
                "--task",
                task,
                data_dir,
                "--path",
                os.path.join(data_dir, "checkpoint_last.pt"),
                "--valid-subset",
                "valid",
                "--max-tokens",
                "500",
                "--no-progress-bar",
                "--num-workers",
                "0",
            ]
            + lang_flags
            + (extra_valid_flags or []),
        )
        validate.main(validate_args)
Example #6
def _quantize_language_model(data_dir,
                             arch,
                             extra_flags=None,
                             run_validation=False):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task",
            "language_modeling",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--criterion",
            "adaptive_loss",
            "--adaptive-softmax-cutoff",
            "5,10,15",
            "--max-tokens",
            "500",
            "--tokens-per-sample",
            "500",
            "--save-dir",
            data_dir,
            "--max-epoch",
            "1",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
        ] + (extra_flags or []),
    )
    train.main(train_args)

    # try scalar quantization
    scalar_quant_train_parser = options.get_training_parser()
    scalar_quant_train_args = options.parse_args_and_arch(
        scalar_quant_train_parser,
        [
            "--task",
            "language_modeling",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--criterion",
            "adaptive_loss",
            "--adaptive-softmax-cutoff",
            "5,10,15",
            "--max-tokens",
            "500",
            "--tokens-per-sample",
            "500",
            "--save-dir",
            data_dir,
            "--max-update",
            "3",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
            "--quant-noise-scalar",
            "0.5",
        ] + (extra_flags or []),
    )
    train.main(scalar_quant_train_args)

    # try iterative PQ quantization
    quantize_parser = options.get_training_parser()
    quantize_args = options.parse_args_and_arch(
        quantize_parser,
        [
            "--task",
            "language_modeling",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--criterion",
            "adaptive_loss",
            "--adaptive-softmax-cutoff",
            "5,10,15",
            "--max-tokens",
            "50",
            "--tokens-per-sample",
            "50",
            "--max-update",
            "6",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
            "--restore-file",
            os.path.join(data_dir, "checkpoint_last.pt"),
            "--reset-optimizer",
            "--quantization-config-path",
            os.path.join(os.path.dirname(__file__),
                         "transformer_quantization_config.yaml"),
        ] + (extra_flags or []),
    )
    train.main(quantize_args)
Example #7
def train_language_model(
    data_dir,
    arch,
    extra_flags=None,
    run_validation=False,
    extra_valid_flags=None,
    task="language_modeling",
    world_size=1,
):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task",
            task,
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--max-tokens",
            "500",
            "--tokens-per-sample",
            "500",
            "--save-dir",
            data_dir,
            "--max-epoch",
            "1",
            "--no-progress-bar",
            "--distributed-world-size",
            str(world_size),
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
        ] + (extra_flags or []),
    )
    cfg = convert_namespace_to_omegaconf(train_args)
    distributed_utils.call_main(cfg, train.main)

    if run_validation:
        # test validation
        validate_parser = options.get_validation_parser()
        validate_args = options.parse_args_and_arch(
            validate_parser,
            [
                "--task",
                task,
                data_dir,
                "--path",
                os.path.join(data_dir, "checkpoint_last.pt"),
                "--valid-subset",
                "valid",
                "--max-tokens",
                "500",
                "--no-progress-bar",
                "--num-workers",
                "0",
            ] + (extra_valid_flags or []),
        )
        validate.main(validate_args)
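
A hedged usage sketch mirroring the fairseq test flow; the helper names are assumptions from the test utilities and the extra flag is illustrative:

import tempfile
from tests.utils import create_dummy_data, preprocess_lm_data  # assumed fairseq test helpers

with tempfile.TemporaryDirectory("test_transformer_lm") as data_dir:
    create_dummy_data(data_dir)
    preprocess_lm_data(data_dir)
    train_language_model(
        data_dir,
        "transformer_lm",
        extra_flags=["--add-bos-token"],
        run_validation=True,
    )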
Example #8
def train_main(alpha, beta, save_path):
    parser = options.get_training_parser()
    input_args = [
        # `data_set` (the preprocessed data directory) must be defined at module scope
        data_set, '--share-decoder-input-output-embed', '--arch',
        'transformer_iwslt_de_en', '--max-tokens', '4000', '--lr', '5e-4',
        '--save-interval', '2', '--max-epoch', '85', '--patience', '5',
        '--optimizer', 'adam', '--adam-betas', '(0.9, 0.98)', '--clip-norm',
        '0.0', '--weight-decay', '0.0001', '--dropout', '0.3',
        '--lr-scheduler', 'inverse_sqrt', '--warmup-updates', '4000',
        '--keep-last-epochs', '4', '--criterion', 'jensen_cross_entropy',
        '--alpha',
        str(alpha), '--beta',
        str(beta), '--use-uniform', '--fp16', '--save-dir', save_path
    ]

    args = options.parse_args_and_arch(parser, input_args=input_args)
    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print(
                '| NOTE: you may get better performance with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)

    ckpts = os.listdir(args.save_dir)
    try:
        ckpts.remove('checkpoint_last.pt')
    except ValueError:
        print("no checkpoint_last.pt in folder", args.save_dir)

    f = open(os.path.join(args.save_dir, "final_entropies.txt"), "a+")
    results = {}
    entropies = {}
    for ckpt in ckpts:
        if '.pt' in ckpt:
            path = os.path.join(args.save_dir, ckpt)
            f.write(path + '\n')
            run_generation(path, results, entropies)

            f.write('{entropy: ' + str(entropies[path]) + ', bleu: ' +
                    str(results[path]) + '}\n')

    f.close()
    return results
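
A hypothetical sweep over the (alpha, beta) weights of the jensen_cross_entropy criterion; the values and checkpoint paths are placeholders, and data_set must already be defined at module scope as noted above:

all_results = {}
for alpha in (0.25, 0.5, 0.75):
    beta = 1.0 - alpha
    save_path = "checkpoints/jce_alpha{}".format(alpha)
    all_results[save_path] = train_main(alpha, beta, save_path)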
Example #9
def cli_main():

    parser = options.get_training_parser()
    parser.add_argument(
        '--config',
        type=str,
        nargs='*',
        help=
        'paths to JSON files of experiment configurations, from high to low priority',
    )
    parser.add_argument('--exp-name',
                        type=str,
                        default='',
                        help='name of the experiment')
    parser.add_argument(
        '--debug',
        default=False,
        action='store_true',
        help='run training in the debugging mode',
    )
    parser.add_argument('--path-attributes',
                        type=str,
                        nargs='*',
                        default=['task', 'arch', 'lr'])
    parser.add_argument(
        '--filter_best_last_ckpts',
        type=str,
        default=False,
        help=
        'whether to filter out checkpoint_best and checkpoint_last from checkpoint list'
    )
    parser.add_argument('--log_valid_progress',
                        type=str,
                        default=False,
                        help='whether to log validation progress')
    pre_parsed_args, unknown = parser.parse_known_args()

    config_dict = {}
    for config_path in (pre_parsed_args.config or []):
        config_dict = update_config(config_dict, compose_configs(config_path))

    parser_modifier = modify_factory(config_dict)

    args = options.parse_args_and_arch(parser, modify_parser=parser_modifier)

    update_namespace(args, config_dict)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if (args.update_freq is not None and max(args.update_freq) > 1
                and args.ddp_backend != 'no_c10d'):
            logger.info(
                'NOTE: you may get faster training with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example #10
def setup():
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)
    # make sure everything is reset before loading the model
    args.reset_optimizer = True
    args.reset_meters = True
    args.reset_dataloader = True
    args.reset_lr_scheduler = True
    args.path = args.restore_file
    args.max_sentences_valid = 1  # We attack batch size 1 at the moment
    args.beam = 1  # beam size 1 for inference on the model, could use higher
    utils.import_user_module(args)

    torch.manual_seed(args.seed)

    # setup task, model, loss function, and trainer
    task = tasks.setup_task(args)
    if not args.interactive_attacks:
        for valid_sub_split in args.valid_subset.split(
                ','):  # load validation data
            task.load_dataset(valid_sub_split, combine=False, epoch=0)
    models, _ = checkpoint_utils.load_model_ensemble(args.path.split(':'),
                                                     arg_overrides={},
                                                     task=task)
    assert len(
        models) == 1  # Make sure you didn't pass an ensemble of models in
    model = models[0]

    if torch.cuda.is_available() and not args.cpu:
        assert torch.cuda.device_count() == 1  # only works on 1 GPU for now
        torch.cuda.set_device(0)
        model.cuda()
    args.beam = 1  # beam size 1 for now
    model.make_generation_fast_(beamable_mm_beam_size=args.beam,
                                need_attn=False)

    criterion = task.build_criterion(args)
    trainer = Trainer(args, task, model, criterion)
    generator = task.build_generator(args)

    bpe_vocab_size = trainer.get_model().encoder.embed_tokens.weight.shape[0]
    add_hooks(trainer.get_model(),
              bpe_vocab_size)  # add gradient hooks to embeddings
    embedding_weight = get_embedding_weight(
        trainer.get_model(), bpe_vocab_size)  # save the embedding matrix
    if not args.interactive_attacks:
        subset = args.valid_subset.split(',')[
            0]  # only one validation subset handled
        itr = trainer.task.get_batch_iterator(
            dataset=trainer.task.dataset(subset),
            max_tokens=args.max_tokens_valid,
            max_sentences=args.max_sentences_valid,
            max_positions=utils.resolve_max_positions(
                trainer.task.max_positions(),
                trainer.get_model().max_positions(),
            ),
            ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
            required_batch_size_multiple=args.required_batch_size_multiple,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
            num_workers=args.num_workers,
        ).next_epoch_itr(shuffle=False)
    else:
        itr = [
            None
        ] * 100000  # a fake dataset to go through, overwritten when doing interactive attacks

    # Handle BPE
    bpe = encoders.build_bpe(args)
    assert bpe is not None
    return args, trainer, generator, embedding_weight, itr, bpe
Example #11
def fairseq_train(
        preprocessed_dir,
        exp_dir,
        ngpus=None,
        max_tokens=2000,
        arch='fconv_iwslt_de_en',
        pretrained_emb_path=None,
        embeddings_dim=None,
        # Transformer (decoder is the same as encoder for now)
        encoder_embed_dim=512,
        encoder_layers=6,
        encoder_attention_heads=8,
        # encoder_decoder_dim_ratio=1,
        # share_embeddings=True,
        max_epoch=50,
        warmup_updates=None,
        lr=0.1,
        min_lr=1e-9,
        dropout=0.2,
        label_smoothing=0.1,
        lr_scheduler='fixed',
        weight_decay=0.0001,
        criterion='label_smoothed_cross_entropy',
        optimizer='nag',
        validations_before_sari_early_stopping=10,
        fp16=False):
    exp_dir = Path(exp_dir)
    with log_stdout(exp_dir / 'fairseq_train.stdout'):
        preprocessed_dir = Path(preprocessed_dir)
        exp_dir.mkdir(exist_ok=True, parents=True)
        # Copy dictionaries to exp_dir for generation
        shutil.copy(preprocessed_dir / 'dict.complex.txt', exp_dir)
        shutil.copy(preprocessed_dir / 'dict.simple.txt', exp_dir)
        train_parser = options.get_training_parser()
        # if share_embeddings:
        #     assert encoder_decoder_dim_ratio == 1
        args = [
            '--task',
            'translation',
            preprocessed_dir,
            '--raw-text',
            '--source-lang',
            'complex',
            '--target-lang',
            'simple',
            '--save-dir',
            os.path.join(exp_dir, 'checkpoints'),
            '--clip-norm',
            0.1,
            '--criterion',
            criterion,
            '--no-epoch-checkpoints',
            '--save-interval-updates',
            5000,  # Validate every n updates
            '--validations-before-sari-early-stopping',
            validations_before_sari_early_stopping,
            '--arch',
            arch,

            # '--decoder-out-embed-dim', int(embeddings_dim * encoder_decoder_dim_ratio),  # Output dim of decoder
            '--max-tokens',
            max_tokens,
            '--max-epoch',
            max_epoch,
            '--lr-scheduler',
            lr_scheduler,
            '--dropout',
            dropout,
            '--lr',
            lr,
            '--lr-shrink',
            0.5,  # For reduce lr on plateau scheduler
            '--min-lr',
            min_lr,
            '--weight-decay',
            weight_decay,
            '--optimizer',
            optimizer,
            '--label-smoothing',
            label_smoothing,
            '--seed',
            random.randint(1, 1000),
            # '--force-anneal', '200',
            # '--distributed-world-size', '1',
        ]
        if arch == 'transformer':
            args.extend([
                '--encoder-embed-dim',
                encoder_embed_dim,
                '--encoder-ffn-embed-dim',
                4 * encoder_embed_dim,
                '--encoder-layers',
                encoder_layers,
                '--encoder-attention-heads',
                encoder_attention_heads,
                '--decoder-layers',
                encoder_layers,
                '--decoder-attention-heads',
                encoder_attention_heads,
            ])
        if pretrained_emb_path is not None:
            args.extend([
                '--encoder-embed-path',
                pretrained_emb_path if pretrained_emb_path is not None else ''
            ])
            args.extend([
                '--decoder-embed-path',
                pretrained_emb_path if pretrained_emb_path is not None else ''
            ])
        if embeddings_dim is not None:
            args.extend(['--encoder-embed-dim',
                         embeddings_dim])  # Input and output dim of encoder
            args.extend(['--decoder-embed-dim',
                         embeddings_dim])  # Input dim of decoder
        if ngpus is not None:
            args.extend(['--distributed-world-size', ngpus])
        # if share_embeddings:
        #     args.append('--share-input-output-embed')
        if fp16:
            args.append('--fp16')
        if warmup_updates is not None:
            args.extend(['--warmup-updates', warmup_updates])
        args = [str(arg) for arg in args]
        train_args = options.parse_args_and_arch(train_parser, args)
        train.main(train_args)
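
An illustrative call using the function's own defaults; the directory names are placeholders, not paths from the original project:

fairseq_train(
    preprocessed_dir="data/preprocessed",   # output of the preprocessing step
    exp_dir="experiments/fconv_baseline",
    ngpus=1,
    max_tokens=2000,
    arch="fconv_iwslt_de_en",
    max_epoch=50,
)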
Example #12
    def test_masks_token_spans(self):
        with TemporaryDirectory() as dirname:

            # prep input file
            raw_file = os.path.join(dirname, "raw")
            data = make_data(out_file=raw_file)
            vocab = build_vocab(data)

            # binarize
            binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False)
            split = "train"
            bin_file = os.path.join(dirname, split)
            dataset_impl = "mmap"

            FileBinarizer.multiprocess_dataset(
                input_file=raw_file,
                binarizer=binarizer,
                dataset_impl=dataset_impl,
                vocab_size=len(vocab),
                output_prefix=bin_file,
            )

            # adding sentinel tokens
            for i in range(100):
                vocab.add_symbol(f"<extra_id_{i}>")

            # setup task
            train_args = options.parse_args_and_arch(
                options.get_training_parser(),
                [
                    "--task",
                    "span_masked_lm",
                    "--arch",
                    "bart_base",
                    "--seed",
                    "42",
                    dirname,
                ],
            )
            cfg = convert_namespace_to_omegaconf(train_args)
            task = SpanMaskedLMTask(cfg.task, binarizer.dict)

            # load datasets
            original_dataset = task._load_dataset_split(bin_file, 1, False)
            task.load_dataset(split)
            masked_dataset = task.dataset(split)

            iterator = task.get_batch_iterator(
                dataset=masked_dataset,
                max_tokens=65_536,
                max_positions=4_096,
            ).next_epoch_itr(shuffle=False)
            num_tokens = len(vocab)
            for batch in iterator:
                for sample in range(len(batch)):
                    sample_id = batch["id"][sample]
                    original_tokens = original_dataset[sample_id]
                    masked_src_tokens = batch["net_input"]["src_tokens"][
                        sample]
                    masked_src_length = batch["net_input"]["src_lengths"][
                        sample]
                    masked_tgt_tokens = batch["target"][sample]

                    original_offset = 0
                    masked_tgt_offset = 0
                    extra_id_token = len(vocab) - 1
                    for masked_src_token in masked_src_tokens[:masked_src_length]:
                        if masked_src_token == extra_id_token:
                            assert (masked_src_token ==
                                    masked_tgt_tokens[masked_tgt_offset])
                            extra_id_token -= 1
                            masked_tgt_offset += 1
                            while (original_offset < len(original_tokens)
                                   and masked_tgt_tokens[masked_tgt_offset] !=
                                   extra_id_token):
                                assert (original_tokens[original_offset] ==
                                        masked_tgt_tokens[masked_tgt_offset])
                                original_offset += 1
                                masked_tgt_offset += 1
                        else:
                            assert original_tokens[original_offset] == masked_src_token
                            original_offset += 1
Example #13
    def test_denoising(self):
        with TemporaryDirectory() as dirname:

            # prep input file
            raw_file = os.path.join(dirname, "raw")
            data = make_data(out_file=raw_file)
            vocab = build_vocab(data)

            # binarize
            binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False)
            split = "train"
            bin_file = os.path.join(dirname, split)
            dataset_impl = "mmap"
            FileBinarizer.multiprocess_dataset(
                input_file=raw_file,
                binarizer=binarizer,
                dataset_impl=dataset_impl,
                vocab_size=len(vocab),
                output_prefix=bin_file,
            )

            # setup task
            train_args = options.parse_args_and_arch(
                options.get_training_parser(),
                [
                    "--task",
                    "denoising",
                    "--arch",
                    "bart_base",
                    "--seed",
                    "42",
                    "--mask-length",
                    "word",
                    "--permute-sentences",
                    "1",
                    "--rotate",
                    "0",
                    "--replace-length",
                    "-1",
                    "--mask",
                    "0.2",
                    dirname,
                ],
            )
            cfg = convert_namespace_to_omegaconf(train_args)
            task = DenoisingTask(cfg.task, binarizer.dict)

            # load datasets
            original_dataset = task._load_dataset_split(bin_file, 1, False)
            task.load_dataset(split)
            masked_dataset = task.dataset(split)

            iterator = task.get_batch_iterator(
                dataset=masked_dataset,
                max_tokens=65_536,
                max_positions=4_096,
            ).next_epoch_itr(shuffle=False)
            mask_index = task.source_dictionary.index("<mask>")
            for batch in iterator:
                for sample in range(len(batch)):
                    net_input = batch["net_input"]
                    masked_src_tokens = net_input["src_tokens"][sample]
                    masked_src_length = net_input["src_lengths"][sample]
                    masked_tgt_tokens = batch["target"][sample]

                    sample_id = batch["id"][sample]
                    original_tokens = original_dataset[sample_id]
                    original_tokens = original_tokens.masked_select(
                        masked_src_tokens[:masked_src_length] == mask_index
                    )
                    masked_tokens = masked_tgt_tokens.masked_select(
                        masked_src_tokens == mask_index
                    )

                    assert masked_tokens.equal(original_tokens)
Example #14
def train_masked_language_model(data_dir, arch, extra_args=()):
    train_parser = options.get_training_parser()
    # TODO: langs should be in and out right?
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task",
            "cross_lingual_lm",
            data_dir,
            "--arch",
            arch,
            # Optimizer args
            "--optimizer",
            "adam",
            "--lr-scheduler",
            "reduce_lr_on_plateau",
            "--lr-shrink",
            "0.5",
            "--lr",
            "0.0001",
            "--min-lr",
            "1e-09",
            # dropout, attention args
            "--dropout",
            "0.1",
            "--attention-dropout",
            "0.1",
            # MLM args
            "--criterion",
            "masked_lm_loss",
            "--masked-lm-only",
            "--monolingual-langs",
            "in,out",
            "--num-segment",
            "5",
            # Transformer args: use a small transformer model for fast training
            "--encoder-layers",
            "1",
            "--encoder-embed-dim",
            "32",
            "--encoder-attention-heads",
            "1",
            "--encoder-ffn-embed-dim",
            "32",
            # Other training args
            "--max-tokens",
            "500",
            "--tokens-per-sample",
            "500",
            "--save-dir",
            data_dir,
            "--max-epoch",
            "1",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--dataset-impl",
            "raw",
        ] + list(extra_args),
    )
    train.main(train_args)
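
A hedged usage sketch for the masked-LM helper, again following the test pattern; the helper names and the 'xlm_base' architecture are assumptions:

import tempfile
from tests.utils import create_dummy_data, preprocess_lm_data  # assumed fairseq test helpers

with tempfile.TemporaryDirectory("test_mlm") as data_dir:
    create_dummy_data(data_dir)
    preprocess_lm_data(data_dir)
    train_masked_language_model(data_dir, arch="xlm_base")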
Example #15
def cli_main():

    parser = options.get_training_parser()
    parser.add_argument(
        '--train-subtransformer',
        action='store_true',
        default=False,
        help='whether to train a SubTransformer instead of the SuperTransformer')
    parser.add_argument(
        '--sub-configs',
        required=False,
        is_config_file=True,
        help=
        'when training SubTransformer, use --configs to specify architecture and --sub-configs to specify other settings'
    )

    # for profiling
    parser.add_argument('--profile-flops',
                        action='store_true',
                        help='measure the FLOPs of a SubTransformer')

    parser.add_argument('--latgpu',
                        action='store_true',
                        help='measure SubTransformer latency on GPU')
    parser.add_argument('--latcpu',
                        action='store_true',
                        help='measure SubTransformer latency on CPU')
    parser.add_argument(
        '--latiter',
        type=int,
        default=300,
        help='how many iterations to run when measuring the latency')
    parser.add_argument('--latsilent',
                        action='store_true',
                        help='keep silent when measuring latency')

    parser.add_argument(
        '--validate-subtransformer',
        action='store_true',
        help='evaluate the SubTransformer on the validation set')
    options.add_generation_args(parser)
    args = options.parse_args_and_arch(parser)
    if args.latcpu:
        args.cpu = True
        args.fp16 = False

    if args.latgpu or args.latcpu or args.profile_flops:
        args.distributed_world_size = 1

    #if args.distributed_init_method is None:
    #   distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        #if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
        if not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                #nprocs=torch.cuda.device_count(),
                nprocs=8,  #Use all TPU cores
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        #assert args.distributed_world_size <= torch.cuda.device_count()
        import torch_xla.distributed.xla_multiprocessing as xmp
        torch.multiprocessing.set_sharing_strategy("file_system")
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print(
                '| NOTE: you may get better performance with: --ddp-backend=no_c10d'
            )

        xmp.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=8,  # use all 8 TPU cores
        )

        #torch.multiprocessing.spawn(
        #    fn=distributed_main,
        #    args=(args, ),
        #    nprocs=args.distributed_world_size,
        #)

    else:
        # single GPU training
        main(args)
Example #16
def main(args):
    if args.distributed_init_method is None and args.distributed_port > 0:
        # We can determine the init method automatically for Slurm.
        node_list = os.environ.get('SLURM_JOB_NODELIST')
        if node_list is not None:
            try:
                hostnames = subprocess.check_output(['scontrol', 'show', 'hostnames', node_list])
                args.distributed_init_method = 'tcp://{host}:{port}'.format(
                    host=hostnames.split()[0].decode('utf-8'),
                    port=args.distributed_port)
                args.distributed_rank = int(os.environ.get('SLURM_PROCID'))
                args.device_id = int(os.environ.get('SLURM_LOCALID'))
            except subprocess.CalledProcessError as e:  # scontrol failed
                raise e
            except FileNotFoundError as e:  # Slurm is not installed
                pass
    if args.distributed_init_method is None:
        raise ValueError('--distributed-init-method or --distributed-port '
                         'must be specified for distributed training')

    args.distributed_rank = distributed_utils.distributed_init(args)
    print('| initialized host {} as rank {}'.format(socket.gethostname(), args.distributed_rank))
    single_process_main(args)


if __name__ == '__main__':
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)
    main(args)
Example #17
def cli_main():

    parser = options.get_training_parser()
    parser.add_argument(
        '--config',
        type=str,
        nargs='*',
        help=
        'paths to JSON files of experiment configurations, from high to low priority',
    )
    parser.add_argument('--exp-name',
                        type=str,
                        default='',
                        help='name of the experiment')
    parser.add_argument(
        '--debug',
        default=False,
        action='store_true',
        help='run training in the debugging mode',
    )
    parser.add_argument('--path-attributes',
                        type=str,
                        nargs='*',
                        default=['task', 'arch', 'lr'])
    parser.add_argument('--torch-file-system', action='store_true')

    pre_parsed_args, unknown = parser.parse_known_args()

    config_dict = {}
    for config_path in (pre_parsed_args.config or []):
        config_dict = update_config(config_dict, compose_configs(config_path))

    parser_modifier = modify_factory(config_dict)

    args = options.parse_args_and_arch(parser, modify_parser=parser_modifier)

    update_namespace(args, config_dict)

    # set sharing strategy file system in case /dev/shm/ limits are small
    if args.torch_file_system:
        torch.multiprocessing.set_sharing_strategy('file_system')

    training_name = get_training_name(args)
    base_save_dir = generate_save_dir(args, training_name, sys.argv[1:])
    setattr(args, 'training_name', training_name)
    setattr(args, 'save_dir', os.path.join(base_save_dir, 'checkpoints'))
    setattr(args, 'tensorboard_logdir',
            os.path.join(base_save_dir, 'tensorboard'))

    save_config(vars(args), base_save_dir)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if (args.update_freq is not None and max(args.update_freq) > 1
                and args.ddp_backend != 'no_c10d'):
            logger.info(
                'NOTE: you may get faster training with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )

    else:
        # single GPU training
        main(args)