Exemple #1
0
def main():
    """
    Launches data-parallel multi-gpu training.

    End-to-end driver: parses arguments, optionally initializes the
    distributed backend, builds the tokenizer/datasets/model/trainer,
    then runs the epoch loop with per-epoch validation and checkpointing
    (rank 0 only) and test-set BLEU evaluation via the Translator, which
    can stop training early once the target BLEU is reached.
    """
    args = parse_args()

    # cuDNN may be disabled for determinism/debugging
    if not args.cudnn:
        torch.backends.cudnn.enabled = False
    # offset the seed by rank so each worker draws a distinct RNG stream;
    # `is not None` (not truthiness) so an explicit --seed 0 is honored
    if args.seed is not None:
        torch.manual_seed(args.seed + args.rank)

    # initialize distributed backend
    distributed = args.world_size > 1
    if distributed:
        # nccl requires GPUs; gloo is the CPU fallback
        backend = 'nccl' if args.cuda else 'gloo'
        dist.init_process_group(backend=backend,
                                rank=args.rank,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create directory for results
    save_path = os.path.join(args.results_dir, args.save)
    args.save_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging (one log file per worker, keyed by rank)
    log_filename = f'log_gpu_{args.rank}.log'
    setup_logging(os.path.join(save_path, log_filename))

    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    # one process per GPU: rank doubles as the device index
    # (assumes a single-node launch where rank == local GPU — TODO confirm
    # for multi-node runs)
    if args.cuda:
        torch.cuda.set_device(args.rank)

    # build tokenizer
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME))

    # build datasets
    train_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    # validation set is length-sorted (sort=True) to reduce padding waste
    val_data = ParallelDataset(src_fname=os.path.join(args.dataset_dir,
                                                      config.SRC_VAL_FNAME),
                               tgt_fname=os.path.join(args.dataset_dir,
                                                      config.TGT_VAL_FNAME),
                               tokenizer=tokenizer,
                               min_len=args.min_length_val,
                               max_len=args.max_length_val,
                               sort=True)

    # test set is source-only (TextDataset); quality is scored via BLEU
    test_data = TextDataset(src_fname=os.path.join(args.dataset_dir,
                                                   config.SRC_TEST_FNAME),
                            tokenizer=tokenizer,
                            min_len=args.min_length_test,
                            max_len=args.max_length_test,
                            sort=False)

    vocab_size = tokenizer.vocab_size

    # build GNMT model; args.model_config is a python-literal dict string
    model_config = dict(vocab_size=vocab_size,
                        math=args.math,
                        **literal_eval(args.model_config))
    model = GNMT(**model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)
    opt_config = literal_eval(args.optimization_config)
    logging.info(f'Training optimizer: {opt_config}')

    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info(f'Number of parameters: {num_parameters}')

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         bucketing=args.bucketing,
                                         num_workers=args.workers,
                                         drop_last=True)

    val_loader = val_data.get_loader(batch_size=args.val_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.workers,
                                     drop_last=False)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       num_workers=args.workers,
                                       drop_last=False)

    # beam-search translator; runs the per-epoch test-set BLEU evaluation
    # and reports whether args.target_bleu has been reached
    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_test,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir,
                            target_bleu=args.target_bleu,
                            save_path=args.save_path)

    # create trainer
    trainer_options = dict(criterion=criterion,
                           grad_clip=args.grad_clip,
                           save_path=save_path,
                           save_freq=args.save_freq,
                           save_info={
                               'config': args,
                               'tokenizer': tokenizer
                           },
                           opt_config=opt_config,
                           batch_first=batch_first,
                           keep_checkpoints=args.keep_checkpoints,
                           math=args.math,
                           print_freq=args.print_freq,
                           cuda=args.cuda,
                           distributed=distributed,
                           intra_epoch_eval=args.intra_epoch_eval,
                           translator=translator)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # optionally resume from a checkpoint (a directory resolves to its
    # best-model file)
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    best_loss = float('inf')
    for epoch in range(args.start_epoch, args.epochs):
        logging.info(f'Starting epoch {epoch}')

        if distributed:
            # reshuffle the sampler's shards deterministically per epoch
            train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        # evaluate on validation set (rank 0 only; it owns checkpointing)
        if args.rank == 0 and not args.disable_eval:
            logging.info(f'Running validation on dev set')
            val_loss, val_perf = trainer.evaluate(val_loader)

            # remember best prec@1 and save checkpoint
            is_best = val_loss < best_loss
            best_loss = min(val_loss, best_loss)
            trainer.save(save_all=args.save_all, is_best=is_best)

        # all ranks run the BLEU evaluation; translator.run also returns
        # the early-stop flag for target-BLEU termination
        break_training = False
        if not args.disable_eval:
            test_bleu, break_training = translator.run(calc_bleu=True,
                                                       epoch=epoch)

        if args.rank == 0 and not args.disable_eval:
            # full summary: val_loss/test_bleu exist on this path because
            # both eval branches above ran under the same conditions
            logging.info(f'Summary: Epoch: {epoch}\t'
                         f'Training Loss: {train_loss:.4f}\t'
                         f'Validation Loss: {val_loss:.4f}\t'
                         f'Test BLEU: {test_bleu:.2f}')
            logging.info(f'Performance: Epoch: {epoch}\t'
                         f'Training: {train_perf:.0f} Tok/s\t'
                         f'Validation: {val_perf:.0f} Tok/s')
        else:
            # non-zero ranks (or eval disabled) log training stats only
            logging.info(f'Summary: Epoch: {epoch}\t'
                         f'Training Loss {train_loss:.4f}')
            logging.info(f'Performance: Epoch: {epoch}\t'
                         f'Training: {train_perf:.0f} Tok/s')

        logging.info(f'Finished epoch {epoch}')
        if break_training:
            break
Exemple #2
0
def main():
    """
    Launches data-parallel multi-gpu training with MLPerf compliance logging.

    Driver for the GNMT benchmark: parses arguments, optionally initializes
    the distributed backend, builds tokenizer/datasets/model/trainer, then
    runs the epoch loop. Rank 0 validates, checkpoints, and evaluates
    test-set BLEU each epoch; training stops early once ``args.target_bleu``
    is reached (the decision is broadcast to all ranks).

    Fixes vs. previous revision: seed check uses ``is not None`` so
    ``--seed 0`` is honored, and ``break_training`` is initialized before
    the loop so the final RUN_STOP log cannot raise ``NameError`` when the
    epoch range is empty.
    """
    mlperf_log.ROOT_DIR_GNMT = os.path.dirname(os.path.abspath(__file__))
    mlperf_log.LOGGER.propagate = False
    mlperf_log.gnmt_print(key=mlperf_log.RUN_START)

    args = parse_args()
    print(args)

    # cuDNN may be disabled for determinism/debugging
    if not args.cudnn:
        torch.backends.cudnn.enabled = False
    mlperf_log.gnmt_print(key=mlperf_log.RUN_SET_RANDOM_SEED)
    # compare against None so an explicit --seed 0 is honored
    # (a bare `if args.seed:` silently skipped seeding for 0)
    if args.seed is not None:
        torch.manual_seed(args.seed + args.rank)

    # initialize distributed backend (nccl needs GPUs; gloo is CPU fallback)
    distributed = args.world_size > 1
    if distributed:
        backend = 'nccl' if args.cuda else 'gloo'
        dist.init_process_group(backend=backend, rank=args.rank,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create directory for results
    save_path = os.path.join(args.results_dir, args.save)
    os.makedirs(save_path, exist_ok=True)

    # setup logging (one log file per worker, keyed by rank)
    log_filename = f'log_gpu_{args.rank}.log'
    setup_logging(os.path.join(save_path, log_filename))

    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    # one process per GPU: rank doubles as the device index
    # (assumes single-node launch — TODO confirm for multi-node)
    if args.cuda:
        torch.cuda.set_device(args.rank)

    # build tokenizer
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME))

    # build datasets
    mlperf_log.gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_TRAINING)
    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_HP_MAX_SEQ_LEN,
                          value=args.max_length_train)

    train_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    mlperf_log.gnmt_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
                          value=len(train_data))

    # validation set is length-sorted (sort=True) to reduce padding waste
    val_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_VAL_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_VAL_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_val,
        max_len=args.max_length_val,
        sort=True)

    mlperf_log.gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_EVAL)

    # NOTE(review): the test set reuses the *_val length limits —
    # presumably intentional for the benchmark; confirm if test lengths
    # should ever differ
    test_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_val,
        max_len=args.max_length_val,
        sort=False)

    mlperf_log.gnmt_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
                          value=len(test_data))

    vocab_size = tokenizer.vocab_size
    mlperf_log.gnmt_print(key=mlperf_log.PREPROC_VOCAB_SIZE, value=vocab_size)

    # build GNMT model; args.model_config is a python-literal dict string
    model_config = dict(vocab_size=vocab_size, math=args.math,
                        **literal_eval(args.model_config))
    # SSY the real model
    # seq2seq/models/gnmt.py
    model = models.GNMT(**model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)
    opt_config = literal_eval(args.optimization_config)
    logging.info(f'Training optimizer: {opt_config}')

    # create trainer
    trainer_options = dict(
        criterion=criterion,
        grad_clip=args.grad_clip,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info={'config': args, 'tokenizer': tokenizer},
        opt_config=opt_config,
        batch_first=batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed)

    trainer_options['model'] = model
    # SSY only the trainer seq2seq/train/trainer.py
    # not the models
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # beam-search translator used by the inline test-set evaluation below
    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_val,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda)

    num_parameters = sum(l.nelement() for l in model.parameters())
    logging.info(f'Number of parameters: {num_parameters}')

    # optionally resume from a checkpoint (a directory resolves to its
    # best-model file)
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(
                checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error(f'No checkpoint found at {args.resume}')

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         bucket=args.bucketing,
                                         num_workers=args.workers,
                                         drop_last=True,
                                         distributed=distributed)

    mlperf_log.gnmt_print(key=mlperf_log.INPUT_BATCH_SIZE,
                          value=args.batch_size * args.world_size)
    mlperf_log.gnmt_print(key=mlperf_log.INPUT_SIZE,
                          value=train_loader.sampler.num_samples)

    val_loader = val_data.get_loader(batch_size=args.eval_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.workers,
                                     drop_last=False,
                                     distributed=False)

    test_loader = test_data.get_loader(batch_size=args.eval_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       num_workers=0,
                                       drop_last=False,
                                       distributed=False)

    mlperf_log.gnmt_print(key=mlperf_log.EVAL_SIZE,
                          value=len(test_loader.sampler))

    # training loop
    best_loss = float('inf')
    # initialized before the loop so the RUN_STOP log below is defined
    # even when the epoch range is empty
    break_training = False
    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.start_epoch, args.epochs):
        mlperf_log.gnmt_print(key=mlperf_log.TRAIN_EPOCH,
                              value=epoch)
        logging.info(f'Starting epoch {epoch}')

        if distributed:
            # reshuffle the sampler's shards deterministically per epoch
            train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss = trainer.optimize(train_loader)

        # evaluate on validation set (rank 0 only; it owns checkpointing)
        if args.rank == 0 and not args.disable_eval:
            logging.info('Running validation on dev set')
            val_loss = trainer.evaluate(val_loader)

            # remember best prec@1 and save checkpoint
            is_best = val_loss < best_loss
            best_loss = min(val_loss, best_loss)

            mlperf_log.gnmt_print(key=mlperf_log.TRAIN_CHECKPOINT)
            trainer.save(save_all=args.save_all, is_best=is_best)

            logging.info(f'Epoch: {epoch}\t'
                         f'Training Loss {train_loss:.4f}\t'
                         f'Validation Loss {val_loss:.4f}')
        else:
            logging.info(f'Epoch: {epoch}\t'
                         f'Training Loss {train_loss:.4f}')

        # tensor flag (not a bool) so the early-stop decision can be
        # broadcast to all ranks via torch.distributed below
        if args.cuda:
            break_training = torch.cuda.LongTensor([0])
        else:
            break_training = torch.LongTensor([0])

        if args.rank == 0 and not args.disable_eval:
            logging.info('Running evaluation on test set')
            mlperf_log.gnmt_print(key=mlperf_log.EVAL_START, value=epoch)

            model.eval()
            torch.cuda.empty_cache()

            eval_path = os.path.join(save_path, f'eval_epoch_{epoch}')
            eval_file = open(eval_path, 'w')

            for i, (src, tgt, indices) in enumerate(test_loader):
                src, src_length = src

                if translator.batch_first:
                    batch_size = src.size(0)
                else:
                    batch_size = src.size(1)
                beam_size = args.beam_size

                # one BOS token per beam hypothesis
                bos = [translator.insert_target_start] * (batch_size * beam_size)
                bos = torch.LongTensor(bos)
                if translator.batch_first:
                    bos = bos.view(-1, 1)
                else:
                    bos = bos.view(1, -1)

                src_length = torch.LongTensor(src_length)

                if args.cuda:
                    src = src.cuda()
                    src_length = src_length.cuda()
                    bos = bos.cuda()

                with torch.no_grad():
                    context = translator.model.encode(src, src_length)
                    context = [context, src_length, None]

                    if beam_size == 1:
                        generator = translator.generator.greedy_search
                    else:
                        generator = translator.generator.beam_search
                    preds, lengths, counter = generator(batch_size, bos, context)

                preds = preds.cpu()
                lengths = lengths.cpu()

                output = []
                for idx, pred in enumerate(preds):
                    # drop BOS at position 0 and the final EOS token
                    end = lengths[idx] - 1
                    pred = pred[1: end]
                    pred = pred.tolist()
                    out = translator.tok.detokenize(pred)
                    output.append(out)

                # undo the loader's sorting: restore original dataset order
                output = [output[indices.index(i)] for i in range(len(output))]
                for line in output:
                    eval_file.write(line)
                    eval_file.write('\n')

            eval_file.close()

            # run moses detokenizer
            detok_path = os.path.join(args.dataset_dir, config.DETOKENIZER)
            detok_eval_path = eval_path + '.detok'

            with open(detok_eval_path, 'w') as detok_eval_file,  \
                    open(eval_path, 'r') as eval_file:
                subprocess.run(['perl', f'{detok_path}'], stdin=eval_file,
                               stdout=detok_eval_file, stderr=subprocess.DEVNULL)

            # run sacrebleu
            # NOTE(review): shell=True with interpolated paths — these come
            # from local args/config, but prefer a list + shell=False if
            # they can ever contain untrusted content
            reference_path = os.path.join(args.dataset_dir, config.TGT_TEST_TARGET_FNAME)
            sacrebleu = subprocess.run([f'sacrebleu --input {detok_eval_path} \
                                        {reference_path} --score-only -lc --tokenize intl'],
                                       stdout=subprocess.PIPE, shell=True)
            bleu = float(sacrebleu.stdout.strip())
            logging.info('Finished evaluation on test set')
            logging.info(f'BLEU on test dataset: {bleu}')

            if args.target_bleu:
                if bleu >= args.target_bleu:
                    logging.info('Target accuracy reached')
                    break_training[0] = 1

            torch.cuda.empty_cache()
            mlperf_log.gnmt_print(key=mlperf_log.EVAL_ACCURACY,
                                  value={"epoch": epoch, "value": bleu})
            mlperf_log.gnmt_print(key=mlperf_log.EVAL_TARGET,
                                  value=args.target_bleu)
            mlperf_log.gnmt_print(key=mlperf_log.EVAL_STOP)

        if distributed:
            # rank 0 decides whether to stop; everyone receives the flag
            dist.broadcast(break_training, 0)

        logging.info(f'Finished epoch {epoch}')
        if break_training:
            break

    mlperf_log.gnmt_print(key=mlperf_log.RUN_STOP,
                          value={"success": bool(break_training)})
    mlperf_log.gnmt_print(key=mlperf_log.RUN_FINAL)
def main():
    """
    Launches data-parallel multi-gpu training with optional profiling.

    Same flow as the other drivers: parse args, optionally initialize the
    distributed backend, build datasets/model/trainer, then run the epoch
    loop. When ``--profile`` or ``--num_minibatches`` is set, training
    stops after the first (partial) epoch; otherwise rank 0 validates,
    checkpoints, and evaluates test-set BLEU each epoch, optionally
    stopping early at ``--target_bleu``.

    Fixes vs. previous revision: two Python 2 style ``print "..."``
    statements (a SyntaxError under Python 3) replaced with function
    calls; seed check uses ``is not None`` so ``--seed 0`` is honored;
    results-directory creation uses ``exist_ok=True`` instead of a racy
    check-then-create.
    """
    args = parse_args()
    print(args)

    # profiling output directory is only used when profiling is enabled
    profile_dir = args.profile_dir
    if not args.profile:
        profile_dir = None

    # cuDNN may be disabled for determinism/debugging
    if not args.cudnn:
        torch.backends.cudnn.enabled = False
    # compare against None so an explicit --seed 0 is honored
    # (a bare `if args.seed:` silently skipped seeding for 0)
    if args.seed is not None:
        torch.manual_seed(args.seed + args.rank)

    if args.cuda:
        torch.cuda.set_device(args.gpu_rank)

    # initialize distributed backend (nccl needs GPUs; gloo is CPU fallback)
    distributed = args.world_size > 1
    if distributed:
        print("init process group")
        backend = 'nccl' if args.cuda else 'gloo'
        dist.init_process_group(backend=backend, rank=args.rank,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    print("distributed backend initialized")
    # create directory for results; exist_ok avoids the check-then-create
    # race when several workers start simultaneously
    save_path = os.path.join(args.results_dir, args.save)
    os.makedirs(save_path, exist_ok=True)

    # setup logging (one log file per worker, keyed by rank)
    log_filename = 'log_gpu_{}.log'.format(args.rank)
    setup_logging(os.path.join(save_path, log_filename))

    logging.info('Saving results to: {}'.format(save_path))
    logging.info('Run arguments: {}'.format(args))

    # build tokenizer
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME))

    # build datasets
    train_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    # validation set is length-sorted (sort=True) to reduce padding waste
    val_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_VAL_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_VAL_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_val,
        max_len=args.max_length_val,
        sort=True)

    # NOTE(review): the test set reuses the *_val length limits —
    # presumably intentional; confirm if test lengths should differ
    test_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_val,
        max_len=args.max_length_val,
        sort=False)

    vocab_size = tokenizer.vocab_size

    # build GNMT model; args.model_config is a python-literal dict string
    model_config = dict(vocab_size=vocab_size, math=args.math,
                        **literal_eval(args.model_config))
    model = models.GNMT(**model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)
    opt_config = literal_eval(args.optimization_config)
    logging.info('Training optimizer: {}'.format(opt_config))

    # create trainer (with profiling hooks: log_dir/num_minibatches/cupti)
    trainer_options = dict(
        criterion=criterion,
        grad_clip=args.grad_clip,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info={'config': args, 'tokenizer': tokenizer},
        opt_config=opt_config,
        batch_first=batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed,
        log_dir=profile_dir,
        num_minibatches=args.num_minibatches,
        cupti=args.cupti)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # beam-search translator used by the inline test-set evaluation below
    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_val,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda)

    num_parameters = sum(l.nelement() for l in model.parameters())
    logging.info('Number of parameters: {}'.format(num_parameters))

    # optionally resume from a checkpoint (a directory resolves to its
    # best-model file)
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(
                checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error('No checkpoint found at {}'.format(args.resume))

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         bucket=args.bucketing,
                                         num_workers=args.workers,
                                         drop_last=True,
                                         distributed=distributed,
                                         log_dir=profile_dir)

    val_loader = val_data.get_loader(batch_size=args.eval_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.workers,
                                     drop_last=False,
                                     distributed=False)

    test_loader = test_data.get_loader(batch_size=args.eval_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       num_workers=0,
                                       drop_last=False,
                                       distributed=False)

    # training loop
    best_loss = float('inf')
    for epoch in range(args.start_epoch, args.epochs):
        logging.info('Starting epoch {}'.format(epoch))

        if distributed:
            # reshuffle the sampler's shards deterministically per epoch
            train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss = trainer.optimize(train_loader)
        # profiling runs stop after the first optimize() pass
        if args.profile:
            if args.cuda:
                break_training = torch.cuda.LongTensor([0])
            else:
                break_training = torch.LongTensor([0])
            print("profiling finished...")
            break

        # limited-minibatch runs also stop after one pass
        if args.num_minibatches > 0:
            break
        # evaluate on validation set (rank 0 only; it owns checkpointing)
        if args.rank == 0 and not args.disable_eval:
            logging.info('Running validation on dev set')
            val_loss = trainer.evaluate(val_loader)

            # remember best prec@1 and save checkpoint
            is_best = val_loss < best_loss
            best_loss = min(val_loss, best_loss)

            trainer.save(save_all=args.save_all, is_best=is_best)

            logging.info('Epoch: {}\tTraining Loss {:.4f}\tValidation Loss {:.4f}'.format(
                         epoch, train_loss, val_loss))
        else:
            logging.info('Epoch: {}\tTraining Loss {:.4f}'.format(
                         epoch, train_loss))

        # tensor flag (not a bool) so the early-stop decision can be
        # broadcast to all ranks via torch.distributed below
        if args.cuda:
            break_training = torch.cuda.LongTensor([0])
        else:
            break_training = torch.LongTensor([0])

        if args.rank == 0 and not args.disable_eval:
            logging.info('Running evaluation on test set')

            model.eval()
            torch.cuda.empty_cache()

            eval_path = os.path.join(save_path, 'eval_epoch_{}'.format(epoch))
            eval_file = open(eval_path, 'w')

            for i, (src, tgt, indices) in enumerate(test_loader):
                src, src_length = src

                if translator.batch_first:
                    batch_size = src.size(0)
                else:
                    batch_size = src.size(1)
                beam_size = args.beam_size

                # one BOS token per beam hypothesis
                bos = [translator.insert_target_start] * (batch_size * beam_size)
                bos = torch.LongTensor(bos)
                if translator.batch_first:
                    bos = bos.view(-1, 1)
                else:
                    bos = bos.view(1, -1)

                src_length = torch.LongTensor(src_length)

                if args.cuda:
                    src = src.cuda()
                    src_length = src_length.cuda()
                    bos = bos.cuda()

                with torch.no_grad():
                    context = translator.model.encode(src, src_length)
                    context = [context, src_length, None]

                    if beam_size == 1:
                        generator = translator.generator.greedy_search
                    else:
                        generator = translator.generator.beam_search
                    preds, lengths, counter = generator(batch_size, bos, context)

                preds = preds.cpu()
                lengths = lengths.cpu()

                output = []
                for idx, pred in enumerate(preds):
                    # drop BOS at position 0 and the final EOS token
                    end = lengths[idx] - 1
                    pred = pred[1: end]
                    pred = pred.tolist()
                    out = translator.tok.detokenize(pred)
                    output.append(out)

                # undo the loader's sorting: restore original dataset order
                output = [output[indices.index(i)] for i in range(len(output))]
                for line in output:
                    eval_file.write(line)
                    eval_file.write('\n')

            eval_file.close()

            # run moses detokenizer
            detok_path = os.path.join(args.dataset_dir, config.DETOKENIZER)
            detok_eval_path = eval_path + '.detok'

            with open(detok_eval_path, 'w') as detok_eval_file,  \
                    open(eval_path, 'r') as eval_file:
                subprocess.run(['perl', '{}'.format(detok_path)], stdin=eval_file,
                               stdout=detok_eval_file, stderr=subprocess.DEVNULL)

            # run sacrebleu
            # NOTE(review): shell=True with interpolated paths — these come
            # from local args/config, but prefer a list + shell=False if
            # they can ever contain untrusted content
            reference_path = os.path.join(args.dataset_dir, config.TGT_TEST_TARGET_FNAME)
            sacrebleu = subprocess.run(['sacrebleu --input {} {} --score-only -lc --tokenize intl'.format(
                                         detok_eval_path, reference_path)],
                                       stdout=subprocess.PIPE, shell=True)
            bleu = float(sacrebleu.stdout.strip())
            logging.info('Finished evaluation on test set')
            logging.info('BLEU on test dataset: {}'.format(bleu))

            if args.target_bleu:
                if bleu >= args.target_bleu:
                    logging.info('Target accuracy reached')
                    break_training[0] = 1

            torch.cuda.empty_cache()

        if distributed:
            # rank 0 decides whether to stop; everyone receives the flag
            dist.broadcast(break_training, 0)

        logging.info('Finished epoch {}'.format(epoch))
        if break_training:
            break
Exemple #4
0
def main():
    """
    Launches data-parallel multi-gpu training.

    Sets up MLPerf logging, optional torch.distributed (NCCL) communication,
    per-worker RNG seeding broadcast from rank 0, datasets/loaders, the GNMT
    model and trainer, then runs the training loop and (optionally) saves a
    final checkpoint from rank 0.
    """
    # Anchor MLPerf log paths at this file's directory and keep its logger
    # from duplicating records into the root logger.
    mlperf_log.ROOT_DIR_GNMT = os.path.dirname(os.path.abspath(__file__))
    mlperf_log.LOGGER.propagate = False

    args = parse_args()

    if args.cuda:
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # initialize distributed backend
    # WORLD_SIZE is set by the torch.distributed launcher; absence of the
    # variable means single-process training.
    distributed = False
    if 'WORLD_SIZE' in os.environ:
        distributed = int(os.environ['WORLD_SIZE']) > 1

    if distributed:
        assert args.cuda
        '''Initialize distributed communication'''
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        assert torch.distributed.is_initialized()

    gnmt_print(key=mlperf_log.RUN_START)

    # global rank of this worker (0 in the non-distributed case)
    args.rank = get_rank()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results
    save_path = os.path.join(args.results_dir, args.save)
    args.save_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging: one log file per worker, keyed by rank
    log_filename = f'log_gpu_{args.rank}.log'
    setup_logging(os.path.join(save_path, log_filename))

    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    # setup L2 promotion
    if args.cuda:
        l2_promote()

    gnmt_print(key=mlperf_log.RUN_SET_RANDOM_SEED)
    # https://github.com/mlperf/policies/issues/120#issuecomment-431111348
    if args.seed is None:
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)
        if get_rank() == 0:
            # master seed is reported only from rank=0 worker, it's to avoid
            # confusion, seeds from rank=0 are later broadcasted to other
            # workers
            logging.info(f'Using random master seed: {master_seed}')
    else:
        # master seed was specified from command line
        master_seed = args.seed
        logging.info(f'Using master seed from command line: {master_seed}')

    # initialize seeding RNG
    seeding_rng = random.Random(master_seed)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(seeding_rng, get_world_size())

    # generate seeds for data shuffling, one seed for every epoch
    shuffling_seeds = generate_seeds(seeding_rng, args.epochs)

    # broadcast seeds from rank=0 to other workers so every worker derives
    # its seed from the same master seed
    worker_seeds = broadcast_seeds(worker_seeds, device)
    shuffling_seeds = broadcast_seeds(shuffling_seeds, device)

    # set worker seed
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # build tokenizer
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME))

    # build datasets
    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_TRAINING)
    gnmt_print(key=mlperf_log.TRAIN_HP_MAX_SEQ_LEN,
               value=args.max_length_train)

    # lazy dataset: tokenizes on the fly instead of holding all tokenized
    # training data in memory
    train_data = LazyParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    gnmt_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
               value=len(train_data))

    val_data = ParallelDataset(src_fname=os.path.join(args.dataset_dir,
                                                      config.SRC_VAL_FNAME),
                               tgt_fname=os.path.join(args.dataset_dir,
                                                      config.TGT_VAL_FNAME),
                               tokenizer=tokenizer,
                               min_len=args.min_length_val,
                               max_len=args.max_length_val,
                               sort=True)

    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_EVAL)

    test_data = TextDataset(src_fname=os.path.join(args.dataset_dir,
                                                   config.SRC_TEST_FNAME),
                            tokenizer=tokenizer,
                            min_len=args.min_length_test,
                            max_len=args.max_length_test,
                            sort=False)

    gnmt_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES, value=len(test_data))

    vocab_size = tokenizer.vocab_size
    # size of the vocabulary has been padded to a multiple of 8
    gnmt_print(key=mlperf_log.PREPROC_VOCAB_SIZE, value=vocab_size)

    # build GNMT model
    model_config = dict(vocab_size=vocab_size,
                        math=args.math,
                        **literal_eval(args.model_config))
    model = GNMT(**model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)
    opt_config = literal_eval(args.optimization_config)
    scheduler_config = literal_eval(args.scheduler_config)
    logging.info(f'Training optimizer: {opt_config}')
    logging.info(f'Training LR Schedule: {scheduler_config}')

    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info(f'Number of parameters: {num_parameters}')

    # get data loaders; one shuffling seed per epoch is consumed by the
    # train loader's sampler
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         seeds=shuffling_seeds,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         bucketing=args.bucketing,
                                         num_workers=args.train_loader_workers)

    gnmt_print(key=mlperf_log.INPUT_BATCH_SIZE,
               value=args.batch_size * get_world_size())
    gnmt_print(key=mlperf_log.INPUT_SIZE,
               value=train_loader.sampler.num_samples)

    val_loader = val_data.get_loader(batch_size=args.val_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.val_loader_workers)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       pad=True,
                                       num_workers=args.test_loader_workers)

    gnmt_print(key=mlperf_log.EVAL_SIZE, value=len(test_loader.dataset))

    # translator runs beam-search evaluation on the test set
    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_test,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir,
                            target_bleu=args.target_bleu,
                            save_path=args.save_path)

    # create trainer
    trainer_options = dict(
        criterion=criterion,
        grad_clip=args.grad_clip,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info={
            'config': args,
            'tokenizer': tokenizer.get_state()
        },
        opt_config=opt_config,
        scheduler_config=scheduler_config,
        batch_first=batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed,
        distributed_overlap_allreduce=args.enable_apex_allreduce_overlap,
        distributed_overlap_allreduce_messagesize=args.apex_message_size,
        intra_epoch_eval=args.intra_epoch_eval,
        translator=translator,
        arch=args.arch)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # optionally resume from a checkpoint
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    # best_loss = float('inf')
    gnmt_print(key=mlperf_log.TRAIN_LOOP)

    # NOTE(review): this runs exactly one epoch regardless of args.epochs
    # (which is still used above to size shuffling_seeds) — confirm intended.
    for epoch in range(1):
        logging.info(f'Starting epoch {epoch}')
        gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)

        if distributed:
            train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        logging.info(f'Finished epoch {epoch}')

    # Save the checkpoint at the end of the training loop, after the RUN_STOP
    # tag
    # https://github.com/mlperf/policies/issues/55#issuecomment-428335773
    if not args.disable_eval:
        gnmt_print(key=mlperf_log.TRAIN_CHECKPOINT)
        if get_rank() == 0:
            # only rank 0 writes the checkpoint to avoid concurrent writes
            trainer.save(save_all=args.save_all, is_best=True)

    gnmt_print(key=mlperf_log.RUN_FINAL)
Exemple #5
0
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU, implementation supports beam search
    with length normalization and coverage penalty.

    For every epoch that has a full set of per-stage pipeline checkpoints in
    ``args.checkpoint_path``, stitches the stage checkpoints back into a
    monolithic GNMT model, runs beam-search inference on ``args.input`` and
    prints the epoch's BLEU score.
    """
    args = parse_args()
    args.batch_first = False

    if args.cuda:
        torch.cuda.set_device(0)
    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if args.math == 'fp16' and not args.cuda:
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    num_stages = args.num_stages
    # compute BLEU score for every epoch
    print("Epoch\tBLEU score")
    epoch = 0
    while True:
        # no more epochs to run, since desired file not available
        if not os.path.isfile(
                os.path.join(args.checkpoint_path,
                             f"checkpoint.0.pth.tar.epoch.{epoch}")):
            break

        # the module description defines the per-stage decomposition of the
        # full model (a sequence of (module, inputs, outputs) entries)
        module = importlib.import_module(args.module)
        model = module.model(None)
        num_modules = len(model)

        # (stage_id, local_module_id, key) -> parameter tensor
        all_stages_state_dict = OrderedDict()
        # (stage_id, local_module_id, key) -> (wrapping module, param name)
        key_to_module_mapping = OrderedDict()
        module_id = 0
        for stage_id in range(num_stages):
            # load the checkpoint associated with a stage
            full_checkpoint_path = os.path.join(
                args.checkpoint_path,
                f"checkpoint.{stage_id}.pth.tar.epoch.{epoch}")
            checkpoint = torch.load(full_checkpoint_path,
                                    map_location=torch.device('cpu'))

            # iterate through all modules in stage_id's checkpoint
            local_module_id = 0

            # quit when checkpoints for all modules in full model are loaded
            while module_id < num_modules:

                # load checkpoint corresponding to different modules in our runtime
                state_dict = checkpoint["state_dict"]
                state_dict_key = "module%d" % local_module_id

                # stage checkpoint exhausted: continue with the next stage
                if state_dict_key not in state_dict:
                    break
                state_dict = checkpoint["state_dict"][state_dict_key]

                # remove mask buffer (runtime-only, not a model parameter)
                keys_to_delete = []
                for key in state_dict:
                    if "mask" in key:
                        keys_to_delete.append(key)
                for key in keys_to_delete:
                    del state_dict[key]

                if checkpoint_from_distributed(state_dict):
                    state_dict = unwrap_distributed(state_dict)

                # collect all state_dicts in a single OrderedDict
                for key in state_dict:
                    all_stages_state_dict[(stage_id, local_module_id,
                                           key)] = state_dict[key]

                stage_module, _, _ = model[module_id]
                for key in state_dict:
                    # key_to_module_mapping maps key (in state_dict) to the
                    # torch.nn.Module wrapping the parameter and the name
                    # of parameter (weight, bias, etc.)
                    key_to_module_mapping[(
                        stage_id, local_module_id,
                        key)] = get_submodule_and_parameter_name(
                            stage_module, key)

                # load tokenizer state (every stage checkpoint carries a copy;
                # the last one read wins)
                tokenizer = Tokenizer()
                tokenizer.set_state(checkpoint['tokenizer'])
                vocab_size = tokenizer.vocab_size

                local_module_id += 1
                module_id += 1

        epoch += 1

        # build model, and load state dict
        model_config = {
            'vocab_size': vocab_size,
            'batch_first': args.batch_first,
            'hidden_size': 1024,
            'num_layers': args.num_layers,
            'dropout': 0.2,
            'share_embedding': False
        }
        model = GNMT(**model_config)
        model_state_dict = OrderedDict()
        for real_key in model.state_dict():
            (module, parameter_name) = get_submodule_and_parameter_name(
                model, real_key)
            # find key in all_stages_state_dict that corresponds to real_key in
            # model's state_dict; matching is by parameter name plus the
            # string representation of the wrapping module
            for key in key_to_module_mapping:
                (module2, parameter_name2) = key_to_module_mapping[key]
                if parameter_name == parameter_name2 and str(module) == str(
                        module2):
                    break
            if parameter_name == parameter_name2 and str(module) == str(
                    module2):
                model_state_dict[real_key] = all_stages_state_dict[key]
                # consume the match so duplicate module/parameter pairs map
                # to distinct checkpoint entries
                del key_to_module_mapping[key]
                del all_stages_state_dict[key]

        # load state_dict into model, and perform inference
        model.load_state_dict(model_state_dict)

        if args.math == 'fp32':
            dtype = torch.FloatTensor
        elif args.math == 'fp16':
            dtype = torch.HalfTensor

        model.type(dtype)
        # bugfix: only move the model to GPU when CUDA is enabled; the guards
        # above explicitly allow fp32 inference on CPU-only hosts
        if args.cuda:
            model = model.cuda()
        model.eval()

        # construct the dataset
        test_data = TextDataset(src_fname=args.input,
                                tokenizer=tokenizer,
                                sort=False)

        # build the data loader
        test_loader = test_data.get_loader(world_size=1,
                                           rank=0,
                                           batch_size=args.batch_size,
                                           batch_first=args.batch_first,
                                           shuffle=False,
                                           pad=True,
                                           num_workers=0)

        # build the translator object
        translator = Translator(model=model,
                                tokenizer=tokenizer,
                                loader=test_loader,
                                beam_size=args.beam_size,
                                max_seq_len=args.max_seq_len,
                                len_norm_factor=args.len_norm_factor,
                                len_norm_const=args.len_norm_const,
                                cov_penalty_factor=args.cov_penalty_factor,
                                cuda=args.cuda,
                                print_freq=args.print_freq,
                                dataset_dir=args.dataset_dir)

        # execute the inference
        test_bleu, _ = translator.run(calc_bleu=args.bleu,
                                      eval_path=args.output,
                                      reference_path=args.reference,
                                      summary=True)
        print(f'{epoch}\t{test_bleu:.2f}')
Exemple #6
0
def main():
    """
    Runs batched beam-search inference on the test set and reports timing
    statistics; in 'accuracy' mode also writes translations to a file and
    scores them with sacrebleu after Moses detokenization.
    """
    args = parse_args()
    print(args)

    if args.cuda:
        torch.cuda.set_device(0)
    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if args.math == 'fp16' and not args.cuda:
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # deserialize GPU-saved tensors onto the CPU to save GPU memory
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    vocab_size = checkpoint['tokenizer'].vocab_size
    model_config = dict(vocab_size=vocab_size,
                        math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))
    model_config['batch_first'] = args.batch_first
    model = models.GNMT(**model_config)

    state_dict = checkpoint['state_dict']
    # strip 'module.' prefixes left by DistributedDataParallel wrapping
    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)

    model.load_state_dict(state_dict)

    # NOTE(review): dtype stays undefined if args.math is neither 'fp32' nor
    # 'fp16' — presumably argparse restricts the choices; confirm.
    if args.math == 'fp32':
        dtype = torch.FloatTensor
    if args.math == 'fp16':
        dtype = torch.HalfTensor

    model.type(dtype)
    if args.cuda:
        model = model.cuda()
    model.eval()

    tokenizer = checkpoint['tokenizer']

    test_data = ParallelDataset(src_fname=os.path.join(args.dataset_dir,
                                                       config.SRC_TEST_FNAME),
                                tgt_fname=os.path.join(args.dataset_dir,
                                                       config.TGT_TEST_FNAME),
                                tokenizer=tokenizer,
                                min_len=0,
                                max_len=150,
                                sort=False)

    test_loader = test_data.get_loader(batch_size=args.batch_size,
                                       batch_first=True,
                                       shuffle=False,
                                       num_workers=0,
                                       drop_last=False,
                                       distributed=False)

    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_seq_len,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda)

    model.eval()
    torch.cuda.empty_cache()

    # only write the output to file in accuracy mode
    if args.mode == 'accuracy':
        test_file = open(args.output, 'w', encoding='UTF-8')

    batch_time = AverageMeter(False)
    tot_tok_per_sec = AverageMeter(False)
    iterations = AverageMeter(False)
    enc_seq_len = AverageMeter(False)
    dec_seq_len = AverageMeter(False)
    stats = {}

    for i, (src, tgt, indices) in enumerate(test_loader):
        translate_timer = time.time()
        src, src_length = src

        # batch dimension position depends on the model's layout
        if translator.batch_first:
            batch_size = src.size(0)
        else:
            batch_size = src.size(1)
        beam_size = args.beam_size

        # one BOS token per (sentence, beam) pair to seed the decoder
        bos = [translator.insert_target_start] * (batch_size * beam_size)
        bos = torch.LongTensor(bos)
        if translator.batch_first:
            bos = bos.view(-1, 1)
        else:
            bos = bos.view(1, -1)

        src_length = torch.LongTensor(src_length)
        stats['total_enc_len'] = int(src_length.sum())

        if args.cuda:
            src = src.cuda()
            src_length = src_length.cuda()
            bos = bos.cuda()

        with torch.no_grad():
            context = translator.model.encode(src, src_length)
            context = [context, src_length, None]

            if beam_size == 1:
                generator = translator.generator.greedy_search
            else:
                generator = translator.generator.beam_search
            preds, lengths, counter = generator(batch_size, bos, context)

        stats['total_dec_len'] = lengths.sum().item()
        stats['iters'] = counter

        preds = preds.cpu()
        lengths = lengths.cpu()

        output = []
        for idx, pred in enumerate(preds):
            # drop the leading BOS and the trailing EOS token
            end = lengths[idx] - 1
            pred = pred[1:end]
            pred = pred.tolist()
            out = translator.tok.detokenize(pred)
            output.append(out)

        # only write the output to file in accuracy mode
        if args.mode == 'accuracy':
            # undo the loader's length-sorting so lines come out in input order
            output = [output[indices.index(i)] for i in range(len(output))]
            for line in output:
                test_file.write(line)
                test_file.write('\n')

        # Get timing
        elapsed = time.time() - translate_timer
        batch_time.update(elapsed, batch_size)

        total_tokens = stats['total_dec_len'] + stats['total_enc_len']
        ttps = total_tokens / elapsed
        tot_tok_per_sec.update(ttps, batch_size)

        iterations.update(stats['iters'])
        enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
        dec_seq_len.update(stats['total_dec_len'] / batch_size, batch_size)

        if i % 5 == 0:
            # list += str extends the list with the string's individual
            # characters; ''.join below reassembles them into one message
            log = []
            log += 'TEST '
            log += 'Time {:.3f} ({:.3f})\t'.format(batch_time.val,
                                                   batch_time.avg)
            log += 'Decoder iters {:.1f} ({:.1f})\t'.format(
                iterations.val, iterations.avg)
            log += 'Tok/s {:.0f} ({:.0f})'.format(tot_tok_per_sec.val,
                                                  tot_tok_per_sec.avg)
            log = ''.join(log)
            print(log)

    # summary timing
    # NOTE(review): batch_size here is the size of the *last* batch, which may
    # be smaller than args.batch_size — per-sentence time is approximate.
    time_per_sentence = (batch_time.avg / batch_size)
    log = []
    log += 'TEST SUMMARY:\n'
    log += 'Lines translated: {}\t'.format(len(test_loader.dataset))
    log += 'Avg total tokens/s: {:.0f}\n'.format(tot_tok_per_sec.avg)
    log += 'Avg time per batch: {:.3f} s\t'.format(batch_time.avg)
    log += 'Avg time per sentence: {:.3f} ms\n'.format(1000 *
                                                       time_per_sentence)
    log += 'Avg encoder seq len: {:.2f}\t'.format(enc_seq_len.avg)
    log += 'Avg decoder seq len: {:.2f}\t'.format(dec_seq_len.avg)
    log += 'Total decoder iterations: {}'.format(int(iterations.sum))
    log = ''.join(log)
    print(log)

    # only write the output to file in accuracy mode
    if args.mode == 'accuracy':
        test_file.close()

        test_path = args.output
        # run moses detokenizer
        detok_path = os.path.join(args.dataset_dir, config.DETOKENIZER)
        detok_test_path = test_path + '.detok'

        with open(detok_test_path, 'w') as detok_test_file, \
                open(test_path, 'r') as test_file:
            subprocess.run(['perl', detok_path],
                           stdin=test_file,
                           stdout=detok_test_file,
                           stderr=subprocess.DEVNULL)

        # run sacrebleu
        # NOTE(review): shell=True with paths interpolated into the command
        # string — paths come from local args, but quoting would be safer.
        reference_path = os.path.join(args.dataset_dir,
                                      config.TGT_TEST_TARGET_FNAME)
        sacrebleu = subprocess.run([
            'sacrebleu --input {} {} --score-only -lc --tokenize intl'.format(
                detok_test_path, reference_path)
        ],
                                   stdout=subprocess.PIPE,
                                   shell=True)
        bleu = float(sacrebleu.stdout.strip())

        print('BLEU on test dataset: {}'.format(bleu))

        print('Finished evaluation on test set')
Exemple #7
0
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU, implementation supports beam search
    with length normalization and coverage penalty.
    """
    args = parse_args()

    # initialize distributed backend when more than one worker is requested
    distributed = args.world_size > 1
    if distributed:
        backend = 'nccl' if args.cuda else 'gloo'
        dist.init_process_group(backend=backend,
                                rank=args.rank,
                                init_method=args.dist_url,
                                world_size=args.world_size)
    setup_logging()
    logging.info(f'Run arguments: {args}')

    # device / precision sanity checks
    if args.cuda:
        torch.cuda.set_device(args.rank)
    if torch.cuda.is_available() and not args.cuda:
        warnings.warn('cuda is available but not enabled')
    if not args.cuda and args.math == 'fp16':
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    ckpt = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # rebuild the tokenizer and the GNMT model from the checkpoint config
    tokenizer = Tokenizer()
    tokenizer.set_state(ckpt['tokenizer'])
    cfg = dict(vocab_size=tokenizer.vocab_size,
               math=ckpt['config'].math,
               **literal_eval(ckpt['config'].model_config))
    cfg['batch_first'] = args.batch_first
    model = GNMT(**cfg)

    weights = ckpt['state_dict']
    # strip distributed-training prefixes if the checkpoint carries them
    if checkpoint_from_distributed(weights):
        weights = unwrap_distributed(weights)
    model.load_state_dict(weights)

    # cast to the requested precision, move to GPU if enabled
    if args.math == 'fp32':
        dtype = torch.FloatTensor
    elif args.math == 'fp16':
        dtype = torch.HalfTensor
    model.type(dtype)
    if args.cuda:
        model = model.cuda()
    model.eval()

    # dataset and loader over the raw source-side input file
    dataset = TextDataset(src_fname=args.input,
                          tokenizer=tokenizer,
                          sort=False)
    loader = dataset.get_loader(batch_size=args.batch_size,
                                batch_first=args.batch_first,
                                shuffle=False,
                                pad=True,
                                num_workers=0,
                                drop_last=False)

    # translator wraps model + beam-search decoding parameters
    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_seq_len,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir)

    # execute the inference
    translator.run(calc_bleu=args.bleu,
                   eval_path=args.output,
                   reference_path=args.reference,
                   summary=True)
Exemple #8
0
def main():
    """
    Benchmarks batched translation: warms the model up on dummy data, then
    translates the input file batch by batch while tracking latency and
    token-throughput statistics, and prints a summary at the end.
    """
    args = parse_args()
    print(args)

    # device / precision sanity checks
    if args.cuda:
        torch.cuda.set_device(0)
    if torch.cuda.is_available() and not args.cuda:
        warnings.warn('cuda is available but not enabled')
    if not args.cuda and args.math == 'fp16':
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # deserialize GPU tensors onto the CPU
    ckpt = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # rebuild the model from the checkpoint's stored configuration
    cfg = dict(vocab_size=ckpt['tokenizer'].vocab_size,
               math=ckpt['config'].math,
               **literal_eval(ckpt['config'].model_config))
    cfg['batch_first'] = args.batch_first
    model = models.GNMT(**cfg)

    weights = ckpt['state_dict']
    if checkpoint_from_distributed(weights):
        weights = unwrap_distributed(weights)
    model.load_state_dict(weights)

    # cast to the requested precision, move to GPU if enabled
    if args.math == 'fp32':
        dtype = torch.FloatTensor
    elif args.math == 'fp16':
        dtype = torch.HalfTensor
    model.type(dtype)
    if args.cuda:
        model = model.cuda()
    model.eval()

    tokenizer = ckpt['tokenizer']

    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_seq_len,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda)

    out_file = codecs.open(args.output, 'w', encoding='UTF-8')

    # run model on generated data, for accurate timings starting from 1st batch
    warmup = ['abc ' * (args.max_seq_len // 4)] * args.batch_size
    translator.translate(warmup)
    if args.cuda:
        torch.cuda.synchronize()

    # throughput / latency meters
    lat = AverageMeter(False)
    enc_tps = AverageMeter(False)
    dec_tps = AverageMeter(False)
    tot_tps = AverageMeter(False)
    enc_len = AverageMeter(False)
    dec_len = AverageMeter(False)

    total_lines = 0
    total_iters = 0
    with codecs.open(args.input, encoding='UTF-8') as in_file:
        for batch_idx, chunk in enumerate(grouper(in_file, args.batch_size)):
            # grouper pads the final chunk with None entries; drop them
            batch = [line for line in chunk if line]
            count = len(batch)
            total_lines += count

            start = time.time()
            translated, stats = translator.translate(batch)
            elapsed = time.time() - start

            lat.update(elapsed, count)
            enc_len.update(stats['total_enc_len'] / count, count)
            dec_len.update(stats['total_dec_len'] / count, count)
            enc_tps.update(stats['total_enc_len'] / elapsed, count)
            dec_tps.update(stats['total_dec_len'] / elapsed, count)
            tot_tps.update(
                (stats['total_dec_len'] + stats['total_enc_len']) / elapsed,
                count)

            n_iterations = stats['iters']
            total_iters += n_iterations

            write_output(out_file, translated)

            if (batch_idx + 1) % args.print_freq == 0:
                print(f'TRANSLATION: '
                      f'Batch {batch_idx} '
                      f'Iters {n_iterations}\t'
                      f'Time {lat.val:.3f} ({lat.avg:.3f})\t'
                      f'Tot tok/s {tot_tps.val:.0f} ({tot_tps.avg:.0f})\t'
                      f'Enc tok/s {enc_tps.val:.0f} ({enc_tps.avg:.0f})\t'
                      f'Dec tok/s {dec_tps.val:.0f} ({dec_tps.avg:.0f})')

    out_file.close()

    print(f'TRANSLATION SUMMARY:\n'
          f'Lines translated: {total_lines}\t'
          f'Avg time per batch: {lat.avg:.3f} s\t'
          f'Avg time per sentence: {1000*(lat.avg / args.batch_size):.3f} ms\n'
          f'Avg enc seq len: {enc_len.avg:.2f}\t'
          f'Avg dec seq len: {dec_len.avg:.2f}\t'
          f'Total iterations: {total_iters}\t\n'
          f'Avg tot tok/s: {tot_tps.avg:.0f}\t'
          f'Avg enc tok/s: {enc_tps.avg:.0f}\t'
          f'Avg dec tok/s: {dec_tps.avg:.0f}')
Exemple #9
0
def main():
    """
    Launches GNMT training instrumented with MLPerf logging.

    Sets up the (optionally distributed) environment, builds the tokenizer,
    training dataset, model, criterion, trainer and translator, then runs
    the training loop for ``args.repeat`` epochs, timing each epoch with
    the experiment helper's chronometer.
    """
    # Point the MLPerf logger at this script's directory, stop propagation
    # to the root logger, and mark the start of the run.
    mlperf_log.ROOT_DIR_GNMT = os.path.dirname(os.path.abspath(__file__))
    mlperf_log.LOGGER.propagate = False
    mlperf_log.gnmt_print(key=mlperf_log.RUN_START)

    args = exp.get_arguments(parse_args(), show=True)
    device = exp.get_device()
    chrono = exp.chrono()  # per-epoch timing helper

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # initialize distributed backend (NCCL for GPU, Gloo as CPU fallback)
    distributed = args.world_size > 1
    if distributed:
        backend = 'nccl' if args.cuda else 'gloo'
        dist.init_process_group(backend=backend,
                                rank=args.rank,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create directory for results: OUTPUT_DIRECTORY (or /tmp) by default,
    # overridden by results_dir/save when --save is given
    save_path = os.environ.get('OUTPUT_DIRECTORY')
    if save_path is None:
        save_path = '/tmp'

    if args.save is not None:
        save_path = os.path.join(args.results_dir, args.save)
        os.makedirs(save_path, exist_ok=True)

    # setup logging: one log file per rank
    log_filename = f'log_gpu_{args.rank}.log'

    setup_logging(os.path.join(save_path, log_filename))

    if args.cuda:
        # NOTE(review): assumes one process per GPU with rank == device id
        torch.cuda.set_device(args.rank)

    # build tokenizer from the dataset's vocabulary file
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME))

    # unsorted training dataset, filtered by sentence length
    train_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    mlperf_log.gnmt_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
                          value=len(train_data))

    vocab_size = tokenizer.vocab_size
    mlperf_log.gnmt_print(key=mlperf_log.PREPROC_VOCAB_SIZE, value=vocab_size)

    # build GNMT model; model_config string is parsed with literal_eval
    model_config = dict(vocab_size=vocab_size,
                        math=args.math,
                        **literal_eval(args.model_config))
    model = models.GNMT(**model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)
    opt_config = literal_eval(args.optimization_config)

    # create trainer
    trainer_options = dict(criterion=criterion,
                           grad_clip=args.grad_clip,
                           save_path=save_path,
                           save_freq=args.save_freq,
                           save_info={
                               'config': args,
                               'tokenizer': tokenizer
                           },
                           opt_config=opt_config,
                           batch_first=batch_first,
                           keep_checkpoints=args.keep_checkpoints,
                           math=args.math,
                           print_freq=args.print_freq,
                           cuda=args.cuda,
                           distributed=distributed)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options, number=args.number)

    # NOTE(review): translator is constructed but never used in this
    # training loop (no intra-epoch evaluation here)
    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_val,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda)

    # NOTE(review): computed but unused — presumably kept for logging parity
    # with the other training scripts
    num_parameters = sum([l.nelement() for l in model.parameters()])

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         bucket=args.bucketing,
                                         num_workers=args.workers,
                                         drop_last=True,
                                         distributed=distributed)

    mlperf_log.gnmt_print(key=mlperf_log.INPUT_BATCH_SIZE,
                          value=args.batch_size * args.world_size)
    mlperf_log.gnmt_print(key=mlperf_log.INPUT_SIZE,
                          value=train_loader.sampler.num_samples)

    # training loop
    # NOTE(review): best_loss is initialized but never updated — no
    # validation/checkpoint-selection happens in this variant
    best_loss = float('inf')
    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_LOOP)

    for epoch in range(0, args.repeat):

        with chrono.time('train') as t:
            if distributed:
                # reshuffle shards deterministically per epoch
                train_loader.sampler.set_epoch(epoch)

            trainer.epoch = epoch
            train_loss = trainer.optimize(train_loader)
            exp.log_epoch_loss(train_loss)

        exp.show_eta(epoch, t)

    exp.report()
# --- Example #10 ---
def main():
    """
    Launches data-parallel multi-gpu training.

    Full training pipeline: distributed init, per-worker seeding, dataset
    and loader construction, GNMT model/criterion/trainer setup, optional
    checkpoint resume, then an epoch loop with optional validation and
    BLEU evaluation. Training stops early when the translator reports the
    target BLEU has been reached. MLPerf log calls mark the required
    compliance points; their order is significant.
    """
    mlperf_log.ROOT_DIR_GNMT = os.path.dirname(os.path.abspath(__file__))
    mlperf_log.LOGGER.propagate = False

    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    distributed = utils.init_distributed(args.cuda)
    gnmt_print(key=mlperf_log.RUN_START, sync=True)
    args.rank = utils.get_rank()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results
    save_path = os.path.join(args.results_dir, args.save)
    args.save_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging: one log file per rank
    log_filename = f'log_rank_{utils.get_rank()}.log'
    utils.setup_logging(os.path.join(save_path, log_filename))

    if args.env:
        utils.log_env_info()

    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    # automatically set train_iter_size based on train_global_batch_size,
    # world_size and per-worker train_batch_size
    if args.train_global_batch_size is not None:
        global_bs = args.train_global_batch_size
        bs = args.train_batch_size
        world_size = utils.get_world_size()
        # the global batch must be evenly divisible across workers
        assert global_bs % (bs * world_size) == 0
        args.train_iter_size = global_bs // (bs * world_size)
        logging.info(f'Global batch size was set in the config, '
                     f'Setting train_iter_size to {args.train_iter_size}')

    # derive per-worker and shuffling seeds from the master seed so every
    # rank gets a distinct but reproducible RNG stream
    worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed, args.epochs,
                                                      device)
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # build tokenizer; vocabulary may be padded depending on math mode
    pad_vocab = utils.pad_vocabulary(args.math)
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME),
                          pad_vocab)

    # build datasets
    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_TRAINING, sync=False)
    gnmt_print(key=mlperf_log.TRAIN_HP_MAX_SEQ_LEN,
               value=args.max_length_train,
               sync=False)

    # lazy (on-the-fly tokenized) unsorted training set
    train_data = LazyParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    gnmt_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
               value=len(train_data),
               sync=False)

    # validation set is sorted (more efficient batching for evaluation)
    val_data = ParallelDataset(src_fname=os.path.join(args.dataset_dir,
                                                      config.SRC_VAL_FNAME),
                               tgt_fname=os.path.join(args.dataset_dir,
                                                      config.TGT_VAL_FNAME),
                               tokenizer=tokenizer,
                               min_len=args.min_length_val,
                               max_len=args.max_length_val,
                               sort=True)

    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_EVAL, sync=False)

    # source-only test set used for BLEU evaluation
    test_data = TextDataset(src_fname=os.path.join(args.dataset_dir,
                                                   config.SRC_TEST_FNAME),
                            tokenizer=tokenizer,
                            min_len=args.min_length_test,
                            max_len=args.max_length_test,
                            sort=True)

    gnmt_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
               value=len(test_data),
               sync=False)

    vocab_size = tokenizer.vocab_size
    gnmt_print(key=mlperf_log.PREPROC_VOCAB_SIZE, value=vocab_size, sync=False)

    # build GNMT model (time-major: batch_first=False)
    model_config = {
        'hidden_size': args.hidden_size,
        'num_layers': args.num_layers,
        'dropout': args.dropout,
        'batch_first': False,
        'share_embedding': args.share_embedding
    }
    model = GNMT(vocab_size=vocab_size, **model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)

    # optimizer_extra is a literal-evaluated dict of extra optimizer kwargs
    opt_config = {'optimizer': args.optimizer, 'lr': args.lr}
    opt_config.update(literal_eval(args.optimizer_extra))
    logging.info(f'Training optimizer config: {opt_config}')

    scheduler_config = {
        'warmup_steps': args.warmup_steps,
        'remain_steps': args.remain_steps,
        'decay_interval': args.decay_interval,
        'decay_steps': args.decay_steps,
        'decay_factor': args.decay_factor
    }

    logging.info(f'Training LR schedule config: {scheduler_config}')

    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info(f'Number of parameters: {num_parameters}')

    batching_opt = {
        'shard_size': args.shard_size,
        'num_buckets': args.num_buckets
    }
    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.train_batch_size,
                                         seeds=shuffling_seeds,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         batching=args.batching,
                                         batching_opt=batching_opt,
                                         num_workers=args.train_loader_workers)

    gnmt_print(key=mlperf_log.INPUT_BATCH_SIZE,
               value=args.train_batch_size * utils.get_world_size(),
               sync=False)
    gnmt_print(key=mlperf_log.INPUT_SIZE,
               value=train_loader.sampler.num_samples,
               sync=False)

    val_loader = val_data.get_loader(batch_size=args.val_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.val_loader_workers)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       pad=True,
                                       num_workers=args.test_loader_workers)

    gnmt_print(key=mlperf_log.EVAL_SIZE,
               value=len(test_loader.dataset),
               sync=False)

    # translator runs beam-search decoding on the test set and computes
    # BLEU; it signals early stopping when target_bleu is reached
    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_test,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir,
                            target_bleu=args.target_bleu,
                            save_path=args.save_path)

    # create trainer; total iteration count drives the LR schedule
    total_train_iters = len(train_loader) // args.train_iter_size * args.epochs
    save_info = {
        'model_config': model_config,
        'config': args,
        'tokenizer': tokenizer.get_state()
    }
    trainer_options = dict(criterion=criterion,
                           grad_clip=args.grad_clip,
                           iter_size=args.train_iter_size,
                           save_path=save_path,
                           save_freq=args.save_freq,
                           save_info=save_info,
                           opt_config=opt_config,
                           scheduler_config=scheduler_config,
                           train_iterations=total_train_iters,
                           batch_first=batch_first,
                           keep_checkpoints=args.keep_checkpoints,
                           math=args.math,
                           print_freq=args.print_freq,
                           cuda=args.cuda,
                           distributed=distributed,
                           intra_epoch_eval=args.intra_epoch_eval,
                           translator=translator)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # optionally resume from a checkpoint (a directory resolves to its
    # 'model_best.pth')
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    best_loss = float('inf')
    break_training = False
    test_bleu = None
    gnmt_print(key=mlperf_log.TRAIN_LOOP, sync=True)
    for epoch in range(args.start_epoch, args.epochs):
        logging.info(f'Starting epoch {epoch}')
        gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=epoch, sync=True)

        # reshuffle shards deterministically per epoch
        train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        # evaluate on validation set
        if args.eval:
            logging.info(f'Running validation on dev set')
            val_loss, val_perf = trainer.evaluate(val_loader)

            # remember best prec@1 and save checkpoint (rank 0 only)
            gnmt_print(key=mlperf_log.TRAIN_CHECKPOINT, sync=False)
            if args.rank == 0:
                is_best = val_loss < best_loss
                best_loss = min(val_loss, best_loss)
                trainer.save(save_all=args.save_all, is_best=is_best)

        if args.eval:
            gnmt_print(key=mlperf_log.EVAL_START, value=epoch, sync=True)
            # break_training becomes True once target BLEU is reached
            test_bleu, break_training = translator.run(calc_bleu=True,
                                                       epoch=epoch)
            gnmt_print(key=mlperf_log.EVAL_ACCURACY,
                       value={
                           "epoch": epoch,
                           "value": round(test_bleu, 2)
                       },
                       sync=False)
            gnmt_print(key=mlperf_log.EVAL_TARGET,
                       value=args.target_bleu,
                       sync=False)
            gnmt_print(key=mlperf_log.EVAL_STOP, sync=True)

        # per-epoch accuracy summary
        acc_log = []
        acc_log += [f'Summary: Epoch: {epoch}']
        acc_log += [f'Training Loss: {train_loss:.4f}']
        if args.eval:
            acc_log += [f'Validation Loss: {val_loss:.4f}']
            acc_log += [f'Test BLEU: {test_bleu:.2f}']

        # per-epoch throughput summary
        perf_log = []
        perf_log += [f'Performance: Epoch: {epoch}']
        perf_log += [f'Training: {train_perf:.0f} Tok/s']
        if args.eval:
            perf_log += [f'Validation: {val_perf:.0f} Tok/s']

        if args.rank == 0:
            logging.info('\t'.join(acc_log))
            logging.info('\t'.join(perf_log))

        logging.info(f'Finished epoch {epoch}')
        if break_training:
            break

    # success == target BLEU reached before exhausting all epochs
    gnmt_print(key=mlperf_log.RUN_STOP,
               value={"success": bool(break_training)},
               sync=True)
    gnmt_print(key=mlperf_log.RUN_FINAL, sync=False)
# --- Example #11 ---
def main():
    """
    Launches translation (inference).

    Inference is executed on a single GPU, implementation supports beam
    search with length normalization and coverage penalty.

    Loads a checkpoint (deserialized to CPU), rebuilds the tokenizer and
    the GNMT model from the checkpointed state, then runs one inference
    pass for every (math, batch_size, beam_size) combination requested on
    the command line.

    Raises:
        ValueError: if an entry of ``args.math`` is neither 'fp32' nor
            'fp16'.
    """
    args = parse_args()
    utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    setup_logging()

    if args.env:
        utils.log_env_info()

    logging.info(f'Run arguments: {args}')

    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # restore tokenizer state and rebuild the GNMT model from the
    # checkpointed configuration and weights
    tokenizer = Tokenizer()
    tokenizer.set_state(checkpoint['tokenizer'])
    vocab_size = tokenizer.vocab_size
    model_config = checkpoint['model_config']
    model_config['batch_first'] = args.batch_first
    model = GNMT(vocab_size=vocab_size, **model_config)
    model.load_state_dict(checkpoint['state_dict'])

    # sweep over every requested (math, batch size, beam size) combination
    for (math, batch_size, beam_size) in product(args.math, args.batch_size,
                                                 args.beam_size):
        logging.info(f'math: {math}, batch size: {batch_size}, '
                     f'beam size: {beam_size}')
        # Select tensor precision. Fail fast on an unsupported mode: the
        # previous if/if chain left `dtype` unbound on the first iteration
        # (NameError) or silently reused the previous iteration's dtype.
        if math == 'fp32':
            dtype = torch.FloatTensor
        elif math == 'fp16':
            dtype = torch.HalfTensor
        else:
            raise ValueError(f'Invalid math mode: {math}')
        model.type(dtype)

        if args.cuda:
            model = model.cuda()
        model.eval()

        # construct the dataset (source side only, optionally sorted)
        test_data = TextDataset(src_fname=args.input,
                                tokenizer=tokenizer,
                                sort=args.sort)

        # build the data loader
        test_loader = test_data.get_loader(batch_size=batch_size,
                                           batch_first=args.batch_first,
                                           shuffle=False,
                                           pad=True,
                                           num_workers=0)

        # build the translator object
        translator = Translator(model=model,
                                tokenizer=tokenizer,
                                loader=test_loader,
                                beam_size=beam_size,
                                max_seq_len=args.max_seq_len,
                                len_norm_factor=args.len_norm_factor,
                                len_norm_const=args.len_norm_const,
                                cov_penalty_factor=args.cov_penalty_factor,
                                cuda=args.cuda,
                                print_freq=args.print_freq,
                                dataset_dir=args.dataset_dir)

        # execute the inference
        translator.run(calc_bleu=args.bleu,
                       eval_path=args.output,
                       reference_path=args.reference,
                       summary=True)
def main():
    """
    Launches data-parallel multi-gpu training.

    MLPerf-compliance variant: configures the compliance logger, optionally
    pre-initializes apex DDP allreduce streams/groups, builds the model,
    trainer, datasets and translator, pre-allocates training buffers, then
    runs the epoch loop with per-epoch BLEU evaluation and early stopping
    once the target BLEU is reached. The mlperf_print calls mark required
    compliance points; their order is significant.
    """

    mlperf_compliance.mlperf_log.LOGGER.propagate = False

    # configure the compliance logger for the GNMT benchmark
    mlperf_compliance.mlperf_log.setdefault(
        root_dir=os.path.dirname(os.path.abspath(__file__)),
        benchmark=mlperf_compliance.constants.GNMT,
        stack_offset=1,
        extra_print=False
        )

    mlperf_print(key=mlperf_compliance.constants.INIT_START,
                 log_all_ranks=True)

    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    distributed = utils.init_distributed(args.cuda)

    # preinit and warmup streams/ groups for apex DDP communicators:
    # issue one dummy all_reduce per (group, stream) pair so the NCCL
    # communicators exist before training starts
    allreduce_communicators=None
    if distributed and args.apex_num_allreduce_streams > 1:
        bucket_pgs = [torch.distributed.new_group() for _ in range(args.apex_num_allreduce_streams)]
        bucket_streams = [torch.cuda.Stream() for _ in range(args.apex_num_allreduce_streams)]
        for pg, stream in zip(bucket_pgs,bucket_streams):
            with torch.cuda.stream(stream):
                torch.distributed.all_reduce(torch.cuda.FloatTensor(1), group=pg)
        allreduce_communicators=(bucket_pgs,bucket_streams)

    args.rank = utils.get_rank()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results
    save_path = os.path.join(args.results_dir, args.save)
    args.save_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging: one log file per rank
    log_filename = f'log_rank_{utils.get_rank()}.log'
    utils.setup_logging(args.log_all_ranks,
                        os.path.join(save_path, log_filename))

    if args.env:
        utils.log_env_info()

    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    # automatically set train_iter_size based on train_global_batch_size,
    # world_size and per-worker train_batch_size
    if args.train_global_batch_size is not None:
        global_bs = args.train_global_batch_size
        bs = args.train_batch_size
        world_size = utils.get_world_size()
        # the global batch must be evenly divisible across workers
        assert global_bs % (bs * world_size) == 0
        args.train_iter_size = global_bs // (bs * world_size)
        logging.info(f'Global batch size was set in the config, '
                     f'Setting train_iter_size to {args.train_iter_size}')
    # setup L2 promotion
    if args.cuda:
        utils.l2_promote()

    # derive per-worker and shuffling seeds from the master seed
    worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed, args.epochs,
                                                      device)
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # build tokenizer; vocabulary padding per MLPerf policy:
    # https://github.com/mlperf/policies/issues/201
    pad_vocab = utils.pad_vocabulary(args.math)
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME),
                          pad_vocab)

    vocab_size = tokenizer.vocab_size

    # build GNMT model (time-major; optional fused attention)
    model_config = {'hidden_size': args.hidden_size,
                    'num_layers': args.num_layers,
                    'dropout': args.dropout, 'batch_first': False,
                    'share_embedding': args.share_embedding,
                    'fusion': args.fused_attention}
    model = GNMT(vocab_size=vocab_size, **model_config)
    logging.info(model)

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing,
                                args.fused_xentropy)

    # optimizer_extra is a literal-evaluated dict of extra optimizer kwargs
    opt_config = {'optimizer': args.optimizer, 'lr': args.lr}
    opt_config.update(literal_eval(args.optimizer_extra))
    logging.info(f'Training optimizer config: {opt_config}')

    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info(f'Number of parameters: {num_parameters}')

    # create trainer (fp16 loss-scaling config included)
    save_info = {'model_config': model_config, 'config': args, 'tokenizer':
                 tokenizer.get_state()}
    loss_scaling = {'init_scale': args.init_scale, 'upscale_interval':
                    args.upscale_interval}
    trainer_options = dict(
        criterion=criterion,
        grad_clip=args.grad_clip,
        iter_size=args.train_iter_size,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info=save_info,
        opt_config=opt_config,
        batch_first=model.batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        loss_scaling=loss_scaling,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed,
        distributed_overlap_allreduce=args.enable_apex_allreduce_overlap,
        distributed_overlap_num_allreduce_streams=args.apex_num_allreduce_streams,
        distributed_overlap_allreduce_messagesize=args.apex_message_size,
        distributed_overlap_allreduce_communicators=allreduce_communicators,
        intra_epoch_eval=args.intra_epoch_eval,
        prealloc_mode=args.prealloc_mode)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # pre-allocate training buffers for the maximum batch/sequence size
    trainer.preallocate(args.train_batch_size, args.max_length_train,
                        training=True)

    # initialization done; RUN_START marks the beginning of the timed run
    mlperf_print(key=mlperf_compliance.constants.INIT_STOP,
                 sync=True)
    mlperf_print(key=mlperf_compliance.constants.RUN_START,
                 sync=True)
    utils.barrier()

    mlperf_print(key=mlperf_compliance.constants.MAX_SEQUENCE_LENGTH,
                 value=args.max_length_train,
                 metadata={'method': 'discard'})

    # training data: either a pre-tokenized binary blob or a lazily
    # tokenized parallel corpus
    if args.use_preproc_data:
        train_data = PreprocessedDataset(
            min_len=args.min_length_train,
            max_len=args.max_length_train,
            vocab_size=tokenizer.vocab_size,
            )
        train_data.read_data(
            os.path.join(args.preproc_data_dir, 'training.bin'),
            tokenizer.vocab_size,
            )
        train_data.prepare()
    else:
        train_data = LazyParallelDataset(
            src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
            tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
            tokenizer=tokenizer,
            min_len=args.min_length_train,
            max_len=args.max_length_train,
            sort=False,
            max_size=args.max_size,
            )

    # source-only test set used for BLEU evaluation
    test_data = TextDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_test,
        max_len=args.max_length_test,
        sort=True)

    batching_opt = {'shard_size': args.shard_size,
                    'num_buckets': args.num_buckets}

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.train_batch_size,
                                         seeds=shuffling_seeds,
                                         batch_first=model.batch_first,
                                         shuffle=True,
                                         batching=args.batching,
                                         batching_opt=batching_opt,
                                         num_workers=args.train_loader_workers)

    mlperf_print(key=mlperf_compliance.constants.GLOBAL_BATCH_SIZE,
                 value=args.train_batch_size * utils.get_world_size(),
                 sync=False)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=model.batch_first,
                                       shuffle=False,
                                       num_workers=args.test_loader_workers)

    # translator runs beam-search decoding and BLEU scoring; it signals
    # early stopping when target_bleu is reached
    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_test,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir,
                            target_bleu=args.target_bleu,
                            save_path=args.save_path)

    # total iteration count drives the LR schedule
    total_train_iters = len(train_loader) // args.train_iter_size * args.epochs

    scheduler_config = {'warmup_steps': args.warmup_steps,
                        'remain_steps': args.remain_steps,
                        'decay_interval': args.decay_interval,
                        'decay_steps': args.decay_steps,
                        'decay_factor': args.decay_factor}

    logging.info(f'Training LR schedule config: {scheduler_config}')
    scheduler = WarmupMultiStepLR(trainer.optimizer, total_train_iters,
                                  **scheduler_config)
    trainer.scheduler = scheduler
    trainer.translator = translator

    # optionally resume from a checkpoint (a directory resolves to its
    # 'model_best.pth')
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(
                checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    break_training = False
    test_bleu = None
    for epoch in range(args.start_epoch, args.epochs):
        # compliance epoch markers use 1-based epoch numbering
        mlperf_print(key=mlperf_compliance.constants.BLOCK_START,
                     metadata={'first_epoch_num': epoch + 1,
                               'epoch_count': 1},
                     sync=True)
        mlperf_print(key=mlperf_compliance.constants.EPOCH_START,
                     metadata={'epoch_num': epoch + 1},
                     sync=True)

        logging.info(f'Starting epoch {epoch}')
        # reshuffle shards deterministically per epoch
        train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        mlperf_print(key=mlperf_compliance.constants.EPOCH_STOP,
                     metadata={'epoch_num': epoch + 1},
                     sync=True)

        # per-epoch BLEU evaluation; break_training becomes True once the
        # target BLEU is reached
        if args.eval:
            mlperf_print(key=mlperf_compliance.constants.EVAL_START,
                         metadata={'epoch_num': epoch + 1},
                         sync=True)
            test_bleu, break_training = translator.run(calc_bleu=True,
                                                       epoch=epoch)
            mlperf_print(key=mlperf_compliance.constants.EVAL_ACCURACY,
                         value=test_bleu,
                         metadata={'epoch_num': epoch + 1},
                         sync=False)
            mlperf_print(key=mlperf_compliance.constants.EVAL_STOP,
                         metadata={'epoch_num': epoch + 1},
                         sync=True)

        # per-epoch accuracy summary
        acc_log = []
        acc_log += [f'Summary: Epoch: {epoch}']
        acc_log += [f'Training Loss: {train_loss:.4f}']
        if args.eval:
            acc_log += [f'Test BLEU: {test_bleu:.2f}']

        # per-epoch throughput summary
        perf_log = []
        perf_log += [f'Performance: Epoch: {epoch}']
        perf_log += [f'Training: {train_perf:.0f} Tok/s']

        if args.rank == 0:
            logging.info('\t'.join(acc_log))
            logging.info('\t'.join(perf_log))

        logging.info(f'Finished epoch {epoch}')
        mlperf_print(key=mlperf_compliance.constants.BLOCK_STOP,
                     metadata={'first_epoch_num': epoch + 1},
                     sync=True)

        if break_training:
            break

    # release resources held by the preprocessed dataset, if used
    if args.use_preproc_data:
        train_data.finalize()

    # 'success' == target BLEU reached before exhausting all epochs
    status = 'success' if break_training else 'aborted'
    mlperf_print(key=mlperf_compliance.constants.RUN_STOP,
                 metadata={'status': status},
                 sync=True)
# --- Example #13 ---
def main():
    """
    Launches translation (inference).

    Inference is executed on a single GPU; the implementation supports beam
    search with length normalization and coverage penalty.

    Raises:
        RuntimeError: if fp16 math is requested without CUDA.
        ValueError: if ``args.math`` is not one of {'fp32', 'fp16'}.
    """
    args = parse_args()

    # Log everything (DEBUG+) to 'log.log'; mirror INFO+ to the console.
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(levelname)s - %(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S",
                        filename='log.log',
                        filemode='w')
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    logging.info(args)

    # Sanity-check device / math configuration up front.
    if args.cuda:
        torch.cuda.set_device(0)
    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if args.math == 'fp16' and not args.cuda:
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # build GNMT model from the config stored inside the checkpoint
    tokenizer = checkpoint['tokenizer']
    vocab_size = tokenizer.vocab_size
    model_config = dict(vocab_size=vocab_size,
                        math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))
    model_config['batch_first'] = args.batch_first
    model = GNMT(**model_config)

    # strip 'module.' prefixes if the checkpoint came from DistributedDataParallel
    state_dict = checkpoint['state_dict']
    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)

    model.load_state_dict(state_dict)

    # BUGFIX: the original used two independent `if` statements, so any
    # unrecognized args.math left `dtype` unbound and model.type(dtype)
    # crashed with a confusing NameError. Fail fast with a clear message.
    if args.math == 'fp32':
        dtype = torch.FloatTensor
    elif args.math == 'fp16':
        dtype = torch.HalfTensor
    else:
        raise ValueError(f'unsupported math mode: {args.math!r} '
                         '(expected "fp32" or "fp16")')

    model.type(dtype)
    if args.cuda:
        model = model.cuda()
    model.eval()

    # construct the dataset (source side only; no targets for inference)
    test_data = TextDataset(src_fname=args.input,
                            tokenizer=tokenizer,
                            sort=False)

    # build the data loader; shuffle/drop_last disabled to keep output order
    test_loader = test_data.get_loader(batch_size=args.batch_size,
                                       batch_first=args.batch_first,
                                       shuffle=False,
                                       num_workers=0,
                                       drop_last=False)

    # build the translator object
    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_seq_len,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir)

    # execute the inference
    translator.run(calc_bleu=args.bleu,
                   eval_path=args.output,
                   reference_path=args.reference,
                   summary=True)
Exemple #14
0
def main():
    """
    Evaluates GNMT translation quality, optionally under fault injection
    (``FI``) and/or distiller-based post-training quantization.

    Loads a checkpointed GNMT model, translates the test split batch by
    batch with a manual encode/decode loop, scores per-sentence and
    corpus-level BLEU, and records timing statistics plus the injection
    record.
    """
    execution_timer = time.time()

    tfiargs = tfiParser.getParser()
    args = tfiargs.parse_args()

    # import os
    # os.environ['CUDA_LAUNCH_BLOCKING']='1'

    # Full determinism when a seed is given (numpy, random, torch, cudnn).
    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True

    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        # NOTE(review): cudnn.benchmark=True autotunes kernels and can make
        # runs non-deterministic even with cudnn.deterministic set above.
        cudnn.benchmark = True
        print("Use GPU: {} for training".format(args.gpu))

    # Deserialize the checkpoint to CPU first to save GPU memory.
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    vocab_size = checkpoint['tokenizer'].vocab_size

    # Rebuild the model config stored (as a string) inside the checkpoint.
    model_config = dict(vocab_size=vocab_size,
                        math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))

    model_config['batch_first'] = args.batch_first

    model = models.GNMT(**model_config)

    state_dict = checkpoint['state_dict']

    # Strip 'module.' prefixes if the checkpoint came from a distributed run.
    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)

    model.load_state_dict(state_dict)

    if args.gpu is not None:
        model = model.cuda()

    tokenizer = checkpoint['tokenizer']

    # Test split with fixed length bounds; sort=False preserves file order.
    test_data = ParallelDataset(src_fname=os.path.join(args.data,
                                                       config.SRC_TEST_FNAME),
                                tgt_fname=os.path.join(args.data,
                                                       config.TGT_TEST_FNAME),
                                tokenizer=tokenizer,
                                min_len=0,
                                max_len=150,
                                sort=False)

    test_loader = test_data.get_loader(batch_size=args.batch_size,
                                       batch_first=True,
                                       shuffle=False,
                                       num_workers=0,
                                       drop_last=False,
                                       distributed=False)

    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_seq_len,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.gpu is not None)

    model.eval()
    # torch.cuda.empty_cache()

    # NOTE(review): `record` is only defined when record_prefix is given,
    # but it is used unconditionally further down (addBleuScores,
    # setBleuScoreAvg, saveRecord, and the test_file path) — running with
    # args.record_prefix=None raises NameError/TypeError. Confirm whether
    # record_prefix is effectively mandatory.
    if args.record_prefix is not None:
        record = Record('GNMTv2',
                        batch_size=args.batch_size,
                        injection=args.injection,
                        fiLayer=args.layer,
                        fiFeatures=args.fiFeats,
                        fiWeights=args.fiWeights)
    # Faulty Run
    if args.faulty:
        # Wrap the model with the fault injector; traverseModel swaps in
        # instrumented layers (count reported below).
        fi = FI(model,
                record=record,
                fiMode=args.injection,
                fiLayer=args.layer,
                fiBit=args.bit,
                fiFeatures=args.fiFeats,
                fiWeights=args.fiWeights,
                log=args.log)

        traverse_time = AverageMeter()
        start = time.time()
        fi.traverseModel(model)
        traverse_time.update(time.time() - start)

        displayConfig(args)
        fi.injectionMode = True
        print("\n Number of new layers: #%d \n" % fi.numNewLayers)

    elif args.golden:
        # Golden (fault-free) reference run: convert LSTMs to distiller
        # modules so the graph matches the quantized/faulty variants.
        import distiller.modules as dist
        model = dist.convert_model_to_distiller_lstm(model)

    if args.quantize:
        # Per-layer overrides: leave attention and the final classifier
        # unclipped during activation quantization.
        overrides_yaml = """
        .*att_rnn.attn.*:
            clip_acts: NONE # Quantize without clipping
        decoder.classifier.classifier:
            clip_acts: NONE # Quantize without clipping
        """
        from distiller.utils import yaml_ordered_load
        overrides = yaml_ordered_load(
            overrides_yaml)  # Basic quantizer defintion

        # NOTE(review): hard-coded absolute path; stats_file is also unused
        # below (args.quant_stats_file is passed instead) — confirm intent.
        stats_file = '/home/bfgoldstein/torchfi/examples/wmt16/model_stats.yaml'

        quantizer = tfi.FIPostTraLinearQuantizer(
            model,
            mode=args.quant_mode,
            bits_activations=args.quant_bacts,
            bits_parameters=args.quant_bwts,
            bits_accum=args.quant_baccum,
            per_channel_wts=args.quant_channel,
            clip_acts=args.quant_cacts,
            model_activation_stats=args.quant_stats_file,
            overrides=overrides,
            clip_n_stds=args.quant_cnstds,
            scale_approx_mult_bits=args.quant_scalebits)
        quantizer.prepare_model()
        # model = quantizer.model
        if args.faulty:
            # Propagate quantization parameters to the injector so bit
            # flips land in the quantized representation.
            fi.setQuantParams(args)

    print(model._modules.items())

    # Setting model to evaluation mode and cuda (if enabled) after FI traverse
    model.eval()
    if args.gpu is not None:
        model = model.cuda()

    # Tokenized translations are written here; 'fp32' tag is hard-coded in
    # the record prefix even for quantized runs — presumably intentional,
    # verify.
    test_file = open(args.record_prefix +
                     getRecordPrefix(args, 'fp32', faulty=args.faulty) +
                     ".tok",
                     'w',
                     encoding='UTF-8')

    # Running meters for timing / throughput / BLEU statistics.
    batch_time = AverageMeter(False)
    tot_tok_per_sec = AverageMeter(False)
    iterations = AverageMeter(False)
    enc_seq_len = AverageMeter(False)
    dec_seq_len = AverageMeter(False)
    bleu_score = AverageMeter(False)
    score_time = AverageMeter(False)
    stats = {}

    # Reference sentences, indexed in corpus order for per-sentence BLEU.
    reference_content = readReferenceFile(args)

    for batch_idx, (input, target, indices) in enumerate(test_loader):
        translate_timer = time.time()
        input_data, input_lenght = input

        # Batch dimension position depends on the layout convention.
        if translator.batch_first:
            batch_size = input_data.size(0)
        else:
            batch_size = input_data.size(1)
        beam_size = args.beam_size

        # One BOS token per (sentence, beam) pair to seed the decoder.
        bos = [translator.insert_target_start] * (batch_size * beam_size)
        bos = torch.LongTensor(bos)

        if translator.batch_first:
            bos = bos.view(-1, 1)
        else:
            bos = bos.view(1, -1)

        input_lenght = torch.LongTensor(input_lenght)
        stats['total_enc_len'] = int(input_lenght.sum())

        if args.gpu is not None:
            input_data = input_data.cuda(args.gpu, non_blocking=True)
            input_lenght = input_lenght.cuda(args.gpu, non_blocking=True)
            bos = bos.cuda(args.gpu, non_blocking=True)

        # Manual encode + generate (mirrors Translator internals so the FI
        # hooks see the raw encoder/decoder calls).
        with torch.no_grad():
            context = translator.model.encode(input_data, input_lenght)
            context = [context, input_lenght, None]

            if beam_size == 1:
                generator = translator.generator.greedy_search
            else:
                generator = translator.generator.beam_search

            preds, lengths, counter = generator(batch_size, bos, context)

        # Re-arm injection for the next batch (generation may disable it
        # after firing — TODO confirm against FI implementation).
        if args.faulty:
            fi.injectionMode = True

        stats['total_dec_len'] = lengths.sum().item()
        stats['iters'] = counter

        preds = preds.cpu()
        lengths = lengths.cpu()

        # Detokenize predictions, dropping BOS (index 0) and EOS (at end).
        output = []
        for idx, pred in enumerate(preds):
            end = lengths[idx] - 1
            pred = pred[1:end]
            pred = pred.tolist()
            out = translator.tok.detokenize(pred)
            output.append(out)

        # Undo the loader's internal ordering to restore corpus order.
        output = [output[indices.index(i)] for i in range(len(output))]

        for line_idx, line in enumerate(output):
            score_timer = time.time()
            detok_sentence = detokenizeSentence(args, line)
            # Global sentence index used to pick the matching reference.
            chunk = (batch_idx * batch_size) + line_idx
            score = scoreBleuSentence(args, detok_sentence,
                                      reference_content[chunk])
            bleu_score.update(score)
            record.addBleuScores(score)
            # Get timing
            elapsed = time.time() - score_timer
            score_time.update(elapsed)
            test_file.write(line)
            test_file.write('\n')

        # Get timing
        elapsed = time.time() - translate_timer
        batch_time.update(elapsed, batch_size)

        # Throughput counts both encoder input and decoder output tokens.
        total_tokens = stats['total_dec_len'] + stats['total_enc_len']
        ttps = total_tokens / elapsed
        tot_tok_per_sec.update(ttps, batch_size)

        iterations.update(stats['iters'])
        enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
        dec_seq_len.update(stats['total_dec_len'] / batch_size, batch_size)

        if batch_idx % args.print_freq == 0:
            print('[Test {}] Time: {:.3f} ({:.3f})\t   \
                    Decoder iters {:.1f} ({:.1f})\t \
                    Tok/s {:.0f} ({:.0f})\n \
                    Bleu score: {:.2f} ({:.2f})\t \
                    Bleu time: {:.3f} ({:.3f})'.format(
                batch_idx, batch_time.val, batch_time.avg, iterations.val,
                iterations.avg, tot_tok_per_sec.val, tot_tok_per_sec.avg,
                bleu_score.val, bleu_score.avg, score_time.val,
                score_time.avg))

    # summary timing
    # NOTE(review): batch_size here is the size of the LAST batch, which may
    # be smaller than args.batch_size — the per-sentence average is skewed.
    time_per_sentence = (batch_time.avg / batch_size)

    print('[Test] Summary \n    \
        Lines translated: {}\t  \
        Avg total tokens/s: {:.0f}\n    \
        Avg time per batch: {:.3f} s\t  \
        Avg time per sentence: {:.3f} ms\n  \
        Avg encoder seq len: {:.2f}\t   \
        Avg decoder seq len: {:.2f}\t   \
        Total decoder iterations: {}\n  \
        Traverse time : {:.3f} s\t  \
        Total number of injections: {}'.format(
        len(test_loader.dataset), tot_tok_per_sec.avg, batch_time.avg,
        1000 * time_per_sentence, enc_seq_len.avg, dec_seq_len.avg,
        int(iterations.sum), traverse_time.val if args.faulty else 0.0,
        int(fi.numInjections) if args.faulty else 0))

    test_file.close()

    # Corpus-level BLEU on the detokenized output file.
    detok = detokenizeFile(args)
    bleu = scoreBleuFile(args, detok)

    record.setBleuScoreAvg(bleu)
    saveRecord(
        args.record_prefix + getRecordPrefix(args, 'fp32', faulty=args.faulty),
        record)

    print('BLEU on test dataset: {}'.format(bleu))
    # Get timing
    execution_elapsed = time.time() - execution_timer
    print('Finished evaluation on test set in {:.2f} seconds'.format(
        execution_elapsed))