Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Multi-GPU - Local rank"
            )
    params = parser.parse_args()

    # Initialize distributed training
    if params.local_rank != -1:
        torch.cuda.set_device(params.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method='env://')
    trainer = Enc_Dec_Trainer(params)

    # Create dump_path if it does not exist
    if not os.path.exists(config.dump_path):
        os.makedirs(config.dump_path)

    # Save config in dump_path
    with open(os.path.join(config.dump_path, "config.pkl"), 'wb') as f:
        pickle.dump(config, f)

    # Create logger for each process
    logger = create_logger(os.path.join(config.dump_path, 'train.log'),
                           rank=getattr(params, 'local_rank', 0))

    # Start epoch training
    for i_epoch in range(trainer.epoch_size):
        data_iter = iter(trainer.iterators["train"].get_iterator(True, True))
        for i_batch, raw_batch in enumerate(data_iter):
            trainer.train_step(raw_batch)
            trainer.iter()
        scores = trainer.valid_step()
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch()
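
All of these snippets assume they are started by the PyTorch distributed launcher, which sets --local_rank and the rendezvous variables that init_method='env://' reads (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE). A minimal, self-contained sketch of that initialization step, with illustrative single-node defaults that are not part of the original code:

import os

import torch
import torch.distributed as dist


def init_distributed(local_rank):
    """Sketch of the env:// initialization used in the examples above."""
    if local_rank == -1:
        # Single-process run: no process group is created
        return False
    # Illustrative single-node defaults; a launcher normally sets these
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    os.environ.setdefault("RANK", str(local_rank))
    os.environ.setdefault("WORLD_SIZE", "1")
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl", init_method="env://")
    return True
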
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="Multi-GPU - Local rank")
    parser.add_argument("--raw_src",
                        type=str,
                        default=None,
                        help="Tokenized source train file")
    parser.add_argument("--raw_tgt",
                        type=str,
                        default=None,
                        help="Tokenized target train file")
    parser.add_argument("--continue_path",
                        type=str,
                        default=None,
                        help="Where to reload checkpoint")
    parser.add_argument("--dump_path",
                        type=str,
                        default=None,
                        help="Where to store checkpoints")
    parser.add_argument("--reload_network_only",
                        action='store_true',
                        help="Whether reload optimizer states")
    params = parser.parse_args()

    if params.raw_src is not None:
        config.SRC_RAW_TRAIN_PATH = params.raw_src
    if params.raw_tgt is not None:
        config.TGT_RAW_TRAIN_PATH = params.raw_tgt
    if params.continue_path is not None:
        config.continue_path = params.continue_path
    if params.dump_path is not None:
        config.dump_path = params.dump_path
    config.reload_network_only = params.reload_network_only

    # Initialize distributed training
    if params.local_rank != -1:
        torch.cuda.set_device(params.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method='env://')
    trainer = Enc_Dec_Trainer(params)

    # Create dump_path if it does not exist (main process only)
    if params.local_rank == 0 or not config.multi_gpu:
        if not os.path.exists(config.dump_path):
            os.makedirs(config.dump_path)

        # Save config in dump_path
        with open(os.path.join(config.dump_path, "config.pkl"), 'wb') as f:
            pickle.dump(config, f)
    torch.distributed.barrier()

    # Create logger for each process
    logger = create_logger(os.path.join(config.dump_path, 'train.log'),
                           rank=getattr(params, 'local_rank', 0))

    # Start epoch training
    for i_epoch in range(trainer.epoch_size):
        if trainer.epoch > trainer.epoch_size:
            break

        if not config.multi_gpu or int(os.environ["NGPUS"]) == 1:
            # Single GPU, no need to split the dataset
            data_iter = iter(trainer.iterators["train"].get_iterator(
                True, True))
            n_batches = None
        else:
            if params.local_rank == 0:
                # Split dataset into NGPUS subsets, with the same number of batches
                # Store NGPUS subsets in config.data_bin
                subset_batches = trainer.iterators["train"].get_batch_ids(
                    shuffle=True,
                    group_by_size=True,
                    num_subsets=int(os.environ["NGPUS"]))

                for i_sub in range(len(subset_batches)):
                    with open(
                            os.path.join(config.data_bin,
                                         "batches_" + str(i_sub)),
                            'wb') as f:
                        pickle.dump(subset_batches[i_sub], f)

            torch.distributed.barrier()
            # Each process reads its own subset
            with open(
                    os.path.join(config.data_bin,
                                 "batches_" + str(params.local_rank)),
                    'rb') as f:
                subset_batches = pickle.load(f)
            n_batches = len(subset_batches)
            print("Process {}, n_batches is {}".format(params.local_rank,
                                                       n_batches))
            data_iter = iter(trainer.iterators["train"].get_batches_iterator(
                subset_batches))
            num_train = sum([len(b) for b in subset_batches])
            trainer.num_train = num_train

        for i_batch, raw_batch in enumerate(data_iter):
            try:
                if n_batches is not None and i_batch == n_batches - 1:
                    # Debug output: size of the last batch in this subset
                    print(raw_batch.src.size())
                trainer.train_step(raw_batch)
                trainer.iter()
            except RuntimeError:
                # Skip batches that fail at runtime (e.g. CUDA out of memory)
                continue

        scores = trainer.valid_step()
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
        torch.distributed.barrier()
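
The multi-GPU branch above uses a simple file-based hand-off: rank 0 splits the batch ids into NGPUS subsets, pickles one file per rank, and every rank waits at a barrier before loading its own file. Below is a stripped-down sketch of that pattern; the round-robin split and the split_and_share_batches helper are illustrative stand-ins for the repo's get_batch_ids, not its actual implementation:

import os
import pickle

import torch.distributed as dist


def split_and_share_batches(batch_ids, data_bin, rank, ngpus):
    """Sketch: rank 0 writes one pickled subset per rank, every rank reads its own."""
    if rank == 0:
        os.makedirs(data_bin, exist_ok=True)
        # Round-robin split so the subsets have (almost) the same batch count
        subsets = [batch_ids[i::ngpus] for i in range(ngpus)]
        for i_sub, subset in enumerate(subsets):
            path = os.path.join(data_bin, "batches_" + str(i_sub))
            with open(path, 'wb') as f:
                pickle.dump(subset, f)
    dist.barrier()  # the files must exist before the other ranks read them
    with open(os.path.join(data_bin, "batches_" + str(rank)), 'rb') as f:
        return pickle.load(f)
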
Example #3
def MASS_main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="Multi-GPU - Local rank")
    parser.add_argument("--continue_path",
                        type=str,
                        default=None,
                        help="Where to reload checkpoint")
    parser.add_argument("--dump_path",
                        type=str,
                        default=None,
                        help="Where to store checkpoints")
    parser.add_argument('--data_bin',
                        default=None,
                        type=str,
                        help="Path to store binarized data")
    parser.add_argument('--epoch_size',
                        default=None,
                        type=int,
                        help="Maximum train epochs")
    parser.add_argument('--eval_only',
                        action="store_true",
                        help="Only perform evaluation")
    params = parser.parse_args()

    if params.continue_path is not None:
        config.continue_path = params.continue_path
    if params.dump_path is not None:
        config.dump_path = params.dump_path
    if params.data_bin is not None:
        config.data_bin = params.data_bin
        # Note: these paths are built by plain string concatenation, so
        # config.data_bin is expected to end with a path separator
        config.train_iter_dump_path = config.data_bin + "train_iter"
        config.valid_iter_dump_path = config.data_bin + "valid_iter"
        config.total_vocab_dump_path = config.data_bin + "TOTAL"
    if params.epoch_size is not None:
        config.epoch_size = params.epoch_size

    # Initialize distributed training
    if params.local_rank != -1:
        torch.cuda.set_device(params.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method='env://')
    trainer = Enc_Dec_Trainer(params)

    # Create dump_path if it does not exist (main process only)
    if params.local_rank == 0 or not config.multi_gpu:
        if not os.path.exists(config.dump_path):
            os.makedirs(config.dump_path)

        # Save config in dump_path
        with open(os.path.join(config.dump_path, "config.pkl"), 'wb') as f:
            pickle.dump(config, f)
    torch.distributed.barrier()

    # Create logger for each process
    logger = create_logger(os.path.join(config.dump_path, 'train.log'),
                           rank=getattr(params, 'local_rank', 0))

    if params.eval_only:
        trainer.valid_step()
        exit()

    # Start epoch training
    for i_epoch in range(trainer.epoch_size):
        if trainer.epoch > trainer.epoch_size:
            break

        if not config.multi_gpu or int(os.environ["NGPUS"]) == 1:
            # Single GPU, no need to split the dataset
            subset_batches = trainer.iterators["train"].get_batch_ids(
                shuffle=True, group_by_size=True)
            data_iter = iter(trainer.iterators["train"].get_batches_iterator(
                subset_batches))
            trainer.num_train = sum([len(b) for b in subset_batches]) * len(
                config.LANS)
        else:
            if params.local_rank == 0:
                # Split dataset into NGPUS subsets, with the same number of batches
                # Store NGPUS subsets in config.data_bin
                subset_batches = trainer.iterators["train"].get_batch_ids(
                    shuffle=True,
                    group_by_size=True,
                    num_subsets=int(os.environ["NGPUS"]))

                for i_sub in range(len(subset_batches)):
                    with open(
                            os.path.join(config.data_bin,
                                         "batches_" + str(i_sub)),
                            'wb') as f:
                        pickle.dump(subset_batches[i_sub], f)

            torch.distributed.barrier()
            # Each process reads its own subset
            with open(
                    os.path.join(config.data_bin,
                                 "batches_" + str(params.local_rank)),
                    'rb') as f:
                subset_batches = pickle.load(f)
            trainer.num_train = sum([len(b) for b in subset_batches]) * len(
                config.LANS)

            data_iter = iter(trainer.iterators["train"].get_batches_iterator(
                subset_batches))

        for i_batch, raw_batch in enumerate(data_iter):
            try:
                keys = list(raw_batch.keys())
                random.shuffle(keys)
                for k in keys:
                    trainer.mass_step(raw_batch[k], k)
                    trainer.iter()
                    torch.distributed.barrier()
            except RuntimeError:
                # Skip batches that fail at runtime (e.g. CUDA out of memory)
                continue

        scores = trainer.valid_step()
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
        torch.distributed.barrier()
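
Each MASS batch above is a dict keyed by language, and the inner loop shuffles the keys so the per-language updates run in a different order every batch, with a barrier after each step to keep the ranks in lockstep. A tiny sketch of that shuffle-and-step pattern; the step and sync callables are placeholders, not the repo's API:

import random


def shuffled_language_steps(raw_batch, step, sync=None):
    """Sketch: run one update per language of the batch, in random order."""
    keys = list(raw_batch.keys())
    random.shuffle(keys)
    for k in keys:
        step(raw_batch[k], k)  # e.g. trainer.mass_step(raw_batch[k], k)
        if sync is not None:
            sync()             # e.g. torch.distributed.barrier()

# Usage sketch:
# shuffled_language_steps(raw_batch, trainer.mass_step,
#                         sync=torch.distributed.barrier)
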
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="Multi-GPU - Local rank")
    params = parser.parse_args()

    # Initialize distributed training
    if params.local_rank != -1:
        torch.cuda.set_device(params.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method='env://')
    trainer = Enc_Dec_Trainer(params)

    # Create dump_path if it does not exist
    if not os.path.exists(config.dump_path):
        os.makedirs(config.dump_path)

    # Save config in dump_path
    with open(os.path.join(config.dump_path, "config.pkl"), 'wb') as f:
        pickle.dump(config, f)

    # Create logger for each process
    logger = create_logger(os.path.join(config.dump_path, 'train.log'),
                           rank=getattr(params, 'local_rank', 0))

    # Start epoch training
    for i_epoch in range(trainer.epoch_size):
        if trainer.epoch > trainer.epoch_size:
            break

        if not config.multi_gpu or int(os.environ["NGPUS"]) == 1:
            # Single GPU, no need to split the dataset
            data_iter = iter(trainer.iterators["train"].get_iterator(
                True, True))
        else:
            if params.local_rank == 0:
                if not os.path.exists(config.data_bin):
                    os.makedirs(config.data_bin)

                # Split dataset into NGPUS subsets, with the same number of batches
                # Store NGPUS subsets in config.data_bin
                subset_batches = trainer.iterators["train"].get_batch_ids(
                    shuffle=True,
                    group_by_size=True,
                    num_subsets=int(os.environ["NGPUS"]))

                for i_sub in range(len(subset_batches)):
                    with open(
                            os.path.join(config.data_bin,
                                         "batches_" + str(i_sub)),
                            'wb') as f:
                        pickle.dump(subset_batches[i_sub], f)

            torch.distributed.barrier()
            # Each process reads its own subset
            with open(
                    os.path.join(config.data_bin,
                                 "batches_" + str(params.local_rank)),
                    'rb') as f:
                subset_batches = pickle.load(f)
            data_iter = iter(trainer.iterators["train"].get_batches_iterator(
                subset_batches))
            num_train = sum([len(b) for b in subset_batches])
            trainer.num_train = num_train

        for i_batch, raw_batch in enumerate(data_iter):
            try:
                trainer.train_step(raw_batch)
                trainer.iter()
            except RuntimeError:
                # Skip batches that fail at runtime (e.g. CUDA out of memory)
                continue

        scores = trainer.valid_step()
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch()
        torch.distributed.barrier()
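
The bare except RuntimeError: continue in these training loops silently drops any batch that fails, which in practice is usually a CUDA out-of-memory error on an oversized batch. A hedged sketch of a more explicit variant of that guard; the logger and the cache clearing are additions, not part of the original code:

import logging

import torch

logger = logging.getLogger("train")


def safe_train_step(trainer, raw_batch):
    """Sketch: run one step, but log and skip batches that raise RuntimeError."""
    try:
        trainer.train_step(raw_batch)
        trainer.iter()
    except RuntimeError as e:
        # Typically CUDA OOM on an unusually long batch: free cached memory
        # and move on instead of killing the whole run
        logger.warning("Skipping batch after RuntimeError: %s", e)
        torch.cuda.empty_cache()
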
Example #5
def Multi_MT_main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="Multi-GPU - Local rank")
    parser.add_argument("--continue_path",
                        type=str,
                        default=None,
                        help="Where to reload checkpoint")
    parser.add_argument("--dump_path",
                        type=str,
                        default=None,
                        help="Where to store checkpoints")
    parser.add_argument('--data_bin',
                        default=None,
                        type=str,
                        help="Path to store binarized data")
    parser.add_argument('--epoch_size',
                        default=None,
                        type=int,
                        help="Maximum train epochs")
    parser.add_argument('--eval_only',
                        action="store_true",
                        help="Only perform evaluation")
    params = parser.parse_args()

    if params.continue_path is not None:
        config.continue_path = params.continue_path
    if params.dump_path is not None:
        config.dump_path = params.dump_path
    if params.data_bin is not None:
        config.data_bin = params.data_bin
        # Note: these paths are built by plain string concatenation, so
        # config.data_bin is expected to end with a path separator
        config.train_iter_dump_path = config.data_bin + "train_iter"
        config.valid_iter_dump_path = config.data_bin + "valid_iter"
        config.total_vocab_dump_path = config.data_bin + "TOTAL"
    if params.epoch_size is not None:
        config.epoch_size = params.epoch_size

    # Initialize distributed training
    if params.local_rank != -1:
        torch.cuda.set_device(params.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method='env://')
    trainer = Enc_Dec_Trainer(params)

    # Create dump_path if it does not exist (main process only)
    if params.local_rank == 0 or not config.multi_gpu:
        if not os.path.exists(config.dump_path):
            os.makedirs(config.dump_path)

        # Save config in dump_path
        with open(os.path.join(config.dump_path, "config.pkl"), 'wb') as f:
            pickle.dump(config, f)
    torch.distributed.barrier()

    # Create logger for each process
    logger = create_logger(os.path.join(config.dump_path, 'train.log'),
                           rank=getattr(params, 'local_rank', 0))

    if params.eval_only:
        trainer.valid_step()
        exit()

    def check_epoch_end(flags):
        # True once every translation direction has finished its pass
        return all(flags.values())

    # Start epoch training
    for i_epoch in range(trainer.epoch_size):
        if trainer.epoch > trainer.epoch_size:
            break

        should_end_epoch = {}
        for direction in trainer.iterators["train"].keys():
            should_end_epoch[direction] = False

        data_iter = {}
        for direction in trainer.iterators["train"].keys():
            data_iter[direction] = reset_train_iter(trainer, params, direction)

        while not check_epoch_end(should_end_epoch):
            for direction, para_train_iter in data_iter.items():
                try:
                    src_lan, tgt_lan = direction.split('-')
                    raw_batch = next(para_train_iter)
                    trainer.multi_mt_step(raw_batch, src_lan, tgt_lan)
                    trainer.iter()
                except StopIteration:
                    # This direction has finished its pass: restart its iterator
                    # and mark it so the epoch ends once all directions are done
                    data_iter[direction] = reset_train_iter(
                        trainer, params, direction)
                    should_end_epoch[direction] = True

        scores = trainer.valid_step()
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
        torch.distributed.barrier()
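
Example #5 cycles round-robin over one iterator per translation direction and ends the epoch only after every direction has been exhausted at least once, restarting exhausted iterators so the remaining directions keep getting interleaved updates. A self-contained sketch of that control flow, with plain Python iterators standing in for reset_train_iter and the trainer; all names below are placeholders:

def round_robin_epoch(make_iter, directions, step):
    """Sketch: interleave one batch per direction until all are exhausted once."""
    iters = {d: make_iter(d) for d in directions}
    finished = {d: False for d in directions}
    while not all(finished.values()):
        for d in directions:
            try:
                batch = next(iters[d])
            except StopIteration:
                iters[d] = make_iter(d)  # restart so other directions keep data
                finished[d] = True
                continue
            step(batch, d)

# Usage sketch with toy data:
# data = {"en-de": [1, 2, 3], "de-en": [10, 20]}
# round_robin_epoch(lambda d: iter(data[d]), list(data),
#                   lambda b, d: print(d, b))
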