Ejemplo n.º 1
0
def train(args):
    # parse config
    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    valid_config = merge_configs(config, 'valid', vars(args))
    print_configs(train_config, 'Train')
    train_model = models.get_model(args.model_name, train_config, mode='train')
    valid_model = models.get_model(args.model_name, valid_config, mode='valid')

    # build model
    startup = fluid.Program()
    train_prog = fluid.Program()
    if args.fix_random_seed:
        startup.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup):
        with fluid.unique_name.guard():
            train_model.build_input(use_dataloader=True)
            train_model.build_model()
            # for the input, has the form [data1, data2,..., label], so train_feeds[-1] is label
            train_feeds = train_model.feeds()
            train_fetch_list = train_model.fetches()
            train_loss = train_fetch_list[0]
            optimizer = train_model.optimizer()
            optimizer.minimize(train_loss)
            train_dataloader = train_model.dataloader()

    valid_prog = fluid.Program()
    with fluid.program_guard(valid_prog, startup):
        with fluid.unique_name.guard():
            valid_model.build_input(use_dataloader=True)
            valid_model.build_model()
            valid_feeds = valid_model.feeds()
            valid_fetch_list = valid_model.fetches()
            valid_dataloader = valid_model.dataloader()

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup)

    if args.resume:
        # if resume weights is given, load resume weights directly
        assert os.path.exists(args.resume + '.pdparams'), \
                "Given resume weight dir {}.pdparams not exist.".format(args.resume)
        fluid.load(train_prog, model_path=args.resume, executor=exe)
    else:
        # if not in resume mode, load pretrain weights
        if args.pretrain:
            assert os.path.exists(args.pretrain), \
                    "Given pretrain weight dir {} not exist.".format(args.pretrain)
        pretrain = args.pretrain or train_model.get_pretrain_weights()
        if pretrain:
            train_model.load_pretrain_params(exe, pretrain, train_prog, place)

    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = True
    if args.model_name in ['CTCN']:
        build_strategy.enable_sequential_execution = True

    exec_strategy = fluid.ExecutionStrategy()

    compiled_train_prog = fluid.compiler.CompiledProgram(
        train_prog).with_data_parallel(loss_name=train_loss.name,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy)
    compiled_valid_prog = fluid.compiler.CompiledProgram(
        valid_prog).with_data_parallel(share_vars_from=compiled_train_prog,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy)

    # get reader
    bs_denominator = 1
    if args.use_gpu:
        # check number of GPUs
        gpus = os.getenv("CUDA_VISIBLE_DEVICES", "")
        if gpus == "":
            pass
        else:
            gpus = gpus.split(",")
            num_gpus = len(gpus)
            assert num_gpus == train_config.TRAIN.num_gpus, \
                   "num_gpus({}) set by CUDA_VISIBLE_DEVICES " \
                   "shoud be the same as that " \
                   "set in {}({})".format(
                   num_gpus, args.config, train_config.TRAIN.num_gpus)
        bs_denominator = train_config.TRAIN.num_gpus

    train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /
                                        bs_denominator)
    valid_config.VALID.batch_size = int(valid_config.VALID.batch_size /
                                        bs_denominator)
    train_reader = get_reader(args.model_name.upper(), 'train', train_config)
    valid_reader = get_reader(args.model_name.upper(), 'valid', valid_config)

    # get metrics
    train_metrics = get_metrics(args.model_name.upper(), 'train', train_config)
    valid_metrics = get_metrics(args.model_name.upper(), 'valid', valid_config)

    epochs = args.epoch or train_model.epoch_num()

    exe_places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    train_dataloader.set_sample_list_generator(train_reader, places=exe_places)
    valid_dataloader.set_sample_list_generator(valid_reader, places=exe_places)

    train_with_dataloader(exe,
                          train_prog,
                          compiled_train_prog,
                          train_dataloader,
                          train_fetch_list,
                          train_metrics,
                          epochs=epochs,
                          log_interval=args.log_interval,
                          valid_interval=args.valid_interval,
                          save_dir=args.save_dir,
                          save_model_name=args.model_name,
                          fix_random_seed=args.fix_random_seed,
                          compiled_test_prog=compiled_valid_prog,
                          test_dataloader=valid_dataloader,
                          test_fetch_list=valid_fetch_list,
                          test_metrics=valid_metrics,
                          is_profiler=args.is_profiler,
                          profiler_path=args.profiler_path)
Ejemplo n.º 2
0
def train(args):
    # implement distributed training by fleet
    use_fleet = True
    if use_fleet:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        args.num_trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        args.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        print('-------------', args.num_trainers, args.trainer_id)

    if args.trainer_id == 0:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)

    # parse config
    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    print_configs(train_config, 'Train')
    train_model = models.get_model(args.model_name, train_config, mode='train')

    # build model
    startup = fluid.Program()
    train_prog = fluid.Program()
    if args.fix_random_seed:
        startup.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup):
        with fluid.unique_name.guard():
            train_model.build_input(use_dataloader=True)
            train_model.build_model()
            # for the input, has the form [data1, data2,..., label], so train_feeds[-1] is label
            train_feeds = train_model.feeds()
            train_fetch_list = train_model.fetches()
            train_loss = train_fetch_list[0]
            optimizer = train_model.optimizer()

            if use_fleet:
                optimizer = fleet.distributed_optimizer(optimizer)

            optimizer.minimize(train_loss)
            train_dataloader = train_model.dataloader()

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup)

    if args.resume:
        # if resume weights is given, load resume weights directly
        assert os.path.exists(args.resume + '.pdparams'), \
                "Given resume weight dir {}.pdparams not exist.".format(args.resume)
        fluid.load(train_prog, model_path=args.resume, executor=exe)
    else:
        # if not in resume mode, load pretrain weights
        if args.pretrain:
            assert os.path.exists(args.pretrain), \
                    "Given pretrain weight dir {} not exist.".format(args.pretrain)
        pretrain = args.pretrain or train_model.get_pretrain_weights()
        if pretrain:
            train_model.load_pretrain_params(exe, pretrain, train_prog, place)

    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = True
    if args.model_name in ['CTCN']:
        build_strategy.enable_sequential_execution = True

    exec_strategy = fluid.ExecutionStrategy()

    if use_fleet:
        compiled_train_prog = fleet.main_program
    else:
        compiled_train_prog = fluid.compiler.CompiledProgram(
            train_prog).with_data_parallel(loss_name=train_loss.name,
                                           build_strategy=build_strategy,
                                           exec_strategy=exec_strategy)
    # get reader
    bs_denominator = 1
    if args.use_gpu:
        # check number of GPUs
        gpus = os.getenv("CUDA_VISIBLE_DEVICES", "")
        if gpus == "":
            pass
        else:
            gpus = gpus.split(",")
            num_gpus = len(gpus)
            assert num_gpus == train_config.TRAIN.num_gpus, \
                   "num_gpus({}) set by CUDA_VISIBLE_DEVICES " \
                   "shoud be the same as that " \
                   "set in {}({})".format(
                   num_gpus, args.config, train_config.TRAIN.num_gpus)
        bs_denominator = train_config.TRAIN.num_gpus

    train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /
                                        bs_denominator)
    train_reader = get_reader(args.model_name.upper(), 'train', train_config)

    # get metrics
    train_metrics = get_metrics(args.model_name.upper(), 'train', train_config)

    epochs = args.epoch or train_model.epoch_num()
    exe_places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    train_dataloader.set_batch_generator(train_reader, places=place)

    train_with_dataloader(exe,
                          train_prog,
                          compiled_train_prog,
                          train_dataloader,
                          train_fetch_list,
                          train_metrics,
                          epochs=epochs,
                          log_interval=args.log_interval,
                          save_dir=args.save_dir,
                          num_trainers=args.num_trainers,
                          trainer_id=args.trainer_id,
                          save_model_name=args.model_name,
                          fix_random_seed=args.fix_random_seed,
                          is_profiler=args.is_profiler,
                          profiler_path=args.profiler_path)