Example #1
    def optimize(self, loss, optimizer_type, lr):
        # `F` and `cfleet` are assumed aliases from the enclosing file, e.g.
        # paddle.fluid and the collective fleet module.
        optimizer = F.optimizer.Adam(learning_rate=lr)
        dist_strategy = DistributedStrategy()
        dist_strategy.enable_sequential_execution = True
        optimizer = cfleet.distributed_optimizer(optimizer,
                                                 strategy=dist_strategy)
        _, param_grads = optimizer.minimize(loss, F.default_startup_program())
Example #2
    def optimize(self, metrics):
        """
        Optimize the model by metrics (mainly `metrics["loss"]`).
        """
        # TODO: support dygraph
        if self.warmup_steps > 0:
            scheduled_lr = layers.learning_rate_scheduler.noam_decay(
                1 / (self.warmup_steps * (self.learning_rate**2)),
                self.warmup_steps)
        else:
            scheduled_lr = layers.create_global_var(
                name=fluid.unique_name.generate("learning_rate"),
                shape=[1],
                value=self.learning_rate,
                dtype="float32",
                persistable=True)
        grad_clip = fluid.clip.GradientClipByGlobalNorm(self.max_grad_norm)

        self.optimizer = AdamW(learning_rate=scheduled_lr,
                               grad_clip=grad_clip,
                               weight_decay=self.weight_decay)

        if self.is_distributed:
            self.optimizer = fleet.distributed_optimizer(
                self.optimizer, strategy=self.dist_strategy)

        self.optimizer.minimize(metrics["loss"])
        return scheduled_lr
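Note on the warmup branch above: the value 1 / (warmup_steps * learning_rate**2) passed to noam_decay is the d_model argument of the Noam schedule, lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5); with that choice the scheduled rate peaks at exactly learning_rate once warmup finishes. A minimal arithmetic sketch (plain Python, assuming the standard Noam formula rather than quoting Paddle's implementation):

def noam_lr(step, warmup_steps, learning_rate):
    # d_model is chosen as in the example above so the peak equals learning_rate
    d_model = 1.0 / (warmup_steps * learning_rate ** 2)
    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

# At the last warmup step the scheduled LR equals the configured learning rate.
assert abs(noam_lr(4000, 4000, 1e-3) - 1e-3) < 1e-9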
Example #3
    def test_open_sync_batch_norm(self):
        import paddle.fluid as fluid
        import paddle.fluid.incubate.fleet.base.role_maker as role_maker
        from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy

        if not fluid.core.is_compiled_with_cuda():
            # Operator "gen_nccl_id" has not been registered
            return

        data = fluid.layers.data(name='X', shape=[1], dtype='float32')
        hidden = fluid.layers.fc(input=data, size=10)
        loss = fluid.layers.mean(hidden)

        optimizer = fluid.optimizer.AdamOptimizer()

        role = role_maker.UserDefinedCollectiveRoleMaker(0, ['127.0.0.1:6170'])
        fleet.init(role)

        dist_strategy = DistributedStrategy()
        dist_strategy.sync_batch_norm = True

        dist_optimizer = fleet.distributed_optimizer(optimizer,
                                                     strategy=dist_strategy)
        dist_optimizer.minimize(loss)

        self.assertEqual(dist_strategy.exec_strategy.num_threads, 1)
Example #4
def build_program(is_train, main_prog, startup_prog, args, dist_strategy=None):
    image_shape = [int(m) for m in args.image_shape.split(",")]
    model_name = args.model
    model_list = [m for m in dir(models) if "__" not in m]
    assert model_name in model_list, "{} is not in lists: {}".format(args.model,
                                                                     model_list)
    model = models.__dict__[model_name]()
    with fluid.program_guard(main_prog, startup_prog):
        use_mixup = args.use_mixup
        if is_train and use_mixup:
            py_reader = fluid.layers.py_reader(
                capacity=16,
                shapes=[[-1] + image_shape, [-1, 1], [-1, 1], [-1, 1]],
                lod_levels=[0, 0, 0, 0],
                dtypes=["float32", "int64", "int64", "float32"],
                use_double_buffer=True)
        else:
            py_reader = fluid.layers.py_reader(
                capacity=16,
                shapes=[[-1] + image_shape, [-1, 1]],
                lod_levels=[0, 0],
                dtypes=["float32", "int64"],
                use_double_buffer=True)

        with fluid.unique_name.guard():
            if is_train and use_mixup:
                image, y_a, y_b, lam = fluid.layers.read_file(py_reader)
                avg_cost = net_config(image=image, y_a=y_a, y_b=y_b, lam=lam, model=model, args=args, label=0, is_train=True)
                avg_cost.persistable = True
                build_program_out = [py_reader, avg_cost]
            else:
                image, label = fluid.layers.read_file(py_reader)
                avg_cost, acc_top1, acc_top5 = net_config(image, model, args, label=label, is_train=is_train)
                avg_cost.persistable = True
                acc_top1.persistable = True
                acc_top5.persistable = True
                build_program_out = [py_reader, avg_cost, acc_top1, acc_top5]

            if is_train:
                params = model.params
                params["total_images"] = args.total_images
                params["lr"] = args.lr
                params["num_epochs"] = args.num_epochs
                params["learning_strategy"]["batch_size"] = args.batch_size
                params["learning_strategy"]["name"] = args.lr_strategy
                params["l2_decay"] = args.l2_decay
                params["momentum_rate"] = args.momentum_rate

                optimizer = optimizer_setting(params)
                global_lr = optimizer._global_learning_rate()
                if args.fp16:
                    optimizer = fluid.contrib.mixed_precision.decorate(optimizer)
                dist_optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
                _, param_grads = dist_optimizer.minimize(avg_cost)

                global_lr.persistable = True
                build_program_out.append(global_lr)

    return build_program_out
Example #5
def get_distributed_optimizer(optimizer):
    """
    Get the default collective distributed optimizer under fleet.
    """
    dist_strategy = DistributedStrategy()
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    return optimizer
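A hedged usage sketch for the helper above, reusing the tiny network from the sync_batch_norm test earlier; it assumes the process was started by a collective launcher (e.g. python -m paddle.distributed.launch) so that PaddleCloudRoleMaker can read the trainer environment variables:

import paddle.fluid as fluid

data = fluid.layers.data(name='X', shape=[1], dtype='float32')
hidden = fluid.layers.fc(input=data, size=10)
loss = fluid.layers.mean(hidden)

# Wrap a plain Adam optimizer with the default collective strategy.
dist_opt = get_distributed_optimizer(fluid.optimizer.Adam(learning_rate=1e-3))
dist_opt.minimize(loss)

exe = fluid.Executor(fluid.CUDAPlace(0))
exe.run(fluid.default_startup_program())
# Training then iterates over fleet.main_program, as in the later examples.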
Example #6
def setup_optimizer(optimizer, use_cuda, is_distributed):
    """
    Set up the optimizer, wrapping it with fleet for distributed GPU training.
    """
    if use_cuda:
        if is_distributed:
            dist_strategy = DistributedStrategy()
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)
    # Hand the (possibly fleet-wrapped) optimizer back to the caller.
    return optimizer
Example #7
def distributed_optimize(optimizer):
    '''
    Part of the configuration for distributed training: wrap the optimizer
    with a fused-op DistributedStrategy.
    '''
    strategy = DistributedStrategy()
    strategy.fuse_all_reduce_ops = True
    strategy.nccl_comm_num = 2
    strategy.fuse_elewise_add_act_ops = True
    strategy.fuse_bn_act_ops = True
    return fleet.distributed_optimizer(optimizer, strategy=strategy)
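As a rough call-site sketch (assuming fleet.init has already been called with a collective role maker, as in the surrounding examples), the wrapped optimizer is used like any other, and execution afterwards goes through fleet.main_program:

import paddle.fluid as fluid
# fleet and distributed_optimize come from the definitions above.

x = fluid.data(name='x', shape=[-1, 2], dtype='float32')
y = fluid.layers.fc(x, size=1)
loss = fluid.layers.reduce_sum(y)

optimizer = distributed_optimize(fluid.optimizer.SGD(learning_rate=0.01))
optimizer.minimize(loss)

train_prog = fleet.main_program  # the program to pass to Executor.run afterwards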
Example #8
def build_program(is_train, main_prog, startup_prog, args, dist_strategy=None, data_layout="NCHW"):
    model_name = args.model
    model_list = [m for m in dir(models) if "__" not in m]
    assert model_name in model_list, "{} is not in lists: {}".format(args.model,
                                                                     model_list)
    model = models.__dict__[model_name]()
    with fluid.program_guard(main_prog, startup_prog):
        use_mixup = args.use_mixup
        data_loader, data = utility.create_data_loader(is_train, args, data_layout=data_layout)

        with fluid.unique_name.guard():
            if is_train and use_mixup:
                image, y_a, y_b, lam = data[0], data[1], data[2], data[3]
                avg_cost = net_config(image=image, y_a=y_a, y_b=y_b, lam=lam, model=model,
                                      args=args, label=0, is_train=True, data_format=data_layout)
                avg_cost.persistable = True
                build_program_out = [data_loader, avg_cost]
            else:
                image, label = data[0], data[1]
                avg_cost, acc_top1, acc_top5 = net_config(image, model, args,
                                                          label=label, is_train=is_train, data_format=data_layout)
                avg_cost.persistable = True
                acc_top1.persistable = True
                acc_top5.persistable = True
                build_program_out = [data_loader, avg_cost, acc_top1, acc_top5]

            if is_train:
                params = model.params
                params["total_images"] = args.total_images
                params["lr"] = args.lr
                params["num_epochs"] = args.num_epochs
                params["learning_strategy"]["batch_size"] = args.batch_size
                params["learning_strategy"]["name"] = args.lr_strategy
                params["l2_decay"] = args.l2_decay
                params["momentum_rate"] = args.momentum_rate
                params["use_dgc"] = args.use_dgc
                params["rampup_begin_step"] = args.rampup_begin_step

                optimizer = optimizer_setting(params)
                global_lr = optimizer._global_learning_rate()
                if args.fp16:
                    optimizer = fluid.contrib.mixed_precision.decorate(optimizer,
                                                                       init_loss_scaling=args.scale_loss,
                                                                       use_dynamic_loss_scaling=args.use_dynamic_loss_scaling)
                if args.use_recompute:
                    dist_strategy.forward_recompute = True
                    dist_strategy.enable_sequential_execution = True
                    dist_strategy.recompute_checkpoints = model.checkpoints
                dist_optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
                _, param_grads = dist_optimizer.minimize(avg_cost)

                global_lr.persistable = True
                build_program_out.append(global_lr)

    return build_program_out
Example #9
def setup_optimizer(optimizer, model, use_cuda, is_distributed):
    """
    Set up the optimizer, enabling recompute on the model checkpoints.
    """
    if use_cuda:
        if is_distributed:
            # `dist_strategy` is assumed to be a DistributedStrategy defined in
            # the enclosing scope of the original file.
            dist_strategy.recompute_checkpoints = model.checkpoints
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)
        else:
            optimizer = fluid.optimizer.RecomputeOptimizer(optimizer)
            optimizer._set_checkpoints(model.checkpoints)
    # Hand the configured optimizer back to the caller.
    return optimizer
Example #10
    def test_distributed_basic(self):
        checker = acp._get_checker()
        fs = HDFSClient(checker.hdfs_home, None)
        fs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()

        logger.info("begin test_distributed_basic")
        fs = LocalFS()
        save_dir = "./run_save_0"
        fs.delete(save_dir)

        #basic
        exe, main_prog, startup_prog = self._generate()

        compiled, data_loader, optimizer, loss, image, label = \
            self._init_env(exe, main_prog, startup_prog, minimize=False)

        #fleet
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        with fluid.program_guard(main_prog, startup_prog):
            dist_optimizer = fleet.distributed_optimizer(optimizer)
            dist_optimizer.minimize(loss)

        exe.run(startup_prog)

        o = None
        i = 0
        name = None
        for i in acp.train_epoch_range(3, 0):
            o = acp._get_train_epoch_range()
            name = o.name
            logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i))

            for data in data_loader():
                fetch = exe.run(fleet.main_program,
                                feed=data,
                                fetch_list=[loss])

            self.assertEqual(len(o._exe_status), 1)

        o = acp._get_train_epoch_range()
        assert o is None, "train epoch range must not exist now"
        self.assertEqual(i, 2)

        fs.delete(save_dir)

        logger.info("end test_distributed_basic")
Example #11
    def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
        # Input data
        images = fluid.layers.data(name='pixel',
                                   shape=[1, 28, 28],
                                   dtype=DTYPE)
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')

        # Train program
        predict = cnn_model(images)
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(x=cost)

        # Evaluator
        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
        batch_acc = fluid.layers.accuracy(input=predict,
                                          label=label,
                                          total=batch_size_tensor)

        inference_program = fluid.default_main_program().clone()
        # Optimization
        # TODO(typhoonzero): fix distributed adam optimizer
        # opt = fluid.optimizer.AdamOptimizer(
        #     learning_rate=0.001, beta1=0.9, beta2=0.999)
        if not use_dgc:
            opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)
        else:
            opt = fluid.optimizer.DGCMomentumOptimizer(learning_rate=self.lr,
                                                       momentum=0.9,
                                                       rampup_begin_step=0)

        # Reader
        train_reader = paddle.batch(paddle.dataset.mnist.test(),
                                    batch_size=batch_size)
        test_reader = paddle.batch(paddle.dataset.mnist.test(),
                                   batch_size=batch_size)

        if dist_strategy:
            dist_opt = fleet.distributed_optimizer(optimizer=opt,
                                                   strategy=dist_strategy)
            _, param_grads = dist_opt.minimize(avg_cost)
        else:
            opt.minimize(avg_cost)

        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
Example #12
    def run_gpu_fleet_api_trainer(self, args):
        import paddle.distributed.fleet as fleet
        import paddle.distributed.fleet.base.role_maker as role_maker
        # 1. enable dygraph
        paddle.disable_static()

        # 2. init seed
        seed = 90
        paddle.static.default_startup_program().random_seed = seed
        paddle.static.default_main_program().random_seed = seed
        np.random.seed(seed)
        random.seed(seed)
        # get trainer id
        args.trainer_id = paddle.distributed.get_rank()

        # 3. init parallel env
        if args.update_method == "nccl2":
            fleet.init(is_collective=True)

        # 4. train model
        model, train_reader, opt = self.get_model()
        if args.update_method == "nccl2":
            opt = fleet.distributed_optimizer(opt)
            model = fleet.distributed_model(model)

        out_losses = []
        for step_id, data in enumerate(train_reader()):
            data = self._get_data(data, args)
            if step_id == RUN_STEP:
                break
            loss = self.run_one_loop(model, opt, data)
            out_losses.append(loss.numpy())

            if args.update_method == "nccl2":
                loss = model.scale_loss(loss)

            loss.backward()
            if args.update_method == "nccl2":
                model.apply_collective_grads()

            opt.step()
            opt.clear_grad()
        print_to_out(out_losses)
Example #13
def dist_optimizer(config, optimizer):
    """
    Create a distributed optimizer based on a normal optimizer.
    Args:
        config(dict): configuration dict (not read inside this function).
        optimizer: a normal, single-device optimizer.
    Returns:
        optimizer: a fleet distributed optimizer.
    """
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 3
    exec_strategy.num_iteration_per_drop_scope = 10

    dist_strategy = DistributedStrategy()
    dist_strategy.nccl_comm_num = 1
    dist_strategy.fuse_all_reduce_ops = True
    dist_strategy.exec_strategy = exec_strategy
    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)

    return optimizer
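Note that config is accepted above but not read; the threading and fusion settings are hard-coded. A hedged call-site sketch (the empty config dict is a placeholder, and avg_loss is assumed to be built beforehand in the surrounding training program):

# fluid and dist_optimizer come from the imports and definition above.
base_opt = fluid.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
dist_opt = dist_optimizer(config={}, optimizer=base_opt)
dist_opt.minimize(avg_loss)  # avg_loss defined by the surrounding program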
Example #14
    def _test_check_point(self, fs, dir_path):
        file_name = "persistables"

        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
        label = fluid.data(name='label', shape=[None, 1], dtype='int64')
        feeder = fluid.DataFeeder(feed_list=[image, label],
                                  place=fluid.CPUPlace())
        predict = fluid.layers.fc(input=image, size=10, act='softmax')
        loss = fluid.layers.cross_entropy(input=predict, label=label)
        avg_loss = fluid.layers.mean(loss)
        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)

        dist_optimizer = fleet.distributed_optimizer(optimizer)
        dist_optimizer.minimize(avg_loss)

        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fluid.default_startup_program())

        status = TrainStatus(2)
        fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
        n1 = fleet._get_last_checkpoint_no(dir_path, fs=fs)

        status2 = fleet.load_check_point(exe, dir_path, trainer_id=0, fs=fs)
        self.assertEqual(status2, status)

        fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
        n2 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
        self.assertEqual(n2, n1 + 1)

        fleet.clean_redundant_check_points(dir_path, fs=fs)
Example #15
    def set_optimizer(self, FLAGS, net_output):
        """
        set optimizer
        """
        optimizer = net_output['optimizer']
        if self.is_multi_gpu(FLAGS):
            trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
            num_trainers = int(os.getenv("PADDLE_TRAINERS_NUM"))
            trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
            logging.info("train_id:%s, num_trainers:%s, trainer_endpoints:%s" % (trainer_id,
                        num_trainers, trainer_endpoints))
            
            trainer_endpoints = trainer_endpoints.split(',')

            role = role_maker.UserDefinedCollectiveRoleMaker(current_id=trainer_id,
                    worker_endpoints=trainer_endpoints)
            fleet.init(role)
            
            dist_strategy = DistributedStrategy()
            #num_nodes = len(set([x.split(':')[0] for x in trainer_endpoints]))
            #if num_nodes == 1:
            #    dist_strategy.use_local_sgd = True
                #dist_strategy.mode = "collective" #multi node is nccl2
                #dist_strategy.collective_mode = "local_sgd" # local_sgd or grad_allreduce
            #    logging.info("use local sgd, not nccl2 for single node.")

            """
            #TODO:
            dist_strategy.enable_inplace = FLAGS.with_inplace
            if FLAGS.fuse_ops:
                dist_strategy.fuse_all_reduce_ops = 1
            dist_strategy.nccl_comm_num = FLAGS.nccl_comm_num
            """
            optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)

        return optimizer.minimize(net_output['loss'])
Example #16
fleet.init(role)

# PE training
main_prog = fluid.Program()
start_prog = fluid.Program()
with fluid.program_guard(main_prog, start_prog):
    x = fluid.data(name='x', shape=[-1, 2], dtype='float32')
    label = fluid.data(name='label', shape=[-1, 1], dtype='float32')
    y = fluid.layers.fc(x, size=1, param_attr=fluid.initializer.Constant(1.0))

    cost = fluid.layers.square_error_cost(y, label)
    loss = fluid.layers.reduce_sum(cost)

optimizer = fluid.optimizer.SGD(learning_rate=0.0)
strategy = DistributedStrategy()
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(loss, start_prog)

place = fluid.CUDAPlace(int(os.environ['FLAGS_selected_gpus']))
exe = fluid.Executor(place)
exe.run(start_prog)

train_prog = fleet.main_program
x_data = np.ones(shape=[1, 2], dtype=np.float32)
label_data = np.ones(shape=[1, 1], dtype=np.float32)
out = exe.run(train_prog,
              feed={
                  'x': x_data,
                  'label': label_data
              },
              fetch_list=[loss.name])
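The script above assumes role, the fleet / DistributedStrategy imports, np and os were set up earlier in the file. A minimal, hedged preamble consistent with the other examples on this page, for a single-process run:

import os
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy

# Single-trainer collective environment (normally set by the launcher).
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6170"
os.environ["FLAGS_selected_gpus"] = "0"

role = role_maker.PaddleCloudRoleMaker(is_collective=True)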
Example #17
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_dynamic_loss_scaling=False,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 incr_ratio=2.0,
                 decr_ratio=0.8,
                 dist_strategy=None,
                 use_lamb=False):
    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler\
             .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
                         warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unkown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        if use_lamb:
            optimizer = fluid.optimizer.LambOptimizer(
                learning_rate=scheduled_lr)
        else:
            optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
    else:
        scheduled_lr = fluid.layers.create_global_var(
            name=fluid.unique_name.generate("learning_rate"),
            shape=[1],
            value=learning_rate,
            dtype='float32',
            persistable=True)
        if use_lamb:
            optimizer = fluid.optimizer.LambOptimizer(
                learning_rate=scheduled_lr)
        else:
            optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
        optimizer._learning_rate_map[
            fluid.default_main_program()] = scheduled_lr

    fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
        clip_norm=1.0))

    def exclude_from_weight_decay(name):
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    param_list = dict()

    for param in train_program.global_block().all_parameters():
        param_list[param.name] = param * 1.0
        param_list[param.name].stop_gradient = True

    if dist_strategy is not None:
        # use fleet api
        optimizer = fleet.distributed_optimizer(optimizer,
                                                strategy=dist_strategy)

    _, param_grads = optimizer.minimize(loss)

    if weight_decay > 0:
        for param, grad in param_grads:
            if exclude_from_weight_decay(param.name):
                continue
            with param.block.program._optimized_guard(
                [param, grad]), fluid.framework.name_scope("weight_decay"):
                updated_param = param - param_list[
                    param.name] * weight_decay * scheduled_lr
                fluid.layers.assign(output=param, input=updated_param)

    return scheduled_lr
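The param * 1.0 copies with stop_gradient = True above snapshot each parameter before the Adam/LAMB step so that weight decay can then be applied as a separate, decoupled update: param_new = param_adam - weight_decay * scheduled_lr * param_old. A small numeric sketch of that rule (plain NumPy, independent of Paddle):

import numpy as np

lr, wd = 1e-3, 0.01
param = np.array([0.5, -0.2])
grad = np.array([0.1, 0.3])

param_old = param.copy()                      # the frozen param * 1.0 snapshot
param_adam = param - lr * grad                # stand-in for the Adam/LAMB update
param_new = param_adam - wd * lr * param_old  # decoupled weight decay step
print(param_new)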
Example #18
def main(args):
    cfg = XiaoduHiConfig()
    cfg.scene_sensor_algo = 'yolov4'

    wae_ndarray = np.load(os.path.join(args.wae_dir, 'raw_wae.npy'))
    start_epoch = 0

    train_program = fluid.Program()
    startup_program = fluid.Program()

    with fluid.program_guard(train_program, startup_program):
        attention_ctrl = AttentionController(
            inputs_type=args.inputs_type,
            num_actions=wae_ndarray.shape[0],
            act_tr_dim=wae_ndarray.shape[1],
            act_emb_ndarray=wae_ndarray,
            num_frames=cfg.ob_window_len,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            model_dim=args.model_dim,
            num_decoder_blocks=args.num_decoder_blocks,
            num_heads=args.num_heads,
            ffn_dim=args.ffn_dim,
            dropout=args.dropout,
            normalize_before=args.normalize_before,
            frame_emb_trainable=args.frame_emb_trainable,
            trigger_loss_coef=args.trigger_loss_coef,
            obj_loss_coef=args.obj_loss_coef,
            act_loss_coef=args.act_loss_coef,
            use_last_act_loss=args.use_last_act_loss,
            mode='train')
        preds = attention_ctrl.predict()
        test_program = train_program.clone(for_test=True)

        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=args.lr,
            regularization=fluid.regularizer.L2Decay(
                regularization_coeff=0.1))
        if args.distributed_training:
            optimizer = fleet.distributed_optimizer(optimizer)
            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
            fleet.init(role)

        optimizer.minimize(attention_ctrl.loss)

    if args.distributed_training:
        place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))
    else:
        place = fluid.CUDAPlace(args.gpu)

    exe = fluid.Executor(place)
    exe.run(startup_program)

    if args.inputs_type.startswith('inst_crop') and \
       args.inputs_type != 'inst_crop_wo_crop':
        fluid.io.load_vars(
            exe, MobileNetV2_Pretrained,
            main_program=train_program,
            predicate=lambda v: os.path.exists(
                os.path.join(MobileNetV2_Pretrained, v.name)))
        print('Loaded weights from {}'.format(MobileNetV2_Pretrained))

    if args.init_params is not None:
        base = os.path.basename(args.init_params)
        if base.startswith('epoch_'):
            start_epoch = int(base[len('epoch_'):]) + 1

        tb_state = os.path.join(args.init_params, 'tb_state.txt')
        if os.path.exists(tb_state):
            global _update_step
            global _eval_step
            with open(tb_state, 'r') as f:
                update_step, eval_step = f.readline().split(' ')
                _update_step = int(update_step)
                _eval_step = int(eval_step)

        fluid.io.load_vars(
            exe, args.init_params,
            main_program=train_program,
            predicate=lambda v: os.path.exists(
                os.path.join(args.init_params, v.name)))
        print('Loaded weights from {}'.format(args.init_params))

    if args.distributed_training:
        train_worker_gpus = [int(os.environ.get('FLAGS_selected_gpus', 0))]
        test_worker_gpus = train_worker_gpus
    else:
        train_worker_gpus = convert_gpu_ids(args.data_worker_gpus_for_train)
        test_worker_gpus = convert_gpu_ids(args.data_worker_gpus_for_test)

    if not os.path.exists(args.save):
        os.makedirs(args.save)

    if not args.use_decord:
        train_dataloader = XiaoduHiDataloaderv2(
            attention_ctrl.feed_list, [place], args.yolov4_model_dir,
            args.video_tracking_dir, args.train_dataset,
            full_neg_txt=args.full_neg_train,
            batch_size=args.bs,
            num_workers=args.data_workers_for_train,
            worker_gpus=train_worker_gpus,
            roi_feat_resolution=cfg.roi_feat_resolution,
            ob_window_len=cfg.ob_window_len,
            interval=cfg.interval,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            augment=False,
            resample_negs_per_epoch=True)
        test_dataloader = XiaoduHiDataloaderv2(
            attention_ctrl.feed_list, [place], args.yolov4_model_dir,
            args.video_tracking_dir, args.test_dataset,
            full_neg_txt=args.full_neg_test,
            batch_size=args.bs,
            num_workers=args.data_workers_for_test,
            worker_gpus=test_worker_gpus,
            roi_feat_resolution=cfg.roi_feat_resolution,
            ob_window_len=cfg.ob_window_len,
            interval=cfg.interval,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            augment=False,
            resample_negs_per_epoch=False,
            for_test=True)

        test_dataloader.save_to_txt(
            os.path.join(args.save, 'eval_data.txt'), dt=200)
    else:
        train_dataloader = XiaoduHiDecordLoader(
            attention_ctrl.feed_list, [place],
            args.yolov4_model_dir, args.decord_ds_pkl,
            decord_readers=args.decord_readers,
            yolov4_detectors=args.decord_detectors,
            post_workers=args.decord_post_workers,
            batch_size=args.bs,
            detector_gpus=train_worker_gpus,
            roi_feat_resolution=cfg.roi_feat_resolution,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            for_test=False)
        test_dataloader = XiaoduHiDecordLoader(
            attention_ctrl.feed_list, [place],
            args.yolov4_model_dir, args.decord_ds_pkl,
            decord_readers=args.decord_readers,
            yolov4_detectors=args.decord_detectors,
            post_workers=args.decord_post_workers,
            batch_size=args.bs,
            detector_gpus=test_worker_gpus,
            roi_feat_resolution=cfg.roi_feat_resolution,
            tokens_per_frame=cfg.tokens_per_frame,
            visual_token_dim=cfg.visual_token_dim,
            for_test=True)

    train_dataloader.start_workers()
    test_dataloader.start_workers()

    train_log = os.path.join(args.save, 'loss.csv')
    eval_log = os.path.join(args.save, 'eval.txt')
    with open(os.path.join(args.save, 'args.txt'), 'w') as f:
        f.write(str(args))

    tb_writer = SummaryWriter(
        logdir=os.path.join(args.save, 'logdir'),
        purge_step=None if _update_step == 0 else _update_step)

    worker_index = None if not args.distributed_training \
        else fleet.worker_index()

    # if worker_index == 0:
    #     eval_model(exe, test_program, preds, attention_ctrl.act_loss,
    #                test_dataloader, -1, log_file=eval_log,
    #                tb_writer=tb_writer, worker_index=worker_index)
    for epoch_id in range(start_epoch, args.epochs):
        print('--------------- Epoch %d ---------------' % epoch_id)
        train_epoch(exe, train_program, attention_ctrl, train_dataloader,
                    log_file=train_log, tb_writer=tb_writer,
                    worker_index=worker_index)

        save_dir = os.path.join(args.save, 'epoch_{}'.format(epoch_id))
        shutil.rmtree(save_dir, ignore_errors=True)
        os.mkdir(save_dir)
        fluid.io.save_params(exe, save_dir, main_program=train_program)

        if epoch_id > 0 and epoch_id % args.run_eval_after_epochs == 0:
            eval_model(exe, test_program, preds, attention_ctrl.act_loss,
                       test_dataloader, epoch_id, log_file=eval_log,
                       tb_writer=tb_writer)

        tb_state = os.path.join(save_dir, 'tb_state.txt')
        with open(tb_state, 'w') as f:
            f.write('{} {}'.format(_update_step, _eval_step))

    if epoch_id % args.run_eval_after_epochs != 0:
        eval_model(exe, test_program, preds, attention_ctrl.act_loss,
                   test_dataloader, epoch_id, log_file=eval_log,
                   tb_writer=tb_writer, worker_index=worker_index)

    train_dataloader.stop_workers()
    test_dataloader.stop_workers()
Example #19
def main():
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)  # new line 3
    fleet.init(role)  # new line 4
    env = os.environ

    num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 0))
    assert num_trainers != 0, "multi-machine training process must be started using distributed.launch..."
    trainer_id = int(env.get("PADDLE_TRAINER_ID", 0))

    # set different seeds for different trainers
    random.seed(trainer_id)
    np.random.seed(trainer_id)

    if FLAGS.enable_ce:
        random.seed(0)
        np.random.seed(0)

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check whether use_gpu=True is set with a CPU-only build of PaddlePaddle
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    save_only = getattr(cfg, 'save_prediction_only', False)
    if save_only:
        raise NotImplementedError('The config file only support prediction,'
                                  ' training stage is not implemented now')
    main_arch = cfg.architecture

    assert cfg.use_gpu == True, "GPU must be supported for multi-machine training..."
    devices_num = fluid.core.get_cuda_device_count()

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    if FLAGS.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)

                dist_strategy = DistributedStrategy()
                sync_bn = getattr(model.backbone, 'norm_type',
                                  None) == 'sync_bn'
                dist_strategy.sync_batch_norm = sync_bn
                dist_strategy.nccl_comm_num = 1
                exec_strategy = fluid.ExecutionStrategy()
                exec_strategy.num_threads = 3
                exec_strategy.num_iteration_per_drop_scope = 30
                dist_strategy.exec_strategy = exec_strategy
                dist_strategy.fuse_all_reduce_ops = True
                optimizer = fleet.distributed_optimizer(
                    optimizer, strategy=dist_strategy)  # new line 5

                optimizer.minimize(loss)

                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

            if 'use_ema' in cfg and cfg['use_ema']:
                global_steps = _decay_step_counter()
                ema = ExponentialMovingAverage(cfg['ema_decay'],
                                               thres_steps=global_steps)
                ema.update()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader, devices_num=1)
        # In iterable mode, use set_sample_list_generator(eval_reader, place)
        eval_loader.set_sample_list_generator(eval_reader)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(
            fetches, eval_prog, extra_keys)

    exe.run(startup_prog)
    compiled_train_prog = fleet.main_program

    if FLAGS.eval:
        compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'

    ignore_params = cfg.finetune_exclude_pretrained_params \
                 if 'finetune_exclude_pretrained_params' in cfg else []

    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(exe,
                               train_prog,
                               cfg.pretrain_weights,
                               ignore_params=ignore_params)

    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg,
                                 devices_num=devices_num)
    # In iterable mode, use set_sample_list_generator(train_reader, place)
    train_loader.set_sample_list_generator(train_reader)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type is not set, use the default '11point'; it is only used in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_iter, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_iter)
    best_box_ap_list = [0.0, 0]  #[map, iter]

    # use VisualDL to log data
    if FLAGS.use_vdl:
        assert six.PY3, "VisualDL requires Python >= 3.5"
        from visualdl import LogWriter
        vdl_writer = LogWriter(FLAGS.vdl_log_dir)
        vdl_loss_step = 0
        vdl_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use vdl-paddle to log loss
        if FLAGS.use_vdl:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step)
                vdl_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and trainer_id == 0:
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # NOTE : profiler tools, used for benchmark
        if FLAGS.is_profiler and it == 5:
            profiler.start_profiler("All")
        elif FLAGS.is_profiler and it == 10:
            profiler.stop_profiler("total", FLAGS.profiler_path)
            return


        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
           and trainer_id == 0:
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.apply_program)
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                resolution = None
                if 'Mask' in cfg.architecture:
                    resolution = model.mask_head.resolution
                results = eval_run(exe,
                                   compiled_eval_prog,
                                   eval_loader,
                                   eval_keys,
                                   eval_values,
                                   eval_cls,
                                   cfg,
                                   resolution=resolution)
                box_ap_stats = eval_results(results, cfg.metric,
                                            cfg.num_classes, resolution,
                                            is_bbox_normalized,
                                            FLAGS.output_eval, map_type,
                                            cfg['EvalReader']['dataset'])

                # use vdl_paddle to log mAP
                if FLAGS.use_vdl:
                    vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step)
                    vdl_mAP_step += 1

                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.restore_program)

    train_loader.reset()
Example #20
    def do_training(self, fleet, args):
        """
        Begin training.
        Args:
            fleet (Collective): collective Fleet instance (inherits from the Fleet base class).
            args (ArgumentParser): runtime args used to configure the distributed fleet.
        Returns:
            list: the training loss values.
        """
        args = parse_args()
        logging.info(args)
        gpu_id = int(os.environ.get('FLAGS_selected_gpus', 4))
        place = fluid.CUDAPlace(gpu_id)
        dev_count = 1
        exe = fluid.Executor(place)
        train_program = fluid.Program()
        startup_program = fluid.Program()
        args.num_trainers = fleet.worker_num()
        args.trainer_id = fleet.worker_index()
        args.run_params = json.loads(args.run_params)
        dist_strategy = DistributedStrategy()
        dist_strategy.enable_inplace = args.run_params['enable_inplace']
        dist_strategy.fuse_all_reduce_ops = args.run_params[
            'fuse_all_reduce_ops']
        dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
        dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
        dist_strategy.mode = args.run_params["mode"]
        dist_strategy.collective_mode = args.run_params["collective"]

        with fluid.program_guard(train_program, startup_program):
            with fluid.unique_name.guard():
                sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                    ModelHyperParams.src_vocab_size,
                    ModelHyperParams.trg_vocab_size,
                    ModelHyperParams.max_length + 1,
                    ModelHyperParams.n_layer,
                    ModelHyperParams.n_head,
                    ModelHyperParams.d_key,
                    ModelHyperParams.d_value,
                    ModelHyperParams.d_model,
                    ModelHyperParams.d_inner_hid,
                    ModelHyperParams.prepostprocess_dropout,
                    ModelHyperParams.attention_dropout,
                    ModelHyperParams.relu_dropout,
                    ModelHyperParams.preprocess_cmd,
                    ModelHyperParams.postprocess_cmd,
                    ModelHyperParams.weight_sharing,
                    TrainTaskConfig.label_smooth_eps,
                    ModelHyperParams.bos_idx,
                    use_py_reader=args.use_py_reader,
                    is_test=False)
                optimizer = fluid.optimizer.SGD(0.003)
                if args.run_params["fp16"]:
                    optimizer = decorate(optimizer, init_loss_scaling=64.0)
                optimizer = fleet.distributed_optimizer(optimizer,
                                                        strategy=dist_strategy)
                optimizer.minimize(avg_cost, startup_program)
        train_program = fleet.main_program
        exe.run(startup_program)
        train_data = prepare_data_generator(
            args,
            is_test=False,
            count=dev_count,
            pyreader=pyreader,
            py_reader_provider_wrapper=py_reader_provider_wrapper)

        loss_normalizer = -(
            (1. - TrainTaskConfig.label_smooth_eps) * np.log(
                (1. - TrainTaskConfig.label_smooth_eps)) +
            TrainTaskConfig.label_smooth_eps *
            np.log(TrainTaskConfig.label_smooth_eps /
                   (ModelHyperParams.trg_vocab_size - 1) + 1e-20))

        step_idx = 0
        init_flag = True
        result_loss = []
        result_ppl = []
        train_info = []
        for pass_id in six.moves.xrange(args.num_epochs):
            pass_start_time = time.time()
            if args.use_py_reader:
                pyreader.start()
                data_generator = None
            else:
                data_generator = train_data()
            batch_id = 0
            while True:
                try:
                    feed_dict_list = prepare_feed_dict_list(
                        data_generator, init_flag, dev_count)
                    t1 = time.time()
                    outs = exe.run(program=train_program,
                                   fetch_list=[sum_cost.name, token_num.name]
                                   if step_idx % args.fetch_steps == 0 else [],
                                   feed=feed_dict_list)

                    if step_idx % args.fetch_steps == 0:
                        sum_cost_val, token_num_val = np.array(
                            outs[0]), np.array(outs[1])
                        total_sum_cost = sum_cost_val.sum()
                        total_token_num = token_num_val.sum()
                        total_avg_cost = total_sum_cost / total_token_num
                        result_loss.append(total_avg_cost - loss_normalizer)
                        result_ppl.append(
                            np.exp([min(total_avg_cost, 100)]).item(0))
                        train_info.append(result_loss)
                    init_flag = False
                    batch_id += 1
                    step_idx += 1
                    if batch_id >= 5:
                        break
                except (StopIteration, fluid.core.EOFException):
                    if args.use_py_reader:
                        pyreader.reset()
                    break

            train_info = [round(i, 6) for i in train_info[0]]
            return train_info
Example #21
    def build_program(self,
                      is_train=True,
                      use_parallel_test=False,
                      dist_strategy=None):
        model_name = self.model_name
        assert not (is_train and use_parallel_test), \
            "is_train and use_parallel_test cannot be set simultaneously."

        trainer_id = self.trainer_id
        num_trainers = self.num_trainers

        image_shape = [int(m) for m in self.image_shape]
        # model definition
        model = self.model
        if model is None:
            model = resnet.__dict__[model_name](emb_dim=self.emb_dim)
        main_program = self.train_program if is_train else self.test_program
        startup_program = self.startup_program
        with fluid.program_guard(main_program, startup_program):
            with fluid.unique_name.guard():
                image = fluid.layers.data(name='image',
                                          shape=image_shape,
                                          dtype='float32')
                label = fluid.layers.data(name='label',
                                          shape=[1],
                                          dtype='int64')

                emb, loss, prob = model.get_output(
                    input=image,
                    label=label,
                    num_ranks=num_trainers,
                    rank_id=trainer_id,
                    is_train=is_train,
                    num_classes=self.num_classes,
                    loss_type=self.loss_type,
                    param_attr=self.param_attr,
                    bias_attr=self.bias_attr,
                    margin=self.margin,
                    scale=self.scale)

                acc1 = None
                acc5 = None

                if self.loss_type in ["dist_softmax", "dist_arcface"]:
                    if self.calc_train_acc:
                        shard_prob = loss._get_info("shard_prob")

                        prob_all = fluid.layers.collective._c_allgather(
                            shard_prob,
                            nranks=num_trainers,
                            use_calc_stream=True)
                        prob_list = fluid.layers.split(
                            prob_all, dim=0, num_or_sections=num_trainers)
                        prob = fluid.layers.concat(prob_list, axis=1)
                        label_all = fluid.layers.collective._c_allgather(
                            label, nranks=num_trainers, use_calc_stream=True)
                        acc1 = fluid.layers.accuracy(input=prob,
                                                     label=label_all,
                                                     k=1)
                        acc5 = fluid.layers.accuracy(input=prob,
                                                     label=label_all,
                                                     k=5)
                else:
                    if self.calc_train_acc:
                        acc1 = fluid.layers.accuracy(input=prob,
                                                     label=label,
                                                     k=1)
                        acc5 = fluid.layers.accuracy(input=prob,
                                                     label=label,
                                                     k=5)

                optimizer = None
                if is_train:
                    # initialize optimizer
                    optimizer = self._get_optimizer()
                    if self.num_trainers > 1:
                        dist_optimizer = fleet.distributed_optimizer(
                            optimizer, strategy=dist_strategy)
                        dist_optimizer.minimize(loss)
                    else:  # single card training
                        optimizer.minimize(loss)
                    if "dist" in self.loss_type or self.use_fp16:
                        optimizer = optimizer._optimizer
                elif use_parallel_test:
                    emb = fluid.layers.collective._c_allgather(
                        emb, nranks=num_trainers, use_calc_stream=True)
        return emb, loss, acc1, acc5, optimizer
Example #22
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 use_dynamic_loss_scaling=False,
                 init_loss_scaling=1.0,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 incr_ratio=2.0,
                 decr_ratio=0.8,
                 dist_strategy=None):
    """ optimization """
    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler \
                .noam_decay(1 / (warmup_steps * (learning_rate ** 2)),
                            warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unkown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr,
                                         epsilon=1e-06)
        # optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
    else:
        scheduled_lr = fluid.layers.create_global_var(
            name=fluid.unique_name.generate("learning_rate"),
            shape=[1],
            value=learning_rate,
            dtype='float32',
            persistable=True)
        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr,
                                         epsilon=1e-06)
        # optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
        optimizer._learning_rate_map[
            fluid.default_main_program()] = scheduled_lr

    fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
        clip_norm=1.0))

    def exclude_from_weight_decay(name):
        """ exclude_from_weight_decay """
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    param_list = dict()

    loss_scaling = fluid.layers.create_global_var(
        name=fluid.unique_name.generate("loss_scaling"),
        shape=[1],
        value=init_loss_scaling,
        dtype='float32',
        persistable=True)

    if use_fp16:
        loss *= loss_scaling
        param_grads = optimizer.backward(loss)

        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)

        for param, _ in master_param_grads:
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        if use_dynamic_loss_scaling:
            apply_dynamic_loss_scaling(loss_scaling, master_param_grads,
                                       incr_every_n_steps,
                                       decr_every_n_nan_or_inf, incr_ratio,
                                       decr_ratio)

        optimizer.apply_gradients(master_param_grads)

        if weight_decay > 0:
            for param, grad in master_param_grads:
                if exclude_from_weight_decay(param.name.rstrip(".master")):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

        master_param_to_train_param(master_param_grads, param_grads,
                                    train_program)

    else:
        for param in train_program.global_block().all_parameters():
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        if dist_strategy is not None:
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)

        _, param_grads = optimizer.minimize(loss)

        if weight_decay > 0:
            for param, grad in param_grads:
                if exclude_from_weight_decay(param.name):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)
    result = collections.OrderedDict()
    result['scheduled_lr'] = scheduled_lr
    if use_fp16:
        result['loss_scaling'] = loss_scaling
    return result
Example #23
def train(args):
    # implement distributed training by fleet
    use_fleet = True
    if use_fleet:
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        args.num_trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        args.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        print('-------------', args.num_trainers, args.trainer_id)

    if args.trainer_id == 0:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)

    # parse config
    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    print_configs(train_config, 'Train')
    train_model = models.get_model(args.model_name, train_config, mode='train')

    # build model
    startup = fluid.Program()
    train_prog = fluid.Program()
    if args.fix_random_seed:
        startup.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup):
        with fluid.unique_name.guard():
            train_model.build_input(use_dataloader=True)
            train_model.build_model()
            # the input has the form [data1, data2, ..., label], so train_feeds[-1] is the label
            train_feeds = train_model.feeds()
            train_fetch_list = train_model.fetches()
            train_loss = train_fetch_list[0]
            optimizer = train_model.optimizer()

            if use_fleet:
                optimizer = fleet.distributed_optimizer(optimizer)

            optimizer.minimize(train_loss)
            train_dataloader = train_model.dataloader()

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup)

    if args.resume:
        # if resume weights are given, load them directly
        assert os.path.exists(args.resume + '.pdparams'), \
                "Given resume weights {}.pdparams does not exist.".format(args.resume)
        fluid.load(train_prog, model_path=args.resume, executor=exe)
    else:
        # if not in resume mode, load pretrain weights
        if args.pretrain:
            assert os.path.exists(args.pretrain), \
                    "Given pretrain weight dir {} not exist.".format(args.pretrain)
        pretrain = args.pretrain or train_model.get_pretrain_weights()
        if pretrain:
            train_model.load_pretrain_params(exe, pretrain, train_prog, place)

    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = True
    if args.model_name in ['CTCN']:
        build_strategy.enable_sequential_execution = True

    exec_strategy = fluid.ExecutionStrategy()

    if use_fleet:
        compiled_train_prog = fleet.main_program
    else:
        compiled_train_prog = fluid.compiler.CompiledProgram(
            train_prog).with_data_parallel(loss_name=train_loss.name,
                                           build_strategy=build_strategy,
                                           exec_strategy=exec_strategy)
    # get reader
    bs_denominator = 1
    if args.use_gpu:
        # check number of GPUs
        gpus = os.getenv("CUDA_VISIBLE_DEVICES", "")
        if gpus == "":
            pass
        else:
            gpus = gpus.split(",")
            num_gpus = len(gpus)
            assert num_gpus == train_config.TRAIN.num_gpus, \
                   "num_gpus({}) set by CUDA_VISIBLE_DEVICES " \
                   "shoud be the same as that " \
                   "set in {}({})".format(
                   num_gpus, args.config, train_config.TRAIN.num_gpus)
        bs_denominator = train_config.TRAIN.num_gpus

    train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /
                                        bs_denominator)
    train_reader = get_reader(args.model_name.upper(), 'train', train_config)

    # get metrics
    train_metrics = get_metrics(args.model_name.upper(), 'train', train_config)

    epochs = args.epoch or train_model.epoch_num()
    exe_places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    train_dataloader.set_batch_generator(train_reader, places=place)

    train_with_dataloader(exe,
                          train_prog,
                          compiled_train_prog,
                          train_dataloader,
                          train_fetch_list,
                          train_metrics,
                          epochs=epochs,
                          log_interval=args.log_interval,
                          save_dir=args.save_dir,
                          num_trainers=args.num_trainers,
                          trainer_id=args.trainer_id,
                          save_model_name=args.model_name,
                          fix_random_seed=args.fix_random_seed,
                          is_profiler=args.is_profiler,
                          profiler_path=args.profiler_path)
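
The train() function above reads its distributed identity from environment variables (PADDLE_TRAINERS_NUM, PADDLE_TRAINER_ID, FLAGS_selected_gpus). In a real multi-card job these are exported per process by Paddle's distributed launcher; for a quick single-card dry run they can be stubbed by hand. A minimal sketch with illustrative values:

import os

# Stand-ins for what the distributed launcher would normally export per worker.
os.environ.setdefault("PADDLE_TRAINERS_NUM", "1")   # total number of trainers
os.environ.setdefault("PADDLE_TRAINER_ID", "0")     # rank of this trainer
os.environ.setdefault("FLAGS_selected_gpus", "0")   # GPU assigned to this trainer
# train(args) can then run as if it were launched with a single trainer.
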
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    logging.info('Load data ...')
    if len(args.dataset.split(',')) > 1:
        # for large pretraining dataset, ZINC15 and ChEMBL
        # directly load the processed npz files
        train_data_list = []
        for ds in args.dataset.split(','):
            # use processed data.npz
            train_data_list.extend(
                load_data(os.path.join(args.root, ds, 'processed')))
            # dataset = MoleculeDataset(
            #     args.root, ds,
            #     add_symmetry=False,
            #     add_self_loop=False)
            # data_list = dataset.get_data_list()
            # processed_dir = os.path.join(args.root, ds, 'processed')
            # os.makedirs(processed_dir, exist_ok=True)
            # save_data_list_to_npz(
            #     data_list, os.path.join(processed_dir, 'data.npz'))

            # logging.info('Processed {}'.format(ds))
            # train_data_list.extend(data_list)
    else:
        if args.dataset == 'mutag':
            train_data_list, _ = load_mutag_dataset(
                os.path.join(args.root, args.dataset, 'raw'))
        elif args.dataset == 'ptc_mr':
            train_data_list, _ = load_ptc_mr_dataset(
                os.path.join(args.root, args.dataset, 'raw'))
        else:
            raise ValueError('Unsupported dataset')

    if args.is_fleet:
        train_data_list = [
            x for i, x in enumerate(train_data_list)
            if i % fleet.worker_num() == fleet.worker_index()
        ]
    logging.info("Data loaded.")
    logging.info("Train Examples: %s" % len(train_data_list))
    sys.stdout.flush()

    if args.emb_dir is not None:
        os.makedirs(args.emb_dir, exist_ok=True)

    train_prog = F.Program()
    test_prog = F.Program()
    startup_prog = F.Program()
    with F.program_guard(train_prog, startup_prog):
        with F.unique_name.guard():
            agent = create_model(args, config)
            test_prog = train_prog.clone(for_test=True)

            opt = F.optimizer.Adam(learning_rate=args.lr)
            if args.is_fleet:
                dist_strategy = DistributedStrategy()
                role = role_maker.PaddleCloudRoleMaker(is_collective=True)
                fleet.init(role)
                opt = fleet.distributed_optimizer(opt, strategy=dist_strategy)
            opt.minimize(agent.loss)

    place = F.CUDAPlace(0) if args.use_cuda else F.CPUPlace()
    exe = F.Executor(place)
    exe.run(startup_prog)

    if (not args.dont_save_emb) and \
       (not args.is_fleet or fleet.worker_index() == 0):
        save_embedding(args, exe, test_prog, agent, train_data_list, -1)

    for epoch_id in range(args.max_epoch):
        train(args, exe, train_prog, agent, train_data_list, epoch_id)
        if not args.is_fleet or fleet.worker_index() == 0:
            F.io.save_params(exe, '%s/epoch%s' % (args.model_dir, epoch_id),
                             train_prog)
            if not args.dont_save_emb:
                save_embedding(args, exe, test_prog, agent, train_data_list,
                               epoch_id)
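
The fleet branch above shards the training set by round-robin over worker indices, so each trainer keeps every fleet.worker_num()-th example starting at its own fleet.worker_index(). The same idea on a toy list, with plain integers standing in for the fleet calls:

data = list(range(10))
worker_num, worker_index = 2, 0   # stand-ins for fleet.worker_num() / worker_index()
shard = [x for i, x in enumerate(data) if i % worker_num == worker_index]
print(shard)  # [0, 2, 4, 6, 8]
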
    def _test_checkpoint(self, fs, dir_path):
        file_name = "persistables"

        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
        label = fluid.data(name='label', shape=[None, 1], dtype='int64')
        feeder = fluid.DataFeeder(feed_list=[image, label],
                                  place=fluid.CPUPlace())
        predict = fluid.layers.fc(input=image, size=10, act='softmax')
        loss = fluid.layers.cross_entropy(input=predict, label=label)
        avg_loss = fluid.layers.mean(loss)
        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)

        dist_optimizer = fleet.distributed_optimizer(optimizer)
        dist_optimizer.minimize(avg_loss)

        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fluid.default_startup_program())

        status = ExeTrainStatus()
        status.epoch_no = 2
        _, n1 = fleet.save_checkpoint(exe,
                                      dir_path,
                                      trainer_id=0,
                                      train_status=status,
                                      fs=fs)

        status2 = ExeTrainStatus()
        fleet.load_checkpoint(exe,
                              dir_path,
                              trainer_id=0,
                              fs=fs,
                              train_status=status2)
        self.assertEqual(status2, status)

        _, n2 = fleet.save_checkpoint(exe,
                                      dir_path,
                                      trainer_id=0,
                                      train_status=status,
                                      fs=fs,
                                      remain_all_checkpoint=False)
        self.assertEqual(n2, n1 + 1)

        c = CheckpointSaver(fs)
        cp_nos = c.get_checkpoint_no(dir_path)
        assert len(cp_nos) == 1  # cleanup all others

        # abnormal cases
        # test remain_all_checkpoint
        fleet.save_checkpoint(exe,
                              dir_path,
                              trainer_id=0,
                              train_status=status,
                              fs=fs,
                              remain_all_checkpoint=False)

        # can't save under a file
        fs = LocalFS()
        cache_path = "./.load_cache"
        fs.touch(cache_path)
        with self.assertRaises(Exception):
            fleet.save_checkpoint(exe,
                                  dir_path,
                                  trainer_id=0,
                                  train_status=status,
                                  fs=fs,
                                  cache_path=cache_path)

        # can't load under a file
        with self.assertRaises(Exception):
            fleet.load_checkpoint(exe,
                                  dir_path,
                                  trainer_id=0,
                                  train_status=status2,
                                  fs=fs,
                                  cache_path=cache_path)
        fs.delete(cache_path)
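
Stripped of the test scaffolding, the checkpoint API used above reduces to one save and one load call. A minimal sketch that reuses the names from the test (it assumes fleet.init(...) has already run, exe is the Executor that ran the startup program, and LocalFS / ExeTrainStatus are imported as in the test); the directory path is illustrative:

fs = LocalFS()

status = ExeTrainStatus()
status.epoch_no = 5                      # record where training stopped

# persist all persistable variables plus the train status under ./ckpt
fleet.save_checkpoint(exe, "./ckpt", trainer_id=0, train_status=status, fs=fs)

# after a restart: restore the variables and the recorded status
restored = ExeTrainStatus()
fleet.load_checkpoint(exe, "./ckpt", trainer_id=0, fs=fs, train_status=restored)
assert restored == status
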
Exemple #26
0
    def _make_program(self, mode):
        prog = self._progs.get(mode, None)
        if prog is not None:
            return

        prog = self._orig_prog.clone()
        # NOTE: When defining learning rate scheduling in static graph mode, the
        # ops that increment the global step var and compute the learning rate
        # are prepended to _orig_prog. A test program made by `_orig_prog.clone`
        # would also include these ops, so they must be pruned from the test
        # program; otherwise the global step would change during testing.
        if mode != 'train':
            for op in list(prog.global_block().ops):
                prog.global_block()._remove_op(0)
        if mode == 'train' and self.model._optimizer \
                and self.model._optimizer._learning_rate_map:
            # HACK workaround learning rate map issue
            lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
            self.model._optimizer._learning_rate_map[prog] = lr_var

        losses = []
        metrics = []
        with fluid.program_guard(prog, self._startup_prog):
            ins = self.model._inputs
            lbls = self.model._labels if self.model._labels else []
            inputs = [k.forward() for k in to_list(ins)]
            labels = [k.forward() for k in to_list(lbls)]
            self._label_vars[mode] = labels
            outputs = to_list(self.model.forward(*inputs))

            if mode != 'test' and self.model._loss_function:
                losses = self.model._loss_function(outputs, labels)

            if self._nranks > 1 and mode != 'train':
                outputs = [_all_gather(o, self._nranks) for o in outputs]
                if mode != 'test':
                    labels = [_all_gather(l, self._nranks) for l in labels]

            if mode != 'test':
                for metric in self.model._metrics:
                    metrics.append(
                        to_list(metric.add_metric_op(outputs, labels)))

            if mode == 'train' and self.model._optimizer:
                self._loss_endpoint = fluid.layers.sum(losses)
                if self._nranks > 1:
                    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
                    fleet.init(role)
                    dist_strategy = DistributedStrategy()
                    dist_strategy.mode = "collective"
                    dist_strategy.collective_mode = "grad_allreduce"
                    self.model._optimizer = fleet.distributed_optimizer(
                        self.model._optimizer, strategy=dist_strategy)

                self.model._optimizer.minimize(self._loss_endpoint)

        if mode != 'train':  # clone again to put it in test mode
            prog = prog.clone(for_test=True)

        self._input_vars[mode] = inputs

        self._progs[mode] = prog
        self._endpoints[mode] = {
            "output": outputs,
            "loss": losses,
            "metric": metrics
        }
Exemple #27
0
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 dist_strategy=None,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 loss_scaling=1.0):
    if use_fp16:
        print("fp16 is not supported for now, please contact the author")
        exit()
    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler.noam_decay(
                1 / (warmup_steps * (learning_rate**2)), warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unkown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
    else:
        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
        scheduled_lr = learning_rate

    clip_norm_thres = 1.0
    # When using mixed precision training, scale the gradient clip threshold
    # by loss_scaling
    if use_fp16 and loss_scaling > 1.0:
        clip_norm_thres *= loss_scaling
    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))

    def exclude_from_weight_decay(name):
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    param_list = dict()
    """
    if use_fp16:
        param_grads = optimizer.backward(loss)
        master_param_grads = create_master_params_grads(
            param_grads, train_program, startup_prog, loss_scaling)

        for param, _ in master_param_grads:
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True

        optimizer.apply_gradients(master_param_grads)

        if weight_decay > 0:
            for param, grad in master_param_grads:
                if exclude_from_weight_decay(param.name.rstrip(".master")):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

        master_param_to_train_param(master_param_grads, param_grads,
                                    train_program)
    """
    if use_fp16:
        pass

    else:
        for param in train_program.global_block().all_parameters():
            param_list[param.name] = param * 1.0
            param_list[param.name].stop_gradient = True
       
        # optimizer = fluid.contrib.mixed_precision.decorate(optimizer,
        #                init_loss_scaling=1.0,
        #                use_dynamic_loss_scaling=True)
        if dist_strategy is not None:
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)
        _, param_grads = optimizer.minimize(loss)

        if weight_decay > 0:
            for param, grad in param_grads:
                if exclude_from_weight_decay(param.name):
                    continue
                with param.block.program._optimized_guard(
                    [param, grad]), fluid.framework.name_scope("weight_decay"):
                    updated_param = param - param_list[
                        param.name] * weight_decay * scheduled_lr
                    fluid.layers.assign(output=param, input=updated_param)

    return scheduled_lr
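
A sketch of how the optimization() helper above is typically wired in once the forward program has been built. The loss variable, the programs and the hyper-parameter values are placeholders, and dist_strategy should only be passed when fleet has been initialized:

# Assumes `loss`, `train_program` and `startup_prog` come from the model-building
# code, and that fleet / dist_strategy are set up as in the other examples.
scheduled_lr = optimization(
    loss=loss,
    warmup_steps=4000,
    num_train_steps=100000,
    learning_rate=5e-5,
    train_program=train_program,
    startup_prog=startup_prog,
    weight_decay=0.01,
    dist_strategy=dist_strategy,      # or None for single-card training
    scheduler='linear_warmup_decay',
    use_fp16=False)                   # the fp16 path is disabled in this example
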
Exemple #28
0
def compress(args):
    shuffle = True
    if args.ce_test:
        # set seed
        seed = 111
        paddle.seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        args.num_workers = 0
        shuffle = False

    env = os.environ
    num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 1))
    use_data_parallel = num_trainers > 1

    if use_data_parallel:
        # Fleet step 1: initialize the distributed environment
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

    train_reader = None
    test_reader = None
    if args.data == "mnist":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.MNIST(
            mode='train', backend="cv2", transform=transform)
        val_dataset = paddle.vision.datasets.MNIST(
            mode='test', backend="cv2", transform=transform)
        class_dim = 10
        image_shape = "1,28,28"
        args.pretrained_model = False
    elif args.data == "cifar10":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(
            mode="train", backend="cv2", transform=transform)
        val_dataset = paddle.vision.datasets.Cifar10(
            mode="test", backend="cv2", transform=transform)
        class_dim = 10
        image_shape = "3, 32, 32"
        args.pretrained_model = False
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))
    image_shape = [int(m) for m in image_shape.split(",")]
    assert args.model in model_list, "{} is not in lists: {}".format(args.model,
                                                                     model_list)
    if args.use_gpu:
        places = paddle.static.cuda_places()
    else:
        places = paddle.static.cpu_places()
    place = places[0]
    exe = paddle.static.Executor(place)

    image = paddle.static.data(
        name='image', shape=[None] + image_shape, dtype='float32')
    label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')

    batch_size_per_card = args.batch_size
    batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=batch_size_per_card,
        shuffle=shuffle,
        drop_last=True)

    train_loader = paddle.io.DataLoader(
        train_dataset,
        places=place,
        batch_sampler=batch_sampler,
        feed_list=[image, label],
        return_list=False,
        use_shared_memory=True,
        num_workers=args.num_workers)

    valid_loader = paddle.io.DataLoader(
        val_dataset,
        places=place,
        feed_list=[image, label],
        drop_last=False,
        return_list=False,
        use_shared_memory=True,
        batch_size=args.batch_size_for_validation,
        shuffle=False)

    step_per_epoch = int(
        np.ceil(len(train_dataset) * 1. / args.batch_size / num_trainers))

    # model definition
    model = models.__dict__[args.model]()
    out = model.net(input=image, class_dim=class_dim)
    if args.data == 'cifar10':
        label = paddle.reshape(label, [-1, 1])
    cost = paddle.nn.functional.loss.cross_entropy(input=out, label=label)
    avg_cost = paddle.mean(x=cost)
    acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
    acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)

    val_program = paddle.static.default_main_program().clone(for_test=True)

    opt, learning_rate = create_optimizer(args, step_per_epoch)

    # Fleet step 2: distributed strategy
    if use_data_parallel:
        dist_strategy = DistributedStrategy()
        dist_strategy.sync_batch_norm = False
        dist_strategy.exec_strategy = paddle.static.ExecutionStrategy()
        dist_strategy.fuse_all_reduce_ops = False

    train_program = paddle.static.default_main_program()

    if args.pruning_strategy == 'gmp':
        # GMP pruner step 0: define configs for GMP, no need to define configs for the base training.
        configs = {
            'stable_iterations': args.stable_epochs * step_per_epoch,
            'pruning_iterations': args.pruning_epochs * step_per_epoch,
            'tunning_iterations': args.tunning_epochs * step_per_epoch,
            'resume_iteration': (args.last_epoch + 1) * step_per_epoch,
            'pruning_steps': args.pruning_steps,
            'initial_ratio': args.initial_ratio,
        }
    elif args.pruning_strategy == 'base':
        configs = None
    else:
        raise ValueError(
            "Unsupported pruning_strategy: {}".format(args.pruning_strategy))

    # GMP pruner step 1: initialize a pruner object by calling entry function.
    pruner = create_unstructured_pruner(
        train_program, args, place, configs=configs)

    if use_data_parallel:
        # Fleet step 3: decorate the original optimizer and minimize it
        opt = fleet.distributed_optimizer(opt, strategy=dist_strategy)
    opt.minimize(avg_cost, no_grad_set=pruner.no_grad_set)

    exe.run(paddle.static.default_startup_program())
    if args.last_epoch > -1:
        assert args.checkpoint is not None and os.path.exists(
            args.checkpoint), "Please specify a valid checkpoint path."
        paddle.fluid.io.load_persistables(
            executor=exe, dirname=args.checkpoint, main_program=train_program)

    elif args.pretrained_model:
        assert os.path.exists(
            args.
            pretrained_model), "Pretrained model path {} doesn't exist".format(
                args.pretrained_model)

        def if_exist(var):
            return os.path.exists(os.path.join(args.pretrained_model, var.name))

        _logger.info("Load pretrained model from {}".format(
            args.pretrained_model))
        # NOTE: We are using fluid.io.load_vars() because the pretrained model is from an older version which requires this API. 
        # Please consider using paddle.static.load(program, model_path) when possible
        paddle.fluid.io.load_vars(
            exe, args.pretrained_model, predicate=if_exist)

    def test(epoch, program):
        acc_top1_ns = []
        acc_top5_ns = []

        _logger.info(
            "The current sparsity of the inference model is {}%".format(
                round(100 * UnstructuredPruner.total_sparse(
                    paddle.static.default_main_program()), 2)))
        for batch_id, data in enumerate(valid_loader):
            start_time = time.time()
            acc_top1_n, acc_top5_n = exe.run(
                program, feed=data, fetch_list=[acc_top1.name, acc_top5.name])
            end_time = time.time()
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}".
                    format(epoch, batch_id,
                           np.mean(acc_top1_n),
                           np.mean(acc_top5_n), end_time - start_time))
            acc_top1_ns.append(np.mean(acc_top1_n))
            acc_top5_ns.append(np.mean(acc_top5_n))

        _logger.info("Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
            epoch,
            np.mean(np.array(acc_top1_ns)), np.mean(np.array(acc_top5_ns))))

    def train(epoch, program):
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for batch_id, data in enumerate(train_loader):
            train_reader_cost += time.time() - reader_start
            train_start = time.time()
            loss_n, acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            # GMP pruner step 2: step() to update ratios and other internal states of the pruner.
            pruner.step()
            train_run_cost += time.time() - train_start
            total_samples += args.batch_size
            loss_n = np.mean(loss_n)
            acc_top1_n = np.mean(acc_top1_n)
            acc_top5_n = np.mean(acc_top5_n)
            if batch_id % args.log_period == 0:
                _logger.info(
                    "epoch[{}]-batch[{}] lr: {:.6f} - loss: {}; acc_top1: {}; acc_top5: {}; avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
                    format(epoch, batch_id,
                           learning_rate.get_lr(), loss_n, acc_top1_n,
                           acc_top5_n, train_reader_cost / args.log_period, (
                               train_reader_cost + train_run_cost
                           ) / args.log_period, total_samples / args.log_period,
                           total_samples / (train_reader_cost + train_run_cost
                                            )))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            learning_rate.step()
            reader_start = time.time()

    if use_data_parallel:
        # Fleet step 4: get the compiled program from fleet
        compiled_train_program = fleet.main_program
    else:
        compiled_train_program = paddle.static.CompiledProgram(
            paddle.static.default_main_program())

    for i in range(args.last_epoch + 1, args.num_epochs):
        train(i, compiled_train_program)
        # GMP pruner step 3: update the params before summarizing sparsity, saving the model, or evaluating.
        pruner.update_params()

        _logger.info("The current sparsity of the pruned model is: {}%".format(
            round(100 * UnstructuredPruner.total_sparse(
                paddle.static.default_main_program()), 2)))

        if (i + 1) % args.test_period == 0:
            test(i, val_program)
        if (i + 1) % args.model_period == 0:
            if use_data_parallel:
                fleet.save_persistables(executor=exe, dirname=args.model_path)
            else:
                paddle.fluid.io.save_persistables(
                    executor=exe, dirname=args.model_path)
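
The numbered "Fleet step" comments in compress() above describe the usual collective-training recipe. Condensed into a minimal sketch (opt and avg_cost are placeholders for the optimizer and loss built elsewhere; every call mirrors one used above):

# Fleet step 1: initialize the collective environment from the cluster env vars
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)

# Fleet step 2: describe how gradients are synchronized
dist_strategy = DistributedStrategy()
dist_strategy.sync_batch_norm = False
dist_strategy.fuse_all_reduce_ops = False

# Fleet step 3: wrap the original optimizer and minimize the loss
opt = fleet.distributed_optimizer(opt, strategy=dist_strategy)
opt.minimize(avg_cost)

# Fleet step 4: run the program fleet assembled for this worker
compiled_train_program = fleet.main_program
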
    def net(self, args=None):
        """
        resnet struct.
        Args:
            fleet:
            args (ArgumentParser): run args to config dist fleet.
        Returns:
            tuple: the return value contains avg_cost, py_reader
        """
        from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
        from thirdparty.image_classfication.models.resnet import ResNet50
        from thirdparty.image_classfication.train import parser
        from thirdparty.image_classfication.train import optimizer_setting
        parser.add_argument('--update_method',
                            type=str,
                            required=True,
                            choices=['pserver', 'nccl'])
        parser.add_argument('--role',
                            type=str,
                            required=True,
                            choices=['pserver', 'trainer'])
        parser.add_argument('--endpoints',
                            type=str,
                            required=False,
                            default="")
        parser.add_argument('--current_id',
                            type=int,
                            required=False,
                            default=0)
        parser.add_argument('--trainers', type=int, required=False, default=1)
        # parser.add_argument('--sync_mode', action='store_true')
        parser.add_argument('--run_params',
                            type=str,
                            required=False,
                            default='{}')
        args = parser.parse_args()
        args.run_params = json.loads(args.run_params)
        image_shape = [3, 224, 224]
        scale_loss = 1.0
        self.py_reader = fluid.layers.py_reader(capacity=16,
                                                shapes=[[-1] + image_shape,
                                                        [-1, 1]],
                                                lod_levels=[0, 0],
                                                dtypes=["float32", "int64"],
                                                use_double_buffer=True)
        image, label = fluid.layers.read_file(self.py_reader)
        run_model = ResNet50()
        out = run_model.net(image, 4)
        softmax_out = fluid.layers.softmax(out, use_cudnn=False)
        cost, prob = fluid.layers.softmax_with_cross_entropy(
            out, label, return_softmax=True)
        self.avg_cost = fluid.layers.mean(cost)

        params = run_model.params
        params["total_images"] = args.total_images
        params["lr"] = 1e-5
        params["num_epochs"] = args.num_epochs
        params["learning_strategy"]["batch_size"] = args.batch_size
        params["learning_strategy"]["name"] = args.lr_strategy
        params["l2_decay"] = args.l2_decay
        params["momentum_rate"] = args.momentum_rate
        optimizer = optimizer_setting(params)
        global_lr = optimizer._global_learning_rate()
        global_lr.persistable = True

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1
        exec_strategy.num_iteration_per_drop_scope = 30
        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.enable_inplace = args.run_params['enable_inplace']
        dist_strategy.fuse_all_reduce_ops = args.run_params[
            'fuse_all_reduce_ops']
        dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
        dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
        dist_strategy.mode = args.run_params["mode"]
        dist_strategy.collective_mode = args.run_params["collective"]

        if args.run_params["fp16"]:
            optimizer = fluid.contrib.mixed_precision.decorate(
                optimizer,
                init_loss_scaling=128.0,
                use_dynamic_loss_scaling=True)

        if "use_dgc" in args.run_params and args.run_params["use_dgc"]:
            # DGC requires fuse_all_reduce_ops to be disabled
            dist_strategy.fuse_all_reduce_ops = False
            optimizer = fluid.optimizer.DGCMomentumOptimizer(
                learning_rate=0.001, momentum=0.9, rampup_begin_step=0)

        dist_optimizer = fleet.distributed_optimizer(optimizer,
                                                     strategy=dist_strategy)
        _, param_grads = dist_optimizer.minimize(self.avg_cost)

        shuffle_seed = 1
        train_reader = reader.train(settings=args,
                                    data_dir=DATA_DIR,
                                    pass_id_as_seed=shuffle_seed)
        self.py_reader.decorate_paddle_reader(
            paddle.batch(train_reader, batch_size=self.batch_size))

        if scale_loss > 1:
            self.avg_cost = fluid.layers.mean(x=cost) * scale_loss
        return self.avg_cost, self.py_reader
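
The net() method above configures its DistributedStrategy entirely from the --run_params JSON string. An illustrative payload covering every key the method reads (the values are examples, not taken from the source):

import json

run_params = {
    "enable_inplace": True,
    "fuse_all_reduce_ops": True,
    "nccl_comm_num": 1,
    "use_local_sgd": False,
    "mode": "collective",
    "collective": "grad_allreduce",
    "fp16": False,
    "use_dgc": False,
}
# passed on the command line as: --run_params '<json string>'
print(json.dumps(run_params))
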
Exemple #30
0
def train(args):
    """train start"""
    logging.info(args)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id)
    dev_count = 1

    exe = fluid.Executor(place)

    train_program = fluid.Program()
    startup_program = fluid.Program()

    # For Distributed Training.
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    args.num_trainers = fleet.worker_num()
    args.trainer_id = fleet.worker_index()
    dist_strategy = DistributedStrategy()

    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                ModelHyperParams.bos_idx,
                use_py_reader=args.use_py_reader,
                is_test=False)

            optimizer = None
            if args.sync:
                lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                    ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)

                with fluid.default_main_program()._lr_schedule_guard():
                    learning_rate = lr_decay * TrainTaskConfig.learning_rate

                optimizer = fluid.optimizer.Adam(learning_rate=learning_rate,
                                                 beta1=TrainTaskConfig.beta1,
                                                 beta2=TrainTaskConfig.beta2,
                                                 epsilon=TrainTaskConfig.eps)
            else:
                optimizer = fluid.optimizer.SGD(0.003)
            if args.use_fp16:
                optimizer = decorate(optimizer,
                                     init_loss_scaling=args.loss_scaling)
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)
            optimizer.minimize(avg_cost, startup_program)

    train_program = fleet.main_program
    orig_train_program = fleet._origin_program
    train_loop(args, exe, train_program, orig_train_program, startup_program,
               dev_count, sum_cost, avg_cost, token_num, predict, pyreader)
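
When args.sync is set, the learning rate above follows the Transformer Noam schedule produced by noam_decay(d_model, warmup_steps), later scaled by TrainTaskConfig.learning_rate. A minimal sketch of that schedule (hyper-parameter values are illustrative):

def noam_lr(step, d_model=512, warmup_steps=4000):
    # rises linearly for warmup_steps, then decays as step ** -0.5
    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

for step in (1, 1000, 4000, 100000):
    print(step, noam_lr(step))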