Example #1
    def test_a_sync_optimizer2(self):
        os.environ["TRAINING_ROLE"] = "TRAINER"
        import paddle.distributed.fleet as fleet

        main_program = paddle.fluid.Program()
        startup_program = paddle.fluid.Program()

        paddle.fluid.framework.switch_main_program(main_program)
        paddle.fluid.framework.switch_startup_program(startup_program)

        fleet.init(role_maker.PaddleCloudRoleMaker())
        input_x = paddle.fluid.layers.data(
            name="x", shape=[32], dtype='float32')
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
        cost = paddle.fluid.layers.cross_entropy(
            input=prediction, label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)
        os.environ["FLAGS_LAUNCH_BARRIER"] = "0"
        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.auto = True
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        self.assertTrue(fleet._final_strategy().a_sync)
        a_sync_configs = fleet._final_strategy().a_sync_configs
        self.assertEqual(a_sync_configs['k_steps'], 800)
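
This test only exercises the TRAINER role (set via TRAINING_ROLE above). For context, a complete parameter-server job also runs PSERVER processes; below is a minimal sketch of the usual role dispatch using the public fleet API. This is not part of the test itself.

import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init()
if fleet.is_server():
    # parameter servers block here, serving parameter lookups/updates
    fleet.init_server()
    fleet.run_server()
elif fleet.is_worker():
    # build the network and call optimizer.minimize(avg_cost) as in the
    # test above, then start the trainer
    fleet.init_worker()
    # ... training loop ...
    fleet.stop_worker()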
Example #2
    def test_amp_recompute_lars_dgc_not_apply_optimizer(self):
        """ test amp + recompute + lars + dgc,
            amp -/-> dgc, max_path is amp-->recompute-->lars
        """
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        avg_cost, strategy = self.net(train_prog, startup_prog)
        self.set_strategy(strategy, 'dgc')
        self.set_strategy(strategy, 'amp')
        self.set_strategy(strategy, 'recompute')
        self.set_strategy(strategy, 'lars')
        self.optimizer(avg_cost, strategy, train_prog, startup_prog)

        strategy = fleet._final_strategy()

        ops = [op.type for op in avg_cost.block.ops]
        outs = [
            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
        ]
        self.assertIn('cast', ops)
        self.assertIn('check_finite_and_unscale', ops)

        # recompute
        self.assertIn('subprog', ''.join(outs))

        # lars
        self.assertIn('lars_momentum', ops)

        # dgc not apply
        self.assertFalse(strategy.dgc)
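
self.net and self.set_strategy are fixtures of the test class and are not shown on this page. A plausible sketch of set_strategy, assuming it simply flips the matching DistributedStrategy switch (the real fixture may also fill in per-optimizer configs):

def set_strategy(self, strategy, name):
    # hypothetical fixture: enable one meta optimizer by name
    if name == 'amp':
        strategy.amp = True
    elif name == 'recompute':
        strategy.recompute = True
    elif name == 'lars':
        strategy.lars = True
    elif name == 'dgc':
        strategy.dgc = True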
Example #3
    def test_a_sync_optimizer3(self):
        os.environ["TRAINING_ROLE"] = "TRAINER"
        import paddle.distributed.fleet as fleet

        main_program = paddle.fluid.Program()
        startup_program = paddle.fluid.Program()

        paddle.fluid.framework.switch_main_program(main_program)
        paddle.fluid.framework.switch_startup_program(startup_program)

        fleet.init(role_maker.PaddleCloudRoleMaker())
        input_x = paddle.fluid.layers.data(name="x",
                                           shape=[-1, 1],
                                           dtype="int64",
                                           lod_level=1,
                                           append_batch_size=False)
        x_embedding = paddle.fluid.layers.embedding(
            is_distributed=False,
            input=input_x,
            size=[1000000000, 100000],
            param_attr=paddle.fluid.ParamAttr(
                name="embedding",
                initializer=paddle.fluid.initializer.Constant(value=0.01)),
            is_sparse=True)
        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')

        fc_1 = paddle.fluid.layers.fc(input=x_embedding, size=64, act='tanh')
        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
        prediction = paddle.fluid.layers.fc(input=[fc_2],
                                            size=2,
                                            act='softmax')
        cost = paddle.fluid.layers.cross_entropy(input=prediction,
                                                 label=input_y)
        avg_cost = paddle.fluid.layers.mean(x=cost)

        os.environ["FLAGS_LAUNCH_BARRIER"] = "0"
        strategy = paddle.distributed.fleet.DistributedStrategy()
        strategy.auto = True
        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

        self.assertTrue(fleet._final_strategy().a_sync)
        a_sync_configs = fleet._final_strategy().a_sync_configs
        self.assertEqual(a_sync_configs['k_steps'], 0)
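
Here auto mode resolves to pure asynchronous training (k_steps == 0) because of the large sparse embedding, whereas the dense model in Example #1 resolved to geo-style synchronization every 800 steps. The same modes can also be requested explicitly instead of relying on strategy.auto; a minimal sketch:

strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True                      # asynchronous parameter-server training
strategy.a_sync_configs = {"k_steps": 800}  # > 0: geo-style sync every k steps; 0: pure async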
Example #4
    def test_amp_recompute_optimizer(self):
        """ test amp + recompute """
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        avg_cost, strategy = self.net(train_prog, startup_prog)
        self.set_strategy(strategy, 'amp')
        self.set_strategy(strategy, 'recompute')
        self.optimizer(avg_cost, strategy, train_prog, startup_prog)

        strategy = fleet._final_strategy()

        ops = [op.type for op in avg_cost.block.ops]
        outs = [
            op.output('Out')[0] for op in avg_cost.block.ops
            if op.type == 'mul'
        ]
        self.assertIn('cast', ops)
        self.assertIn('check_finite_and_unscale', ops)

        # recompute
        self.assertIn('subprog', ''.join(outs))
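
The 'subprog' assertion works because the recompute pass clones parts of the forward graph and gives the cloned outputs a subprog-suffixed name. Outside a unit test, which meta optimizers fleet actually applied can be checked directly; a short sketch using the _get_applied_meta_list helper that also appears in Example #6 below (the printed names are illustrative):

applied = fleet._get_applied_meta_list()
print(applied)  # e.g. ['AMPOptimizer', 'RecomputeOptimizer']
ops = [op.type for op in avg_cost.block.ops]
assert 'check_finite_and_unscale' in ops  # AMP inserted its loss-scaling ops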
Example #5
    def test_single_gpu(self):
        paddle.enable_static()
        fleet.init(is_collective=True)
        main_program = paddle.static.Program()
        startup_program = paddle.static.Program()
        strategy = fleet.DistributedStrategy()
        strategy.gradient_scale_configs = {'scale_strategy': 'sum'}
        with fluid.program_guard(main_program, startup_program):
            with fluid.unique_name.guard():
                input_x = paddle.static.data(name="x",
                                             shape=[None, 32],
                                             dtype='float32')
                input_y = paddle.static.data(name="y",
                                             shape=[None, 1],
                                             dtype='int64')
                cost = self.mlp(input_x=input_x, input_y=input_y)
                output_name = cost.name
                optimizer = fleet.distributed_optimizer(
                    fluid.optimizer.Adam(), strategy)
                optimizer.minimize(cost)

        final_strategy = fleet._final_strategy()
        assert final_strategy.gradient_scale_configs['scale_strategy'] == 'sum'
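
The test stops after minimize. Actually running the built program on the single GPU would look roughly like the following sketch; the batch size and random data are illustrative, and the feed names x and y match the placeholders above.

import numpy as np

exe = paddle.static.Executor(paddle.CUDAPlace(0))
exe.run(startup_program)
batch = {
    "x": np.random.random((16, 32)).astype('float32'),
    "y": np.random.randint(0, 2, size=(16, 1)).astype('int64'),
}
loss, = exe.run(main_program, feed=batch, fetch_list=[output_name])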
Example #6
def train(args):
    log.info("pretraining start")
    profile = False

    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))

    # set seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    # define execution strategy
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 2
    exec_strategy.num_iteration_per_drop_scope = 1

    # define distribution strategy
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.nccl_comm_num = 3
    if args.use_recompute:
        log.info("using recompute.")
    dist_strategy.recompute = args.use_recompute
    dist_strategy.sharding = args.use_sharding
    dist_strategy.pipeline = args.num_pp > 1

    # define topology structure for dp/pp/mp
    topo = Topology(rank=fleet.worker_index(),
                    world_size=fleet.worker_num(),
                    dp=args.num_dp,
                    pp=args.num_pp,
                    sharding=args.num_sharding,
                    mp=args.num_mp)

    is_last = False
    if topo.pp.rank == (topo.pp.size - 1):
        is_last = True

    dp_sharding_rank = topo.dp.rank * topo.sharding.size + topo.sharding.rank
    dp_worldsize = topo.dp.size * topo.sharding.size
    bsz_per_dp = args.global_bsz // dp_worldsize

    micro_bsz = args.micro_bsz
    assert args.global_bsz % micro_bsz == 0, f"cannot do gradient accumulation, global_bsz: {args.global_bsz} micro_bsz: {micro_bsz}"
    acc_steps = bsz_per_dp // micro_bsz

    # sharding \ model parallel \ pipeline
    assert dist_strategy.sharding
    dist_strategy.sharding_configs = {
        "segment_broadcast_MB": 32,
        "sharding_degree": args.num_sharding,
        "mp_degree": args.num_mp,
        "pp_degree": args.num_pp,
        "dp_degree": args.num_dp,
        "optimize_offload": True,
    }
    dist_strategy.pipeline_configs = {
        "schedule_mode": "1F1B",
        "micro_batch_size": micro_bsz,
        "accumulate_steps": acc_steps,
    }
    log.info(
        f"using global_bsz: {args.global_bsz} micro_bsz: {micro_bsz}, acc_steps: {acc_steps}"
    )

    dist_strategy.amp = args.use_amp
    dist_strategy.amp_configs = {
        "custom_white_list": ['softmax', 'layer_norm', 'gelu'],
        "init_loss_scaling": 32768,
        "decr_every_n_nan_or_inf": 2,
        "incr_every_n_steps": 1000,
        "incr_ratio": 2.0,
        "use_dynamic_loss_scaling": True,
        "decr_ratio": 0.5,
        "use_pure_fp16": False,
        "use_fp16_guard": False,
    }

    dist_strategy.lamb = args.use_lamb
    dist_strategy.lamb_configs = {
        'lamb_weight_decay': 0.01,
        'exclude_from_weight_decay':
            ['layer_norm_bias', 'layer_norm_scale', '.b_0'],
    }

    train_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            graph_vars = create_model(args, 'train', micro_bsz,
                                      dp_sharding_rank, dp_worldsize, topo)
            data_loader = graph_vars['data_loader']
            for op in train_program.global_block().ops:
                if op.type == 'fill_constant':
                    op._set_attr(
                        'op_device', "gpu:0"
                    )  # XXX: hack: https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/layers/tensor.py#L1376

            if args.use_recompute:
                dist_strategy.recompute_configs = {
                    "checkpoints": graph_vars['checkpoints'],
                    # "enable_offload": args.use_offload,
                    # "checkpoint_shape": [micro_bsz, args.max_seq_len, 4096],
                }

            log.debug("base lr: {}".format(args.learning_rate))
            scheduled_lr = linear_warmup_decay(
                learning_rate=args.learning_rate,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps)

            clip_norm_thres = 1.0
            if paddlenlp.ops.optimizer._jit_compile():
                optimizer = paddlenlp.ops.optimizer.AdamwOptimizer(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    weight_decay=args.weight_decay,
                    apply_decay_param_fun=apply_weight_decay_fun)
            else:
                optimizer = fluid.optimizer.Adam(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    #multi_precision=True,
                    #weight_decay=args.weight_decay, # merge this pr to use weight_decay: https://github.com/PaddlePaddle/Paddle/pull/29248
                    #exclude_from_weight_decay_fn=exclude_from_weight_decay
                )

            optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
            log.info(f"using dist strategy: {dist_strategy}")

            optimizer.minimize(graph_vars['total_loss'])

            final_strategy = fleet._final_strategy()
            applied_meta_list = fleet._get_applied_meta_list()
            log.info("final strategy: {}".format(final_strategy))
            log.info("applied_meta_list: {}".format(applied_meta_list))

    program_desc_dir = os.path.join(args.output_dir, "program_desc")
    if not os.path.isdir(program_desc_dir):
        os.mkdir(program_desc_dir)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    with open(os.path.join(program_desc_dir, "main_program.txt.%d" % gpu_id), 'w') as f:
        f.write(str(train_program))

    with open(os.path.join(program_desc_dir, "startup_program.txt.%d" % gpu_id), 'w') as f:
        f.write(str(startup_program))

    exe = fluid.Executor(place)
    exe.run(startup_program)

    optimizer.amp_init(place)

    #save_path = os.path.join(args.output_dir, 'step_0')
    #log.debug("saving models to {}".format(save_path))
    #save_persistables(exe, save_path, train_program)

    if args.init_checkpoint:
        log.info(' ')
        log.info('###########################WARNING#############################')
        log.info('##### using init_checkpoint, not init_pretraining_params #####')
        log.info('## hyper params such as lr will inherit from the checkpoint ##')
        log.info('###############################################################')
        init_checkpoint(exe, args.init_checkpoint, train_program)
        log.info(' ')

    output_dir = args.output_dir
    save_steps = args.save_steps
    total_time = 0
    cost_vals, lm_losses, sop_accs = [], [], []
    global_steps = args.global_steps + 1
    steps = 0
    log_path = 'train_log/node-%d' % fleet.worker_index()
    start_time = time.time()
    with LogWriter(os.path.join(args.output_dir, log_path)) as swriter:
        data_loader.start()
        while True:
            #if steps < global_steps:
            #    steps += 1
            #    continue
            if not is_last:
                fetch_list = []
            else:
                fetch_list = [
                    graph_vars['total_loss'], graph_vars['mean_mask_lm_loss'],
                    scheduled_lr
                ]
                if args.use_sop:
                    fetch_list.extend(
                        [graph_vars['sop_acc'], graph_vars['sop_loss']])
                if args.use_amp:
                    loss_scaling = train_program.global_block().vars['loss_scaling_0']
                    fetch_list.append(loss_scaling)

            # run one mini-batch (= acc_steps micro-batches)
            ret = exe.run(train_program, fetch_list=fetch_list)
            # use_program_cache=True

            steps += 1

            if is_last:
                if args.use_sop and args.use_amp:
                    cost_val, lm_loss, lr, sop_acc, sop_loss, loss_scaling_0 = ret
                elif args.use_sop:
                    cost_val, lm_loss, lr, sop_acc, sop_loss = ret
                elif args.use_amp:
                    cost_val, lm_loss, lr, loss_scaling_0 = ret
                else:
                    cost_val, lm_loss, lr = ret
                cost_vals.append(cost_val[0])
                lm_losses.append(lm_loss[0])
                if args.use_sop:
                    sop_accs.append(sop_acc[0])

                if steps > 0 and (steps % args.log_steps) == 0:
                    end_time = time.time()
                    total_time = end_time - start_time
                    cost_val = np.mean(cost_vals)
                    lm_loss = np.mean(lm_losses)
                    swriter.add_scalar('loss/total_loss', cost_val, steps)
                    swriter.add_scalar('loss/mlm_loss', lm_loss, steps)
                    swriter.add_scalar('lr/scheduled_lr', lr[0], steps)

                    if args.use_sop:
                        sop_acc = np.mean(sop_accs)
                        swriter.add_scalar('loss/sop_loss', sop_loss, steps)
                        swriter.add_scalar('train/sop_acc', sop_acc, steps)
                    else:
                        sop_acc = 0.0

                    if args.use_amp:
                        swriter.add_scalar('lr/loss_scaling',
                                           loss_scaling_0[0], steps)
                    else:
                        loss_scaling_0 = [0.0]

                    log.info(
                        "worker_index: %d, step: %d, cost: %f, "
                        "mlm loss: %f, sentence order acc: %f, "
                        "speed: %f steps/s, "
                        "speed: %f samples/s, "
                        "speed: %f tokens/s, "
                        "learning rate: %.3e, loss_scalings: %f" %
                        (fleet.worker_index(), steps, cost_val, lm_loss,
                         sop_acc, args.log_steps / total_time,
                         args.log_steps * args.global_bsz / total_time,
                         args.log_steps * args.global_bsz * args.max_seq_len /
                         total_time, lr[0], loss_scaling_0[0]))

                    cost_vals, lm_losses, sop_accs = [], [], []
                    start_time = time.time()

            # TODO: add evaluation
            if steps > 0 and args.eval_steps > 0 and steps % args.eval_steps == 0:
                pass

            if steps > 0 and args.save_steps > 0 and steps % args.save_steps == 0:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir, 'step_' + str(steps))
                log.debug("saving models to {}".format(save_path))
                save_persistables(exe, save_path, train_program)

            if steps == args.num_train_steps:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir,
                                         'final_step_' + str(steps))
                save_persistables(exe, save_path, train_program)
                log.debug("saving final models to {}".format(save_path))
                log.debug("end of training, total steps: {}".format(steps))