Code Example #1
    def test_with_asp(self):
        fleet.init(is_collective=True)

        self.optimizer = paddle.incubate.asp.decorate(self.optimizer)
        paddle.incubate.asp.prune_model(self.layer)

        self.optimizer = fleet.distributed_optimizer(self.optimizer)
        self.layer = fleet.distributed_model(self.layer)

        imgs = paddle.to_tensor(np.random.randn(64, 32),
                                dtype='float32',
                                place=self.place,
                                stop_gradient=False)
        labels = paddle.to_tensor(np.random.randint(10, size=(64, 1)),
                                  dtype='float32',
                                  place=self.place,
                                  stop_gradient=False)

        loss_fn = paddle.nn.MSELoss(reduction='mean')

        output = self.layer(imgs)
        loss = loss_fn(output, labels)
        loss.backward()
        self.optimizer.step()
        self.optimizer.clear_grad()

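        # After one training step, every ASP-supported parameter should still
        # satisfy the 2:4 (n=2, m=4) structured-sparsity pattern.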
        for param in self.layer.parameters():
            if ASPHelper._is_supported_layer(
                    paddle.static.default_main_program(), param.name):
                mat = param.numpy()
                self.assertTrue(
                    paddle.fluid.contrib.sparsity.check_sparsity(mat.T,
                                                                 n=2,
                                                                 m=4))
Code Example #2
    def build_model_optimizer(self, Optimizer="adam"):
        hcg = fleet.get_hybrid_communicate_group()
        word_size = hcg.get_model_parallel_world_size()
        sharding_id = hcg.get_sharding_parallel_rank()
        dp_id = hcg.get_data_parallel_rank()
        rank_id = dist.get_rank()

        np_fc1 = np.random.random_sample((hidden_size, inner_size))
        np_fc2 = np.random.random_sample((inner_size, hidden_size))

        model_a = SimpleDPNet(vocab_size, hidden_size, inner_size, output_size,
                              np_fc1, np_fc2)
        optimizer_a = self.build_optimizer(model_a,
                                           strategy=self.strategy,
                                           is_sharding=True,
                                           Optimizer=Optimizer)
        model_a = fleet.distributed_model(model_a)
        optimizer_a = fleet.distributed_optimizer(optimizer_a)

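        # model_b is built from the same numpy weights but without sharding,
        # serving as a reference for the sharded model_a.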
        model_b = SimpleDPNet(vocab_size, hidden_size, inner_size, output_size,
                              np_fc1, np_fc2)
        optimizer_b = self.build_optimizer(model_b,
                                           strategy=self.strategy,
                                           is_sharding=False,
                                           Optimizer=Optimizer)

        return model_a, optimizer_a, model_b, optimizer_b
Code Example #3
    def test_pp_model(self):
        hcg = fleet.get_hybrid_communicate_group()
        word_size = hcg.get_model_parallel_world_size()
        dp_id = hcg.get_data_parallel_rank()
        pp_id = hcg.get_stage_id()
        rank_id = dist.get_rank()
        topology = hcg.topology()
        set_random_seed(1024, dp_id, rank_id)

        model = ModelPipe(topology)
        scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
                                                       values=[0.001, 0.002],
                                                       verbose=True)
        optimizer = paddle.optimizer.SGD(learning_rate=scheduler,
                                         parameters=model.parameters())

        model = fleet.distributed_model(model)
        optimizer = fleet.distributed_optimizer(optimizer)

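        # Run a few steps, feeding the same batch to eval_batch and train_batch
        # and comparing the returned losses.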
        for step_id in range(5):
            x_data = np.random.randint(0,
                                       vocab_size,
                                       size=[batch_size, length])
            x = paddle.to_tensor(x_data)
            x.stop_gradient = True

            e_loss = model.eval_batch([x, x], True)
            loss = model.train_batch([x, x], optimizer, scheduler)

            # TODO(shenliang03) add utest for loss
            if pp_id != 0:
                np.testing.assert_allclose(loss.numpy(), e_loss.numpy())
Code Example #4
    def test_pp_model(self):
        hcg = fleet.get_hybrid_communicate_group()
        word_size = hcg.get_model_parallel_world_size()
        dp_id = hcg.get_data_parallel_rank()
        pp_id = hcg.get_stage_id()
        rank_id = dist.get_rank()
        topology = hcg.topology()
        set_random_seed(1024, dp_id, rank_id)

        model = ModelPipe(topology)
        scheduler = paddle.optimizer.lr.PiecewiseDecay(
            boundaries=[2], values=[0.001, 0.002], verbose=True)
        optimizer = paddle.optimizer.SGD(learning_rate=scheduler,
                                         parameters=model.parameters())

        model = fleet.distributed_model(model)
        optimizer = fleet.distributed_optimizer(optimizer)
        output_dir = tempfile.mkdtemp()

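        # Train for two warmup steps, save the model/optimizer state, then
        # verify that reloading the checkpoint reproduces the same losses.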
        # warmup step
        for step_id in range(2):
            x_data = np.random.randint(0, vocab_size, size=[batch_size, length])
            x = paddle.to_tensor(x_data)
            x.stop_gradient = True
            loss = model.train_batch([x, x], optimizer, scheduler)

        model._layers.save_state_dict(output_dir)
        paddle.save(optimizer.state_dict(),
                    os.path.join(output_dir, "model_state.pdopt"))

        # construct data
        test_steps = 5
        np_data = np.random.randint(
            0, vocab_size, size=[test_steps, batch_size, length])

        origin_loss = []
        for step_id in range(5):
            x_data = np_data[step_id, :]
            x = paddle.to_tensor(x_data)
            x.stop_gradient = True
            loss = model.train_batch([x, x], optimizer, scheduler)
            origin_loss.append(loss.numpy())

        # test step
        model._layers.set_state_dir(output_dir)
        opt_dict = paddle.load(os.path.join(output_dir, "model_state.pdopt"))
        optimizer.set_state_dict(opt_dict)

        for step_id in range(5):
            x_data = np_data[step_id, :]
            x = paddle.to_tensor(x_data)
            x.stop_gradient = True
            loss = model.train_batch([x, x], optimizer, scheduler)
            print("origin loss: ", origin_loss[step_id], "current loss: ",
                  loss.numpy())
            np.testing.assert_allclose(loss.numpy(), origin_loss[step_id])

        # finally, remove the model/optimizer path
        shutil.rmtree(output_dir)
Code Example #5
    def test_pp_model(self):
        hcg = fleet.get_hybrid_communicate_group()
        word_size = hcg.get_model_parallel_world_size()
        dp_id = hcg.get_data_parallel_rank()
        pp_id = hcg.get_stage_id()
        rank_id = dist.get_rank()
        set_random_seed(1024, dp_id, rank_id)

        #construct model a
        model_a = AlexNet(10)
        scheduler_a, optimizer_a = self.build_optimizer(model_a)

        param_len = len(model_a.parameters())

        parameters = []
        for param in model_a.parameters():
            parameters.append(param.numpy())

        # construct model b
        model_b = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size)
        scheduler_b, optimizer_b = self.build_optimizer(model_b)
        model_b = fleet.distributed_model(model_b)
        optimizer_b = fleet.distributed_optimizer(optimizer_b)

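        # Copy the matching slice of model_a's parameters into this pipeline
        # stage of model_b so both models start from identical weights.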
        for idx, param in enumerate(model_b.parameters()):
            param.set_value(parameters[idx + pp_id * (param_len // 2)])

        # construct reader
        train_reader = paddle.batch(paddle.dataset.mnist.train(),
                                    batch_size=batch_size,
                                    drop_last=True)

        for step_id, data in enumerate(train_reader()):
            x_data = np.array([x[0] for x in data]).astype('float32').reshape(
                batch_size, 1, 28, 28)
            y_data = np.array([x[1] for x in data
                               ]).astype('int64').reshape(batch_size, 1)
            img = paddle.to_tensor(x_data)
            label = paddle.to_tensor(y_data)
            img.stop_gradient = True
            label.stop_gradient = True

            if step_id >= 5:
                return True

            loss_a = model_a(img, label)
            loss_a.backward()
            optimizer_a.step()
            optimizer_a.clear_grad()
            scheduler_a.step()

            loss_b = model_b.train_batch([img, label], optimizer_b,
                                         scheduler_b)

            print("loss: ", loss_a.numpy(), loss_b.numpy())
            np.testing.assert_allclose(loss_a.numpy(),
                                       loss_b.numpy(),
                                       rtol=5e-5)
Code Example #6
    def test_dygraph_method(self):
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = fluid.dygraph.to_variable(value)
        layer = paddle.nn.Linear(13, 5)
        adam = paddle.optimizer.Adam(learning_rate=0.01,
                                     parameters=layer.parameters())
        # fleet.init() is omitted because this unit test cannot launch a distributed task
        adam = fleet.distributed_optimizer(adam)
        dp_layer = fleet.distributed_model(layer)
        lr = 0.001
        adam.set_lr(lr)
        cur_lr = adam.get_lr()
        assert (lr == cur_lr)
        state_dict = adam.state_dict()
        adam.set_state_dict(state_dict)
Code Example #7
    def test_dygraph_single(self):
        paddle.disable_static()
        fleet.init(is_collective=True)

        layer = LinearNet()
        loss_fn = nn.MSELoss()
        adam = paddle.optimizer.Adam(learning_rate=0.001,
                                     parameters=layer.parameters())

        adam = fleet.distributed_optimizer(adam)
        dp_layer = fleet.distributed_model(layer)
        for step in range(2):
            inputs = paddle.randn([10, 10], 'float32')
            outputs = dp_layer(inputs)
            labels = paddle.randn([10, 1], 'float32')
            loss = loss_fn(outputs, labels)
            loss.backward()
            adam.step()
            adam.clear_grad()
Code Example #8
File: eval.py  Project: xueeinstein/PaddleHelix
def main(args):
    """
    main function
    """
    model_config = json.load(open(args.model_config, 'r'))
    paddle.set_device("gpu")
    strategy = fleet.DistributedStrategy()
    fleet.init(is_collective=True, strategy=strategy)

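    # Build the evaluation dataloader and the protein model, wrap the model
    # with fleet, and load the trained weights before evaluating.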
    eval_loader = create_dataloader(data_dir=args.eval_data,
                                    model_config=model_config)

    encoder_model = ProteinEncoderModel(model_config, name='protein')
    model = ProteinModel(encoder_model, model_config)
    model = fleet.distributed_model(model)
    model.load_dict(paddle.load(args.eval_model))

    criterion = ProteinCriterion(model_config)
    metric = get_metric(model_config['task'])
    eval_cur_loss = eval(model, eval_loader, criterion, metric)
Code Example #9
    def build_model_optimizer(self):
        hcg = fleet.get_hybrid_communicate_group()
        word_size = hcg.get_model_parallel_world_size()
        mp_id = hcg.get_model_parallel_rank()
        dp_id = hcg.get_data_parallel_rank()
        rank_id = dist.get_rank()
        set_random_seed(1024, dp_id, rank_id)

        np_fc1 = np.random.random_sample((hidden_size, inner_size))
        np_fc2 = np.random.random_sample((inner_size, hidden_size))

        model_a = SimpleMPNet(vocab_size, hidden_size, inner_size, output_size,
                              np_fc1, np_fc2, mp_id)
        optimizer_a = self.build_optimizer(model_a)
        model_a = fleet.distributed_model(model_a)
        optimizer_a = fleet.distributed_optimizer(optimizer_a)

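        # model_b is a plain data-parallel reference net initialized from the
        # same numpy weights as the model-parallel model_a.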
        model_b = SimpleDPNet(vocab_size, hidden_size, inner_size, output_size,
                              np_fc1, np_fc2)
        optimizer_b = self.build_optimizer(model_b)

        return model_a, optimizer_a, model_b, optimizer_b
Code Example #10

# 1. enable dynamic mode
paddle.disable_static()

# 2. initialize fleet environment
fleet.init(is_collective=True)

# 3. create layer & optimizer
layer = paddle.nn.Linear(10, 10)
adam = paddle.optimizer.Adam(learning_rate=0.001,
                             parameters=layer.parameters())

# 4. get data_parallel model using fleet
adam = fleet.distributed_optimizer(adam)
dp_layer = fleet.distributed_model(layer)

# 5. run layer
for step in range(1):
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    loss = paddle.mean(outputs)

    print("step:{}\tloss:{}".format(step, loss.numpy()))

    loss = dp_layer.scale_loss(loss)
    loss.backward()
    dp_layer.apply_collective_grads()
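    # Note: Code Example #7 above calls loss.backward() on the wrapped model
    # directly, without explicit scale_loss/apply_collective_grads calls.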

    adam.step()
    adam.clear_grad()
Code Example #11
def main(args):
    paddle.seed(12345)
    # load config
    config = load_yaml(args.config_yaml)
    dy_model_class = load_dy_model_class(args.abs_dir)
    config["config_abs_dir"] = args.abs_dir
    # modify config from command
    if args.opt:
        for parameter in args.opt:
            parameter = parameter.strip()
            key, value = parameter.split("=")
            if type(config.get(key)) is int:
                value = int(value)
            if type(config.get(key)) is bool:
                value = (True if value.lower() == "true" else False)
            config[key] = value

    # tools.vars
    use_gpu = config.get("runner.use_gpu", True)
    use_xpu = config.get("runner.use_xpu", False)
    use_visual = config.get("runner.use_visual", False)
    train_data_dir = config.get("runner.train_data_dir", None)
    epochs = config.get("runner.epochs", None)
    print_interval = config.get("runner.print_interval", None)
    train_batch_size = config.get("runner.train_batch_size", None)
    model_save_path = config.get("runner.model_save_path", "model_output")
    model_init_path = config.get("runner.model_init_path", None)
    use_fleet = config.get("runner.use_fleet", False)

    logger.info("**************common.configs**********")
    logger.info(
        "use_gpu: {}, use_xpu: {}, use_visual: {}, train_batch_size: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}"
        .format(use_gpu, use_xpu, use_visual, train_batch_size, train_data_dir,
                epochs, print_interval, model_save_path))
    logger.info("**************common.configs**********")

    if use_xpu:
        xpu_device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
        place = paddle.set_device(xpu_device)
    else:
        place = paddle.set_device('gpu' if use_gpu else 'cpu')

    dy_model = dy_model_class.create_model(config)

    # Create a log_visual object and store the data in the path
    if use_visual:
        from visualdl import LogWriter
        log_visual = LogWriter(args.abs_dir + "/visualDL_log/train")

    if model_init_path is not None:
        load_model(model_init_path, dy_model)

    # to do : add optimizer function
    optimizer = dy_model_class.create_optimizer(dy_model, config)

    # use fleet run collective
    if use_fleet:
        from paddle.distributed import fleet
        strategy = fleet.DistributedStrategy()
        fleet.init(is_collective=True, strategy=strategy)
        optimizer = fleet.distributed_optimizer(optimizer)
        dy_model = fleet.distributed_model(dy_model)

    logger.info("read data")
    train_dataloader = create_data_loader(config=config, place=place)

    last_epoch_id = config.get("last_epoch", -1)
    step_num = 0

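    # Main training loop: one pass per epoch with periodic metric logging;
    # when fleet is enabled, only rank 0 saves the model.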
    for epoch_id in range(last_epoch_id + 1, epochs):
        # set train mode
        dy_model.train()
        metric_list, metric_list_name = dy_model_class.create_metrics()
        #auc_metric = paddle.metric.Auc("ROC")
        epoch_begin = time.time()
        interval_begin = time.time()
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()

        for batch_id, batch in enumerate(train_dataloader()):
            train_reader_cost += time.time() - reader_start
            optimizer.clear_grad()
            train_start = time.time()
            batch_size = len(batch[0])

            loss, metric_list, tensor_print_dict = dy_model_class.train_forward(
                dy_model, metric_list, batch, config)

            loss.backward()
            optimizer.step()
            train_run_cost += time.time() - train_start
            total_samples += batch_size

            if batch_id % print_interval == 0:
                metric_str = ""
                for metric_id in range(len(metric_list_name)):
                    metric_str += (metric_list_name[metric_id] +
                                   ":{:.6f}, ".format(
                                       metric_list[metric_id].accumulate()))
                    if use_visual:
                        log_visual.add_scalar(
                            tag="train/" + metric_list_name[metric_id],
                            step=step_num,
                            value=metric_list[metric_id].accumulate())
                tensor_print_str = ""
                if tensor_print_dict is not None:
                    for var_name, var in tensor_print_dict.items():
                        tensor_print_str += ("{}:".format(var_name) +
                                             str(var.numpy()) + ",")
                        if use_visual:
                            log_visual.add_scalar(tag="train/" + var_name,
                                                  step=step_num,
                                                  value=var.numpy())
                logger.info(
                    "epoch: {}, batch_id: {}, ".format(epoch_id, batch_id) +
                    metric_str + tensor_print_str +
                    " avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} ins/s"
                    .format(
                        train_reader_cost /
                        print_interval, (train_reader_cost + train_run_cost) /
                        print_interval, total_samples /
                        print_interval, total_samples /
                        (train_reader_cost + train_run_cost)))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            reader_start = time.time()
            step_num = step_num + 1

        metric_str = ""
        for metric_id in range(len(metric_list_name)):
            metric_str += (
                metric_list_name[metric_id] +
                ": {:.6f},".format(metric_list[metric_id].accumulate()))

        tensor_print_str = ""
        if tensor_print_dict is not None:
            for var_name, var in tensor_print_dict.items():
                tensor_print_str += ("{}:".format(var_name) +
                                     str(var.numpy()) + ",")

        logger.info("epoch: {} done, ".format(epoch_id) + metric_str +
                    tensor_print_str +
                    " epoch time: {:.2f} s".format(time.time() - epoch_begin))

        if use_fleet:
            trainer_id = paddle.distributed.get_rank()
            if trainer_id == 0:
                save_model(dy_model,
                           optimizer,
                           model_save_path,
                           epoch_id,
                           prefix='rec')
        else:
            save_model(dy_model,
                       optimizer,
                       model_save_path,
                       epoch_id,
                       prefix='rec')
Code Example #12
def do_train(args):
    paddle.set_device(args.device)
    nranks = paddle.distributed.get_world_size()
    strategy = fleet.DistributedStrategy()
    strategy.hybrid_configs = {
        "dp_degree": args.dp_degree,
        "mp_degree": args.mp_degree,
        "pp_degree": args.pp_degree,
        "sharding_degree": args.sharding_degree
    }

    accumulate_steps = args.local_batch_size // args.micro_batch_size
    strategy.pipeline_configs = {
        "accumulate_steps": accumulate_steps,
        "micro_batch_size": args.micro_batch_size
    }

    # set control in tensor parallel
    strategy.tensor_parallel_configs = {"tensor_init_seed": args.seed}

    fleet.init(is_collective=True, strategy=strategy)

    # obtain the rank info for hybrid parallel
    hcg = fleet.get_hybrid_communicate_group()
    global_rank = hcg.get_global_rank()
    mp_rank = hcg.get_model_parallel_rank()
    pp_rank = hcg.get_stage_id()
    dp_rank = hcg.get_data_parallel_rank()
    sharding_rank = hcg.get_sharding_parallel_rank()

    # sharding stage 2/3 does not support hybrid parallel
    if args.sharding_stage in [2, 3]:
        assert args.dp_degree == args.mp_degree == args.pp_degree == 1, "sharding stage2/3 will support hybrid parallel later"

    sharding_size = hcg.get_sharding_parallel_world_size()
    data_world_rank = dp_rank * sharding_size + sharding_rank
    data_world_size = args.dp_degree * args.sharding_degree
    local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0))

    # seed control in hybrid parallel
    set_hyrbid_parallel_seed(args.seed, data_world_rank, mp_rank, pp_rank)

    default_global_tokens_num = args.global_batch_size * args.max_seq_len

    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    # Define log writer
    log_writer_path = os.path.join(
        args.output_dir, "train_log",
        "{}_globalbsz_{}_pure_fp16_{}_recompute_{}_card_{}".format(
            args.model_name_or_path, args.global_batch_size, args.use_pure_fp16,
            False, global_rank).lower())

    if os.path.exists(log_writer_path):
        import shutil
        shutil.rmtree(log_writer_path)

    log_writer = LogWriter(log_writer_path)

    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    if args.model_name_or_path in pretrained_models_list:
        model_config = model_class.pretrained_init_configuration[
            args.model_name_or_path]
        model_config["hidden_dropout_prob"] = args.hidden_dropout_prob
        model_config[
            "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob

        model_config['num_partitions'] = args.mp_degree
        model_config['use_recompute'] = args.use_recompute
        if args.pp_degree == 1:
            model = GPTForPretraining(GPTModel(**model_config))
        else:
            model_config['topology'] = hcg.topology()
            model = GPTForPretrainingPipe(**model_config)
    else:
        model = GPTForPretraining.from_pretrained(
            args.model_name_or_path,
            hidden_dropout_prob=args.hidden_dropout_prob,
            attention_probs_dropout_prob=args.attention_probs_dropout_prob)

    # Create the criterion for the GPT model
    criterion = GPTPretrainingCriterion()

    if args.decay_steps is None:
        args.decay_steps = args.max_steps
    warmup_step = args.warmup_rate * args.decay_steps

    lr_scheduler = None

    if args.lr_decay_style == "none":
        lr_scheduler = None
    elif args.lr_decay_style == "cosine":
        lr_scheduler = lr.CosineAnnealingWithWarmupDecay(
            max_lr=args.max_lr,
            min_lr=args.min_lr,
            warmup_step=warmup_step,
            decay_step=args.decay_steps)

    clip = None
    if args.grad_clip > 0:
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.grad_clip)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    if args.sharding_stage == 1 and args.sharding_degree > 1:
        optimizer = DygraphShardingOptimizer(
            hcg=fleet.get_hybrid_communicate_group(),
            user_defined_strategy=strategy,
            params=model.parameters(),
            inner_optimizer_class=paddle.optimizer.AdamW,
            learning_rate=lr_scheduler
            if lr_scheduler is not None else args.max_lr,
            beta1=args.adam_beta1,
            beta2=args.adam_beta2,
            epsilon=args.adam_epsilon,
            weight_decay=args.weight_decay,
            grad_clip=clip,
            apply_decay_param_fun=lambda x: x in decay_params)
    else:
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler
            if lr_scheduler is not None else args.max_lr,
            beta1=args.adam_beta1,
            beta2=args.adam_beta2,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            grad_clip=clip,
            apply_decay_param_fun=lambda x: x in decay_params,
            # TODO: remove 'multi_precision' in definition of optimizer
            # and add it to 'paddle.amp.decorate'
            multi_precision=args.use_pure_fp16)

    if args.use_pure_fp16:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
        # level O2 means converting the network to FP16
        if args.sharding_stage not in [2, 3]:
            scaler = fleet.distributed_scaler(scaler)
        model = paddle.amp.decorate(
            models=model, level='O2', save_dtype='float32')

    # wrap sharding stage2/3 and add collective group
    # TODO(Baibaifan): combine ShardingStage1/2/3 and fleet.distributed_model in the future
    if args.sharding_stage in [2, 3]:
        scaler = scaler if args.use_pure_fp16 else None
        model, optimizer, scaler = wrap_sharding_2_3(model, optimizer, scaler,
                                                     args.sharding_offload)

    elif paddle.distributed.get_world_size() > 1:
        model = fleet.distributed_model(model)
        optimizer = fleet.distributed_optimizer(optimizer)

    if args.model_name_or_path not in pretrained_models_list:
        logger.info("Try to load checkpoint from %s " % args.model_name_or_path)
        opt_path = os.path.join(args.model_name_or_path, "model_state.pdopt")
        if os.path.exists(opt_path):
            opt_dict = paddle.load(opt_path)
            optimizer.set_state_dict(opt_dict)
        else:
            logger.warning("No optimizer checkpoint file found in %s." %
                           opt_path)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        files = get_train_data_file(args)
        files.sort()
        num_files = len(files)
        for f_id in range(num_files):
            data_file = files[f_id]
            train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
                args, [data_file],
                local_rank=local_rank,
                data_world_size=data_world_size,
                data_world_rank=data_world_rank,
                eos_id=tokenizer.eos_token_id)
            # Bug fix: call valid_data_loader() once here; otherwise each
            # evaluation would re-create the dataloader and start a new random one.
            valid_data_loader = valid_data_loader()
            test_data_loader = test_data_loader()

            # time count
            train_reader_cost = 0.0
            train_run_cost = 0.0
            reader_start = time.time()
            for step, batch in enumerate(train_data_loader()):
                train_reader_cost += time.time() - reader_start
                train_start = time.time()

                global_step += 1
                tokens, loss_mask, position_ids, labels = batch

                loss_mask.stop_gradient = True
                labels.stop_gradient = True
                position_ids.stop_gradient = True

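                # Without pipeline parallelism, micro-batches are accumulated
                # manually below; with pp_degree > 1, model.train_batch handles
                # micro-batch scheduling internally.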
                if args.pp_degree == 1:
                    # In ParallelMode of DataParallel, 'no_sync' can be used for improving
                    # performance of model by gradient accumulation.
                    loss = 0.0
                    for i in range(accumulate_steps):
                        start_index = i * args.micro_batch_size
                        end_index = start_index + args.micro_batch_size
                        with paddle.amp.auto_cast(
                                args.use_pure_fp16,
                                custom_black_list=[
                                    "reduce_sum",
                                    "c_softmax_with_cross_entropy",
                                    "elementwise_div"
                                ],
                                level='O2'):
                            preds = model(
                                tokens[start_index:end_index, :],
                                position_ids[start_index:end_index, :])
                            loss_mbs = criterion(
                                preds, labels[start_index:end_index, :],
                                loss_mask[start_index:end_index, :])
                        loss_mbs = loss_mbs / accumulate_steps
                        if args.use_pure_fp16:
                            scaler.scale(loss_mbs).backward()
                        else:
                            loss_mbs.backward()
                        loss = loss + loss_mbs

                    if args.use_pure_fp16:
                        if args.sharding_stage in [2, 3]:
                            scaler.step(optimizer)
                            scaler.update()
                        else:
                            scaler.minimize(optimizer, loss)
                    else:
                        optimizer.step()

                    if lr_scheduler is not None:
                        lr_scheduler.step()

                    optimizer.clear_grad()

                else:
                    data = [(tokens, position_ids), (labels, loss_mask)]
                    with paddle.amp.auto_cast(
                            args.use_pure_fp16,
                            custom_black_list=[
                                "reduce_sum", "c_softmax_with_cross_entropy",
                                "elementwise_div"
                            ],
                            level='O2'):
                        loss = model.train_batch(
                            data,
                            optimizer=optimizer,
                            lr_scheduler=lr_scheduler,
                            scaler=scaler if args.use_pure_fp16 else None)

                # Sync for accurate profiling time; removing it may be slightly faster
                paddle.device.cuda.synchronize()
                train_run_cost += time.time() - train_start
                # Profile for model benchmark
                profiler.add_profiler_step(args.profiler_options)

                if global_step % args.logging_freq == 0:
                    avg_loss = loss.numpy()
                    speed = args.logging_freq / (
                        train_reader_cost + train_run_cost)
                    avg_reader_cost = train_reader_cost / args.logging_freq

                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %.9f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, speed: %.2f step/s, ips: %.0f tokens/s, ips_per_card: %.0f tokens/s, learning rate: %.5e"
                        % (global_step, epoch, step, avg_loss, avg_reader_cost,
                           1. / speed, speed, speed * default_global_tokens_num,
                           speed * default_global_tokens_num / nranks,
                           optimizer.get_lr()))
                    log_writer.add_scalar("loss", float(loss), global_step)
                    log_writer.add_scalar("learning_rate",
                                          optimizer.get_lr(), global_step)

                    tic_train = time.time()
                    train_reader_cost = 0.0
                    train_run_cost = 0.0

                if args.check_accuracy:
                    if global_step >= args.max_steps:
                        return
                    else:
                        continue

                if global_step % args.eval_freq == 0:
                    # Since the valid data is broadcast to all devices, we evaluate on all devices.
                    run_evaluate(args, valid_data_loader, model, criterion,
                                 args.eval_iters, log_writer, global_step,
                                 epoch, "valid")

                # TODO: 1. merge parameters while saving model. 2. ensure that the model is saved and loaded correctly
                # only dp_rank = 0 save model
                if (global_step % args.save_steps == 0 or
                        global_step >= args.max_steps) and dp_rank == 0:

                    model_to_save = model._layers if paddle.distributed.get_world_size(
                    ) > 1 and args.sharding_stage not in [2, 3] else model
                    output_dir = os.path.join(args.output_dir,
                                              "step_%d" % global_step)
                    os.makedirs(output_dir, exist_ok=True)

                    logger.info("Save model to %s" % output_dir)

                    if args.pp_degree > 1:
                        if mp_rank == 0 and sharding_rank == 0 and pp_rank == 0:
                            tokenizer.save_pretrained(output_dir)
                        model_to_save.save_state_dict(output_dir)
                        paddle.save(
                            optimizer.state_dict(),
                            os.path.join(
                                output_dir,
                                "model_state_mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}.pdopt".
                                format(mp_rank, sharding_rank, pp_rank)))
                    else:
                        if args.sharding_stage == 3:
                            # If parameters need to be converted to CPU, add convert2cpu=True
                            model_to_save.get_all_parameters(convert2cpu=False)
                        if mp_rank == 0 and sharding_rank == 0:
                            tokenizer.save_pretrained(output_dir)
                        model_to_save.save_pretrained(output_dir)
                        paddle.save(
                            optimizer.state_dict(),
                            os.path.join(
                                output_dir,
                                "model_state_mp_{:0>2d}_sharding_{:0>2d}.pdopt".
                                format(mp_rank, sharding_rank)))

                if global_step >= args.max_steps:
                    run_evaluate(args, test_data_loader, model, criterion,
                                 args.test_iters, log_writer, global_step,
                                 epoch, "test")
                    logger.info("The training process is complete.")
                    del train_data_loader
                    return

                reader_start = time.time()

            del train_data_loader
Code Example #13
File: trainer.py  Project: lvjian0706/PaddleDetection
    def train(self, validate=False):
        assert self.mode == 'train', "Model not in 'train' mode"
        Init_mark = False

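        # Wrap the model/optimizer with fleet when fleet training is enabled in
        # the config; otherwise fall back to paddle.DataParallel for multi-GPU.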
        model = self.model
        if self.cfg.get('fleet', False):
            model = fleet.distributed_model(model)
            self.optimizer = fleet.distributed_optimizer(self.optimizer)
        elif self._nranks > 1:
            find_unused_parameters = self.cfg[
                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
            model = paddle.DataParallel(
                self.model, find_unused_parameters=find_unused_parameters)

        # initialize the fp16 (AMP) gradient scaler
        if self.cfg.get('fp16', False):
            scaler = amp.GradScaler(enable=self.cfg.use_gpu,
                                    init_loss_scaling=1024)

        self.status.update({
            'epoch_id': self.start_epoch,
            'step_id': 0,
            'steps_per_epoch': len(self.loader)
        })

        self.status['batch_time'] = stats.SmoothedValue(self.cfg.log_iter,
                                                        fmt='{avg:.4f}')
        self.status['data_time'] = stats.SmoothedValue(self.cfg.log_iter,
                                                       fmt='{avg:.4f}')
        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

        if self.cfg.get('print_flops', False):
            self._flops(self.loader)
        profiler_options = self.cfg.get('profiler_options', None)

        self._compose_callback.on_train_begin(self.status)

        for epoch_id in range(self.start_epoch, self.cfg.epoch):
            self.status['mode'] = 'train'
            self.status['epoch_id'] = epoch_id
            self._compose_callback.on_epoch_begin(self.status)
            self.loader.dataset.set_epoch(epoch_id)
            model.train()
            iter_tic = time.time()
            for step_id, data in enumerate(self.loader):
                self.status['data_time'].update(time.time() - iter_tic)
                self.status['step_id'] = step_id
                profiler.add_profiler_step(profiler_options)
                self._compose_callback.on_step_begin(self.status)
                data['epoch_id'] = epoch_id

                if self.cfg.get('fp16', False):
                    with amp.auto_cast(enable=self.cfg.use_gpu):
                        # model forward
                        outputs = model(data)
                        loss = outputs['loss']

                    # model backward
                    scaled_loss = scaler.scale(loss)
                    scaled_loss.backward()
                    # in dygraph mode, optimizer.minimize is equal to optimizer.step
                    scaler.minimize(self.optimizer, scaled_loss)
                else:
                    # model forward
                    outputs = model(data)
                    loss = outputs['loss']
                    # model backward
                    loss.backward()
                    self.optimizer.step()
                curr_lr = self.optimizer.get_lr()
                self.lr.step()
                if self.cfg.get('unstructured_prune'):
                    self.pruner.step()
                self.optimizer.clear_grad()
                self.status['learning_rate'] = curr_lr

                if self._nranks < 2 or self._local_rank == 0:
                    self.status['training_staus'].update(outputs)

                self.status['batch_time'].update(time.time() - iter_tic)
                self._compose_callback.on_step_end(self.status)
                if self.use_ema:
                    self.ema.update(self.model)
                iter_tic = time.time()

            # apply ema weight on model
            if self.use_ema:
                weight = copy.deepcopy(self.model.state_dict())
                self.model.set_dict(self.ema.apply())
            if self.cfg.get('unstructured_prune'):
                self.pruner.update_params()

            self._compose_callback.on_epoch_end(self.status)

            if validate and (self._nranks < 2 or self._local_rank == 0) \
                    and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \
                             or epoch_id == self.end_epoch - 1):
                if not hasattr(self, '_eval_loader'):
                    # build evaluation dataset and loader
                    self._eval_dataset = self.cfg.EvalDataset
                    self._eval_batch_sampler = \
                        paddle.io.BatchSampler(
                            self._eval_dataset,
                            batch_size=self.cfg.EvalReader['batch_size'])
                    self._eval_loader = create('EvalReader')(
                        self._eval_dataset,
                        self.cfg.worker_num,
                        batch_sampler=self._eval_batch_sampler)
                # if validation in training is enabled, metrics should be re-init
                # Init_mark makes sure this code will only execute once
                if validate and Init_mark == False:
                    Init_mark = True
                    self._init_metrics(validate=validate)
                    self._reset_metrics()
                with paddle.no_grad():
                    self.status['save_best_model'] = True
                    self._eval_with_loader(self._eval_loader)

            # restore origin weight on model
            if self.use_ema:
                self.model.set_dict(weight)

        self._compose_callback.on_train_end(self.status)
Code Example #14
        verbose=True)
    clip = paddle.nn.ClipGradByValue(min=-CLIP, max=CLIP)
    strategy = fleet.DistributedStrategy()
    OPTIMIZER_decay = optim.Momentum(parameters=backbone_paras_wo_bn +
                                     head_paras_wo_bn,
                                     learning_rate=scheduler,
                                     weight_decay=WEIGHT_DECAY,
                                     momentum=MOMENTUM)
    OPTIMIZER_decay = fleet.distributed_optimizer(optimizer=OPTIMIZER_decay,
                                                  strategy=strategy)
    OPTIMIZER = optim.Momentum(parameters=backbone_paras_only_bn,
                               learning_rate=scheduler,
                               momentum=MOMENTUM)
    OPTIMIZER = fleet.distributed_optimizer(optimizer=OPTIMIZER,
                                            strategy=strategy)
    BACKBONE = fleet.distributed_model(BACKBONE)
    HEAD = fleet.distributed_model(HEAD)
    logger.info("=" * 60)
    logger.info(OPTIMIZER)
    logger.info("Optimizer Generated")
    logger.info("=" * 60)

    # optionally resume from a checkpoint
    if BACKBONE_RESUME_ROOT and HEAD_RESUME_ROOT:
        logger.info("=" * 60)
        if os.path.isfile(BACKBONE_RESUME_ROOT) and os.path.isfile(
                HEAD_RESUME_ROOT):
            logger.info("Loading Backbone Checkpoint '{}'".format(
                BACKBONE_RESUME_ROOT))
            load_weight(model=BACKBONE, weight_path=BACKBONE_RESUME_ROOT)
            logger.info(
Code Example #15
    ema = None
    if cfg.use_ema:
        ema = ExponentialMovingAverage(model, cfg.ema_decay)
        ema.register()

    # Distributed training and mixed-precision training
    # If you have questions, see the docs: https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/06_distributed_training/cluster_quick_start_cn.html
    _nranks = dist.get_world_size()
    _local_rank = dist.get_rank()
    use_fleet = cfg.train_cfg.get('fleet', False)
    use_fp16 = cfg.train_cfg.get('fp16', False)
    if use_fleet:
        # Initialize the fleet environment
        fleet.init(is_collective=True)
        # Get the distributed model through the fleet API to support distributed training
        model = fleet.distributed_model(model)
        optimizer = fleet.distributed_optimizer(optimizer)
    elif _nranks > 1:
        find_unused_parameters = cfg.train_cfg['find_unused_parameters'] \
            if 'find_unused_parameters' in cfg.train_cfg else False
        model = paddle.DataParallel(
            model, find_unused_parameters=find_unused_parameters)
    if use_fp16:
        # scaler = amp.GradScaler(enable=use_gpu, init_loss_scaling=2.**16,
        #                         incr_every_n_steps=2000, use_dynamic_loss_scaling=True)
        scaler = amp.GradScaler(enable=use_gpu, init_loss_scaling=1024)

    print('\n=============== fleet and fp16 ===============')
    print('use_fleet: %d' % use_fleet)
    print('use_fp16: %d' % use_fp16)
    print('_nranks: %d' % _nranks)
Code Example #16
    def test_pp_model(self):
        hcg = fleet.get_hybrid_communicate_group()
        word_size = hcg.get_model_parallel_world_size()
        dp_id = hcg.get_data_parallel_rank()
        pp_id = hcg.get_stage_id()
        rank_id = dist.get_rank()
        set_random_seed(1024, dp_id, rank_id)

        #construct model a
        model_a = SimpleNet()
        scheduler_a = paddle.optimizer.lr.PiecewiseDecay(
            boundaries=[2, 3, 4],
            values=[0.01, 0.02, 0.03, 0.04],
            verbose=True)
        optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a,
                                           parameters=model_a.parameters())

        model_b = SimpleNetPipe(topology=hcg.topology())

        scheduler_b = paddle.optimizer.lr.PiecewiseDecay(
            boundaries=[2, 3, 4],
            values=[0.01, 0.02, 0.03, 0.04],
            verbose=True)
        optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b,
                                           parameters=model_b.parameters())
        model_b = fleet.distributed_model(model_b)
        optimizer_b = fleet.distributed_optimizer(optimizer_b)

        param_len = len(model_a.parameters())

        parameters = []
        for param in model_a.parameters():
            parameters.append(param.numpy())

        model_b_params = model_b.parameters()

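        # Copy the subset of model_a's weights that belongs to this pipeline
        # stage into model_b so both models start from the same parameters.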
        if pp_id == 0:
            model_b_params[0].set_value(parameters[2])
            model_b_params[1].set_value(parameters[0])

        else:
            model_b_params[0].set_value(parameters[2])
            model_b_params[1].set_value(parameters[1])

        for step in range(5):
            x1_data = np.random.randint(0, vocab_size, size=[batch_size, 1])
            x2_data = np.random.randint(0, vocab_size, size=[batch_size, 1])
            y1_data = np.random.randint(0, hidden_size, size=[batch_size, 1])

            x1 = paddle.to_tensor(x1_data)
            x2 = paddle.to_tensor(x2_data)
            y1 = paddle.to_tensor(y1_data)

            x1.stop_gradient = True
            x2.stop_gradient = True
            y1.stop_gradient = True

            loss_a = model_a(x1, x2, y1)
            loss_a.backward()

            optimizer_a.step()
            optimizer_a.clear_grad()
            scheduler_a.step()

            loss_b = model_b.train_batch([(x1, x2), (y1, )], optimizer_b,
                                         scheduler_b)

            print("loss", loss_a.numpy(), loss_b.numpy())
            np.testing.assert_allclose(loss_a.numpy(), loss_b.numpy())
Code Example #17
File: run_pretrain.py  Project: tianxin1860/PaddleNLP
def do_train(args):
    paddle.set_device(args.device)

    worker_index = paddle.distributed.get_rank()
    worker_num = paddle.distributed.get_world_size()
    local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0))

    if worker_num > 1:
        paddle.distributed.init_parallel_env()

    if args.dp_degree * args.sharding_degree == 1:
        args.dp_degree = worker_num
        args.sharding_degree = 1

    args_post_process(args, worker_num)

    logger.info('{:20}:{}'.format("paddle commit id", paddle.version.commit))
    for arg in vars(args):
        logger.info('{:20}:{}'.format(arg, getattr(args, arg)))

    strategy = fleet.DistributedStrategy()
    strategy.hybrid_configs = {
        "dp_degree": args.dp_degree,
        "mp_degree": 1,
        "pp_degree": 1,
        "sharding_degree": 1
    }

    fleet.init(is_collective=True, strategy=strategy)
    hcg = fleet.get_hybrid_communicate_group()

    # Create the random seed for the worker
    set_seed(args)

    assert args.dp_degree * args.sharding_degree == worker_num, \
        "The product of degree num should be equal to worker_num."

    # Create the log writer
    log_writer = None
    if worker_index == 0:
        log_writer = LogWriter(os.path.join(args.output_dir, default_logdir()))

    # Define the input data in the static mode
    base_class, model_class, criterion_class, tokenizer_class = MODEL_CLASSES[
        args.model_type]
    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    # load config in checkpoint
    global_step = 0
    consumed_samples = 0
    checkpoint_dir = os.path.join(args.output_dir, "model_last")
    if os.path.exists(checkpoint_dir):
        if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")):
            with open(os.path.join(checkpoint_dir, "./config.yml"), "r") as f:
                step_config = yaml.load(f, Loader=yaml.FullLoader)
                assert step_config[
                    "global_batch_size"] == args.global_batch_size, "Please ensure checkpoint global batch size is the same. Folder: {}".format(
                        checkpoint_dir)
                consumed_samples = step_config["consumed_samples"]
                global_step = step_config["global_step"]

    if args.model_name_or_path in pretrained_models_list:
        model_config = model_class.pretrained_init_configuration[
            args.model_name_or_path]
        model_config["hidden_dropout_prob"] = args.hidden_dropout_prob
        model_config[
            "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
        model = model_class(base_class(**model_config))
    else:
        model = model_class.from_pretrained(
            args.model_name_or_path,
            hidden_dropout_prob=args.hidden_dropout_prob,
            attention_probs_dropout_prob=args.attention_probs_dropout_prob)

    criterion = criterion_class()

    if worker_index == 0:
        # log the model config and args
        model_config_json = json.dumps(model.get_model_config(),
                                       ensure_ascii=False,
                                       indent=2)
        log_writer.add_text("model_config", model_config_json)
        args_dict = {"paddle commit id": str(paddle.version.commit)}
        for arg in vars(args):
            args_dict[arg] = str(getattr(args, arg))
        log_writer.add_text("args", json.dumps(args_dict, indent=2))

    # Create the learning_rate scheduler and optimizer
    if args.decay_steps is None:
        args.decay_steps = args.max_steps
    assert args.warmup_rate <= 1.0 and args.warmup_rate >= 0.0, "warmup_rate should be in [0, 1]"
    args.warmup_steps = args.warmup_rate * args.max_steps

    lr_scheduler = LinearAnnealingWithWarmupDecay(
        args.max_lr,
        args.min_lr,
        warmup_step=args.warmup_steps,
        decay_step=args.decay_steps,
        last_epoch=global_step)

    clip = None
    if args.grad_clip > 0:
        clip = paddle.fluid.clip.GradientClipByGlobalNorm(
            clip_norm=args.grad_clip)

    decay_param = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    logger.info("Using paddle.optimizer.AdamW.")
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler
        if lr_scheduler is not None else args.max_lr,
        beta1=args.adam_beta1,
        beta2=args.adam_beta2,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_param,
        multi_precision=args.use_amp)

    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
        scaler = fleet.distributed_scaler(scaler)
        model = paddle.amp.decorate(models=model,
                                    level='O2',
                                    save_dtype='float32')

    if paddle.distributed.get_world_size() > 1:
        model = fleet.distributed_model(model)
        optimizer = fleet.distributed_optimizer(optimizer)

    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    data_file = get_train_data_file(args)

    train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
        args,
        data_file,
        tokenizer,
        data_world_size=worker_num,
        data_world_rank=worker_index,
        max_seq_len=args.max_seq_len,
        current_step=global_step)

    # load checkpoint vars
    if os.path.exists(checkpoint_dir):
        if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")):
            logger.info("Try to load checkpoint from %s " % checkpoint_dir)
            opt_path = os.path.join(checkpoint_dir, "model_state.pdopt")
            params_path = os.path.join(checkpoint_dir, "model_state.pdparams")

            if os.path.exists(opt_path):
                opt_dict = paddle.load(opt_path)
                optimizer.set_state_dict(opt_dict)
                model_dict = paddle.load(params_path)
                model.set_state_dict(model_dict)
            else:
                logger.warning("No optimizer checkpoint file found in %s." %
                               opt_path)
            logger.info(
                "Checkpoint loaded from global step: {}".format(global_step))

    loss_global = {
        "loss": paddle.to_tensor(0.0),
        "lm_loss": paddle.to_tensor(0.0),
        "sop_loss": paddle.to_tensor(0.0),
    }
    tic_train = time.time()
    while True:
        # Call valid_data_loader() once here; otherwise each evaluation would
        # re-create the dataloader and start a new random one.
        valid_data_loader = valid_data_loader()
        test_data_loader = test_data_loader()

        # time count
        train_reader_cost = 0.0
        train_run_cost = 0.0
        reader_start = time.time()

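        # Pretraining loop: global_step advances once every accumulate_steps
        # batches, with periodic logging, evaluation, and checkpointing.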
        for step, batch in enumerate(train_data_loader()):
            train_reader_cost += time.time() - reader_start
            train_start = time.time()

            # 0. input_ids,
            # 1. segment_ids,
            # 2. input_mask,
            # 3. masked_lm_positions,
            # 4. masked_lm_labels,
            # 5. next_sentence_labels

            input_ids, segment_ids, input_mask, masked_lm_positions, \
            masked_lm_labels, next_sentence_labels = batch

            with paddle.amp.auto_cast(args.use_amp,
                                      custom_black_list=[
                                          "reduce_sum",
                                          "c_softmax_with_cross_entropy",
                                          "elementwise_div"
                                      ],
                                      level='O2'):

                # Forward pass of the ERNIE pretraining model
                prediction_scores, seq_relationship_score = model(
                    input_ids=input_ids,
                    token_type_ids=segment_ids,
                    position_ids=None,
                    attention_mask=input_mask,
                    masked_positions=masked_lm_positions)

                lm_loss, sop_loss = criterion(prediction_scores,
                                              seq_relationship_score,
                                              masked_lm_labels,
                                              next_sentence_labels)
                loss = lm_loss + sop_loss

            if args.use_amp:
                scaler.scale(loss).backward()
                scaler.minimize(optimizer, loss)
            else:
                loss.backward()
                optimizer.step()

            optimizer.clear_grad()
            train_run_cost += time.time() - train_start

            # Only advance global_step once every accumulate_steps micro-batches
            if (step + 1) % args.accumulate_steps != 0:
                continue

            global_step += 1

            loss_global["loss"] += loss.detach()
            loss_global["lm_loss"] += lm_loss.detach()
            loss_global["sop_loss"] += sop_loss.detach()

            if global_step % args.logging_freq == 0:
                log_info_dict = dict()
                log_info_dict["global_step"] = global_step
                for k, v in loss_global.items():
                    log_info_dict[k] = all_gather(v) / args.logging_freq
                    v.subtract_(v)
                if worker_index == 0:
                    speed = args.logging_freq / (time.time() - tic_train)
                    log_info_dict["learning_rate"] = lr_scheduler.get_lr()
                    log_info_dict["steps_per_second"] = speed
                    log_info_dict[
                        "samples_per_second"] = speed * args.global_batch_size

                    for k, v in log_info_dict.items():
                        log_writer.add_scalar("train/%s" % k, v, global_step)

                    common_loginfo = "global step %d, loss: %.9f, lm_loss: %.6f, sop_loss: %.6f, speed: %.2f steps/s, ips: %.2f seqs/s, learning rate: %.5e" % (
                        global_step, log_info_dict["loss"],
                        log_info_dict["lm_loss"], log_info_dict["sop_loss"],
                        speed, log_info_dict["samples_per_second"],
                        log_info_dict["learning_rate"])

                    addition_info = ""
                    if args.use_amp:
                        amp_info = {
                            "loss_scaling": scaler._scale.item(),
                            "incr_count": scaler._incr_count,
                            "decr_count": scaler._decr_count
                        }
                        addition_info = ", ".join("%s: %d" % (k, v)
                                                  for k, v in amp_info.items())
                        addition_info = " " + addition_info
                        for k, v in amp_info.items():
                            log_writer.add_scalar("amp/%s" % k, v, global_step)

                    logger.info(common_loginfo + addition_info)

                tic_train = time.time()

            if lr_scheduler is not None:
                lr_scheduler.step()

            if global_step % args.eval_freq == 0:
                # TODO: check the validation input data

                run_evaluate(valid_data_loader,
                             model,
                             criterion,
                             args.eval_iters,
                             log_writer,
                             global_step,
                             args,
                             task_name="valid")
                tic_train = time.time()

            def save_ckpt(output_dir, model, tokenizer, args, global_step):
                step_config = {
                    "model_name": args.model_name_or_path,
                    "global_step": global_step,
                    "global_batch_size": args.global_batch_size,
                    "consumed_samples": global_step * args.global_batch_size,
                }

                logger.debug("saving models to {}".format(output_dir))
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model

                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                paddle.save(optimizer.state_dict(),
                            os.path.join(output_dir, "model_state.pdopt"))

                with open(os.path.join(output_dir, "config.yml"), "w") as f:
                    yaml.dump(step_config,
                              f,
                              encoding='utf-8',
                              allow_unicode=True)

            if global_step % args.save_steps == 0 or global_step >= args.max_steps:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                if worker_index == 0:
                    save_ckpt(output_dir, model, tokenizer, args, global_step)

                if worker_num > 1:
                    paddle.distributed.barrier()
                tic_train = time.time()

            if global_step % args.checkpoint_steps == 0:
                output_dir = os.path.join(args.output_dir, "model_last")
                if worker_index == 0:
                    if not os.path.exists(output_dir):
                        os.mkdir(output_dir)
                    output_dir_bak = os.path.join(args.output_dir,
                                                  "model_last_bak")
                    if os.path.exists(output_dir):
                        if os.path.exists(output_dir_bak):
                            shutil.rmtree(output_dir_bak)
                        shutil.move(output_dir, output_dir_bak)
                        os.mkdir(output_dir)
                    save_ckpt(output_dir, model, tokenizer, args, global_step)

                if worker_num > 1:
                    paddle.distributed.barrier()

            if global_step >= args.max_steps:
                run_evaluate(test_data_loader,
                             model,
                             criterion,
                             args.test_iters,
                             log_writer,
                             global_step,
                             args,
                             task_name="test")
                del train_data_loader
                return
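
A minimal, self-contained sketch (not part of the example above) of the AMP-with-fleet pattern the loop relies on: wrap the model, optimizer, and GradScaler with fleet, run the forward pass under auto_cast, then scale, backward, and minimize. The tiny Linear model and random data are placeholders, and a GPU environment launched with paddle.distributed.launch is assumed.

import numpy as np
import paddle
from paddle.distributed import fleet

fleet.init(is_collective=True)
model = fleet.distributed_model(paddle.nn.Linear(32, 10))  # placeholder model
optimizer = fleet.distributed_optimizer(
    paddle.optimizer.AdamW(parameters=model.parameters()))
scaler = fleet.distributed_scaler(paddle.amp.GradScaler(init_loss_scaling=2**15))

for _ in range(10):
    x = paddle.to_tensor(np.random.randn(8, 32).astype('float32'))
    y = paddle.to_tensor(np.random.randint(0, 10, (8, 1)).astype('int64'))
    with paddle.amp.auto_cast():
        loss = paddle.nn.functional.cross_entropy(model(x), y)
    scaler.scale(loss).backward()     # scale the loss, then backpropagate
    scaler.minimize(optimizer, loss)  # unscale, step, and update the loss scale
    optimizer.clear_grad()
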
Code example #18
File: trainer.py Project: zuisom/PaddleDetection
    def train(self, validate=False):
        assert self.mode == 'train', "Model not in 'train' mode"

        # if no weights were explicitly loaded, fall back to the backbone pretrain weights
        if not self._weights_loaded:
            self.load_weights(self.cfg.pretrain_weights)

        model = self.model
        if self.cfg.fleet:
            model = fleet.distributed_model(model)
            self.optimizer = fleet.distributed_optimizer(
                self.optimizer).user_defined_optimizer
        elif self._nranks > 1:
            model = paddle.DataParallel(self.model)

        # initialize the AMP (fp16) gradient scaler
        if self.cfg.fp16:
            scaler = amp.GradScaler(enable=self.cfg.use_gpu,
                                    init_loss_scaling=1024)

        self.status.update({
            'epoch_id': self.start_epoch,
            'step_id': 0,
            'steps_per_epoch': len(self.loader)
        })

        self.status['batch_time'] = stats.SmoothedValue(self.cfg.log_iter,
                                                        fmt='{avg:.4f}')
        self.status['data_time'] = stats.SmoothedValue(self.cfg.log_iter,
                                                       fmt='{avg:.4f}')
        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

        for epoch_id in range(self.start_epoch, self.cfg.epoch):
            self.status['mode'] = 'train'
            self.status['epoch_id'] = epoch_id
            self._compose_callback.on_epoch_begin(self.status)
            self.loader.dataset.set_epoch(epoch_id)
            model.train()
            iter_tic = time.time()
            for step_id, data in enumerate(self.loader):
                self.status['data_time'].update(time.time() - iter_tic)
                self.status['step_id'] = step_id
                self._compose_callback.on_step_begin(self.status)

                if self.cfg.fp16:
                    with amp.auto_cast(enable=self.cfg.use_gpu):
                        # model forward
                        outputs = model(data)
                        loss = outputs['loss']

                    # model backward
                    scaled_loss = scaler.scale(loss)
                    scaled_loss.backward()
                    # in dygraph mode, optimizer.minimize is equivalent to optimizer.step
                    scaler.minimize(self.optimizer, scaled_loss)
                else:
                    # model forward
                    outputs = model(data)
                    loss = outputs['loss']
                    # model backward
                    loss.backward()
                    self.optimizer.step()

                curr_lr = self.optimizer.get_lr()
                self.lr.step()
                self.optimizer.clear_grad()
                self.status['learning_rate'] = curr_lr

                if self._nranks < 2 or self._local_rank == 0:
                    self.status['training_staus'].update(outputs)

                self.status['batch_time'].update(time.time() - iter_tic)
                self._compose_callback.on_step_end(self.status)
                iter_tic = time.time()

            self._compose_callback.on_epoch_end(self.status)

            if validate and (self._nranks < 2 or self._local_rank == 0) \
                    and (epoch_id % self.cfg.snapshot_epoch == 0 \
                             or epoch_id == self.end_epoch - 1):
                if not hasattr(self, '_eval_loader'):
                    # build evaluation dataset and loader
                    self._eval_dataset = self.cfg.EvalDataset
                    self._eval_batch_sampler = \
                        paddle.io.BatchSampler(
                            self._eval_dataset,
                            batch_size=self.cfg.EvalReader['batch_size'])
                    self._eval_loader = create('EvalReader')(
                        self._eval_dataset,
                        self.cfg.worker_num,
                        batch_sampler=self._eval_batch_sampler)
                with paddle.no_grad():
                    self._eval_with_loader(self._eval_loader)
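
A hedged sketch of the enable-flag idiom train() uses for fp16: when enable=False, both amp.auto_cast and amp.GradScaler behave as no-ops, so the same code path serves CPU and GPU runs. The Linear layer and random input below are placeholders.

import paddle
from paddle import amp

use_gpu = paddle.is_compiled_with_cuda()
paddle.set_device('gpu' if use_gpu else 'cpu')

layer = paddle.nn.Linear(16, 4)
opt = paddle.optimizer.SGD(parameters=layer.parameters())
scaler = amp.GradScaler(enable=use_gpu, init_loss_scaling=1024)

x = paddle.randn([8, 16])
with amp.auto_cast(enable=use_gpu):  # no-op when disabled
    loss = layer(x).mean()
scaler.scale(loss).backward()        # returns the loss unchanged when disabled
scaler.minimize(opt, loss)           # falls back to a plain optimizer update
opt.clear_grad()
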
Code example #19
def train_mlp(model,
              sharding_stage,
              batch_size=100,
              use_pure_fp16=False,
              accumulate_grad=False,
              opt_group=False,
              save_model=False):
    if sharding_stage == "dp":
        hcg = fleet.get_hybrid_communicate_group()
        group = hcg.get_check_parallel_group()
    else:
        group = paddle.distributed.new_group([0, 1])
    if opt_group:
        optimizer = optimizer_setting(model=model,
                                      use_pure_fp16=use_pure_fp16,
                                      opt_group=opt_group)
    else:
        optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)

    if sharding_stage == 2:
        optimizer = ShardingOptimizerStage2(params=model.parameters(),
                                            optim=optimizer,
                                            group=group)

        model = ShardingStage2(model,
                               optimizer,
                               group=group,
                               buffer_max_size=2**21)
    else:
        optimizer = fleet.distributed_optimizer(optimizer)
        model = fleet.distributed_model(model)

    train_reader = paddle.batch(reader_decorator(),
                                batch_size=batch_size,
                                drop_last=True)

    train_loader = paddle.io.DataLoader.from_generator(capacity=32,
                                                       use_double_buffer=True,
                                                       iterable=True,
                                                       return_list=True,
                                                       use_multiprocess=True)
    train_loader.set_sample_list_generator(train_reader)

    if sharding_stage == 2:
        model.to(device="gpu")

    for eop in range(epoch):
        model.train()

        for batch_id, data in enumerate(train_loader()):
            img, label = data
            label.stop_gradient = True
            img.stop_gradient = True

            out = model(img)
            loss = paddle.nn.functional.cross_entropy(input=out, label=label)

            avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
            if batch_size == 20:
                avg_loss = avg_loss / 5
            avg_loss.backward()

            if not accumulate_grad:
                optimizer.step()
                optimizer.clear_grad()

        if accumulate_grad:
            optimizer.step()
            optimizer.clear_grad()

    if save_model:
        return model, optimizer
    return model.parameters()
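
Independently of the sharding test above, the accumulate_grad branch boils down to the usual manual gradient-accumulation recipe: divide each micro-batch loss by the number of accumulation steps, call backward every step so gradients add up, and apply the optimizer only once per accumulation window. A minimal stand-alone sketch with placeholder shapes:

import paddle

accum_steps = 4
layer = paddle.nn.Linear(8, 2)
opt = paddle.optimizer.SGD(parameters=layer.parameters())

for step in range(16):
    x = paddle.randn([4, 8])
    loss = layer(x).mean() / accum_steps  # keep the effective loss scale constant
    loss.backward()                       # gradients accumulate across micro-batches
    if (step + 1) % accum_steps == 0:
        opt.step()
        opt.clear_grad()
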
Code example #20

if __name__ == '__main__':

    print(config.config)
    train_dataset = Loader(path=config.dataset)
    name_datasets = config.dataset.split('/')[-1]
    Recmodel = NGCF(config.config, train_dataset)
    if config.config['multigpu']:
        print('using fleet multigpu training', Recmodel)
        dist.init_parallel_env()
        Recmodel = paddle.DataParallel(Recmodel)
    if config.config['multicpu']:
        fleet.init(is_collective=True)
        # NOTE: `optimizer` is not created earlier in this snippet; it must be
        # constructed before it is wrapped here.
        optimizer = fleet.distributed_optimizer(optimizer)
        Recmodel = fleet.distributed_model(Recmodel)
        print('using fleet multicpu training', Recmodel)
    Neg_k = 1
    bpr = BPRLoss(Recmodel, config.config)
    f = open(f'logger/train_logger_{name_datasets}.txt', 'w')
    f_test = open(f'logger/test_logger_{name_datasets}.txt', 'w')

    for epoch in range(config.TRAIN_epochs):
        if epoch % 10 == 0:
            cprint("[TEST]")
            preds = predict(train_dataset,
                            Recmodel,
                            epoch,
                            multigpu=config.config['multigpu'],
                            multicpu=config.config['multicpu'])
            result = Test(train_dataset,
Code example #21
    def test_pp_model(self):
        hcg = fleet.get_hybrid_communicate_group()
        word_size = hcg.get_model_parallel_world_size()
        dp_id = hcg.get_data_parallel_rank()
        pp_id = hcg.get_stage_id()
        rank_id = dist.get_rank()
        set_random_seed(1024, dp_id, rank_id)

        grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)

        # construct model a
        model_a = AlexNet(10)
        scheduler_a = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
                                                         values=[0.001, 0.002],
                                                         verbose=True)
        optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a,
                                           grad_clip=grad_clip,
                                           parameters=model_a.parameters())

        scaler_a = paddle.amp.GradScaler(init_loss_scaling=2**5)

        param_len = len(model_a.parameters())
        parameters = []
        for param in model_a.parameters():
            parameters.append(param.numpy())

        # construct model b
        model_b = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size)
        scheduler_b = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
                                                         values=[0.001, 0.002],
                                                         verbose=True)
        optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b,
                                           grad_clip=grad_clip,
                                           parameters=model_b.parameters())
        model_b = fleet.distributed_model(model_b)
        optimizer_b = fleet.distributed_optimizer(optimizer_b)
        scaler_b = paddle.amp.GradScaler(init_loss_scaling=2**5)
        scaler_b = fleet.distributed_scaler(scaler_b)

        for idx, param in enumerate(model_b.parameters()):
            param.set_value(parameters[idx + pp_id * (param_len // 2)])

        # construct reader
        train_reader = paddle.batch(paddle.dataset.mnist.train(),
                                    batch_size=batch_size,
                                    drop_last=True)

        for step_id, data in enumerate(train_reader()):
            x_data = np.array([x[0] for x in data]).astype('float32').reshape(
                batch_size, 1, 28, 28)
            y_data = np.array([x[1] for x in data
                               ]).astype('int64').reshape(batch_size, 1)
            img = paddle.to_tensor(x_data)
            label = paddle.to_tensor(y_data)
            img.stop_gradient = True
            label.stop_gradient = True

            if step_id >= 5:
                return True

            with paddle.amp.auto_cast():
                loss_a = model_a(img, label)
                scaler_a.scale(loss_a).backward()
                scaler_a.minimize(optimizer_a, loss_a)
                optimizer_a.clear_grad()
                scheduler_a.step()

            with paddle.amp.auto_cast():
                loss_b = model_b.train_batch([img, label],
                                             optimizer_b,
                                             scheduler_b,
                                             scaler=scaler_b)

            print("loss: ", loss_a.numpy(), loss_b.numpy())
            np.testing.assert_allclose(loss_a.numpy(),
                                       loss_b.numpy(),
                                       rtol=5e-5)
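
The set_value loop above copies one contiguous half of model_a's flat parameter list into each pipeline stage. A plain-Python sketch of that index arithmetic for two stages (the parameter names are placeholders):

params = [f"w{i}" for i in range(8)]  # stand-in for model_a.parameters()
param_len = len(params)
for pp_id in (0, 1):
    stage = [params[idx + pp_id * (param_len // 2)]
             for idx in range(param_len // 2)]
    print(pp_id, stage)               # stage 0 -> w0..w3, stage 1 -> w4..w7
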
Code example #22
def distill_train(distill_model,
                  train_dataset,
                  val_dataset=None,
                  optimizer=None,
                  save_dir='output',
                  iters=10000,
                  batch_size=2,
                  resume_model=None,
                  save_interval=1000,
                  log_iters=10,
                  num_workers=0,
                  use_vdl=False,
                  losses=None,
                  distill_losses=None,
                  keep_checkpoint_max=5,
                  test_config=None,
                  fp16=False):
    """
    Launch training.

    Args:
        distill_model (nn.Layer): A distill model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How many iters to train the model. Default: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters between saving model snapshots during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
        losses (dict): A dict including 'types' and 'coef'. The length of 'coef' should be 1 or len(losses['types']).
            The 'types' item is a list of paddleseg.models.losses objects, and the 'coef' item is a list of the corresponding coefficients.
        distill_losses (dict): A dict including 'types' and 'coef'. The format of distill_losses is the same as losses.
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
        test_config(dict, optional): Evaluation config.
        fp16 (bool, optional): Whether to use AMP training. Not supported for now.
    """
    if fp16:
        raise RuntimeError("Distillation doesn't support amp training.")

    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank
    student_model = distill_model._student_models

    start_iter = 0
    if resume_model is not None:
        start_iter = resume(student_model, optimizer, resume_model)

    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        strategy = fleet.DistributedStrategy()
        strategy.find_unused_parameters = True
        fleet.init(is_collective=True, strategy=strategy)

        optimizer = fleet.distributed_optimizer(
            optimizer)  # the returned object is a fleet.Fleet instance
        ddp_distill_model = fleet.distributed_model(distill_model)

    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
        worker_init_fn=worker_init_fn,
    )

    if fp16:
        logger.info('use amp to train')
        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    avg_loss = 0.0
    avg_out_loss = 0.0
    avg_out_distill_loss = 0.0
    avg_feature_distill_loss = 0.0
    avg_out_loss_list = []
    iters_per_epoch = len(batch_sampler)
    best_mean_iou = -1.0
    best_model_iter = -1
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    save_models = deque()
    batch_start = time.time()

    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            reader_cost_averager.record(time.time() - batch_start)
            images = data[0]
            labels = data[1].astype('int64')
            edges = None
            if len(data) == 3:
                edges = data[2].astype('int64')
            if hasattr(distill_model,
                       'data_format') and distill_model.data_format == 'NHWC':
                images = images.transpose((0, 2, 3, 1))

            if fp16:
                with paddle.amp.auto_cast(
                        enable=True,
                        custom_white_list={
                            "elementwise_add", "batch_norm", "sync_batch_norm"
                        },
                        custom_black_list={'bilinear_interp_v2'}):
                    if nranks > 1:
                        logits_list = ddp_distill_model(images)
                    else:
                        logits_list = distill_model(images)
                    loss_list = loss_computation(logits_list=logits_list,
                                                 labels=labels,
                                                 losses=losses,
                                                 edges=edges)
                    loss = sum(loss_list)

                scaled = scaler.scale(loss)  # scale the loss
                scaled.backward()  # do backward
                if isinstance(optimizer, fleet.Fleet):
                    scaler.minimize(optimizer.user_defined_optimizer, scaled)
                else:
                    scaler.minimize(optimizer, scaled)  # update parameters
            else:
                if nranks > 1:
                    s_logits_list, t_logits_list, feature_distill_loss = ddp_distill_model(
                        images)
                else:
                    s_logits_list, t_logits_list, feature_distill_loss = distill_model(
                        images)

                out_loss_list = loss_computation(logits_list=s_logits_list,
                                                 labels=labels,
                                                 losses=losses,
                                                 edges=edges)
                out_loss = sum(out_loss_list)

                out_distill_loss_list = distill_loss_computation(
                    student_logits_list=s_logits_list,
                    teacher_logits_list=t_logits_list,
                    labels=labels,
                    losses=distill_losses,
                    edges=edges)
                out_distill_loss = sum(out_distill_loss_list)

                loss = out_loss + out_distill_loss + feature_distill_loss
                loss.backward()
                optimizer.step()

            lr = optimizer.get_lr()

            # update lr
            if isinstance(optimizer, fleet.Fleet):
                lr_sche = optimizer.user_defined_optimizer._learning_rate
            else:
                lr_sche = optimizer._learning_rate
            if isinstance(lr_sche, paddle.optimizer.lr.LRScheduler):
                lr_sche.step()

            distill_model.clear_gradients()
            avg_loss += loss.numpy()[0]
            avg_out_loss += out_loss.numpy()[0]
            avg_out_distill_loss += out_distill_loss.numpy()[0]
            avg_feature_distill_loss += feature_distill_loss.numpy()[0]
            if not avg_out_loss_list:
                avg_out_loss_list = [l.numpy() for l in out_loss_list]
            else:
                for i in range(len(out_loss_list)):
                    avg_out_loss_list[i] += out_loss_list[i].numpy()
            batch_cost_averager.record(time.time() - batch_start,
                                       num_samples=batch_size)

            if (iter) % log_iters == 0 and local_rank == 0:
                avg_loss /= log_iters
                avg_out_loss /= log_iters
                avg_out_distill_loss /= log_iters
                avg_feature_distill_loss /= log_iters
                avg_out_loss_list = [
                    l[0] / log_iters for l in avg_out_loss_list
                ]
                remain_iters = iters - iter
                avg_train_batch_cost = batch_cost_averager.get_average()
                avg_train_reader_cost = reader_cost_averager.get_average()
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch: {}, iter: {}/{}, loss: {:.4f},  out_loss: {:.4f}, out_distill_loss: {:.4f}, feature_distill_loss: {:.4f}, lr: {:.6f}, batch_cost: {:.4f}, reader_cost: {:.5f}, ips: {:.4f} samples/sec | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss, avg_out_loss, avg_out_distill_loss,
                            avg_feature_distill_loss, lr, avg_train_batch_cost,
                            avg_train_reader_cost,
                            batch_cost_averager.get_ips_average(), eta))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss, iter)
                    # Record the individual losses when there is more than one.
                    if len(avg_out_loss_list) > 1:
                        avg_loss_dict = {}
                        for i, value in enumerate(avg_out_loss_list):
                            avg_loss_dict['loss_' + str(i)] = value
                        for key, value in avg_loss_dict.items():
                            log_tag = 'Train/' + key
                            log_writer.add_scalar(log_tag, value, iter)

                    log_writer.add_scalar('Train/lr', lr, iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, iter)
                avg_loss = 0.0
                avg_out_loss = 0.0
                avg_out_distill_loss = 0.0
                avg_feature_distill_loss = 0.0
                avg_out_loss_list = []
                reader_cost_averager.reset()
                batch_cost_averager.reset()

            if (iter % save_interval == 0 or iter == iters) and (val_dataset
                                                                 is not None):
                num_workers = 1 if num_workers > 0 else 0

                if test_config is None:
                    test_config = {}

                mean_iou, acc, _, _, _ = evaluate(student_model,
                                                  val_dataset,
                                                  num_workers=num_workers,
                                                  **test_config)

                student_model.train()

            if (iter % save_interval == 0
                    or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(student_model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))
                save_models.append(current_save_dir)
                if len(save_models) > keep_checkpoint_max > 0:
                    model_to_remove = save_models.popleft()
                    shutil.rmtree(model_to_remove)

                if val_dataset is not None:
                    if mean_iou > best_mean_iou:
                        best_mean_iou = mean_iou
                        best_model_iter = iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        paddle.save(
                            student_model.state_dict(),
                            os.path.join(best_model_dir, 'model.pdparams'))
                    logger.info(
                        '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.'
                        .format(best_mean_iou, best_model_iter))

                    if use_vdl:
                        log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
                        log_writer.add_scalar('Evaluate/Acc', acc, iter)
            batch_start = time.time()

    # Calculate flops.
    if local_rank == 0:

        def count_syncbn(m, x, y):
            x = x[0]
            nelements = x.numel()
            m.total_ops += int(2 * nelements)

        _, c, h, w = images.shape
        flops = paddle.flops(
            student_model, [1, c, h, w],
            custom_ops={paddle.nn.SyncBatchNorm: count_syncbn})

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
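
The keep_checkpoint_max bookkeeping above is a simple rotation: append each new checkpoint directory to a deque and delete the oldest one once the limit is exceeded. A stand-alone sketch with placeholder paths:

import os
import shutil
from collections import deque

keep_checkpoint_max = 5
save_models = deque()

def rotate_checkpoints(current_save_dir):
    save_models.append(current_save_dir)
    if len(save_models) > keep_checkpoint_max > 0:
        oldest = save_models.popleft()
        shutil.rmtree(oldest, ignore_errors=True)

for i in range(7):
    ckpt_dir = os.path.join("output", "iter_{}".format(i))
    os.makedirs(ckpt_dir, exist_ok=True)
    rotate_checkpoints(ckpt_dir)
print(list(save_models))  # only the 5 most recent directories remain
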
Code example #23
File: train.py Project: xueeinstein/PaddleHelix
def main(args):
    """
    main function
    """

    model_config = json.load(open(args.model_config, 'r'))
    if args.use_cuda:
        paddle.set_device("gpu")
    else:
        paddle.set_device("cpu")

    if args.is_distributed:
        strategy = fleet.DistributedStrategy()
        fleet.init(is_collective=args.use_cuda, strategy=strategy)

    train_loader = create_dataloader(
        data_dir=args.train_data,
        model_config=model_config)

    valid_loader = create_dataloader(
        data_dir=args.valid_data,
        model_config=model_config)

    encoder_model = ProteinEncoderModel(model_config, name='protein')
    model = ProteinModel(encoder_model, model_config)
    if args.is_distributed:
        model = fleet.distributed_model(model)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    grad_clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=1e-4,
        epsilon=1e-06,
        weight_decay=0.01,
        parameters=model.parameters(),
        apply_decay_param_fun=lambda x: x in decay_params)

    if args.is_distributed:
        optimizer = fleet.distributed_optimizer(optimizer)
    criterion = ProteinCriterion(model_config)
    metric = get_metric(model_config['task'])

    if args.init_model:
        print("load init_model")
        # for hot_start
        if args.hot_start == 'hot_start':
            model.load_dict(paddle.load(args.init_model))
        # for pre_train
        else:
            encoder_model.load_dict(paddle.load(args.init_model))

    train_sum_loss = 0
    valid_min_loss = 10000
    steps_per_epoch = 20
    cur_step = 0
    while True:
        model.train()
        for (text, pos, label) in train_loader:
            # print("text: ", text)
            cur_step += 1
            pred = model(text, pos)
            label = label.reshape([-1, 1])
            pred = pred.reshape([-1, pred.shape[-1]])
            loss = criterion.cal_loss(pred, label)

            print("loss: ", loss)
            train_sum_loss += loss.numpy()
            loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            pred = pred.numpy()
            label = label.numpy()
            loss = loss.numpy()
            metric.update(pred, label, loss)
            if cur_step % 10 == 0:
                print('step %d, avg loss %.5f' % (cur_step, train_sum_loss / 10))
                metric.show()
                train_sum_loss = 0
                metric.clear()

            # save best_model
            if cur_step % steps_per_epoch == 0:
                print("eval begin_time: ", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
                valid_cur_loss = eval(model, valid_loader, criterion, metric)
                print("valid_cur_loss: ", valid_cur_loss)
                print("eval end_time: ", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
                if valid_cur_loss < valid_min_loss:
                    print("%s Save best model step_%d." % \
                            (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), cur_step))
                    paddle.save(encoder_model.state_dict(), 'models/epoch_best_encoder.pdparams')
                    paddle.save(model.state_dict(), 'models/epoch_best.pdparams')
                    valid_min_loss = valid_cur_loss

                    os.system("cp -rf models/epoch_best.pdparams models/step_%d.pdparams" % (cur_step))
                    os.system("cp -rf models/epoch_best_encoder.pdparams models/step_%d_encoder.pdparams" % (cur_step))
                model.train()
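
The decay_params list above implements the common "no weight decay on biases and norm layers" rule: collect the framework-internal names of every parameter whose attribute path contains neither "bias" nor "norm", then hand AdamW a membership test via apply_decay_param_fun. A small self-contained sketch (the Block layer is a placeholder):

import paddle

class Block(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(16, 16)
        self.norm = paddle.nn.LayerNorm(16)

    def forward(self, x):
        return self.norm(self.linear(x))

model = Block()
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]                                   # only linear.weight is left to decay
optimizer = paddle.optimizer.AdamW(
    learning_rate=1e-4,
    weight_decay=0.01,
    parameters=model.parameters(),
    apply_decay_param_fun=lambda x: x in decay_params)
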
Code example #24
def do_train(args):
    paddle.set_device(args.device)
    strategy = fleet.DistributedStrategy()
    strategy.hybrid_configs = {
        "dp_degree": args.dp_degree,
        "mp_degree": args.mp_degree,
        "pp_degree": args.pp_degree
    }

    strategy.pipeline_configs = {
        "accumulate_steps": args.local_batch_size // args.micro_batch_size,
        "micro_batch_size": args.micro_batch_size
    }

    fleet.init(is_collective=True, strategy=strategy)

    # obtain the rank information of each hybrid-parallel group
    hcg = fleet.get_hybrid_communicate_group()
    global_rank = hcg.get_global_rank()
    mp_rank = hcg.get_model_parallel_rank()
    pp_rank = hcg.get_stage_id()
    dp_rank = hcg.get_data_parallel_rank()
    local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0))

    # seed control in hybrid parallel
    set_hyrbid_parallel_seed(args.seed, dp_rank, mp_rank, pp_rank)

    default_global_tokens_num = args.global_batch_size * args.max_seq_len

    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    # Define log writer
    log_writer_path = os.path.join(
        args.output_dir, "train_log",
        "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format(
            args.model_name_or_path, args.global_batch_size, args.use_amp,
            False, global_rank).lower())

    if os.path.exists(log_writer_path):
        import shutil
        shutil.rmtree(log_writer_path)

    log_writer = LogWriter(log_writer_path)

    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    if args.model_name_or_path in pretrained_models_list:
        model_config = model_class.pretrained_init_configuration[
            args.model_name_or_path]
        model_config["hidden_dropout_prob"] = args.hidden_dropout_prob
        model_config[
            "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob

        model_config['num_partitions'] = args.mp_degree
        if args.pp_degree == 1:
            model = GPTForPretraining(GPTModel(**model_config))
        else:
            model_config['topology'] = hcg.topology()
            model_config["recompute_interval"] = 1 if args.use_recompute else 0
            model = GPTForPretrainingPipe(**model_config)
    else:
        model = GPTForPretraining.from_pretrained(
            args.model_name_or_path,
            hidden_dropout_prob=args.hidden_dropout_prob,
            attention_probs_dropout_prob=args.attention_probs_dropout_prob)

    # Create the criterion for the GPT model
    criterion = GPTPretrainingCriterion()

    if args.decay_steps is None:
        args.decay_steps = args.max_steps
    warmup_step = args.warmup_rate * args.decay_steps

    lr_scheduler = None

    if args.lr_decay_style == "none":
        lr_scheduler = None
    elif args.lr_decay_style == "cosine":
        lr_scheduler = lr.CosineAnnealingWithWarmupDecay(
            max_lr=args.max_lr,
            min_lr=args.min_lr,
            warmup_step=warmup_step,
            decay_step=args.decay_steps)

    clip = None
    if args.grad_clip > 0:
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.grad_clip)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler
        if lr_scheduler is not None else args.max_lr,
        beta1=args.adam_beta1,
        beta2=args.adam_beta2,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_params)

    if paddle.distributed.get_world_size() > 1:
        model = fleet.distributed_model(model)
        optimizer = fleet.distributed_optimizer(optimizer)

    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
        scaler = fleet.distributed_scaler(scaler)

    if args.model_name_or_path not in pretrained_models_list:
        logger.info("Try to load checkpoint from %s " %
                    args.model_name_or_path)
        opt_path = os.path.join(args.model_name_or_path, "model_state.pdopt")
        if os.path.exists(opt_path):
            opt_dict = paddle.load(opt_path)
            optimizer.set_state_dict(opt_dict)
        else:
            logger.warning("No optimizer checkpoint file found in %s." %
                           opt_path)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        files = [
            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
            if (os.path.isfile(os.path.join(args.input_dir, f))
                and "npz_" not in str(f))
        ]
        files.sort()
        num_files = len(files)
        for f_id in range(num_files):
            data_file = files[f_id]
            train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
                args,
                data_file,
                local_rank=local_rank,
                data_world_size=args.dp_degree,
                data_world_rank=dp_rank,
                eos_id=tokenizer.eos_token_id)
            # Bug fix: call valid_data_loader() / test_data_loader() once here;
            # otherwise each enumerate over them would start a new random dataloader.
            valid_data_loader = valid_data_loader()
            test_data_loader = test_data_loader()

            for step, batch in enumerate(train_data_loader()):
                global_step += 1
                tokens, loss_mask, labels = batch

                loss_mask.stop_gradient = True
                labels.stop_gradient = True

                if args.pp_degree == 1:
                    with paddle.amp.auto_cast(
                            args.use_amp,
                            custom_white_list=[
                                "layer_norm", "softmax", "gelu"
                            ],
                            custom_black_list=[
                                "reduce_sum", "c_softmax_with_cross_entropy",
                                "c_embedding"
                            ]):
                        preds = model(tokens)
                        loss = criterion(preds, labels, loss_mask)

                    if args.use_amp:
                        scaler.scale(loss).backward()
                        scaler.minimize(optimizer, loss)
                    else:
                        loss.backward()
                        optimizer.step()

                    if lr_scheduler is not None:
                        lr_scheduler.step()
                    optimizer.clear_grad()

                else:
                    data = [tokens, (labels, loss_mask)]
                    with paddle.amp.auto_cast(
                            args.use_amp,
                            custom_white_list=[
                                "layer_norm", "softmax", "gelu"
                            ],
                            custom_black_list=[
                                "reduce_sum", "c_softmax_with_cross_entropy",
                                "c_embedding"
                            ]):
                        loss = model.train_batch(
                            data,
                            optimizer=optimizer,
                            lr_scheduler=lr_scheduler,
                            scaler=scaler if args.use_amp else None)

                if global_step % args.logging_freq == 0:
                    avg_loss = loss.numpy()
                    speed = args.logging_freq / (time.time() - tic_train)
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %.9f, speed: %.2f step/s, ips: %.0f tokens/s, learning rate: %.5e"
                        % (global_step, epoch, step, avg_loss, speed, speed *
                           default_global_tokens_num, optimizer.get_lr()))
                    log_writer.add_scalar("loss", float(loss), global_step)
                    log_writer.add_scalar("learning_rate", optimizer.get_lr(),
                                          global_step)

                    tic_train = time.time()

                if args.check_accuracy:
                    if global_step >= args.max_steps:
                        return
                    else:
                        continue

                if global_step % args.eval_freq == 0:
                    # Since the valid data is broadcast to all devices, we evaluate on every device.
                    run_evaluate(args, valid_data_loader, model, criterion,
                                 args.eval_iters, log_writer, global_step,
                                 epoch, "valid")

                # only dp_rank == 0 saves the model
                if (global_step % args.save_steps == 0
                        or global_step >= args.max_steps) and dp_rank == 0:

                    model_to_save = model._layers if paddle.distributed.get_world_size(
                    ) > 1 else model
                    output_dir = os.path.join(args.output_dir,
                                              "step_%d" % global_step)
                    os.makedirs(output_dir, exist_ok=True)

                    logger.info("Save model to %s" % output_dir)

                    if args.pp_degree > 1:
                        model_to_save.save_state_dict(output_dir)
                        if mp_rank * pp_rank == 1:
                            tokenizer.save_pretrained(output_dir)
                        paddle.save(
                            optimizer.state_dict(),
                            os.path.join(
                                output_dir,
                                "model_state_mp_{:0>2d}_pp_{:0>2d}.pdopt".
                                format(mp_rank, pp_rank)))
                    else:
                        path = os.path.join(output_dir,
                                            'model_{:0>2d}'.format(mp_rank))
                        os.makedirs(path, exist_ok=True)
                        model_to_save.save_pretrained(path)

                        paddle.save(optimizer.state_dict(),
                                    os.path.join(path, "model_state.pdopt"))
                        tokenizer.save_pretrained(path)

                if global_step >= args.max_steps:
                    run_evaluate(args, test_data_loader, model, criterion,
                                 args.test_iters, log_writer, global_step,
                                 epoch, "test")
                    logger.info("The training process is complete.")
                    del train_data_loader
                    return

            del train_data_loader
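
A minimal sketch of the hybrid-parallel bootstrap do_train() performs, with the parallel degrees hard-coded as placeholders instead of being read from args; it assumes a launch via paddle.distributed.launch with dp_degree * mp_degree * pp_degree GPUs.

import paddle
from paddle.distributed import fleet

strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {"dp_degree": 2, "mp_degree": 2, "pp_degree": 2}
strategy.pipeline_configs = {"accumulate_steps": 8, "micro_batch_size": 4}
fleet.init(is_collective=True, strategy=strategy)

hcg = fleet.get_hybrid_communicate_group()
print("global rank:", hcg.get_global_rank(),
      "mp:", hcg.get_model_parallel_rank(),
      "pp:", hcg.get_stage_id(),
      "dp:", hcg.get_data_parallel_rank())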