def test_single_gpu(self):
        paddle.enable_static()
        fleet.init(is_collective=True)
        sharding_program = paddle.static.Program()
        sharding_startup_program = paddle.static.Program()
        strategy = fleet.DistributedStrategy()
        strategy.without_graph_optimization = True
        with fluid.program_guard(sharding_program, sharding_startup_program):
            with fluid.unique_name.guard():
                input_x = paddle.static.data(name="x",
                                             shape=[None, 32],
                                             dtype='float32')
                input_y = paddle.static.data(name="y",
                                             shape=[None, 1],
                                             dtype='int64')
                cost = self.mlp(input_x=input_x, input_y=input_y)
                output_name = cost.name
                optimizer = fleet.distributed_optimizer(
                    fluid.optimizer.Adam(), strategy)
                optimizer.minimize(cost)

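        # Bind the executor to this worker's GPU (indexed by its worker id),
        # then run the startup program and one step of the sharding program
        # with generated feed data.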
        trainer_id = fleet.worker_index()
        exe = paddle.static.Executor(paddle.CUDAPlace(trainer_id))
        rank = fleet.worker_index()
        exe.run(sharding_startup_program)
        exe.run(program=sharding_program, feed=self.gen_data())

    def run_worker(self):
        logger.info("Run Worker Begin")
        use_cuda = int(config.get("runner.use_gpu"))
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        with open("./{}_worker_main_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_main_program()))
        with open("./{}_worker_startup_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_startup_program()))

        self.exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        save_model_path = self.config.get("runner.model_save_path")
        if save_model_path and (not os.path.exists(save_model_path)):
            os.makedirs(save_model_path)

        reader_type = self.config.get("runner.reader_type", None)
        epochs = int(self.config.get("runner.epochs"))
        sync_mode = self.config.get("runner.sync_mode")

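        # Set up the GPU parameter server (PSGPU): register the sparse feature
        # slots and initialize it on the GPUs listed in FLAGS_selected_gpus.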
        gpus_env = os.getenv("FLAGS_selected_gpus")
        self.PSGPU = paddle.fluid.core.PSGPU()
        gpuslot = [int(i) for i in range(1, self.model.sparse_inputs_slots)]
        print("gpuslot: {}".format(gpuslot))
        self.PSGPU.set_slot_vector(gpuslot)
        self.PSGPU.init_gpu_ps([int(s) for s in gpus_env.split(",")])
        opt_info = paddle.fluid.default_main_program()._fleet_opt
        opt_info['stat_var_names'] = []
        for epoch in range(epochs):
            epoch_start_time = time.time()

            if sync_mode == "heter":
                self.heter_train_loop(epoch)
            elif sync_mode == "gpubox":
                self.dataset_train_loop(epoch)
            elif reader_type == "QueueDataset":
                self.dataset_train_loop(epoch)
            elif reader_type == "DataLoader":
                self.dataloader_train_loop(epoch)
            elif reader_type is None or reader_type == "RecDataset":
                self.recdataset_train_loop(epoch)

            epoch_time = time.time() - epoch_start_time
            epoch_speed = self.example_nums / epoch_time
            logger.info(
                "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                    epoch, epoch_time, epoch_speed, self.count_method))
            self.train_result_dict["speed"].append(epoch_speed)

            model_dir = "{}/{}".format(save_model_path, epoch)
            if fleet.is_first_worker() and save_model_path and is_distributed_env():
                fleet.save_inference_model(
                    self.exe, model_dir,
                    [feed.name for feed in self.input_data],
                    self.inference_target_var)
Example #3
    def run_online_worker(self):
        logger.info("Run Online Worker Begin")
        use_cuda = int(config.get("runner.use_gpu"))
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        with open("./{}_worker_main_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_main_program()))
        with open("./{}_worker_startup_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_startup_program()))

        self.exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        save_model_path = self.config.get("runner.model_save_path")
        if save_model_path and (not os.path.exists(save_model_path)):
            os.makedirs(save_model_path)

        days = os.popen("echo -n " + self.config.get("runner.days")).read().split(" ")
        pass_per_day = int(self.config.get("runner.pass_per_day"))

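        # Online training loop: for each day and each pass within that day,
        # wait for and prepare the pass dataset, train on it, and save an
        # inference model.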
        for day_index in range(len(days)):
            day = days[day_index]
            for pass_index in range(1, pass_per_day + 1):
                logger.info("Day: {} Pass: {} Begin.".format(day, pass_index))
                
                prepare_data_start_time = time.time()
                dataset = self.wait_and_prepare_dataset(day, pass_index)
                prepare_data_end_time = time.time()
                logger.info(
                    "Prepare Dataset Done, using time {} second.".format(prepare_data_end_time - prepare_data_start_time))
                
                train_start_time = time.time()
                self.dataset_train_loop(dataset, day, pass_index)
                train_end_time = time.time()
                logger.info(
                    "Train Dataset Done, using time {} second.".format(train_end_time - train_start_time))
            
                model_dir = "{}/{}/{}".format(save_model_path, day, pass_index)

                if fleet.is_first_worker() and save_model_path and is_distributed_env():
                    fleet.save_inference_model(
                        self.exe, model_dir,
                        [feed.name for feed in self.input_data],
                        self.inference_target_var,
                        mode=2)

            if fleet.is_first_worker() and save_model_path and is_distributed_env():
                fleet.save_inference_model(
                    self.exe, model_dir,
                    [feed.name for feed in self.input_data],
                    self.inference_target_var,
                    mode=0)
Example #4
    def run_worker(self):
        logger.info("Run Worker Begin")
        use_cuda = int(config.get("runner.use_gpu"))
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        with open(
                "./{}_worker_main_program.prototxt".format(
                    fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_main_program()))
        with open(
                "./{}_worker_startup_program.prototxt".format(
                    fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_startup_program()))

        self.exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        save_model_path = self.config.get("runner.model_save_path")
        if save_model_path and not os.path.exists(save_model_path):
            os.makedirs(save_model_path)

        reader_type = self.config.get("runner.reader_type", None)
        epochs = int(self.config.get("runner.epochs"))
        sync_mode = self.config.get("runner.sync_mode")

        for epoch in range(epochs):
            epoch_start_time = time.time()

            if sync_mode == "heter":
                self.heter_train_loop(epoch)
            elif reader_type == "QueueDataset":
                self.dataset_train_loop(epoch)
            elif reader_type == "DataLoader":
                self.dataloader_train_loop(epoch)
            elif reader_type is None or reader_type == "RecDataset":
                self.recdataset_train_loop(epoch)

            epoch_time = time.time() - epoch_start_time
            epoch_speed = self.example_nums / epoch_time
            logger.info(
                "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                    epoch, epoch_time, epoch_speed, self.count_method))
            self.train_result_dict["speed"].append(epoch_speed)

            model_dir = "{}/{}".format(save_model_path, epoch)
            if fleet.is_first_worker() and save_model_path and is_distributed_env():
                fleet.save_inference_model(
                    self.exe, model_dir,
                    [feed.name for feed in self.input_data],
                    self.inference_target_var)
Example #5
    def test_single_run_ps_minimize(self):
        paddle.enable_static()
        input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
        input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')

        fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
        prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
        cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
        avg_cost = paddle.mean(x=cost)

        fleet.init()
        strategy = paddle.distributed.fleet.DistributedStrategy()
        optimizer = fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)
        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()
        elif fleet.is_worker():
            place = fluid.CPUPlace()
            exe = fluid.Executor(place)
            exe.run(paddle.static.default_startup_program())
            step = 10
            for i in range(step):
                cost_val = exe.run(program=fluid.default_main_program(),
                                   feed=self.gen_data(),
                                   fetch_list=[avg_cost.name])
                print("worker_index: %d, step%d cost = %f" %
                      (fleet.worker_index(), i, cost_val[0]))

    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None,
                 auto_dp=False,
                 rank_table_file=None,
                 precision_mode="must_keep_origin_dtype"):
        minimized = None
        if self.inner_opt:
            minimized = self.inner_opt.minimize(
                loss, startup_program=startup_program)

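        # Create the Ascend (NPU) instance that will own the Graph Engine
        # session and run the parsed subgraphs.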
        self.ascend_instance = core.AscendInstance()

        from paddle.distributed import fleet
        if auto_dp and fleet.world_size() > 1:
            from paddle.fluid.transpiler import ascend_transpiler
            t = ascend_transpiler.AscendTranspiler(startup_program,
                                                   loss.block.program)
            t.transpile()
            #print(loss.block.program)

        # Config about Graph Engine can be found in https://support.huaweicloud.com/
        config = {
            "ge.exec.deviceId": str(fleet.local_device_ids()),
            "ge.graphRunMode": "1",
            "ge.exec.precision_mode": precision_mode,
        }
        # if multi trainers
        if rank_table_file and fleet.world_size() > 1:
            config["ge.exec.rankTableFile"] = rank_table_file
            config["ge.exec.rankId"] = str(fleet.worker_index())
            config["ge.exec.isUseHcom"] = "1"
            config["ge.exec.deployMode"] = "0"
        print("ge_initialize config:", config)
        core.ge_initialize(config)

        # Init Session
        self.ascend_instance.init_global_resources()

        main_block = loss.block
        self.parser = AscendIRParser(
            auto_dp=auto_dp, world_rank_size=fleet.world_size())

        input_varlist = self._get_input_varlist(main_block.program)

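        # Parse the Paddle startup/main programs into Ascend graphs; the
        # parser also records the communication groups created below.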
        startup_graph, main_graph = self.parser.parse_program(
            startup_program, main_block.program, input_varlist, self.fetch_list)

        for cfg in self.parser.groups_to_create:
            print("create group (%s), nranks: %d, rank_ids: %s" %
                  (cfg.name, cfg.nranks, cfg.rank_ids))
            hccl.create_group(cfg.name, cfg.nranks, cfg.rank_ids)

        self.ascend_instance.add_ascend_subgraph(0, startup_graph)
        self.ascend_instance.add_ascend_subgraph(1, main_graph)

        return minimized
Example #7
def get_file_list(data_path, config):
    assert os.path.exists(data_path)
    file_list = [data_path + "/%s" % x for x in os.listdir(data_path)]
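    # Optionally shard the file list so each worker reads a distinct subset of files.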
    if config.get("runner.split_file_list"):
        logger.info("Split file list for worker {}".format(fleet.worker_index(
        )))
        file_list = fleet.util.get_file_shard(file_list)
    logger.info("File list: {}".format(file_list))
    return file_list
Example #8
    def init_fleet_with_gloo(self, use_gloo=False):
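        # Setting PADDLE_WITH_GLOO tells fleet.init to use the Gloo backend for
        # cross-node coordination (e.g. barriers) during initialization.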
        if use_gloo:
            os.environ["PADDLE_WITH_GLOO"] = "1"
            fleet.init(self.role_maker)
        else:
            fleet.init()

        if fleet.is_server():
            print("server: {} started".format(fleet.server_index()))
        else:
            print("worker: {} started".format(fleet.worker_index()))
Example #9
    def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
        # Input data
        seq_len = 2
        data_in = fluid.data(name='data_in',
                             shape=[batch_size, seq_len, hidden],
                             dtype=DTYPE)

        if dist_strategy:
            data_loader = fluid.io.DataLoader.from_generator(
                feed_list=[data_in],
                capacity=64,
                use_double_buffer=False,
                iterable=False)

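        # Under a distributed strategy, enable tensor (model) parallelism with
        # degree 2, so the model weights are split across two ranks.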
        if dist_strategy:
            fleet.init(is_collective=True)
            strategy = fleet.DistributedStrategy()
            strategy.tensor_parallel = True
            strategy.tensor_parallel_configs = {'tensor_parallel_degree': 2}

        rank = fleet.worker_index() if dist_strategy else None
        avg_cost = create_model(data_in, rank)
        opt = fluid.optimizer.SGD(0.1)

        if dist_strategy:
            dist_opt = fleet.distributed_optimizer(optimizer=opt,
                                                   strategy=strategy)
            dist_opt.minimize(avg_cost)
        else:
            opt.minimize(avg_cost)

        def gen_data():
            np.random.seed(2021)
            while True:
                data = [np.random.random([seq_len, hidden]).astype(DTYPE)]
                yield data

        train_reader = paddle.batch(gen_data, batch_size=batch_size)

        if dist_strategy:
            return None, avg_cost, train_reader, None, None, None, data_loader
        else:
            return None, avg_cost, train_reader, None, None, None
Example #10
def get_samples_mapping(indexed_dataset, data_prefix, num_epochs,
                        max_num_samples, max_seq_length, short_seq_prob, seed,
                        name, binary_head, share_folder):
    """Get a list that maps a sample index to a starting sentence index, end sentence index, and length"""

    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples "
                             "or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
    indexmap_filename += '_{}s'.format(seed)
    indexmap_filename += '.npy'

    local_rank = 0 if fleet.local_rank() is None else int(fleet.local_rank())
    if share_folder:
        local_rank = fleet.worker_index()
    # Build the indexed mapping if not exist.

    if local_rank == 0 and \
       not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))

        # Make sure the types match the helpers input types.
        assert indexed_dataset.doc_idx.dtype == np.int64
        print(indexed_dataset.sizes.dtype)
        assert indexed_dataset.sizes.dtype == np.int32

        # Build samples mapping
        verbose = local_rank == 0
        start_time = time.time()
        print_rank_0(
            ' > building samples index mapping for {} ...'.format(name))
        # First compile and then import.
        if local_rank == 0:
            compile_helper()
        import data_tools.helpers as helpers
        samples_mapping = helpers.build_mapping(indexed_dataset.doc_idx,
                                                indexed_dataset.sizes,
                                                num_epochs, max_num_samples,
                                                max_seq_length, short_seq_prob,
                                                seed, verbose,
                                                2 if binary_head else 1)
        print_rank_0(' > done building samples index mapping')
        np.save(indexmap_filename, samples_mapping, allow_pickle=True)
        print_rank_0(
            ' > saved the index mapping in {}'.format(indexmap_filename))
        # Make sure all the ranks have built the mapping
        print_rank_0(' > elapsed time to build and save samples mapping '
                     '(seconds): {:4f}'.format(time.time() - start_time))

    else:
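        # Other ranks poll until the mapping file written by rank 0 exists and
        # can be loaded successfully.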
        while True:
            if (not os.path.isfile(indexmap_filename)):
                time.sleep(3)
            else:
                try:
                    np.load(indexmap_filename,
                            allow_pickle=True,
                            mmap_mode='r')
                    break
                except Exception as e:
                    print(
                        "%s file is still writing or damaged, please wait a moment."
                        % indexmap_filename)
                    time.sleep(3)

    # This should be a barrier but nccl barrier assumes
    # device_index=rank which is not the case for model
    # parallel case
    if paddle.distributed.get_world_size() > 1:
        if paddle.in_dynamic_mode():
            paddle.distributed.barrier()

    # Load indexed dataset.
    print_rank_0(
        ' > loading indexed mapping from {}'.format(indexmap_filename))
    start_time = time.time()
    samples_mapping = np.load(indexmap_filename,
                              allow_pickle=True,
                              mmap_mode='r')
    print_rank_0(
        '    loaded indexed file in {:3.3f} seconds'.format(time.time() -
                                                            start_time))
    print_rank_0('    total number of samples: {}'.format(
        samples_mapping.shape[0]))

    return samples_mapping


def do_train(args):
    # Initialize the paddle and paddle fleet execution environment
    paddle.enable_static()
    fleet.init(is_collective=True)

    # Create the random seed for the worker
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    assert args.device in [
        "cpu", "gpu", "xpu"
    ], "Invalid device! Available device should be cpu, gpu, or xpu."
    place = paddle.set_device(args.device)

    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()

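    # Map this worker's rank onto the hybrid-parallel topology
    # (data / pipeline / sharding / model parallel groups).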
    topo = Topology(device_rank=worker_index,
                    world_size=worker_num,
                    dp_degree=args.dp_degree,
                    pp_degree=args.pp_degree,
                    sharding_degree=args.sharding_degree,
                    mp_degree=args.mp_degree)

    logger.info("The topo of hybrid parallelism:\n{}".format(topo))

    dist_strategy = dist_optimizer(args, topo)

    # Create log write, train results show on last card of pipeline.
    if topo.is_last:
        log_writer_path = os.path.join(
            args.output_dir, "train_log",
            "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format(
                args.model_name_or_path, args.global_batch_size, args.use_amp,
                args.use_recompute, worker_index).lower())
        if os.path.exists(log_writer_path):
            import shutil
            shutil.rmtree(log_writer_path)
        log_writer = LogWriter(log_writer_path)

    # Define the input data in the static mode

    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    data_file = get_train_data_file(args)
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    with paddle.static.program_guard(main_program, startup_program):
        with paddle.utils.unique_name.guard():
            with paddle.static.device_guard('gpu:0'):
                data_holders = create_data_holder(args)
                [tokens, loss_mask, attention_mask, position_ids,
                 labels] = data_holders

                tokenizer = tokenizer_class.from_pretrained(
                    args.model_name_or_path)
                eos_id = tokenizer.eos_token_id

                train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
                    args,
                    data_file,
                    data_world_size=topo.data_info.size,
                    data_world_rank=topo.data_info.rank,
                    eos_id=eos_id,
                    max_seq_len=args.max_seq_len,
                    places=paddle.static.cuda_places(),
                    data_holders=data_holders,
                    pipeline_mode=False,
                )

                if args.model_name_or_path in pretrained_models_list:
                    model_config = model_class.pretrained_init_configuration[
                        args.model_name_or_path]

                    model_config[
                        "hidden_dropout_prob"] = args.hidden_dropout_prob
                    model_config[
                        "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
                    model_config["topo"] = topo

                    model = guard(f'gpu:{args.pp_degree -1}')(
                        GPTForPretraining)(
                            guard(f'gpu:0')(GPTModel)(**model_config))
                else:
                    model, _ = GPTForPretraining.from_pretrained(
                        args.model_name_or_path,
                        hidden_dropout_prob=args.hidden_dropout_prob,
                        attention_probs_dropout_prob=args.
                        attention_probs_dropout_prob,
                        topo=topo)

                # Create the model for the gpt pretrain
                preds = model(tokens, position_ids, attention_mask)

                criterion = guard(f'gpu:{args.pp_degree -1}')(
                    GPTPretrainingCriterion)(topo)
                loss = criterion(preds, labels, loss_mask)

            # Create the learning rate scheduler and optimizer
            if args.decay_steps is None:
                args.decay_steps = args.max_steps
            warmup_step = args.warmup_rate * args.decay_steps

            # TODO @ZHUI Use paddle network to support lr scheduler
            lr_scheduler = lr.CosineAnnealingWithWarmupDecay(
                max_lr=args.max_lr,
                min_lr=args.min_lr,
                warmup_step=warmup_step,
                decay_step=args.decay_steps)

            clip = None
            if args.grad_clip > 0:
                clip = paddle.fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=args.grad_clip)

            decay_param = [
                p.name for n, p in model.named_parameters()
                if not any(nd in n for nd in ["bias", "norm"])
            ]

            optimizer = paddle.optimizer.AdamW(
                learning_rate=lr_scheduler,
                beta1=args.adam_beta1,
                beta2=args.adam_beta2,
                epsilon=args.adam_epsilon,
                grad_clip=clip,
                weight_decay=args.weight_decay,
                apply_decay_param_fun=lambda x: x in decay_param)
            # alias
            optimizer.apply_optimize = optimizer._apply_optimize

            if args.use_recompute:
                dist_strategy.recompute = True
                dist_strategy.recompute_configs = {
                    "checkpoints": model.gpt.checkpoints
                }

            # Use the fleet api to compile the distributed optimizer
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)

            optimizer.minimize(loss)
            logger.info(f'final strategy: {fleet._final_strategy()}')
            logger.info("The training meta optimizer is/are %s" %
                        fleet._get_applied_meta_list())

    program_desc_dir = os.path.join(args.output_dir, "program_desc")
    if not os.path.isdir(program_desc_dir):
        os.mkdir(program_desc_dir)

    with open(program_desc_dir + "/main_program.txt.%d" % worker_index,
              'w') as f:
        f.write(str(main_program))

    with open(program_desc_dir + "/startup_program.txt.%d" % worker_index,
              'w') as f:
        f.write(str(startup_program))

    # Define the Executor for running the static model
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    test_program = main_program.clone(for_test=True)

    if args.model_name_or_path not in pretrained_models_list:
        logger.info("Try to load checkpoint from %s " %
                    args.model_name_or_path)
        dygrah_path = os.path.join(args.model_name_or_path,
                                   "model_state.pdparams")
        static_path = os.path.join(args.model_name_or_path, "static_vars")

        flag_loaded = False
        if os.path.exists(static_path):
            if args.mp_degree > 1:
                logger.warning("MP should init with dygraph params")
            else:
                logger.info("Loading parameters from %s" % static_path)
                paddle.static.load(main_program, static_path, exe)
                flag_loaded = True

        if not flag_loaded and os.path.exists(dygrah_path):
            if args.sharding_degree > 1:
                logger.warning("Sharding should init with static vars")
            else:
                logger.info("Loading parameters from %s" % dygrah_path)
                init_static_with_params(
                    model, paddle.load(dygrah_path, return_numpy=True), topo,
                    main_program)
                flag_loaded = True

        if not flag_loaded:
            logger.error("No checkpoint load.")

    global_step = 0
    tic_train = time.time()
    epoch = 0
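    # Fetch the learning-rate variable created by the optimizer so it can be
    # logged together with the loss on the last pipeline stage.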
    learning_rate = main_program.global_block().vars["learning_rate_0"]
    while True:
        fetchs = []
        if topo.is_last:
            fetchs = [loss, learning_rate]

        # Bug fix: call valid_data_loader once here, otherwise the enumerate during
        # evaluation would call it repeatedly and start a new random dataloader each time.
        valid_data_loader = valid_data_loader()
        test_data_loader = test_data_loader()

        for step, batch in enumerate(train_data_loader()):
            global_step += 1
            ret = exe.run(main_program,
                          feed=batch,
                          fetch_list=fetchs,
                          use_program_cache=True)
            # In the 2.0 API, lr_scheduler.step() must be called explicitly to update the learning rate.
            lr_scheduler.step()

            if global_step % args.logging_freq == 0:
                if topo.is_last:
                    loss_return, lr_return = ret
                    speed = args.logging_freq / (time.time() - tic_train)
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %.9f, speed: %.2f steps/s, ips: %.0f tokens/s, learning rate: %.5e"
                        % (global_step, epoch, step, loss_return[0], speed,
                           speed * args.global_batch_size * args.max_seq_len,
                           lr_return[0]))
                    log_writer.add_scalar("loss", loss_return[0], global_step)
                    log_writer.add_scalar("learning_rate", lr_return[0],
                                          global_step)
                tic_train = time.time()

            if args.check_accuracy:
                if global_step >= args.max_steps:
                    return
                else:
                    continue

            if global_step % args.eval_freq == 0:
                # TODO, check the input data of validation
                eval_fetch = []
                if topo.is_last:
                    eval_fetch = [loss]

                run_evaluate(valid_data_loader, exe, test_program,
                             args.eval_iters, log_writer, global_step, args,
                             epoch, topo.is_last, eval_fetch, "valid")
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step >= args.max_steps:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                logger.debug("saving models to {}".format(output_dir))
                save_persistables(exe, os.path.join(output_dir, "static_vars"),
                                  main_program)
                if global_step == args.save_steps:
                    model.init_config["init_args"][0].init_config.pop(
                        "topo", None)
                model.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                tic_train = time.time()

            if global_step >= args.max_steps:
                eval_fetch = []
                if topo.is_last:
                    eval_fetch = [loss]

                run_evaluate(test_data_loader, exe, test_program,
                             args.test_iters, log_writer, global_step, args,
                             epoch, topo.is_last, eval_fetch, "test")
                del train_data_loader
                return
        epoch += 1
Example #12
    def __init__(self, args, place):
        self.args = args

        self.place = place

        self.init_checkpoint = args.init_checkpoint
        self.init_pretraining_params = args.init_pretraining_params

        # optimizer related
        self.optimizer = args.optimizer
        self.learning_rate = args.learning_rate
        self.beta1 = args.beta1
        self.beta2 = args.beta2
        self.warmup_steps = args.warmup_steps
        self.lr_scheduler = args.lr_scheduler
        self.max_training_steps = args.max_training_steps
        self.min_learning_rate = args.min_learning_rate
        self.weight_decay = args.weight_decay
        self.max_grad_norm = args.max_grad_norm

        # training related
        self.is_distributed = args.get("is_distributed", False)
        self.use_recompute = args.use_recompute
        self.checkpointing_every_n_layers = args.checkpointing_every_n_layers
        self.use_amp = args.use_amp
        self.amp_loss_scaling = args.amp_loss_scaling
        self.use_sharding = args.use_sharding
        self.dp_degree = args.dp_degree
        self.sharding_degree = args.sharding_degree
        self.mp_degree = args.mp_degree
        self.pp_degree = args.pp_degree

        # setup topology
        if self.is_distributed:
            fleet.init(is_collective=True)
            if self.use_sharding:
                self.topo = Topology(device_rank=fleet.worker_index(),
                                     world_size=fleet.worker_num(),
                                     dp_degree=self.dp_degree,
                                     pp_degree=self.pp_degree,
                                     sharding_degree=self.sharding_degree,
                                     mp_degree=self.mp_degree)
            else:
                self.topo = Topology(device_rank=fleet.worker_index(),
                                     world_size=fleet.worker_num(),
                                     dp_degree=fleet.worker_num())
        else:
            self.topo = Topology(device_rank=0, world_size=1)
            if self.use_recompute:
                print(
                    "[WARN] Cannot support recomputation in non-distributed mode."
                )
            if self.use_amp:
                print("[WARN] Cannot support AMP in non-distributed mode.")

        self.exe = fluid.Executor(place)
        # model mode
        self.run_infer = args.get("run_infer", False)
        self.batch_size = args.get("batch_size", 1)

        self._build_programs()
        return


def do_train(args):
    # Initialize the paddle and paddle fleet execution environment
    paddle.enable_static()
    place = paddle.set_device(args.select_device)
    fleet.init(is_collective=True)
    # paddle.distributed.init_parallel_env()

    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()

    # Create the random seed for the worker
    set_seed(args.seed)
    # worker_init = WorkerInitObj(args.seed + worker_index)
    worker_init = WorkerInitObj(args.seed)
    tracker = get_rng_state_tracker()
    tracker.add('global_seed', args.seed)
    tracker.add('local_seed', args.seed + worker_index + 2021)

    # Define the input data in the static mode
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    data_holders = create_data_holder(args)

    [
        input_ids, segment_ids, input_mask, masked_lm_positions,
        masked_lm_labels, next_sentence_labels, masked_lm_scale
    ] = data_holders

    # Define the model structure in static mode
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    config = model_class.pretrained_init_configuration[args.model_name_or_path]
    if config["vocab_size"] % 8 != 0:
        config["vocab_size"] += 8 - (config["vocab_size"] % 8)
    config['num_partitions'] = args.num_partitions
    model = BertForPretraining(BertModel(**config), args.num_partitions)
    criterion = BertPretrainingCriterion(model.bert.config["vocab_size"])
    prediction_scores, seq_relationship_score = model(
        input_ids=input_ids,
        token_type_ids=segment_ids,
        attention_mask=input_mask,
        masked_positions=masked_lm_positions)
    loss = criterion(prediction_scores, seq_relationship_score,
                     masked_lm_labels, next_sentence_labels, masked_lm_scale)

    # Define the dynamic learning rate scheduler and optimizer
    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
        args.learning_rate,
        lambda current_step, num_warmup_steps=args.warmup_steps,
        num_training_steps=args.max_steps if args.max_steps > 0 else
        (len(train_data_loader) * args.num_train_epochs): float(
            current_step) / float(max(1, num_warmup_steps))
        if current_step < num_warmup_steps else max(
            0.0,
            float(num_training_steps - current_step) / float(
                max(1, num_training_steps - num_warmup_steps))))

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])
    # if worker_num == 1 and args.use_amp:
    #     amp_list = paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
    #         custom_white_list=['softmax', 'layer_norm', 'gelu'])
    #     optimizer = paddle.fluid.contrib.mixed_precision.decorate(
    #         optimizer,
    #         amp_list,
    #         init_loss_scaling=args.scale_loss,
    #         use_dynamic_loss_scaling=True)

    if fleet.worker_num() > 1:
        # Use the fleet api to compile the distributed optimizer
        optimizer = dist_optimizer(args, optimizer)
    optimizer.minimize(loss)

    # Define the Executor for running the static model
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    # state_dict = model.state_dict()

    # Use the state dict to update the parameter
    # reset_state_dict = reset_program_state_dict(model, state_dict)
    # paddle.static.set_program_state(main_program, reset_state_dict)

    # if worker_num == 1:
    #     # Construct the compiled program
    #     main_program = build_compiled_program(main_program, loss)
    main_program._graph = None

    if fleet.worker_index() == 0:
        with open('startup_%d' % fleet.worker_num(), 'w') as f:
            f.writelines(str(startup_program))
        with open('main_%d' % fleet.worker_num(), 'w') as f:
            f.writelines(str(main_program))
    pool = ThreadPoolExecutor(1)
    global_step = 0
    tic_train = time.time()
    epoch = 0
    while True:
        files = [
            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
            if os.path.isfile(os.path.join(args.input_dir, f))
            and "training" in f
        ]
        files.sort()
        num_files = len(files)
        random.Random(args.seed + epoch).shuffle(files)
        f_start_id = 0

        # Select one file for each worker and create the DataLoader for the file
        data_file = select_dataset_file_for_each_worker(
            files, f_start_id, 1, 0)
        #files, f_start_id, worker_num, worker_index)
        train_data_loader, _ = create_pretraining_dataset(
            data_file, args.max_predictions_per_seq, args, data_holders,
            worker_init, paddle.static.cuda_places())

        for f_id in range(f_start_id + 1, len(files)):
            data_file = select_dataset_file_for_each_worker(files, f_id, 1, 0)
            # files, f_id, worker_num, worker_index)
            dataset_future = pool.submit(create_pretraining_dataset, data_file,
                                         args.max_predictions_per_seq, args,
                                         data_holders, worker_init,
                                         paddle.static.cuda_places())

            for step, batch in enumerate(train_data_loader):
                global_step += 1
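                # Profile steps 10 through 20 on worker 0 only and dump the result to /tmp/profile.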
                if step == 10 and worker_index == 0:
                    profiler.start_profiler("All")
                if step == 20 and worker_index == 0:
                    profiler.stop_profiler("total", "/tmp/profile")

                loss_return = exe.run(main_program,
                                      feed=batch,
                                      fetch_list=[loss])
                # In the 2.0 API, lr_scheduler.step() must be called explicitly to update the learning rate.
                lr_scheduler.step()
                if global_step % args.logging_steps == 0:
                    time_cost = time.time() - tic_train
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, ips: %.2f sequences/s"
                        % (global_step, epoch, step, loss_return[0],
                           args.logging_steps / time_cost,
                           args.logging_steps * args.batch_size / time_cost))
                    tic_train = time.time()
                if global_step % args.save_steps == 0:
                    if worker_index == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # TODO(fangzeyang): Update the save_params to paddle.static
                        paddle.fluid.io.save_params(exe, output_dir)
                        tokenizer.save_pretrained(output_dir)
                if global_step >= args.max_steps:
                    del train_data_loader
                    return
            del train_data_loader
            train_data_loader, data_file = dataset_future.result(timeout=None)
        epoch += 1


def do_train(args):
    # Initialize the paddle and paddle fleet execution environment
    paddle.enable_static()
    fleet.init(is_collective=True)

    # Create the random seed for the worker
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    assert args.device in [
        "cpu", "gpu", "xpu"
    ], "Invalid device! Available device should be cpu, gpu, or xpu."
    place = paddle.set_device(args.device)

    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()
    assert args.dp_degree * args.sharding_degree * args.mp_degree * args.pp_degree == worker_num, \
        "The product of degree num should be equal to worker_num."

    topo = Topology(device_rank=worker_index,
                    world_size=worker_num,
                    dp_degree=args.dp_degree,
                    pp_degree=args.pp_degree,
                    sharding_degree=args.sharding_degree,
                    mp_degree=args.mp_degree)

    logger.info("The topo of hybrid parallelism:\n{}".format(topo))

    dist_strategy = dist_optimizer(args, topo)

    # Create log write, train results show on last card of pipeline.
    if topo.is_last:
        log_writer_path = os.path.join(
            args.output_dir, "train_log",
            "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format(
                args.model_name_or_path, args.global_batch_size, args.use_amp,
                args.use_recompute, worker_index).lower())
        # if os.path.exists(log_writer_path):
        #     shutil.rmtree(log_writer_path)
        log_writer = LogWriter(log_writer_path)

    # Define the input data in the static mode
    base_class, model_class, criterion_class, tokenizer_class = MODEL_CLASSES[
        args.model_type]
    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    # load config in checkpoint
    global_step = 0
    consumed_samples = 0
    checkpoint_dir = os.path.join(args.output_dir, "model_last")
    if os.path.exists(checkpoint_dir):
        if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")):
            with open(os.path.join(checkpoint_dir, "./config.yml"), "r") as f:
                step_config = yaml.load(f, Loader=yaml.FullLoader)
                assert step_config[
                    "global_batch_size"] == args.global_batch_size, "Please ensure checkpoint global batch size is the same. Folder: {}".format(
                        checkpoint_dir)
                consumed_samples = step_config["consumed_samples"]
                global_step = step_config["global_step"]

    data_file = get_train_data_file(args)
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    with paddle.static.program_guard(main_program, startup_program):
        data_holders = create_data_holder(args)
        # 0. input_ids,
        # 1. segment_ids,
        # 2. input_mask,
        # 3. masked_lm_positions,
        # 4. masked_lm_labels,
        # 5. next_sentence_labels

        [
            input_ids, segment_ids, input_mask, masked_lm_positions,
            masked_lm_labels, next_sentence_labels
        ] = data_holders

        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

        train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
            args,
            data_file,
            tokenizer,
            data_world_size=topo.data_info.size,
            data_world_rank=topo.data_info.rank,
            max_seq_len=args.max_seq_len,
            places=paddle.static.cuda_places(),
            data_holders=data_holders,
            current_step=global_step)
        fleet.init(is_collective=True)

        if args.model_name_or_path in pretrained_models_list:
            model_config = model_class.pretrained_init_configuration[
                args.model_name_or_path]
            if model_config["vocab_size"] % 8 != 0:
                model_config["vocab_size"] += 8 - (model_config["vocab_size"] %
                                                   8)
            model_config["hidden_dropout_prob"] = args.hidden_dropout_prob
            model_config[
                "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
            model = model_class(base_class(**model_config))
        else:
            model, _ = model_class.from_pretrained(
                args.model_name_or_path,
                hidden_dropout_prob=args.hidden_dropout_prob,
                attention_probs_dropout_prob=args.attention_probs_dropout_prob,
            )

        # Create the model for the gpt pretrain
        prediction_scores, seq_relationship_score = model(
            input_ids=input_ids,
            token_type_ids=segment_ids,
            position_ids=None,
            attention_mask=input_mask,
            masked_positions=masked_lm_positions)

        criterion = criterion_class(with_nsp_loss=args.binary_head)
        if args.binary_head:
            lm_loss, sop_loss = criterion(prediction_scores,
                                          seq_relationship_score,
                                          masked_lm_labels,
                                          next_sentence_labels)
            loss = lm_loss + sop_loss
        else:
            loss = criterion(prediction_scores, seq_relationship_score,
                             masked_lm_labels)

        # Create the learning rate scheduler and optimizer
        if args.decay_steps is None:
            args.decay_steps = args.max_steps

        # lr_scheduler = CosineAnnealingWithWarmupDecay(
        #     max_lr=args.max_lr,
        #     min_lr=args.min_lr,
        #     warmup_step=args.warmup_rate * args.max_steps,
        #     decay_step=args.decay_steps, last_epoch=global_step)

        lr_scheduler = LinearDecayWithWarmup(args.max_lr,
                                             args.max_steps,
                                             args.warmup_rate,
                                             last_epoch=global_step)

        clip = None
        if args.grad_clip > 0:
            clip = paddle.fluid.clip.GradientClipByGlobalNorm(
                clip_norm=args.grad_clip)

        decay_param = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        logger.info("Using paddle.optimizer.AdamW.")
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            beta1=args.adam_beta1,
            beta2=args.adam_beta2,
            epsilon=args.adam_epsilon,
            grad_clip=clip,
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_param)
        # alias
        optimizer.apply_optimize = optimizer._apply_optimize

        # if args.use_recompute:
        #     dist_strategy.recompute = True
        #     dist_strategy.recompute_configs = {
        #         "checkpoints": model.bert.checkpoints
        #     }

        # Use the fleet api to compile the distributed optimizer
        optimizer = fleet.distributed_optimizer(optimizer,
                                                strategy=dist_strategy)

        optimizer.minimize(loss)
        logger.info(f'final strategy: {fleet._final_strategy()}')
        logger.info("The training meta optimizer is/are %s" %
                    fleet._get_applied_meta_list())

    program_desc_dir = os.path.join(args.output_dir, "program_desc")
    if not os.path.isdir(program_desc_dir):
        os.mkdir(program_desc_dir)

    with open(program_desc_dir + "/main_program.txt.%d" % worker_index,
              'w') as f:
        f.write(str(main_program))

    with open(program_desc_dir + "/startup_program.txt.%d" % worker_index,
              'w') as f:
        f.write(str(startup_program))

    # Define the Executor for running the static model
    exe = paddle.static.Executor(place)
    exe.run(startup_program)

    test_program = main_program.clone(for_test=True)

    if args.model_name_or_path not in pretrained_models_list:
        logger.info("Try to load checkpoint from %s " %
                    args.model_name_or_path)
        dygrah_path = os.path.join(args.model_name_or_path,
                                   "model_state.pdparams")
        static_path = os.path.join(args.model_name_or_path, "static_vars")

        flag_loaded = False
        if os.path.exists(static_path):
            if args.mp_degree > 1:
                logger.warning("MP should init with dygraph params")
            else:
                logger.info("Loading parameters from %s" % static_path)
                paddle.static.load(main_program, static_path, exe)
                flag_loaded = True

        if not flag_loaded and os.path.exists(dygrah_path):
            if args.sharding_degree > 1:
                logger.warning("Sharding should init with static vars")
            else:
                logger.info("Loading parameters from %s" % dygrah_path)
                init_static_with_params(
                    model, paddle.load(dygrah_path, return_numpy=True), topo,
                    main_program)
                flag_loaded = True

        if not flag_loaded:
            logger.error("No checkpoint load.")

    # load checkpoint vars
    if os.path.exists(checkpoint_dir):
        if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")):
            paddle.static.load(main_program,
                               os.path.join(checkpoint_dir, "static_vars"),
                               exe)

    fetch_loss_vars = collections.OrderedDict()
    fetch_other_vars = collections.OrderedDict()
    fetch_loss_vars["loss"] = loss
    if args.binary_head:
        fetch_loss_vars["lm_loss"] = lm_loss
        fetch_loss_vars["sop_loss"] = sop_loss

    fetch_other_vars["learning_rate"] = main_program.global_block(
    ).vars["learning_rate_0"]

    additional_vars = collections.OrderedDict()
    if args.use_amp:
        for key in ["loss_scaling", "num_good_steps", "num_bad_steps"]:
            additional_vars[key] = main_program.global_block().vars[key + "_0"]

    tic_train = time.time()
    while True:
        fetchs = []
        fetchs_keys = []
        if topo.is_last:
            fetchs = list(fetch_loss_vars.values()) + list(
                fetch_other_vars.values()) + list(additional_vars.values())
            fetchs_keys = list(fetch_loss_vars.keys()) + list(
                fetch_other_vars.keys()) + list(additional_vars.keys())

        # Bug fix: call valid_data_loader once here, otherwise the enumerate during
        # evaluation would call it repeatedly and start a new random dataloader each time.
        valid_data_loader = valid_data_loader()
        test_data_loader = test_data_loader()

        for step, batch in enumerate(train_data_loader()):
            ret = exe.run(main_program,
                          feed=batch,
                          fetch_list=fetchs,
                          use_program_cache=True)
            # Only advance the global step once every accumulate_steps micro-batches.
            if (step + 1) % args.accumulate_steps != 0:
                continue
            global_step += 1
            # In the 2.0 API, lr_scheduler.step() must be called explicitly to update the learning rate.
            lr_scheduler.step()

            if global_step % args.logging_freq == 0:
                if topo.is_last:
                    res = collections.defaultdict(float)
                    for k, v in zip(fetchs_keys, ret):
                        res[k] = v[0]

                    speed = args.logging_freq / (time.time() - tic_train)

                    loss_info = "loss: %.6f, lm_loss: %.6f, sop_loss: %.6f"

                    loss_info = ", ".join([
                        "{}: {:.6f}".format(k, res[k])
                        for k in fetch_loss_vars.keys()
                    ])

                    common_loginfo = "global step %d, %s, speed: %.2f steps/s, ips: %.2f seqs/s, learning rate: %.5e" % (
                        global_step, loss_info, speed,
                        speed * args.global_batch_size, res["learning_rate"])
                    additional_loginfo = ", ".join([
                        "{}: {}".format(k, res[k])
                        for k in additional_vars.keys()
                    ])
                    if additional_loginfo:
                        common_loginfo += ", " + additional_loginfo
                    logger.info(common_loginfo)
                    for k, v in res.items():
                        log_writer.add_scalar(k, v, global_step)

                tic_train = time.time()

            #if args.check_accuracy:
            #    if global_step >= args.max_steps:
            #        return
            #    else:
            #        continue

            if global_step % args.eval_freq == 0:
                # TODO, check the input data of validation
                eval_fetch = collections.OrderedDict()
                if topo.is_last:
                    eval_fetch["loss"] = loss
                    if args.binary_head:
                        eval_fetch["lm_loss"] = lm_loss
                        eval_fetch["sop_loss"] = sop_loss

                run_evaluate(valid_data_loader, exe, test_program,
                             args.eval_iters, log_writer, global_step, args,
                             topo.is_last, eval_fetch, "valid")
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step >= args.max_steps:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                logger.debug("saving models to {}".format(output_dir))
                save_persistables(exe, os.path.join(output_dir, "static_vars"),
                                  main_program)
                if global_step == args.save_steps:
                    model.init_config["init_args"][0].init_config.pop(
                        "topo", None)
                model.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                tic_train = time.time()

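            # Periodically refresh the "model_last" checkpoint: worker 0 rotates the
            # previous copy to "model_last_bak" and records training progress in
            # config.yml before the variables are saved below.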
            if global_step % args.checkpoint_steps == 0:
                output_dir = os.path.join(args.output_dir, "model_last")
                if worker_index == 0:
                    if not os.path.exists(output_dir):
                        os.mkdir(output_dir)
                    output_dir_bak = os.path.join(args.output_dir,
                                                  "model_last_bak")
                    if os.path.exists(output_dir):
                        if os.path.exists(output_dir_bak):
                            shutil.rmtree(output_dir_bak)
                        shutil.move(output_dir, output_dir_bak)
                        os.mkdir(output_dir)

                    step_config = {
                        "model_name": args.model_name_or_path,
                        "global_step": global_step,
                        "global_batch_size": args.global_batch_size,
                        "consumed_samples":
                        global_step * args.global_batch_size,
                    }

                    with open(os.path.join(output_dir, "config.yml"),
                              "w") as f:
                        yaml.dump(step_config,
                                  f,
                                  encoding='utf-8',
                                  allow_unicode=True)

                fleet.barrier_worker()

                logger.debug("saving models to {}".format(output_dir))
                if args.sharding_degree <= 1:
                    # Save on the first worker by default.
                    if worker_index == 0:
                        paddle.static.save(
                            main_program,
                            os.path.join(output_dir, "static_vars"))
                else:
                    # Under sharding use save_persistables instead, which is slower.
                    save_persistables(exe,
                                      os.path.join(output_dir, "static_vars"),
                                      main_program)

            if global_step >= args.max_steps:
                eval_fetch = collections.OrderedDict()
                if topo.is_last:
                    eval_fetch["loss"] = loss
                    if args.binary_head:
                        eval_fetch["lm_loss"] = lm_loss
                        eval_fetch["sop_loss"] = sop_loss

                run_evaluate(test_data_loader, exe, test_program,
                             args.test_iters, log_writer, global_step, args,
                             topo.is_last, eval_fetch, "test")
                del train_data_loader
                return
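The checkpoint branch above gates every filesystem write on worker_index == 0 and then calls fleet.barrier_worker() so that no trainer moves on against a half-written directory. A minimal sketch of that pattern in isolation, assuming fleet.init() has already run; the helper name save_checkpoint and the directory layout are assumptions, not part of the original script:

import os

import paddle
from paddle.distributed import fleet


def save_checkpoint(main_program, output_dir):
    # Only the first worker touches the filesystem.
    if fleet.worker_index() == 0:
        os.makedirs(output_dir, exist_ok=True)
        # paddle.static.save writes <prefix>.pdparams / .pdopt / .pdmodel files.
        paddle.static.save(main_program, os.path.join(output_dir, "static_vars"))
    # Every trainer blocks here until worker 0 has finished writing.
    fleet.barrier_worker()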
Example #15
def main(args):

    np.random.seed(9001)

    run_id = args.run_id
    if not os.path.isdir(run_id):
        os.system('mkdir -p {}'.format(run_id))

    profile = False
    batch_size = 512 * 50
    lr = 1e-4

    fleet.init(is_collective=True)
    # load Bert_large / Bert_base model
    model = X.applications.BertLarge(lang="en")

    model.main_prog.random_seed = 9001
    model.startup_prog.random_seed = 9001

    local_path = "./data"
    data_loader = model.get_val_dataloader(data_dir='{}'.format(local_path),
                                           max_seq_len=512,
                                           batch_size=batch_size,
                                           in_tokens=True,
                                           shuffle=False)

    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 2
    exec_strategy.num_iteration_per_drop_scope = 1

    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = True
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.build_strategy = build_strategy
    dist_strategy.nccl_comm_num = 1
    dist_strategy.amp = args.use_amp

    # recompute
    checkpoints = [
        'elementwise_add_{}.tmp_0'.format(i * 2) for i in range(1, 24)
    ]
    dist_strategy.recompute = args.use_recompute
    if args.use_recompute:
        dist_strategy.recompute_configs = {"checkpoints": checkpoints}

    scheduled_lr = X.utils.linear_warmup_decay(lr,
                                               warmup_steps=4000,
                                               num_train_steps=1000000)
    optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
    optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)

    clip_norm_thres = 1.0
    fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
        clip_norm=clip_norm_thres))

    ops, param_grads = optimizer.minimize(model.loss)

    filename = "./" + args.run_id + "/main_program.txt"
    with open(filename + str(int(os.environ.get('FLAGS_selected_gpus', 0))),
              'w') as f:
        f.write(str(fluid.default_main_program()))
    filename = "./" + args.run_id + "/start_program.txt"
    with open(filename + str(int(os.environ.get('FLAGS_selected_gpus', 0))),
              'w') as f:
        f.write(str(fluid.default_startup_program()))

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    model.main_prog.random_seed = 9001
    model.startup_prog.random_seed = 9001
    np.random.seed(9001)

    fetch_list = [model.loss.name] + list(model.target.values()) + \
                    [scheduled_lr.name, "loss_scaling_0"]

    start_time = -1
    speeds = []
    profile = False
    costs = []
    accs = []

    print("============start training============")
    for i, data in enumerate(data_loader()):

        # profile
        if profile and i == 2050:
            print("begin profiler")
            profiler.start_profiler("All")
        elif profile and i == 2065:
            print("end profiler")
            filename = "./run_id/profile_" + str(fleet.worker_index())
            profiler.stop_profiler("total", filename)
            print("end profiler break!")
            print("avg speed = {} step / s".format(np.mean(speeds)))
            sys.exit("profile finish !")

        cost_val, next_sent_acc, lm_loss, np_lr, loss_scaling_0 = exe.run(
            fluid.default_main_program(),
            feed=data,
            fetch_list=fetch_list,
            use_program_cache=True)

        costs.append(cost_val[0])
        accs.append(next_sent_acc[0])

        # count speed
        if (i + 1) % 10 == 0:
            duration = time.time() - start_time
            speed = 10 / duration
            speeds.append(speed)
            print("step {}, loss {}, acc {}, np_lr {}, speed {:.2f} steps/s".format(
                i, np.mean(costs), np.mean(accs), np_lr[0], speed))
            start_time = time.time()
            costs = []
            accs = []
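main() above only reads three attributes from args (run_id, use_amp, use_recompute); the original argument parser is not part of the snippet, so the front end below is an assumption that merely wires those flags up. In practice such a script is usually launched with python -m paddle.distributed.launch (or fleetrun) so that FLAGS_selected_gpus and the collective environment are populated for each worker process.

import argparse


def parse_args():
    # Hypothetical parser: names match the attributes read by main(), defaults are guesses.
    parser = argparse.ArgumentParser("BertLarge fleet benchmark")
    parser.add_argument("--run_id", type=str, default="bert_benchmark")
    parser.add_argument("--use_amp", action="store_true")
    parser.add_argument("--use_recompute", action="store_true")
    return parser.parse_args()


if __name__ == "__main__":
    main(parse_args())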
Example #16
def infer_dst(args):
    """Inference main function."""
    if args.is_distributed:
        fleet.init(is_collective=True)

        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num = fleet.worker_num()
        trainer_id = fleet.worker_index()
        phase = "distributed_test"
    else:
        dev_count = 1
        gpu_id = 0
        trainers_num = 1
        trainer_id = 0
        phase = "test"
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    # task.debug()

    schema = get_schema(args.dataset)
    empty_ds_seq = "<ds/> " + " ".join(flatten_ds({}, schema)) + " </ds>"

    # record original order and init status
    output_order = []
    # {"dial_id": {"prev_ds": "", "turns": [{"utts": utts, "turn_idx": turn_idx}], "cur_idx": 0}}
    dial_status = defaultdict(dict)
    with open(args.infer_file, "r") as fin:
        next(fin)
        for line in fin:
            dial_id, turn_idx, utts = line.strip().split("\t")
            output_order.append(f"{dial_id}-{turn_idx}")
            if dial_id not in dial_status:
                dial_status[dial_id]["prev_ds"] = empty_ds_seq
                dial_status[dial_id]["turns"] = []
                dial_status[dial_id]["cur_idx"] = 0
            dial_status[dial_id]["turns"].append({
                "utts": utts,
                "turn_idx": turn_idx
            })
    dial_ids = list(dial_status.keys())

    # batch inference
    outputs = {}
    timer = Timer()
    while len(dial_ids) > 0:
        timer.start()
        cur_dial_ids = dial_ids[:args.dial_batch_size]
        logger.info(f"Sampled dialogue ids: {cur_dial_ids}")

        # 1st: basic generation
        basic_inputs = {}
        for cur_dial_id in cur_dial_ids:
            cur_idx = dial_status[cur_dial_id]["cur_idx"]
            cur_dial_turn = dial_status[cur_dial_id]["turns"][cur_idx]
            cur_utts = cur_dial_turn["utts"]
            prev_ds = dial_status[cur_dial_id]["prev_ds"]
            src = f"<gen/> {cur_utts} [SEP] {prev_ds} </gen>\x010"
            basic_inputs[f"{cur_dial_id}-{cur_dial_turn['turn_idx']}"] = src
        basic_outputs = generate(basic_inputs, model, task)

        # 2nd: amending generation
        amending_inputs = {}
        for cur_dial_id in cur_dial_ids:
            cur_idx = dial_status[cur_dial_id]["cur_idx"]
            cur_dial_turn = dial_status[cur_dial_id]["turns"][cur_idx]
            cur_utts = cur_dial_turn["utts"]
            basic_ds = basic_outputs[
                f"{cur_dial_id}-{cur_dial_turn['turn_idx']}"]
            src = f"<amend/> {cur_utts} [SEP] {basic_ds} </amend>\x010"
            amending_inputs[f"{cur_dial_id}-{cur_dial_turn['turn_idx']}"] = src
        amending_outputs = generate(amending_inputs, model, task)

        outputs.update(amending_outputs)
        time_cost_infer = timer.pass_time
        logger.info(f"Time cost: {time_cost_infer}")

        # debug info
        for dial_turn_tag in basic_inputs:
            logger.debug(f"[basic input]: {basic_inputs[dial_turn_tag]}")
            logger.debug(f"[basic output]: {basic_outputs[dial_turn_tag]}")
            logger.debug(f"[amending input]: {amending_inputs[dial_turn_tag]}")
            logger.debug(
                f"[amending output]: {amending_outputs[dial_turn_tag]}")

        # update dial_status
        for dial_turn_tag in amending_outputs:
            dial_id, _ = dial_turn_tag.split("-")
            dial_status[dial_id]["cur_idx"] += 1
            if dial_status[dial_id]["cur_idx"] >= len(
                    dial_status[dial_id]["turns"]):
                dial_ids.remove(dial_id)
            else:
                dial_status[dial_id]["prev_ds"] = outputs[dial_turn_tag]
        timer.reset()

    # reorder and output
    if gpu_id == 0:
        pred_seqs = []
        pred_labels = []
        for dial_turn_tag in output_order:
            pred_seqs.append(outputs[dial_turn_tag])
            pred_label = parse_ds(outputs[dial_turn_tag], schema)
            pred_labels.append(pred_label)

        out_seq_file = os.path.join(args.save_path, "inference_output.txt")
        out_label_file = os.path.join(args.save_path, "inference_labels.json")
        with open(out_seq_file, "w") as fout_seq, open(out_label_file,
                                                       "w") as fout_label:
            fout_seq.write("\n".join(pred_seqs))
            json.dump(pred_labels, fout_label, indent=2)
        logger.info(f"Save inference sequences to `{out_seq_file}`")
        logger.info(f"Save inference labels to `{out_label_file}`")
Example #17
input_x = paddle.static.data(name="x", shape=[None, 32], dtype='float32')
input_y = paddle.static.data(name="y", shape=[None, 1], dtype='int64')

cost = mlp(input_x, input_y)
optimizer = paddle.optimizer.SGD(learning_rate=0.01)

role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)

strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True

optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(cost)

if fleet.is_server():
    fleet.init_server()
    fleet.run_server()

elif fleet.is_worker():
    place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())

    step = 1001
    for i in range(step):
        cost_val = exe.run(program=paddle.static.default_main_program(),
                           feed=gen_data(),
                           fetch_list=[cost.name])
        print("worker_index: %d, step%d cost = %f" %
              (fleet.worker_index(), i, cost_val[0]))
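Example #17 refers to an mlp network builder and a gen_data batch generator defined elsewhere; the definitions below are assumptions that only match the x/y placeholders and the feed dict used above (layer sizes and batch size are arbitrary):

import numpy as np
import paddle


def mlp(input_x, input_y, hid_dim=128, label_dim=2):
    # Assumed two-layer MLP; cross_entropy applies softmax internally and returns the mean loss.
    fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim, activation='tanh')
    fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim, activation='tanh')
    logits = paddle.static.nn.fc(x=fc_2, size=label_dim)
    return paddle.nn.functional.cross_entropy(input=logits, label=input_y)


def gen_data(batch_size=32):
    # Assumed random batch matching the names and shapes of the placeholders above.
    return {
        "x": np.random.random(size=(batch_size, 32)).astype('float32'),
        "y": np.random.randint(2, size=(batch_size, 1)).astype('int64'),
    }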
Example #18
def do_train(args):
    # Initialize the paddle and paddle fleet execution environment
    paddle.enable_static()
    place = paddle.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))
    fleet.init(is_collective=True)

    # Create the random seed for the worker
    set_seed(args.seed)
    worker_init = WorkerInitObj(args.seed + fleet.worker_index())

    # Define the input data in the static mode
    data_holders = create_data_holder(args)

    [
        input_ids, segment_ids, input_mask, masked_lm_positions,
        masked_lm_labels, next_sentence_labels, masked_lm_scale
    ] = data_holders

    # Define the model structure in static mode
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    model = BertForPretraining(
        BertModel(**model_class.pretrained_init_configuration[
            args.model_name_or_path]))
    criterion = BertPretrainingCriterion(model.bert.config["vocab_size"])
    prediction_scores, seq_relationship_score = model(
        input_ids=input_ids,
        token_type_ids=segment_ids,
        attention_mask=input_mask,
        masked_positions=masked_lm_positions)
    loss = criterion(prediction_scores, seq_relationship_score,
                     masked_lm_labels, next_sentence_labels, masked_lm_scale)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs
    # Define the dynamic learning rate scheduler and optimizer
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_steps)

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    # Use the fleet api to compile the distributed optimizer
    strategy = fleet.DistributedStrategy()
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(loss)

    # Define the Executor for running the static model
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())
    state_dict = model.state_dict()

    # Use the state dict to update the parameter
    reset_state_dict = reset_program_state_dict(model, state_dict)
    paddle.static.set_program_state(paddle.static.default_main_program(),
                                    reset_state_dict)

    pool = ThreadPoolExecutor(1)
    global_step = 0
    tic_train = time.time()
    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()
    epoch = 0
    while True:
        files = [
            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
            if os.path.isfile(os.path.join(args.input_dir, f)) and "training" in
            f
        ]
        files.sort()
        num_files = len(files)
        random.Random(args.seed + epoch).shuffle(files)
        f_start_id = 0

        # Select one file for each worker and create the DataLoader for the file
        data_file = select_dataset_file_for_each_worker(
            files, f_start_id, worker_num, worker_index)
        train_data_loader, _ = create_pretraining_dataset(
            data_file, args.max_predictions_per_seq, args, data_holders,
            worker_init, paddle.static.cuda_places())

        for f_id in range(f_start_id + 1, len(files)):
            data_file = select_dataset_file_for_each_worker(
                files, f_id, worker_num, worker_index)
            dataset_future = pool.submit(create_pretraining_dataset, data_file,
                                         args.max_predictions_per_seq, args,
                                         data_holders, worker_init,
                                         paddle.static.cuda_places())

            for step, batch in enumerate(train_data_loader):
                global_step += 1
                loss_return = exe.run(paddle.static.default_main_program(),\
                    feed=batch,
                    fetch_list=[loss])
                # In the 2.0 API, this must be called explicitly to update the learning rate
                lr_scheduler.step()
                if global_step % args.logging_steps == 0:
                    time_cost = time.time() - tic_train
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, ips :%.2f sequences/s"
                        % (global_step, epoch, step, loss_return[0],
                           args.logging_steps / time_cost,
                           args.logging_steps * args.batch_size / time_cost))
                    tic_train = time.time()
                if global_step % args.save_steps == 0:
                    if worker_index == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # TODO(fangzeyang): Update the save_params to paddle.static
                        paddle.fluid.io.save_params(exe, output_dir)
                        tokenizer.save_pretrained(output_dir)
                if global_step >= args.max_steps:
                    del train_data_loader
                    return
            del train_data_loader
            train_data_loader, data_file = dataset_future.result(timeout=None)
        epoch += 1
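select_dataset_file_for_each_worker() is not shown in this example; a plausible sketch, under the assumption that each worker simply takes one shard of the sorted file list per outer iteration (the helper in the source repository may handle the worker_num > num_files case differently):

def select_dataset_file_for_each_worker(files, f_id, worker_num, worker_index):
    # Hypothetical sharding rule: round-robin one file per worker for iteration f_id.
    num_files = len(files)
    return files[(f_id * worker_num + worker_index) % num_files]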
Example #19
def test_worker_index():
    """test_worker_index"""
    assert fleet.worker_index() == 0
    print("{} ... ok".format(sys._getframe().f_code.co_name))
Example #20
    def test_worker_index(self):
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        print(fleet.worker_index())
Example #21
def write_xbox_donefile(output_path,
                        day,
                        pass_id,
                        xbox_base_key,
                        client,
                        hadoop_fs_name="",
                        monitor_data="",
                        donefile_name=None):
    """
    write the xbox donefile when saving an xbox model

    Args:
        output_path(str): output path
        day(str|int): training day
        pass_id(str|int): training pass id
        xbox_base_key(str|int): xbox base key
        client(HDFSClient): hadoop client
        hadoop_fs_name(str): hadoop fs name, default ""
        monitor_data(str): monitor data, default ""
        donefile_name(str): donefile name, default is "xbox_patch_done.txt"
            for delta passes and "xbox_base_done.txt" for base models
    """
    day = str(day)
    pass_id = str(pass_id)
    xbox_base_key = int(xbox_base_key)
    mode = None

    if pass_id != "-1":
        mode = "patch"
        suffix_name = "/%s/delta-%s/" % (day, pass_id)
        model_path = output_path.rstrip("/") + suffix_name
        if donefile_name is None:
            donefile_name = "xbox_patch_done.txt"
    else:
        mode = "base"
        suffix_name = "/%s/base/" % day
        model_path = output_path.rstrip("/") + suffix_name
        if donefile_name is None:
            donefile_name = "xbox_base_done.txt"

    if fleet.worker_index() == 0:
        donefile_path = output_path + "/" + donefile_name
        xbox_str = _get_xbox_str(
            model_path=model_path,
            xbox_base_key=xbox_base_key,
            hadoop_fs_name=hadoop_fs_name,
            monitor_data=monitor_data,
            mode=mode)
        if not is_local(donefile_path):
            if client.is_file(donefile_path):
                pre_content = client.cat(donefile_path)
                last_line = pre_content.split("\n")[-1]
                if last_line == '':
                    last_line = pre_content.split("\n")[-2]
                last_dict = json.loads(last_line)
                last_day = last_dict["input"].split("/")[-3]
                last_pass = last_dict["input"].split("/")[-2].split("-")[-1]
                exist = False
                if int(day) < int(last_day) or \
                        int(day) == int(last_day) and \
                        int(pass_id) <= int(last_pass):
                    exist = True
                if not exist:
                    with open(donefile_name, "w") as f:
                        f.write(pre_content + "\n")
                        f.write(xbox_str + "\n")
                    client.delete(donefile_path)
                    client.upload(
                        donefile_name,
                        output_path,
                        multi_processes=1,
                        overwrite=False)
                    logger.info("write %s/%s %s success" % \
                                (day, pass_id, donefile_name))
                else:
                    logger.info("do not write %s because %s/%s already "
                                "exists" % (donefile_name, day, pass_id))
            else:
                with open(donefile_name, "w") as f:
                    f.write(xbox_str + "\n")
                client.upload(
                    donefile_name,
                    output_path,
                    multi_processes=1,
                    overwrite=False)
                logger.info("write %s/%s %s success" % \
                            (day, pass_id, donefile_name))
        else:
            file = Path(donefile_path)
            if not file.is_file():
                with open(donefile_path, "w") as f:
                    f.write(xbox_str + "\n")
                return
            with open(donefile_path, encoding='utf-8') as f:
                pre_content = f.read().strip("\n")
            exist = False
            last_line = pre_content.split("\n")[-1]
            last_dict = json.loads(last_line, strict=False)
            last_day = last_dict["input"].split("/")[-3]
            last_pass = last_dict["input"].split("/")[-2].split("-")[-1]
            if int(day) < int(last_day) or \
                    int(day) == int(last_day) and \
                    int(pass_id) <= int(last_pass):
                exist = True
            if not exist:
                with open(donefile_path, "w") as f:
                    f.write(pre_content + "\n")
                    f.write(xbox_str + "\n")
Example #22
def write_model_donefile(output_path,
                         day,
                         pass_id,
                         xbox_base_key,
                         client,
                         donefile_name="donefile.txt"):
    """
    write the donefile when saving a model

    Args:
        output_path(str): output path
        day(str|int): training day
        pass_id(str|int): training pass id
        xbox_base_key(str|int): xbox base key
        client(HDFSClient): hadoop client
        donefile_name(str): donefile name, default is "donefile.txt"
    """
    day = str(day)
    pass_id = str(pass_id)
    xbox_base_key = int(xbox_base_key)

    if pass_id != "-1":
        suffix_name = "/%s/%s/" % (day, pass_id)
        model_path = output_path.rstrip("/") + suffix_name
    else:
        suffix_name = "/%s/0/" % day
        model_path = output_path.rstrip("/") + suffix_name

    if fleet.worker_index() == 0:
        donefile_path = output_path + "/" + donefile_name
        content = "%s\t%lu\t%s\t%s\t%d" % (day, xbox_base_key, \
                                            model_path, pass_id, 0)
        if not is_local(model_path):
            if client.is_file(donefile_path):
                pre_content = client.cat(donefile_path)
                pre_content_list = pre_content.split("\n")
                day_list = [i.split("\t")[0] for i in pre_content_list]
                pass_list = [i.split("\t")[3] for i in pre_content_list]
                exist = False
                for i in range(len(day_list)):
                    if int(day) == int(day_list[i]) and \
                            int(pass_id) == int(pass_list[i]):
                        exist = True
                        break
                if not exist:
                    with open(donefile_name, "w") as f:
                        f.write(pre_content + "\n")
                        f.write(content + "\n")
                    client.delete(donefile_path)
                    client.upload(donefile_name, output_path)
                    logger.info("write %s/%s %s succeed" % \
                                (day, pass_id, donefile_name))
                else:
                    logger.info("not write %s because %s/%s already "
                                "exists" % (donefile_name, day, pass_id))
            else:
                with open(donefile_name, "w") as f:
                    f.write(content + "\n")
                client.upload(donefile_name, output_path)
                logger.info("write %s/%s %s succeed" % \
                            (day, pass_id, donefile_name))
        else:
            file = Path(donefile_path)
            logger.info("model done file path = {}, content = {}".format(
                donefile_path, content))
            if not file.is_file():
                logger.info(" {} doesn't exist ".format(donefile_path))
                with open(donefile_path, "w") as f:
                    f.write(content + "\n")
                return
            with open(donefile_path, encoding='utf-8') as f:
                pre_content = f.read().strip("\n")
            logger.info("pre_content = {}".format(pre_content))
            pre_content_list = pre_content.split("\n")
            day_list = [i.split("\t")[0] for i in pre_content_list]
            pass_list = [i.split("\t")[3] for i in pre_content_list]
            exist = False
            for i in range(len(day_list)):
                if int(day) == int(day_list[i]) and \
                        int(pass_id) == int(pass_list[i]):
                    exist = True
                    break
            if not exist:
                with open(donefile_path, "w") as f:
                    f.write(pre_content + "\n")
                    logger.info("write donefile {}".format(pre_content))
                    f.write(content + "\n")
                    logger.info("write donefile {}".format(content))
                logger.info("write %s/%s %s succeed" % \
                            (day, pass_id, donefile_name))
            else:
                logger.info("not write %s because %s/%s already "
                            "exists" % (donefile_name, day, pass_id))
Example #23
def train(args):
    log.info("pretraining start")
    profile = False

    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))

    # set seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    # define execution strategy
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 2
    exec_strategy.num_iteration_per_drop_scope = 1

    # define distribution strategy
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.execution_strategy = exec_strategy
    dist_strategy.nccl_comm_num = 3
    if args.use_recompute:
        log.info("using recompute.")
    dist_strategy.recompute = args.use_recompute
    dist_strategy.sharding = args.use_sharding
    dist_strategy.pipeline = args.num_pp > 1

    # define topology structure for dp/pp/mp
    topo = Topology(rank=fleet.worker_index(),
                    world_size=fleet.worker_num(),
                    dp=args.num_dp,
                    pp=args.num_pp,
                    sharding=args.num_sharding,
                    mp=args.num_mp)

    is_last = False
    if topo.pp.rank == (topo.pp.size - 1):
        is_last = True

    dp_sharding_rank = topo.dp.rank * topo.sharding.size + topo.sharding.rank
    dp_worldsize = topo.dp.size * topo.sharding.size
    bsz_per_dp = args.global_bsz // dp_worldsize

    micro_bsz = args.micro_bsz
    assert args.global_bsz % micro_bsz == 0, f"cannot do gradient accumulation, global_bsz: {args.global_bsz} micro_bsz: {micro_bsz}"
    acc_steps = bsz_per_dp // micro_bsz

    # sharding \ model parallel \ pipeline
    assert dist_strategy.sharding == True
    dist_strategy.sharding_configs = {
        "segment_broadcast_MB": 32,
        "sharding_degree": args.num_sharding,
        "mp_degree": args.num_mp,
        "pp_degree": args.num_pp,
        "dp_degree": args.num_dp,
        "optimize_offload": True,
    }
    dist_strategy.pipeline_configs = {
        "schedule_mode": "1F1B",
        "micro_batch_size": micro_bsz,
        "accumulate_steps": acc_steps,
    }
    log.info(
        f"using global_bsz: {args.global_bsz} micro_bsz: {micro_bsz}, acc_steps: {acc_steps}"
    )

    dist_strategy.amp = args.use_amp
    dist_strategy.amp_configs = {
        "custom_white_list": ['softmax', 'layer_norm', 'gelu'],
        "init_loss_scaling": 32768,
        "decr_every_n_nan_or_inf": 2,
        "incr_every_n_steps": 1000,
        "incr_ratio": 2.0,
        "use_dynamic_loss_scaling": True,
        "decr_ratio": 0.5,
        "use_pure_fp16": False,
        "use_fp16_guard": False,
    }

    dist_strategy.lamb = args.use_lamb
    dist_strategy.lamb_configs = {
        'lamb_weight_decay':
        0.01,
        'exclude_from_weight_decay':
        ['layer_norm_bias', 'layer_norm_scale', '.b_0']
    }

    train_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(train_program, startup_program):
        with fluid.unique_name.guard():
            graph_vars = create_model(args, 'train', micro_bsz,
                                      dp_sharding_rank, dp_worldsize, topo)
            data_loader = graph_vars['data_loader']
            for op in train_program.global_block().ops:
                if op.type == 'fill_constant':
                    op._set_attr(
                        'op_device', "gpu:0"
                    )  # XXX: hack: https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/layers/tensor.py#L1376

            if args.use_recompute:
                dist_strategy.recompute_configs = {
                    "checkpoints": graph_vars['checkpoints'],
                    # "enable_offload": args.use_offload,
                    # "checkpoint_shape": [micro_bsz, args.max_seq_len, 4096],
                }

            log.debug("base lr: {}".format(args.learning_rate))
            scheduled_lr = linear_warmup_decay(
                learning_rate=args.learning_rate,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps)

            clip_norm_thres = 1.0
            if paddlenlp.ops.optimizer._jit_compile():
                optimizer = paddlenlp.ops.optimizer.AdamwOptimizer(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    weight_decay=args.weight_decay,
                    apply_decay_param_fun=apply_weight_decay_fun)
            else:
                optimizer = fluid.optimizer.Adam(
                    learning_rate=scheduled_lr,
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=clip_norm_thres),
                    #multi_precision=True,
                    #weight_decay=args.weight_decay, # merge this pr to use weight_decay: https://github.com/PaddlePaddle/Paddle/pull/29248
                    #exclude_from_weight_decay_fn=exclude_from_weight_decay
                )

            optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
            log.info(f"using dist strategy: {dist_strategy}")

            optimizer.minimize(graph_vars['total_loss'])

            final_strategy = fleet._final_strategy()
            applied_meta_list = fleet._get_applied_meta_list()
            log.info("final strategy: {}".format(final_strategy))
            log.info("applied_meta_list: {}".format(applied_meta_list))

    program_desc_dir = os.path.join(args.output_dir, "program_desc")
    if not os.path.isdir(program_desc_dir):
        os.mkdir(program_desc_dir)

    with open(
            program_desc_dir + "/main_program.txt.%d" %
        (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f:
        f.write(str(train_program))

    with open(
            program_desc_dir + "/startup_program.txt.%d" %
        (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f:
        f.write(str(startup_program))

    exe = fluid.Executor(place)
    exe.run(startup_program)

    optimizer.amp_init(place)

    #save_path = os.path.join(args.output_dir, 'step_0')
    #log.debug("saving models to {}".format(save_path))
    #save_persistables(exe, save_path, train_program)

    if args.init_checkpoint and args.init_checkpoint != "":
        log.info(' ')
        log.info(
            '############################WARNING############################')
        log.info(
            '####### using init_checkpoint, not init_pretraining_params ####')
        log.info(
            '## hyper params, e.g. lr, will be inherited from the checkpoint ##')
        log.info(
            '###############################################################')
        init_checkpoint(exe, args.init_checkpoint, train_program)
        log.info(' ')

    output_dir = args.output_dir
    save_steps = args.save_steps
    total_time = 0
    cost_vals, lm_losses, sop_accs = [], [], []
    global_steps = args.global_steps + 1
    steps = 0
    log_path = 'train_log/node-%d' % fleet.worker_index()
    start_time = time.time()
    with LogWriter(os.path.join(args.output_dir, log_path)) as swriter:
        data_loader.start()
        while True:
            #if steps < global_steps:
            #    steps += 1
            #    continue
            if not is_last:
                fetch_list = []
            else:
                fetch_list = [
                    graph_vars['total_loss'], graph_vars['mean_mask_lm_loss'],
                    scheduled_lr
                ]
                if args.use_sop:
                    fetch_list.extend(
                        [graph_vars['sop_acc'], graph_vars['sop_loss']])
                if args.use_amp:
                    loss_scaling = train_program.global_block(
                    ).vars['loss_scaling_0']
                    fetch_list.append(loss_scaling)

            # run one mini-batch (= acc_steps micro-batches)
            ret = exe.run(train_program, fetch_list=fetch_list)
            # use_program_cache=True

            steps += 1

            if is_last:
                if args.use_sop and args.use_amp:
                    cost_val, lm_loss, lr, sop_acc, sop_loss, loss_scaling_0 = ret
                elif args.use_sop:
                    cost_val, lm_loss, lr, sop_acc, sop_loss = ret
                elif args.use_amp:
                    cost_val, lm_loss, lr, loss_scaling_0 = ret
                else:
                    cost_val, lm_loss, lr = ret
                cost_vals.append(cost_val[0])
                lm_losses.append(lm_loss[0])
                if args.use_sop:
                    sop_accs.append(sop_acc[0])

                if steps > 0 and (steps % args.log_steps) == 0:
                    end_time = time.time()
                    total_time = end_time - start_time
                    cost_val = np.mean(cost_vals)
                    lm_loss = np.mean(lm_losses)
                    swriter.add_scalar('loss/total_loss', cost_val, steps)
                    swriter.add_scalar('loss/mlm_loss', lm_loss, steps)
                    swriter.add_scalar('lr/scheduled_lr', lr[0], steps)

                    if args.use_sop:
                        sop_acc = np.mean(sop_accs)
                        swriter.add_scalar('loss/sop_loss', sop_loss, steps)
                        swriter.add_scalar('train/sop_acc', sop_acc, steps)
                    else:
                        sop_acc = 0.0

                    if args.use_amp:
                        swriter.add_scalar('lr/loss_scaling',
                                           loss_scaling_0[0], steps)
                    else:
                        loss_scaling_0 = [0.0]

                    log.info(
                        "worker_index: %d, step: %d, cost: %f, "
                        "mlm loss: %f, sentence order acc: %f, "
                        "speed: %f steps/s, "
                        "speed: %f samples/s, "
                        "speed: %f tokens/s, "
                        "learning rate: %.3e, loss_scalings: %f" %
                        (fleet.worker_index(), steps, cost_val, lm_loss,
                         sop_acc, args.log_steps / total_time,
                         args.log_steps * args.global_bsz / total_time,
                         args.log_steps * args.global_bsz * args.max_seq_len /
                         total_time, lr[0], loss_scaling_0[0]))

                    cost_vals, lm_losses, sop_accs = [], [], []
                    start_time = time.time()

            # TODO: add evaluation
            if steps > 0 and args.eval_steps > 0 and steps % args.eval_steps == 0:
                pass

            if steps > 0 and args.save_steps > 0 and steps % args.save_steps == 0:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir, 'step_' + str(steps))
                log.debug("saving models to {}".format(save_path))
                save_persistables(exe, save_path, train_program)

            if steps == args.num_train_steps:
                if args.use_hybrid_dp and fleet.worker_index() > 8:
                    continue
                save_path = os.path.join(output_dir,
                                         'final_step_' + str(steps))
                save_persistables(exe, save_path, train_program)
                log.debug("saving final models to {}".format(save_path))
                log.debug("end of training, total steps: {}".format(steps))
Example #24
def do_train(args):
    # Initialize the paddle and paddle fleet execution environment
    paddle.enable_static()
    place = paddle.set_device(args.device)
    fleet.init(is_collective=True)

    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()

    # Create the random seed for the worker
    set_seed(args.seed)
    worker_init = WorkerInitObj(args.seed + worker_index)

    # Define the input data in the static mode
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()

    data_holders = create_data_holder(args)

    [
        input_ids, segment_ids, input_mask, masked_lm_positions,
        masked_lm_labels, next_sentence_labels, masked_lm_scale
    ] = data_holders

    # Define the model structure in static mode
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    config = model_class.pretrained_init_configuration[args.model_name_or_path]
    if config["vocab_size"] % 8 != 0:
        config["vocab_size"] += 8 - (config["vocab_size"] % 8)
    model = BertForPretraining(BertModel(**config))
    criterion = BertPretrainingCriterion(model.bert.config["vocab_size"])
    prediction_scores, seq_relationship_score = model(
        input_ids=input_ids,
        token_type_ids=segment_ids,
        attention_mask=input_mask,
        masked_positions=masked_lm_positions)
    loss = criterion(prediction_scores, seq_relationship_score,
                     masked_lm_labels, next_sentence_labels, masked_lm_scale)

    # Define the dynamic learning rate scheduler and optimizer
    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        multi_precision=args.use_pure_fp16)

    # Use the fleet api to compile the distributed optimizer
    optimizer = dist_optimizer(args, optimizer)
    optimizer.minimize(loss)

    # Define the Executor for running the static model
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    state_dict = model.state_dict()

    # Use the state dict to update the parameter
    reset_state_dict = reset_program_state_dict(model, state_dict)
    paddle.static.set_program_state(main_program, reset_state_dict)
    if args.use_amp:
        optimizer.amp_init(place)

    pool = ThreadPoolExecutor(1)
    global_step = 0
    tic_train = time.time()
    epoch = 0
    while True:
        files = [
            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
            if os.path.isfile(os.path.join(args.input_dir, f)) and "training" in
            f
        ]
        files.sort()
        num_files = len(files)
        random.Random(args.seed + epoch).shuffle(files)
        f_start_id = 0

        # Select one file for each worker and create the DataLoader for the file
        data_file = select_dataset_file_for_each_worker(
            files, f_start_id, worker_num, worker_index)
        train_data_loader, _ = create_pretraining_dataset(
            data_file, args.max_predictions_per_seq, args, data_holders,
            worker_init, paddle.static.cuda_places())

        for f_id in range(f_start_id + 1, len(files)):
            data_file = select_dataset_file_for_each_worker(
                files, f_id, worker_num, worker_index)
            dataset_future = pool.submit(create_pretraining_dataset, data_file,
                                         args.max_predictions_per_seq, args,
                                         data_holders, worker_init,
                                         paddle.static.cuda_places())

            train_cost_avg = TimeCostAverage()
            reader_cost_avg = TimeCostAverage()
            total_samples = 0
            batch_start = time.time()
            for step, batch in enumerate(train_data_loader):
                train_reader_cost = time.time() - batch_start
                reader_cost_avg.record(train_reader_cost)
                global_step += 1
                train_start = time.time()
                loss_return = exe.run(main_program,
                                      feed=batch,
                                      fetch_list=[loss])
                total_samples += args.batch_size
                # In the 2.0 API, this must be called explicitly to update the learning rate
                lr_scheduler.step()
                train_run_cost = time.time() - batch_start
                train_cost_avg.record(train_run_cost)

                # Profile for model benchmark
                if args.profiler_options is not None:
                    profiler.add_profiler_step(args.profiler_options)

                if global_step % args.logging_steps == 0:
                    print(
                        "tobal step: %d, epoch: %d, batch: %d, loss: %f, "
                        "avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec"
                        % (global_step, epoch, step, loss_return[0],
                           reader_cost_avg.get_average(),
                           train_cost_avg.get_average(), total_samples /
                           args.logging_steps, args.batch_size / (
                               reader_cost_avg.get_average() +
                               train_cost_avg.get_average())))
                    total_samples = 0
                    train_cost_avg.reset()
                    reader_cost_avg.reset()
                if global_step % args.save_steps == 0:
                    if worker_index == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model.save_model_config(output_dir)
                        paddle.static.save(main_program,
                                           os.path.join(output_dir,
                                                        "model_state"))
                        tokenizer.save_pretrained(output_dir)
                if global_step >= args.max_steps:
                    reader_start = time.time()
                    del train_data_loader
                    return
                batch_start = time.time()
            del train_data_loader
            train_data_loader, data_file = dataset_future.result(timeout=None)
        epoch += 1
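dist_optimizer(args, optimizer) is a project helper that is not shown here; a hedged sketch of what it presumably does, following the plain pattern from Example #18 (the real helper likely also configures amp/recompute options from args):

from paddle.distributed import fleet


def dist_optimizer(args, optimizer):
    # Hypothetical version: build a DistributedStrategy from the flags and wrap the optimizer.
    strategy = fleet.DistributedStrategy()
    strategy.amp = bool(args.use_amp)
    return fleet.distributed_optimizer(optimizer, strategy=strategy)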
def do_generation(args):
    # Initialize the paddle and paddle fleet execution environment
    paddle.enable_static()

    assert args.dp_degree == 1, "Data parallel is not supported in inference"
    assert args.sharding_degree == 1, "Sharding parallel is temporarily not supported in inference"
    assert args.pp_degree == 1, "Pipeline parallel will be supported later"

    if args.mp_degree == 1:
        args.mp_degree = paddle.distributed.get_world_size()
    else:
        assert args.mp_degree == paddle.distributed.get_world_size(), \
            "If mp_degree is specified, the size must be the same as world_size"

    strategy = fleet.DistributedStrategy()
    strategy.tensor_parallel = True
    strategy.tensor_parallel_configs = {
        "tensor_parallel_degree": args.mp_degree
    }

    fleet.init(is_collective=True, strategy=strategy)

    # temporarily use dynamic init; switch to HybridParallelInferenceHelper in the future?
    paddle.distributed.init_parallel_env()

    # Create the random seed for the worker
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    if args.use_amp and args.amp_level == "O2":
        assert (args.mp_degree == 1 and args.pp_degree == 1
                ), "When amp level is O2, mp_degree and pp_degree should be 1."
        assert (args.use_sharding == False
                ), "When amp level is O2, use_sharding should be False."

    assert args.device in [
        "cpu", "gpu", "xpu"
    ], "Invalid device! Available device should be cpu, gpu, or xpu."
    place = paddle.set_device(args.device)

    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()
    local_rank = 0 if fleet.local_rank() is None else int(fleet.local_rank())

    topo = Topology(
        device_rank=worker_index,
        world_size=worker_num,
        dp_degree=args.dp_degree,
        pp_degree=args.pp_degree,
        sharding_degree=args.sharding_degree,
        mp_degree=args.mp_degree)

    logger.info("The topo of hybrid parallelism:\n{}".format(topo))

    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    data_file = get_data_file(args)
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    with paddle.static.program_guard(main_program, startup_program):
        with paddle.utils.unique_name.guard():
            with paddle.static.device_guard('gpu:0'):
                feeds = create_data_holder(args)
                tokenizer = tokenizer_class.from_pretrained(
                    args.model_name_or_path)
                eos_id = tokenizer.eos_token_id

                _, _, test_data_loader = create_pretrained_dataset(
                    args,
                    data_file,
                    local_rank=local_rank,
                    data_world_size=topo.data_info.size,
                    data_world_rank=topo.data_info.rank,
                    eos_id=eos_id,
                    max_seq_len=args.max_seq_len,
                    places=paddle.static.cuda_places(),
                    data_holders=feeds,
                    pipeline_mode=False)

                if args.model_name_or_path in pretrained_models_list:
                    model_config = model_class.pretrained_init_configuration[
                        args.model_name_or_path]
                    model_config[
                        "hidden_dropout_prob"] = args.hidden_dropout_prob
                    model_config[
                        "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
                    model_config["topo"] = topo
                    model_config["fuse"] = args.fuse
                    model = GPTForGeneration(
                        GPTModel(**model_config),
                        max_length=args.max_dec_len,
                        decoding_strategy=args.decoding_strategy,
                        temperature=args.temperature,
                        top_k=args.topk,
                        top_p=args.topp,
                        eos_id=eos_id,
                        fuse=args.fuse)
                else:
                    logger.error("No checkpoint load.")
                model.eval()
                ins = {v.name: v for v in feeds}
                preds = model(ins)

    # Define the Executor for running the static model
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    main_program = main_program.clone(for_test=True)

    model_urls = model.pretrained_resource_files_map['model_state']
    model_path = args.model_name_or_path
    if model_path in pretrained_models_list and model_path in model_urls:
        flag_loaded = False
        from paddle.utils.download import get_weights_path_from_url
        dygraph_path = get_weights_path_from_url(model_urls[model_path])
        if os.path.exists(dygraph_path):
            if args.sharding_degree > 1:
                logger.warning("Sharding should init with static vars")
            else:
                logger.info("Loading parameters from %s" % dygraph_path)
                init_static_with_params(
                    model,
                    paddle.load(
                        dygraph_path, return_numpy=True),
                    topo,
                    main_program)
                flag_loaded = True
        if not flag_loaded:
            logger.error("No checkpoint load.")

    global_step = 0
    epoch = 0
    fetchs = [preds]

    ### check results
    text = [
        "Question: Where is the capital of China? Answer:",
        "Question:Who is the CEO of Apple? Answer:"
    ]
    inputs = tokenizer(
        text,
        padding=True,
        return_attention_mask=True,
        return_position_ids=True)
    ids = np.array(inputs["input_ids"]).reshape(len(text), -1).astype('int64')
    position_ids = np.array(inputs["position_ids"]).reshape(len(text),
                                                            -1).astype('int64')
    attention_mask = np.array(inputs["attention_mask"]).reshape(
        len(text), -1).astype('float32')

    t_ids = paddle.fluid.core.Tensor()
    t_ids.set(ids, place)
    t_mask = paddle.fluid.core.Tensor()
    t_mask.set(attention_mask, place)
    t_pos = paddle.fluid.core.Tensor()
    t_pos.set(position_ids, place)
    feed_data = {'src_ids': t_ids, 'pos_ids': t_pos, 'input_mask': t_mask}
    ret = exe.run(main_program, feed=feed_data, fetch_list=fetchs)
    ret = np.array(ret[0])
    for i in range(ret.shape[0]):
        o = [int(x) for x in ret[i]]
        ret_str = tokenizer.convert_ids_to_string(o)
        ret_str = text[i] + ret_str
        logger.info(ret_str)
    ##################

    for step, batch in enumerate(test_data_loader()):
        ret = exe.run(main_program, feed=batch, fetch_list=fetchs)
        if step == 5:
            break

    if args.save_inference_model_then_exist:
        save_inference_model_dir = 'inference_model_pp{pp_degree}mp{mp_degree}'.format(
            pp_degree=args.pp_degree, mp_degree=args.mp_degree)
        inference_save_path = os.path.join(save_inference_model_dir,
                                           'rank_' + str(fleet.worker_index()),
                                           'step_' + str(0))
        print("saving inference models to {}".format(inference_save_path))
        feed_names = [v.name for v in feeds]
        fetchs_names = [v.name for v in fetchs]
        print('feeds: ', feed_names, 'fetches: ', fetchs_names)
        paddle.static.save_inference_model(
            inference_save_path, feeds, fetchs, exe, program=main_program)
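Once the inference model has been exported, it can be reloaded in a plain static-graph session. A minimal sketch; the path prefix below assumes a single-GPU run (pp=1, mp=1, rank 0, step 0) and must match whatever save_inference_model actually wrote:

import paddle

paddle.enable_static()
exe = paddle.static.Executor(paddle.CUDAPlace(0))
# Assumed prefix for a pp=1 / mp=1 export saved by rank 0 at step 0.
path_prefix = 'inference_model_pp1mp1/rank_0/step_0'
infer_program, feed_names, fetch_targets = paddle.static.load_inference_model(path_prefix, exe)
# Feed a dict keyed by feed_names (numpy arrays shaped like the original placeholders)
# and fetch fetch_targets with exe.run(infer_program, feed=..., fetch_list=fetch_targets).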
    def run_online_worker(self):
        logger.info("Run Online Worker Begin")
        use_cuda = int(config.get("runner.use_gpu"))
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        with open("./{}_worker_main_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_main_program()))
        with open("./{}_worker_startup_program.prototxt".format(
                fleet.worker_index()), 'w+') as f:
            f.write(str(paddle.static.default_startup_program()))

        self.exe.run(paddle.static.default_startup_program())
        fleet.init_worker()

        self.online_intervals = get_online_pass_interval(
            self.split_interval, self.split_per_pass, False)
        if is_local(self.save_model_path) and self.save_model_path and (
                not os.path.exists(self.save_model_path)):
            os.makedirs(self.save_model_path)

        last_day, last_pass, last_path, xbox_base_key = get_last_save_model(
            self.save_model_path, self.hadoop_client)
        logger.info(
            "get_last_save_model last_day = {}, last_pass = {}, last_path = {}, xbox_base_key = {}".
            format(last_day, last_pass, last_path, xbox_base_key))
        if last_day != -1 and fleet.is_first_worker():
            load_model(last_path, 0, self.hadoop_client)
        fleet.barrier_worker()

        day = self.start_day
        infer_first = True
        while int(day) <= int(self.end_day):
            logger.info("training a new day {}, end_day = {}".format(
                day, self.end_day))
            if last_day != -1 and int(day) < last_day:
                day = get_next_day(day)
                continue
            # base_model_saved = False
            for pass_id in range(1, 1 + len(self.online_intervals)):
                print(last_day, day, last_pass, pass_id)
                if (last_day != -1 and int(day) == last_day) and (
                        last_pass != -1 and int(pass_id) <= last_pass):
                    continue
                if self.save_first_base and fleet.is_first_worker():
                    self.save_first_base = False
                    last_base_day, last_base_path, tmp_xbox_base_key = \
                        get_last_save_xbox_base(self.save_model_path, self.hadoop_client)
                    logger.info(
                        "get_last_save_xbox_base, last_base_day = {}, last_base_path = {}, tmp_xbox_base_key = {}".
                        format(last_base_day, last_base_path,
                               tmp_xbox_base_key))
                    if int(day) > last_base_day:
                        xbox_base_key = int(time.time())
                        save_xbox_model(self.save_model_path, day, -1,
                                        self.exe, self.inference_feed_vars,
                                        self.inference_target_var,
                                        self.hadoop_client)
                        write_xbox_donefile(
                            output_path=self.save_model_path,
                            day=day,
                            pass_id=-1,
                            xbox_base_key=xbox_base_key,
                            client=self.hadoop_client)
                    elif int(day) == last_base_day:
                        xbox_base_key = tmp_xbox_base_key
                fleet.barrier_worker()

                logger.info("training a new day = {} new pass = {}".format(
                    day, pass_id))
                logger.info("Day:{}, Pass: {}, Prepare Dataset Begin.".format(
                    day, pass_id))
                begin_train = time.time()
                begin = time.time()
                dataset = self.wait_and_prepare_dataset(day, pass_id)
                end = time.time()
                read_data_cost = (end - begin) / 60.0
                logger.info("Prepare Dataset Done, using time {} mins.".format(
                    read_data_cost))

                infer_cost = 0
                infer_metric_cost = 0
                if infer_first:
                    infer_first = False
                else:
                    logger.info("Day:{}, Pass: {}, Infering Dataset Begin.".
                                format(day, pass_id))
                    begin = time.time()
                    self.dataset_infer_loop(dataset, day, pass_id)
                    end = time.time()
                    infer_cost = (end - begin) / 60.0
                    logger.info("Infering Dataset Done, using time {} mins.".
                                format(infer_cost))
                    begin = time.time()
                    metric_str = get_global_metrics_str(fluid.global_scope(),
                                                        self.metric_list, "")
                    logger.info("Day:{}, Pass: {}, Infer Global Metric: {}".
                                format(day, pass_id, metric_str))
                    clear_metrics(fluid.global_scope(), self.metric_list,
                                  self.metric_types)
                    end = time.time()
                    infer_metric_cost = (end - begin) / 60.0

                logger.info("Day:{}, Pass: {}, Training Dataset Begin.".format(
                    day, pass_id))
                begin = time.time()
                self.dataset_train_loop(dataset, day, pass_id,
                                        self.need_train_dump)
                end = time.time()
                avg_cost = get_avg_cost_mins(end - begin)
                get_max_cost_mins(end - begin)
                get_min_cost_mins(end - begin)
                train_cost = avg_cost
                logger.info("Training Dataset Done, using time {} mins.".
                            format(train_cost))

                begin = time.time()
                dataset.release_memory()
                end = time.time()
                release_cost = (end - begin) / 60.0

                begin = time.time()
                metric_str = get_global_metrics_str(fluid.global_scope(),
                                                    self.metric_list, "")
                logger.info("Day:{}, Pass: {}, Train Global Metric: {}".format(
                    day, pass_id, metric_str))
                clear_metrics(fluid.global_scope(), self.metric_list,
                              self.metric_types)
                end = time.time()
                metric_cost = (end - begin) / 60.0
                end_train = time.time()
                total_cost = (end_train - begin_train) / 60.0
                other_cost = (total_cost - read_data_cost - train_cost - release_cost -
                              metric_cost - infer_cost - infer_metric_cost)
                log_str = "finished train epoch %d time cost:%s min job time cost" \
                          ":[read_data:%s min][train: %s min][metric: %s min][release: %s min]" \
                          "[infer:%s min][infer_metric: %s min][other:%s min]" \
                          % (pass_id, total_cost, read_data_cost, train_cost, metric_cost,
                             release_cost, infer_cost, infer_metric_cost, other_cost)
                logger.info(log_str)

                if self.need_infer_dump:
                    prepare_data_start_time = time.time()
                    dump_dataset = self.wait_and_prepare_infer_dataset(day,
                                                                       pass_id)
                    prepare_data_end_time = time.time()
                    logger.info(
                        "Prepare Infer Dump Dataset Done, using time {} second.".
                        format(prepare_data_end_time -
                               prepare_data_start_time))

                    dump_start_time = time.time()
                    self.dataset_infer_loop(dump_dataset, day, pass_id, True)
                    dump_end_time = time.time()
                    logger.info(
                        "Infer Dump Dataset Done, using time {} second.".
                        format(dump_end_time - dump_start_time))

                    dump_dataset.release_memory()

                if fleet.is_first_worker():
                    if pass_id % self.checkpoint_per_pass == 0:
                        save_model(self.exe, self.save_model_path, day,
                                   pass_id)
                        write_model_donefile(
                            output_path=self.save_model_path,
                            day=day,
                            pass_id=pass_id,
                            xbox_base_key=xbox_base_key,
                            client=self.hadoop_client)
                    if pass_id % self.save_delta_frequency == 0:
                        last_xbox_day, last_xbox_pass, last_xbox_path, _ = get_last_save_xbox(
                            self.save_model_path, self.hadoop_client)
                        if int(day) < last_xbox_day or int(
                                day) == last_xbox_day and int(
                                    pass_id) <= last_xbox_pass:
                            log_str = "delta model exists"
                            logger.info(log_str)
                        else:
                            save_xbox_model(self.save_model_path, day, pass_id,
                                            self.exe, self.inference_feed_vars,
                                            self.inference_target_var,
                                            self.hadoop_client)  # 1 delta
                            write_xbox_donefile(
                                output_path=self.save_model_path,
                                day=day,
                                pass_id=pass_id,
                                xbox_base_key=xbox_base_key,
                                client=self.hadoop_client,
                                hadoop_fs_name=self.hadoop_fs_name,
                                monitor_data=metric_str)
                fleet.barrier_worker()

            logger.info("shrink table")
            begin = time.time()
            fleet.shrink()
            end = time.time()
            logger.info("shrink table done, cost %s min" % (
                (end - begin) / 60.0))

            if fleet.is_first_worker():
                last_base_day, last_base_path, last_base_key = get_last_save_xbox_base(
                    self.save_model_path, self.hadoop_client)
                logger.info(
                    "one epoch finishes, get_last_save_xbox, last_base_day = {}, last_base_path = {}, last_base_key = {}".
                    format(last_base_day, last_base_path, last_base_key))
                next_day = get_next_day(day)
                if int(next_day) <= last_base_day:
                    logger.info("batch model/base xbox model exists")
                else:
                    xbox_base_key = int(time.time())
                    save_xbox_model(self.save_model_path, next_day, -1,
                                    self.exe, self.inference_feed_vars,
                                    self.inference_target_var,
                                    self.hadoop_client)
                    write_xbox_donefile(
                        output_path=self.save_model_path,
                        day=next_day,
                        pass_id=-1,
                        xbox_base_key=xbox_base_key,
                        client=self.hadoop_client,
                        hadoop_fs_name=self.hadoop_fs_name,
                        monitor_data=metric_str)
                    save_batch_model(self.exe, self.save_model_path, next_day)
                    write_model_donefile(
                        output_path=self.save_model_path,
                        day=next_day,
                        pass_id=-1,
                        xbox_base_key=xbox_base_key,
                        client=self.hadoop_client)
            fleet.barrier_worker()
            day = get_next_day(day)
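        # NOTE: get_next_day() is imported from a utility module and is not shown
        # here.  A minimal sketch of what such a helper might look like, assuming
        # days are "YYYYMMDD" strings (hypothetical, for illustration only):
        #
        #     import datetime
        #
        #     def get_next_day(day_str):
        #         """Return the calendar day after `day_str` as a YYYYMMDD string."""
        #         day = datetime.datetime.strptime(day_str, "%Y%m%d")
        #         return (day + datetime.timedelta(days=1)).strftime("%Y%m%d")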
Example #27
def do_train(args):
    # Initialize the paddle and paddle fleet execution environment
    paddle.enable_static()
    place = paddle.set_device(args.select_device)
    fleet.init(is_collective=True)

    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()

    # Set the random seed for this worker
    set_seed(args.seed)
    worker_init = WorkerInitObj(args.seed + worker_index)

    # Define the input data in the static mode
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    data_holders = create_data_holder(args)

    [
        input_ids, segment_ids, input_mask, masked_lm_positions,
        masked_lm_labels, next_sentence_labels, masked_lm_scale
    ] = data_holders

    # Define the model structure in static mode
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    config = model_class.pretrained_init_configuration[args.model_name_or_path]
    if config["vocab_size"] % 8 != 0:
        config["vocab_size"] += 8 - (config["vocab_size"] % 8)
    model = BertForPretraining(BertModel(**config))
    criterion = BertPretrainingCriterion(model.bert.config["vocab_size"])
    prediction_scores, seq_relationship_score = model(
        input_ids=input_ids,
        token_type_ids=segment_ids,
        attention_mask=input_mask,
        masked_positions=masked_lm_positions)
    loss = criterion(prediction_scores, seq_relationship_score,
                     masked_lm_labels, next_sentence_labels, masked_lm_scale)

    # Define the dynamic learning rate scheduler and optimizer
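    # The LambdaDecay below implements linear warmup for the first `warmup_steps`
    # steps followed by a linear decay towards zero over the remaining steps.
    # Note that the `len(train_data_loader) * num_train_epochs` fallback is only
    # usable once the data loader exists, so this example effectively assumes
    # args.max_steps > 0.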
    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
        args.learning_rate,
        lambda current_step, num_warmup_steps=args.warmup_steps,
        num_training_steps=args.max_steps if args.max_steps > 0 else
        (len(train_data_loader) * args.num_train_epochs): float(
            current_step) / float(max(1, num_warmup_steps))
        if current_step < num_warmup_steps else max(
            0.0,
            float(num_training_steps - current_step) / float(
                max(1, num_training_steps - num_warmup_steps))))

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ],
        multi_precision=args.use_pure_fp16)
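    # apply_decay_param_fun above restricts weight decay to parameters whose names
    # contain neither "bias" nor "norm", following the usual BERT convention of
    # excluding biases and LayerNorm weights from decay.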
    if worker_num == 1 and args.use_amp:
        custom_black_list = (['lookup_table', 'lookup_table_v2']
                             if args.use_pure_fp16 else None)
        amp_list = paddle.static.amp.AutoMixedPrecisionLists(
            custom_white_list=['softmax', 'layer_norm', 'gelu'],
            custom_black_list=custom_black_list)
        optimizer = paddle.static.amp.decorate(
            optimizer,
            amp_list,
            init_loss_scaling=args.scale_loss,
            use_dynamic_loss_scaling=True,
            use_pure_fp16=args.use_pure_fp16)

    if worker_num > 1:
        # Use the fleet api to compile the distributed optimizer
        optimizer = dist_optimizer(args, optimizer)
    optimizer.minimize(loss)

    # Define the Executor for running the static model
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    state_dict = model.state_dict()

    # Use the state dict to update the parameters
    reset_state_dict = reset_program_state_dict(model, state_dict)
    paddle.static.set_program_state(main_program, reset_state_dict)
    if args.use_amp:
        optimizer.amp_init(place)

    if worker_num == 1:
        # Construct the compiled program
        main_program = build_compiled_program(main_program, loss)

    pool = ThreadPoolExecutor(1)
    global_step = 0
    tic_train = time.time()
    epoch = 0
    while True:
        files = [
            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
            if os.path.isfile(os.path.join(args.input_dir, f))
            and "training" in f
        ]
        files.sort()
        num_files = len(files)
        random.Random(args.seed + epoch).shuffle(files)
        f_start_id = 0

        # Select one file for each worker and create the DataLoader for the file
        data_file = select_dataset_file_for_each_worker(
            files, f_start_id, worker_num, worker_index)
        train_data_loader, _ = create_pretraining_dataset(
            data_file, args.max_predictions_per_seq, args, data_holders,
            worker_init, paddle.static.cuda_places())

        for f_id in range(f_start_id + 1, len(files)):
            data_file = select_dataset_file_for_each_worker(
                files, f_id, worker_num, worker_index)
            dataset_future = pool.submit(create_pretraining_dataset, data_file,
                                         args.max_predictions_per_seq, args,
                                         data_holders, worker_init,
                                         paddle.static.cuda_places())

            train_reader_cost = 0.0
            train_run_cost = 0.0
            total_samples = 0
            reader_start = time.time()
            for step, batch in enumerate(train_data_loader):
                train_reader_cost += time.time() - reader_start
                global_step += 1
                train_start = time.time()
                loss_return = exe.run(main_program,
                                      feed=batch,
                                      fetch_list=[loss])
                train_run_cost += time.time() - train_start
                total_samples += args.batch_size
                # In the 2.0 API, this must be called explicitly to update the learning rate.
                lr_scheduler.step()
                if global_step % args.logging_steps == 0:
                    print(
                        "tobal step: %d, epoch: %d, batch: %d, loss: %f, "
                        "avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec"
                        % (global_step, epoch, step, loss_return[0],
                           train_reader_cost / args.logging_steps,
                           (train_reader_cost + train_run_cost) /
                           args.logging_steps,
                           total_samples / args.logging_steps, total_samples /
                           (train_reader_cost + train_run_cost)))
                    train_reader_cost = 0.0
                    train_run_cost = 0.0
                    total_samples = 0
                if global_step % args.save_steps == 0:
                    if worker_index == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # TODO(fangzeyang): Update save_params to the paddle.static API
                        paddle.fluid.io.save_params(exe, output_dir)
                        tokenizer.save_pretrained(output_dir)
                if global_step >= args.max_steps:
                    reader_start = time.time()
                    del train_data_loader
                    return
                reader_start = time.time()
            del train_data_loader
            train_data_loader, data_file = dataset_future.result(timeout=None)
        epoch += 1
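    # NOTE: select_dataset_file_for_each_worker() is a helper defined elsewhere in
    # this script.  A minimal sketch of the round-robin sharding it is assumed to
    # perform (hypothetical, for illustration only):
    #
    #     def select_dataset_file_for_each_worker(files, f_id, worker_num,
    #                                             worker_index):
    #         """Give each worker a different training file for pass `f_id`."""
    #         return files[(f_id * worker_num + worker_index) % len(files)]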
Example #28
def infer(args):
    """Inference main function."""
    if args.is_distributed:
        fleet.init(is_collective=True)

        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num = fleet.worker_num()
        trainer_id = fleet.worker_index()
        phase = "distributed_test"
    else:
        dev_count = 1
        gpu_id = 0
        trainers_num = 1
        trainer_id = 0
        phase = "test"
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    infer_generator = task.get_data_loader(model,
                                           input_file=args.infer_file,
                                           num_part=trainers_num,
                                           part_id=trainer_id,
                                           phase=phase,
                                           is_infer=True)

    # run inference
    timer = Timer()
    timer.start()
    infer_out = {}
    step = 0
    for step, data in enumerate(infer_generator(), 1):
        predictions = task.infer_step(model, data)
        for pred in predictions:
            infer_out[pred["data_id"]] = pred
        if step % args.log_steps == 0:
            time_cost = timer.pass_time
            print(f"\tstep: {step}, time: {time_cost:.3f}, "
                  f"queue size: {infer_generator.queue.size()}, "
                  f"speed: {step / time_cost:.3f} steps/s")

    time_cost = timer.pass_time
    print(f"[infer] steps: {step} time cost: {time_cost}, "
          f"speed: {step / time_cost} steps/s")

    if args.is_distributed:
        # Each worker dumps its own partial outputs; they are merged later on the master GPU.
        part_file = os.path.join(args.save_path,
                                 f"inference_output.part_{gpu_id}")
        with open(part_file, "w") as fp:
            json.dump(infer_out, fp, ensure_ascii=False, indent=2)
        part_finish_file = os.path.join(
            args.save_path, f"inference_output.part_{gpu_id}.finish")
        with open(part_finish_file, "w"):
            pass

    # Only run on master GPU in each node
    if gpu_id != 0:
        return

    if args.is_distributed:
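        # Poll until every device has written its ".finish" marker, then merge all
        # per-device part files into a single dict and delete the temporary files.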
        part_files = f"inference_output.part_*.finish"
        while True:
            ret = subprocess.getoutput(
                f"find {args.save_path} -maxdepth 1 -name {part_files}")
            num_completed = len(ret.split("\n"))
            if num_completed != dev_count:
                time.sleep(1)
                continue
            infer_out = {}
            for dev_id in range(dev_count):
                part_file = os.path.join(args.save_path,
                                         f"inference_output.part_{dev_id}")
                with open(part_file, "r") as fp:
                    part_infer_out = json.load(fp)
                    for data_id in part_infer_out:
                        infer_out[data_id] = part_infer_out[data_id]
            break
        subprocess.getoutput(
            "rm " + os.path.join(args.save_path, f"inference_output.part*"))

    # save inference outputs
    inference_output = os.path.join(args.save_path, args.save_name)
    save_array = []
    for i in range(len(infer_out)):
        save_array.append(infer_out[str(i)]["emb"])
    np_array = np.array(save_array)
    np.save(inference_output, np_array)

    return
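# Usage sketch (assumptions: this script is saved as infer.py and exposes the
# argparse flags referenced above, such as --is_distributed, --infer_file,
# --save_path and --save_name; exact flag spellings may differ in the real script):
#
#     python -m paddle.distributed.launch --gpus "0,1,2,3" infer.py \
#         --is_distributed true \
#         --infer_file ./data/infer_input.jsonl \
#         --save_path ./output \
#         --save_name inference_output.npy
#
# The launcher is expected to set FLAGS_selected_gpus for each worker, which is
# what the distributed branch above reads to pick its device.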