Esempio n. 1
0
    def net(self, args=None):
        """
        BERT net struct.
        Args:
            fleet:
            args (ArgumentParser): run args to config dist fleet.
        Returns:
            tuple: the return value contains avg_cost, py_reader
        """
        args = p_args()
        bert_config = BertConfig(DATA_DIR +
                                 "uncased_L-24_H-1024_A-16/bert_config.json")
        bert_config.print_config()
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        exe = fluid.Executor(place)
        # init program
        train_program = fluid.Program()
        startup_prog = fluid.Program()

        if args.random_seed != 0:
            print("set program random seed as: ", args.random_seed)
            startup_prog.random_seed = args.random_seed
            train_program.random_seed = args.random_seed

        task_name = args.task_name.lower()
        processors = {
            'xnli': reader.XnliProcessor,
            'cola': reader.ColaProcessor,
            'mrpc': reader.MrpcProcessor,
            'mnli': reader.MnliProcessor,
        }
        processor = processors[task_name](data_dir=args.data_dir,
                                          vocab_path=args.vocab_path,
                                          max_seq_len=args.max_seq_len,
                                          do_lower_case=args.do_lower_case,
                                          in_tokens=args.in_tokens,
                                          random_seed=args.random_seed)
        num_labels = len(processor.get_labels())

        dev_count = 1
        self.train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            dev_count=dev_count,
            dev_idx=0,
            shuffle=args.shuffle,
            shuffle_seed=args.shuffle_seed)

        num_train_examples = processor.get_num_examples(phase='train')

        max_train_steps = 5
        self.warmup_steps = 0.5

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        dist_strategy = DistributedStrategy()
        args.run_params = json.loads(args.run_params)
        dist_strategy.enable_inplace = args.run_params['enable_inplace']
        dist_strategy.fuse_all_reduce_ops = args.run_params[
            'fuse_all_reduce_ops']
        dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
        dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
        dist_strategy.mode = args.run_params["mode"]
        dist_strategy.collective_mode = args.run_params["collective"]
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.use_hierarchical_allreduce = False

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                self.train_pyreader, self.loss, probs, accuracy, num_seqs, checkpoints = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)
                scheduled_lr = optimization(loss=self.loss,
                                            warmup_steps=self.warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=False,
                                            loss_scaling=args.loss_scaling,
                                            dist_strategy=dist_strategy)
        exe.run(startup_prog)
        with open("__model__", "wb") as f:
            f.write(fleet._origin_program.desc.serialize_to_string())

        with open("debug_program", "w") as f:
            f.write(str(fleet._origin_program))
        return self.loss
Esempio n. 2
0
    def run_gpu_fleet_api_trainer(self, args):
        assert args.update_method == "nccl2"

        self.lr = args.lr

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.fuse_memory_size = 1  # MB
        dist_strategy.fuse_laryer_size = 1
        if args.use_local_sgd:
            dist_strategy.use_local_sgd = True
        if args.ut4grad_allreduce:
            dist_strategy._ut4grad_allreduce = True
        if args.sync_batch_norm:
            dist_strategy.sync_batch_norm = True

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        print_to_err("gpu_fleet", "fleet.node_num:")
        # "fleet.node_id:", fleet.node_id(),
        # "fleet.trainer_num:", fleet.worker_num())

        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)

        trainer_prog = fleet._origin_program
        dist_prog = fleet.main_program

        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        place = fluid.CUDAPlace(device_id)

        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        eprint(type(self).__name__, "run worker startup program done.")

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        eprint("feed_var_list:", feed_var_list)

        # tmp add this code to pass python35 gcc8 CI
        # Fixme(gongweibao, wangxi), need fix fleet api program order
        if feed_var_list[0].name == 'label':
            feed_var_list = feed_var_list[::-1]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = train_reader()

        def get_data():
            origin_batch = next(reader_generator)
            if args.update_method != "local" and args.use_reader_alloc:
                new_batch = []
                for offset, item in enumerate(origin_batch):
                    if offset % 2 == args.trainer_id:
                        new_batch.append(item)
                return new_batch
            else:
                return origin_batch

        print_to_err(type(self).__name__, "begin to train on trainer")
        out_losses = []
        for i in six.moves.xrange(RUN_STEP):
            loss, = exe.run(dist_prog,
                            fetch_list=[avg_cost.name],
                            feed=feeder.feed(get_data()))
            out_losses.append(loss[0])
            print_to_err(type(self).__name__, "run step %d finished" % i)
        print_to_err(type(self).__name__, "trainer run finished")

        if six.PY2:
            print(pickle.dumps(out_losses))
        else:
            sys.stdout.buffer.write(pickle.dumps(out_losses))

        if args.save_model:
            model_save_dir = "/tmp"
            if fleet.worker_index() == 0:
                model_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_persistables")
                model_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_persistables")
                infer_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_infer")
                infer_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_infer")
            else:
                model_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_persistables_2")
                model_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_persistables_2")
                infer_save_dir_fluid = os.path.join(model_save_dir,
                                                    "fluid_infer_2")
                infer_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_infer_2")
            fluid.io.save_persistables(exe, model_save_dir_fluid,
                                       fleet._origin_program)
            fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet)
            feeded_var_names = [var.name for var in feed_var_list]
            fluid.io.save_inference_model(infer_save_dir_fluid,
                                          feeded_var_names, [avg_cost], exe,
                                          fleet._origin_program)
            fleet.save_inference_model(exe, infer_save_dir_fleet,
                                       feeded_var_names, [avg_cost])
    def do_training(self, fleet, args):
        """
        begin training.
        Args:
            fleet (Collective): Collective inherited base class Fleet
            args (ArgumentParser): run args to config dist fleet.
        Returns:
            tuple: the value is train losses
        """
        args = parse_args()
        logging.info(args)
        gpu_id = int(os.environ.get('FLAGS_selected_gpus', 4))
        place = fluid.CUDAPlace(gpu_id)
        dev_count = 1
        exe = fluid.Executor(place)
        train_program = fluid.Program()
        startup_program = fluid.Program()
        args.num_trainers = fleet.worker_num()
        args.trainer_id = fleet.worker_index()
        args.run_params = json.loads(args.run_params)
        dist_strategy = DistributedStrategy()
        dist_strategy.enable_inplace = args.run_params['enable_inplace']
        dist_strategy.fuse_all_reduce_ops = args.run_params[
            'fuse_all_reduce_ops']
        dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
        dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
        dist_strategy.mode = args.run_params["mode"]
        dist_strategy.collective_mode = args.run_params["collective"]

        with fluid.program_guard(train_program, startup_program):
            with fluid.unique_name.guard():
                sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                    ModelHyperParams.src_vocab_size,
                    ModelHyperParams.trg_vocab_size,
                    ModelHyperParams.max_length + 1,
                    ModelHyperParams.n_layer,
                    ModelHyperParams.n_head,
                    ModelHyperParams.d_key,
                    ModelHyperParams.d_value,
                    ModelHyperParams.d_model,
                    ModelHyperParams.d_inner_hid,
                    ModelHyperParams.prepostprocess_dropout,
                    ModelHyperParams.attention_dropout,
                    ModelHyperParams.relu_dropout,
                    ModelHyperParams.preprocess_cmd,
                    ModelHyperParams.postprocess_cmd,
                    ModelHyperParams.weight_sharing,
                    TrainTaskConfig.label_smooth_eps,
                    ModelHyperParams.bos_idx,
                    use_py_reader=args.use_py_reader,
                    is_test=False)
                optimizer = fluid.optimizer.SGD(0.003)
                if args.run_params["fp16"]:
                    optimizer = decorate(optimizer, init_loss_scaling=64.0)
                optimizer = fleet.distributed_optimizer(optimizer,
                                                        strategy=dist_strategy)
                optimizer.minimize(avg_cost, startup_program)
        train_program = fleet.main_program
        exe.run(startup_program)
        train_data = prepare_data_generator(
            args,
            is_test=False,
            count=dev_count,
            pyreader=pyreader,
            py_reader_provider_wrapper=py_reader_provider_wrapper)

        loss_normalizer = -(
            (1. - TrainTaskConfig.label_smooth_eps) * np.log(
                (1. - TrainTaskConfig.label_smooth_eps)) +
            TrainTaskConfig.label_smooth_eps *
            np.log(TrainTaskConfig.label_smooth_eps /
                   (ModelHyperParams.trg_vocab_size - 1) + 1e-20))

        step_idx = 0
        init_flag = True
        result_loss = []
        result_ppl = []
        train_info = []
        for pass_id in six.moves.xrange(args.num_epochs):
            pass_start_time = time.time()
            if args.use_py_reader:
                pyreader.start()
                data_generator = None
            else:
                data_generator = train_data()
            batch_id = 0
            while True:
                try:
                    feed_dict_list = prepare_feed_dict_list(
                        data_generator, init_flag, dev_count)
                    t1 = time.time()
                    outs = exe.run(program=train_program,
                                   fetch_list=[sum_cost.name, token_num.name]
                                   if step_idx % args.fetch_steps == 0 else [],
                                   feed=feed_dict_list)

                    if step_idx % args.fetch_steps == 0:
                        sum_cost_val, token_num_val = np.array(
                            outs[0]), np.array(outs[1])
                        total_sum_cost = sum_cost_val.sum()
                        total_token_num = token_num_val.sum()
                        total_avg_cost = total_sum_cost / total_token_num
                        result_loss.append(total_avg_cost - loss_normalizer)
                        result_ppl.append(
                            np.exp([min(total_avg_cost, 100)]).item(0))
                        train_info.append(result_loss)
                    init_flag = False
                    batch_id += 1
                    step_idx += 1
                    if batch_id >= 5:
                        break
                except (StopIteration, fluid.core.EOFException):
                    if args.use_py_reader:
                        pyreader.reset()
                    break

            train_info = [round(i, 6) for i in train_info[0]]
            return train_info
Esempio n. 4
0
    def run_gpu_fleet_api_trainer(self, args):
        assert args.update_method == "nccl2"

        self.lr = args.lr

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1

        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.fuse_memory_size = 1  # MB
        dist_strategy.fuse_laryer_size = 1
        if args.use_local_sgd:
            dist_strategy.use_local_sgd = True
        if args.ut4grad_allreduce:
            dist_strategy._ut4grad_allreduce = True

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)
        print_to_err("gpu_fleet", "fleet.node_num:")
        # "fleet.node_id:", fleet.node_id(),
        # "fleet.trainer_num:", fleet.worker_num())

        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)

        trainer_prog = fleet._origin_program
        dist_prog = fleet.main_program

        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        place = fluid.CUDAPlace(device_id)

        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        eprint(type(self).__name__, "run worker startup program done.")

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = train_reader()

        def get_data():
            origin_batch = next(reader_generator)
            if args.update_method != "local" and args.use_reader_alloc:
                new_batch = []
                for offset, item in enumerate(origin_batch):
                    if offset % 2 == args.trainer_id:
                        new_batch.append(item)
                return new_batch
            else:
                return origin_batch

        print_to_err(type(self).__name__, "begin to train on trainer")
        out_losses = []
        for i in six.moves.xrange(RUN_STEP):
            loss, = exe.run(dist_prog,
                            fetch_list=[avg_cost.name],
                            feed=feeder.feed(get_data()))
            out_losses.append(loss[0])
            print_to_err(type(self).__name__, "run step %d finished" % i)
        print_to_err(type(self).__name__, "trainer run finished")

        if six.PY2:
            print(pickle.dumps(out_losses))
        else:
            sys.stdout.buffer.write(pickle.dumps(out_losses))
Esempio n. 5
0
    def net(self, args=None):
        """
        resnet struct.
        Args:
            fleet:
            args (ArgumentParser): run args to config dist fleet.
        Returns:
            tuple: the return value contains avg_cost, py_reader
        """
        from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
        from thirdparty.image_classfication.models.resnet import ResNet50
        from thirdparty.image_classfication.train import parser
        from thirdparty.image_classfication.train import optimizer_setting
        parser.add_argument('--update_method',
                            type=str,
                            required=True,
                            choices=['pserver', 'nccl'])
        parser.add_argument('--role',
                            type=str,
                            required=True,
                            choices=['pserver', 'trainer'])
        parser.add_argument('--endpoints',
                            type=str,
                            required=False,
                            default="")
        parser.add_argument('--current_id',
                            type=int,
                            required=False,
                            default=0)
        parser.add_argument('--trainers', type=int, required=False, default=1)
        # parser.add_argument('--sync_mode', action='store_true')
        parser.add_argument('--run_params',
                            type=str,
                            required=False,
                            default='{}')
        args = parser.parse_args()
        args.run_params = json.loads(args.run_params)
        image_shape = [3, 224, 224]
        scale_loss = 1.0
        self.py_reader = fluid.layers.py_reader(capacity=16,
                                                shapes=[[-1] + image_shape,
                                                        [-1, 1]],
                                                lod_levels=[0, 0],
                                                dtypes=["float32", "int64"],
                                                use_double_buffer=True)
        image, label = fluid.layers.read_file(self.py_reader)
        run_model = ResNet50()
        out = run_model.net(image, 4)
        softmax_out = fluid.layers.softmax(out, use_cudnn=False)
        cost, prob = fluid.layers.softmax_with_cross_entropy(
            out, label, return_softmax=True)
        self.avg_cost = fluid.layers.mean(cost)

        params = run_model.params
        params["total_images"] = args.total_images
        params["lr"] = 1e-5
        params["num_epochs"] = args.num_epochs
        params["learning_strategy"]["batch_size"] = args.batch_size
        params["learning_strategy"]["name"] = args.lr_strategy
        params["l2_decay"] = args.l2_decay
        params["momentum_rate"] = args.momentum_rate
        optimizer = optimizer_setting(params)
        global_lr = optimizer._global_learning_rate()
        global_lr.persistable = True

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1
        exec_strategy.num_iteration_per_drop_scope = 30
        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.enable_inplace = args.run_params['enable_inplace']
        dist_strategy.fuse_all_reduce_ops = args.run_params[
            'fuse_all_reduce_ops']
        dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
        dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
        dist_strategy.mode = args.run_params["mode"]
        dist_strategy.collective_mode = args.run_params["collective"]

        if args.run_params["fp16"]:
            optimizer = fluid.contrib.mixed_precision.decorate(
                optimizer,
                init_loss_scaling=128.0,
                use_dynamic_loss_scaling=True)

        if "use_dgc" in args.run_params and args.run_params["use_dgc"]:
            # use dgc must close fuse
            dist_strategy.fuse_all_reduce_ops = False
            optimizer = fluid.optimizer.DGCMomentumOptimizer(
                learning_rate=0.001, momentum=0.9, rampup_begin_step=0)

        dist_optimizer = fleet.distributed_optimizer(optimizer,
                                                     strategy=dist_strategy)
        _, param_grads = dist_optimizer.minimize(self.avg_cost)

        shuffle_seed = 1
        train_reader = reader.train(settings=args,
                                    data_dir=DATA_DIR,
                                    pass_id_as_seed=shuffle_seed)
        self.py_reader.decorate_paddle_reader(
            paddle.batch(train_reader, batch_size=self.batch_size))

        if scale_loss > 1:
            avg_cost = fluid.layers.mean(x=cost) * scale_loss
        return self.avg_cost, self.py_reader