def dist_transpile(trainer_id):
    if trainer_id < 0:
        return None, None

    # the port of all pservers, needed by both trainer and pserver
    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
    # comma separated ips of all pservers, needed by trainer and
    # pserver
    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)
    # total number of workers/trainers in the job, needed by
    # trainer and pserver
    trainers = int(os.getenv("PADDLE_TRAINERS"))
    # the IP of the local machine, needed by pserver only
    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
    # the role, should be either PSERVER or TRAINER
    training_role = os.getenv("PADDLE_TRAINING_ROLE")

    t = distribute_transpiler.DistributeTranspiler()
    t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
    if training_role == "PSERVER":
        pserver_program = t.get_pserver_program(current_endpoint)
        pserver_startup_program = t.get_startup_program(
            current_endpoint, pserver_program)
        return pserver_program, pserver_startup_program
    elif training_role == "TRAINER":
        train_program = t.get_trainer_program()
        return train_program, fluid.default_startup_program()
    else:
        raise ValueError(
            'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
        )
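
A minimal driver sketch for the helper above, mirroring the startup/serve pattern of Example #6 below; the imports and the trainer-side loop are assumptions, not part of the original snippet:

import os
import paddle.fluid as fluid

trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
main_prog, startup_prog = dist_transpile(trainer_id)
if main_prog is not None:
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup_prog)
    if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
        exe.run(main_prog)  # blocks here, serving parameters to the trainers
    # for TRAINER, feed data into main_prog and run the usual training loop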
Example #2
    def _transpile(self, startup_program, main_program):
        """
        Transpile the programs into distributed programs and add the variables.
        """
        worker_endpoints = fleet.worker_endpoints()
        trainer_id = fleet.worker_index()
        current_endpoint = fleet.worker_endpoints()[trainer_id]
        worker_endpoints_env = ','.join(worker_endpoints)
        trainers_num = fleet.worker_num()

        if self.print_config:
            print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
                  trainer_id:{}".format(worker_endpoints, trainers_num,
                                        current_endpoint, trainer_id))

        # call transpiler
        config = dist_transpiler.DistributeTranspilerConfig()
        config.mode = self._strategy.mode
        config.collective_mode = self._strategy.collective_mode

        config.nccl_comm_num = self._strategy.nccl_comm_num
        config.use_hierarchical_allreduce = self._strategy.use_hierarchical_allreduce
        config.hierarchical_allreduce_inter_nranks = self._strategy.hierarchical_allreduce_inter_nranks

        t = dist_transpiler.DistributeTranspiler(config=config)
        t.transpile(trainer_id=trainer_id,
                    trainers=worker_endpoints_env,
                    startup_program=startup_program,
                    program=main_program,
                    current_endpoint=current_endpoint)
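
The self._strategy object read above is not defined in this excerpt; a minimal stand-in carrying the same attribute names (the values are illustrative defaults, not taken from the original) could look like:

class DistributedStrategy(object):
    """Hypothetical holder for the fields _transpile() reads from self._strategy."""

    def __init__(self):
        self.mode = "collective"                 # transpiler mode
        self.collective_mode = "grad_allreduce"  # e.g. "grad_allreduce" or "local_sgd"
        self.nccl_comm_num = 1
        self.use_hierarchical_allreduce = False
        self.hierarchical_allreduce_inter_nranks = 0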
Example #3
def dist_transpile(trainer_id, args, train_prog, startup_prog):
    if trainer_id < 0:
        return None, None

    # the port of all pservers, needed by both trainer and pserver
    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
    # comma separated ips of all pservers, needed by trainer and
    # pserver
    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
    eplist = []
    for ip in pserver_ips.split(","):
        eplist.append(':'.join([ip, port]))
    pserver_endpoints = ",".join(eplist)
    # total number of workers/trainers in the job, needed by
    # trainer and pserver
    trainers = int(os.getenv("PADDLE_TRAINERS"))
    # the IP of the local machine, needed by pserver only
    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
    # the role, should be either PSERVER or TRAINER
    training_role = os.getenv("PADDLE_TRAINING_ROLE")

    config = fluid.DistributeTranspilerConfig()
    config.slice_var_up = not args.no_split_var
    config.min_block_size = 1048576
    t = distribute_transpiler.DistributeTranspiler(config=config)

    t.transpile(
        trainer_id,
        # NOTE: *MUST* use train_prog, because we use a with guard to
        # generate different programs for train and test.
        program=train_prog,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=not args.async_mode,
        startup_program=startup_prog)
    if training_role == "PSERVER":
        pserver_program = t.get_pserver_program(current_endpoint)
        pserver_startup_program = t.get_startup_program(
            current_endpoint, pserver_program, startup_program=startup_prog)
        return pserver_program, pserver_startup_program
    elif training_role == "TRAINER":
        train_program = t.get_trainer_program()
        return train_program, startup_prog
    else:
        raise ValueError(
            'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
        )
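
The args namespace above must provide at least no_split_var and async_mode; a hypothetical parser covering just those two flags (help strings are assumptions):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--no_split_var', action='store_true',
                    help='Do not slice parameter variables across pservers.')
parser.add_argument('--async_mode', action='store_true',
                    help='Use asynchronous (non-sync) distributed updates.')
args = parser.parse_args()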
Example #4
    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        """
        Minimize a program through the loss.
        Args:
            loss (Variable|Variable List): loss variable or loss variable list to run optimization.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            parameter_list (list): list of Variables to update.
            no_grad_set (set|None): set of Variables that should be ignored.
        Returns:
            tuple: (optimize_ops, params_grads), i.e. the list of operators appended
            and the list of (param, grad) Variable pairs used for optimization.
        Note that in parameter server mode a worker does not receive optimize_ops,
        because the optimizer algorithms run on the pserver side. We will make this
        usable in the pserver process, but currently the optimization part is written
        into Fleet(). A user does not need to care about how to start up a pserver node.
        """
        optimize_ops, param_grads = self._optimizer.minimize(
            loss, startup_program, parameter_list, no_grad_set)

        worker_endpoints = fleet.worker_endpoints()
        trainer_id = fleet.worker_index()
        current_endpoint = fleet.worker_endpoints()[trainer_id]

        startup_program = startup_program if startup_program else \
            fluid.framework.default_startup_program()

        # call transpiler
        config = dist_transpiler.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = dist_transpiler.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=','.join(worker_endpoints),
            startup_program=startup_program,
            current_endpoint=current_endpoint)

        return optimize_ops, param_grads
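
A hedged usage sketch for the method above, assuming the enclosing class is a fleet-style collective optimizer wrapper; the tiny network and the wrapper name CollectiveOptimizer are illustrative assumptions:

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=y_pred, label=y))

base_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
# hypothetical wrapper exposing the minimize() shown above; running it requires
# a fleet-style multi-trainer environment for the nccl2 transpile step
dist_optimizer = CollectiveOptimizer(base_optimizer)
optimize_ops, params_grads = dist_optimizer.minimize(loss)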
Example #5
    def test(self):
        self._check()

        trainer_id = self.trainer_id
        num_trainers = self.num_trainers

        # If the test program has not been built yet, this is the first time
        # the test method is called, so build the test program and add ops to
        # broadcast bn-related parameters from trainer 0 to the other
        # trainers for distributed tests.
        if not self.test_initialized:
            emb, loss, _, _, _ = self.build_program(False,
                                                    self.num_trainers > 1)
            emb_name = emb.name
            assert self._get_info('emb_name') is None
            self._set_info('emb_name', emb.name)

            if num_trainers > 1 and self.has_run_train:
                self._append_broadcast_ops(self.test_program)

            if num_trainers > 1 and not self.has_run_train:
                worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
                current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")

                config = dist_transpiler.DistributeTranspilerConfig()
                config.mode = "collective"
                config.collective_mode = "grad_allreduce"
                t = dist_transpiler.DistributeTranspiler(config=config)
                t.transpile(trainer_id=trainer_id,
                            trainers=worker_endpoints,
                            startup_program=self.startup_program,
                            program=self.test_program,
                            current_endpoint=current_endpoint)
        else:
            emb_name = self._get_info('emb_name')

        gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
        place = fluid.CUDAPlace(gpu_id)
        exe = fluid.Executor(place)
        if not self.has_run_train:
            exe.run(self.startup_program)

        if not self.test_reader:
            test_reader = reader.test
        else:
            test_reader = self.test_reader
        if not self.test_initialized:
            test_list, test_name_list = test_reader(self.dataset_dir,
                                                    self.val_targets)
            assert self._get_info('test_list') is None
            assert self._get_info('test_name_list') is None
            self._set_info('test_list', test_list)
            self._set_info('test_name_list', test_name_list)
        else:
            test_list = self._get_info('test_list')
            test_name_list = self._get_info('test_name_list')

        test_program = self.test_program

        if not self.has_run_train:
            assert self.checkpoint_dir, "No checkpoint found for test."
            self.load_checkpoint(executor=exe,
                                 main_program=test_program,
                                 load_for_train=False)

        feeder = fluid.DataFeeder(place=place,
                                  feed_list=['image', 'label'],
                                  program=test_program)
        fetch_list = [emb_name]

        self.test_initialized = True

        test_start = time.time()
        self._run_test(exe, test_list, test_name_list, feeder, fetch_list)
        test_end = time.time()
        logger.info("test time: {:.4f}".format(test_end - test_start))
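
The _get_info/_set_info helpers used above are not part of this excerpt; a minimal dict-backed stand-in that is consistent with the "is None" asserts might be:

    def _set_info(self, key, value):
        # minimal stand-in: keep shared state in a plain dict on the instance
        if not hasattr(self, '_info'):
            self._info = {}
        self._info[key] = value

    def _get_info(self, key):
        # return None for keys that were never set, as the asserts above expect
        return getattr(self, '_info', {}).get(key)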
Example #6
def train(args, data_reader=ctc_reader):
    """OCR CTC training"""
    num_classes = None
    train_images = None
    train_list = None
    test_images = None
    test_list = None
    num_classes = data_reader.num_classes() if num_classes is None else num_classes
    data_shape = data_reader.data_shape()
    # define network
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label',
                              shape=[1],
                              dtype='int32',
                              lod_level=1)
    sum_cost, error_evaluator, inference_program, model_average = ctc_train_net(
        images, label, args, num_classes)

    # data reader
    train_reader = data_reader.train(args.batch_size,
                                     train_images_dir=train_images,
                                     train_list_file=train_list)
    test_reader = data_reader.test(args.batch_size,
                                   test_images_dir=test_images,
                                   test_list_file=test_list)

    # prepare environment
    place = fluid.CPUPlace()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # load init model
    if args.init_model is not None:
        model_dir = args.init_model
        model_file_name = None
        if not os.path.isdir(args.init_model):
            model_dir = os.path.dirname(args.init_model)
            model_file_name = os.path.basename(args.init_model)
        fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
        print "Init model from: %s." % args.init_model

    fetch_vars = [sum_cost]
    fetch_vars.extend([e for e in error_evaluator])

    def test_parallel(exe, pass_id, batch_id):
        distance_evaluator = fluid.metrics.EditDistance(None)
        test_fetch = [v.name for v in error_evaluator]

        distance_evaluator.reset()
        for idx, data in enumerate(test_reader()):
            test_ret = exe.run(test_fetch, feed=get_feeder_data(data, place))
            distance_evaluator.update(distances=test_ret[0],
                                      seq_num=np.mean(test_ret[1]))
        return distance_evaluator.eval()

    def test(exe, pass_id):
        distance_evaluator = fluid.metrics.EditDistance(None)
        test_fetch = [v.name for v in error_evaluator]

        distance_evaluator.reset()
        for idx, data in enumerate(test_reader()):
            test_ret = exe.run(inference_program,
                               feed=get_feeder_data(data, place),
                               fetch_list=test_fetch)
            distance_evaluator.update(distances=test_ret[0],
                                      seq_num=np.mean(test_ret[1]))
        return distance_evaluator.eval()

    def train_parallel(train_exe):
        var_names = [var.name for var in fetch_vars]
        #test_exe = fluid.ParallelExecutor(
        #    use_cuda=True, main_program=inference_program, share_vars_from=train_exe)
        place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
        test_exe = fluid.Executor(place)

        for pass_id in range(args.pass_num):
            batch_id = 1
            total_loss = 0.0
            total_seq_error = 0.0
            # train a pass
            num_samples, start_time = 0, time.time()
            for idx, data in enumerate(train_reader()):
                batch_start_time = time.time()
                results = train_exe.run(var_names,
                                        feed=get_feeder_data(data, place))
                results = [np.array(result).sum() for result in results]
                total_loss += results[0]
                total_seq_error += results[1]
                # training log
                if batch_id % args.log_period == 0:
                    print(
                        "Pass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq err: %s; Speed: %.5f samples/sec"
                        % (pass_id, batch_id, total_loss /
                           (batch_id * args.batch_size), total_seq_error /
                           (batch_id * args.batch_size), len(data) /
                           (time.time() - batch_start_time)))
                batch_id += 1
                num_samples += len(data)

            print_train_time(start_time, time.time(), num_samples)
            # run test
            if model_average:
                with model_average.apply(test_exe):
                    #test_ret = test_parallel(test_exe, pass_id, batch_id)
                    test_ret = test(test_exe, pass_id)
            else:
                #test_ret = test_parallel(test_exe, pass_id, batch_id)
                test_ret = test(test_exe, pass_id)
            print("Pass[%d]; Test avg seq error: %s\n" %
                  (pass_id, test_ret[1]))

    if args.local:
        place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())
        exec_strategy = ExecutionStrategy()
        exec_strategy.use_cuda = args.use_gpu
        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_gpu,
            main_program=fluid.default_main_program(),
            loss_name=sum_cost.name,
            exec_strategy=exec_strategy)
        train_parallel(train_exe)
    else:
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")
        trainers = int(os.getenv("PADDLE_TRAINERS"))
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)
        # the IP of the local machine, needed by pserver only
        current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
        # the role, should be either PSERVER or TRAINER
        training_role = os.getenv("PADDLE_TRAINING_ROLE")
        t = distribute_transpiler.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
        if training_role == "PSERVER":
            pserver_program = t.get_pserver_program(current_endpoint)
            pserver_startup_program = t.get_startup_program(
                current_endpoint, pserver_program)
            exe = fluid.Executor(core.CPUPlace())
            exe.run(pserver_startup_program)
            exe.run(pserver_program)
        elif training_role == "TRAINER":
            exe.run(fluid.default_startup_program())
            trainer_program = t.get_trainer_program()
            exec_strategy = ExecutionStrategy()
            exec_strategy.use_cuda = args.use_gpu
            exec_strategy.num_threads = 1
            train_exe = fluid.ParallelExecutor(use_cuda=args.use_gpu,
                                               main_program=trainer_program,
                                               loss_name=sum_cost.name,
                                               exec_strategy=exec_strategy)
            train_parallel(train_exe)
        else:
            raise ValueError(
                "env PADDLE_TRAINING_ROLE should be in [PSERVER, TRIANER]")