def init_infer_program(self):
        # define inferer
        self.infer_program = fluid.Program()
        startup_prog = fluid.Program()

        # prepare the network
        with fluid.program_guard(self.infer_program, startup_prog):
            with fluid.unique_name.guard():
                self.infer_feeder, self.infer_log_probs, _ = self.create_network(is_infer=True)

        self.infer_program = self.infer_program.clone(for_test=True)
        self.infer_exe = fluid.Executor(self._place)
        self.infer_exe.run(startup_prog)

        # init param from pretrained_model
        if not self._init_from_pretrained_model:
            exit("The pretrained model file does not exist!")
        self.init_from_pretrained_model(self.infer_exe, self.infer_program)

        # support multi-GPU inference
        build_strategy = compiler.BuildStrategy()
        exec_strategy = fluid.ExecutionStrategy()
        self.infer_compiled_prog = compiler.CompiledProgram(self.infer_program).with_data_parallel(
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
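        # A minimal sketch of how this compiled program might be used for inference; the feed
        # construction below is an assumption (it depends on what create_network(is_infer=True)
        # actually returns) and is shown only as a comment:
        #   feed = self.infer_feeder.feed(one_batch)   # assuming infer_feeder behaves like a fluid.DataFeeder
        #   probs = self.infer_exe.run(self.infer_compiled_prog,
        #                              feed=feed,
        #                              fetch_list=[self.infer_log_probs],
        #                              return_numpy=False)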

    def train(self,
              train_batch_reader,
              dev_batch_reader,
              learning_rate,
              gradient_clipping,
              num_epoch,
              batch_size,
              num_samples,
              test_off=False):
        """Train the model.

        :param train_batch_reader: Train data reader.
        :type train_batch_reader: callable
        :param dev_batch_reader: Validation data reader.
        :type dev_batch_reader: callable
        :param learning_rate: Learning rate for the Adam optimizer.
        :type learning_rate: float
        :param gradient_clipping: Gradient clipping threshold.
        :type gradient_clipping: float
        :param num_epoch: Number of training epochs.
        :type num_epoch: int
        :param batch_size: Batch size, i.e. the number of samples per batch.
        :type batch_size: int
        :param num_samples: Number of training samples.
        :type num_samples: int
        :param test_off: Turn off testing.
        :type test_off: bool
        """
        # prepare model output directory
        if not os.path.exists(self._output_model_dir):
            mkpath(self._output_model_dir)

        if isinstance(self._place, fluid.CUDAPlace):
            dev_count = fluid.core.get_cuda_device_count()
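            # scale the base learning rate linearly with the number of GPUs (a common multi-GPU heuristic)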
            learning_rate = learning_rate * dev_count
        else:
            dev_count = int(os.environ.get('CPU_NUM', 1))

        # prepare the network
        train_program = fluid.Program()
        startup_prog = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_reader, _, ctc_loss = self.create_network()
                # learning rate schedule
                learning_rate = fluid.layers.exponential_decay(
                        learning_rate=learning_rate,
                        decay_steps=num_samples / batch_size / dev_count,
                        decay_rate=0.83,
                        staircase=True)
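                # with staircase=True the rate is multiplied by 0.83 roughly once per epoch,
                # i.e. every num_samples / batch_size / dev_count steps (e.g. 120000 samples,
                # batch_size 16 and 2 GPUs give a decay every 3750 steps; numbers are illustrative)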
                # prepare the optimizer
                optimizer = fluid.optimizer.AdamOptimizer(
                    learning_rate=learning_rate,
                    regularization=fluid.regularizer.L2Decay(0.0001),
                    grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=gradient_clipping))
                optimizer.minimize(loss=ctc_loss)

        exe = fluid.Executor(self._place)
        exe.run(startup_prog)

        # init from some pretrain models, to better solve the current task
        pre_epoch = 0
        if self._init_from_pretrained_model:
            pre_epoch = self.init_from_pretrained_model(exe, train_program)

        build_strategy = compiler.BuildStrategy()
        exec_strategy = fluid.ExecutionStrategy()

        # pass the build_strategy to with_data_parallel API
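        # with_data_parallel runs train_program on all visible devices in parallel;
        # loss_name identifies the loss variable used for gradient scaling and averaging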
        train_compiled_prog = compiler.CompiledProgram(train_program).with_data_parallel(loss_name=ctc_loss.name,
                                                                                         build_strategy=build_strategy,
                                                                                         exec_strategy=exec_strategy)

        train_reader.set_batch_generator(train_batch_reader)

        train_step = 0
        test_step = 0
        num_batch = -1
        # run train
        for epoch_id in range(num_epoch):
            train_reader.start()
            epoch_loss = []
            time_begin = time.time()
            batch_id = 0
            while True:
                try:
                    fetch_list = [ctc_loss.name, learning_rate.name]
                    if batch_id % 100 == 0:
                        fetch = exe.run(program=train_compiled_prog,
                                        fetch_list=fetch_list,
                                        return_numpy=False)
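                        # the fetched ctc_loss appears to be summed over the batch,
                        # hence the division by batch_size below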
                        each_loss = fetch[0]
                        each_learning_rate = np.array(fetch[1])[0]
                        epoch_loss.extend(np.array(each_loss[0]) / batch_size)

                        print("Train [%s] epoch: [%d/%d], batch: [%d/%d], learning rate: %f, train loss: %f\n" %
                              (datetime.now(), epoch_id, num_epoch, batch_id, num_batch, each_learning_rate,
                               np.mean(each_loss[0]) / batch_size))
                        # log the training loss
                        self.writer.add_scalar('Train loss', np.mean(each_loss[0]) / batch_size, train_step)
                        self.writer.add_scalar('Learning rate', each_learning_rate, train_step)
                        train_step += 1
                    else:
                        _ = exe.run(program=train_compiled_prog,
                                    fetch_list=[],
                                    return_numpy=False)
                    # save the model every 2000 batches
                    if batch_id % 2000 == 0 and batch_id != 0:
                        self.save_param(exe, train_program, "epoch_" + str(epoch_id + pre_epoch))
                    batch_id = batch_id + 1
                except fluid.core.EOFException:
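                    # the data loader raises EOFException once the epoch's data is exhausted;
                    # reset() must be called before the next start()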
                    train_reader.reset()
                    break
            num_batch = batch_id
            # save the model once per epoch
            self.save_param(exe, train_program, "epoch_" + str(epoch_id + pre_epoch))
            used_time = time.time() - time_begin
            if test_off:
                print('======================Last Train=====================')
                print("Train time: %f sec, epoch: %d, train loss: %f\n" %
                      (used_time, epoch_id, np.mean(np.array(epoch_loss))))
                print('======================Last Train=====================')
            else:
                print('\n======================Begin test=====================')
                # point the pretrained-model path at the checkpoint just saved
                self._init_from_pretrained_model = self.save_model_path
                # run evaluation on the dev set
                test_result = self.test(test_reader=dev_batch_reader)
                print("Train time: %f sec, epoch: %d, train loss: %f, test %s: %f"
                      % (used_time, epoch_id + pre_epoch, np.mean(np.array(epoch_loss)), self.error_rate_type, test_result))
                print('======================Stop Train=====================\n')
                # log the test result
                self.writer.add_scalar('Test %s' % self.error_rate_type, test_result, test_step)
                test_step += 1

        self.save_param(exe, train_program, "step_final")

        print("\n------------Training finished!!!-------------")
Example #3
    def train(self,
              train_batch_reader,
              dev_batch_reader,
              feeding_dict,
              learning_rate,
              gradient_clipping,
              num_epoch,
              batch_size,
              num_samples,
              save_epoch=100,
              num_iterations_print=100,
              test_off=False):
        """Train the model.

        :param train_batch_reader: Train data reader.
        :type train_batch_reader: callable
        :param dev_batch_reader: Validation data reader.
        :type dev_batch_reader: callable
        :param feeding_dict: Mapping from each input field's name to the tuple
                             index of that field in the data the reader returns.
        :type feeding_dict: dict|list
        :param learning_rate: Learning rate for the Adam optimizer.
        :type learning_rate: float
        :param gradient_clipping: Gradient clipping threshold.
        :type gradient_clipping: float
        :param num_epoch: Number of training epochs.
        :type num_epoch: int
        :param batch_size: Batch size, i.e. the number of samples per batch.
        :type batch_size: int
        :param num_samples: Number of training samples.
        :type num_samples: int
        :param save_epoch: Number of epochs between checkpoint saves.
        :type save_epoch: int
        :param num_iterations_print: Number of training iterations between
                                     printing the training loss.
        :type num_iterations_print: int
        :param test_off: Turn off testing.
        :type test_off: bool
        """
        # prepare model output directory
        if not os.path.exists(self._output_model_dir):
            mkpath(self._output_model_dir)

        # adapt the feeding dict according to the network
        adapted_feeding_dict = self._adapt_feeding_dict(feeding_dict)

        if isinstance(self._place, fluid.CUDAPlace):
            dev_count = fluid.core.get_cuda_device_count()
        else:
            dev_count = int(os.environ.get('CPU_NUM', 1))

        # prepare the network
        train_program = fluid.Program()
        startup_prog = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_reader, log_probs, ctc_loss = self.create_network()
                # prepare optimizer
                optimizer = fluid.optimizer.AdamOptimizer(
                    learning_rate=fluid.layers.exponential_decay(
                        learning_rate=learning_rate,
                        decay_steps=num_samples / batch_size / dev_count,
                        decay_rate=0.83,
                        staircase=True))
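                # register a global clip-by-global-norm rule; it is applied to the gradients
                # when minimize() builds the backward pass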
                fluid.clip.set_gradient_clip(
                    clip=fluid.clip.GradientClipByGlobalNorm(
                        clip_norm=gradient_clipping))
                optimizer.minimize(loss=ctc_loss)

        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_reader, _, ctc_loss = self.create_network()

        test_prog = test_prog.clone(for_test=True)
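        # clone(for_test=True) above prunes backward/optimizer ops and switches ops such as
        # dropout and batch_norm to inference behavior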

        exe = fluid.Executor(self._place)
        exe.run(startup_prog)

        # init from some pretrain models, to better solve the current task
        pre_epoch = 0
        if self._init_from_pretrained_model:
            pre_epoch = self.init_from_pretrained_model(exe, train_program)

        build_strategy = compiler.BuildStrategy()
        exec_strategy = fluid.ExecutionStrategy()

        # pass the build_strategy to with_data_parallel API
        compiled_prog = compiler.CompiledProgram(
            train_program).with_data_parallel(loss_name=ctc_loss.name,
                                              build_strategy=build_strategy,
                                              exec_strategy=exec_strategy)

        train_reader.set_batch_generator(train_batch_reader)
        test_reader.set_batch_generator(dev_batch_reader)

        # run train
        for epoch_id in range(num_epoch):
            train_reader.start()
            epoch_loss = []
            time_begin = time.time()
            batch_id = 0
            step = 0
            while True:
                try:
                    fetch_list = [ctc_loss.name]

                    if batch_id % num_iterations_print == 0:
                        fetch = exe.run(program=compiled_prog,
                                        fetch_list=fetch_list,
                                        return_numpy=False)
                        each_loss = fetch[0]
                        epoch_loss.extend(np.array(each_loss[0]) / batch_size)

                        print("epoch: %d, batch: %d, train loss: %f\n" %
                              (epoch_id, batch_id,
                               np.mean(each_loss[0]) / batch_size))

                    else:
                        _ = exe.run(program=compiled_prog,
                                    fetch_list=[],
                                    return_numpy=False)

                    batch_id = batch_id + 1
                except fluid.core.EOFException:
                    train_reader.reset()
                    break
            time_end = time.time()
            used_time = time_end - time_begin
            if test_off:
                print("\n--------Time: %f sec, epoch: %d, train loss: %f\n" %
                      (used_time, epoch_id, np.mean(np.array(epoch_loss))))
            else:
                print('\n----------Begin test...')
                test_loss = self.test(exe,
                                      dev_batch_reader=dev_batch_reader,
                                      test_program=test_prog,
                                      test_reader=test_reader,
                                      fetch_list=[ctc_loss])
                print(
                    "--------Time: %f sec, epoch: %d, train loss: %f, test loss: %f"
                    % (used_time, epoch_id + pre_epoch,
                       np.mean(np.array(epoch_loss)), test_loss / batch_size))
            if (epoch_id + 1) % save_epoch == 0:
                self.save_param(exe, train_program,
                                "epoch_" + str(epoch_id + pre_epoch))

        self.save_param(exe, train_program, "step_final")

        print("\n------------Training finished!!!-------------")