Example #1
    def test_norm(self):
        self.init_test_case()
        inputs = np.random.random((2, 3, 5, 5)).astype(np.float32)
        shape = inputs.shape
        n, c, h, w = shape[0], shape[1], shape[2], shape[3]
        scale_shape = [c]
        mean_shape = [n * c]
        scale = np.ones(scale_shape).astype(np.float32)
        bias = np.zeros(scale_shape).astype(np.float32)
        mean, variance = _cal_mean_variance(inputs, self.epsilon, mean_shape)
        out_np, _, _ = _reference_instance_norm_naive(inputs, scale, bias,
                                                      self.epsilon, mean,
                                                      variance)

        for place in self.places:
            with fluid.dygraph.guard(place):
                instance_norm = fluid.dygraph.InstanceNorm(3,
                                                           param_attr=True,
                                                           bias_attr=True)
                outputs = instance_norm(to_variable(inputs))
                self.assertTrue(np.allclose(outputs.numpy(), out_np,
                                            atol=1e-6))
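The helpers `_cal_mean_variance` and `_reference_instance_norm_naive` are assumed by this test but not shown; a minimal NumPy sketch of what such reference helpers could look like (signatures inferred from the call sites above, so this is an assumption rather than the actual test utilities) is:

import numpy as np

def _cal_mean_variance(x, epsilon, mean_shape):
    # Per-instance, per-channel statistics over the spatial dims (H, W).
    # (epsilon is unused here; kept only to match the call site.)
    n, c, h, w = x.shape
    x_flat = x.reshape(n * c, h * w)
    mean = x_flat.mean(axis=1).reshape(mean_shape)
    variance = x_flat.var(axis=1).reshape(mean_shape)
    return mean, variance

def _reference_instance_norm_naive(x, scale, bias, epsilon, mean, variance):
    # Normalize each (sample, channel) slice, then apply the per-channel affine transform.
    n, c, h, w = x.shape
    mean = mean.reshape(n, c, 1, 1)
    variance = variance.reshape(n, c, 1, 1)
    x_norm = (x - mean) / np.sqrt(variance + epsilon)
    out = scale.reshape(1, c, 1, 1) * x_norm + bias.reshape(1, c, 1, 1)
    return out, mean, variance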
Example #2
def validation():
    # enter the dygraph (imperative) working environment
    with dygraph.guard():
        # instantiate the linear regression model defined earlier
        model = Regressor("Regressor")
        # load the saved parameters; the argument is the path of the checkpoint file
        model_dict, _ = fluid.load_dygraph('LR_model')
        model.load_dict(model_dict)
        # switch the model to evaluation mode
        model.eval()

        # the argument is the path of the dataset file
        test_data, label = load_one_example('./work/housing.data')
        # convert the data to a dygraph variable
        test_data = dygraph.to_variable(test_data)
        results = model(test_data)

        # de-normalize the result back to the original scale
        results = results * (max_values[-1] - min_values[-1]) + avg_values[-1]
        print("Inference result is {}, the corresponding label is {}".format(
            results.numpy(), label))
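`load_one_example` and the `max_values`/`min_values`/`avg_values` globals are assumed to be defined by the data-loading part of the tutorial; one possible shape of such a helper, assuming `housing.data` stores 14 whitespace-separated columns per sample with the price in the last column, is sketched below (the real helper would reuse the training-set statistics instead of recomputing them):

import numpy as np

def load_one_example(data_file):
    # Read the raw dataset and normalize it the same way as during training (sketch).
    data = np.fromfile(data_file, sep=' ').reshape(-1, 14).astype('float32')
    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.mean(axis=0)
    data = (data - avgs) / (maximums - minimums)
    # Return the features of one sample (as a 1 x 13 batch) and its normalized label.
    features, label = data[-1:, :-1], data[-1, -1]
    return features, label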
Example #3
def get_part_mask(densepose_map):
    """
    Obtain masks for the different human body parts by looking at the
    body-part map from DensePose.

    Args:
        densepose_map (NxCxHxW tensor): DensePose map.
    Returns:
        mask (NxKxHxW tensor): Body part mask, where K is the number of parts.
    """
    # Group of body parts. Each group contains IDs of body labels in DensePose.
    # The 9 groups here are: background, torso, hands, feet, upper legs, lower legs,
    # upper arms, lower arms, head.
    part_groups = [[0], [1, 2], [3, 4], [5, 6], [7, 9, 8, 10],
                   [11, 13, 12, 14], [15, 17, 16, 18], [19, 21, 20, 22],
                   [23, 24]]
    n_parts = len(part_groups)

    densepose_map = densepose_map.numpy()
    need_reshape = len(densepose_map.shape) == 4
    if need_reshape:
        bo, t, h, w = densepose_map.shape
        densepose_map = np.reshape(densepose_map, (-1, h, w))
    b, h, w = densepose_map.shape
    part_map = (densepose_map / 2 + 0.5) * 24
    assert np.all(part_map >= 0) and np.all(part_map < 25)

    mask = np.zeros((b, n_parts, h, w)).astype("bool")
    for i in range(n_parts):
        for j in part_groups[i]:
            # Account for numerical errors.
            mask[:, i] = np.logical_or(
                mask[:, i],
                np.logical_and((part_map > j - 0.1), (part_map < j + 0.1)))
    if need_reshape:
        mask = np.reshape(mask, (bo, t, -1, h, w))
    mask = dg.to_variable(mask.astype("float32"))
    return mask
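A quick way to exercise this function, assuming the DensePose map has already been squeezed to shape (N, H, W) and scaled to [-1, 1] so that the 4-D reshape branch is skipped, might be:

import numpy as np
import paddle.fluid.dygraph as dg

with dg.guard():
    # Fake DensePose map with values in [-1, 1], shape (N, H, W).
    fake_map = np.random.uniform(-1, 1, (2, 64, 64)).astype("float32")
    mask = get_part_mask(dg.to_variable(fake_map))
    # One channel per body-part group: background, torso, hands, feet, ...
    print(mask.shape)  # [2, 9, 64, 64]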
Example #4
    def func_test_buffer_not_persistable_assign(self):
        with fluid.dygraph.guard():
            net = fluid.Layer()
            var1 = to_variable(np.zeros([1]))
            net.register_buffer("buffer_name", var1, persistable=False)

            # Assigning None removes the buffer, but the attribute can later be
            # re-assigned to mark it as a buffer again.
            net.buffer_name = None
            self.assertEqual(len(net.buffers()), 0)
            self.assertEqual(len(net.state_dict()), 0)

            net.buffer_name = var1
            self.assertEqual(len(net.buffers()), 1)
            self.assertEqual(len(net.state_dict()), 0)

            # Re-assigning a ParamBase removes the buffer.
            if in_dygraph_mode():
                net.buffer_name = EagerParamBase([2, 2], 'float32')
            else:
                net.buffer_name = ParamBase([2, 2], 'float32')
            self.assertEqual(len(net.buffers()), 0)
            self.assertEqual(len(net.state_dict()), 1)
Example #5
def infer(model, infer_data, max_seq_len=300, is_tensor=True, logits_softmax=True):
    """ 用dygraph模型预测
    [IN]  model: dygraph模型结构
          infer_data: list[(input1[, input2, ...])], 待预测数据
          max_seq_len: int, 最大长度
          is_tensor: boolean, true则infer_data已经是paddle可处理的tensor
          logits_softmax: boolean, true则预测结果为softmax后的logits
    [OUT] pred: list[float], 预测结果
    """
    # 在这个with域内ernie不会进行梯度计算;
    with D.base._switch_tracer_mode_guard_(is_train=False):
        # 控制模型进入eval模式,这将会关闭所有的dropout;
        model.eval()
        # 如果infer_data没有转tensor 则转为paddle接收的tensor
        if not is_tensor:
            infer_data = D.to_variable(np.array(infer_data))

        logits = model(infer_data, logits_softmax=logits_softmax)
        # TODO: 返回rate值
        pred = L.argmax(logits, -1).numpy()
        # 进入train模式
        model.train()
    return pred
Example #6
    def paddle_nn_layer(self):
        x_var = dg.to_variable(self.input)

        if self.output_padding != 0:
            output_size = None
        else:
            output_size = self.output_size

        conv = nn.Conv2DTranspose(self.num_channels,
                                  self.num_filters,
                                  self.filter_size,
                                  padding=self.padding,
                                  output_padding=self.output_padding,
                                  stride=self.stride,
                                  dilation=self.dilation,
                                  groups=self.groups,
                                  data_format=self.data_format)
        conv.weight.set_value(self.weight)
        if not self.no_bias:
            conv.bias.set_value(self.bias)
        y_var = conv(x_var, output_size)
        y_np = y_var.numpy()
        return y_np
Example #7
    def func_test_register_buffer_with_error(self):
        with fluid.dygraph.guard():
            net = fluid.Layer()
            var = to_variable(np.zeros([1]))

            with self.assertRaisesRegexp(TypeError,
                                         "name of buffer should be a string"):
                net.register_buffer(12, var)

            with self.assertRaisesRegexp(TypeError,
                                         "buffer should be a Paddle.Tensor"):
                if in_dygraph_mode():
                    net.register_buffer("buffer_name",
                                        EagerParamBase([2, 2], 'float32'))
                else:
                    net.register_buffer("buffer_name",
                                        ParamBase([2, 2], 'float32'))

            with self.assertRaisesRegexp(KeyError,
                                         "name of buffer can not contain"):
                net.register_buffer("buffer.name", var)

            with self.assertRaisesRegexp(KeyError,
                                         "name of buffer can not be empty"):
                net.register_buffer("", var)

            net.attr_name = 10
            with self.assertRaisesRegexp(KeyError, "already exists"):
                net.register_buffer("attr_name", var)

            del net.attr_name
            if in_dygraph_mode():
                net.attr_name = EagerParamBase([2, 2], 'float32')
            else:
                net.attr_name = ParamBase([2, 2], 'float32')
            with self.assertRaisesRegexp(KeyError, "already exists"):
                net.register_buffer("attr_name", var)
Example #8
    def forward(self, inputs):
        # Use `to_variable` to create a copy of the global h_0, which is created
        # outside `DynamicGRU`, to avoid modifying it, since `h_0` is also used by
        # other `DynamicGRU` instances.
        hidden = to_variable(self.h_0)
        hidden.stop_gradient = True

        res = []
        for i in range(inputs.shape[1]):
            if self.is_reverse:
                j = fluid.layers.shape(inputs)[1] - 1 - i
            else:
                # TODO(Aurelius84): In while block, if the var created in parent block
                # participates in the calculation of gradient, the result of gradient
                # is incorrect because each step scope always returns the same value
                # generated by last step. Here we add 0 to create `j` in while block to
                # avoid this bug, and working on fixing it in next PR.
                j = i + 0
            # FIXME(Aurelius84): see above explanation.
            hidden = fluid.layers.scale(hidden, 1)

            # See above explanation.
            # input_ = inputs[:, i:i+1, :]  # original code
            input_ = fluid.layers.slice(inputs,
                                        axes=[1],
                                        starts=[j],
                                        ends=[j + 1])
            input_ = fluid.layers.reshape(input_, [-1, input_.shape[2]],
                                          inplace=False)
            hidden, reset, gate = self.gru_unit(input_, hidden)
            hidden_ = fluid.layers.reshape(hidden, [-1, 1, hidden.shape[1]],
                                           inplace=False)
            res.append(hidden_)

        if self.is_reverse:
            res = res[::-1]
        res = fluid.layers.concat(res, axis=1)
        return res
Example #9
    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, 
                pos=None, query_pos=None):
        output = tgt

        intermediate = []

        assert tgt_mask is None, "Computing an attn_mask from tgt_mask is not implemented."

        if memory_mask is not None:
            bs, tgt_length = tgt.shape[:2]
            memory_length = memory.shape[1]
            attn_mask = L.zeros([bs, tgt_length, memory_length], dtype="float32")
            memory_mask = L.expand(L.unsqueeze(memory_mask, [1]), (1, tgt_length, 1)) # [bs, tgt_length, memory_length]
            attn_mask = attn_mask.numpy()
            memory_mask = memory_mask.numpy()
            attn_mask[memory_mask] = -1e8
            attn_mask = dg.to_variable(attn_mask)
            attn_mask = L.expand(L.unsqueeze(attn_mask, [1]), (1, self.nhead, 1, 1)) # [bs, nhead, tgt_length, memory_length]
            memory_mask = attn_mask

        for layer in self.layers:
            output = layer(output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
                           pos=pos, query_pos=query_pos)
            
            if self.return_intermediate:
                intermediate.append(self.norm(output))
        
        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                intermediate.pop()
                intermediate.append(output)
        
        if self.return_intermediate:
            return L.stack(intermediate)
        
        return L.unsqueeze(output, [0])
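The mask handling above converts a boolean padding mask over the memory into an additive attention bias broadcast over the heads; the same idea in plain NumPy, with small hypothetical shapes, looks like this:

import numpy as np

bs, tgt_len, mem_len, nhead = 2, 3, 4, 8
# True marks padded memory positions that must not be attended to.
memory_mask = np.zeros((bs, mem_len), dtype=bool)
memory_mask[:, -1] = True

attn_bias = np.zeros((bs, tgt_len, mem_len), dtype="float32")
attn_bias[np.broadcast_to(memory_mask[:, None, :], attn_bias.shape)] = -1e8
# Broadcast the bias over the attention heads: (bs, nhead, tgt_len, mem_len).
attn_bias = np.tile(attn_bias[:, None], (1, nhead, 1, 1))
print(attn_bias.shape)  # (2, 8, 3, 4)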
Example #10
def evaluate(env):
    """Evaluate"""
    args = env.args
    puncts = dygraph.to_variable(env.puncts, zero_copy=False)

    logging.info("Load the dataset")
    evaluates = Corpus.load(args.test_data_path, env.fields)
    dataset = TextDataset(evaluates, env.fields, args.buckets)
    # set the data loader
    dataset.loader = batchify(dataset, args.batch_size)

    logging.info(f"{len(dataset)} sentences, "
                 f"{len(dataset.loader)} batches, "
                 f"{len(dataset.buckets)} buckets")
    logging.info("Load the model")
    model = load(args.model_path)

    logging.info("Evaluate the dataset")
    start = datetime.datetime.now()
    loss, metric = epoch_evaluate(args, model, dataset.loader, puncts)
    total_time = datetime.datetime.now() - start
    logging.info(f"Loss: {loss:.4f} {metric}")
    logging.info(f"{total_time}s elapsed, "
                 f"{len(dataset) / total_time.total_seconds():.2f} Sents/s")
Example #11
 def __init__(self, dict_dim, batch_size, seq_len):
     super(GRU, self).__init__()
     self.dict_dim = dict_dim
     self.emb_dim = 128
     self.hid_dim = 128
     self.fc_hid_dim = 96
     self.class_dim = 2
     self.batch_size = batch_size
     self.seq_len = seq_len
     self.embedding = Embedding(
         size=[self.dict_dim + 1, self.emb_dim],
         dtype='float32',
         param_attr=fluid.ParamAttr(learning_rate=30),
         is_sparse=False)
     h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32")
     h_0 = to_variable(h_0)
     self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3)
     self._fc2 = Linear(input_dim=self.hid_dim,
                        output_dim=self.fc_hid_dim,
                        act="tanh")
     self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
                                  output_dim=self.class_dim,
                                  act="softmax")
     self._gru = DynamicGRU(size=self.hid_dim, h_0=h_0)
Example #12
def compute_position_embedding(radians, speaker_position_rate):
    """Compute the sin/cos interleaved matrix from the radians.

    Args:
        radians (Variable): shape(n_vocab, embed_dim), dtype float32, the radians matrix.
        speaker_position_rate (Variable): shape(B, ), speaker positioning rate.
    
    Returns:
        Variable: shape(B, n_vocab, embed_dim), the sin, cos interleaved matrix.
    """
    _, embed_dim = radians.shape
    batch_size = speaker_position_rate.shape[0]
    speaker_position_rate = F.unsqueeze(speaker_position_rate, [1, 2])
    scaled_radians = speaker_position_rate * radians

    odd_mask = (np.arange(embed_dim) % 2).astype(np.float32)
    odd_mask = dg.to_variable(odd_mask)

    out = odd_mask * F.cos(scaled_radians) \
        + (1 - odd_mask) * F.sin(scaled_radians)
    out = F.concat(
        [F.zeros((batch_size, 1, embed_dim), radians.dtype), out[:, 1:, :]],
        axis=1)
    return out
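As a sanity check of the interleaving, the core of the computation can be reproduced in NumPy; note that the `radians` construction below is just the usual sinusoidal-embedding form and is an assumption, not taken from this codebase:

import numpy as np

n_vocab, embed_dim = 6, 4
positions = np.arange(n_vocab, dtype=np.float32)[:, None]
inv_freq = 1.0 / (10000.0 ** (np.arange(0, embed_dim, 2) / embed_dim))
radians = positions * np.repeat(inv_freq, 2)[None, :]      # (n_vocab, embed_dim)

odd_mask = (np.arange(embed_dim) % 2).astype(np.float32)
out = odd_mask * np.cos(radians) + (1 - odd_mask) * np.sin(radians)
out[0] = 0.0   # position 0 is zeroed out, like the F.concat with zeros above
print(out.shape)  # (6, 4): sin in the even columns, cos in the odd columns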
Example #13
    def test_with_different_input(self):
        with fluid.dygraph.guard(fluid.CPUPlace()):
            x_data = np.ones([16, 10]).astype('float32')
            y_data = np.ones([10]).astype('float32') * 2
            z_data = np.ones([10]).astype('float32') * 2.2

            foo = declarative(foo_func)

            # [16, 10] + [10] (varbase)
            out_1 = foo(to_variable(x_data), to_variable(y_data))
            self.assertTrue(np.allclose(x_data + y_data, out_1.numpy()))
            self.assertTrue(len(foo.program_cache) == 1)
            self.assertTrue(len(foo.program_cache.concrete_programs()) == 1)
            first_program = foo.program_cache.last()

            # [16, 10] + [10] (numpy)
            out_2 = foo(to_variable(x_data), y_data)
            self.assertTrue(np.allclose(x_data + y_data, out_2.numpy()))
            self.assertTrue(len(foo.program_cache) == 1)

            # [16, 10] + [10] (numpy)
            out_3 = foo(to_variable(x_data), z_data)
            self.assertTrue(np.allclose(x_data + z_data, out_3.numpy()))
            # hit cache program
            self.assertTrue(len(foo.program_cache) == 1)

            # [16, 10] + [10] (numpy) with other different arguments (c=3)
            out_4 = foo(to_variable(x_data), z_data, 3)
            self.assertTrue(np.allclose(x_data + z_data, out_4.numpy()))
            # create a new program
            self.assertTrue(len(foo.program_cache) == 2)

            # test for recent program
            foo(to_variable(x_data), y_data)
            recent_program = foo.program_cache.last()
            self.assertTrue(first_program == recent_program)
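The decorated function `foo_func` is not included in this snippet; judging from the assertions (an element-wise add of the two inputs, plus an extra non-tensor argument `c` whose value changes the cached program), it is presumably something along the lines of:

def foo_func(a, b, c=1):
    # `a` and `b` may be varbases or numpy arrays; `c` is a plain Python value,
    # so calling with a different `c` creates a new traced program in the cache.
    z = a + b
    return z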
Example #14
 def run_main(self, np_arr, place):
     with guard(place):
         var = to_variable(np_arr)
         self.assertTrue(np.array_equal(np_arr, var.numpy()))
Example #15
def train(to_static):
    program_translator = ProgramTranslator()
    program_translator.enable(to_static)

    random.seed(0)
    np.random.seed(0)

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        fluid.default_startup_program().random_seed = 1000
        fluid.default_main_program().random_seed = 1000
        model = YOLOv3(3, is_train=True)

        boundaries = cfg.lr_steps
        gamma = cfg.lr_gamma
        step_num = len(cfg.lr_steps)
        learning_rate = cfg.learning_rate
        values = [learning_rate * (gamma**i) for i in range(step_num + 1)]

        lr = fluid.dygraph.PiecewiseDecay(boundaries=boundaries,
                                          values=values,
                                          begin=0)

        lr = fluid.layers.linear_lr_warmup(
            learning_rate=lr,
            warmup_steps=cfg.warm_up_iter,
            start_lr=0.0,
            end_lr=cfg.learning_rate,
        )

        optimizer = fluid.optimizer.Momentum(
            learning_rate=lr,
            regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
            momentum=cfg.momentum,
            parameter_list=model.parameters())

        start_time = time.time()
        snapshot_loss = 0
        snapshot_time = 0
        total_sample = 0

        input_size = cfg.input_size
        shuffle = True
        shuffle_seed = None
        total_iter = cfg.max_iter
        mixup_iter = total_iter - cfg.no_mixup_iter

        train_reader = FakeDataReader().reader()

        smoothed_loss = SmoothedValue()
        ret = []
        for iter_id, data in enumerate(train_reader()):
            prev_start_time = start_time
            start_time = time.time()
            img = np.array([x[0] for x in data]).astype('float32')
            img = to_variable(img)

            gt_box = np.array([x[1] for x in data]).astype('float32')
            gt_box = to_variable(gt_box)

            gt_label = np.array([x[2] for x in data]).astype('int32')
            gt_label = to_variable(gt_label)

            gt_score = np.array([x[3] for x in data]).astype('float32')
            gt_score = to_variable(gt_score)

            loss = model(img, gt_box, gt_label, gt_score, None, None)
            smoothed_loss.add_value(np.mean(loss.numpy()))
            snapshot_loss += loss.numpy()
            snapshot_time += start_time - prev_start_time
            total_sample += 1

            print("Iter {:d}, loss {:.6f}, time {:.5f}".format(
                iter_id, smoothed_loss.get_mean_value(),
                start_time - prev_start_time))
            ret.append(smoothed_loss.get_mean_value())

            loss.backward()

            optimizer.minimize(loss)
            model.clear_gradients()

        return np.array(ret)
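The optimizer above uses piecewise decay wrapped in linear warmup; as a quick numeric illustration of how the `values` list is built (with hypothetical config values, not the ones in `cfg`):

learning_rate, gamma = 0.001, 0.1
boundaries = [400000, 450000]          # hypothetical cfg.lr_steps
values = [learning_rate * (gamma ** i) for i in range(len(boundaries) + 1)]
print(values)  # roughly [0.001, 0.0001, 1e-05]: the lr used before, between, and after the boundaries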
Example #16
def train(args, fake_data_reader, to_static):
    program_translator = ProgramTranslator()
    program_translator.enable(to_static)

    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    valid_config = merge_configs(config, 'valid', vars(args))
    print_configs(train_config, 'Train')

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()

    random.seed(0)
    np.random.seed(0)
    with fluid.dygraph.guard(place):
        paddle.seed(1000)
        paddle.framework.random._manual_program_seed(1000)

        video_model = TSM_ResNet("TSM", train_config, 'Train')

        optimizer = create_optimizer(train_config.TRAIN,
                                     video_model.parameters())

        train_reader = fake_data_reader.create_reader()

        ret = []
        for epoch in range(train_config.TRAIN.epoch):
            video_model.train()
            total_loss = 0.0
            total_acc1 = 0.0
            total_acc5 = 0.0
            total_sample = 0
            for batch_id, data in enumerate(train_reader()):
                x_data = np.array([item[0] for item in data])
                y_data = np.array([item[1] for item in data]).reshape([-1, 1])

                imgs = to_variable(x_data)
                labels = to_variable(y_data)
                labels.stop_gradient = True
                outputs = video_model(imgs)
                loss = fluid.layers.cross_entropy(input=outputs,
                                                  label=labels,
                                                  ignore_index=-1)
                avg_loss = fluid.layers.mean(loss)
                acc_top1 = fluid.layers.accuracy(input=outputs,
                                                 label=labels,
                                                 k=1)
                acc_top5 = fluid.layers.accuracy(input=outputs,
                                                 label=labels,
                                                 k=5)

                avg_loss.backward()
                optimizer.minimize(avg_loss)
                video_model.clear_gradients()

                total_loss += avg_loss.numpy()[0]
                total_acc1 += acc_top1.numpy()[0]
                total_acc5 += acc_top5.numpy()[0]
                total_sample += 1

                print('TRAIN Epoch {}, iter {}, loss = {}, acc1 {}, acc5 {}'.
                      format(epoch, batch_id,
                             avg_loss.numpy()[0],
                             acc_top1.numpy()[0],
                             acc_top5.numpy()[0]))
                ret.extend([
                    avg_loss.numpy()[0],
                    acc_top1.numpy()[0],
                    acc_top5.numpy()[0]
                ])

            print(
                'TRAIN End, Epoch {}, avg_loss= {}, avg_acc1= {}, avg_acc5= {}'
                .format(epoch, total_loss / total_sample,
                        total_acc1 / total_sample, total_acc5 / total_sample))
        return ret
Example #17
    # define the outer loop
    for epoch_id in range(EPOCH_NUM):
        # shuffle the order of the training data before each epoch
        np.random.shuffle(training_data)
        # split the training data into mini-batches of BATCH_SIZE samples each
        mini_batches = [
            training_data[k:k + BATCH_SIZE]
            for k in range(0, len(training_data), BATCH_SIZE)
        ]
        # define the inner loop
        for iter_id, mini_batch in enumerate(mini_batches):
            x = np.array(mini_batch[:, :-1]).astype('float32')  # features of the current batch
            y = np.array(mini_batch[:, -1:]).astype('float32')  # labels (true house prices) of the current batch
            # convert the numpy data into dygraph variables
            house_features = dygraph.to_variable(x)
            prices = dygraph.to_variable(y)

            # forward pass
            predicts = model(house_features)

            # compute the loss
            loss = fluid.layers.square_error_cost(predicts, label=prices)
            avg_loss = fluid.layers.mean(loss)
            if iter_id % 20 == 0:
                print("epoch: {}, iter: {}, loss is: {}".format(
                    epoch_id, iter_id, avg_loss.numpy()))

            # backward pass
            avg_loss.backward()
            # minimize the loss and update the parameters
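The snippet is cut off at this point; a plausible continuation, assuming an optimizer named `opt` (e.g. SGD) was created earlier in the tutorial, would be:

            # (continuation sketch, not part of the original snippet)
            opt.minimize(avg_loss)
            model.clear_gradients()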
Example #18
def alignments(args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    with dg.guard(place):
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

        # get text data
        root = Path(args.data)
        csv_path = root.joinpath("metadata.csv")
        table = pd.read_csv(csv_path,
                            sep="|",
                            header=None,
                            quoting=csv.QUOTE_NONE,
                            names=["fname", "raw_text", "normalized_text"])
        ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

        pbar = tqdm(range(len(table)))
        alignments = OrderedDict()
        for i in pbar:
            fname, raw_text, normalized_text = table.iloc[i]
            # init input
            text = np.asarray(text_to_sequence(normalized_text))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
            wav = ljspeech_processor.load_wav(
                os.path.join(args.data, 'wavs', fname + ".wav"))
            mel_input = ljspeech_processor.melspectrogram(wav).astype(
                np.float32)
            mel_input = np.transpose(mel_input, axes=(1, 0))
            mel_input = fluid.layers.unsqueeze(dg.to_variable(mel_input), [0])
            mel_lens = mel_input.shape[1]

            dec_slf_mask = get_triu_tensor(mel_input,
                                           mel_input).astype(np.float32)
            dec_slf_mask = np.expand_dims(dec_slf_mask, axis=0)
            dec_slf_mask = fluid.layers.cast(dg.to_variable(dec_slf_mask != 0),
                                             np.float32) * (-2**32 + 1)
            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel, dec_slf_mask)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

            alignment, _ = get_alignment(attn_probs, mel_lens,
                                         network_cfg['decoder_num_head'])
            alignments[fname] = alignment
        with open(args.output + '.txt', "wb") as f:
            pickle.dump(alignments, f)
Example #19
    def generate(self, texts, use_gpu=False, beam_width=5):
        """
        Get the continuation of the input poetry.

        Args:
             texts(list): the front part of a poetry.
             use_gpu(bool): whether use gpu to predict or not
             beam_width(int): the beam search width.

        Returns:
             results(list): the poetry continuations.
        """
        if texts and isinstance(texts, list) and all(texts) and all(
            [isinstance(text, str) for text in texts]):
            predicted_data = texts
        else:
            raise ValueError(
                "The input texts should be a list with nonempty string elements."
            )
        for i, text in enumerate(texts):
            if len(text) > self.line:
                logger.warning(
                    'The input text: %s, contains more than %i characters, which will be cut off'
                    % (text, self.line))
                texts[i] = text[:self.line]

            for char in text:
                if not '\u4e00' <= char <= '\u9fff':
                    logger.warning(
                        'The input text: %s, contains non-Chinese characters, which may result in magic output'
                        % text)
                    break

        if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
            use_gpu = False
            logger.warning(
                "use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
            )
        if use_gpu:
            place = fluid.CUDAPlace(0)
        else:
            place = fluid.CPUPlace()

        with fluid.dygraph.guard(place):
            self.model.eval()
            results = []
            for text in predicted_data:
                sample_results = []
                ids, sids = self.tokenizer.encode(text)
                src_ids = D.to_variable(np.expand_dims(ids, 0))
                src_sids = D.to_variable(np.expand_dims(sids, 0))
                output_ids = beam_search_infilling(
                    self.model,
                    src_ids,
                    src_sids,
                    eos_id=self.tokenizer.sep_id,
                    sos_id=self.tokenizer.cls_id,
                    attn_id=self.tokenizer.vocab['[MASK]'],
                    max_decode_len=80,
                    max_encode_len=20,
                    beam_width=beam_width,
                    tgt_type_id=1)
                output_str = self.rev_lookup(output_ids[0].numpy())

                for ostr in output_str.tolist():
                    if '[SEP]' in ostr:
                        ostr = ostr[:ostr.index('[SEP]')]
                    sample_results.append("".join(ostr))
                results.append(sample_results)
        return results
Example #20
 random.shuffle(train_features)
 train_batch_data = batchify(train_features, args.bsz,
                             args.max_seqlen)
 if args.debug:
     print(len(train_batch_data))
     print(train_batch_data[0])
     token_ids, seg_ids, labels = train_batch_data[0]
     for r1, r2, r3 in zip(token_ids, seg_ids, labels):
         print(r1)
         print(r2)
         print(r3)
         print(convert_ids_to_tokens(tokenizer.vocab, r1))
 for step, d in enumerate(tqdm(train_batch_data, desc='training')):
     ids, sids, labels = d
     # print(ids.shape, sids.shape, labels.shape)
     ids, sids, labels = FD.to_variable(ids), FD.to_variable(
         sids), FD.to_variable(labels)
     loss, logits = model(ids, sids, labels=labels)
     if args.ohem_ratio > 0:
         labels = L.reshape(labels, [-1, 1])
         loss = L.softmax_with_cross_entropy(logits, labels)
         N = int(args.bsz * args.ohem_ratio)
         top_loss = L.argsort(loss, axis=0)[0][-N:]
         if args.debug:
             print(loss)
             print(top_loss)
             print(N)
         loss = L.reduce_sum(top_loss) / N
     loss.backward()
     global_step += 1
     if step % 1000 == 0 and step > 0:
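The `ohem_ratio` branch above implements online hard example mining: only the N largest per-sample losses in the batch contribute to the update. The same selection in plain NumPy, for illustration:

import numpy as np

per_sample_loss = np.array([0.10, 2.30, 0.40, 1.70, 0.05], dtype="float32")
ohem_ratio = 0.4
n_keep = int(len(per_sample_loss) * ohem_ratio)   # keep the 2 hardest samples
hardest = np.sort(per_sample_loss)[-n_keep:]       # the largest losses
ohem_loss = hardest.sum() / n_keep
print(ohem_loss)  # 2.0: the mean loss over the hardest samples only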
Example #21
    def scale_loss(self, loss):
        """
        Scale the loss. In data parallel mode, the loss should be scaled by
        the number of trainers. If not in data parallel mode, return the loss
        directly.

        Args:
            loss(Variable): The loss of the current Model.

        Returns:
            Variable: the scaled loss.

        Examples:
            .. code-block:: python

                import paddle
                import paddle.nn as nn
                import paddle.optimizer as opt
                import paddle.distributed as dist

                class LinearNet(nn.Layer):
                    def __init__(self):
                        super(LinearNet, self).__init__()
                        self._linear1 = nn.Linear(10, 10)
                        self._linear2 = nn.Linear(10, 1)
                        
                    def forward(self, x):
                        return self._linear2(self._linear1(x))

                def train():
                    # 1. enable dynamic mode
                    paddle.disable_static()
                    
                    # 2. initialize parallel environment
                    dist.init_parallel_env()

                    # 3. create data parallel layer & optimizer
                    layer = LinearNet()
                    dp_layer = paddle.DataParallel(layer)

                    loss_fn = nn.MSELoss()
                    adam = opt.Adam(
                        learning_rate=0.001, parameters=dp_layer.parameters())

                    # 4. run layer
                    inputs = paddle.randn([10, 10], 'float32')
                    outputs = dp_layer(inputs)
                    labels = paddle.randn([10, 1], 'float32')
                    loss = loss_fn(outputs, labels)
                    
                    loss = dp_layer.scale_loss(loss)
                    loss.backward()
                    dp_layer.apply_collective_grads()

                    adam.step()
                    adam.clear_grad()

                if __name__ == '__main__':
                    # 1. start by ``paddle.distributed.spawn`` (default)
                    dist.spawn(train, nprocs=2)
                    # 2. start by ``paddle.distributed.launch``
                    # train()
        """
        if not self._is_data_parallel_mode():
            return loss

        loss_scale = to_variable(
            np.array([self._strategy.nranks]).astype("float32"))
        loss_scale.stop_gradient = True
        loss = loss / loss_scale
        return loss
Example #22
def train(env):
    """Train"""
    args = env.args

    logging.info("loading data.")
    train = Corpus.load(args.train_data_path, env.fields)
    dev = Corpus.load(args.valid_data_path, env.fields)
    test = Corpus.load(args.test_data_path, env.fields)
    logging.info("init dataset.")
    train = TextDataset(train, env.fields, args.buckets)
    dev = TextDataset(dev, env.fields, args.buckets)
    test = TextDataset(test, env.fields, args.buckets)
    logging.info("set the data loaders.")
    train.loader = batchify(train, args.batch_size, args.use_data_parallel,
                            True)
    dev.loader = batchify(dev, args.batch_size)
    test.loader = batchify(test, args.batch_size)

    logging.info(f"{'train:':6} {len(train):5} sentences, "
                 f"{len(train.loader):3} batches, "
                 f"{len(train.buckets)} buckets")
    logging.info(f"{'dev:':6} {len(dev):5} sentences, "
                 f"{len(dev.loader):3} batches, "
                 f"{len(train.buckets)} buckets")
    logging.info(f"{'test:':6} {len(test):5} sentences, "
                 f"{len(test.loader):3} batches, "
                 f"{len(train.buckets)} buckets")

    logging.info("Create the model")
    model = Model(args, env.WORD.embed)

    # init parallel strategy
    if args.use_data_parallel:
        strategy = dygraph.parallel.prepare_context()
        model = dygraph.parallel.DataParallel(model, strategy)

    if args.use_cuda:
        grad_clip = fluid.clip.GradientClipByNorm(clip_norm=args.clip)
    else:
        grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=args.clip)
    decay = dygraph.ExponentialDecay(learning_rate=args.lr,
                                     decay_steps=args.decay_steps,
                                     decay_rate=args.decay)
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=decay,
        beta1=args.mu,
        beta2=args.nu,
        epsilon=args.epsilon,
        parameter_list=model.parameters(),
        grad_clip=grad_clip)

    total_time = datetime.timedelta()
    best_e, best_metric = 1, Metric()

    puncts = dygraph.to_variable(env.puncts, zero_copy=False)
    logging.info("start training.")
    for epoch in range(1, args.epochs + 1):
        start = datetime.datetime.now()
        # train one epoch and update the parameter
        logging.info(f"Epoch {epoch} / {args.epochs}:")
        epoch_train(args, model, optimizer, train.loader, epoch)
        if args.local_rank == 0:
            loss, dev_metric = epoch_evaluate(args, model, dev.loader, puncts)
            logging.info(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
            loss, test_metric = epoch_evaluate(args, model, test.loader,
                                               puncts)
            logging.info(f"{'test:':6} Loss: {loss:.4f} {test_metric}")

            t = datetime.datetime.now() - start
            # save the model if it is the best so far
            if dev_metric > best_metric and epoch > args.patience // 10:
                best_e, best_metric = epoch, dev_metric
                save(args.model_path, args, model, optimizer)
                logging.info(f"{t}s elapsed (saved)\n")
            else:
                logging.info(f"{t}s elapsed\n")
            total_time += t
            if epoch - best_e >= args.patience:
                break
    if args.local_rank == 0:
        model = load(args.model_path, model)
        loss, metric = epoch_evaluate(args, model, test.loader, puncts)
        logging.info(
            f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        logging.info(
            f"the score of test at epoch {best_e} is {metric.score:.2%}")
        logging.info(f"average time of each epoch is {total_time / epoch}s")
        logging.info(f"{total_time}s elapsed")
Example #23
    def __init__(self, args, vocab_size, num_labels, length=None):
        super(lex_net, self).__init__()
        """
        define the lexical analysis network structure
        word: stores the input of the model
        for_infer: a boolean value, indicating if the model to be created is for training or predicting.

        return:
            for infer: return the prediction
            otherwise: return the prediction
        """
        self.word_emb_dim = args.word_emb_dim
        self.vocab_size = vocab_size
        self.num_labels = num_labels
        self.grnn_hidden_dim = args.grnn_hidden_dim
        self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(
            args) else 1.0
        self.crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(
            args) else 1.0
        self.bigru_num = args.bigru_num
        self.init_bound = 0.1
        #self.IS_SPARSE = True

        self.word_embedding = Embedding(
            size=[self.vocab_size, self.word_emb_dim],
            dtype='float32',
            #is_sparse=self.IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=self.emb_lr,
                                       name="word_emb",
                                       initializer=fluid.initializer.Uniform(
                                           low=-self.init_bound,
                                           high=self.init_bound)))

        h_0 = np.zeros((args.batch_size, self.grnn_hidden_dim),
                       dtype="float32")
        h_0 = to_variable(h_0)
        self.bigru_units = []
        for i in range(self.bigru_num):
            if i == 0:
                self.bigru_units.append(
                    self.add_sublayer(
                        "bigru_units%d" % i,
                        BiGRU(self.grnn_hidden_dim,
                              self.grnn_hidden_dim,
                              self.init_bound,
                              h_0=h_0)))
            else:
                self.bigru_units.append(
                    self.add_sublayer(
                        "bigru_units%d" % i,
                        BiGRU(self.grnn_hidden_dim * 2,
                              self.grnn_hidden_dim,
                              self.init_bound,
                              h_0=h_0)))

        self.fc = Linear(input_dim=self.grnn_hidden_dim * 2,
                         output_dim=self.num_labels,
                         param_attr=fluid.ParamAttr(
                             initializer=fluid.initializer.Uniform(
                                 low=-self.init_bound, high=self.init_bound),
                             regularizer=fluid.regularizer.L2DecayRegularizer(
                                 regularization_coeff=1e-4)))  #,
        #num_flatten_dims=2)

        self.linear_chain_crf = Linear_chain_crf(param_attr=fluid.ParamAttr(
            name='linear_chain_crfw', learning_rate=self.crf_lr),
                                                 size=self.num_labels)

        self.crf_decoding = Crf_decoding(param_attr=fluid.ParamAttr(
            name='crfw', learning_rate=self.crf_lr),
                                         size=self.num_labels)
Example #24

@np.vectorize
def rev_lookup(i):
    return rev_dict[i]


ernie = ErnieCloze.from_pretrained(model_dir)
ernie.eval()

ids, _ = tokenizer.encode(
    '戊[MASK]变法,又称百日维新,是 [MASK] [MASK] [MASK] 、梁启超等维新派人士通过光绪帝进行 的一场资产阶级改良。')
mask_id = tokenizer.mask_id
print(ids)
ids = np.expand_dims(ids, 0)
ids = D.to_variable(ids)
logits = ernie(ids).numpy()
output_ids = np.argmax(logits, -1)
seg_txt = rev_lookup(output_ids)

print(''.join(seg_txt))


def predict_mask(sentence_with_mask):
    """
    Predict multiple [MASK] tokens; supports top-5 candidates and multiple masks.
    :param sentence_with_mask:
    :return:
    """
    ids, id_types = tokenizer.encode(sentence_with_mask)
    mask_id = tokenizer.mask_id
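`predict_mask` is cut off here; a minimal sketch of how it might continue (not the original body), reusing `ernie`, `D`, and `rev_lookup` from above and reporting only the argmax prediction at each [MASK] position, is:

    # (sketch continuation) run the model on a single-sentence batch
    ids = D.to_variable(np.expand_dims(ids, 0))
    logits = ernie(ids).numpy()
    pred_ids = np.argmax(logits, -1)[0]
    # keep only the predictions at the [MASK] positions
    mask_positions = np.where(ids.numpy()[0] == mask_id)[0]
    return ''.join(rev_lookup(pred_ids[mask_positions]))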
Example #25
def train_bmn(args, place, to_static):
    program_translator.enable(to_static)
    loss_data = []

    with fluid.dygraph.guard(place):
        paddle.manual_seed(SEED)
        paddle.framework.random._manual_program_seed(SEED)
        global local_random
        local_random = np.random.RandomState(SEED)

        bmn = BMN(args)
        adam = optimizer(args, parameter_list=bmn.parameters())

        train_reader = fake_data_reader(args, 'train')

        for epoch in range(args.epoch):
            for batch_id, data in enumerate(train_reader()):
                video_feat = np.array([item[0]
                                       for item in data]).astype(DATATYPE)
                gt_iou_map = np.array([item[1]
                                       for item in data]).astype(DATATYPE)
                gt_start = np.array([item[2]
                                     for item in data]).astype(DATATYPE)
                gt_end = np.array([item[3] for item in data]).astype(DATATYPE)

                x_data = to_variable(video_feat)
                gt_iou_map = to_variable(gt_iou_map)
                gt_start = to_variable(gt_start)
                gt_end = to_variable(gt_end)
                gt_iou_map.stop_gradient = True
                gt_start.stop_gradient = True
                gt_end.stop_gradient = True

                pred_bm, pred_start, pred_end = bmn(x_data)

                loss, tem_loss, pem_reg_loss, pem_cls_loss = bmn_loss_func(
                    pred_bm, pred_start, pred_end, gt_iou_map, gt_start,
                    gt_end, args)
                avg_loss = fluid.layers.mean(loss)

                avg_loss.backward()
                adam.minimize(avg_loss)
                bmn.clear_gradients()
                # log loss data to verify correctness
                loss_data += [
                    avg_loss.numpy()[0],
                    tem_loss.numpy()[0],
                    pem_reg_loss.numpy()[0],
                    pem_cls_loss.numpy()[0]
                ]

                if args.log_interval > 0 and batch_id % args.log_interval == 0:
                    print('[TRAIN] Epoch {}, iter {} '.format(epoch, batch_id) +
                          '\tLoss = {:f}, \ttem_loss = {:f}, \tpem_reg_loss = {:f}, \tpem_cls_loss = {:f}'.format(
                              avg_loss.numpy()[0], tem_loss.numpy()[0],
                              pem_reg_loss.numpy()[0], pem_cls_loss.numpy()[0]))

                # validation
                if batch_id % args.valid_interval == 0 and batch_id > 0:
                    bmn.eval()
                    val_loss_data = val_bmn(bmn, args)
                    bmn.train()
                    loss_data += val_loss_data

                if batch_id == args.train_batch_num:
                    if to_static:
                        fluid.dygraph.jit.save(bmn, args.infer_dir)
                    else:
                        fluid.dygraph.save_dygraph(bmn.state_dict(),
                                                   args.dy_param_path)
                    break
        return np.array(loss_data)
Example #26
            if args.init_checkpoint is not None:
                print('loading checkpoint from %s' % args.init_checkpoint)
                sd, _ = FD.load_dygraph(args.init_checkpoint)
                model.set_dict(sd)

        test_batch_data = batchify(test_features, args.bsz, args.max_seqlen)
        if args.debug:
            print(len(test_batch_data))
            print(test_batch_data[0])
            token_ids, seg_ids, labels = test_batch_data[0]
            for r1, r2 in zip(token_ids[:5], seg_ids[:5]):
                print(r1)
                print(r2)
                print(convert_ids_to_tokens(tokenizer.vocab, r1))        
        y_pred = []
        with FD.base._switch_tracer_mode_guard_(is_train=False):
            model.eval()
            for step, d in enumerate(tqdm(test_batch_data, desc='predicting')):
                ids, sids, _ = d
                ids, sids = FD.to_variable(ids), FD.to_variable(sids)
                _, logits = model(ids, sids)
                #print('\n'.join(map(str, logits.numpy().tolist())))
                y_pred += L.softmax(logits, -1).numpy().tolist()
                if args.debug and len(y_pred) > 5:
                    break

    print(len(y_pred), y_pred[:5])
    print(test_segs[:5])

    with open(args.save_path, 'wb') as f:
        pickle.dump({'segs': test_segs, 'y_pred': y_pred}, f)
Example #27
 def run_main(self, np_arr, place):
     with guard(place):
         embedding = Embedding(size=[10, 10])
         var = to_variable(np_arr)
         self.assertTrue(np.array_equal(np_arr, var.numpy()))
Example #28
    def forward(self, outputs, targets):
        """
        Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates

            targets: This is a list of targets (len(targets) == batch_size), where each target is a dict containing:
                "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                          objects in the target) containing the class labels
                "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
        
        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        with dg.no_grad():
            bs, num_queries, num_classes = outputs["pred_logits"].shape

            # We flatten to compute the cost matrices in a batch
            out_prob = L.reshape(
                outputs["pred_logits"],
                [-1, num_classes])  # [batch_size * num_queries, num_classes]
            out_prob = L.softmax(
                out_prob, axis=-1)  # [batch_size * num_queries, num_classes]
            out_bbox = L.reshape(outputs["pred_boxes"],
                                 [-1, 4])  # [batch_size * num_queries, 4]

            # Also concatenate the target labels and boxes
            tgt_ids = L.concat([v["labels"] for v in targets]).astype(
                "int64")  # [batch_size * num_target_boxes_i]
            tgt_bbox = L.concat([v["boxes"] for v in targets]).astype(
                "float32")  # [batch_size * num_target_boxes_i]

            # Compute the classification cost. Contrary to the loss, we don't use the NLL,
            # but approximate it by 1 - proba[target class].
            # The 1 is a constant that doesn't change the matching, so it can be omitted.
            cost_class = -out_prob.numpy()[:, tgt_ids.numpy(
            )]  # [batch_size * num_queries, num_all_target_boxes]
            cost_class = dg.to_variable(cost_class)

            # Compute the L1 cost between boxes
            num_all_target_boxes = tgt_bbox.shape[0]
            expanded_out_bbox = L.expand(
                L.unsqueeze(out_bbox, [1]),
                [1, num_all_target_boxes, 1
                 ])  # [batch_size * num_queries, num_all_target_boxes, 4]
            expanded_tgt_bbox = L.expand(
                L.unsqueeze(tgt_bbox, [0]),
                [bs * num_queries, 1, 1
                 ])  # [batch_size * num_queries, num_all_target_boxes, 4]
            cost_bbox = F.loss.l1_loss(
                expanded_out_bbox, expanded_tgt_bbox, reduction='none'
            )  # [batch_size * num_queries, num_all_target_boxes, 4]
            cost_bbox = L.reduce_mean(
                cost_bbox,
                -1)  # [batch_size * num_queries, num_all_target_boxes]

            # Compute the giou cost between boxes
            cost_giou = -generalied_box_iou(box_cxcywh_to_xyxy(out_bbox),
                                            box_cxcywh_to_xyxy(tgt_bbox))

            # Final cost matrix
            C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
            C = L.reshape(
                C, [bs, num_queries, -1
                    ])  # [batch_size, num_queries, num_all_target_boxes]

            sizes = [len(v["boxes"]) for v in targets]

            indices = [
                linear_sum_assignment(c[i].numpy())
                for i, c in enumerate(L.split(C, sizes, dim=-1))
            ]

            return [(dg.to_variable(i.astype("int64")),
                     dg.to_variable(j.astype("int64"))) for i, j in indices]
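The final indices come from SciPy's Hungarian solver; a toy example of what `linear_sum_assignment` returns for a small cost matrix:

import numpy as np
from scipy.optimize import linear_sum_assignment

cost = np.array([[0.9, 0.1, 0.5],
                 [0.2, 0.8, 0.4]])   # 2 predictions x 3 targets
row_ind, col_ind = linear_sum_assignment(cost)
print(row_ind, col_ind)               # [0 1] [1 0]: prediction 0 -> target 1, prediction 1 -> target 0
print(cost[row_ind, col_ind].sum())   # ~0.3, the minimal total assignment cost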
Example #29
 def setUp(self):
     paddle.disable_static()
     program_trans.enable(True)
     self.x = to_variable(np.ones([4, 10]).astype('float32'))
Example #30
    def train(self, train_data_list, eval_data_list,
            model_save_path=None, best_model_save_path=None,
            epochs=5, batch_size=32, learning_rate=5e-5, max_seq_len=300,
            max_ensure=False, print_step=50, load_best_model=True,
            **kwargs):
        """ 训练dygraph模型
        [IN]  model: dygraph模型结构
              optimizer: 优化器
              train_data_list: list[(input1[, input2, ...], label)], 训练数据
              eval_data_list: list[(input1[, input2, ...], label)], 评估数据
              label_encoder: LabelEncoder, 类别转化工具
              model_save_path: string, 模型存储路径
              best_model_save_path: string, 最优模型存储路径
              epochs:  int, 训练轮数
              batch_size: int, 批大小
              max_seq_len: int, 最大长度
              max_ensure: boolean, true则始终补齐到max_seq_len
              best_acc: float, 最优acc初始值
              print_step: int, 每个print_step打印训练情况
              logits_softmax: boolean, true则验证时输出softmax后的logits
              eval_method: str, eval模型效果
              with_label: boolean, true则数据中有label
        [OUT] best_acc: float, 训练得到的最优acc
        """
        logging.info("train model start")
        train_start_time = time.time()
        # 加载最优模型
        if load_best_model:
            self.load_model(best_model_save_path)
        # 进入train模式
        self.model.train()
        # 初始化优化器
        self.init_optimizer(learning_rate)

        def train_data_reader():
            return  gen_batch_data(train_data_list, batch_size, max_seq_len, max_ensure)

        cur_train_step = 0
        for cur_epoch in range(epochs):
            # shuffle the data before each epoch for the best training results;
            np.random.shuffle(train_data_list)
            train_data_batch = F.contrib.reader.distributed_batch_reader(train_data_reader)() \
                    if self.parallelized else train_data_reader()
            for cur_train_batch in train_data_batch:
                cur_train_step += 1
                cur_train_batch = [D.to_variable(x) for x in cur_train_batch]
                loss = self.get_loss(*cur_train_batch, **kwargs)
                if self.parallelized:
                    # in multi-card training, normalize the loss across the trainers
                    loss = self.model.scale_loss(loss)
                # backward pass
                loss.backward()
                if self.parallelized:
                    # in multi-card training, collect the gradients from all trainers
                    # note: the gradients must be LoDTensors (dense matrices) when updating;
                    # e.g. the is_sparse parameter of the embedding layer must be False,
                    #      otherwise the update is sparse and multi-card training will fail
                    self.model.apply_collective_grads()
                self.optimizer.minimize(loss)
                # clear the gradients
                self.model.clear_gradients()
                if cur_train_step % print_step == 0:
                    speed = cur_train_step / (time.time() - train_start_time)
                    logging.info('train epoch %d, step %d: loss %.5f, speed %.2f step/s' % \
                            (cur_epoch, cur_train_step, loss.numpy(), speed))

            if model_save_path is not None:
                # save the model after each epoch
                logging.info("save model at epoch {}".format(cur_epoch))
                self.save_model(model_save_path + "_epoch{}".format(cur_epoch))

            # compute the accuracy on the evaluation set
            cur_eval_res = self.evaluate(eval_data_list, batch_size=batch_size, max_seq_len=max_seq_len, **kwargs)
            is_best = self.check_if_best(cur_eval_res)
            if best_model_save_path is not None and is_best:
                # if this is the best model so far, save it as the best model
                logging.info("cur best score, save model at epoch {} as best model".format(cur_epoch))
                self.save_model(best_model_save_path)
        logging.info("train model cost time %.4fs" % (time.time() - train_start_time))
        return self.get_best_score()