Example #1
def evaluate(model: tf.keras.Model,
             data_path: str,
             batch_size: int,
             buffer_size: int,
             dict_path: str = "",
             length_path: str = "",
             max_data_size: int = 0):
    """
    评估模块
    :param model: 模型
    :param data_path: 文本数据路径
    :param buffer_size: Dataset加载缓存大小
    :param batch_size: Dataset加载批大小
    :param dict_path: 字典路径,若使用phoneme则不用传
    :param max_data_size: 最大训练数据量
    :param length_path: 训练样本长度保存路径
    :return: 返回历史指标数据
    """
    valid_dataset, _, valid_steps_per_epoch, _ = \
        load_data(train_data_path=data_path, batch_size=batch_size, buffer_size=buffer_size,
                  valid_data_split=0.0, valid_data_path="", train_length_path=length_path,
                  valid_length_path="", max_train_data_size=max_data_size, max_valid_data_size=0)

    tokenizer = load_tokenizer(dict_path=dict_path)
    enc_hidden = model.initialize_hidden_state()
    dec_input = tf.cast(tf.expand_dims([tokenizer.word_index.get('<start>')] *
                                       batch_size, 1),
                        dtype=tf.int64)

    _, _, _ = _valid_step(model=model,
                          dataset=valid_dataset,
                          steps_per_epoch=valid_steps_per_epoch,
                          tokenizer=tokenizer,
                          enc_hidden=enc_hidden,
                          dec_input=dec_input)
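
A minimal usage sketch for this evaluate variant. It is illustrative only: build_model(), the checkpoint directory and the data paths below are assumptions, not part of the original snippet; any tf.keras.Model exposing initialize_hidden_state() would fit.

# Hypothetical usage -- build_model() and all paths are placeholders.
model = build_model()
checkpoint = tf.train.Checkpoint(model=model)
checkpoint.restore(tf.train.latest_checkpoint("./checkpoints"))

evaluate(model=model,
         data_path="./data/valid.txt",
         batch_size=16,
         buffer_size=1000,
         dict_path="./data/dict.json",
         length_path="./data/valid_length.txt")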
Example #2
def evaluate(encoder: tf.keras.Model,
             decoder: tf.keras.Model,
             data_path: str,
             batch_size: int,
             buffer_size: int,
             length_path: str = "",
             max_data_size: int = 0):
    """
    评估模块
    :param encoder: 模型的encoder
    :param decoder: 模型的decoder
    :param data_path: 文本数据路径
    :param buffer_size: Dataset加载缓存大小
    :param batch_size: Dataset加载批大小
    :param max_data_size: 最大训练数据量
    :param length_path: 训练样本长度保存路径
    :return: 无返回值
    """
    valid_dataset, _, valid_steps_per_epoch, _ = \
        load_data(train_data_path=data_path, batch_size=batch_size, buffer_size=buffer_size,
                  valid_data_split=0.0, valid_data_path="", train_length_path=length_path,
                  valid_length_path="", max_train_data_size=max_data_size, max_valid_data_size=0)

    _valid_step(encoder=encoder,
                decoder=decoder,
                dataset=valid_dataset,
                steps_per_epoch=valid_steps_per_epoch)
Example #3
def evaluate(model: tf.keras.Model,
             data_path: str,
             batch_size: int,
             buffer_size: int,
             dict_path: str = "",
             length_path: str = "",
             max_data_size: int = 0):
    """
    评估模块
    :param model: 模型
    :param data_path: 文本数据路径
    :param buffer_size: Dataset加载缓存大小
    :param batch_size: Dataset加载批大小
    :param dict_path: 字典路径,若使用phoneme则不用传
    :param max_data_size: 最大训练数据量
    :param length_path: 训练样本长度保存路径
    :return: 返回历史指标数据
    """
    valid_dataset, _, valid_steps_per_epoch, _ = \
        load_data(train_data_path=data_path, batch_size=batch_size, buffer_size=buffer_size,
                  valid_data_split=0.0, valid_data_path="", train_length_path=length_path,
                  valid_length_path="", max_train_data_size=max_data_size, max_valid_data_size=0)

    tokenizer = load_tokenizer(dict_path=dict_path)

    _, _, _ = _valid_step(model=model,
                          dataset=valid_dataset,
                          steps_per_epoch=valid_steps_per_epoch,
                          tokenizer=tokenizer)
Example #4
def train(encoder: tf.keras.Model,
          decoder: tf.keras.Model,
          optimizer: tf.keras.optimizers.Adam,
          epochs: int,
          checkpoint: tf.train.CheckpointManager,
          train_data_path: str,
          max_len: int,
          vocab_size: int,
          batch_size: int,
          buffer_size: int,
          checkpoint_save_freq: int,
          dict_path: str = "",
          valid_data_split: float = 0.0,
          valid_data_path: str = "",
          max_train_data_size: int = 0,
          max_valid_data_size: int = 0):
    """
    训练模块
    :param encoder: 模型的encoder
    :param decoder: 模型的decoder
    :param optimizer: 优化器
    :param checkpoint: 检查点管理器
    :param epochs: 训练周期
    :param train_data_path: 文本数据路径
    :param max_len: 文本序列最大长度
    :param vocab_size: 词汇大小
    :param buffer_size: Dataset加载缓存大小
    :param batch_size: Dataset加载批大小
    :param dict_path: 字典路径,若使用phoneme则不用传
    :param valid_data_split: 用于从训练数据中划分验证数据
    :param valid_data_path: 验证数据文本路径
    :param max_train_data_size: 最大训练数据量
    :param max_valid_data_size: 最大验证数据量
    :param checkpoint_save_freq: 检查点保存频率
    """
    _, train_dataset, valid_dataset, steps_per_epoch, valid_steps_per_epoch = \
        load_data(train_data_path=train_data_path, max_len=max_len, vocab_size=vocab_size,
                  batch_size=batch_size, buffer_size=buffer_size, dict_path=dict_path,
                  valid_data_split=valid_data_split, valid_data_path=valid_data_path,
                  max_train_data_size=max_train_data_size, max_valid_data_size=max_valid_data_size)

    if steps_per_epoch == 0:
        print("训练数据量过小,小于batch_size,请添加数据后重试")
        exit(0)

    for epoch in range(epochs):
        print('Epoch {}/{}'.format(epoch + 1, epochs))
        start_time = time.time()
        total_loss = 0

        for (batch,
             (audio_feature,
              sentence)) in enumerate(train_dataset.take(steps_per_epoch)):
            batch_start = time.time()
            sentence_input = sentence[:, :-1]
            sentence_real = sentence[:, 1:]

            batch_loss, sentence_predictions = _train_step(
                encoder, decoder, optimizer, sentence_input, sentence_real,
                audio_feature)
            total_loss += batch_loss

            print('\r{}/{} [Batch {} Loss {:.4f} {:.1f}s]'.format(
                (batch + 1), steps_per_epoch, batch + 1, batch_loss.numpy(),
                (time.time() - batch_start)),
                  end="")

        print(' - {:.0f}s/step - loss: {:.4f}'.format(
            (time.time() - start_time) / steps_per_epoch,
            total_loss / steps_per_epoch))

        if (epoch + 1) % checkpoint_save_freq == 0:
            checkpoint.save()

            if valid_steps_per_epoch == 0:
                print("验证数据量过小,小于batch_size,请添加数据后重试")
                exit(0)

            _valid_step(encoder=encoder,
                        decoder=decoder,
                        dataset=valid_dataset,
                        steps_per_epoch=valid_steps_per_epoch)
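
A usage sketch for this encoder/decoder trainer, illustrative only: build_encoder()/build_decoder() and every path below are assumptions. The checkpoint argument is a tf.train.CheckpointManager (matching the type hint above), whose save() method the training loop calls.

# Hypothetical usage -- build_encoder()/build_decoder() and the paths are placeholders.
encoder = build_encoder()
decoder = build_decoder()
optimizer = tf.keras.optimizers.Adam()

ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder, optimizer=optimizer)
manager = tf.train.CheckpointManager(ckpt, directory="./checkpoints", max_to_keep=3)

train(encoder=encoder,
      decoder=decoder,
      optimizer=optimizer,
      epochs=10,
      checkpoint=manager,
      train_data_path="./data/train.txt",
      max_len=40,
      vocab_size=1000,
      batch_size=16,
      buffer_size=1000,
      checkpoint_save_freq=2,
      dict_path="./data/dict.json",
      valid_data_split=0.1)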
Example #5
        model = las.LAS(test_vocab_tar_size, cnn1_filters, cnn1_kernel_size,
                        cnn2_filters, cnn2_kernel_size, max_pool_strides,
                        max_pool_size, d, w, emb_dim, dec_units, batch_size)
    # Checkpoint
    checkpoint_dir = config.checkpoint_dir
    checkpoint_prefix = os.path.join(checkpoint_dir, config.checkpoint_prefix)
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

    # Restore the latest checkpoint in the checkpoint directory (checkpoint_dir)
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    results = []
    labels_list = []

    # Load the test-set data generator
    test_data = load_dataset.load_data(dataset_name, data_path, num_examples)
    batchs = len(test_data[0]) // batch_size
    print("构建数据生成器......")
    test_data_generator = test_generator(
        test_data, batchs, batch_size, audio_feature_type,
        dataset_information["max_input_length"])

    word_index = dataset_information["word_index"]
    index_word = dataset_information["index_word"]
    max_label_length = dataset_information["max_label_length"]
    beam_search_container = beamsearch.BeamSearchDecoder(
        beam_size=config.beam_size, min_score=0)

    for batch, (inp, _, targ) in zip(range(1, batchs + 1),
                                     test_data_generator):
        hidden = model.initialize_hidden_state()
Example #6
from hlp.stt.ds2.util import get_config
from hlp.stt.utils.load_dataset import load_data
from hlp.stt.utils.audio_process import max_audio_length
from hlp.stt.utils.text_process import split_sentences, get_max_label_length, tokenize_and_encode


if __name__ == "__main__":
    configs = get_config()

    dataset_name = configs["preprocess"]["dataset_name"]
    data_path = configs["train"]["data_path"]
    num_examples = configs["train"]["num_examples"]

    # Get the list of audio file paths and the list of transcripts from the corpus
    print("Reading audio files and transcripts from the dataset...")
    audio_data_path_list, text_list = load_data(dataset_name, data_path, num_examples)

    print("对文本进行切分...")
    mode = configs["preprocess"]["text_process_mode"]
    splitted_text_list = split_sentences(text_list, mode)

    print("对文本进行编码...")
    text_int_sequences, tokenizer = tokenize_and_encode(splitted_text_list)

    print("统计最长语音和转写长度...")
    audio_feature_type = configs["other"]["audio_feature_type"]
    max_input_length = max_audio_length(audio_data_path_list, audio_feature_type)
    max_label_length = get_max_label_length(text_int_sequences)

    print("保存数据集信息...")
    ds_info_path = configs["preprocess"]["dataset_info_path"]
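
    # The snippet is cut off here. A plausible continuation (an assumption, not
    # the original code) would serialize the statistics that Example #5 later
    # reads back as "dataset_information" -- word_index, index_word,
    # max_input_length and max_label_length -- to ds_info_path as JSON.
    import json  # would normally sit with the other imports at the top

    dataset_information = {
        "word_index": tokenizer.word_index,
        "index_word": tokenizer.index_word,
        "max_input_length": max_input_length,
        "max_label_length": max_label_length,
    }
    with open(ds_info_path, "w", encoding="utf-8") as ds_info_file:
        json.dump(dataset_information, ds_info_file, ensure_ascii=False, indent=4)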
Example #7
def train(epochs: int,
          train_data_path: str,
          batch_size: int,
          buffer_size: int,
          checkpoint_save_freq: int,
          checkpoint: tf.train.CheckpointManager,
          model: tf.keras.Model,
          optimizer: tf.keras.optimizers.Adam,
          dict_path: str = "",
          valid_data_split: float = 0.0,
          valid_data_path: str = "",
          train_length_path: str = "",
          valid_length_path: str = "",
          max_train_data_size: int = 0,
          max_valid_data_size: int = 0,
          history_img_path: str = ""):
    """
    训练模块
    :param epochs: 训练周期
    :param train_data_path: 文本数据路径
    :param dict_path: 字典路径,若使用phoneme则不用传
    :param buffer_size: Dataset加载缓存大小
    :param batch_size: Dataset加载批大小
    :param checkpoint: 检查点管理器
    :param model: 模型
    :param optimizer: 优化器
    :param valid_data_split: 用于从训练数据中划分验证数据
    :param valid_data_path: 验证数据文本路径
    :param max_train_data_size: 最大训练数据量
    :param train_length_path: 训练样本长度保存路径
    :param valid_length_path: 验证样本长度保存路径
    :param max_valid_data_size: 最大验证数据量
    :param checkpoint_save_freq: 检查点保存频率
    :param history_img_path: 历史指标数据图表保存路径
    :return:
    """
    train_dataset, valid_dataset, steps_per_epoch, valid_steps_per_epoch = \
        load_data(train_data_path=train_data_path, batch_size=batch_size, buffer_size=buffer_size,
                  valid_data_split=valid_data_split, valid_data_path=valid_data_path,
                  train_length_path=train_length_path, valid_length_path=valid_length_path,
                  max_train_data_size=max_train_data_size, max_valid_data_size=max_valid_data_size)

    tokenizer = load_tokenizer(dict_path=dict_path)
    history = {"loss": [], "wers": [], "norm_lers": []}

    if steps_per_epoch == 0:
        print("训练数据量过小,小于batch_size,请添加数据后重试")
        exit(0)

    for epoch in range(epochs):
        total_loss = 0
        start_time = time.time()
        enc_hidden = model.initialize_hidden_state()
        dec_input = tf.cast(tf.expand_dims(
            [tokenizer.word_index.get('<start>')] * batch_size, 1),
                            dtype=tf.int64)

        print("Epoch {}/{}".format(epoch + 1, epochs))
        for (batch,
             (audio_feature, sentence,
              length)) in enumerate(train_dataset.take(steps_per_epoch)):
            batch_start = time.time()

            batch_loss = _train_step(model, optimizer, audio_feature, sentence,
                                     enc_hidden, dec_input)
            total_loss += batch_loss

            print('\r{}/{} [Batch {} Loss {:.4f} {:.1f}s]'.format(
                (batch + 1), steps_per_epoch, batch + 1, batch_loss.numpy(),
                (time.time() - batch_start)),
                  end="")

        print(' - {:.0f}s/step - loss: {:.4f}'.format(
            (time.time() - start_time) / steps_per_epoch,
            total_loss / steps_per_epoch))

        if (epoch + 1) % checkpoint_save_freq == 0:
            checkpoint.save()

            if valid_steps_per_epoch == 0:
                print("验证数据量过小,小于batch_size,请添加数据后重试")
                exit(0)

            valid_loss, valid_wer, valid_ler = _valid_step(
                model=model,
                dataset=valid_dataset,
                enc_hidden=enc_hidden,
                dec_input=dec_input,
                steps_per_epoch=valid_steps_per_epoch,
                tokenizer=tokenizer)
            history["wers"].append(valid_wer)
            history["norm_lers"].append(valid_ler)

    plot_history(history=history,
                 valid_epoch_freq=checkpoint_save_freq,
                 history_img_path=history_img_path)
    return history
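
plot_history itself is not shown in these snippets. Below is a minimal sketch of what such a helper might do with the returned history dict, assuming matplotlib; note that WER and normalized LER are only appended once every checkpoint_save_freq epochs. It is purely illustrative, not the project's implementation.

# Illustrative sketch only -- not the project's actual plot_history.
import matplotlib.pyplot as plt

def plot_history(history: dict, valid_epoch_freq: int, history_img_path: str):
    # WER / normalized LER are recorded once every valid_epoch_freq epochs.
    valid_epochs = [(i + 1) * valid_epoch_freq for i in range(len(history["wers"]))]
    plt.plot(valid_epochs, history["wers"], label="WER")
    plt.plot(valid_epochs, history["norm_lers"], label="normalized LER")
    plt.xlabel("epoch")
    plt.legend()
    plt.savefig(history_img_path)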
Example #8
def train(model: tf.keras.Model,
          optimizer: tf.keras.optimizers.Adam,
          epochs: int,
          checkpoint: tf.train.CheckpointManager,
          train_data_path: str,
          batch_size: int,
          buffer_size: int,
          checkpoint_save_freq: int,
          dict_path: str = "",
          valid_data_split: float = 0.0,
          valid_data_path: str = "",
          train_length_path: str = "",
          valid_length_path: str = "",
          stop_early_limits: int = 0,
          max_train_data_size: int = 0,
          max_valid_data_size: int = 0,
          history_img_path: str = ""):
    """
    训练模块
    :param model: 模型
    :param optimizer: 优化器
    :param checkpoint: 检查点管理器
    :param epochs: 训练周期
    :param train_data_path: 文本数据路径
    :param buffer_size: Dataset加载缓存大小
    :param batch_size: Dataset加载批大小
    :param dict_path: 字典路径,若使用phoneme则不用传
    :param valid_data_split: 用于从训练数据中划分验证数据
    :param valid_data_path: 验证数据文本路径
    :param max_train_data_size: 最大训练数据量
    :param train_length_path: 训练样本长度保存路径
    :param valid_length_path: 验证样本长度保存路径
    :param stop_early_limits: 不增长停止个数
    :param max_valid_data_size: 最大验证数据量
    :param checkpoint_save_freq: 检查点保存频率
    :param history_img_path: 历史指标数据图表保存路径
    :return: 返回历史指标数据
    """
    train_dataset, valid_dataset, steps_per_epoch, valid_steps_per_epoch = \
        load_data(train_data_path=train_data_path, batch_size=batch_size, buffer_size=buffer_size,
                  valid_data_split=valid_data_split, valid_data_path=valid_data_path,
                  train_length_path=train_length_path, valid_length_path=valid_length_path,
                  max_train_data_size=max_train_data_size, max_valid_data_size=max_valid_data_size)

    tokenizer = load_tokenizer(dict_path=dict_path)

    history = {"loss": [], "wers": [], "norm_lers": []}

    if steps_per_epoch == 0:
        print("训练数据量过小,小于batch_size,请添加数据后重试")
        exit(0)

    for epoch in range(epochs):
        print('Epoch {}/{}'.format(epoch + 1, epochs))
        start_time = time.time()
        total_loss = 0

        for (batch,
             (audio_feature, sentence,
              length)) in enumerate(train_dataset.take(steps_per_epoch)):
            batch_start = time.time()

            batch_loss = _train_step(model, optimizer, sentence, length,
                                     audio_feature)
            total_loss += batch_loss

            print('\r{}/{} [Batch {} Loss {:.4f} {:.1f}s]'.format(
                (batch + 1), steps_per_epoch, batch + 1, batch_loss.numpy(),
                (time.time() - batch_start)),
                  end="")

        print(' - {:.0f}s/step - loss: {:.4f}'.format(
            (time.time() - start_time) / steps_per_epoch,
            total_loss / steps_per_epoch))
        history["loss"].append(total_loss / steps_per_epoch)

        if (epoch + 1) % checkpoint_save_freq == 0:
            checkpoint.save()

            if valid_steps_per_epoch == 0:
                print("验证数据量过小,小于batch_size,请添加数据后重试")
                exit(0)

            valid_loss, valid_wer, valid_ler = _valid_step(
                model=model,
                dataset=valid_dataset,
                steps_per_epoch=valid_steps_per_epoch,
                tokenizer=tokenizer)
            history["wers"].append(valid_wer)
            history["norm_lers"].append(valid_ler)

            if stop_early_limits != 0 and len(
                    history["wers"]) >= stop_early_limits:
                if can_stop(history["wers"][-stop_early_limits:]) \
                        or can_stop(history["norm_lers"][-stop_early_limits:]):
                    print("指标反弹,停止训练!")
                    break
    plot_history(history=history,
                 valid_epoch_freq=checkpoint_save_freq,
                 history_img_path=history_img_path)
    return history
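
can_stop is also external to these snippets. Judging from its use above (it receives the last stop_early_limits WER or normalized-LER values and triggers the "metrics rebounded" message), it presumably reports whether the error metric has stopped improving. A minimal illustrative version under that assumption:

# Assumed behavior, not the project's actual implementation.
def can_stop(metrics: list) -> bool:
    # True when no value in the window improves on (drops below) the first one,
    # i.e. the error metric has plateaued or rebounded.
    return all(value >= metrics[0] for value in metrics[1:])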
Example #9
def train(epochs: int,
          train_data_path: str,
          max_len: int,
          vocab_size: int,
          batch_size: int,
          buffer_size: int,
          checkpoint_save_freq: int,
          checkpoint: tf.train.CheckpointManager,
          model: tf.keras.Model,
          optimizer: tf.keras.optimizers.Adam,
          dict_path: str = "",
          valid_data_split: float = 0.0,
          valid_data_path: str = "",
          max_train_data_size: int = 0,
          max_valid_data_size: int = 0):
    """
    训练模块
    :param epochs: 训练周期
    :param train_data_path: 文本数据路径
    :param max_len: 文本序列最大长度
    :param vocab_size: 词汇大小
    :param dict_path: 字典路径,若使用phoneme则不用传
    :param buffer_size: Dataset加载缓存大小
    :param batch_size: Dataset加载批大小
    :param checkpoint: 检查点管理器
    :param model: 模型
    :param optimizer: 优化器
    :param valid_data_split: 用于从训练数据中划分验证数据
    :param valid_data_path: 验证数据文本路径
    :param max_train_data_size: 最大训练数据量
    :param max_valid_data_size: 最大验证数据量
    :param checkpoint_save_freq: 检查点保存频率
    :return:
    """
    tokenizer, train_dataset, valid_dataset, steps_per_epoch, valid_steps_per_epoch = \
        load_data(train_data_path=train_data_path, max_len=max_len, vocab_size=vocab_size,
                  batch_size=batch_size, buffer_size=buffer_size, dict_path=dict_path,
                  valid_data_split=valid_data_split, valid_data_path=valid_data_path,
                  max_train_data_size=max_train_data_size, max_valid_data_size=max_valid_data_size)

    for epoch in range(epochs):
        start = time.time()
        enc_hidden = model.initialize_hidden_state()
        total_loss = 0
        batch_start = time.time()

        print("Epoch {}/{}".format(epoch + 1, epochs))
        for (batch,
             (audio_feature,
              sentence)) in enumerate(train_dataset.take(steps_per_epoch)):
            batch_loss = _train_step(audio_feature, sentence, enc_hidden,
                                     tokenizer, model, optimizer, batch_size)

            total_loss += batch_loss

            print('Epoch {} Batch {} Loss {:.4f} - {:.4f} sec'.format(
                epoch + 1, batch, batch_loss.numpy(),
                time.time() - batch_start))
            batch_start = time.time()

        print('Epoch {} Loss {:.4f} - {:.4f} sec'.format(
            epoch + 1, total_loss / steps_per_epoch,
            time.time() - start))

        if (epoch + 1) % checkpoint_save_freq == 0:
            checkpoint.save()
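
A usage sketch for this last trainer, illustrative only: build_model(), the optimizer settings and all paths are assumptions. Unlike the previous variants, load_data here builds and returns the tokenizer itself (driven by dict_path), so no separate load_tokenizer call is needed.

# Hypothetical usage -- build_model() and the paths are placeholders.
model = build_model()
optimizer = tf.keras.optimizers.Adam()
ckpt = tf.train.Checkpoint(model=model, optimizer=optimizer)
manager = tf.train.CheckpointManager(ckpt, directory="./checkpoints", max_to_keep=3)

train(epochs=10,
      train_data_path="./data/train.txt",
      max_len=40,
      vocab_size=1000,
      batch_size=16,
      buffer_size=1000,
      checkpoint_save_freq=2,
      checkpoint=manager,
      model=model,
      optimizer=optimizer,
      dict_path="./data/dict.json",
      valid_data_split=0.1)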