Example #1
    def train(datagene: TextDataGenerator, load_model=None):

        txtfs = datagene.load_from_path()

        max_label_len = 200

        pinyin_map = PinyinMapper(sil_mode=0)
        chs_map = ChsMapper()

        tloader = TextLoader2(
            txtfs,
            padding_length=max_label_len,
            pinyin_map=pinyin_map,
            chs_map=chs_map,
            grain=TextLoader2.grain_alpha,
            cut_sub=175,
        )

        model_helper = SOMMalpha()
        model_helper.compile(feature_shape=(max_label_len, ),
                             ms_pinyin_size=pinyin_map.max_index,
                             ms_output_size=chs_map.categores)

        if load_model is not None:
            model_helper.load(load_model)

        model_helper.fit(tloader, -1)
Example #2
def predict_dchmm(path="./model/DCBNN1D_cur_best.h5"):
    dcnn = DCHMM(acmodel_input_shape=(1600, 200),
                 acmodel_output_shape=(200, ),
                 lgmodel_input_shape=None,
                 py_map=PinyinMapper(sil_mode=-1),
                 chs_map=ChsMapper())

    dcnn.compile(path)

    while True:
        pyline, chline, prob = dcnn.record_from_cmd(3)
        print(pyline, chline, prob)
Example #3
def predict_dcsom(ac_path="./model/DCBNN1D_cur_best.h5",
                  lg_path="./model/language/SOMMalpha_step_18000.h5"):
    dcs = DCSOM(acmodel_input_shape=(1600, 200),
                acmodel_output_shape=(200, ),
                lgmodel_input_shape=(200, ),
                py_map=PinyinMapper(sil_mode=-1),
                chs_map=ChsMapper(),
                divide_feature=8)

    dcs.compile(ac_path, lg_path)
    while True:
        try:
            print(dcs.record_from_cmd(5))
        except Exception:
            print("[info*]未识别到语音")  # "no speech recognized"
Example #4
    def real_predict(path="./model/DCBNN1D_cur_best.h5"):
        '''
        :param path: path to the pretrained DCBNN1D weight file
        :return:
        '''
        dcnn = DCHMM(
            acmodel_input_shape=(1600, 200),
            acmodel_output_shape=(200,),
            lgmodel_input_shape=None,
            py_map=PinyinMapper(sil_mode=-1),
            chs_map=ChsMapper())

        dcnn.compile(path)

        while True:
            pyline, chline, prob = dcnn.record_from_cmd(3)
            print(pyline, chline, prob)
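
Per the docstring, the only input real_predict needs is the path to the pretrained DCBNN1D weights. A hypothetical call using the default path from the signature (the argument 3 to record_from_cmd is taken as-is from the example):

real_predict("./model/DCBNN1D_cur_best.h5")  # loops forever, printing (pyline, chline, prob) for each recording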
Example #5
    def real_predict(path):
        max_label_len = 200
        pinyin_map = PinyinMapper(sil_mode=0)
        chs_map = ChsMapper()

        model_helper = SOMMalpha()
        model_helper.compile(feature_shape=(max_label_len, ),
                             ms_pinyin_size=pinyin_map.max_index,
                             ms_output_size=chs_map.categores)

        model_helper.load(path)

        while True:
            string = input("请输入拼音:")  # prompt: "please enter pinyin:"
            xs = [pinyin_map.alist2vector(string)]
            print(xs)
            # predict() expects an (inputs, labels) pair; labels are None at inference time
            batch = pad_sequences(xs,
                                  maxlen=max_label_len,
                                  padding="post",
                                  truncating="post"), None
            result = model_helper.predict(batch)[0]
            print(result.replace("_", ""))
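
The interactive loop in Example #5 can also be factored into a small reusable helper. This is only a sketch built from the calls shown above; pinyin_to_text and its parameters are hypothetical names, and model_helper, pinyin_map, and pad_sequences are assumed to be set up exactly as in Example #5.

def pinyin_to_text(model_helper, pinyin_map, string, max_label_len=200):
    # Vectorize the pinyin string and pad it to the fixed label length,
    # mirroring the batch construction in Example #5.
    xs = [pinyin_map.alist2vector(string)]
    batch = pad_sequences(xs, maxlen=max_label_len,
                          padding="post", truncating="post"), None
    # predict() returns one decoded sentence per sample; strip the padding marker.
    return model_helper.predict(batch)[0].replace("_", "")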
Example #6
def train_dcnn1d(datagene: list, load_model=None):

    dataset = VoiceDatasetList()
    _, y_set = dataset.merge_load(datagene, choose_x=False, choose_y=True)

    max_label_len = 64
    pinyin_map = PinyinMapper(sil_mode=0)
    chs_map = ChsMapper()
    tloader = TextLoader(y_set,
                         padding_length=max_label_len,
                         pinyin_map=pinyin_map,
                         cut_sub=16,
                         chs_map=chs_map)

    model_helper = DCNN1D()
    model_helper.compile(feature_shape=(max_label_len, tloader.max_py_size),
                         ms_input_size=pinyin_map.max_index,
                         ms_output_size=chs_map.categores)

    if load_model is not None:
        model_helper.load(load_model)

    model_helper.fit(tloader, -1)
Example #7
        alpha_batch = ["".join(sample) for sample in raw_pylist_batch]

        alpha_vector_batch = self.py_map.batch_alist2vector(alpha_batch)
        alpha_vector_batch = TextLoader2.corpus2feature(
            alpha_vector_batch, self.lgmodel_input_shape[0])

        ch_list_batch, prob_batch = self.lg_model.predict(
            [alpha_vector_batch, None], True)

        pyline = np.concatenate(pylist_batch).tolist()
        chline = ",".join(ch_list_batch).replace("_", "")

        print(pyline, chline)
        return pyline, chline, [ctc_prob[0]]


if __name__ == "__main__":
    dcs = DCSOM(acmodel_input_shape=(1600, 200),
                acmodel_output_shape=(200,),
                lgmodel_input_shape=(200,),
                py_map=PinyinMapper(sil_mode=-1),
                chs_map=ChsMapper(),
                divide_feature=8)

    # dcs.compile("../model/DCBNN1D_step_326000.h5",
    #             "../model/language/SOMMalpha_step_18000.h5")
    dcs.compile("../model/DCBNN1D_cur_best.h5",
                "../model/language/SOMMalpha_step_18000.h5")
    while True:
        try:
            print(dcs.record_from_cmd(5))
        except Exception:
            print("[info*]未识别到语音")  # "no speech recognized"
Example #8
    def __init__(self, path, strip_tone=False):
        assert os.path.exists(path), "path does not exist!"
        self.path = path
        self.strip_tone = strip_tone
        self.pymap = PinyinMapper()
        self.chs_map = ChsMapper()
Example #9
    def __init__(self):
        super().__init__()
        self.chs_map = ChsMapper()
Example #10
class LanguageModel(BaseModel):
    '''Inherits from BaseModel and is used for training the language model; the main differences are the directory used by save() and the fit, predict, and test methods.'''
    def __init__(self):
        super().__init__()
        self.chs_map = ChsMapper()

    def save(self, dir_path=None, fn=None, epoch=None, step=None):
        if dir_path is None:
            dir_path = config.language_model_dir
        super().save(dir_path, fn, epoch, step)

    def fit(self,
            txt_loader: [TextLoader, TextLoader2],
            epoch=100,
            save_step=500):
        '''
        Run training with the given data generator.
        :param txt_loader: used the same way as a VoiceLoader
        :param epoch: number of epochs to train; the loop is driven manually in code rather than inside Keras. If -1, train indefinitely.
        :param save_step: number of steps per epoch; the model is saved once each epoch finishes.
        :param use_ctc: whether CTC decoding is applied to the model output at test time; this only affects testing, not training.
        :return:
        '''
        # viter = voice_loader.create_feature_iter(shuffle_set=False)

        i = self.pre_epoch
        if i == 0:
            self.save(epoch=0, step=0)
        self.test(txt_loader.choice())

        logg_plot = Lossplot(self.__class__.__name__,
                             save_dir=config.language_loss_dir)
        loss_report = LossReportor()
        time_clock = TimeClock()

        while i < epoch or epoch == -1:
            i += 1
            print(f"train epoch {i}/{epoch}.")
            self.train_model.fit_generator(
                txt_loader,
                save_step,
                callbacks=[logg_plot, time_clock, loss_report])

            self.test(txt_loader.choice())
            self.save(epoch=i, step=i * save_step)
            # self.save_loss_plot(None)  # TODO: implement loss-curve plotting, one plot per model, overwriting the previous image; saved under ./loss_plot by default

    def test(self, batch):
        # [xs, ys, label_len], placeholder = batch
        xs, ys = batch
        result = self.base_model.predict(xs)

        result = K.argmax(result)
        result = K.eval(result)

        ys = K.argmax(ys)
        ys = K.eval(ys)

        result = self.chs_map.batch_vector2chsent(result)
        ys = self.chs_map.batch_vector2chsent(ys)
        # for pre_line,true_line in zip(result,ys):
        #     print("————————————————————————")
        #     print(pre_line)
        #     print(true_line)
        #     count, count_norm = self.evaluate.compare_sent(pre_line, true_line)

        print("===================")
        all_count = 0
        all_norm = 0
        i = 0
        for pred, true in zip(result, ys):
            pred = [i for i in pred if i != "_"]
            true = [i for i in true if i != "_"]
            count, count_norm = self.evaluate.compare_sent(pred, true)

            all_count += count
            all_norm += count_norm
            i += 1
            print("".join(pred).replace("_", ""))
            print("".join(true).replace("_", ""))
            print("-------------------")
            print(
                f"[test*] compare result:{count} differences. After norm:{count_norm}. "
            )
            print("===================")
        print(f"[test*] all differences:{all_count}.Whole norm:{all_norm/i}")

        # self.chs_map.vector2chsent()

    def predict(self, batch, return_prob=False):
        xs, _ = batch
        prob = self.base_model.predict(xs)

        result = K.argmax(prob)
        result = K.eval(result)
        if return_prob:
            return self.chs_map.batch_vector2chsent(result), prob
        return self.chs_map.batch_vector2chsent(result)

    def blur_predict(self, batch):
        xs, ys = batch
        prob = self.base_model.predict(xs)

        result = K.argmax(prob)

        result = K.eval(result)

        return self.chs_map.batch_vector2chsent(result), prob

    # def _judge_blur_range(self,batch,thresh = 0.01):
    #     for sample in batch:

    def hignway_netblock(self, x, h_dim):
        # Highway-network block: output = H(x) * T(x) + x * (1 - T(x))
        H = Dense(h_dim, activation="relu")(x)  # transform path H(x)
        T = Dense(h_dim,
                  activation="sigmoid",
                  kernel_initializer=Constant(value=-1))(x)  # transform gate T(x)

        C = Lambda(lambda x: 1 - x)(T)  # carry gate C(x) = 1 - T(x)

        A = Multiply()([H, T])
        B = Multiply()([x, C])
        outputs = Add()([A, B])

        return outputs

    def parent(self, ipt, h_dim, drop_out_rate=0.5):
        emb = Dense(h_dim)(ipt)
        emb = Dropout(rate=drop_out_rate)(emb)
        emb = Dense(h_dim // 2)(emb)
        emb = Dropout(rate=drop_out_rate)(emb)
        return emb
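
The fit docstring above describes the training loop: each epoch runs save_step generator steps, the model is saved after every epoch, and epoch=-1 trains until interrupted. A minimal training sketch for a LanguageModel subclass, assembled from Example #1 (SOMMalpha, TextLoader2, PinyinMapper, and ChsMapper are the names used there; tloader is assumed to be a TextLoader2 built exactly as in that example):

pinyin_map = PinyinMapper(sil_mode=0)
chs_map = ChsMapper()

model_helper = SOMMalpha()  # a LanguageModel subclass
model_helper.compile(feature_shape=(200, ),
                     ms_pinyin_size=pinyin_map.max_index,
                     ms_output_size=chs_map.categores)

# epoch=-1 trains indefinitely; the model is saved after each epoch of save_step steps.
model_helper.fit(tloader, epoch=-1, save_step=500)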
Example #11
    def _check(self):
        '''After cleaning, a "symbol" file is created in the root directory to mark that no further cleaning is needed.'''
        symbol = os.path.join(self.path, "symbol")
        self.check = os.path.exists(symbol)
        self.pymap = PinyinMapper()
        self.chsmap = ChsMapper()
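
_check only tests whether the "symbol" marker file exists. A hypothetical counterpart (not shown in the source) that a cleaning routine could call once it finishes, so later runs skip the cleaning step:

    def _mark_cleaned(self):
        # Hypothetical helper: touch the "symbol" marker file in the dataset root
        # so that _check() reports the data as already cleaned.
        symbol = os.path.join(self.path, "symbol")
        open(symbol, "w").close()
        self.check = True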