Code Example #1
File: train_char.py Project: widdiot/E2E-ASR
def eval(epoch):

    pbar = tqdm(total=len(devset))
    losses = []
    is_new_epoch = 0
    step = 0
    while True:
        batch, is_new_epoch = devset.next()
        if is_new_epoch:
            break
        xs, ys, xlens = batch['xs'], batch['ys'], batch['xlens']
        xs = [stack_frame(x, args.n_stack, args.n_skip) for x in xs]
        xs = [np2tensor(x).float() for x in xs]
        xlen = torch.IntTensor([len(x) for x in xs])
        xs = pad_list(xs, 0.0).cuda()
        _ys = [np2tensor(np.fromiter(y, dtype=np.int64), -1) for y in ys]
        ys_out_pad = pad_list(_ys, 0).long().cuda()
        ylen = np2tensor(np.fromiter([y.size(0) for y in _ys], dtype=np.int32))
        model.eval()
        loss = model(xs, ys_out_pad, xlen, ylen)
        loss = float(loss.data) * len(xlen)
        losses.append(loss)
        step += 1  # //TODO vishay un-hardcode the batch size

        pbar.update(len(batch['xs']))
    pbar.close()

    # Reset data counters
    devset.reset()

    return sum(losses) / len(devset)  #, wer, cer
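The `stack_frame(x, n_stack, n_skip)` helper used above is not included in this listing. A minimal sketch of what a frame-stacking helper like this typically does, concatenating consecutive frames and hopping over the input (an illustrative assumption, not the project's actual implementation):

import numpy as np

def stack_frame_sketch(x, n_stack, n_skip):
    """Stack n_stack consecutive frames and hop by n_skip frames.

    Illustrative stand-in for stack_frame (assumption, not the project's code).
    x: np.ndarray of shape [T, input_dim]
    returns: np.ndarray of shape [ceil(T / n_skip), input_dim * n_stack]
    """
    T = len(x)
    stacked = []
    for t in range(0, T, n_skip):
        # pad the tail by repeating the last frame
        window = [x[min(t + i, T - 1)] for i in range(n_stack)]
        stacked.append(np.concatenate(window))
    return np.stack(stacked)

# e.g. stack_frame_sketch(np.zeros((100, 40)), 3, 3).shape == (34, 120)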
Code Example #2
File: speech2text.py Project: thanhkm/neural_sp
    def encode(self, xs, task='all', flip=False, use_cache=False, streaming=False):
        """Encode acoustic or text features.

        Args:
            xs (list): A list of length `[B]`, which contains Tensor of size `[T, input_dim]`
            task (str): all/ys*/ys_sub1*/ys_sub2*
            flip (bool): if True, flip acoustic features in the time-dimension
            use_cache (bool): use the cached forward encoder state in the previous chunk as the initial state
            streaming (bool): streaming encoding
        Returns:
            eout_dict (dict):

        """
        if self.input_type == 'speech':
            # Frame stacking
            if self.n_stacks > 1:
                xs = [stack_frame(x, self.n_stacks, self.n_skips) for x in xs]

            # Splicing
            if self.n_splices > 1:
                xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]
            xlens = torch.IntTensor([len(x) for x in xs])

            # Flip acoustic features in the reverse order
            if flip:
                xs = [torch.from_numpy(np.flip(x, axis=0).copy()).float().cuda(self.device_id) for x in xs]
            else:
                xs = [np2tensor(x, self.device_id).float() for x in xs]
            xs = pad_list(xs, 0.)

            # SpecAugment
            if self.use_specaug and self.training:
                xs = self.specaug(xs)

            # Gaussian noise injection
            if self.gaussian_noise:
                xs = add_gaussian_noise(xs)

            # Sequence summary network
            if self.ssn is not None:
                xs += self.ssn(xs, xlens)

        elif self.input_type == 'text':
            xlens = torch.IntTensor([len(x) for x in xs])
            xs = [np2tensor(np.fromiter(x, dtype=np.int64), self.device_id) for x in xs]
            xs = pad_list(xs, self.pad)
            xs = self.dropout_emb(self.embed(xs))
            # TODO(hirofumi): fix for Transformer

        # encoder
        eout_dict = self.enc(xs, xlens, task.split('.')[0], use_cache, streaming)

        if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv', 'transformer', 'conv_transformer']:
            for sub in ['sub1', 'sub2']:
                eout_dict['ys_' + sub]['xs'] = eout_dict['ys']['xs'].clone()
                eout_dict['ys_' + sub]['xlens'] = eout_dict['ys']['xlens'][:]

        return eout_dict
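`pad_list` pads a list of variable-length tensors into a single batch tensor before the encoder is called. A minimal stand-in with the same call pattern as above (illustrative assumption, not the neural_sp utility itself):

import torch

def pad_list_sketch(xs, pad_value):
    """Pad tensors of shape [T_i, *] into one tensor of shape [B, T_max, *].

    Illustrative stand-in for pad_list (assumption, not the library's code).
    """
    batch = len(xs)
    t_max = max(x.size(0) for x in xs)
    padded = xs[0].new_full((batch, t_max) + tuple(xs[0].size()[1:]), pad_value)
    for i, x in enumerate(xs):
        padded[i, :x.size(0)] = x  # the remainder keeps pad_value
    return padded

# e.g. pad_list_sketch([torch.ones(3, 2), torch.ones(5, 2)], 0.).shape == (2, 5, 2)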
Code Example #3
    def encode(self, xs, task='all', streaming=False, lookback=False, lookahead=False):
        """Encode acoustic or text features.

        Args:
            xs (list): A list of length `[B]`, which contains Tensor of size `[T, input_dim]`
            task (str): all/ys*/ys_sub1*/ys_sub2*
            streaming (bool): streaming encoding
            lookback (bool): truncate leftmost frames for lookback in CNN context
            lookahead (bool): truncate rightmost frames for lookahead in CNN context
        Returns:
            eout_dict (dict):

        """
        if self.input_type == 'speech':
            # Frame stacking
            if self.n_stacks > 1:
                xs = [stack_frame(x, self.n_stacks, self.n_skips) for x in xs]

            # Splicing
            if self.n_splices > 1:
                xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]

            xlens = torch.IntTensor([len(x) for x in xs])
            xs = pad_list([np2tensor(x, self.device).float() for x in xs], 0.)

            # SpecAugment
            if self.specaug is not None and self.training:
                xs = self.specaug(xs)

            # Weight noise injection
            if self.weight_noise_std > 0 and self.training:
                self.add_weight_noise(std=self.weight_noise_std)

            # Input Gaussian noise injection
            if self.input_noise_std > 0 and self.training:
                xs = add_input_noise(xs, std=self.input_noise_std)

            # Sequence summary network
            if self.ssn is not None:
                xs = self.ssn(xs, xlens)

        elif self.input_type == 'text':
            xlens = torch.IntTensor([len(x) for x in xs])
            xs = [np2tensor(np.fromiter(x, dtype=np.int64), self.device) for x in xs]
            xs = pad_list(xs, self.pad)
            xs = self.dropout_emb(self.embed(xs))
            # TODO(hirofumi): fix for Transformer

        # encoder
        eout_dict = self.enc(xs, xlens, task.split('.')[0], streaming, lookback, lookahead)

        if self.main_weight < 1 and self.enc_type in ['conv', 'tds', 'gated_conv']:
            for sub in ['sub1', 'sub2']:
                eout_dict['ys_' + sub]['xs'] = eout_dict['ys']['xs'].clone()
                eout_dict['ys_' + sub]['xlens'] = eout_dict['ys']['xlens'][:]

        return eout_dict
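`self.specaug` applies SpecAugment-style masking only while `self.training` is True. A simplified stand-in that zeroes random frequency and time bands of a padded batch `[B, T, F]` (illustrative assumption; the actual neural_sp module is configurable and more involved):

import torch

def specaug_sketch(xs, n_freq_masks=2, max_f=27, n_time_masks=2, max_t=100):
    """Zero out random frequency and time bands per utterance.

    Illustrative SpecAugment-style masking (assumption, not the library's code).
    xs: padded batch of shape [B, T, F]
    """
    B, T, F = xs.size()
    for b in range(B):
        for _ in range(n_freq_masks):
            f = int(torch.randint(0, max_f + 1, (1,)))
            f0 = int(torch.randint(0, max(F - f, 1), (1,)))
            xs[b, :, f0:f0 + f] = 0.
        for _ in range(n_time_masks):
            t = int(torch.randint(0, max_t + 1, (1,)))
            t0 = int(torch.randint(0, max(T - t, 1), (1,)))
            xs[b, t0:t0 + t, :] = 0.
    return xs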
Code Example #4
    def collate_fn(self, batch):

        xs = []
        xlens = []
        ys = []
        ys_hist = []
        ys_sub1 = []
        ys_sub2 = []
        utt_ids = []
        speakers = []
        sessions = []
        text = []
        for item in batch:
            xs.append(item['xs'][0])
            xlens.append(item['xlens'][0])
            ys.append(item['ys'][0])
            ys_hist.append(item['ys_hist'][0])
            ys_sub1.append(item['ys_sub1'])
            ys_sub2.append(item['ys_sub2'])
            utt_ids.append(item['utt_ids'][0])
            speakers.append(item['speakers'][0])
            sessions.append(item['sessions'][0])
            text.append(item['text'])

        if self.num_stacks > 1:
            xs = [stack_frame(x, self.num_stacks, self.num_skips) for x in xs]

        # Splicing
        if self.num_splices > 1:
            xs = [splice(x, self.num_splices, self.num_stacks) for x in xs]

        data = {
            'xs': xs,
            'xlens': xlens,
            'ys': ys,
            'ys_hist': ys_hist,
            'ys_sub1': ys_sub1,
            'ys_sub2': ys_sub2,
            'utt_ids': utt_ids,
            'speakers': speakers,
            'sessions': sessions,
            'text': text
        }

        return data
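The point of a collate_fn like this is to keep variable-length utterances as plain Python lists instead of letting DataLoader's default collation try to stack them into one tensor. A self-contained sketch of that pattern with toy names (illustrative only, not the project's classes):

import numpy as np
from torch.utils.data import DataLoader, Dataset

class ToyUtts(Dataset):
    # Toy dataset (assumption, for illustration only): items are dicts of
    # variable-length feature arrays and their lengths.
    def __len__(self):
        return 4

    def __getitem__(self, i):
        T = 20 + 5 * i  # utterances have different lengths
        return {'xs': np.random.randn(T, 80).astype(np.float32), 'xlens': T}

def list_collate(batch):
    # Keep each field as a list with one entry per utterance
    return {'xs': [item['xs'] for item in batch],
            'xlens': [item['xlens'] for item in batch]}

loader = DataLoader(ToyUtts(), batch_size=2, collate_fn=list_collate)
for data in loader:
    print(data['xlens'], [x.shape for x in data['xs']])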
Code Example #5
    def encode(self, xs, task='all', flip=False):
        """Encode acoustic or text features.

        Args:
            xs (list): A list of length `[B]`, which contains Tensor of size `[T, input_dim]`
            task (str): all or ys* or ys_sub1* or ys_sub2*
            flip (bool): if True, flip acoustic features in the time-dimension
        Returns:
            enc_outs (dict):

        """
        if 'lmobj' in task:
            eouts = {
                'ys': {
                    'xs': None,
                    'xlens': None
                },
                'ys_sub1': {
                    'xs': None,
                    'xlens': None
                },
                'ys_sub2': {
                    'xs': None,
                    'xlens': None
                }
            }
            return eouts
        else:
            if self.input_type == 'speech':
                # Frame stacking
                if self.n_stacks > 1:
                    xs = [
                        stack_frame(x, self.n_stacks, self.n_skips) for x in xs
                    ]

                # Splicing
                if self.n_splices > 1:
                    xs = [splice(x, self.n_splices, self.n_stacks) for x in xs]
                xlens = torch.IntTensor([len(x) for x in xs])

                # Flip acoustic features in the reverse order
                if flip:
                    xs = [
                        torch.from_numpy(np.flip(
                            x, axis=0).copy()).float().cuda(self.device_id)
                        for x in xs
                    ]
                else:
                    xs = [np2tensor(x, self.device_id).float() for x in xs]
                xs = pad_list(xs, 0.0)

                # SpecAugment
                if self.is_specaug and self.training:
                    xs = self.specaug(xs)

                # Gaussian noise injection
                if self.gaussian_noise:
                    xs = add_gaussian_noise(xs)

                # Sequence summary network
                if self.ssn is not None:
                    xs += self.ssn(xs, xlens)

            elif self.input_type == 'text':
                xlens = torch.IntTensor([len(x) for x in xs])
                xs = [
                    np2tensor(np.fromiter(x, dtype=np.int64), self.device_id)
                    for x in xs
                ]
                xs = pad_list(xs, self.pad)
                xs = self.embed(xs)

            # encoder
            enc_outs = self.enc(xs, xlens, task.split('.')[0])

            if self.main_weight < 1 and self.enc_type in [
                    'conv', 'tds', 'gated_conv', 'transformer',
                    'conv_transformer'
            ]:
                for sub in ['sub1', 'sub2']:
                    enc_outs['ys_' + sub]['xs'] = enc_outs['ys']['xs'].clone()
                    enc_outs['ys_' + sub]['xlens'] = enc_outs['ys']['xlens'][:]

            return enc_outs
Code Example #6
File: train_char.py Project: widdiot/E2E-ASR
def train():
    def adjust_learning_rate(optimizer, lr):
        """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
        # lr = args.lr * (0.1 ** (epoch // 30))
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

    def add_noise(x):
        dim = x.shape[-1]
        noise = torch.normal(torch.zeros(dim), 0.075)
        if x.is_cuda: noise = noise.cuda()
        x.data += noise

    prev_loss = 2000
    best_model = None
    lr = args.lr
    for epoch in range(1, args.epochs):
        totloss = 0
        losses = []
        start_time = time.time()
        # for i, (xs, ys, xlen, ylen) in enumerate(trainset):
        step = 0
        is_new_epoch = 0
        tbar = tqdm(total=len(trainset))
        while True:
            # Compute loss in the training set
            batch, is_new_epoch = trainset.next()
            if is_new_epoch:
                break
            xs, ys, xlens = batch['xs'], batch['ys'], batch['xlens']
            xs = [stack_frame(x, args.n_stack, args.n_skip) for x in xs]
            xs = [np2tensor(x).float() for x in xs]
            xlen = torch.IntTensor([len(x) for x in xs])
            xs = pad_list(xs, 0.0).cuda()
            _ys = [np2tensor(np.fromiter(y, dtype=np.int64), -1) for y in ys]
            ys_out_pad = pad_list(_ys, 0).long().cuda()
            ylen = np2tensor(
                np.fromiter([y.size(0) for y in _ys], dtype=np.int32))
            #accum_n_tokens += sum([len(y) for y in batch_train['ys']])
            if args.cuda: xs = xs.cuda()
            if args.noise: add_noise(xs)
            # Change mini-batch depending on task
            model.train()
            optimizer.zero_grad()

            loss = model(xs, ys_out_pad, xlen, ylen)
            loss.backward()
            # loss.detach()  # Truncate the graph
            loss = float(loss.data) * len(xlen)
            totloss += loss
            losses.append(loss)
            if args.gradclip:
                grad_norm = nn.utils.clip_grad_norm(model.parameters(), 200)
            optimizer.step()
            step += 1  # //TODO vishay un-hardcode the batch size
            # print(step, '/68k')
            if step % args.log_interval == 0 and step > 0:
                loss = totloss / args.batch_size / args.log_interval
                logging.info('[Epoch %d Batch %d] train_loss %.2f' %
                             (epoch + args.resume_epoch, step, loss))
                totloss = 0
            tbar.update(len(batch['xs']))
        tbar.close()
        trainset.reset()
        losses = sum(losses) / len(trainset)
        #val_l, wer, cer = eval(epoch)
        val_l = eval(epoch)
        # logging.info('[Epoch %d] time cost %.2fs, train loss %.2f; cv loss %.2f; wer %.2f ; cer %.2f ; lr %.3e' % (
        #     epoch, time.time() - start_time, losses, val_l, wer, cer, lr
        # ))
        logging.info(
            '[Epoch %d] time cost %.2fs, train loss %.2f; cv loss %.2f; lr %.3e'
            % (epoch + args.resume_epoch, time.time() - start_time, losses,
               val_l, lr))
        if val_l < prev_loss:
            prev_loss = val_l
            best_model = '{}/params_epoch{:02d}_tr{:.2f}_cv{:.2f}'.format(
                args.out, epoch + args.resume_epoch, losses, val_l)
            torch.save(model.state_dict(), best_model)
        else:
            torch.save(
                model.state_dict(),
                '{}/params_epoch{:02d}_tr{:.2f}_cv{:.2f}_rejected'.format(
                    args.out, epoch + args.resume_epoch, losses, val_l))
            model.load_state_dict(torch.load(best_model))
            if args.cuda: model.cuda()
            if args.schedule:
                lr /= 2
                adjust_learning_rate(optimizer, lr)
Code Example #7
File: eval_rnnt_wp_cpu.py Project: widdiot/E2E-ASR
def eval(epoch):
    recog_dir = args.out
    ref_trn_save_path = recog_dir + '/ref_epoch_' + str(epoch) + '.trn'
    hyp_trn_save_path = recog_dir + '/hyp_epoch_' + str(epoch) + '.trn'
    wer, cer = 0, 0
    n_sub_w, n_ins_w, n_del_w = 0, 0, 0
    n_sub_c, n_ins_c, n_del_c = 0, 0, 0
    n_word, n_char = 0, 0
    pbar = tqdm(total=len(evalset))
    f_hyp = open(hyp_trn_save_path, 'w')
    f_ref = open(ref_trn_save_path, 'w')
    losses = []
    is_new_epoch = 0
    #    for xs, ys, xlen, ylen in devset:
    step = 0
    while True:
        batch, is_new_epoch = evalset.next()
        #        if is_new_epoch:
        #            break
        xs, ys, xlens = batch['xs'], batch['ys'], batch['xlens']
        xs = [stack_frame(x, args.n_stack, args.n_skip) for x in xs]
        xs = [np2tensor(x).float() for x in xs]
        xlen = torch.IntTensor([len(x) for x in xs])
        xs = pad_list(xs, 0.0)
        _ys = [np2tensor(np.fromiter(y, dtype=np.int64), -1) for y in ys]
        ys_out_pad = pad_list(_ys, 0).long()
        ylen = np2tensor(np.fromiter([y.size(0) for y in _ys], dtype=np.int32))
        # xs = Variable(torch.FloatTensor(xs), volatile=True).cuda()
        # ys = Variable(torch.LongTensor(ys), volatile=True).cuda()
        # xlen = Variable(torch.IntTensor(xlen)); ylen = Variable(torch.IntTensor(ylen))
        model.eval()
        #logging.info('================== Evaluation Mode =================')
        loss = model(xs, ys_out_pad, xlen, ylen)
        loss = float(loss.data) * len(xlen)
        losses.append(loss)
        step += 1  # //TODO vishay un-hardcode the batch size
        best_hyps_id, _ = model.greedy_decode(xs)
        # print(batch['text'],len(batch['xs']))
        for b in range(len(batch['xs'])):
            ref = batch['text'][b]
            hyp = evalset.idx2token[0](best_hyps_id[b])
            # hyp = removeDuplicates(hyp)
            # Write to trn
            utt_id = str(batch['utt_ids'][b])
            speaker = str(batch['speakers'][b]).replace('-', '_')
            if hyp is None:
                hyp = "none"
            f_ref.write(ref + ' (' + speaker + '-' + utt_id + ')\n')
            f_hyp.write(hyp + ' (' + speaker + '-' + utt_id + ')\n')
            #            logging.info('utt-id: %s' % utt_id)
            #            logging.info('Ref: %s' % ref)
            #            logging.info('Hyp: %s' % hyp)
            #            logging.info('-' * 150)

            # if 'char' in devset.unit:  # //TODO this is only for char unit
            # Compute WER
            wer_b, sub_b, ins_b, del_b = compute_wer(ref=ref.split(' '),
                                                     hyp=hyp.split(' '),
                                                     normalize=False)
            wer += wer_b
            n_sub_w += sub_b
            n_ins_w += ins_b
            n_del_w += del_b
            n_word += len(ref.split(' '))

            # Compute CER
            cer_b, sub_b, ins_b, del_b = compute_wer(ref=list(ref),
                                                     hyp=list(hyp),
                                                     normalize=False)
            cer += cer_b
            n_sub_c += sub_b
            n_ins_c += ins_b
            n_del_c += del_b
            n_char += len(ref)

        pbar.update(len(batch['xs']))
        if is_new_epoch:
            break

    pbar.close()

    # Reset data counters
    evalset.reset()

    wer /= n_word
    n_sub_w /= n_word
    n_ins_w /= n_word
    n_del_w /= n_word

    cer /= n_char
    n_sub_c /= n_char
    n_ins_c /= n_char
    n_del_c /= n_char

    logging.info('WER (%s): %.2f %%' % (evalset.set, wer))
    logging.info('SUB: %.2f / INS: %.2f / DEL: %.2f' %
                 (n_sub_w, n_ins_w, n_del_w))
    logging.info('CER (%s): %.2f %%' % (evalset.set, cer))
    logging.info('SUB: %.2f / INS: %.2f / DEL: %.2f' %
                 (n_sub_c, n_ins_c, n_del_c))

    # print(step, '/12k  dev')
    return sum(losses) / len(evalset), wer, cer
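`compute_wer` here returns the raw edit-distance error count together with the substitution/insertion/deletion counts, which the loop accumulates and finally normalizes by the number of reference words (or characters). A minimal sketch of such a helper (assumption; neural_sp's actual utility may differ in details):

import numpy as np

def compute_wer_sketch(ref, hyp, normalize=False):
    """Edit distance between token lists; returns (errors, n_sub, n_ins, n_del).

    Illustrative stand-in for compute_wer (assumption, not the library's code).
    """
    n, m = len(ref), len(hyp)
    # dp[i][j] = minimum edits to turn ref[:i] into hyp[:j]
    dp = np.zeros((n + 1, m + 1), dtype=np.int64)
    dp[:, 0] = np.arange(n + 1)  # all deletions
    dp[0, :] = np.arange(m + 1)  # all insertions
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dp[i, j] = min(dp[i - 1, j - 1] + cost,  # match / substitution
                           dp[i - 1, j] + 1,         # deletion
                           dp[i, j - 1] + 1)         # insertion
    # Backtrace to split the errors by type
    n_sub = n_ins = n_del = 0
    i, j = n, m
    while i > 0 or j > 0:
        if i > 0 and j > 0 and dp[i, j] == dp[i - 1, j - 1] + int(ref[i - 1] != hyp[j - 1]):
            n_sub += int(ref[i - 1] != hyp[j - 1])
            i, j = i - 1, j - 1
        elif i > 0 and dp[i, j] == dp[i - 1, j] + 1:
            n_del += 1
            i -= 1
        else:
            n_ins += 1
            j -= 1
    err = int(dp[n, m])
    if normalize:
        return err * 100 / max(n, 1), n_sub, n_ins, n_del
    return err, n_sub, n_ins, n_del

# e.g. compute_wer_sketch(['a', 'b', 'c'], ['a', 'x', 'c', 'd']) == (2, 1, 1, 0)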