Example #1
def eval(model, dataloader):
    model.eval()
    decoder = GreedyDecoder(dataloader.dataset.labels_str)
    ctcloss = CTCLoss(size_average=True)
    cer = 0
    epoch_loss = 0
    print("decoding")
    with torch.no_grad():
        for i, (x, y, x_lens, y_lens) in tqdm(enumerate(dataloader)):
            x = x.to(device)
            outs, out_lens = model(x, x_lens)
            loss = ctcloss(
                outs.transpose(0, 1).transpose(0, 2), y, out_lens, y_lens)
            epoch_loss += loss.item()
            outs = F.softmax(outs, 1)
            outs = outs.transpose(1, 2)
            ys = []
            offset = 0
            for y_len in y_lens:
                ys.append(y[offset:offset + y_len])
                offset += y_len
            out_strings, out_offsets = decoder.decode(outs, out_lens)
            y_strings = decoder.convert_to_strings(ys)
            for pred, truth in zip(out_strings, y_strings):
                trans, ref = pred[0], truth[0]
                #if len(ref) == 0 : print("ref:", ref, y_strings)
                cer += decoder.cer(trans, ref) / float(len(ref))
        cer /= len(dataloader.dataset)
        epoch_loss /= i + 1
    model.train()
    return cer, epoch_loss
Example #2
def eval(model, dataloader):
    model.eval()
    decoder = GreedyDecoder(dataloader.dataset.labels_str)
    cer = 0
    print("decoding")
    with torch.no_grad():
        for i, (x, y, x_lens, y_lens) in tqdm(enumerate(dataloader)):
            # x = x.to("cuda")
            x = x.to(device)
            outs, out_lens = model(x, x_lens)
            outs = F.softmax(outs, 1)
            outs = outs.transpose(1, 2)
            ys = []
            offset = 0
            for y_len in y_lens:
                ys.append(y[offset : offset + y_len])
                offset += y_len
            out_strings, out_offsets = decoder.decode(outs, out_lens)
            y_strings = decoder.convert_to_strings(ys)
            for pred, truth in zip(out_strings, y_strings):
                trans, ref = pred[0], truth[0]
                cer += decoder.cer(trans, ref) / float(len(ref))
        cer /= len(dataloader.dataset)
    model.train()
    return cer
Example #3
def eval(model,
         dataloader,
         device,
         save_output=None,
         lm_alpha=None,
         lm_beta=None):
    model.eval()
    ae_decoder = GreedyDecoder(dataloader.dataset.labels_str)

    global decoder
    if lm_alpha is not None or lm_beta is not None:
        decoder._decoder.reset_params(lm_alpha, lm_beta)

    # from warpctc_pytorch import CTCLoss

    # ctcloss = CTCLoss(size_average=True)
    cer = 0
    epoch_loss = 0
    output_data = []
    print("decoding")
    with torch.no_grad():
        for i, (x, y, x_lens, y_lens) in tqdm(enumerate(dataloader)):
            x = x.to(device)
            outs, out_lens = model(x, x_lens)
            # loss = ctcloss(outs.transpose(0, 1).transpose(0, 2), y, out_lens, y_lens)
            # epoch_loss += loss.item()
            outs = F.softmax(outs, 1)
            outs = outs.transpose(1, 2)
            ys = []
            offset = 0
            for y_len in y_lens:
                ys.append(y[offset:offset + y_len])
                offset += y_len
            out_strings, out_offsets = decoder.decode(outs, out_lens)
            y_strings = ae_decoder.convert_to_strings(ys)

            if save_output is not None:
                # add output to data array, and continue
                output_data.append(
                    (outs.cpu().numpy(), out_lens.numpy(), y_strings))

            for pred, truth in zip(out_strings, y_strings):
                trans, ref = pred[0], truth[0]
                cer += decoder.cer(trans, ref) / float(len(ref))
        cer /= len(dataloader.dataset)
        epoch_loss /= i + 1
    print("cer:{}, epoch_loss:{}".format(cer, epoch_loss))

    if save_output is not None:
        np.save(save_output, output_data)

    return cer, epoch_loss
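A minimal sketch of how this eval() might be driven from a training script (model, val_loader, and device are assumed to come from the surrounding setup; the output path is a placeholder):

cer, epoch_loss = eval(model, val_loader, device, save_output='eval_logits.npy')
print('validation CER: {:.4f}'.format(cer))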
Example #4
    def __init__(self, model_path):
        """
        :param model_path: path to a trained DeepSpeech model checkpoint
        """
        assert os.path.exists(model_path), "Cannot find model here {}".format(
            model_path)
        self.deep_speech_model = DeepSpeech.load_model(model_path)
        self.deep_speech_model.eval()
        labels = DeepSpeech.get_labels(self.deep_speech_model)
        self.audio_conf = DeepSpeech.get_audio_conf(self.deep_speech_model)
        self.decoder = GreedyDecoder(labels)
        self.parser = SpectrogramParser(self.audio_conf, normalize=True)
Example #5
    def __init__(self, config, data_loader=None):
        self.config = config
        self.data_loader = data_loader  # needed for VAE

        self.lr = config.lr
        self.beta1 = config.beta1
        self.beta2 = config.beta2
        self.optimizer = config.optimizer
        self.batch_size = config.batch_size

        self.diffLoss = L1Loss_mask()  # custom module

        self.valmin_iter = 0
        self.model_dir = 'logs/' + str(config.expnum)
        self.savename_G = ''
        self.savename_D = ''
        self.savename_ASR = ''

        self.kt = 0  # used for Proportional Control Theory in BEGAN, initialized as 0
        self.lb = self.config.lambda_k
        self.gamma = self.config.gamma
        self.conv_measure = 0  # convergence measure

        self.ctc_tr = AverageMeter()
        self.ctc_tr_local = AverageMeter()
        self.ctc_val = AverageMeter()
        self.adv_ny_tr = AverageMeter()
        self.adv_ny_val = AverageMeter()
        self.wer_tr = AverageMeter()
        self.wer_val = AverageMeter()
        self.cer_tr = AverageMeter()
        self.cer_val = AverageMeter()

        self.CTCLoss = CTCLoss()
        self.decoder = GreedyDecoder(data_loader.labels)

        self.build_model()
        self.G.loss_stop = 100000
        #self.get_weight_statistic()

        if self.config.gpu >= 0:
            self.G.cuda()
            self.D.cuda()
            self.diffLoss.cuda()
            self.ASR.cuda()

        if len(self.config.load_path) > 0:
            self.load_model()

        if config.mode == 'train':
            self.logFile = open(self.model_dir + '/log.txt', 'w')
Example #6
def main():
    import argparse
    global model, spect_parser, decoder, args
    parser = argparse.ArgumentParser(description='DeepSpeech transcription server')
    parser.add_argument('--host', type=str, default='0.0.0.0', help='Host to be used by the server')
    parser.add_argument('--port', type=int, default=8888, help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

    spect_parser = SpectrogramParser(audio_conf, normalize=True)
    logging.info('Server initialised')
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
Example #7
class SpeechRecognizer(object):
    def __init__(self, config_path='config.ini'):
        if config_path is None:
            raise Exception('Path to config file is None')
        self.config = configparser.ConfigParser()
        self.config.read(config_path, encoding='UTF-8')
        self.labels = self.config['Wav2Letter']['labels'][1:-1]
        self.sample_rate = int(self.config['Wav2Letter']['sample_rate'])
        self.window_size = float(self.config['Wav2Letter']['window_size'])
        self.window_stride = float(self.config['Wav2Letter']['window_stride'])
        self.greedy = int(self.config['Wav2Letter']['greedy'])
        self.cpu = int(self.config['Wav2Letter']['cpu'])

        if self.cpu:
            from PuzzleLib import Config
            Config.backend = Config.Backend.cpu

        from PuzzleLib.Models.Nets.WaveToLetter import loadW2L
        from PuzzleLib.Modules import MoveAxis

        nfft = int(self.sample_rate * self.window_size)
        self.w2l = loadW2L(modelpath=self.config['Wav2Letter']['model_path'], inmaps=(1 + nfft // 2),
                           nlabels=len(self.labels))
        self.w2l.append(MoveAxis(src=2, dst=0))

        if not self.cpu:
            self.w2l.calcMode(np.float16)

        self.w2l.evalMode()

        if not self.greedy:
            from decoder import TrieDecoder
            lexicon = self.config['Wav2Letter']['lexicon']
            tokens = self.config['Wav2Letter']['tokens']
            lm_path = self.config['Wav2Letter']['lm_path']
            beam_threshold = float(self.config['Wav2Letter']['beam_threshold'])
            self.decoder = TrieDecoder(lexicon, tokens, lm_path, beam_threshold)
        else:
            self.decoder = GreedyDecoder(self.labels)

    def recognize(self, audio_path):
        preprocessed_audio = preprocess(audio_path, self.sample_rate, self.window_size, self.window_stride)
        if self.cpu:
            from PuzzleLib.CPU.CPUArray import CPUArray
            inputs = CPUArray.toDevice(np.array([preprocessed_audio]).astype(np.float32))
        else:
            from PuzzleLib.Backend import gpuarray
            inputs = gpuarray.to_gpu(np.array([preprocessed_audio]).astype(np.float16))

        output = self.w2l(inputs).get()
        output = np.vstack(output).astype(np.float32)
        result = self.decoder.decode(output)

        if not self.cpu:
            from PuzzleLib.Backend.gpuarray import memoryPool
            memoryPool.freeHeld()

        del inputs, output

        return result
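A sketch of typical use, with placeholder paths:

recognizer = SpeechRecognizer(config_path='config.ini')
print(recognizer.recognize('sample.wav'))  # 'sample.wav' is a placeholder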
Example #8
    def __init__(self, lr1, num_iterations1, lr2, num_iterations2, batch_size,
                 l2penalty):

        self.lr1 = lr1
        self.num_iterations1 = num_iterations1
        self.lr2 = lr2
        self.num_iterations2 = num_iterations2
        self.batch_size = batch_size
        self.l2penalty = l2penalty

        with open('labels.json') as label_file:
            self.labels = str(''.join(json.load(label_file)))
        self.labels_map = dict([(self.labels[i], i)
                                for i in range(len(self.labels))])
        self.decoder = GreedyDecoder(self.labels,
                                     blank_index=self.labels.index('_'))
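For this snippet to work, labels.json is assumed to hold a JSON array of single characters whose concatenation forms the label string, with '_' as the CTC blank, e.g. (truncated for illustration):

["_", "'", "A", "B", "C", " "]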
Example #9
def acc(model):
    decoder = GreedyDecoder()
    train_loader = SpeechDataloader(SpeechDataset('uf.csv'), batch_size=16)
    train_acc = evaluate(model, train_loader, decoder)
    test_loader = SpeechDataloader(SpeechDataset('test.csv'), batch_size=16)
    test_acc = evaluate(model, test_loader, decoder)
    return train_acc, test_acc
Example #10
def training_loop(model, kwargs, train_dataset, train_batch_loader, eval_dataset):
    device = 'cuda:0' if torch.cuda.is_available() and kwargs['cuda'] else 'cpu'
    model.to(device)
    greedy_decoder = GreedyDecoder(model.labels)
    criterion = nn.CTCLoss(blank=0,reduction='none')
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters,lr=kwargs['lr'],momentum=kwargs['momentum'],nesterov=True,weight_decay=1e-5)
    scaling_factor = model.get_scaling_factor()
    epochs=kwargs['epochs']
    print('Train dataset size:%d' % len(train_dataset))
    batch_count = math.ceil(len(train_dataset) / kwargs['batch_size'])
    for epoch in range(epochs):
        with timing.EpochTimer(epoch,_log_to_tensorboard) as et:
            model.train()
            total_loss = 0
            for idx, data in et.across_epoch('Data Loading time', tqdm.tqdm(enumerate(train_batch_loader),total=batch_count)):
                inputs, input_lengths, targets, target_lengths, file_paths, texts = data
                with et.timed_action('Model execution time'):
                    out = model(torch.FloatTensor(inputs).to(device))
                out = out.transpose(1,0)
                output_lengths = [l // scaling_factor for l in input_lengths]
                with et.timed_action('Loss and BP time'):
                    loss = criterion(out, targets.to(device), torch.IntTensor(output_lengths), torch.IntTensor(target_lengths))
                    optimizer.zero_grad()
                    loss.mean().backward()
                    optimizer.step()
                total_loss += loss.mean().item()
            log_loss_to_tensorboard(epoch, total_loss / batch_count)
            evaluate(model,eval_dataset,greedy_decoder,epoch,kwargs)
            if epoch != 0 and epoch % kwargs['epochs_per_save'] == 0 :
                save_epoch_model(model,epoch, kwargs['model_dir'])
    if kwargs['model_dir']:
        save_model(model, kwargs['model_dir']+'/final.pth')
    print('Finished at %s' % time.asctime())
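Note that torch.nn.CTCLoss expects log-probabilities of shape (T, N, C); a minimal standalone sketch of the canonical call (shapes and names here are illustrative, not taken from the snippet above):

import torch
import torch.nn.functional as F

T, N, C = 50, 4, 29  # time steps, batch size, number of classes (incl. blank)
log_probs = F.log_softmax(torch.randn(T, N, C), dim=-1)  # CTCLoss wants log-probs
targets = torch.randint(1, C, (N, 10), dtype=torch.long)  # class 0 is the blank
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), 10, dtype=torch.long)
loss = torch.nn.CTCLoss(blank=0)(log_probs, targets, input_lengths, target_lengths)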
Example #11
    def an4_prepare(self):
        from audio_data.data_loader import AudioDataLoader, SpectrogramDataset, BucketingSampler, DistributedBucketingSampler
        from decoder import GreedyDecoder
        audio_conf = self.ext['audio_conf']
        labels = self.ext['labels']
        train_manifest = os.path.join(self.data_dir, 'an4_train_manifest.csv')
        val_manifest = os.path.join(self.data_dir, 'an4_val_manifest.csv')


        with open('labels.json') as label_file:
            labels = str(''.join(json.load(label_file)))
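        # note: labels.json above overrides the labels taken from self.ext['labels']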
        trainset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=train_manifest, labels=labels, normalize=True, augment=True)
        self.trainset = trainset
        testset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=val_manifest, labels=labels, normalize=True, augment=False)
        self.testset = testset

        if self.nworkers > 1:
            train_sampler = DistributedBucketingSampler(self.trainset, batch_size=self.batch_size, num_replicas=self.nworkers, rank=self.rank)
        else:
            train_sampler = BucketingSampler(self.trainset, batch_size=self.batch_size)

        self.train_sampler = train_sampler
        trainloader = AudioDataLoader(self.trainset, num_workers=4, batch_sampler=self.train_sampler)
        testloader = AudioDataLoader(self.testset, batch_size=self.batch_size,
                                  num_workers=4)
        self.trainloader = trainloader
        self.testloader = testloader
        decoder = GreedyDecoder(labels)
        self.decoder = decoder
Example #12
class SpeechTranscriber:
    def __init__(self, model_path):
        """
        :param model_path: path to a trained DeepSpeech model checkpoint
        """
        assert os.path.exists(model_path), "Cannot find model here {}".format(
            model_path)
        self.deep_speech_model = DeepSpeech.load_model(model_path)
        self.deep_speech_model.eval()
        labels = DeepSpeech.get_labels(self.deep_speech_model)
        self.audio_conf = DeepSpeech.get_audio_conf(self.deep_speech_model)
        self.decoder = GreedyDecoder(labels)
        self.parser = SpectrogramParser(self.audio_conf, normalize=True)

    def transcribe(self, audio_file):
        """
        :param audio_file: path to the audio file to transcribe
        :return: decoded transcription candidates from the greedy decoder
        """
        spect = self.parser.parse_audio(audio_file).contiguous()
        spect = spect.view(1, 1, spect.size(0), spect.size(1))
        out = self.deep_speech_model(Variable(spect, volatile=True))
        out = out.transpose(0, 1)  # TxNxH
        decoded_output = self.decoder.decode(out.data)
        return decoded_output
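A sketch of typical use (paths are placeholders):

transcriber = SpeechTranscriber('models/deepspeech_final.pth')
print(transcriber.transcribe('sample.wav'))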
Example #13
def decode_dataset(logits, test_dataset, batch_size, lm_alpha, lm_beta, mesh_x,
                   mesh_y, labels, grid_index):
    print("Beginning decode for {}, {}".format(lm_alpha, lm_beta))
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=batch_size,
                                  num_workers=0)
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    decoder = BeamCTCDecoder(labels,
                             beam_width=args.beam_width,
                             cutoff_top_n=args.cutoff_top_n,
                             blank_index=labels.index('_'),
                             lm_path=args.lm_path,
                             alpha=lm_alpha,
                             beta=lm_beta,
                             num_processes=1)
    total_cer, total_wer = 0, 0
    for i, (data) in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes = data

        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size

        out = torch.from_numpy(logits[i][0])
        sizes = torch.from_numpy(logits[i][1])

        decoded_output, _, = decoder.decode(out, sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)
        wer, cer = 0, 0
        for x in range(len(target_strings)):
            transcript, reference = decoded_output[x][0], target_strings[x][0]
            wer_inst = decoder.wer(transcript, reference) / float(
                len(reference.split()))
            cer_inst = decoder.cer(transcript, reference) / float(
                len(reference))
            wer += wer_inst
            cer += cer_inst
        total_cer += cer
        total_wer += wer

    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)

    return [grid_index, mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]
Example #14
def test_beam_is_not_greedy():
    '''
    Example from https://towardsdatascience.com/beam-search-decoding-in-ctc-trained-neural-networks-51889a3d85a7
    Shows that beam search can find a path that greedy decoding cannot.
    '''
    labels = ['_', 'A', 'B', ' ']
    samples = np.array([[0.8, 0.2, 0, 0], [0.6, 0.4, 0, 0]])
    res = prefix_beam_search(samples,
                             labels,
                             blank_index=0,
                             return_weights=True)
    assert res == ('A', 0.52)

    greedy_decoder = GreedyDecoder(labels, blank_index=0)
    greedy_res = greedy_decoder.decode(torch.FloatTensor(samples).unsqueeze(0),
                                       sizes=None)
    assert greedy_res == ['']
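The numbers behind the assertions: greedy takes the blank at both time steps (0.8, then 0.6) and so emits the empty string, while the three alignments 'A_', '_A', and 'AA' all collapse to 'A' and their probabilities sum to 0.52:

p_empty = 0.8 * 0.6                        # greedy's path: blank, blank -> 0.48
p_A = 0.2 * 0.6 + 0.8 * 0.4 + 0.2 * 0.4    # 'A_' + '_A' + 'AA'        -> 0.52
assert p_A > p_empty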
Example #15
def get_decoder(decoder_type, lm_path, labels, beam_search_params):
    if decoder_type == 'beam':
        decoder = PrefixBeamSearchLMDecoder(lm_path,labels,**beam_search_params)
    else:
        if not decoder_type == 'greedy':
            print ('Decoder type not recognized, defaulting to greedy')
        decoder = GreedyDecoder(labels)
    return decoder
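A minimal call, with a placeholder label set:

decoder = get_decoder('greedy', lm_path=None, labels="_'AB ", beam_search_params={})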
Example #16
def main(model_path, confs): 
    model, __ = MultiTask.load_model(model_path)
    if confs['cuda']:
        model = model.cuda()
    
    
    if not model._meta['use_transcripts_out']: # only accent classification
        criterion = nn.CrossEntropyLoss()
    elif not model._meta['use_accents_out']: # only text recognition
        criterion = CTCLoss()
    else: # both tasks
        criterion = (CTCLoss(), nn.CrossEntropyLoss())
        
    
    # Results
    results = {}
    for manifest, lm in confs['testing_manifests']:
        eprint(f'\n### Testing {manifest.split("/")[-1]} for model {Path(model_path).stem.split("_")[0]}')
        
        # Decoder
        if model._meta['use_transcripts_out']:
            decoder = BeamCTCDecoder(confs['labels'], 
                                     lm_path=lm,
                                     alpha=confs['decoder_alpha'], 
                                     beta=confs['decoder_beta'],
                                     cutoff_top_n=confs['decoder_cutoff_top_n'],
                                     cutoff_prob=confs['decoder_cutoff_prob'],  # assumed key; original read decoder_cutoff_top_n, likely a copy-paste slip
                                     beam_width=confs['decoder_beam_width'], 
                                     num_processes=confs['num_workers'])

            target_decoder = GreedyDecoder(confs['labels'])
        else:
            decoder, target_decoder = None, None
        
        # Test
        results[manifest.split('/')[-1]] = result_for_manifest(model, criterion, manifest, decoder, target_decoder, confs['batch_size'], confs['num_workers'])
        
        
    if not PRINT_LATEX_TABLE:
        print(f'Model: {model_path.split("/")[-1]}')
        for name, res in results.items():
            print(f'\nResults for {name}:')
            print('; '.join([f'{k}: {v:.3f}' for k, v in res.items()]))
    else:
        print(' & '.join(['model']+list([k[:-4] for k in results.keys()])))
        val_dict = {}
        for k in list(results.values())[0].keys():
            val_dict[k] = []
        for res in results.values():
            [val_dict[k].append(f'{v:.1f}') for k, v in res.items()]
        for val in val_dict.values():
            print(' & '.join([Path(model_path).stem.split('_')[0]]+val)+r' \\')
Example #17
 def __init__(self,
              extractor,
              needs=None,
              store=False,
              key=None,
              **extractor_args):
     super(TextFeature, self).__init__(extractor,
                                       needs=needs,
                                       store=store,
                                       encoder=TextEncoder,
                                       decoder=GreedyDecoder(),
                                       key=key,
                                       **extractor_args)
Example #18
 def __init__(self, purge=True):
     self.facedetector = dlib.get_frontal_face_detector()
     self.facepredictor = dlib.shape_predictor(
         talkPredictor.FACE_MODEL_PATH)
     self.start_time = int(round(time.time() * 1000))
     self.log = pd.DataFrame(data=[], columns=['ts', 'key', 'value'])
     self.log.set_index(['ts', 'key'])
     self.purge = purge
     self.procdevice = torch.device(
         'cuda' if torch.cuda.is_available() else 'cpu')
     self.talkmodel = Model.load_model(
         talkPredictor.RNN_MODEL_PATH)  # , map_location='cpu'
     self.talkmodel.eval()
     self.talklabels = Model.get_labels(self.talkmodel)
     self.talkdecoder = GreedyDecoder(
         self.talklabels, blank_index=self.talklabels.index('_'))
     self.audio_conf = Model.get_audio_conf(self.talkmodel)
     self.samplerate = 16000
     self.framerate = None
     self.video_queue = {}
     self.audio_queue = []
     self.pred_queue = []
Example #19
    def __init__(self, config_path='config.ini'):
        if config_path is None:
            raise Exception('Path to config file is None')
        self.config = configparser.ConfigParser()
        self.config.read(config_path, encoding='UTF-8')
        self.labels = self.config['Wav2Letter']['labels'][1:-1]
        self.sample_rate = int(self.config['Wav2Letter']['sample_rate'])
        self.window_size = float(self.config['Wav2Letter']['window_size'])
        self.window_stride = float(self.config['Wav2Letter']['window_stride'])
        self.greedy = int(self.config['Wav2Letter']['greedy'])
        self.cpu = int(self.config['Wav2Letter']['cpu'])

        if self.cpu:
            from PuzzleLib import Config
            Config.backend = Config.Backend.cpu

        from PuzzleLib.Models.Nets.WaveToLetter import loadW2L
        from PuzzleLib.Modules import MoveAxis

        nfft = int(self.sample_rate * self.window_size)
        self.w2l = loadW2L(modelpath=self.config['Wav2Letter']['model_path'], inmaps=(1 + nfft // 2),
                           nlabels=len(self.labels))
        self.w2l.append(MoveAxis(src=2, dst=0))

        if not self.cpu:
            self.w2l.calcMode(np.float16)

        self.w2l.evalMode()

        if not self.greedy:
            from decoder import TrieDecoder
            lexicon = self.config['Wav2Letter']['lexicon']
            tokens = self.config['Wav2Letter']['tokens']
            lm_path = self.config['Wav2Letter']['lm_path']
            beam_threshold = float(self.config['Wav2Letter']['beam_threshold'])
            self.decoder = TrieDecoder(lexicon, tokens, lm_path, beam_threshold)
        else:
            self.decoder = GreedyDecoder(self.labels)
Example #20
def build():
    global model, spect_parser, decoder, device

    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    torch.set_grad_enabled(False)
    device = torch.device("cpu")
    model = load_model(device, "/workspace/models/deepspeech_final.pth", False)

    decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))

    spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
    logging.info('Server initialised')
Example #21
def decode_dataset(logits, test_dataset, batch_size, lm_alpha, lm_beta, mesh_x, mesh_y, labels, grid_index):
    print("Beginning decode for {}, {}".format(lm_alpha, lm_beta))
    test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, num_workers=0)
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, cutoff_top_n=args.cutoff_top_n,
                             blank_index=labels.index('_'), lm_path=args.lm_path,
                             alpha=lm_alpha, beta=lm_beta, num_processes=1)
    total_cer, total_wer = 0, 0
    for i, (data) in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes = data

        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size

        out = torch.from_numpy(logits[i][0])
        sizes = torch.from_numpy(logits[i][1])

        decoded_output, _, = decoder.decode(out, sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)
        wer, cer = 0, 0
        for x in range(len(target_strings)):
            transcript, reference = decoded_output[x][0], target_strings[x][0]
            wer_inst = decoder.wer(transcript, reference) / float(len(reference.split()))
            cer_inst = decoder.cer(transcript, reference) / float(len(reference))
            wer += wer_inst
            cer += cer_inst
        total_cer += cer
        total_wer += wer

    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)

    return [grid_index, mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]
Example #22
def main():
    import argparse
    global model, spect_parser, decoder, args, device, decompressor

    parser = argparse.ArgumentParser(
        description='DeepSpeech transcription server')
    parser.add_argument('--host',
                        type=str,
                        default='0.0.0.0',
                        help='Host to be used by the server')
    parser.add_argument('--port',
                        type=int,
                        default=8888,
                        help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(model.labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index('_'))

    spect_parser = OnlineSpectrogramParser(model.audio_conf, normalize=True)
    logging.info('Server initialised')

    decompressor = LZString()

    server = WebsocketServer(host=args.host, port=args.port)
    server.set_fn_new_client(new_client)
    server.set_fn_client_left(client_left)
    server.set_fn_message_received(message_received)
    server.run_forever()
Example #23
def main():
    import argparse

    global model, spect_parser, decoder, args, device
    parser = argparse.ArgumentParser(
        description="DeepSpeech transcription server")
    parser.add_argument("--host",
                        type=str,
                        default="0.0.0.0",
                        help="Host to be used by the server")
    parser.add_argument("--port",
                        type=int,
                        default=8888,
                        help="Port to be used by the server")
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info("Setting up server...")
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.cuda)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(
            model.labels,
            lm_path=args.lm_path,
            alpha=args.alpha,
            beta=args.beta,
            cutoff_top_n=args.cutoff_top_n,
            cutoff_prob=args.cutoff_prob,
            beam_width=args.beam_width,
            num_processes=args.lm_workers,
        )
    else:
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index("_"))

    spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
    logging.info("Server initialised")
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
Example #24
    def __init__(self, *args, **kwargs):
        super(Tester, self).__init__(*args, **kwargs)
        device = 'cpu'
        model_path = '/home/chris/git/deepspeech.pytorch/models/ted_pretrained_v2.pth'
        half = False
        model = load_model(device, model_path, half)

        # if args.decoder == "beam":
            # from decoder import BeamCTCDecoder
            # decoder = BeamCTCDecoder(model.labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                     # cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                     # beam_width=args.beam_width, num_processes=args.lm_workers)
        # else:
        decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))

        self.half = half
        self.device = device
        self.decoder = decoder
        self.model = model
        self.spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
Example #25
                          noise_prob=args.noise_prob,
                          noise_levels=(args.noise_min, args.noise_max))

        rnn_type = args.rnn_type.lower()
        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
        model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                           nb_layers=args.hidden_layers,
                           labels=labels,
                           rnn_type=supported_rnns[rnn_type],
                           audio_conf=audio_conf,
                           bidirectional=args.bidirectional)
        parameters = model.parameters()
        optimizer = torch.optim.SGD(parameters, lr=args.lr,
                                    momentum=args.momentum, nesterov=True)
    criterion = CTCLoss()
    decoder = GreedyDecoder(labels)
    train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels,
                                       normalize=True, augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, labels=labels,
                                      normalize=True, augment=False)
    if not args.distributed:
        train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size)
    else:
        train_sampler = DistributedBucketingSampler(train_dataset, batch_size=args.batch_size,
                                                    num_replicas=args.world_size, rank=args.rank)
    train_loader = AudioDataLoader(train_dataset,
                                   num_workers=args.num_workers, batch_sampler=train_sampler)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    if (not args.no_shuffle and start_epoch != 0) or args.no_sorta_grad:
Example #26
                          window_stride=args.window_stride,
                          window=args.window,
                          noise_dir=args.noise_dir,
                          noise_prob=args.noise_prob,
                          noise_levels=(args.noise_min, args.noise_max))

        rnn_type = args.rnn_type.lower()
        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
        model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                           nb_layers=args.hidden_layers,
                           labels=labels,
                           rnn_type=supported_rnns[rnn_type],
                           audio_conf=audio_conf,
                           bidirectional=args.bidirectional)

    decoder = GreedyDecoder(labels)
    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=args.train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    if not args.distributed:
        train_sampler = BucketingSampler(train_dataset,
                                         batch_size=args.batch_size)
    else:
        train_sampler = DistributedBucketingSampler(
Example #27
def main():
    args = parser.parse_args()
    cf = ConfigParser.ConfigParser()
    try:
        cf.read(args.conf)
    except:
        print("conf file not exists")
        sys.exit(1)
    USE_CUDA = cf.getboolean('Training', 'use_cuda')
    try:
        seed = long(cf.get('Training', 'seed'))
    except:
        seed = torch.cuda.initial_seed()
        cf.set('Training', 'seed', seed)
        cf.write(open(args.conf, 'w'))

    torch.manual_seed(seed)
    if USE_CUDA:
        torch.cuda.manual_seed(seed)

    log_dir = cf.get('Data', 'log_dir')
    log_file = os.path.join(log_dir, cf.get('Data', 'log_file'))
    logger = init_logger(log_file)

    # Define Model
    rnn_input_size = cf.getint('Model', 'rnn_input_size')
    rnn_hidden_size = cf.getint('Model', 'rnn_hidden_size')
    rnn_layers = cf.getint('Model', 'rnn_layers')
    rnn_type = RNN[cf.get('Model', 'rnn_type')]
    bidirectional = cf.getboolean('Model', 'bidirectional')
    batch_norm = cf.getboolean('Model', 'batch_norm')
    rnn_param = {
        "rnn_input_size": rnn_input_size,
        "rnn_hidden_size": rnn_hidden_size,
        "rnn_layers": rnn_layers,
        "rnn_type": rnn_type,
        "bidirectional": bidirectional,
        "batch_norm": batch_norm
    }
    num_class = cf.getint('Model', 'num_class')
    drop_out = cf.getfloat('Model', 'drop_out')

    model = CTC_Model(rnn_param=rnn_param,
                      num_class=num_class,
                      drop_out=drop_out)
    print("Model Structure:")
    logger.info("Model Structure:")
    for idx, m in enumerate(model.children()):
        print(idx, m)
        logger.info(str(idx) + "->" + str(m))

    data_dir = cf.get('Data', 'data_dir')
    batch_size = cf.getint("Training", 'batch_size')

    # Data Loader
    train_dataset = SpeechDataset(data_dir, data_set='train')
    dev_dataset = SpeechDataset(data_dir, data_set="dev")
    train_loader = SpeechDataLoader(train_dataset,
                                    batch_size=batch_size,
                                    shuffle=True,
                                    num_workers=4,
                                    pin_memory=False)
    dev_loader = SpeechDataLoader(dev_dataset,
                                  batch_size=batch_size,
                                  shuffle=False,
                                  num_workers=4,
                                  pin_memory=False)

    # ensure the feature dimension equals rnn_input_size
    assert train_dataset.n_feats == rnn_input_size

    # decoder for dev set
    decoder = GreedyDecoder(int2char,
                            space_idx=len(int2char) - 1,
                            blank_index=0)

    # Training
    init_lr = cf.getfloat('Training', 'init_lr')
    num_epoches = cf.getint('Training', 'num_epoches')
    end_adjust_acc = cf.getfloat('Training', 'end_adjust_acc')
    decay = cf.getfloat("Training", 'lr_decay')
    weight_decay = cf.getfloat("Training", 'weight_decay')

    params = {
        'num_epoches': num_epoches,
        'end_adjust_acc': end_adjust_acc,
        'seed': seed,
        'decay': decay,
        'learning_rate': init_lr,
        'weight_decay': weight_decay,
        'batch_size': batch_size,
        'n_feats': train_dataset.n_feats
    }
    print(params)

    if USE_CUDA:
        model = model.cuda()

    loss_fn = CTCLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=init_lr,
                                 weight_decay=weight_decay)

    # visualization for training
    from visdom import Visdom
    viz = Visdom()
    title = 'TIMIT LSTM_CTC Acoustic Model'

    opts = [
        dict(title=title + " Loss", ylabel='Loss', xlabel='Epoch'),
        dict(title=title + " Loss on Dev", ylabel='DEV Loss', xlabel='Epoch'),
        dict(title=title + ' CER on DEV', ylabel='DEV CER', xlabel='Epoch')
    ]
    viz_window = [None, None, None]

    count = 0
    learning_rate = init_lr
    loss_best = 1000
    loss_best_true = 1000
    adjust_rate_flag = False
    stop_train = False
    adjust_time = 0
    acc_best = 0
    start_time = time.time()
    loss_results = []
    dev_loss_results = []
    dev_cer_results = []

    while not stop_train:
        if count >= num_epoches:
            break
        count += 1

        if adjust_rate_flag:
            learning_rate *= decay
            adjust_rate_flag = False
            for param in optimizer.param_groups:
                param['lr'] *= decay

        print("Start training epoch: %d, learning_rate: %.5f" %
              (count, learning_rate))
        logger.info("Start training epoch: %d, learning_rate: %.5f" %
                    (count, learning_rate))

        loss = train(model,
                     train_loader,
                     loss_fn,
                     optimizer,
                     logger,
                     print_every=20,
                     USE_CUDA=USE_CUDA)
        loss_results.append(loss)
        acc, dev_loss = dev(model,
                            dev_loader,
                            loss_fn,
                            decoder,
                            logger,
                            USE_CUDA=USE_CUDA)
        print("loss on dev set is %.4f" % dev_loss)
        logger.info("loss on dev set is %.4f" % dev_loss)
        dev_loss_results.append(dev_loss)
        dev_cer_results.append(acc)

        # adjust learning rate by dev_loss
        # adjust_rate_count: the loss is considered stable once it has stayed
        # within the end_adjust_acc band for `count` consecutive epochs
        if dev_loss < (loss_best - end_adjust_acc):
            loss_best = dev_loss
            loss_best_true = dev_loss
            adjust_rate_count = 0
            acc_best = acc
            best_model_state = copy.deepcopy(model.state_dict())
            best_op_state = copy.deepcopy(optimizer.state_dict())
        elif (dev_loss < loss_best + end_adjust_acc):
            adjust_rate_count += 1
            if dev_loss < loss_best and dev_loss < loss_best_true:
                loss_best_true = dev_loss
                acc_best = acc
                best_model_state = copy.deepcopy(model.state_dict())
                best_op_state = copy.deepcopy(optimizer.state_dict())
        else:
            adjust_rate_count = 10

        print("adjust_rate_count: %d" % adjust_rate_count)
        print('adjust_time: %d' % adjust_time)
        logger.info("adjust_rate_count: %d" % adjust_rate_count)
        logger.info('adjust_time: %d' % adjust_time)

        if adjust_rate_count == 10:
            adjust_rate_flag = True
            adjust_time += 1
            adjust_rate_count = 0
            if loss_best > loss_best_true:
                loss_best = loss_best_true
            model.load_state_dict(best_model_state)
            optimizer.load_state_dict(best_op_state)

        if adjust_time == 8:
            stop_train = True

        time_used = (time.time() - start_time) / 60
        print("epoch %d done, dev acc is: %.4f, time_used: %.4f minutes" %
              (count, acc, time_used))
        logger.info(
            "epoch %d done, dev acc is: %.4f, time_used: %.4f minutes" %
            (count, acc, time_used))

        x_axis = range(count)
        y_axis = [
            loss_results[0:count], dev_loss_results[0:count],
            dev_cer_results[0:count]
        ]
        for x in range(len(viz_window)):
            if viz_window[x] is None:
                viz_window[x] = viz.line(
                    X=np.array(x_axis),
                    Y=np.array(y_axis[x]),
                    opts=opts[x],
                )
            else:
                viz.line(
                    X=np.array(x_axis),
                    Y=np.array(y_axis[x]),
                    win=viz_window[x],
                    update='replace',
                )

    print("End training, best dev loss is: %.4f, acc is: %.4f" %
          (loss_best_true, acc_best))
    logger.info("End training, best dev loss is: %.4f, acc is: %.4f" %
                (loss_best_true, acc_best))
    model.load_state_dict(best_model_state)
    optimizer.load_state_dict(best_op_state)
    best_path = os.path.join(log_dir,
                             'best_model' + '_dev' + str(acc_best) + '.pkl')
    cf.set('Model', 'model_file', best_path)
    cf.write(open(args.conf, 'w'))
    params['epoch'] = count

    torch.save(
        CTC_Model.save_package(model,
                               optimizer=optimizer,
                               epoch=params,
                               loss_results=loss_results,
                               dev_loss_results=dev_loss_results,
                               dev_cer_results=dev_cer_results), best_path)
Example #28
        rnn_type = args.rnn_type.lower()
        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
        model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                           nb_layers=args.hidden_layers,
                           labels=labels,
                           rnn_type=supported_rnns[rnn_type],
                           audio_conf=audio_conf,
                           bidirectional=args.bidirectional)
        parameters = model.parameters()
        optimizer = torch.optim.SGD(parameters,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    nesterov=True)
    criterion = CTCLoss()
    decoder = GreedyDecoder(labels)
    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=args.train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    if not args.distributed:
        train_sampler = BucketingSampler(train_dataset,
                                         batch_size=args.batch_size)
    else:
        train_sampler = DistributedBucketingSampler(
Example #29
class Trainer(object):  # the most basic model
    def __init__(self, config, data_loader=None):
        self.config = config
        self.data_loader = data_loader  # needed for VAE

        self.lr = config.lr
        self.beta1 = config.beta1
        self.beta2 = config.beta2
        self.optimizer = config.optimizer
        self.batch_size = config.batch_size

        self.diffLoss = L1Loss_mask()  # custom module

        self.valmin_iter = 0
        self.model_dir = 'logs/' + str(config.expnum)
        self.savename_G = ''
        self.decoder = GreedyDecoder(data_loader.labels)

        self.kt = 0  # used for Proportional Control Theory in BEGAN, initialized as 0
        self.lb = 0.001
        self.conv_measure = 0  # convergence measure

        self.dce_tr = AverageMeter()
        self.dce_val = AverageMeter()
        self.wer_tr = AverageMeter()
        self.cer_tr = AverageMeter()
        self.wer_val = AverageMeter()
        self.cer_val = AverageMeter()

        self.build_model()
        self.G.loss_stop = 100000
        #self.get_weight_statistic()

        if self.config.gpu >= 0:
            self.G.cuda()
            self.ASR.cuda()

        if len(self.config.load_path) > 0:
            self.load_model()

        if config.mode == 'train':
            self.logFile = open(self.model_dir + '/log.txt', 'w')

    def zero_grad_all(self):
        self.G.zero_grad()

    def build_model(self):
        print('initialize enhancement model')
        self.G = stackedBRNN(I=self.config.nFeat,
                             H=self.config.rnn_size,
                             L=self.config.rnn_layers,
                             rnn_type=supported_rnns[self.config.rnn_type])

        print('load pre-trained ASR model')
        package_ASR = torch.load(self.config.ASR_path,
                                 map_location=lambda storage, loc: storage)
        self.ASR = DeepSpeech.load_model_package(package_ASR)
        # Weight initialization is done inside the module

    def load_model(self):
        print("[*] Load models from {}...".format(self.load_path))
        postfix = '_valmin'
        paths = glob(os.path.join(self.load_path, 'G{}*.pth'.format(postfix)))
        paths.sort()

        if len(paths) == 0:
            print("[!] No checkpoint found in {}...".format(self.load_path))
            assert (0), 'checkpoint not available'

        idxes = [
            int(os.path.basename(path.split('.')[0].split('_')[-1]))
            for path in paths
        ]
        if self.config.start_iter < 0:
            self.config.start_iter = max(idxes)
            if (self.config.start_iter < 0):  # if still 0, then raise error
                raise Exception(
                    "start iter is still less than 0 --> probably try to load initial random model"
                )

        if self.config.gpu < 0:  #CPU
            map_location = lambda storage, loc: storage
        else:  # GPU
            map_location = None

        # Ver2
        print('Load models from ' + self.load_path + ', ITERATION = ' +
              str(self.config.start_iter))
        self.G.load_state_dict(
            torch.load('{}/G{}_{}.pth'.format(self.load_path[:-1], postfix,
                                              self.config.start_iter),
                       map_location=map_location))

        print("[*] Model loaded")

    def train(self):
        # Setting
        optimizer_g = torch.optim.Adam(self.G.parameters(),
                                       lr=self.config.lr,
                                       betas=(self.beta1, self.beta2),
                                       amsgrad=True)

        for iter in trange(self.config.start_iter, self.config.max_iter):
            # Train
            data_list = self.data_loader.next(cl_ny='ny', type='train')
            inputs, cleans, mask = _get_variable_nograd(
                data_list[0]), _get_variable_nograd(
                    data_list[1]), _get_variable_nograd(data_list[2])

            # forward
            outputs = self.G(inputs)
            dce, nElement = self.diffLoss(
                outputs, cleans, mask)  # already normalized inside function

            # backward
            self.zero_grad_all()
            dce.backward()
            optimizer_g.step()

            # log
            #pdb.set_trace()
            if (iter + 1) % self.config.log_iter == 0:
                str_loss = "[{}/{}] (train) DCE: {:.7f}".format(
                    iter, self.config.max_iter, dce.data[0])
                print(str_loss)
                self.logFile.write(str_loss + '\n')
                self.logFile.flush()

            if (iter + 1) % self.config.save_iter == 0:
                self.G.eval()
                # Measure performance on training subset
                self.dce_tr.reset()
                self.wer_tr.reset()
                self.cer_tr.reset()
                for _ in trange(0, len(self.data_loader.trsub_dl)):
                    data_list = self.data_loader.next(cl_ny='ny', type='trsub')
                    inputs, cleans, mask, targets, input_percentages, target_sizes = \
                        _get_variable_volatile(data_list[0]), _get_variable_volatile(data_list[1]), _get_variable_volatile(data_list[2]), \
                        data_list[3], data_list[4], data_list[5]

                    outputs = self.G(inputs)
                    dce, nElement = self.diffLoss(
                        outputs, cleans,
                        mask)  # already normalized inside function
                    self.dce_tr.update(dce.data[0], nElement)

                    # Greedy decoding
                    wer, cer, nWord, nChar = self.greedy_decoding(
                        inputs, targets, input_percentages, target_sizes)
                    self.wer_tr.update(wer, nWord)
                    self.cer_tr.update(cer, nChar)

                str_loss = "[{}/{}] (training subset) DCE: {:.7f}".format(
                    iter, self.config.max_iter, self.dce_tr.avg)
                print(str_loss)
                self.logFile.write(str_loss + '\n')

                str_loss = "[{}/{}] (training subset) WER: {:.7f}, CER: {:.7f}".format(
                    iter, self.config.max_iter, self.wer_tr.avg * 100,
                    self.cer_tr.avg * 100)
                print(str_loss)
                self.logFile.write(str_loss + '\n')

                # Measure performance on validation data
                self.dce_val.reset()
                self.wer_val.reset()
                self.cer_val.reset()
                for _ in trange(0, len(self.data_loader.val_dl)):
                    data_list = self.data_loader.next(cl_ny='ny', type='val')
                    inputs, cleans, mask, targets, input_percentages, target_sizes = \
                        _get_variable_volatile(data_list[0]), _get_variable_volatile(data_list[1]), _get_variable_volatile(data_list[2]), \
                        data_list[3], data_list[4], data_list[5]

                    outputs = self.G(inputs)
                    dce, nElement = self.diffLoss(
                        outputs, cleans,
                        mask)  # already normalized inside function
                    self.dce_val.update(dce.data[0], nElement)

                    # Greedy decoding
                    wer, cer, nWord, nChar = self.greedy_decoding(
                        inputs, targets, input_percentages, target_sizes)
                    self.wer_val.update(wer, nWord)
                    self.cer_val.update(cer, nChar)

                str_loss = "[{}/{}] (validation) DCE: {:.7f}".format(
                    iter, self.config.max_iter, self.dce_val.avg)
                print(str_loss)
                self.logFile.write(str_loss + '\n')

                str_loss = "[{}/{}] (validation) WER: {:.7f}, CER: {:.7f}".format(
                    iter, self.config.max_iter, self.wer_val.avg * 100,
                    self.cer_val.avg * 100)
                print(str_loss)
                self.logFile.write(str_loss + '\n')

                self.G.train()  # end of validation
                self.logFile.flush()

                # Save model
                if (len(self.savename_G) > 0):  # do not remove here
                    if os.path.exists(self.savename_G):
                        os.remove(self.savename_G)  # remove previous model
                self.savename_G = '{}/G_{}.pth'.format(self.model_dir, iter)
                torch.save(self.G.state_dict(), self.savename_G)

                if (self.G.loss_stop > self.wer_val.avg):
                    self.G.loss_stop = self.wer_val.avg
                    savename_G_valmin_prev = '{}/G_valmin_{}.pth'.format(
                        self.model_dir, self.valmin_iter)
                    if os.path.exists(savename_G_valmin_prev):
                        os.remove(
                            savename_G_valmin_prev)  # remove previous model

                    print('save model for this checkpoint')
                    savename_G_valmin = '{}/G_valmin_{}.pth'.format(
                        self.model_dir, iter)
                    copyfile(self.savename_G, savename_G_valmin)
                    self.valmin_iter = iter

    def greedy_decoding(self,
                        inputs,
                        targets,
                        input_percentages,
                        target_sizes,
                        transcript_prob=0.001):
        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size

        # step 1) Decoding to get wer & cer
        enhanced = self.G(inputs)
        prob = self.ASR(enhanced)
        prob = prob.transpose(0, 1)
        T = prob.size(0)
        sizes = input_percentages.mul_(int(T)).int()

        decoded_output, _ = self.decoder.decode(prob.data, sizes)
        target_strings = self.decoder.convert_to_strings(split_targets)
        we, ce, total_word, total_char = 0, 0, 0, 0

        for x in range(len(target_strings)):
            decoding, reference = decoded_output[x][0], target_strings[x][0]
            nChar = len(reference)
            nWord = len(reference.split())
            we_i = self.decoder.wer(decoding, reference)
            ce_i = self.decoder.cer(decoding, reference)
            we += we_i
            ce += ce_i
            total_word += nWord
            total_char += nChar
            if (random.uniform(0, 1) < transcript_prob):
                print('reference = ' + reference)
                print('decoding = ' + decoding)
                print('wer = ' + str(we_i / float(nWord)) + ', cer = ' +
                      str(ce_i / float(nChar)))

        wer = we / total_word
        cer = ce / total_char  # fixed: CER is normalized by character count, not word count

        return wer, cer, total_word, total_char
Example #30
def test():
    args = parser.parse_args()
    cf = ConfigParser.ConfigParser()
    cf.read(args.conf)
    USE_CUDA = cf.getboolean('Training', 'USE_CUDA')
    model_path = cf.get('Model', 'model_file')
    data_dir = cf.get('Data', 'data_dir')
    beam_width = cf.getint('Decode', 'beam_width')
    package = torch.load(model_path)
    
    rnn_param = package["rnn_param"]
    num_class = package["num_class"]
    n_feats = package['epoch']['n_feats']
    drop_out = package['_drop_out']

    decoder_type =  cf.get('Decode', 'decoder_type')
    data_set = cf.get('Decode', 'eval_dataset')

    test_dataset = SpeechDataset(data_dir, data_set=data_set)
    
    model = CTC_Model(rnn_param=rnn_param, num_class=num_class, drop_out=drop_out)
        
    test_loader = SpeechDataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=4, pin_memory=False)
    
    model.load_state_dict(package['state_dict'])
    model.eval()
    
    if USE_CUDA:
        model = model.cuda()

    if decoder_type == 'Greedy':
        decoder = GreedyDecoder(int2char, space_idx=len(int2char) - 1, blank_index=0)
    else:
        decoder = BeamDecoder(int2char, beam_width=beam_width, blank_index=0, space_idx=len(int2char) - 1)

    total_wer = 0
    total_cer = 0
    start = time.time()
    for data in test_loader:
        inputs, target, input_sizes, input_size_list, target_sizes = data 
        inputs = inputs.transpose(0,1)
        # Variable(..., volatile=True) is pre-0.4 PyTorch inference mode;
        # on current versions, wrap the loop in torch.no_grad() instead.
        inputs = Variable(inputs, volatile=True, requires_grad=False)
        
        if USE_CUDA:
            inputs = inputs.cuda()
        
        inputs = nn.utils.rnn.pack_padded_sequence(inputs, input_size_list)
        probs = model(inputs)

        probs = probs.data.cpu()
        decoded = decoder.decode(probs, input_size_list)
        targets = decoder._unflatten_targets(target, target_sizes)
        labels = decoder._process_strings(decoder._convert_to_strings(targets))

        for x in range(len(labels)):
            print("origin : " + labels[x])
            print("decoded: " + decoded[x])
        cer = 0
        wer = 0
        for x in range(len(labels)):
            cer += decoder.cer(decoded[x], labels[x])
            wer += decoder.wer(decoded[x], labels[x])
            decoder.num_word += len(labels[x].split())
            decoder.num_char += len(labels[x])
        total_cer += cer
        total_wer += wer
    # percent error = accumulated edit distance / total reference length
    CER = float(total_cer) / decoder.num_char * 100
    WER = float(total_wer) / decoder.num_word * 100
    print("Character error rate on test set: %.4f" % CER)
    print("Word error rate on test set: %.4f" % WER)
    end = time.time()
    time_used = (end - start) / 60.0
    print("time used for decode %d sentences: %.4f minutes." % (len(test_dataset), time_used))
Example #31
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    elif args.decoder == "greedy":
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    else:
        decoder = None
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest, labels=labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers)
    total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0
    output_data = []
    for i, data in tqdm(enumerate(test_loader), total=len(test_loader)):
        inputs, targets, input_percentages, target_sizes = data
        input_sizes = input_percentages.mul_(int(inputs.size(3))).int()

        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size
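
This "unflatten targets" loop recurs across these examples because CTC data loaders concatenate all label sequences of a batch into a single 1-D tensor; the per-utterance sizes are all that is needed to split it back. As a standalone sketch:

def unflatten_targets(targets, target_sizes):
    # targets: 1-D tensor of concatenated label indices for the whole batch
    # target_sizes: per-utterance label counts
    split, offset = [], 0
    for size in target_sizes:
        split.append(targets[offset:offset + size])
        offset += int(size)
    return split

When target_sizes is a plain list of ints, torch.split(targets, target_sizes) produces the same result in one call.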
Example #32
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(model.labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    elif args.decoder == "greedy":
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index('_'))
    else:
        decoder = None
    target_decoder = GreedyDecoder(model.labels,
                                   blank_index=model.labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                      manifest_filepath=args.test_manifest,
                                      labels=model.labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)
    wer, cer, output_data = evaluate(test_loader=test_loader,
                                     device=device,
                                     model=model,
                                     decoder=decoder,
                                     # the call is truncated in the source; passing the
                                     # target_decoder built above is an assumption here
                                     target_decoder=target_decoder)
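
A plausible shape for the load_model helper called above, assuming it simply restores a serialized DeepSpeech model and applies the device and half-precision options (the actual utility in the source repository may differ):

def load_model(device, model_path, use_half):
    model = DeepSpeech.load_model(model_path)
    model.eval()
    model = model.to(device)
    if use_half:
        model = model.half()  # fp16 inference
    return model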
Example #33
    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    elif args.decoder == "greedy":
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    else:
        decoder = None
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.test_manifest,
                                      labels=labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)
    total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0
    output_data = []
    for i, data in tqdm(enumerate(test_loader), total=len(test_loader)):
        inputs, targets, input_percentages, target_sizes = data
        input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
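
The snippet stops after computing input_sizes, but in the sibling examples the loop body continues the same way: forward pass, decode, accumulate edit distances. A condensed sketch of that continuation, where split_targets (the unflattened labels from Example #31) and device (as in Example #32) are assumptions, since this snippet is truncated before they appear:

        out, output_sizes = model(inputs.to(device), input_sizes)
        decoded_output, _ = decoder.decode(out, output_sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)
        for hyp, ref in zip(decoded_output, target_strings):
            total_wer += decoder.wer(hyp[0], ref[0])
            total_cer += decoder.cer(hyp[0], ref[0])
            num_tokens += len(ref[0].split())
            num_chars += len(ref[0])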
Example #34
    def __init__(self, config, data_loader=None):
        if (config.w_minWvar > 0):
            config.minimize_W_var = True
            self.varLoss = var_mask()

        self.config = config
        self.data_loader = data_loader  # needed for VAE

        self.lr = config.lr
        self.beta1 = config.beta1
        self.beta2 = config.beta2
        self.optimizer = config.optimizer
        self.batch_size = config.batch_size

        self.diffLoss = L1Loss_mask()  # custom module

        log_domain = bool(self.config.linear_to_mel)
        self.get_SNRout = get_SNRout(log_domain=log_domain)

        self.valmin_iter = 0
        self.model_dir = 'models/' + str(config.expnum)
        self.log_dir = 'logs_only/' + str(config.expnum)
        self.savename_G = ''
        self.decoder = GreedyDecoder(data_loader.labels)

        self.kt = 0  # used for Proportional Control Theory in BEGAN, initialized as 0
        self.lb = 0.001  # gain for the proportional-control update of kt
        self.conv_measure = 0  # convergence measure

        self.dce_tr = AverageMeter()
        self.dce_val = AverageMeter()

        self.snrout_tr = AverageMeter()
        self.snrout_val = AverageMeter()
        self.snrimpv_tr = AverageMeter()
        self.snrimpv_val = AverageMeter()

        if (config.linear_to_mel):
            # librosa.filters.mel args: (sr, n_fft, n_mels); recent librosa
            # versions require these as keyword arguments
            self.mel_basis = librosa.filters.mel(self.config.fs,
                                                 self.config.nFFT,
                                                 self.config.nMel)
            self.melF_to_linearFs = get_linearF_from_melF(self.mel_basis)
            self.STFT_to_LMFB = STFT_to_LMFB(self.mel_basis,
                                             window_change=False)
            self.mag2mfb = linearmag2mel(self.mel_basis)

        mel_basis_20ms = librosa.filters.mel(
            self.config.fs, 320, self.config.nMel
        )  # mel_basis will be used only for 20ms window spectrogram
        self.STFT_to_LMFB_20ms = STFT_to_LMFB(mel_basis_20ms,
                                              win_size=self.config.nFFT)

        self.F = self.config.nFFT // 2 + 1  # number of one-sided frequency bins

        self.build_model()
        self.G.loss_stop = 100000  # best-validation tracker, initialized high so the first evaluation always saves
        #self.get_weight_statistic()

        if self.config.gpu >= 0:
            self.G.cuda()

        if len(self.config.load_path) > 0:
            self.load_model()

        if config.mode == 'train':
            self.logFile = open(self.log_dir + '/log.txt', 'w')
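
Example #34's __init__ builds several AverageMeter trackers (dce_tr, snrout_val, and so on). The class itself is not shown in this snippet; a common definition with the semantics assumed here (update with a value and a count, read the running mean from .avg) is:

class AverageMeter(object):
    # Tracks the latest value and the running average of a scalar metric.
    def __init__(self):
        self.reset()

    def reset(self):
        self.val, self.sum, self.count, self.avg = 0.0, 0.0, 0, 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

Under this definition, self.wer_val.avg in the checkpoint logic earlier is the running validation WER for the current evaluation pass.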