def predict_one(img_path, predict_func, idx):
    img = misc.imread(img_path, mode='L')  # read as grayscale
    if img.shape[0] != cfg.input_height:
        if cfg.input_width is not None:
            img = cv2.resize(img, (cfg.input_width, cfg.input_height))
        else:
            scale = cfg.input_height / img.shape[0]
            img = cv2.resize(img, None, fx=scale, fy=scale)
    seqlen = img.shape[1]
    img = np.expand_dims(np.expand_dims(img, axis=2), axis=0)

    logits = predict_func([img, [seqlen]])[0][0]
    pytorch_logits = torch.from_numpy(np.transpose(logits))
    labels = list(cfg.dictionary)  # copy so the blank label is not appended to the shared config list on every call
    labels.append('空')
    decoder = BeamCTCDecoder(''.join(labels), space_index=labels.index(' '),
                             blank_index=len(labels) - 1,
                             lm_path='language_model/bigram.klm',
                             dict_path='language_model/a')
    strings, offsets, conf, char_probs = decoder.decode(pytorch_logits)
    # if idx == None:
    #     logger.info(img_path)
    #     logger.info(result)
    # else:
    #     logger.info(str(idx) + ": " + img_path)
    #     logger.info(str(idx) + ": " + result)
    result = strings[0][0]
    print(result)
    return result
Example #2
    def __init__(self, args):
        super(create_model, self).__init__()
        self.args = args
        self.model = SpeechNet(args)
        self.model.to(args.device)
        self.criterion = nn.CTCLoss()
        self.decoder = BeamCTCDecoder(PHONEME_MAP, blank_index=0, beam_width=args.beam_width)

        self.state_names = ['loss', 'edit_dist', 'lr']
Example #3
def main():
    import argparse
    global model, spect_parser, decoder, args
    parser = argparse.ArgumentParser(description='DeepSpeech transcription server')
    parser.add_argument('--host', type=str, default='0.0.0.0', help='Host to be used by the server')
    parser.add_argument('--port', type=int, default=8888, help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

    spect_parser = SpectrogramParser(audio_conf, normalize=True)
    logging.info('Server initialised')
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
def init(beam_width, blank_index, lm_path):
    global decoder
    decoder = BeamCTCDecoder(model.labels,
                             lm_path=lm_path,
                             beam_width=beam_width,
                             num_processes=args.lm_workers,
                             blank_index=blank_index)
def model_setup(args=None):

    test_dataset = data.MASRDataset(args.test_index_path,
                                    args.labels_path,
                                    args.mode,
                                    config=args)
    dataloader = data.MASRDataLoader(test_dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.num_workers)

    model = GatedConv.load(args.pretrained_path)

    global decoder
    decoder = BeamCTCDecoder(
        dataloader.dataset.labels_str,
        alpha=0.8,
        beta=0.3,
        lm_path="/root/lm/zh_giga.no_cna_cmn.prune01244.klm",
        cutoff_top_n=40,
        cutoff_prob=1.0,
        beam_width=100,
        num_processes=args.num_workers,
        blank_index=0,
    )

    return model, dataloader
Example #6
def decode_dataset(logits, test_dataset, batch_size, lm_alpha, lm_beta, mesh_x,
                   mesh_y, labels, grid_index):
    print("Beginning decode for {}, {}".format(lm_alpha, lm_beta))
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=batch_size,
                                  num_workers=0)
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    decoder = BeamCTCDecoder(labels,
                             beam_width=args.beam_width,
                             cutoff_top_n=args.cutoff_top_n,
                             blank_index=labels.index('_'),
                             lm_path=args.lm_path,
                             alpha=lm_alpha,
                             beta=lm_beta,
                             num_processes=1)
    total_cer, total_wer = 0, 0
    for i, (data) in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes = data

        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size

        out = torch.from_numpy(logits[i][0])
        sizes = torch.from_numpy(logits[i][1])

        decoded_output, _ = decoder.decode(out, sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)
        wer, cer = 0, 0
        for x in range(len(target_strings)):
            transcript, reference = decoded_output[x][0], target_strings[x][0]
            wer_inst = decoder.wer(transcript, reference) / float(
                len(reference.split()))
            cer_inst = decoder.cer(transcript, reference) / float(
                len(reference))
            wer += wer_inst
            cer += cer_inst
        total_cer += cer
        total_wer += wer

    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)

    return [grid_index, mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]
Example #7
def main(model_path, confs): 
    model, __ = MultiTask.load_model(model_path)
    if confs['cuda']:
        model = model.cuda()
    
    
    if not model._meta['use_transcripts_out']: # only accent classification
        criterion = nn.CrossEntropyLoss()
    elif not model._meta['use_accents_out']: # only text recognition
        criterion = CTCLoss()
    else: # both tasks
        criterion = (CTCLoss(), nn.CrossEntropyLoss())
        
    
    # Results
    results = {}
    for manifest, lm in confs['testing_manifests']:
        eprint(f'\n### Testing {manifest.split("/")[-1]} for model {Path(model_path).stem.split("_")[0]}')
        
        # Decoder
        if model._meta['use_transcripts_out']:
            decoder = BeamCTCDecoder(confs['labels'], 
                                     lm_path=lm,
                                     alpha=confs['decoder_alpha'], 
                                     beta=confs['decoder_beta'],
                                     cutoff_top_n=confs['decoder_cutoff_top_n'],
                                     cutoff_prob=confs['decoder_cutoff_top_n'],
                                     beam_width=confs['decoder_beam_width'], 
                                     num_processes=confs['num_workers'])

            target_decoder = GreedyDecoder(confs['labels'])
        else:
            decoder, target_decoder = None, None
        
        # Test
        results[manifest.split('/')[-1]] = result_for_manifest(model, criterion, manifest, decoder, target_decoder, confs['batch_size'], confs['num_workers'])
        
        
    if not PRINT_LATEX_TABLE:
        print(f'Model: {model_path.split("/")[-1]}')
        for name, res in results.items():
            print(f'\nResults for {name}:')
            print('; '.join([f'{k}: {v:.3f}' for k, v in res.items()]))
    else:
        print(' & '.join(['model']+list([k[:-4] for k in results.keys()])))
        val_dict = {}
        for k in list(results.values())[0].keys():
            val_dict[k] = []
        for res in results.values():
            for k, v in res.items():
                val_dict[k].append(f'{v:.1f}')
        for val in val_dict.values():
            print(' & '.join([Path(model_path).stem.split('_')[0]]+val)+r' \\')
def decode_dataset(logits, test_dataset, batch_size, lm_alpha, lm_beta, mesh_x, mesh_y, labels, grid_index):
    print("Beginning decode for {}, {}".format(lm_alpha, lm_beta))
    test_loader = AudioDataLoader(test_dataset, batch_size=batch_size, num_workers=0)
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, cutoff_top_n=args.cutoff_top_n,
                             blank_index=labels.index('_'), lm_path=args.lm_path,
                             alpha=lm_alpha, beta=lm_beta, num_processes=1)
    total_cer, total_wer = 0, 0
    for i, (data) in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes = data

        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size

        out = torch.from_numpy(logits[i][0])
        sizes = torch.from_numpy(logits[i][1])

        decoded_output, _ = decoder.decode(out, sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)
        wer, cer = 0, 0
        for x in range(len(target_strings)):
            transcript, reference = decoded_output[x][0], target_strings[x][0]
            wer_inst = decoder.wer(transcript, reference) / float(len(reference.split()))
            cer_inst = decoder.cer(transcript, reference) / float(len(reference))
            wer += wer_inst
            cer += cer_inst
        total_cer += cer
        total_wer += wer

    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)

    return [grid_index, mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]
def main():
    import argparse
    global model, spect_parser, decoder, args, device, decompressor

    parser = argparse.ArgumentParser(
        description='DeepSpeech transcription server')
    parser.add_argument('--host',
                        type=str,
                        default='0.0.0.0',
                        help='Host to be used by the server')
    parser.add_argument('--port',
                        type=int,
                        default=8888,
                        help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info('Setting up server...')
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(model.labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index('_'))

    spect_parser = OnlineSpectrogramParser(model.audio_conf, normalize=True)
    logging.info('Server initialised')

    decompressor = LZString()

    server = WebsocketServer(host=args.host, port=args.port)
    server.set_fn_new_client(new_client)
    server.set_fn_client_left(client_left)
    server.set_fn_message_received(message_received)
    server.run_forever()
Example #10
def main():
    import argparse

    global model, spect_parser, decoder, args, device
    parser = argparse.ArgumentParser(
        description="DeepSpeech transcription server")
    parser.add_argument(
        "--host",
        type=str,
        default="0.0.0.0",
        help="Host to be used by the server",
    )
    parser.add_argument("--port",
                        type=int,
                        default=8888,
                        help="Port to be used by the server")
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)

    logging.info("Setting up server...")
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(
            model.labels,
            lm_path=args.lm_path,
            alpha=args.alpha,
            beta=args.beta,
            cutoff_top_n=args.cutoff_top_n,
            cutoff_prob=args.cutoff_prob,
            beam_width=args.beam_width,
            num_processes=args.lm_workers,
        )
    else:
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index("_"))

    spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
    logging.info("Server initialised")
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
Example #11
    spect = spect.to(device)
    input_sizes = torch.IntTensor([spect.size(3)]).int()
    out, output_sizes = model(spect, input_sizes)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
    return decoded_output, decoded_offsets


if __name__ == "__main__":
    args = get_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = load_model(device, args.model_file, True)

    parser = SpectrogramParser(model.audio_conf, normalize=True)
    decoder = BeamCTCDecoder(model.labels,
                             beam_width=args.beam_size,
                             num_processes=args.num_worker,
                             blank_index=0)

    with open(args.manifest_file) as f:
        data = f.read().split('\n')[:-1]

    src = []
    lbl = []
    for line in tqdm(data):
        vp, tp = line.split(',')
        with open(tp) as f:
            text = f.read().strip()
        decoded_output, decoded_offsets = transcribe(vp, parser, model,
                                                     decoder, device)
        for h in decoded_output[0]:
            src.append(h)
Example #12
    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        scorer = None
        if args.lm_path is not None:
            scorer = KenLMScorer(labels, args.lm_path, args.trie_path)
            scorer.set_lm_weight(args.lm_alpha)
            scorer.set_word_weight(args.lm_beta1)
            scorer.set_valid_word_weight(args.lm_beta2)
        else:
            scorer = Scorer()
        decoder = BeamCTCDecoder(labels,
                                 scorer,
                                 beam_width=args.beam_width,
                                 top_paths=1,
                                 space_index=labels.index(' '),
                                 blank_index=labels.index('_'))
    else:
        decoder = GreedyDecoder(labels,
                                space_index=labels.index(' '),
                                blank_index=labels.index('_'))

    parser = SpectrogramParser(audio_conf, normalize=True)

    t0 = time.time()
    spect = parser.parse_audio(args.audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    out = model(Variable(spect, volatile=True))
    out = out.transpose(0, 1)  # TxNxH
    decoded_output = decoder.decode(out.data)
Example #13
    model_name = re.sub('.json.pth.tar', '', os.path.basename(args.model_path))
    corpus = re.sub('csv', '', os.path.basename(args.test_manifest))
    if args.save_path:
        ref_file = open("%s/%s_reference.%s.txt" % (args.save_path, corpus, model_name), 'w')
        trans_file = open("%s/%s_transcription.%s.txt" % (args.save_path, corpus, model_name), 'w')
        print(ref_file)
    labels = DeepSpeech.get_labels(model)
    print("model_name ", model_name)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        print("alpha=args.alpha, beta=args.beta ", args.alpha, args.beta)
        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers, blank_index=labels.index('_'))
    elif args.decoder == "greedy":
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    else:
        decoder = None
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest, labels=labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers)
    total_cer, total_wer = 0, 0
    n_concept_ref=0
    n_concept_err=0
    output_data = []
    total_err_c=0
def save_(p_, data_):
    with open(f"{args.save}/{os.path.basename(p_).split('.')[0]}.txt",
              "w") as f:
        f.write(data_)


if __name__ == '__main__':
    if args.decoder == "beam":
        print("using beam decoder")
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(model.labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.workers,
                                 blank_index=model.labels.index('_'))
    else:
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index('_'))

    print(f'Reading files from {args.path}')
    directory_files = glob.glob(args.path + '*.wav')
    print(f"{len(directory_files)} number of Files have been read")

    t_ = len(directory_files)

    trans = [
Example #15
    if (not audio_conf):
        audio_conf = dict(sample_rate=args.sample_rate,
                          window_size=args.window_size,
                          window_stride=args.window_stride,
                          window=args.window,
                          n_mels=args.n_mels,
                          process_mel=args.process_mel)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    elif args.decoder == "greedy":
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    else:
        decoder = None
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

    if (args.preprocess == 'file'):
        test_dataset = FeatDataset(manifest_filepath=args.test_manifest,
                                   labels=labels)
        test_loader = FeatLoader(test_dataset,
                                 batch_size=args.batch_size,
                                 num_workers=args.num_workers)
    return wer * 100, cer * 100, output_data


if __name__ == '__main__':
    args = parser.parse_args()
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(model.labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    elif args.decoder == "greedy":
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index('_'))
    else:
        decoder = None
    target_decoder = GreedyDecoder(model.labels,
                                   blank_index=model.labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                      manifest_filepath=args.test_manifest,
                                      labels=model.labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset,
Example #17
import argparse
import os
import pickle
import warnings

import torch

from opts import add_decoder_args, add_inference_args
from utils import load_model
from decoder import BeamCTCDecoder
from data.data_loader import SpectrogramParser  # import path assumed as in deepspeech.pytorch
warnings.simplefilter('ignore')

parser = argparse.ArgumentParser()
parser.add_argument("fileAddr", help="The file for which the prediction needs to be made", type=str)
args = parser.parse_args()
prepath = os.getcwd()
device = torch.device("cpu")
half = False
model = load_model(device, prepath + "/public/models/deepspeech_final.pth", True).type(torch.FloatTensor)

decoder = BeamCTCDecoder(model.labels, lm_path=prepath+"/public/models/libri.binary", alpha=0.47, beta=0.28,
                        beam_width=2048, num_processes=12)
spect_parser = SpectrogramParser(model.audio_conf, normalize=True)

def transcribe(audio_path, spect_parser, model, decoder, device, use_half):
    spect = spect_parser.parse_audio(audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    spect = spect.to(device)
    if use_half:
        spect = spect.half()
    input_sizes = torch.IntTensor([spect.size(3)]).int()
    out, output_sizes = model(spect, input_sizes)
    with open("out.txt","wb") as f:
        pickle.dump(out.cpu().detach().numpy(),f)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
    return decoded_output, decoded_offsets
if __name__ == '__main__':
    model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels,
                                 beam_width=args.beam_width,
                                 top_paths=args.top_paths,
                                 space_index=labels.index(' '),
                                 blank_index=labels.index('_'),
                                 lm_path=args.lm_path,
                                 trie_path=args.trie_path,
                                 lm_alpha=args.lm_alpha,
                                 lm_beta=args.lm_beta,
                                 label_size=args.label_size,
                                 label_margin=args.label_margin)
    else:
        decoder = GreedyDecoder(labels,
                                space_index=labels.index(' '),
                                blank_index=labels.index('_'))

    parser = SpectrogramParser(audio_conf, normalize=True)

    spect = parser.parse_audio(args.audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    out = model(Variable(spect, volatile=True))
Example #19
class create_model(nn.Module):
    def __init__(self, args):
        super(create_model, self).__init__()
        self.args = args
        self.model = SpeechNet(args)
        self.model.to(args.device)
        self.criterion = nn.CTCLoss()
        self.decoder = BeamCTCDecoder(PHONEME_MAP, blank_index=0, beam_width=args.beam_width)

        self.state_names = ['loss', 'edit_dist', 'lr']

    def train_setup(self):
        self.lr = self.args.lr
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr, weight_decay=self.args.weight_decay)
        if self.args.use_step_schedule:
            self.scheduler = MultiStepLR(self.optimizer, milestones=self.args.decay_steps, gamma=self.args.lr_gamma)
        elif self.args.use_reduce_schedule:
            self.scheduler = ReduceLROnPlateau(self.optimizer, mode='min', factor=0.5, patience=1)
        else:
            self.scheduler = ParamScheduler(self.optimizer, scale_cos, self.args.num_epochs * self.args.loader_length)
#         self.model.apply(weights_init)
        self.model.train()

    def optimize_parameters(self, input, input_lens, target, target_lens):
        input, target = input.to(self.args.device), target.to(self.args.device)
        output, output_lens, self.loss = self.forward(input, input_lens, target, target_lens)

        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()

        self.edit_dist = self.get_edit_dist(output, output_lens, target, target_lens)

        del input
        del target
        del input_lens
        del target_lens
        del output
        del output_lens

    def update_learning_rate(self, dist=None):
        if self.args.use_reduce_schedule:
            self.scheduler.step(dist)
        else:
            self.scheduler.step()
        self.lr = self.optimizer.param_groups[0]['lr']

    def get_current_states(self):
        errors_ret = OrderedDict()
        for name in self.state_names:
            if isinstance(name, str):
                # float(...) works for both scalar tensor and float number
                errors_ret[name] = float(getattr(self, name))
        return errors_ret

    def get_edit_dist(self, output, output_lens, target, target_lens):
        output, target = output.cpu(), target.cpu()
        phoneme_preds = self.decoder.decode(output, output_lens)
        phonemes = self.decoder.convert_to_strings(target, target_lens)
        edit_dist = np.sum(
            [self.decoder.Lev_dist(phoneme_pred, phoneme) for (phoneme_pred, phoneme) in zip(phoneme_preds, phonemes)])
        return edit_dist

    def forward(self, input, input_lens, target=None, target_lens=None, is_training=True):
        output, output_lens = self.model(input, input_lens)
        if is_training:
            # The official documentation is your best friend: https://pytorch.org/docs/stable/nn.html#ctcloss
            # nn.CTCLoss takes 4 arguments to compute the loss:
            # [log_probs]: Prediction of your model at each time step. Shape: (seq_len, batch_size, vocab_size)
            # Values must be log probabilities. Neither probabilities nor logits will work.
            # Make sure the output of your network is log probabilities, by adding a nn.LogSoftmax after the last layer.
            # [targets]: The ground truth sequences. Shape: (batch_size, seq_len)
            # Values are indices of phonemes. Again, remember that index 0 is reserved for "blank"
            # [input_lengths]: Lengths of sequences in log_probs. Shape: (batch_size,).
            # This is not necessarily the same as lengths of input of the model.
            # [target_lengths]: Lengths of sequences in targets. Shape: (batch_size,).
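            # (A standalone sketch of this call, with concrete toy shapes, follows this class.)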
            loss = self.criterion(output.permute(1, 0, 2), target, input_lens, target_lens)
            return output, output_lens, loss
        else:
            return output, output_lens,

    def train(self):
        try:
            self.model.train()
        except:
            print('train() cannot be implemented as model does not exist.')

    def eval(self):
        try:
            self.model.eval()
        except:
            print('eval() cannot be implemented as model does not exist.')

    def load_model(self, model_path):
        self.model.load_state_dict(torch.load(model_path))

    def save_model(self, which_epoch):
        save_filename = '%s_net.pth' % (which_epoch)
        save_path = os.path.join(self.args.expr_dir, save_filename)
        if torch.cuda.is_available():
            try:
                torch.save(self.model.module.cpu().state_dict(), save_path)
            except:
                torch.save(self.model.cpu().state_dict(), save_path)
        else:
            torch.save(self.model.cpu().state_dict(), save_path)

        self.model.to(self.args.device)
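
The comments in forward() above spell out the four arguments that nn.CTCLoss expects. As a minimal, self-contained sketch of that call (toy tensor sizes invented purely for illustration, not taken from any example on this page):

import torch
import torch.nn as nn

# Toy dimensions: T time steps, N batch items, C classes (index 0 reserved for the CTC blank), S max target length.
T, N, C, S = 50, 4, 42, 20

log_probs = nn.LogSoftmax(dim=-1)(torch.randn(T, N, C))            # (T, N, C) log-probabilities
targets = torch.randint(1, C, (N, S), dtype=torch.long)            # (N, S) label indices, blank (0) excluded
input_lengths = torch.full((N,), T, dtype=torch.long)              # lengths of the sequences in log_probs
target_lengths = torch.randint(10, S + 1, (N,), dtype=torch.long)  # lengths of the sequences in targets

ctc = nn.CTCLoss(blank=0)
loss = ctc(log_probs, targets, input_lengths, target_lengths)
print(loss.item())

Blank index 0 here matches BeamCTCDecoder(PHONEME_MAP, blank_index=0, ...) used by the class above.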
Example #20
def decode_dataset(logits, test_dataset, batch_size, lm_alpha, lm_beta, mesh_x,
                   mesh_y, index, labels, eval):
    print("Beginning decode for {}, {}".format(lm_alpha, lm_beta))
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=batch_size,
                                  num_workers=0)
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    decoder = BeamCTCDecoder(labels,
                             beam_width=args.beam_width,
                             cutoff_top_n=args.cutoff_top_n,
                             blank_index=labels.index('_'),
                             lm_path=args.lm_path,
                             alpha=lm_alpha,
                             beta=lm_beta,
                             num_processes=1)
    model_name = re.sub('.json.pth.tar', '', os.path.basename(args.model_path))
    ref_file = None
    if eval == 'concept':
        eval_dir = "%s/%s/%s" % (os.path.dirname(
            args.output_path), model_name, index)
        if not os.path.exists(eval_dir):
            os.makedirs(eval_dir)
        ref_file = open(
            "%s/%s_reference.txt" %
            (eval_dir, re.sub('.csv', '', os.path.basename(
                args.test_manifest))), 'w')
        trans_file = open(
            "%s/%s_transcription.txt" %
            (eval_dir, re.sub('.csv', '', os.path.basename(
                args.test_manifest))), 'w')
    total_cer, total_wer = 0, 0
    for i, (data) in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes, audio_ids = data

        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size

        out = torch.from_numpy(logits[i][0])
        sizes = torch.from_numpy(logits[i][1])

        decoded_output, _, _, _, _ = decoder.decode(out, sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)
        wer, cer = 0, 0
        for x in range(len(target_strings)):
            transcript, reference = decoded_output[x][0], target_strings[x][0]
            if eval == 'concept':
                # the files are opened in text mode, so write str directly
                ref_file.write(reference + "(" + audio_ids[x] + ")\n")
                trans_file.write(transcript + "(" + audio_ids[x] + ")\n")

            wer_inst = decoder.wer(transcript, reference) / float(
                len(reference.split()))
            cer_inst = decoder.cer(transcript, reference) / float(
                len(reference))
            wer += wer_inst
            cer += cer_inst
        total_cer += cer
        total_wer += wer
    if ref_file is not None:
        ref_file.close()
        trans_file.close()
    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)
    if eval == 'concept':  # Concept error rate evaluation
        cmd = "perl /lium/buster1/ghannay/deepSpeech2/deepspeech.pytorch/data/eval.sclit_cer.pl %s" % (
            eval_dir)
        print("cmd  ", cmd)
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=True)
        coner, error = p.communicate()
        print(" coner  ", coner)
        return [mesh_x, mesh_y, lm_alpha, lm_beta, float(coner) / 100, cer]
    else:
        return [mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]
Example #21
                       help='Language model word bonus (IV words)')
args = parser.parse_args()

if __name__ == '__main__':
    model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        decoder = BeamCTCDecoder(labels,
                                 beam_width=args.beam_width,
                                 top_paths=1,
                                 space_index=labels.index(' '),
                                 blank_index=labels.index('_'),
                                 lm_path=args.lm_path,
                                 trie_path=args.trie_path,
                                 lm_alpha=args.lm_alpha,
                                 lm_beta1=args.lm_beta1,
                                 lm_beta2=args.lm_beta2)
    else:
        decoder = GreedyDecoder(labels,
                                space_index=labels.index(' '),
                                blank_index=labels.index('_'))

    parser = SpectrogramParser(audio_conf, normalize=True)

    t0 = time.time()
    spect = parser.parse_audio(args.audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    out = model(Variable(spect, volatile=True))
Example #22
def decode_dataset(logits, test_dataset, batch_size, lm_alpha, lm_beta, mesh_x,
                   mesh_y, labels):
    print("Beginning decode for {}, {}".format(lm_alpha, lm_beta))
    test_loader = FeatLoader(test_dataset,
                             batch_size=batch_size,
                             num_workers=0)
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    decoder = BeamCTCDecoder(labels,
                             beam_width=args.beam_width,
                             cutoff_top_n=args.cutoff_top_n,
                             blank_index=labels.index('_'),
                             lm_path=args.lm_path,
                             alpha=lm_alpha,
                             beta=lm_beta,
                             num_processes=1)
    total_cer, total_wer = 0, 0
    #decoding_log = []
    for i, (data) in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes = data

        # unflatten targets
        split_targets = []
        offset = 0
        for size in target_sizes:
            split_targets.append(targets[offset:offset + size])
            offset += size

        out = torch.from_numpy(logits[i][0])
        sizes = torch.from_numpy(logits[i][1])

        decoded_output, _ = decoder.decode(out, sizes)
        target_strings = target_decoder.convert_to_strings(split_targets)
        wer, cer = 0, 0
        for x in range(len(target_strings)):
            transcript, reference = decoded_output[x][0], target_strings[x][0]
            wer_inst = decoder.wer(transcript, reference) / float(
                len(reference.split()))
            cer_inst = decoder.cer(transcript, reference) / float(
                len(reference))
            wer += wer_inst
            cer += cer_inst

            # ver1
            # write result to logFile # can't do this because multi processing code cannot do this
            #logFile.write('decoding : ' + transcript)
            #logFIle.write('reference : ' + reference)
            #logFile.write('WER = ' + str(wer_inst) + ', CER = ' + str(cer_inst))

            if (random.uniform(0, 1) < float(args.detail_log_print_prob)):
                print('decoding : ' + transcript)
                print('reference : ' + reference)
                print('WER = ' + str(wer_inst) + ', CER = ' + str(cer_inst))
                print(' ')

                #ver1
                #decoding_log_sample = []
                #decoding_log_sample.append(transcript)
                #decoding_log_sample.append(reference)
                #decoding_log.append(decoding_log_sample)

                #ver2. thread safe but does not write anything to file
                #logging.info('decoding : ' + transcript)
                #logging.info('reference : ' + reference)
                #logging.info('WER = ' + str(wer_inst) + ', CER = ' + str(cer_inst))
                #logging.info(' ')
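                # (A queue-based, process-safe logging sketch follows this function.)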

                #ver3
                logger.error('decoding : ' + transcript)
                logger.error('reference : ' + reference)
                logger.error('WER = ' + str(wer_inst) + ', CER = ' +
                             str(cer_inst))
                logger.error(' ')

        total_cer += cer
        total_wer += wer

    wer = total_wer / len(test_loader.dataset)
    cer = total_cer / len(test_loader.dataset)

    return [mesh_x, mesh_y, lm_alpha, lm_beta, wer, cer]
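
The commented-out "ver1"/"ver2"/"ver3" notes above describe the trouble of writing per-utterance decode logs from code that runs under multiprocessing. A rough, hedged sketch of one standard workaround (a multiprocessing queue drained by a single logging.handlers.QueueListener; file names and messages here are illustrative only, not from the original code):

import logging
import logging.handlers
import multiprocessing

def worker(queue, transcript, reference):
    # Each worker only pushes records onto the shared queue.
    log = logging.getLogger('decode')
    log.addHandler(logging.handlers.QueueHandler(queue))
    log.setLevel(logging.INFO)
    log.info('decoding : %s', transcript)
    log.info('reference : %s', reference)

if __name__ == '__main__':
    queue = multiprocessing.Queue()
    # A single listener owns the file handler, so writes never race.
    listener = logging.handlers.QueueListener(queue, logging.FileHandler('decode.log'))
    listener.start()
    procs = [multiprocessing.Process(target=worker, args=(queue, 'a b c', 'a b d')) for _ in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    listener.stop()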
Example #23

if __name__ == '__main__':
    model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))

    parser = SpectrogramParser(audio_conf, normalize=True)

    spect = parser.parse_audio(args.audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    out = model(Variable(spect, volatile=True))
    out = out.transpose(0, 1)  # TxNxH
    decoded_output, decoded_offsets = decoder.decode(out.data)
    print(json.dumps(decode_results(decoded_output, decoded_offsets)))
Example #24
def init(beam_width, blank_index, lm_path):
    global decoder, ae_decoder
    decoder = BeamCTCDecoder(model.vocabulary, lm_path=lm_path, beam_width=beam_width,
                             num_processes=args.lm_workers, blank_index=blank_index)
    ae_decoder = GreedyDecoder(model.vocabulary)
Example #25
if __name__ == '__main__':
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width, num_processes=args.lm_workers)
    elif args.decoder == "greedy":
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    else:
        decoder = None
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest, labels=labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers)
    total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0
    output_data = []
    for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)):
        inputs, targets, input_percentages, target_sizes = data
        input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
Example #26
    out, output_sizes = model(spect, input_sizes)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
    return decoded_output, decoded_offsets


if __name__ == "__main__":
    args = get_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = load_model(device, args.model_file, True)

    parser = SpectrogramParser(model.audio_conf, normalize=True)
    decoder = BeamCTCDecoder(model.labels,
                             lm_path=args.lm_path,
                             alpha=args.alpha,
                             beta=args.beta,
                             beam_width=args.beam_size,
                             num_processes=args.num_worker,
                             blank_index=0)

    with open(args.manifest_file) as f:
        data = f.read().split('\n')[:-1]

    idx = []
    pred = []
    lbl = []
    for line in tqdm(data):
        vp, tp = line.split(',')
        with open(tp) as f:
            text = f.read().strip()
        decoded_output, decoded_offsets = transcribe(vp, parser, model,
Example #27
args = parser.parse_args()

if __name__ == '__main__':
    model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(labels,
                                 beam_width=args.beam_width,
                                 top_paths=1,
                                 space_index=labels.index(' '),
                                 blank_index=labels.index('_'),
                                 lm_path=args.lm_path,
                                 trie_path=args.trie_path,
                                 lm_alpha=args.lm_alpha,
                                 lm_beta1=args.lm_beta1,
                                 lm_beta2=args.lm_beta2)
    else:
        decoder = GreedyDecoder(labels,
                                space_index=labels.index(' '),
                                blank_index=labels.index('_'))

    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.test_manifest,
                                      labels=labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        scorer = None
        if args.lm_path is not None:
            scorer = KenLMScorer(labels, args.lm_path, args.trie_path)
            scorer.set_lm_weight(args.lm_alpha)
            scorer.set_word_weight(args.lm_beta1)
            scorer.set_valid_word_weight(args.lm_beta2)
        else:
            scorer = Scorer()
        decoder = BeamCTCDecoder(labels,
                                 scorer,
                                 beam_width=args.beam_width,
                                 top_paths=1,
                                 space_index=labels.index(' '),
                                 blank_index=labels.index('_'))
    else:
        decoder = GreedyDecoder(labels,
                                space_index=labels.index(' '),
                                blank_index=labels.index('_'))

    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.test_manifest,
                                      labels=labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)
    total_cer, total_wer = 0, 0
Example #29
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = model.to(device)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)

    if args.decoder == "beam":
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    elif args.decoder == "greedy":
        decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    else:
        decoder = None
    target_decoder = GreedyDecoder(labels, blank_index=labels.index('_'))
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.test_manifest,
                                      labels=labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)
def run_experiment(_exp_name, _epochs, _train_manifest, _test_manifest,
                   _labels, _use_mfcc_in, _use_ivectors_in, _use_embeddings_in,
                   _use_transcripts_out, _use_accents_out, _batch_size,
                   _num_workers, _mfcc_size, _ivector_size, _embedding_size,
                   _rnn_type, _rnn_hidden_size, _nb_head_layers,
                   _nb_speech_layers, _nb_accents_layers, _bidirectional,
                   _losses_mix, _learning_rate, _lm_path, _decoder_alpha,
                   _decoder_beta, _decoder_cutoff_top_n, _decoder_beam_width,
                   _cuda, _tensorboard_path, _saved_models_path,
                   _bottleneck_size, _accent_loss):

    print(f'\n##### Running experiment {_exp_name} #####')

    # Tools to log values
    results_dict = {}
    results_dict['train_loss'] = []
    results_dict['train_loss_text'] = []
    results_dict['train_loss_accent'] = []
    results_dict['test_loss'] = []
    results_dict['test_loss_text'] = []
    results_dict['test_loss_accent'] = []
    results_dict['test_wer'] = []
    results_dict['test_accent_acc'] = []

    tb_path = Path(_tensorboard_path) / _exp_name
    makedirs(tb_path, exist_ok=True)
    tb_writer = SummaryWriter(tb_path)

    ### DATA LOADING

    # Training set
    train_dataset = MultiDataset(_train_manifest,
                                 _labels,
                                 use_mfcc_in=_use_mfcc_in,
                                 use_ivectors_in=_use_ivectors_in,
                                 use_embeddings_in=_use_embeddings_in,
                                 embedding_size=_embedding_size,
                                 use_transcripts_out=_use_transcripts_out,
                                 use_accents_out=_use_accents_out)

    train_loader = MultiDataLoader(train_dataset,
                                   batch_size=_batch_size,
                                   shuffle=True,
                                   num_workers=_num_workers)

    # Testing set
    test_dataset = MultiDataset(_test_manifest,
                                _labels,
                                use_mfcc_in=_use_mfcc_in,
                                use_ivectors_in=_use_ivectors_in,
                                use_embeddings_in=_use_embeddings_in,
                                embedding_size=_embedding_size,
                                use_transcripts_out=_use_transcripts_out,
                                use_accents_out=_use_accents_out)

    test_loader = MultiDataLoader(test_dataset,
                                  batch_size=_batch_size,
                                  shuffle=True,
                                  num_workers=_num_workers)

    ### CREATE MODEL

    model = MultiTask(use_mfcc_in=_use_mfcc_in,
                      use_ivectors_in=_use_ivectors_in,
                      use_embeddings_in=_use_embeddings_in,
                      use_transcripts_out=_use_transcripts_out,
                      use_accents_out=_use_accents_out,
                      mfcc_size=_mfcc_size,
                      ivector_size=_ivector_size,
                      embedding_size=_embedding_size,
                      rnn_type=_rnn_type,
                      labels=_labels,
                      accents_dict=train_dataset.accent_dict,
                      rnn_hidden_size=_rnn_hidden_size,
                      nb_head_layers=_nb_head_layers,
                      nb_speech_layers=_nb_speech_layers,
                      nb_accents_layers=_nb_accents_layers,
                      bidirectional=_bidirectional,
                      bottleneck_size=_bottleneck_size,
                      DEBUG=False)
    if _cuda:
        model = model.cuda()

    print(model, '\n')
    print('Model parameters counts:', MultiTask.get_param_size(model), '\n')

    ### OPTIMIZER, CRITERION, DECODER

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=_learning_rate)

    # Criterion
    if _use_accents_out:
        if _accent_loss == 'focal':
            AccLoss = FocalLoss()
        elif _accent_loss == 'CE':
            AccLoss = nn.CrossEntropyLoss()
        else:
            raise ValueError(
                f'Loss {_accent_loss} for accent_loss is unknown. Please use either "focal" or "CE".'
            )

    if not _use_transcripts_out:  # only accent classification
        criterion = AccLoss
    elif not _use_accents_out:  # only text recognition
        criterion = nn.CTCLoss()
    else:  # both tasks
        criterion = (nn.CTCLoss(), FocalLoss())

    # Decoder
    if _use_transcripts_out:
        decoder = BeamCTCDecoder(_labels,
                                 lm_path=_lm_path,
                                 alpha=_decoder_alpha,
                                 beta=_decoder_beta,
                                 cutoff_top_n=_decoder_cutoff_top_n,
                                 cutoff_prob=_decoder_cutoff_top_n,
                                 beam_width=_decoder_beam_width,
                                 num_processes=_num_workers)

        target_decoder = GreedyDecoder(_labels)
    else:
        decoder, target_decoder = None, None

    ### EPOCHS
    best_wer = math.inf
    best_acc = 0

    for epoch in range(1, _epochs + 1):
        ### TRAIN
        print(f'Epoch {epoch} training: {_exp_name}')
        train_results = train(model,
                              train_loader,
                              criterion,
                              optimizer,
                              losses_mix=_losses_mix)
        train_loss, train_loss_text, train_loss_accent = train_results

        results_dict['train_loss'].append(train_loss)
        results_dict['train_loss_text'].append(train_loss_text)
        results_dict['train_loss_accent'].append(train_loss_accent)
        print(f'Epoch {epoch} training loss: {train_loss}')

        ### TEST
        print(f'Epoch {epoch} testing')
        test_results = test(model,
                            test_loader,
                            criterion,
                            decoder,
                            target_decoder,
                            losses_mix=_losses_mix)
        test_loss, test_loss_text, test_loss_accent, test_wer, test_accent_acc = test_results

        results_dict['test_loss'].append(test_loss)
        results_dict['test_loss_text'].append(test_loss_text)
        results_dict['test_loss_accent'].append(test_loss_accent)
        results_dict['test_wer'].append(test_wer)
        results_dict['test_accent_acc'].append(test_accent_acc)
        print(f'Epoch {epoch} testing loss: {test_loss}')

        # Add values to tensorboard
        for key, results in results_dict.items():
            tb_writer.add_scalar(key, results[-1], epoch)

        #Save model if it is best
        save_new = False
        if _use_transcripts_out:
            if test_wer < best_wer:
                save_new = True
                best_wer = test_wer
        else:
            if test_accent_acc > best_acc:
                save_new = True
                best_acc = test_accent_acc

        if save_new:
            MultiTask.serialize(
                model,
                Path(_saved_models_path) / _exp_name,
                save=True,
                exp_name=_exp_name,
                optimizer=optimizer,
                epoch=epoch,
                train_losses=results_dict['train_loss'],
                test_losses=results_dict['test_loss'],
                text_train_losses=results_dict['train_loss_text'],
                text_test_losses=results_dict['test_loss_text'],
                text_wers=results_dict['test_wer'],
                accent_train_losses=results_dict['train_loss_accent'],
                accent_test_losses=results_dict['test_loss_accent'],
                accent_accuracies=results_dict['test_accent_acc'])

    del model
    gc.collect()
    torch.cuda.empty_cache()