def run_evaluation(
        test_loader,
        model,
        decoder: Decoder,
        device: torch.device,
        target_decoder: Decoder,
        precision: int = 16,
):
    """Run inference over ``test_loader`` and return aggregate (WER, CER).

    Args:
        test_loader: DataLoader yielding (inputs, targets, input_percentages,
            target_sizes) batches.
        model: acoustic model; called as ``model(inputs, input_sizes)``.
        decoder: decoder used to turn model output into transcripts.
        device: device the input tensors are moved to.
        target_decoder: decoder used to turn targets into reference strings.
        precision: 16 enables mixed-precision autocast (default matches the
            previous hard-coded ``autocast(enabled=True)`` behavior).

    Returns:
        Tuple of (word error rate, character error rate) as computed by the
        ``WordErrorRate`` / ``CharErrorRate`` metric objects.
    """
    model.eval()
    wer = WordErrorRate(decoder=decoder, target_decoder=target_decoder)
    cer = CharErrorRate(decoder=decoder, target_decoder=target_decoder)
    # no_grad: pure inference — avoids building autograd graphs per batch.
    with torch.no_grad():
        for batch in tqdm(test_loader, total=len(test_loader)):
            inputs, targets, input_percentages, target_sizes = batch
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            inputs = inputs.to(device)
            with autocast(enabled=precision == 16):
                out, output_sizes = model(inputs, input_sizes)
            # NOTE: the previous version also ran decoder.decode(out, ...) here
            # and discarded the result; the metric objects decode internally
            # from ``preds``, so that call was redundant work and is removed.
            wer.update(
                preds=out,
                preds_sizes=output_sizes,
                targets=targets,
                target_sizes=target_sizes,
            )
            cer.update(
                preds=out,
                preds_sizes=output_sizes,
                targets=targets,
                target_sizes=target_sizes,
            )
    return wer.compute(), cer.compute()
def run_transcribe(audio_path: str,
                   spect_parser: SpectrogramParser,
                   model: DeepSpeech,
                   decoder: Decoder,
                   device: torch.device,
                   use_half: bool):
    """Transcribe a single audio file with both the given decoder and a
    fresh greedy decoder.

    NOTE(review): this module defines ``run_transcribe`` more than once with
    different signatures; the last definition wins at import time — confirm
    which variant callers actually use.

    Args:
        audio_path: path to the audio file to transcribe.
        spect_parser: parser converting the audio file into a spectrogram.
        model: trained DeepSpeech model.
        decoder: primary decoder (e.g. beam search).
        device: device to run inference on.
        use_half: if True, cast the spectrogram to fp16 before inference.

    Returns:
        Tuple (decoded_output, decoded_output2, decoded_offsets,
        decoded_offsets2) where the ``2``-suffixed values come from a
        greedy decoder built on the model's label set.
    """
    spect = spect_parser.parse_audio(audio_path).contiguous()
    # Model expects a 4-D batch: (batch=1, channel=1, freq, time).
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    spect = spect.to(device)
    if use_half:
        spect = spect.half()
    input_sizes = torch.IntTensor([spect.size(3)]).int()
    out, output_sizes = model(spect, input_sizes)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
    # Additionally decode greedily so callers can compare against the
    # primary (e.g. beam-search) decoder's output.
    decoder2 = GreedyDecoder(labels=model.labels,
                             blank_index=model.labels.index('_'))
    decoded_output2, decoded_offsets2 = decoder2.decode(out, output_sizes)
    return decoded_output, decoded_output2, decoded_offsets, decoded_offsets2
def run_transcribe(audio_path: str,
                   spect_parser: SpectrogramParser,
                   model: DeepSpeech,
                   decoder: Decoder,
                   device: torch.device,
                   precision: int):
    """Transcribe one audio file, optionally under fp16 autocast.

    Args:
        audio_path: path of the audio file to transcribe.
        spect_parser: converts the audio file into a spectrogram tensor.
        model: trained DeepSpeech model.
        decoder: decoder applied to the model's output.
        device: target device for inference.
        precision: 16 enables mixed-precision autocast; anything else
            runs in full precision.

    Returns:
        Tuple of (decoded_output, decoded_offsets) from ``decoder.decode``.
    """
    spectrogram = spect_parser.parse_audio(audio_path).contiguous()
    # Add batch and channel dims: (1, 1, freq, time).
    spectrogram = spectrogram.view(1, 1, spectrogram.size(0), spectrogram.size(1)).to(device)
    lengths = torch.IntTensor([spectrogram.size(3)]).int()
    use_amp = precision == 16
    with autocast(enabled=use_amp):
        out, output_sizes = model(spectrogram, lengths)
    transcripts, offsets = decoder.decode(out, output_sizes)
    return transcripts, offsets
def run_transcribe(audio_path: str,
                   spect_parser: SpectrogramParser,
                   model: DeepSpeech,
                   decoder: Decoder,
                   device: torch.device,
                   use_half: bool):
    """Transcribe one audio file, optionally casting the input to fp16.

    Args:
        audio_path: path of the audio file to transcribe.
        spect_parser: converts the audio file into a spectrogram tensor.
        model: trained DeepSpeech model.
        decoder: decoder applied to the model's output.
        device: target device for inference.
        use_half: if True, cast the spectrogram to half precision.

    Returns:
        Tuple of (decoded_output, decoded_offsets) from ``decoder.decode``.
    """
    spectrogram = spect_parser.parse_audio(audio_path).contiguous()
    # Add batch and channel dims: (1, 1, freq, time).
    spectrogram = spectrogram.view(1, 1, spectrogram.size(0), spectrogram.size(1))
    spectrogram = spectrogram.to(device)
    spectrogram = spectrogram.half() if use_half else spectrogram
    lengths = torch.IntTensor([spectrogram.size(3)]).int()
    out, output_sizes = model(spectrogram, lengths)
    transcripts, offsets = decoder.decode(out, output_sizes)
    return transcripts, offsets
def run_evaluation_print(test_loader,
                         model,
                         decoder: Decoder,
                         device: torch.device,
                         target_decoder: Decoder,
                         precision: int,
                         output_file_path: str = '/root/deepspeech/outputs/valid_output.json'):
    """Evaluate ``model``, dump (prediction, reference) pairs to JSON, and
    return aggregate (WER, CER).

    Args:
        test_loader: DataLoader yielding (inputs, targets, input_percentages,
            target_sizes) batches.
        model: acoustic model; called as ``model(inputs, input_sizes)``.
        decoder: decoder used to produce predicted transcripts.
        device: device the input tensors are moved to.
        target_decoder: decoder used to produce reference strings.
        precision: 16 enables mixed-precision autocast.
        output_file_path: where the {'pred': [...], 'gt': [...]} JSON is
            written (default preserves the previously hard-coded path).

    Returns:
        Tuple of (word error rate, character error rate).
    """
    start_time = time.time()  # track wall-clock inference time
    model.eval()
    wer = WordErrorRate(decoder=decoder, target_decoder=target_decoder)
    cer = CharErrorRate(decoder=decoder, target_decoder=target_decoder)
    pred, gt = [], []
    # no_grad: pure inference — avoids building autograd graphs per batch.
    with torch.no_grad():
        for batch in tqdm(test_loader, total=len(test_loader)):
            inputs, targets, input_percentages, target_sizes = batch
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            inputs = inputs.to(device)
            with autocast(enabled=precision == 16):
                out, output_sizes = model(inputs, input_sizes)
            decoded_output, _ = decoder.decode(out, output_sizes)
            # Targets arrive flattened; split them back into per-utterance
            # chunks using target_sizes before converting to strings.
            split_targets = []
            offset = 0
            for size in target_sizes:
                split_targets.append(targets[offset:offset + size])
                offset += size
            target_strings = target_decoder.convert_to_strings(split_targets)
            # Collect top hypothesis / reference pairs for the JSON dump.
            for hyp, ref in zip(decoded_output, target_strings):
                pred.append(hyp[0])
                gt.append(ref[0])
            wer.update(
                preds=out,
                preds_sizes=output_sizes,
                targets=targets,
                target_sizes=target_sizes,
            )
            cer.update(
                preds=out,
                preds_sizes=output_sizes,
                targets=targets,
                target_sizes=target_sizes,
            )
    print("--- Time taken to infer %s seconds ---" % (time.time() - start_time))
    with open(output_file_path, 'w') as fp:
        json.dump({'pred': pred, 'gt': gt}, fp)
    return wer.compute(), cer.compute()