def train(net, trainLoader, criterion, optimizer):
    """Run one training epoch over ``trainLoader``.

    Computes a cross-entropy-style loss over the model logits, accumulates a
    running phoneme error rate (PER), and prints averaged statistics ten
    times per epoch.

    Args:
        net: model; called as ``net(feature, utterance)``.
        trainLoader: iterable of dict batches with keys 'feature',
            'feat_len', 'utterance', 'utter_len'.
        criterion: loss taking ``(flattened_logits, flattened_targets)``.
        optimizer: torch optimizer for ``net``'s parameters.
    """
    net.train()
    running_loss = 0.0
    running_per = 0.0
    for batchIdx, batch in enumerate(trainLoader):
        feature = batch['feature'].cuda()
        feat_len = batch['feat_len'].cuda()    # not passed to the model here
        utterance = batch['utterance'].cuda()
        utter_len = batch['utter_len'].cuda()  # not used below
        optimizer.zero_grad()
        logits = net(feature, utterance)
        # BUG FIX: the original referenced undefined `log_logits`
        # (NameError at runtime); the tensor produced above is `logits`.
        loss = criterion(logits.view(-1, logits.size(-1)),
                         utterance.view(-1))
        running_loss += loss.item()
        preds = logits.max(-1)[1]
        # Drop padding tokens before scoring; shorter sequences also make the
        # edit-distance computation cheaper.
        utterance = [u[u != config.data.pad_idx] for u in utterance]
        running_per += np.array([wer(*z) for z in zip(utterance, preds)]).mean()
        loss.backward()
        optimizer.step()
        N = len(trainLoader) // 10
        if batchIdx % N == N - 1:
            print(
                f'batch: {batchIdx} | loss: {running_loss/N} | per: {running_per/N}'
            )
            running_loss = 0.0
            running_per = 0.0
def validate(model, x, y_true, input_len, label_len, y_strings, test=False, save_file=None):
    """Evaluate ``model`` on one batch.

    Returns ``(mean CTC loss, mean WER)``. When ``test`` is true, writes each
    reference/prediction pair to ``save_file``.
    """
    input_len = np.expand_dims(input_len, axis=1)
    label_len = np.expand_dims(label_len, axis=1)
    y_pred = model(x)
    loss = ctc_batch_cost(y_true, y_pred, input_len, label_len)
    # ctc_decode expects a flat vector of sequence lengths again.
    input_len = np.squeeze(input_len)
    y_decode = ctc_decode(y_pred, input_len)[0][0]
    accuracy = 0.0
    for idx, reference in enumerate(y_strings):
        hypothesis = indices_to_string(y_decode[idx].numpy())
        accuracy += wer(hypothesis, reference)
        if test:
            save_file.write("Correct Sentence:" + str(reference) + "\n")
            save_file.write("Predicted Sentence:" + hypothesis + "\n")
    return tf.reduce_mean(loss), accuracy / len(y_strings)
def evaluate(epoch, model, optimizer, error, dataloader, idx2char):
    """Run one validation pass: accumulate CTC loss, CER and WER, log the
    averages to wandb, and print a summary line."""
    model.eval()
    loss = 0
    correct = 0  # kept from original; not updated below
    cer_scores, wer_scores = [], []
    with torch.no_grad():
        for _, (images, labels, target_len, padded_len) in enumerate(dataloader):
            images, labels = images.to(device), labels.to(device)
            log_probs = F.log_softmax(model(images), dim=2)
            loss += error(log_probs, labels, padded_len, target_len).item()
            decoded_preds, decoded_targets = GreedyDecoder(
                log_probs.transpose(0, 1), labels, target_len, idx2char)
            for pred, target in zip(decoded_preds, decoded_targets):
                cer_scores.append(cer(target, pred))
                wer_scores.append(wer(target, pred))
    avg_cer = sum(cer_scores) / len(cer_scores)
    avg_wer = sum(wer_scores) / len(wer_scores)
    loss /= len(dataloader)
    wandb.log({"Val loss": loss, "WER": avg_wer, "CER": avg_cer})
    print(
        'Test set: Average loss: {:.4f}, Average CER: {:4f} Average WER: {:.4f}\n'
        .format(loss, avg_cer, avg_wer))
def train_epoch(model, optimizer, dataloader, CTCLoss, device, melspec_transforms):
    """Train for one epoch and return the mean batch loss.

    Every 100 batches, logs the current loss and the CER/WER of the first
    sample in the batch to wandb.
    """
    model.train()
    losses = []
    for step, (wavs, wavs_len, answ, answ_len) in tqdm(enumerate(dataloader)):
        wavs, answ = wavs.to(device), answ.to(device)
        # Log-mel features; the epsilon guards against log(0).
        feats = torch.log(melspec_transforms(wavs) + 1e-9)
        optimizer.zero_grad()
        out = model(feats)
        out = F.log_softmax(out, dim=1)
        # Reorder dims so time comes first for CTC
        # (assumes model output is (batch, classes, time) -- TODO confirm).
        out = out.transpose(0, 1).transpose(0, 2)
        loss = CTCLoss(out, answ, wavs_len, answ_len)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 15)
        optimizer.step()
        losses.append(loss.item())
        if step % 100 == 0:
            wandb.log({'mean_train_loss': loss})
            preds, targets = decoder_func(out, answ, answ_len, del_repeated=False)
            wandb.log({"CER_train": cer(targets[0], preds[0])})
            wandb.log({"WER_train": wer(targets[0], preds[0])})
    return np.mean(losses)
def evaluate(net, devLoader, criterion):
    """Validate with a CTC criterion.

    Returns ``(avg loss per batch, avg WER per batch)`` over ``devLoader``.
    """
    net.eval()
    total_loss, total_wer = 0.0, 0.0
    with torch.no_grad():
        for batch in devLoader:
            feats = batch['feature'].cuda()
            feats_len = batch['feat_len'].cuda()
            targets = batch['utterance'].cuda()
            targets_len = batch['utter_len'].cuda()
            logits, feats_len = net(feats, feats_len)
            # CTC loss expects (time, batch, classes) log-probabilities.
            log_probs = F.log_softmax(logits, dim=-1).transpose(0, 1)
            total_loss += criterion(log_probs, targets, feats_len, targets_len).item()
            hyps = log_probs.max(-1)[1].transpose(0, 1)
            # Greedy CTC decode: collapse repeats, then drop blanks.
            hyps = [[tok for tok, _ in groupby(seq) if tok != config.data.blank_idx]
                    for seq in hyps]
            refs = [t[t != config.data.pad_idx] for t in targets]
            total_wer += np.array([wer(r, h) for r, h in zip(refs, hyps)]).mean()
    return total_loss / len(devLoader), total_wer / len(devLoader)
def train(net, trainLoader, criterion, optimizer, epoch):
    """Run one CTC training epoch.

    When ``args.resume_training`` is set, Gaussian weight noise (sigma taken
    from ``config.model.ctc.sigma``) is added in-place to the encoder RNN
    weights before each forward pass. Prints running loss/WER ten times per
    epoch.
    """
    net.train()
    running_loss, running_wer = 0.0, 0.0
    # Loop-invariant: report ten times per epoch.
    report_every = len(trainLoader) // 10
    for step, batch in enumerate(trainLoader):
        feats = batch['feature'].cuda()
        feats_len = batch['feat_len'].cuda()
        targets = batch['utterance'].cuda()
        targets_len = batch['utter_len'].cuda()
        optimizer.zero_grad()
        if args.resume_training:
            # Weight-noise regularization on the encoder RNN.
            rnn = net.encoder.rnn
            sigma = config.model.ctc.sigma
            rnn.weight_hh_l0.data.add_(
                torch.normal(0, sigma, size=rnn.weight_hh_l0.shape).cuda())
            rnn.weight_ih_l0.data.add_(
                torch.normal(0, sigma, size=rnn.weight_ih_l0.shape).cuda())
        logits, feats_len = net(feats, feats_len)
        log_probs = F.log_softmax(logits, dim=-1).transpose(0, 1)
        loss = criterion(log_probs, targets, feats_len, targets_len)
        running_loss += loss.item()
        hyps = log_probs.max(-1)[1].transpose(0, 1)
        # Greedy CTC decode: collapse repeats, then drop blanks.
        hyps = [[tok for tok, _ in groupby(seq) if tok != config.data.blank_idx]
                for seq in hyps]
        refs = [t[t != config.data.pad_idx] for t in targets]
        running_wer += np.array([wer(r, h) for r, h in zip(refs, hyps)]).mean()
        loss.backward()
        optimizer.step()
        if step % report_every == report_every - 1:
            print(
                f'epoch: {epoch} | batch: {step} | loss: {running_loss/report_every} | wer: {running_wer/report_every}'
            )
            running_loss, running_wer = 0.0, 0.0
def evaluate(net, devLoader):
    """Greedy-decode the dev set and return the mean per-batch PER."""
    net.eval()
    total_per = 0.0
    with torch.no_grad():
        for batch in devLoader:
            feats = batch['feature'].cuda()
            feats_len = batch['feat_len'].cuda()
            refs = batch['utterance'].cuda()
            refs_len = batch['utter_len'].cuda()  # moved to GPU but unused below
            hyps = net.best_path_decode(feats, feats_len)
            # Collapse consecutive repeats in each decoded sequence.
            hyps = [[tok for tok, _ in groupby(seq)] for seq in hyps]
            refs = [r[r != config.data.pad_idx] for r in refs]
            total_per += np.array([wer(r, h) for r, h in zip(refs, hyps)]).mean()
    return total_per / len(devLoader)
def test(self, batch_size=4):
    """Evaluate ``self.net`` on up to 10 batches of LibriSpeech test-clean,
    append the averaged loss/CER/WER to the log file, and print them."""
    self.net.eval()
    # Alternative corpus kept for reference:
    # test = torchaudio.datasets.COMMONVOICE(root='/media/gussim/SlaveDisk/MCV', version='cv-corpus-6.1-2020-12-11', download=False)
    dataset = torchaudio.datasets.LIBRISPEECH("./", url="test-clean", download=False)
    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=batch_size,
                                         shuffle=True,
                                         collate_fn=self.collate_fn)
    test_loss = 0
    test_cer, test_wer = [], []
    with torch.no_grad():
        batches_seen = 0
        for spectrograms, labels, input_lengths, label_lengths in loader:
            spectrograms = spectrograms.to(self.device)
            labels = labels.to(self.device)
            output = self.net(spectrograms)  # batch, time, num_class
            output = F.log_softmax(output, dim=2)
            output = output.transpose(0, 1)  # time, batch, num_class
            loss = self.criterion(output, labels, input_lengths, label_lengths)
            test_loss += loss.item() / len(loader)
            decoded_preds, decoded_targets = textprocess.greedy_decoder_label(
                output.transpose(0, 1), labels, label_lengths)
            for pred, target in zip(decoded_preds, decoded_targets):
                test_cer.append(utils.cer(target, pred))
                test_wer.append(utils.wer(target, pred))
            # Running averages; the final iteration's values are reported.
            avg_cer = sum(test_cer) / len(test_cer)
            avg_wer = sum(test_wer) / len(test_wer)
            batches_seen += 1
            if batches_seen == 10:
                break
    summary = ('Test set: Average loss: {:.4f}, Average CER: {:4f} Average WER: {:.4f}\n'
               .format(test_loss, avg_cer, avg_wer))
    with open(self.logpath, 'a') as log_file:
        log_file.write(summary)
    print(summary)
def evaluate(net, devLoader, criterion):
    """Seq2seq validation with teacher forcing disabled.

    Returns ``(avg loss per batch, avg PER per batch)`` over ``devLoader``.
    """
    net.eval()
    total_loss, total_per = 0.0, 0.0
    with torch.no_grad():
        for batch in devLoader:
            feats = batch['feature'].cuda()
            feats_len = batch['feat_len'].cuda()      # moved to GPU, unused below
            targets = batch['utterance'].cuda()
            targets_len = batch['utter_len'].cuda()   # moved to GPU, unused below
            # teacher_forcing_ratio=0: decode purely from model predictions.
            logits = net(feats, targets, teacher_forcing_ratio=0)
            total_loss += criterion(logits.view(-1, logits.size(-1)),
                                    targets.view(-1)).item()
            hyps = logits.max(-1)[1]
            refs = [t[t != config.data.pad_idx] for t in targets]
            total_per += np.array([wer(r, h) for r, h in zip(refs, hyps)]).mean()
    return total_loss / len(devLoader), total_per / len(devLoader)
def train_one_step(model, optimizer, x, y_true, input_len, label_len, y_strings):
    """Perform a single gradient step with CTC loss.

    Returns ``(mean CTC loss, mean WER)`` for the batch.
    """
    input_len = np.expand_dims(input_len, axis=1)
    label_len = np.expand_dims(label_len, axis=1)
    with tf.GradientTape() as tape:
        y_pred = model(x)
        loss = ctc_batch_cost(y_true, y_pred, input_len, label_len)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    # ctc_decode expects a flat vector of sequence lengths again.
    input_len = np.squeeze(input_len)
    decoded = ctc_decode(y_pred, input_len)[0][0]
    accuracy = 0.0
    for idx, reference in enumerate(y_strings):
        hypothesis = indices_to_string(decoded[idx].numpy())
        accuracy += wer(hypothesis, reference)
    return tf.reduce_mean(loss), accuracy / len(y_strings)
def main():
    """Build CTM files from the service and DS2 transcripts, run ROVER
    combination, and report WER/CER (optionally grid-searching the ROVER
    confidence parameter)."""
    global RVR_DIR, fnames
    args = parse_args()
    RVR_DIR = args.rover
    df = pd.read_csv(args.data, delimiter='\t')
    print('creating ctm files...')
    # create ds2 and service ctm file
    with open('ds2.ctm', 'w') as fd_ds2, open('service.ctm', 'w') as fd_ser:
        for _, row in df.iterrows():
            fname = row['fname']
            ser_transcript, ser_conf = row[[
                'service_transcript', 'service_confs'
            ]]
            ds2_transcript, ds2_conf = row[['ds2_transcript', 'ds2_confs']]
            assert len(ser_transcript.split()) == len(ser_conf.split())
            assert len(ds2_transcript.split()) == len(ds2_conf.split())
            # BUG FIX: the original wrote service words into ds2.ctm and DS2
            # words into service.ctm; each transcript now goes into the file
            # named after its system.
            for w, c in zip(ds2_transcript.split(), ds2_conf.split()):
                fd_ds2.write('{} a 0.0 0.0 {} {}\n'.format(fname, w, c))
            for w, c in zip(ser_transcript.split(), ser_conf.split()):
                fd_ser.write('{} a 0.0 0.0 {} {}\n'.format(fname, w, c))
    # Sort each file in the order required by the CTM format.
    for ctm in ('ds2.ctm', 'service.ctm'):
        bashCommand = "sort +0 -1 +1 -2 +2nb -3 -s -o {0} {0}".format(ctm)
        process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
        process.communicate()
    print('created ctm files.\nGenerating rover output...')
    fnames = df['fname'].to_list()
    references = df['reference'].to_list()
    ser_transcripts = df['service_transcript'].to_list()
    ds2_transcripts = df['ds2_transcript'].to_list()
    if args.finetune:
        # Grid-search the ROVER confidence parameter in parallel.
        p = Pool(multiprocessing.cpu_count())
        params_grid = np.linspace(args.conf_from, args.conf_to, args.num_conf)
        scores = []
        for preds in tqdm(p.imap(call_rover, params_grid),
                          total=len(params_grid)):
            scores.append(wer(references, preds))
        min_results = min(zip(params_grid, scores), key=lambda x: x[1])
        print("Best Params:\nConf : %f \nWER: %f" % tuple(min_results))
    else:
        rover_transcripts = call_rover(args.conf)
        print("\nNumber Utterances : {}".format(len(references)))
        # NOTE: the format string below was split mid-token in the original
        # paste; reconstructed as one literal.
        print("SER WER = {}, CER = {}\nDS2 WER = {}, CER = {}\nRVR WER = {}, CER = {}\n"
              .format(wer(references, ser_transcripts),
                      cer(references, ser_transcripts),
                      wer(references, ds2_transcripts),
                      cer(references, ds2_transcripts),
                      wer(references, rover_transcripts),
                      cer(references, rover_transcripts)))
    os.remove('service.ctm')
    os.remove('ds2.ctm')
def corpus_wer(r, h):
    """Return the mean word error rate over paired reference/hypothesis lists.

    Fixes two Python-3 defects in the original:
    * ``lambda (a, b): ...`` tuple-parameter unpacking was removed by
      PEP 3113 and is a SyntaxError on Python 3.
    * ``map`` is a lazy iterator on Python 3, and ``np.mean`` of a bare
      iterator does not average the individual scores.
    """
    from utils import wer
    return np.mean([wer(a, b) for a, b in zip(r, h)])
def test(model, optimizer, dataloader, CTCLoss, device, melspec, bs_width=None):
    """Evaluate on ``dataloader`` and log mean loss / CER / WER to wandb.

    If ``bs_width`` is given, each batch is additionally beam-search decoded
    (the first sample is printed) and beam-search CER/WER are accumulated.
    """
    model.eval()
    cers, wers, cers_bs, wers_bs = [], [], [], []
    losses = []
    with torch.no_grad():
        # FIX: the original reused `i` for both the batch loop and the
        # per-sample loop below; distinct names remove the shadowing.
        for batch_idx, (wavs, wavs_len, answ, answ_len) in enumerate(dataloader):
            wavs, answ = wavs.to(device), answ.to(device)
            trans_wavs = torch.log(melspec(wavs) + 1e-9)
            output = model(trans_wavs)
            if bs_width is not None:  # idiom fix: `is not None` over `!= None`
                output_bs = F.softmax(output, dim=1).transpose(0, 1).transpose(0, 2)
                preds_bs, targets_bs = beam_search_decoding(output_bs,
                                                            answ,
                                                            answ_len,
                                                            width=bs_width)
            output = F.log_softmax(output, dim=1)
            output = output.transpose(0, 1).transpose(0, 2)
            loss = CTCLoss(output, answ, wavs_len, answ_len)
            losses.append(loss.item())
            # argmax (greedy) decoding
            preds, targets = decoder_func(output, answ, answ_len, del_repeated=True)
            for j in range(len(preds)):
                if j == 0:
                    print('target: ', ''.join(targets[j]))
                    print('prediction: ', ''.join(preds[j]))
                cers.append(cer(targets[j], preds[j]))
                wers.append(wer(targets[j], preds[j]))
                if bs_width is not None and j == 0:
                    print('beamS pred:', ''.join(preds_bs[j]))
                    cers_bs.append(cer(targets_bs[j], preds_bs[j]))
                    wers_bs.append(wer(targets_bs[j], preds_bs[j]))
    avg_cer = np.mean(cers)
    avg_wer = np.mean(wers)
    if bs_width is not None:
        # Computed but not logged, matching the original behavior.
        avg_cer_bs = np.mean(cers_bs)
        avg_wer_bs = np.mean(wers_bs)
    wandb.log({"CER_val": avg_cer})
    wandb.log({"WER_val": avg_wer})
    avg_loss = np.mean(losses)
    print('average test loss is', avg_loss)
    wandb.log({'mean_VAL_loss': avg_loss})
def main(): urls = ['gs://<folder>/%s_part.wav' % i for i in xrange(1, 5)] # put your GS files in this list speech_service = get_speech_service() operations = [] transcripts = [] global_timer = time.time() for url in urls: service_request = speech_service.speech().asyncrecognize( body={ 'config': { 'encoding': 'LINEAR16', 'sampleRate': 16000, 'languageCode': 'en-US' # a BCP-47 language tag }, 'audio': { 'uri': url } }) response = service_request.execute() operations.append(response.get('name')) confidences = [] for o in operations: response = None attempt = 0 start = time.time() while True: operation = speech_service.operations().get(name=str(o)) response = operation.execute() if 'error' in response: print response continue if 'response' in response and response.get('metadata').get( 'progressPercent') == 100: print 'Operation %s took %s sec' % (o, time.time() - start) break else: attempt += 1 print 'Operation %s attempt %s' % (o, attempt) time.sleep(10) # debounce timeout for 10 secs if 'results' not in response.get('response'): continue # collect all results _results = [] for each in response.get('response').get('results'): # pick alternative with the highest confidence alt_highest_confidence = max(each['alternatives'], key=lambda x: x['confidence']) _results.append(alt_highest_confidence['transcript']) confidences.append(alt_highest_confidence['confidence']) transcript_one_line = ' '.join(_results) transcripts.append(transcript_one_line) print print 'Recognition process took %s sec' % (time.time() - global_timer) print recognized_transcript = ' '.join(transcripts) print '%s frames in recognition' % len(transcripts) print print 'Average confidence of recognized transcripts is %s' % ( sum(confidences) / len(confidences)) with open(ORIGIN_TRANSCRIPT, 'rb') as f: origin_transcript = f.read() origin_transcript = origin_transcript.decode('UTF-8') print '%s words recognized' % len( re.findall(r'\w+', recognized_transcript)) print 'Recognized transcript: \n%s' % 
recognized_transcript print print '%s words in origin' % len(re.findall(r'\w+', origin_transcript)) print 'Origin transcript: \n%s' % origin_transcript print print 'Diff %s' % utils.find_diff(recognized_transcript, origin_transcript) # WER = (I + D + S) / N print 'WER %s%%' % utils.wer(recognized_transcript.split(), origin_transcript.split()) # write recognized transcript into file f = open(os.path.join(os.getcwd(), 'leo', 'recognized_g.txt'), 'w+') f.write(recognized_transcript) f.close()
def main():
    """CLI entry point: apply FineMerge to DS2 probabilities using service
    transcripts, decode the merged probs, report WERs, and dump results."""
    parser = argparse.ArgumentParser(
        description="Generate modified transcripts using FineMerge")
    parser.add_argument(
        "--dataset",
        help="Path to preprocessed data pickle file",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--params_config",
        help="Path to json config file containing param values",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--utterances",
        help="Path to file containing list of tab separated utterance,"
        " reference to generate modified transcripts",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--output_path",
        help="Path to save file containing modified transcipt",
        type=str,
        default='final_preds.txt',
    )
    parser.add_argument(
        "--labels",
        help=
        "Path to labels json files containing ordered list of output labels mapping to ds2 probs",
        type=str,
        required=False,
        default='labels_char.json',
    )
    parser.add_argument(
        "--lm_path",
        help="Path to arpa lm file to use while decoding",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--save_modified_probs",
        help="Path to save the new probs",
        type=str,
        required=False,
        default=None,
    )
    args = parser.parse_args()

    # Globals: presumably read by worker functions such as
    # get_merged_transcript when run through the process pool.
    global labels
    with open(args.labels) as label_file:
        labels = json.load(label_file)
    global params
    with open(args.params_config) as params_file:
        params = json.load(params_file)
    global data
    data = np.load(args.dataset, allow_pickle=True)

    df_utt = pd.read_csv(args.utterances, delimiter='\t')
    df_utt = df_utt[df_utt['file_name'].isin(data.keys())]  #TODO
    utterances = df_utt['file_name'].to_list()
    references = df_utt['transcript'].to_list()
    references = [normalize_string(text, labels[1:]) for text in references]
    # references = [parse_text(text) for text in references]
    service_transcripts = [
        data[utt]['service_transcript'] for utt in utterances
    ]
    probs_list = [data[utt]['ds2_probs'] for utt in utterances]

    print('Getting transcripts for DS2...')
    ds2_transcripts = ctc_beam_decode(probs_list, labels, args.lm_path,
                                      labels.index('_'),
                                      params['ds2_lm_alpha'],
                                      params['ds2_lm_beta'],
                                      params['beam_size'])
    print("Applying FineMerge to DS2 probs using service transcripts...")
    with Pool(multiprocessing.cpu_count()) as pool:
        new_probs_list = list(
            tqdm(pool.imap(get_merged_transcript, utterances),
                 total=len(utterances)))
    if args.save_modified_probs:
        np.save(args.save_modified_probs, list(zip(utterances, new_probs_list)))

    print("Getting the final transcripts...")
    new_transcripts = ctc_beam_decode(new_probs_list, labels, args.lm_path,
                                      labels.index('_'), params['lm_alpha'],
                                      params['lm_beta'], params['beam_size'])
    print("\nNumber utterances : {}".format(len(references)))
    print("SER WER: {}".format(wer(references, service_transcripts)))
    print("DS2 WER: {}".format(wer(references, ds2_transcripts)))
    print("NEW WER : {}\n".format(wer(references, new_transcripts)))
    with open(args.output_path, 'w') as fd:
        for utt, ref, ser, ds2, pred in zip(utterances, references,
                                            service_transcripts,
                                            ds2_transcripts, new_transcripts):
            fd.write("UTT: {}\nREF: {}\nSER: {}\nDS2: {}\nNEW: {}\n\n".format(
                utt, ref, ser, ds2, pred))
def main(args): speech_service = get_speech_service() transcripts = [] confidences = [] global_timer = time.time() audio_files = glob.glob(r'%s/*part_[0-9]*.%s' % (args.path, args.format)) audio_files.sort(key=sort_fileparts) for audio_file in audio_files: current = os.path.basename(audio_file) with open(audio_file, 'rb') as f: content = base64.b64encode(f.read()) service_request = speech_service.speech().syncrecognize( body={ 'config': { 'encoding': 'FLAC' if args.format == 'flac' else 'LINEAR16', 'sampleRate': 16000, 'languageCode': 'en-US' # a BCP-47 language tag }, 'audio': { 'content': content.decode('UTF-8') } }) response = service_request.execute() if 'error' in response or 'results' not in response or not response: print '%s: NOT recognized, full response is %s' % (current, response) continue print '%s: %s' % (current, 'recognized') # collect all results _results = [] for each in response['results']: # pick alternative with the highest confidence alt_highest_confidence = max(each['alternatives'], key=lambda x: x['confidence']) _results.append(alt_highest_confidence['transcript']) confidences.append(alt_highest_confidence['confidence']) transcript_one_line = ' '.join(_results) transcripts.append(transcript_one_line) f.close() print print 'Recognition process took %s sec' % (time.time() - global_timer) print recognized_transcript = ' '.join(transcripts) print '%s frames in recognition' % len(transcripts) print print 'Average confidence of recognized transcripts is %s' % ( sum(confidences) / len(confidences)) with open(ORIGIN_TRANSCRIPT, 'rb') as f: origin_transcript = f.read() origin_transcript = origin_transcript.decode('UTF-8') print '%s words recognized' % len( re.findall(r'\w+', recognized_transcript)) print 'Recognized transcript: \n%s' % recognized_transcript print print '%s words in origin' % len(re.findall(r'\w+', origin_transcript)) print 'Origin transcript: \n%s' % origin_transcript print print 'Diff %s' % utils.find_diff(recognized_transcript, 
origin_transcript) # WER = (I + D + S) / N print 'WER %s%%' % utils.wer(recognized_transcript.split(), origin_transcript.split()) # write recognized transcript into file f = open(os.path.join(os.getcwd(), 'leo', 'recognized_g_sync.txt'), 'w+') f.write(recognized_transcript) f.close()