def apply_augment(self):
    """
    Apply Spec-Augmentation

    Comment:
        - **audio_paths**: [KaiSpeech_135248.pcm, KaiSpeech_453892.pcm, ..., KaiSpeech_357891.pcm]
        - **label_paths**: [KaiSpeech_135248.txt, KaiSpeech_453892.txt, ..., KaiSpeech_357891.txt]
        - **is_augment**: [True, False, ..., False]

    Apply SpecAugmentation if is_augment[idx] == True, otherwise it doesn't.

        0 ............. augment_end_idx ............. len(self.audio_paths)
        |---- hparams.augment_ratio ----|-------------- else -------------|
    """
    augment_end_idx = int(len(self.audio_paths) * self.augment_ratio)
    logger.info("Applying Augmentation...")

    for idx in range(augment_end_idx):
        self.is_augment.append(True)
        self.audio_paths.append(self.audio_paths[idx])
        self.label_paths.append(self.label_paths[idx])

    # shuffle after appending the data to be Spec-Augmented
    tmp = list(zip(self.audio_paths, self.label_paths, self.is_augment))
    random.shuffle(tmp)
    self.audio_paths, self.label_paths, self.is_augment = zip(*tmp)
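# A worked check of the arithmetic above, assuming a hypothetical dataset of
# 1,000 utterances and augment_ratio = 0.3: the first 300 paths are duplicated
# with is_augment=True, growing the dataset to 1,300 entries before shuffling.
audio_paths = ['KaiSpeech_%06d.pcm' % i for i in range(1000)]  # hypothetical names
augment_ratio = 0.3
augment_end_idx = int(len(audio_paths) * augment_ratio)
assert augment_end_idx == 300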
def get_label(label_path, bos_id=2037, eos_id=2038, target_dict=None):
    """
    Provides a specific file's label as a list.

    Inputs: label_path, bos_id, eos_id, target_dict
        - **label_path**: specific path of the label file
        - **bos_id**: id of <s>
        - **eos_id**: id of </s>
        - **target_dict**: dictionary of filenames and labels
            Format : {KaiSpeech_label_FileNum : '5 0 49 4 0 8 190 0 78 115', ... }

    Outputs: label
        - **label**: list of bos + sequence of label ids + eos
            Format : [<s>, 5, 0, 49, 4, 0, 8, 190, 0, 78, 115, </s>]
    """
    if target_dict is None:
        # guard: without target_dict there is nothing to look up
        logger.info("target_dict is None")
        raise ValueError("target_dict must be provided")

    key = label_path.split('/')[-1].split('.')[0]
    script = target_dict[key]
    tokens = script.split(' ')

    label = list()
    label.append(int(bos_id))
    for token in tokens:
        label.append(int(token))
    label.append(int(eos_id))

    del script, tokens  # memory deallocation
    return label
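# Usage sketch for get_label() with a hypothetical target_dict entry; the
# lookup key is the label file's name without directory or extension.
target_dict = {'KaiSpeech_label_135248': '5 0 49 4 0 8 190 0 78 115'}  # hypothetical
label = get_label('dataset/KaiSpeech_label_135248.txt', bos_id=2037, eos_id=2038,
                  target_dict=target_dict)
assert label == [2037, 5, 0, 49, 4, 0, 8, 190, 0, 78, 115, 2038]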
def get_librosa_mfcc(filepath=None, n_mfcc=33, del_silence=False, input_reverse=True, format='pcm'):
    """
    Provides Mel-Frequency Cepstral Coefficients (MFCC) for speech recognition.

    Args:
        filepath: specific path of the audio file
        n_mfcc: number of mel filters
        del_silence: flag indicating whether to delete silence (default: False)
        input_reverse: flag indicating whether to reverse the input (default: True)
        format: file format, e.g. pcm, wav (default: pcm)

    Comment:
        sample rate: the A.I Hub dataset's sample rate is 16,000
        frame length: 25ms
        stride: 10ms
        overlap: 15ms
        window: Hamming window
        n_fft = sr * frame_length (16,000 * 25ms = 400)
        hop_length = sr * stride (16,000 * 10ms = 160)

    Outputs:
        mfcc: MFCC values of the signal
    """
    if format == 'pcm':
        try:
            pcm = np.memmap(filepath, dtype='h', mode='r')
        except Exception:  # exception handling for unreadable files
            logger.info("np.memmap error in %s" % filepath)
            return torch.zeros(1)
        sig = np.array([float(x) for x in pcm])
    elif format == 'wav':
        sig, _ = librosa.core.load(filepath, sr=16000)
    else:
        logger.info("%s is not supported" % format)
        return torch.zeros(1)  # signal the caller that loading failed

    if del_silence:
        non_silence_indices = librosa.effects.split(sig, top_db=30)
        sig = np.concatenate([sig[start:end] for start, end in non_silence_indices])

    feat = librosa.feature.mfcc(y=sig, sr=16000, hop_length=160, n_mfcc=n_mfcc,
                                n_fft=400, window='hamming')

    if input_reverse:
        feat = feat[:, ::-1]

    return torch.FloatTensor(np.ascontiguousarray(np.swapaxes(feat, 0, 1)))
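# A worked check of the framing arithmetic in the comment above, assuming the
# 16 kHz sample rate of the dataset: a 25ms frame gives n_fft = 400 samples
# and a 10ms stride gives hop_length = 160 samples, matching the constants
# passed to librosa.feature.mfcc().
sr = 16000
n_fft = int(sr * 0.025)       # 400 samples per 25ms frame
hop_length = int(sr * 0.010)  # 160 samples per 10ms stride
assert (n_fft, hop_length) == (400, 160)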
def evaluate(model, queue, loss_func, device):
    logger.info('evaluate() start')
    total_loss = 0.
    total_num = 0
    total_dist = 0
    total_length = 0
    total_sent_num = 0

    model.eval()

    with torch.no_grad():
        while True:
            feats, scripts, feat_lengths, script_lengths = queue.get()
            if feats.shape[0] == 0:  # empty batch signals the end of evaluation
                break

            feats = feats.to(device)
            scripts = scripts.to(device)
            target = scripts[:, 1:]  # drop the <s> token from the target

            model.module.flatten_parameters()
            logit = model(feats, feat_lengths, scripts, teacher_forcing_ratio=0.0)
            logit = torch.stack(logit, dim=1).to(device)
            y_hat = logit.max(-1)[1]

            loss = loss_func(logit.contiguous().view(-1, logit.size(-1)), target.contiguous().view(-1))
            total_loss += loss.item()
            total_num += sum(feat_lengths)

            display = random.randrange(0, 100) == 0  # display roughly 1% of batches
            dist, length = get_distance(target, y_hat, display=display)
            total_dist += dist
            total_length += length
            total_sent_num += target.size(0)

    logger.info('evaluate() completed')
    return total_loss / total_num, total_dist / total_length
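# A minimal sketch of the queue protocol evaluate() expects; the tensor shapes
# here are hypothetical (batch=4, 100 frames, 33 MFCCs, 20 tokens). Each item
# is (feats, scripts, feat_lengths, script_lengths), and a batch whose first
# dimension is 0 acts as the end-of-data sentinel.
from queue import Queue
import torch

eval_queue = Queue()
eval_queue.put((torch.randn(4, 100, 33), torch.randint(0, 2039, (4, 20)),
                [100] * 4, [20] * 4))                     # one hypothetical batch
eval_queue.put((torch.zeros(0), torch.zeros(0), [], []))  # sentinel stops the loop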
def get_item(self, idx):
    label = get_label(self.label_paths[idx], self.bos_id, self.eos_id, self.target_dict)
    feat = get_librosa_mfcc(self.audio_paths[idx], n_mfcc=33, del_silence=False,
                            input_reverse=self.input_reverse, format='pcm')

    # exception handling: get_librosa_mfcc() returns torch.zeros(1) on failure
    if feat.size(0) == 1:
        logger.info("Delete label_paths : %s" % self.label_paths[idx])
        label = ''
        return feat, label

    if self.is_augment[idx]:
        feat = spec_augment(feat, T=40, F=15, time_mask_num=2, freq_mask_num=2)

    return feat, label
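# A hedged sketch of how a caller might skip failed items, relying on the
# convention above that a failed audio load yields a 1-frame feat and an
# empty label (`dataset` is a hypothetical instance of this class):
feat, label = dataset.get_item(0)
if feat.size(0) == 1 and label == '':
    pass  # audio file could not be read; drop this sample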
def logger_hparams(self):
    logger.info("use_bidirectional : %s" % str(self.use_bidirectional))
    logger.info("use_attention : %s" % str(self.use_attention))
    logger.info("use_pickle : %s" % str(self.use_pickle))
    logger.info("use_augment : %s" % str(self.use_augment))
    logger.info("use_pyramidal : %s" % str(self.use_pyramidal))
    logger.info("augment_ratio : %0.2f" % self.augment_ratio)
    logger.info("input_reverse : %s" % str(self.input_reverse))
    logger.info("hidden_size : %d" % self.hidden_size)
    logger.info("listener_layer_size : %d" % self.listener_layer_size)
    logger.info("speller_layer_size : %d" % self.speller_layer_size)
    logger.info("dropout : %0.2f" % self.dropout)
    logger.info("batch_size : %d" % self.batch_size)
    logger.info("worker_num : %d" % self.worker_num)
    logger.info("max_epochs : %d" % self.max_epochs)
    logger.info("learning rate : %0.4f" % self.lr)
    logger.info("teacher_forcing_ratio : %0.2f" % self.teacher_forcing)
    logger.info("seed : %d" % self.seed)
    logger.info("max_len : %d" % self.max_len)
    logger.info("use_cuda : %s" % str(self.use_cuda))
    logger.info("save_name : %s" % self.save_name)
    logger.info("mode : %s" % self.mode)
def load_pickle(filepath, message=""):
    with open(filepath, "rb") as f:
        load_result = pickle.load(f)
    logger.info(message)
    return load_result
def save_pickle(save_var, savepath, message=""):
    with open(savepath, "wb") as f:
        pickle.dump(save_var, f)
    logger.info(message)
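# Round-trip usage sketch for save_pickle()/load_pickle(); the cache path and
# the saved object are hypothetical:
save_pickle({'audio_paths': [], 'label_paths': []}, 'cache/dataset.bin',
            message="dataset pickle saved")
dataset = load_pickle('cache/dataset.bin', message="dataset pickle loaded")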
def load_model(filepath):
    logger.info("Load model..")
    model = torch.load(filepath)
    model.eval()
    logger.info("Load model successfully completed!")
    return model
def train(model, total_batch_size, queue, loss_func, optimizer, device,
          train_begin, worker_num, print_batch=5, teacher_forcing_ratio=1):
    total_loss = 0.
    total_num = 0
    total_dist = 0
    total_length = 0
    total_sent_num = 0
    batch = 0

    model.train()
    begin = epoch_begin = time.time()

    while True:
        feats, targets, feat_lengths, label_lengths = queue.get()

        if feats.shape[0] == 0:
            # empty feats means one loader has finished
            worker_num -= 1
            logger.debug('left train_loader: %d' % worker_num)

            if worker_num == 0:
                break
            else:
                continue

        optimizer.zero_grad()

        feats = feats.to(device)
        targets = targets.to(device)
        target = targets[:, 1:]  # drop the <s> token from the target

        model.module.flatten_parameters()
        y_hat, logit = model(feats, targets, teacher_forcing_ratio)  # Seq2seq forward()

        loss = loss_func(logit.contiguous().view(-1, logit.size(-1)), target.contiguous().view(-1))
        total_loss += loss.item()
        total_num += sum(feat_lengths)

        display = random.randrange(0, 100) == 0  # display roughly 1% of batches
        dist, length = get_distance(target, y_hat, display=display)
        total_dist += dist
        total_length += length
        total_sent_num += target.size(0)

        loss.backward()
        optimizer.step()

        if batch % print_batch == 0:
            current = time.time()
            elapsed = current - begin
            epoch_elapsed = (current - epoch_begin) / 60.0
            train_elapsed = (current - train_begin) / 3600.0

            logger.info('batch: {:4d}/{:4d}, loss: {:.4f}, cer: {:.2f}, elapsed: {:.2f}s {:.2f}m {:.2f}h'
                        .format(batch, total_batch_size, total_loss / total_num,
                                total_dist / total_length, elapsed, epoch_elapsed, train_elapsed))
            begin = time.time()

        if batch % 1000 == 0:
            save_step_result(train_step_result, total_loss / total_num, total_dist / total_length)

        batch += 1
        train.cumulative_batch_count += 1

    logger.info('train() completed')
    return total_loss / total_num, total_dist / total_length
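# A minimal sketch of an epoch loop driving train() and evaluate(); the model,
# queues, optimizer, and hyperparameter values are hypothetical and assumed to
# be set up elsewhere (e.g. queues fed by loader threads):
train.cumulative_batch_count = 0  # initialize the attribute train() increments
train_begin = time.time()

for epoch in range(max_epochs):
    train_loss, train_cer = train(model, total_batch_size, train_queue, loss_func,
                                  optimizer, device, train_begin, worker_num)
    valid_loss, valid_cer = evaluate(model, valid_queue, loss_func, device)
    logger.info('epoch %d: train cer %.2f, valid cer %.2f' % (epoch, train_cer, valid_cer))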