def augmentation(self):
    """ Apply Spec-Augmentation """
    augment_end_idx = int(len(self.audio_paths) * self.augment_ratio)
    logger.info("Applying Augmentation...")

    for idx in range(augment_end_idx):
        self.augment_flags.append(True)
        self.audio_paths.append(self.audio_paths[idx])
        self.label_paths.append(self.label_paths[idx])
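# The method above only duplicates sample paths and raises augment flags; the
# actual SpecAugment masking is presumably applied at feature-extraction time
# when augment_flags[idx] is True. A minimal time/frequency-masking sketch
# (the function name and mask sizes are assumptions, not this codebase's API):
import random

def spec_augment(feat, time_mask_num=2, freq_mask_num=2, time_mask_len=40, freq_mask_len=15):
    """ Zero out random time steps and frequency channels of a (time, freq) tensor. """
    seq_len, n_freq = feat.size(0), feat.size(1)

    for _ in range(time_mask_num):   # time masking
        t0 = random.randint(0, max(0, seq_len - time_mask_len))
        feat[t0:t0 + time_mask_len, :] = 0

    for _ in range(freq_mask_num):   # frequency masking
        f0 = random.randint(0, max(0, n_freq - freq_mask_len))
        feat[:, f0:f0 + freq_mask_len] = 0

    return feat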
def load_pickle(filepath, message=""):
    """ Load a pickle file.

    Args:
        filepath (str): path to the pickle file to load
        message (str): message to log after loading

    Returns: load_result
        - **load_result**: object loaded from the pickle file
    """
    with open(filepath, "rb") as f:
        load_result = pickle.load(f)
    logger.info(message)
    return load_result
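# split_dataset() below calls save_pickle(), which is not shown in this
# section. A minimal counterpart to load_pickle() following the same
# logger-based convention (a sketch, not necessarily the repository's code):
def save_pickle(save_result, filepath, message=""):
    """ Save an object to a pickle file. """
    with open(filepath, "wb") as f:
        pickle.dump(save_result, f)
    logger.info(message)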
def evaluate(model, queue, criterion, device):
    r"""
    Args:
        model (torch.nn.Module): model to be evaluated
        queue (queue.Queue): queue for threading
        criterion (torch.nn): one of PyTorch's loss functions.
            Refer to http://pytorch.org/docs/master/nn.html#loss-functions for a list of them.
        device (torch.device): device used ('cuda' or 'cpu')

    Returns: loss, cer
        - **loss** (float): loss of evaluation
        - **cer** (float): character error rate
    """
    logger.info('evaluate() start')
    total_loss = 0.
    total_num = 0
    total_dist = 0
    total_length = 0
    total_sentence_num = 0

    model.eval()

    with torch.no_grad():
        while True:
            feats, scripts, feat_lengths, script_lengths = queue.get()
            if feats.shape[0] == 0:  # empty batch signals the end of evaluation
                break

            feats = feats.to(device)
            scripts = scripts.to(device)
            target = scripts[:, 1:]

            model.module.flatten_parameters()
            y_hat, logit = model(feats, scripts, teacher_forcing_ratio=0.0, use_beam_search=False)

            loss = criterion(logit.contiguous().view(-1, logit.size(-1)), target.contiguous().view(-1))
            total_loss += loss.item()
            total_num += sum(feat_lengths)

            dist, length = get_distance(target, y_hat, id2char, EOS_TOKEN)
            total_dist += dist
            total_length += length
            total_sentence_num += target.size(0)

    logger.info('evaluate() completed')
    return total_loss / total_num, total_dist / total_length
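# evaluate() is driven by a producer thread that fills `queue` with batches and
# finally pushes an empty batch as a stop sentinel. A minimal usage sketch,
# assuming CustomDataLoader is a threading.Thread subclass with this
# constructor (its real signature is not shown in this section):
import queue as queue_module

def run_evaluate(model, validset, criterion, device, batch_size):
    eval_queue = queue_module.Queue(maxsize=2)
    loader = CustomDataLoader(validset, eval_queue, batch_size, thread_id=0)
    loader.start()  # producer: enqueues batches, then an empty batch to signal the end
    loss, cer = evaluate(model, eval_queue, criterion, device)
    loader.join()
    return loss, cer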
def evaluate(model, queue, perplexity, device):
    """ Evaluate the language model and return the average loss per token. """
    logger.info('evaluate() start')
    total_loss = 0
    total_num = 0

    model.eval()

    with torch.no_grad():
        while True:
            loss = perplexity
            inputs, targets, input_lengths, target_lengths = queue.get()
            if inputs.shape[0] == 0:  # empty batch signals the end of evaluation
                break

            inputs = inputs.to(device)
            targets = targets.to(device)

            model.module.flatten_parameters()
            outputs = model(inputs, teacher_forcing_ratio=0.0)

            loss.reset()
            for step, step_output in enumerate(outputs):
                batch_size = targets.size(0)
                loss.eval_batch(step_output.contiguous().view(batch_size, -1), targets[:, step])

            loss = loss.get_loss()
            total_loss += loss
            total_num += sum(input_lengths)

    logger.info('evaluate() completed')
    return total_loss / total_num
def supervised_train(model, config, epoch, total_time_step, queue,
                     criterion, optimizer, device, train_begin, worker_num,
                     print_every=10, teacher_forcing_ratio=0.90):
    r"""
    Args:
        model (torch.nn.Module): model to be trained
        config (Config): configuration
        epoch (int): present epoch
        total_time_step (int): total time steps in one epoch
        queue (Queue.queue): queue for threading
        criterion (torch.nn): one of PyTorch's loss functions.
            Refer to http://pytorch.org/docs/master/nn.html#loss-functions for a list of them.
        optimizer (torch.optim): optimizer for training
        device (torch.device): device used ('cuda' or 'cpu')
        train_begin: train begin time
        worker_num (int): the number of CPU cores used
        print_every (int): number of steps between log outputs
        teacher_forcing_ratio (float): probability that teacher forcing will be used (default: 0.90)

    Returns: loss, cer
        - **loss** (float): loss of the present epoch
        - **cer** (float): character error rate
    """
    epoch_loss_total = 0.
    print_loss_total = 0.
    total_num = 0
    total_dist = 0
    total_length = 0
    time_step = 0
    decay_speed = 1.0

    RAMPUP_POWER = 3
    RAMPUP_PERIOD = 3000
    EXP_DECAY_PERIOD = total_time_step * 3

    model.train()
    begin = epoch_begin = time.time()

    while True:
        # LR Warm-Up
        if config.use_multistep_lr and epoch == 0 and time_step < RAMPUP_PERIOD:
            set_lr(optimizer, lr=config.high_plateau_lr * ((time_step + 1) / RAMPUP_PERIOD) ** RAMPUP_POWER)

        # LR Exponential-Decay
        if config.use_multistep_lr and epoch in (1, 2, 3):
            decay_rate = config.low_plateau_lr / config.high_plateau_lr
            decay_speed *= decay_rate ** (1 / EXP_DECAY_PERIOD)
            set_lr(optimizer, config.high_plateau_lr * decay_speed)

        feats, scripts, feat_lens, target_lens = queue.get()

        if feats.shape[0] == 0:  # empty feats means one loader has closed
            worker_num -= 1
            logger.debug('left train_loader: %d' % worker_num)
            if worker_num == 0:
                break
            else:
                continue

        inputs = feats.to(device)
        scripts = scripts.to(device)
        targets = scripts[:, 1:]

        model.module.flatten_parameters()
        y_hat, logit = model(inputs, scripts, teacher_forcing_ratio=teacher_forcing_ratio)

        loss = criterion(logit.contiguous().view(-1, logit.size(-1)), targets.contiguous().view(-1))
        epoch_loss_total += loss.item()
        print_loss_total += loss.item()
        total_num += sum(feat_lens)

        dist, length = get_distance(targets, y_hat, id2char, EOS_TOKEN)
        total_dist += dist
        total_length += length

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        time_step += 1
        torch.cuda.empty_cache()

        if time_step % print_every == 0:
            current = time.time()
            elapsed = current - begin
            epoch_elapsed = (current - epoch_begin) / 60.0
            train_elapsed = (current - train_begin) / 3600.0

            logger.info('timestep: {:4d}/{:4d}, loss: {:.4f}, cer: {:.2f}, elapsed: {:.2f}s {:.2f}m {:.2f}h'.format(
                time_step, total_time_step, print_loss_total / print_every,
                total_dist / total_length, elapsed, epoch_elapsed, train_elapsed))
            print_loss_total = 0
            begin = time.time()

        if time_step % 1000 == 0:
            save_step_result(train_step_result, epoch_loss_total / total_num, total_dist / total_length)

        if time_step % 10000 == 0:
            torch.save(model, "./data/weight_file/epoch_%s_step_%s.pt" % (str(epoch), str(time_step)))

    logger.info('train() completed')
    return epoch_loss_total / total_num, total_dist / total_length
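# set_lr() is referenced above but not shown in this section. A minimal sketch
# of what it presumably does (an assumption, not the repository's exact code):
def set_lr(optimizer, lr):
    """ Set the learning rate of every parameter group. """
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr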
def supervised_train(model, hparams, epoch, total_time_step, queue,
                     criterion, optimizer, device, train_begin, worker_num,
                     print_time_step=10, teacher_forcing_ratio=0.90):
    """
    Args:
        model (torch.nn.Module): model to be trained
        hparams (HyperParams): set of hyperparameters
        epoch (int): present epoch
        total_time_step (int): total time steps in one epoch
        queue (Queue.queue): queue for threading
        criterion (torch.nn): one of PyTorch's loss functions.
            Refer to http://pytorch.org/docs/master/nn.html#loss-functions for a list of them.
        optimizer (torch.optim): optimizer for training
        device (torch.device): device used ('cuda' or 'cpu')
        train_begin: train begin time
        worker_num (int): the number of CPU cores used
        print_time_step (int): number of steps between log outputs
        teacher_forcing_ratio (float): probability that teacher forcing will be used (default: 0.90)

    Returns: loss, cer
        - **loss** (float): loss of the present epoch
        - **cer** (float): character error rate
    """
    total_loss = 0.
    total_num = 0
    total_dist = 0
    total_length = 0
    total_sent_num = 0
    time_step = 0

    model.train()
    begin = epoch_begin = time.time()

    while True:
        if hparams.use_multistep_lr and epoch == 0 and time_step < 1000:
            ramp_up(optimizer, time_step, hparams)
        if hparams.use_multistep_lr and epoch == 1:
            exp_decay(optimizer, total_time_step, hparams)

        feats, targets, feat_lengths, label_lengths = queue.get()

        if feats.shape[0] == 0:  # empty feats means one loader has closed
            worker_num -= 1
            logger.debug('left train_loader: %d' % worker_num)
            if worker_num == 0:
                break
            else:
                continue

        optimizer.zero_grad()

        feats = feats.to(device)
        targets = targets.to(device)
        target = targets[:, 1:]

        model.module.flatten_parameters()
        y_hat, logit = model(feats, targets, teacher_forcing_ratio=teacher_forcing_ratio)

        loss = criterion(logit.contiguous().view(-1, logit.size(-1)), target.contiguous().view(-1))
        total_loss += loss.item()
        total_num += sum(feat_lengths)

        dist, length = get_distance(target, y_hat, id2char, EOS_TOKEN)
        total_dist += dist
        total_length += length
        total_sent_num += target.size(0)

        loss.backward()
        optimizer.step()

        if time_step % print_time_step == 0:
            current = time.time()
            elapsed = current - begin
            epoch_elapsed = (current - epoch_begin) / 60.0
            train_elapsed = (current - train_begin) / 3600.0

            logger.info('timestep: {:4d}/{:4d}, loss: {:.4f}, cer: {:.2f}, elapsed: {:.2f}s {:.2f}m {:.2f}h'.format(
                time_step, total_time_step, total_loss / total_num,
                total_dist / total_length, elapsed, epoch_elapsed, train_elapsed))
            begin = time.time()

        if time_step % 1000 == 0:
            save_step_result(train_step_result, total_loss / total_num, total_dist / total_length)

        if time_step % 10000 == 0:
            torch.save(model, "model.pt")
            torch.save(model, "./data/weight_file/epoch_%s_step_%s.pt" % (str(epoch), str(time_step)))

        time_step += 1
        supervised_train.cumulative_batch_count += 1
        torch.cuda.empty_cache()  # free GPU memory; if you have enough GPU memory, delete this line

    loss = total_loss / total_num
    cer = total_dist / total_length

    logger.info('train() completed')
    return loss, cer
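# ramp_up() and exp_decay() are referenced above but not shown in this section.
# Minimal sketches of the schedules they presumably implement, assuming the
# hparams fields used elsewhere in this code (high_plateau_lr, low_plateau_lr):
def ramp_up(optimizer, time_step, hparams, period=1000, power=3):
    """ Polynomial warm-up toward the high-plateau learning rate. """
    lr = hparams.high_plateau_lr * ((time_step + 1) / period) ** power
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

def exp_decay(optimizer, total_time_step, hparams):
    """ Exponential per-step decay from the high plateau toward the low plateau. """
    decay_rate = (hparams.low_plateau_lr / hparams.high_plateau_lr) ** (1 / total_time_step)
    for param_group in optimizer.param_groups:
        param_group['lr'] *= decay_rate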
def logger_hparams(self):
    """ Print information of hyperparameters. """
    logger.info("use_bidirectional : %s" % str(self.use_bidirectional))
    logger.info("use_attention : %s" % str(self.use_attention))
    logger.info("use_pickle : %s" % str(self.use_pickle))
    logger.info("use_augment : %s" % str(self.use_augment))
    logger.info("use_pyramidal : %s" % str(self.use_pyramidal))
    logger.info("augment_ratio : %0.2f" % self.augment_ratio)
    logger.info("input_reverse : %s" % str(self.input_reverse))
    logger.info("hidden_size : %d" % self.hidden_size)
    logger.info("listener_layer_size : %d" % self.listener_layer_size)
    logger.info("speller_layer_size : %d" % self.speller_layer_size)
    logger.info("dropout : %0.2f" % self.dropout)
    logger.info("batch_size : %d" % self.batch_size)
    logger.info("worker_num : %d" % self.worker_num)
    logger.info("max_epochs : %d" % self.max_epochs)
    logger.info("initial learning rate : %0.4f" % self.init_lr)
    if self.use_multistep_lr:
        logger.info("high plateau learning rate : %0.4f" % self.high_plateau_lr)
        logger.info("low plateau learning rate : %0.4f" % self.low_plateau_lr)
    logger.info("teacher_forcing_ratio : %0.2f" % self.teacher_forcing)
    logger.info("seed : %d" % self.seed)
    logger.info("max_len : %d" % self.max_len)
    logger.info("use_cuda : %s" % str(self.use_cuda))
def split_dataset(config, audio_paths, label_paths, valid_ratio=0.05, target_dict=None):
    """ Split the dataset into training and validation datasets.

    Args:
        config (package.config.HyperParams): set of configurations
        audio_paths (list): set of audio paths
        label_paths (list): set of label paths
        valid_ratio (float): validation set ratio of the total dataset
        target_dict (dict): dictionary of filename and target

    Returns: train_batch_num, train_dataset_list, valid_dataset
        - **train_batch_num** (int): number of batches for training
        - **train_dataset_list** (list): list of training datasets
        - **valid_dataset** (utils.dataset.BaseDataset): validation dataset
    """
    logger.info("split dataset start !!")
    trainset_list = list()
    train_num = math.ceil(len(audio_paths) * (1 - valid_ratio))
    total_time_step = math.ceil(len(audio_paths) / config.batch_size)
    valid_time_step = math.ceil(total_time_step * valid_ratio)
    train_time_step = total_time_step - valid_time_step

    if config.use_augment:
        train_time_step = int(train_time_step * (1 + config.augment_ratio))

    train_num_per_worker = math.ceil(train_num / config.worker_num)

    # audio_paths & label_paths shuffled in the same order
    # for separating train & validation
    data_paths = list(zip(audio_paths, label_paths))
    random.shuffle(data_paths)
    audio_paths, label_paths = zip(*data_paths)

    # separate the train dataset by the number of workers
    for idx in range(config.worker_num):
        train_begin_idx = train_num_per_worker * idx
        train_end_idx = min(train_num_per_worker * (idx + 1), train_num)

        trainset_list.append(
            CustomDataset(
                audio_paths=audio_paths[train_begin_idx:train_end_idx],
                label_paths=label_paths[train_begin_idx:train_end_idx],
                sos_id=SOS_TOKEN,
                eos_id=EOS_TOKEN,
                target_dict=target_dict,
                input_reverse=config.input_reverse,
                use_augment=config.use_augment,
                batch_size=config.batch_size,
                augment_ratio=config.augment_ratio
            )
        )

    validset = CustomDataset(
        audio_paths=audio_paths[train_num:],
        label_paths=label_paths[train_num:],
        sos_id=SOS_TOKEN,
        eos_id=EOS_TOKEN,
        batch_size=config.batch_size,
        target_dict=target_dict,
        input_reverse=config.input_reverse,
        use_augment=False
    )

    save_pickle(trainset_list, './data/pickle/trainset_list')
    save_pickle(validset, './data/pickle/validset')

    logger.info("split dataset complete !!")
    return train_time_step, trainset_list, validset
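# A hedged usage sketch for split_dataset(): one training dataset per worker,
# each feeding a shared queue. `MultiDataLoader` and its interface are
# assumptions modeled on the queue-driven trainers above, not code shown in
# this repository:
import queue as queue_module

train_time_step, trainset_list, validset = split_dataset(config, audio_paths, label_paths,
                                                         valid_ratio=0.05, target_dict=target_dict)
train_queue = queue_module.Queue(maxsize=config.worker_num * 2)
train_loader = MultiDataLoader(trainset_list, train_queue, config.batch_size, config.worker_num)
train_loader.start()  # each worker enqueues batches; an empty batch marks a closed loader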
import os
import torch
from torch import optim
from package.config import Config
from package.definition import char2id, logger, SOS_token, EOS_token, PAD_token
from package.data_loader import CustomDataset, load_corpus, CustomDataLoader
from package.evaluator import evaluate
from package.loss import Perplexity
from package.trainer import supervised_train
from model import LanguageModel

# Character-level Recurrent Neural Network Language Model implemented in PyTorch
# https://github.com/sooftware/char-rnnlm

if __name__ == '__main__':
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # if you use Multi-GPU, delete this line

    logger.info("device : %s" % torch.cuda.get_device_name(0))
    logger.info("CUDA is available : %s" % (torch.cuda.is_available()))
    logger.info("CUDA version : %s" % torch.version.cuda)
    logger.info("PyTorch version : %s" % torch.__version__)

    config = Config(
        use_cuda=True,
        hidden_size=512,
        dropout_p=0.5,
        n_layers=4,
        batch_size=16,
        max_epochs=40,
        lr=0.0001,
        teacher_forcing_ratio=1.0,
        seed=1,
        max_len=428,
        worker_num=1
    )
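    # A hedged sketch of how the imports above would plausibly be wired
    # together after building the config; the script's actual continuation is
    # not shown here, and LanguageModel's constructor arguments and the
    # Perplexity signature are assumptions:
    device = torch.device('cuda' if config.use_cuda else 'cpu')
    model = LanguageModel(vocab_size=len(char2id), hidden_size=config.hidden_size,
                          n_layers=config.n_layers, dropout_p=config.dropout_p)
    model = torch.nn.DataParallel(model).to(device)  # trainer calls model.module.flatten_parameters()
    perplexity = Perplexity(weight=torch.ones(len(char2id)).to(device), mask=PAD_token)
    optimizer = optim.Adam(model.module.parameters(), lr=config.lr)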
def get_librosa_mfcc(filepath, n_mfcc=40, del_silence=False, input_reverse=True):
    r""" Compute Mel-frequency cepstral coefficients (MFCCs).

    Args:
        filepath (str): path of the audio file
        n_mfcc (int): number of mel filters
        del_silence (bool): flag indicating whether to delete silence (default: False)
        input_reverse (bool): flag indicating whether to reverse the input (default: True)

    Feature Parameters:
        - **sample rate**: A.I Hub dataset's sample rate is 16,000
        - **frame length**: 25ms
        - **stride**: 10ms
        - **overlap**: 15ms
        - **window**: Hamming Window

    .. math::
        \begin{array}{ll}
            NFFT = sr \cdot frame\_length \\
            HopLength = sr \cdot stride \\
        \end{array}

    Returns: mfcc
        - **mfcc** (torch.Tensor): mel-frequency cepstral coefficient feature

    Examples::
        Generate mfccs from a time series

        >>> get_librosa_mfcc("KaiSpeech_021458.pcm", n_mfcc=40, input_reverse=True)
        Tensor([[ -5.229e+02, -4.944e+02, ..., -5.229e+02, -5.229e+02],
                [ 7.105e-15,  3.787e+01, ..., -7.105e-15, -7.105e-15],
                ...,
                [ 1.066e-14, -7.500e+00, ...,  1.421e-14,  1.421e-14],
                [ 3.109e-14, -5.058e+00, ...,  2.931e-14,  2.931e-14]])
    """
    if filepath.split('.')[-1] == 'pcm':
        try:
            pcm = np.memmap(filepath, dtype='h', mode='r')
        except Exception:  # exception handling
            logger.info("%s Error Occur !!" % filepath)
            return None
        signal = np.array([float(x) for x in pcm])
    elif filepath.split('.')[-1] == 'wav':
        signal, _ = librosa.core.load(filepath, sr=16000)
    else:
        raise ValueError("Invalid format !!")

    if del_silence:
        non_silence_ids = librosa.effects.split(y=signal, top_db=30)
        signal = np.concatenate([signal[start:end] for start, end in non_silence_ids])

    # n_fft = 16000 * 0.025 = 400, hop_length = 16000 * 0.010 = 160
    mfcc = librosa.feature.mfcc(y=signal, sr=16000, hop_length=160, n_mfcc=n_mfcc, n_fft=400, window='hamming')

    if input_reverse:
        mfcc = mfcc[:, ::-1]

    mfcc = torch.FloatTensor(np.ascontiguousarray(np.swapaxes(mfcc, 0, 1)))
    return mfcc
def get_librosa_melspectrogram(filepath, n_mels=128, del_silence=False, input_reverse=True, mel_type='log_mel'):
    r""" Compute a mel-scaled spectrogram (or log-mel).

    Args:
        filepath (str): path of the audio file
        n_mels (int): number of mel filters
        del_silence (bool): flag indicating whether to delete silence (default: False)
        mel_type (str): if 'log_mel', return log-mel (default: 'log_mel')
        input_reverse (bool): flag indicating whether to reverse the input (default: True)

    Feature Parameters:
        - **sample rate**: A.I Hub dataset's sample rate is 16,000
        - **frame length**: 25ms
        - **stride**: 10ms
        - **overlap**: 15ms
        - **window**: Hamming Window

    .. math::
        \begin{array}{ll}
            NFFT = sr \cdot frame\_length \\
            HopLength = sr \cdot stride \\
        \end{array}

    Returns: mel_spectrogram
        - **mel_spectrogram** (torch.Tensor): Mel-Spectrogram (or log-mel) feature

    Examples::
        Generate a mel spectrogram from a time series

        >>> get_librosa_melspectrogram("KaiSpeech_021458.pcm", n_mels=128, input_reverse=True)
        Tensor([[ 2.891e-07, 2.548e-03, ..., 8.116e-09, 5.633e-09],
                [ 1.986e-07, 1.162e-02, ..., 9.332e-08, 6.716e-09],
                ...,
                [ 3.668e-09, 2.029e-08, ..., 3.208e-09, 2.864e-09],
                [ 2.561e-10, 2.096e-09, ..., 7.543e-10, 6.101e-10]])
    """
    if filepath.split('.')[-1] == 'pcm':
        try:
            pcm = np.memmap(filepath, dtype='h', mode='r')
        except Exception:  # exception handling
            logger.info("%s Error Occur !!" % filepath)
            return None
        signal = np.array([float(x) for x in pcm])
    elif filepath.split('.')[-1] == 'wav':
        signal, _ = librosa.core.load(filepath, sr=16000)
    else:
        raise ValueError("Invalid format !!")

    if del_silence:
        non_silence_ids = librosa.effects.split(y=signal, top_db=30)
        signal = np.concatenate([signal[start:end] for start, end in non_silence_ids])

    # n_fft = 16000 * 0.025 = 400, hop_length = 16000 * 0.010 = 160
    mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=16000, n_mels=n_mels,
                                                     n_fft=400, hop_length=160, window='hamming')

    if mel_type == 'log_mel':
        mel_spectrogram = librosa.amplitude_to_db(mel_spectrogram, ref=np.max)

    if input_reverse:
        mel_spectrogram = mel_spectrogram[:, ::-1]

    mel_spectrogram = torch.FloatTensor(np.ascontiguousarray(np.swapaxes(mel_spectrogram, 0, 1)))
    return mel_spectrogram
def supervised_train(model, queue, perplexity, optimizer, device, print_every, epoch,
                     teacher_forcing_ratio, worker_num, total_time_step, train_begin):
    print_loss_total = 0  # Reset every print_every
    epoch_loss_total = 0  # Reset every epoch
    total_num = 0
    time_step = 0

    model.train()
    begin = epoch_begin = time.time()

    while True:
        loss = perplexity
        inputs, targets, input_lens, target_lens = queue.get()

        if inputs.shape[0] == 0:  # empty inputs means one loader has closed
            worker_num -= 1
            logger.debug('left train_loader: %d' % worker_num)
            if worker_num == 0:
                break
            else:
                continue

        inputs = inputs.to(device)
        targets = targets.to(device)

        model.module.flatten_parameters()
        outputs = model(inputs, teacher_forcing_ratio=teacher_forcing_ratio)

        # Get loss
        loss.reset()
        for step, step_output in enumerate(outputs):
            batch_size = targets.size(0)
            loss.eval_batch(step_output.contiguous().view(batch_size, -1), targets[:, step])

        # Backpropagation
        model.zero_grad()
        loss.backward()
        optimizer.step()

        loss = loss.get_loss()
        epoch_loss_total += loss
        print_loss_total += loss
        total_num += sum(input_lens)
        time_step += 1
        torch.cuda.empty_cache()

        if time_step % print_every == 0:
            current = time.time()
            elapsed = current - begin
            epoch_elapsed = (current - epoch_begin) / 60.0
            train_elapsed = (current - train_begin) / 3600.0

            logger.info('timestep: {:4d}/{:4d}, perplexity: {:.4f}, elapsed: {:.2f}s {:.2f}m {:.2f}h'.format(
                time_step, total_time_step, print_loss_total / print_every,
                elapsed, epoch_elapsed, train_elapsed))
            print_loss_total = 0
            begin = time.time()

        if time_step % 50000 == 0:
            torch.save(model, "./data/epoch%s_%s.pt" % (str(epoch), str(time_step)))

    logger.info('train() completed')
    return epoch_loss_total / total_num
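# The `perplexity` object used above follows a reset() / eval_batch() /
# get_loss() contract in the style of pytorch-seq2seq loss wrappers. A minimal
# sketch of that contract (an assumption; package.loss.Perplexity itself is
# not shown in this section):
import math
import torch

class PerplexitySketch:
    """ Accumulate summed NLL over decoding steps; perplexity = exp(mean NLL). """
    def __init__(self, pad_id):
        self.nll = torch.nn.NLLLoss(ignore_index=pad_id, reduction='sum')
        self.acc_loss = 0.  # becomes a tensor after the first eval_batch()
        self.norm_term = 0.

    def reset(self):
        self.acc_loss = 0.
        self.norm_term = 0.

    def eval_batch(self, step_output, target):
        # step_output: (batch, vocab) log-probabilities for one decoding step
        self.acc_loss = self.acc_loss + self.nll(step_output, target)
        self.norm_term += target.ne(self.nll.ignore_index).sum().item()

    def backward(self):
        self.acc_loss.backward()

    def get_loss(self):
        return math.exp(self.acc_loss.item() / self.norm_term)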