Example #1
    def apply_augment(self):
        """
        Apply Spec-Augmentation
        Comment:
            - **audio_paths**: [KaiSpeech_135248.pcm, KaiSpeech_453892.pcm, ......, KaiSpeech_357891.pcm]
            - **label_paths**: [KaiSpeech_135248.txt, KaiSpeech_453892.txt, ......, KaiSpeech_357891.txt]
            - **is_augment**: [True, False, ......, False]
            SpecAugment is applied to sample idx when is_augment[idx] is True; otherwise the sample is left untouched.

        0                          augment_end_idx                         end_idx (len(self.audio_paths))
        │-----hparams.augment_ratio------│-----------------else-----------------│
        """
        augment_end_idx = int(len(self.audio_paths) * self.augment_ratio)
        logger.info("Applying Augmentation...")

        for idx in range(augment_end_idx):
            self.is_augment.append(True)
            self.audio_paths.append(self.audio_paths[idx])
            self.label_paths.append(self.label_paths[idx])

        # after appending the augmented copies, shuffle so they are spread across the epoch
        tmp = list(zip(self.audio_paths, self.label_paths, self.is_augment))
        random.shuffle(tmp)
        self.audio_paths, self.label_paths, self.is_augment = zip(*tmp)
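For reference, here is the same duplicate-and-shuffle logic as a standalone sketch, with plain lists standing in for the dataset attributes (the file names are placeholders):

import random

audio_paths = ['a.pcm', 'b.pcm', 'c.pcm', 'd.pcm']   # placeholder paths
label_paths = ['a.txt', 'b.txt', 'c.txt', 'd.txt']
is_augment = [False] * len(audio_paths)
augment_ratio = 0.5

# duplicate the first augment_ratio fraction and flag the copies
augment_end_idx = int(len(audio_paths) * augment_ratio)
for idx in range(augment_end_idx):
    is_augment.append(True)
    audio_paths.append(audio_paths[idx])
    label_paths.append(label_paths[idx])

# shuffle all three lists in lockstep so the copies are spread across the epoch
tmp = list(zip(audio_paths, label_paths, is_augment))
random.shuffle(tmp)
audio_paths, label_paths, is_augment = zip(*tmp)
print(len(audio_paths))  # 6: four originals plus two augmented copies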
def get_label(label_path, bos_id=2037, eos_id=2038, target_dict=None):
    """
    Converts a specific file's label into list format.
    Inputs: label_path, bos_id, eos_id, target_dict
        - **label_path**: specific path of the label file
        - **bos_id**: <s>'s id
        - **eos_id**: </s>'s id
        - **target_dict**: dictionary of filename and labels
                Format : {KaiSpeech_label_FileNum : '5 0 49 4 0 8 190 0 78 115', ... }
    Outputs: label
        - **label**: list of bos + sequence of label + eos
                Format : [<s>, 5, 0, 49, 4, 0, 8, 190, 0, 78, 115, </s>]
    """
    if target_dict is None:
        raise ValueError("target_dict must not be None")
    key = label_path.split('/')[-1].split('.')[0]
    script = target_dict[key]
    tokens = script.split(' ')

    label = list()
    label.append(int(bos_id))
    for token in tokens:
        label.append(int(token))
    label.append(int(eos_id))
    del script, tokens  # release references early (optional; locals are freed on return anyway)

    return label
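A hypothetical call, with a made-up target_dict entry in the documented format:

target_dict = {'KaiSpeech_label_135248': '5 0 49 4 0 8 190 0 78 115'}
label = get_label('/data/KaiSpeech_label_135248.txt',
                  bos_id=2037, eos_id=2038, target_dict=target_dict)
print(label)  # [2037, 5, 0, 49, 4, 0, 8, 190, 0, 78, 115, 2038]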
def get_librosa_mfcc(filepath=None,
                     n_mfcc=33,
                     del_silence=False,
                     input_reverse=True,
                     format='pcm'):
    """
    Provides Mel-Frequency Cepstral Coefficients (MFCCs) for speech recognition
    Args:
        filepath: specific path of audio file
        n_mfcc: number of MFCC coefficients (default: 33)
        del_silence: flag indicating whether to delete silence (default: False)
        input_reverse: flag indicating whether to reverse the input (default: True)
        format: file format, e.g. pcm, wav (default: pcm)
    Comment:
        sample rate: the A.I Hub dataset's sample rate is 16,000 Hz
        frame length: 25ms
        stride: 10ms
        overlap: 15ms
        window: Hamming Window
        n_fft = sr * frame_length (16,000 * 0.025s = 400)
        hop_length = sr * stride (16,000 * 0.010s = 160)
    Outputs:
        mfcc: return MFCC values of signal
    """
    if format == 'pcm':
        try:
            pcm = np.memmap(filepath, dtype='h', mode='r')
        except Exception:  # unreadable or missing file
            logger.info("np.memmap error in %s" % filepath)
            return torch.zeros(1)
        sig = np.asarray(pcm, dtype=np.float32)  # vectorized conversion instead of a Python loop
    elif format == 'wav':
        sig, _ = librosa.core.load(filepath, sr=16000)
    else:
        logger.info("%s is not supported" % format)
        return torch.zeros(1)

    if del_silence:
        non_silence_indices = librosa.effects.split(sig, top_db=30)
        sig = np.concatenate(
            [sig[start:end] for start, end in non_silence_indices])
    feat = librosa.feature.mfcc(y=sig,
                                sr=16000,
                                hop_length=160,
                                n_mfcc=n_mfcc,
                                n_fft=400,
                                window='hamming')
    if input_reverse:
        feat = feat[:, ::-1]

    return torch.FloatTensor(np.ascontiguousarray(np.swapaxes(feat, 0, 1)))
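To sanity-check the framing parameters, here is a small experiment on one second of synthetic audio (the 440 Hz tone is just a stand-in for a real file):

import numpy as np
import librosa
import torch

sig = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000).astype(np.float32)
feat = librosa.feature.mfcc(y=sig, sr=16000, hop_length=160,
                            n_mfcc=33, n_fft=400, window='hamming')
feat = torch.FloatTensor(np.ascontiguousarray(np.swapaxes(feat, 0, 1)))
print(feat.shape)  # torch.Size([101, 33]): (frames, n_mfcc), one frame per 10 ms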
Example #4
def evaluate(model, queue, loss_func, device):
    logger.info('evaluate() start')
    total_loss = 0.
    total_num = 0
    total_dist = 0
    total_length = 0
    total_sent_num = 0

    model.eval()

    with torch.no_grad():
        while True:
            feats, scripts, feat_lengths, script_lengths = queue.get()
            if feats.shape[0] == 0:  # an empty batch is the end-of-data sentinel
                break

            feats = feats.to(device)
            scripts = scripts.to(device)
            target = scripts[:, 1:]

            model.module.flatten_parameters()
            logit = model(feats,
                          feat_lengths,
                          scripts,
                          teacher_forcing_ratio=0.0)

            logit = torch.stack(logit, dim=1).to(device)
            y_hat = logit.max(-1)[1]

            loss = loss_func(logit.contiguous().view(-1, logit.size(-1)),
                             target.contiguous().view(-1))
            total_loss += loss.item()
            total_num += sum(feat_lengths)

            display = random.randrange(0, 100) == 0  # print a sample transcription ~1% of the time
            dist, length = get_distance(target, y_hat, display=display)
            total_dist += dist
            total_length += length
            total_sent_num += target.size(0)

    logger.info('evaluate() completed')
    return total_loss / total_num, total_dist / total_length
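evaluate() drains the queue until it sees an empty batch, so whoever feeds the queue must push that sentinel last. A minimal sketch of a feeder thread (model, loss_func, device, and the batch list valid_batches are assumed to exist; the model is assumed to be wrapped in nn.DataParallel, since evaluate() accesses model.module):

import queue
import threading
import torch

def feed(q, batches):
    for b in batches:  # each b: (feats, scripts, feat_lengths, script_lengths)
        q.put(b)
    q.put((torch.zeros(0), torch.zeros(0), [], []))  # end-of-data sentinel

q = queue.Queue()
threading.Thread(target=feed, args=(q, valid_batches), daemon=True).start()
loss, cer = evaluate(model, q, loss_func, device)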
Example #5
    def get_item(self, idx):
        label = get_label(self.label_paths[idx], self.bos_id, self.eos_id,
                          self.target_dict)
        feat = get_librosa_mfcc(self.audio_paths[idx],
                                n_mfcc=33,
                                del_silence=False,
                                input_reverse=self.input_reverse,
                                format='pcm')
        # exception handling: get_librosa_mfcc() signals a broken file with a size-1 tensor
        if feat.size(0) == 1:
            logger.info("Delete label_paths : %s" % self.label_paths[idx])
            label = ''
            return feat, label
        if self.is_augment[idx]:
            feat = spec_augment(feat,
                                T=40,
                                F=15,
                                time_mask_num=2,
                                freq_mask_num=2)
        return feat, label
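spec_augment() itself is defined elsewhere in the project; judging by its name and arguments it masks random time and frequency bands as in the SpecAugment paper (Park et al., 2019). A rough sketch of that idea on a (time, freq) tensor, with the parameter semantics as assumptions:

import random
import torch

def spec_augment_sketch(feat, T=40, F=15, time_mask_num=2, freq_mask_num=2):
    # feat: (time, freq) tensor; T and F bound the width of each mask
    n_frames, n_freqs = feat.size(0), feat.size(1)
    for _ in range(time_mask_num):
        t = random.randint(0, min(T, n_frames))   # mask width
        t0 = random.randint(0, n_frames - t)      # mask start
        feat[t0:t0 + t, :] = 0
    for _ in range(freq_mask_num):
        f = random.randint(0, min(F, n_freqs))
        f0 = random.randint(0, n_freqs - f)
        feat[:, f0:f0 + f] = 0
    return feat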
Example #6
    def logger_hparams(self):
        logger.info("use_bidirectional : %s" % str(self.use_bidirectional))
        logger.info("use_attention : %s" % str(self.use_attention))
        logger.info("use_pickle : %s" % str(self.use_pickle))
        logger.info("use_augment : %s" % str(self.use_augment))
        logger.info("use_pyramidal : %s" % str(self.use_pyramidal))
        logger.info("augment_ratio : %0.2f" % self.augment_ratio)
        logger.info("input_reverse : %s" % str(self.input_reverse))
        logger.info("hidden_size : %d" % self.hidden_size)
        logger.info("listener_layer_size : %d" % self.listener_layer_size)
        logger.info("speller_layer_size : %d" % self.speller_layer_size)
        logger.info("dropout : %0.2f" % self.dropout)
        logger.info("batch_size : %d" % self.batch_size)
        logger.info("worker_num : %d" % self.worker_num)
        logger.info("max_epochs : %d" % self.max_epochs)
        logger.info("learning rate : %0.4f" % self.lr)
        logger.info("teacher_forcing_ratio : %0.2f" % self.teacher_forcing)
        logger.info("seed : %d" % self.seed)
        logger.info("max_len : %d" % self.max_len)
        logger.info("use_cuda : %s" % str(self.use_cuda))
        logger.info("save_name : %s" % self.save_name)
        logger.info("mode : %s" % self.mode)
def load_pickle(filepath, message=""):
    with open(filepath, "rb") as f:
        load_result = pickle.load(f)
        logger.info(message)
        return load_result
def save_pickle(save_var, savepath, message=""):
    with open(savepath, "wb") as f:
        pickle.dump(save_var, f)
    logger.info(message)
def load_model(filepath):
    logger.info("Load model..")
    model = torch.load(filepath)
    model.eval()
    logger.info("Load model Succesfuuly completely !!")
    return model
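A round-trip with the pickle helpers (the path and variable are illustrative):

save_pickle({'audio_paths': audio_paths}, './target_dict.bin', message='saved')
data = load_pickle('./target_dict.bin', message='loaded')

Note that torch.load() on a fully pickled model, as in load_model(), only works if the original model classes are importable at load time.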
Example #10
def train(model, total_batch_size, queue, loss_func, optimizer, device,
          train_begin, worker_num, print_batch=5, teacher_forcing_ratio=1):
    total_loss = 0.
    total_num = 0
    total_dist = 0
    total_length = 0
    total_sent_num = 0
    batch = 0

    model.train()
    begin = epoch_begin = time.time()

    while True:
        feats, targets, feat_lengths, label_lengths = queue.get()
        if feats.shape[0] == 0:
            # an empty batch means one train loader worker has finished
            worker_num -= 1
            logger.debug('left train_loader: %d' % worker_num)

            if worker_num == 0:
                break
            continue
        optimizer.zero_grad()

        feats = feats.to(device)
        targets = targets.to(device)
        target = targets[:, 1:]
        model.module.flatten_parameters()

        # Seq2seq forward()
        y_hat, logit = model(feats, targets, teacher_forcing_ratio)

        loss = loss_func(logit.contiguous().view(-1, logit.size(-1)), target.contiguous().view(-1))
        total_loss += loss.item()
        total_num += sum(feat_lengths)
        display = random.randrange(0, 100) == 0  # print a sample transcription ~1% of the time
        dist, length = get_distance(target, y_hat, display=display)
        total_dist += dist
        total_length += length
        total_sent_num += target.size(0)
        loss.backward()
        optimizer.step()

        if batch % print_batch == 0:
            current = time.time()
            elapsed = current - begin
            epoch_elapsed = (current - epoch_begin) / 60.0
            train_elapsed = (current - train_begin) / 3600.0

            logger.info('batch: {:4d}/{:4d}, loss: {:.4f}, cer: {:.2f}, elapsed: {:.2f}s {:.2f}m {:.2f}h'
                .format(batch,
                        total_batch_size,
                        total_loss / total_num,
                        total_dist / total_length,
                        elapsed, epoch_elapsed, train_elapsed))
            begin = time.time()

        if batch % 1000 == 0:
            save_step_result(train_step_result, total_loss / total_num, total_dist / total_length)

        batch += 1
        train.cumulative_batch_count += 1  # function attribute; must be initialized before the first call

    logger.info('train() completed')
    return total_loss / total_num, total_dist / total_length
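train() increments train.cumulative_batch_count as a function attribute, so it has to be created before the first call. A sketch of the surrounding loop (the queue and hyperparameter names are assumptions):

train.cumulative_batch_count = 0  # must exist before train() runs
for epoch in range(max_epochs):
    train_loss, train_cer = train(model, total_batch_size, train_queue,
                                  loss_func, optimizer, device,
                                  train_begin, worker_num)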