def prepare_batch(self, features, texts):
        """ Zero pad a minibatch of feature arrays and bundle it with its labels
        Params:
            features (list(np.array)): List of ECoG data; each array is indexed
                as (timesteps, feature_dim)
            texts (list(str)): List of texts corresponding to the features
        Returns:
            dict: Padded inputs, labels, original texts and input lengths
                (see the comments on the returned dictionary)
        """
        assert len(features) == len(texts),\
            "Inputs and outputs to the network must be of the same number"
        # Number of timesteps of every example, before any padding
        input_lengths = [example.shape[0] for example in features]

        # The batch dimension follows the data actually passed in, so a
        # smaller final batch is handled without special casing
        padded_shape = (len(features), max(input_lengths),
                        features[0].shape[1])
        x = np.zeros(padded_shape)
        # Copy each example into the top of its row; the rest stays zero.
        # Note that no normalization is applied to the features here.
        for row, example in enumerate(features):
            x[row, :example.shape[0], :] = example

        y = text_to_int_sequence(texts)
        sparse_y = sparse_tensor_feed(y)

        return {
            'x': x,  # 0-padded features, shape (mb_size, timesteps, feat_dim)
            'y': y,  # Labels as produced by text_to_int_sequence
            'sparse_y': sparse_y,  # A tuple with (indices, values, shape)
            'texts': texts,  # list(str) Original texts
            'input_lengths': input_lengths,  # list(int) Length of each input
        }
Beispiel #2
0
def get_intseq(trans, max_intseq_length=80):
    """Encode ``trans`` as integers and force the result to a fixed length.

    Sequences shorter than ``max_intseq_length`` are right-padded with the
    value 27; longer ones are cut off at ``max_intseq_length``.
    """
    encoded = text_to_int_sequence(trans)
    shortfall = max_intseq_length - len(encoded)
    if shortfall > 0:
        # 27 is the filler value used for padding
        encoded.extend([27] * shortfall)
    return encoded[:max_intseq_length]
Beispiel #3
0
    def get_batch(self, partition):
        """ Obtain a batch of train, validation, or test data
        Params:
            partition (str): One of 'train', 'valid' or 'test'
        Returns:
            tuple: (inputs, outputs). inputs is a dict with 'the_input'
                (zero-padded features), 'the_labels' (integer labels padded
                with 28), 'input_length' and 'label_length' (one row per
                example); outputs is a dict with a zero-filled 'ctc' entry
        Raises:
            Exception: If partition is not one of the accepted names
        """
        if partition == 'train':
            audio_paths = self.train_audio_paths
            cur_index = self.cur_train_index
            texts = self.train_texts
        elif partition == 'valid':
            audio_paths = self.valid_audio_paths
            cur_index = self.cur_valid_index
            texts = self.valid_texts
        elif partition == 'test':
            audio_paths = self.test_audio_paths
            # NOTE(review): 'train' and 'valid' read cur_<partition>_index,
            # but this reads test_valid_index - confirm the attribute name
            # against the class definition.
            cur_index = self.test_valid_index
            texts = self.test_texts
        else:
            raise Exception("Invalid partition. Must be train/valid/test")

        batch_paths = audio_paths[cur_index:cur_index + self.minibatch_size]
        features = [self.normalize(self.featurize(a)) for a in batch_paths]

        # The slice can hold fewer than minibatch_size items near the end of
        # the data, so every size below follows what was actually loaded.
        batch_size = len(features)
        batch_texts = [texts[cur_index + i] for i in range(batch_size)]

        # calculate necessary sizes
        max_length = max(feat.shape[0] for feat in features)
        max_string_length = max(len(text) for text in batch_texts)
        feature_dim = self.feat_dim if self.spectrogram else self.mfcc_dim

        # initialize the arrays
        X_data = np.zeros([batch_size, max_length, feature_dim])
        labels = np.ones([batch_size, max_string_length]) * 28  # blanks
        input_length = np.zeros([batch_size, 1])
        label_length = np.zeros([batch_size, 1])

        for i, (feat, text) in enumerate(zip(features, batch_texts)):
            # calculate X_data & input_length
            input_length[i] = feat.shape[0]
            X_data[i, :feat.shape[0], :] = feat

            # calculate labels & label_length
            label = np.array(text_to_int_sequence(text))
            labels[i, :len(label)] = label
            label_length[i] = len(label)

        # return the arrays
        outputs = {'ctc': np.zeros([batch_size])}
        inputs = {
            'the_input': X_data,
            'the_labels': labels,
            'input_length': input_length,
            'label_length': label_length
        }
        return (inputs, outputs)
Beispiel #4
0
def get_max_intseq(comb):
    """Return the length of the longest integer encoding among ``comb``.

    Every item is encoded with ``text_to_int_sequence``. Items whose encoding
    fails are reported on stdout and skipped. Returns 0 when ``comb`` is empty
    or when no item could be encoded.
    """
    max_intseq_length = 0
    for x in comb:
        try:
            length = len(text_to_int_sequence(x))
        except Exception:
            # Best effort: report the offending item and keep going. Unlike a
            # bare ``except``, this lets KeyboardInterrupt/SystemExit through.
            print("error at:", x)
            continue
        max_intseq_length = max(max_intseq_length, length)
    return max_intseq_length
 def prepare_minibatch(self, audio_paths, texts, durations, arpabets):
     """ Featurize a minibatch of audio, zero pad them and return a dictionary
     Params:
         audio_paths (list(str)): List of paths to audio files
         texts (list(str)): List of texts corresponding to the audio files
         durations: Stored under 'durations' when self.use_durations is set
         arpabets: One arpabet entry per example; only read when
             self.use_arpabets is set
     Returns:
         dict: See below for contents
     """
     assert len(audio_paths) == len(texts),\
         "Inputs and outputs to the network must be of the same number"
     # Features is a list of (timesteps, feature_dim) arrays
     features = [self.featurize(a) for a in audio_paths]
     input_lengths = [f.shape[0] for f in features]
     max_length = max(input_lengths)
     feature_dim = features[0].shape[1]
     mb_size = len(features)
     # Pad all the inputs so that they are all the same length
     x = np.zeros((mb_size, max_length, feature_dim))
     y = []
     label_lengths = []
     for i, feat in enumerate(features):
         feat = self.normalize(feat)  # Center using means and std
         x[i, :feat.shape[0], :] = feat
         label = text_to_int_sequence(text_normalize(texts[i]))
         y.append(label)
         label_lengths.append(len(label))
     # Labels are built from the normalized text, which may be longer than
     # the raw text. Padding to the raw-text length alone would then truncate
     # a label while label_lengths still reports its full length, so the
     # width is never allowed to drop below the longest label.
     label_maxlen = max(len(max(texts, key=len)), max(label_lengths))
     y = pad_sequences(y, maxlen=label_maxlen, dtype='int32',
                       padding='post', truncating='post', value=-1)
     res = {
         'x': x,  # (0-padded features of shape(mb_size,timesteps,feat_dim)
         'y': y,  # Integer labels, padded at the end with -1
         'texts': texts,  # list(str) Original texts
         'input_lengths': input_lengths,  # list(int) Length of each input
         'label_lengths': label_lengths  # list(int) Length of each label
         # 'durations' [if use_durations] the durations argument, as passed
         # 'phonemes' [if use_arpabets] arpabet ints, padded with -1
         # 'phoneme_lengths' [if use_arpabets] list(int) Unpadded lengths
     }
     if self.use_durations:
         res['durations'] = durations
     if self.use_arpabets:
         arpints = [arpabet_to_int_sequence(arpabets[i])
                    for i in range(mb_size)]
         arpaint_lengths = [len(seq) for seq in arpints]
         res['phonemes'] = pad_sequences(arpints, maxlen=max(arpaint_lengths),
                                         dtype='int32', padding='post',
                                         truncating='post', value=-1)
         res['phoneme_lengths'] = arpaint_lengths
     return res
    def get_batch(self, partition):
        """ Obtain a batch of train, validation, or test data
        Params:
            partition (str): One of 'train', 'valid' or 'test'
        Returns:
            tuple: (inputs, outputs). inputs is a dict with 'the_input'
                (zero-padded features), 'the_labels' (integer labels padded
                with 28), 'input_length' and 'label_length' (one row per
                example); outputs is a dict with a zero-filled 'ctc' entry
        Raises:
            Exception: If partition is not one of the accepted names
        """
        if partition == 'train':
            audio_paths = self.train_audio_paths
            cur_index = self.cur_train_index
            texts = self.train_texts
        elif partition == 'valid':
            audio_paths = self.valid_audio_paths
            cur_index = self.cur_valid_index
            texts = self.valid_texts
        elif partition == 'test':
            audio_paths = self.test_audio_paths
            # NOTE(review): 'train' and 'valid' read cur_<partition>_index,
            # but this reads test_valid_index - confirm the attribute name
            # against the class definition.
            cur_index = self.test_valid_index
            texts = self.test_texts
        else:
            raise Exception("Invalid partition. Must be train/valid/test")

        batch_paths = audio_paths[cur_index:cur_index + self.minibatch_size]
        features = [self.normalize(self.featurize(a)) for a in batch_paths]

        # The slice can hold fewer than minibatch_size items near the end of
        # the data, so every size below follows what was actually loaded.
        batch_size = len(features)
        batch_texts = [texts[cur_index + i] for i in range(batch_size)]

        # calculate necessary sizes
        max_length = max(feat.shape[0] for feat in features)
        max_string_length = max(len(text) for text in batch_texts)
        feature_dim = self.feat_dim if self.spectrogram else self.mfcc_dim

        # initialize the arrays; 28 is the value used to pad the labels
        X_data = np.zeros([batch_size, max_length, feature_dim])
        labels = np.ones([batch_size, max_string_length]) * 28
        input_length = np.zeros([batch_size, 1])
        label_length = np.zeros([batch_size, 1])

        for i, (feat, text) in enumerate(zip(features, batch_texts)):
            # calculate X_data & input_length
            input_length[i] = feat.shape[0]
            X_data[i, :feat.shape[0], :] = feat

            # calculate labels & label_length
            label = np.array(text_to_int_sequence(text))
            labels[i, :len(label)] = label
            label_length[i] = len(label)

        # return the arrays
        outputs = {'ctc': np.zeros([batch_size])}
        inputs = {'the_input': X_data,
                  'the_labels': labels,
                  'input_length': input_length,
                  'label_length': label_length
                 }
        return (inputs, outputs)
    def get_batch(self, partition):
        """ Obtain a batch of train, validation, or test data
        Params:
            partition (str): One of 'train', 'valid' or 'test'
        Returns:
            tuple: (inputs, outputs). inputs is a dict with 'the_input'
                (zero-padded features), 'the_labels' (integer labels padded
                with 28), 'input_length' and 'label_length' (one row per
                example); outputs is a dict with a zero-filled 'ctc' entry
        Raises:
            Exception: If partition is not one of the accepted names
        """
        if partition == 'train':
            audio_paths = self.train_audio_paths
            cur_index = self.cur_train_index
            texts = self.train_texts
        elif partition == 'valid':
            audio_paths = self.valid_audio_paths
            cur_index = self.cur_valid_index
            texts = self.valid_texts
        elif partition == 'test':
            audio_paths = self.test_audio_paths
            # NOTE(review): 'train' and 'valid' read cur_<partition>_index,
            # but this reads test_valid_index - confirm the attribute name
            # against the class definition.
            cur_index = self.test_valid_index
            texts = self.test_texts
        else:
            raise Exception("Invalid partition. Must be train/valid/test")

        batch_paths = audio_paths[cur_index:cur_index + self.minibatch_size]
        features = [self.normalize(self.featurize(a)) for a in batch_paths]

        # The slice can hold fewer than minibatch_size items near the end of
        # the data, so every size below follows what was actually loaded.
        batch_size = len(features)
        batch_texts = [texts[cur_index + i] for i in range(batch_size)]

        max_length = max(feat.shape[0] for feat in features)
        max_string_length = max(len(text) for text in batch_texts)
        feature_dim = self.feat_dim if self.spectrogram else self.mfcc_dim

        # Zero-filled inputs; labels start out filled with the pad value 28
        X_data = np.zeros([batch_size, max_length, feature_dim])
        labels = np.ones([batch_size, max_string_length]) * 28
        input_length = np.zeros([batch_size, 1])
        label_length = np.zeros([batch_size, 1])

        for i, (feat, text) in enumerate(zip(features, batch_texts)):
            input_length[i] = feat.shape[0]
            X_data[i, :feat.shape[0], :] = feat
            label = np.array(text_to_int_sequence(text))
            labels[i, :len(label)] = label
            label_length[i] = len(label)

        outputs = {'ctc': np.zeros([batch_size])}
        inputs = {
            'the_input': X_data,
            'the_labels': labels,
            'input_length': input_length,
            'label_length': label_length
        }
        return (inputs, outputs)
Beispiel #8
0
    def prepare_minibatch(self, audio_paths, texts, mode):
        """ Featurize a minibatch of audio, zero pad them and return a dictionary
        Params:
            audio_paths (list(str)): List of paths to audio files
            texts (list(str)): List of texts corresponding to the audio files
            mode: Passed through unchanged to self.featurize
        Returns:
            dict: See below for contents
        """
        assert len(audio_paths) == len(texts),\
            "Inputs and outputs to the network must be of the same number"
        # Features is a list of (timesteps, feature_dim) arrays
        features = [self.featurize(a, mode) for a in audio_paths]
        input_lengths = [f.shape[0] for f in features]
        max_length = max(input_lengths)
        feature_dim = features[0].shape[1]
        mb_size = len(features)
        # Pad all the inputs so that they are all the same length
        x = np.zeros((mb_size, max_length, feature_dim))
        encoded_labels = []
        label_lengths = []
        for i, feat in enumerate(features):
            feat = self.normalize(feat)  # Center using means and std
            x[i, :feat.shape[0], :] = feat
            label = text_to_int_sequence(texts[i])
            encoded_labels.append(label)
            label_lengths.append(len(label))

        # Pad the labels with -1 so they form a single array of shape
        # [batch_size, max_label_length]
        max_label_length = max(label_lengths)
        y = np.full((mb_size, max_label_length), -1)
        for i, label in enumerate(encoded_labels):
            y[i, :label_lengths[i]] = label
        return {
            'x': x,  # (0-padded features of shape(mb_size,timesteps,feat_dim)
            'y': y,  # (mb_size, max_label_length) int labels, -1 padded
            'texts': texts,  # list(str) Original texts
            'input_lengths': input_lengths,  # list(int) Length of each input
            'label_lengths': label_lengths  # list(int) Length of each label
        }
    def get_batch(self, index, size, audio_paths, texts):
        """ Build one batch of padded features and labels
        Params:
            index (int): Position of the first example of the batch
            size (int): Number of examples in the batch
            audio_paths: Not used here; features are read from self.features
            texts (list(str)): Transcriptions, indexed like self.features
        Returns:
            tuple: (inputs, outputs). inputs is a dict with 'the_input',
                'the_labels', 'input_length' and 'label_length'; outputs is
                a dict with a zero-filled 'ctc' entry
        """
        # pull necessary info
        batch_features = [self.features[index + i] for i in range(size)]
        batch_texts = [texts[index + i] for i in range(size)]
        max_length = max(feat.shape[0] for feat in batch_features)
        # Size the label matrix from the same texts the labels are encoded
        # from, so it is wide enough whichever text list the caller passes.
        max_string_length = max(len(text) for text in batch_texts)

        # initialize the arrays; 28 is the value used to pad the labels
        X_data = np.zeros([size, max_length, self.feat_dim])
        labels = np.ones([size, max_string_length]) * 28
        input_length = np.zeros([size, 1])
        label_length = np.zeros([size, 1])

        # populate the arrays
        for i, (feat, text) in enumerate(zip(batch_features, batch_texts)):
            # X_data, input_length
            input_length[i] = feat.shape[0]
            feat = self.normalize(feat)
            X_data[i, :feat.shape[0], :] = feat

            # y, label_length; every label index is shifted down by one
            label = np.array(text_to_int_sequence(text)) - 1
            labels[i, :len(label)] = label
            label_length[i] = len(label)

        # prepare and return the arrays; the input lengths are mapped through
        # conv_output_length with the filter settings given below
        input_length = np.array([
            conv_output_length(n,
                               filter_size=11,
                               border_mode='valid',
                               stride=2) for n in input_length
        ])
        outputs = {'ctc': np.zeros([size])}
        inputs = {
            'the_input': X_data,  # array; dim: size x max_length x feat_dim
            'the_labels': labels,  # array; dim: size x max_string_length
            'input_length': input_length,  # one entry per example
            'label_length': label_length  # array; dim: size x 1
        }
        return (inputs, outputs)
Beispiel #10
0
def get_maxseq_len(trans):
    """Return how many integers ``text_to_int_sequence`` produces for ``trans``."""
    return len(text_to_int_sequence(trans))
# Script fragment: fills the label/length arrays for one batch. It relies on
# names defined before this fragment (X_data, size, index, max_length,
# max_string_length, audio_gen).
# Label matrix starts out filled with 28; real labels overwrite the front of
# each row below.
labels = np.ones([size, max_string_length]) * 28
input_length = np.zeros([size, 1])
label_length = np.zeros([size, 1])

for i in range(0, size):
    # X_data, input_length
    feat = audio_gen.features[index + i]
    feat = audio_gen.normalize(feat)
    # NOTE(review): the length is derived from max_length, so every row gets
    # the same value regardless of feat.shape[0] - confirm this is intended.
    input_length[i] = conv_output_length(max_length,
                                         filter_size=11,
                                         border_mode='valid',
                                         stride=2)
    X_data[i, :feat.shape[0], :] = feat

    # y, label_length; every label index is shifted down by one
    label = np.array(text_to_int_sequence(
        audio_gen.train_texts[index + i])) - 1
    labels[i, :len(label)] = label
    # NOTE(review): hard-coded 133 instead of len(label) - confirm.
    label_length[i] = 133


def decode_batch(test_func, audio):
    """Greedily decode the output of ``test_func`` into one string per example.

    ``test_func`` is called with ``[audio]`` and the first element of its
    result is read as an array indexed (example, timestep, class). For every
    example the highest-scoring class of each timestep is taken, consecutive
    repeats are collapsed, and the remaining classes are mapped to text.

    Returns:
        list(str): One decoded string per example.
    """
    out = test_func([audio])[0]
    ret = []
    for j in range(out.shape[0]):
        out_best = list(np.argmax(out[j, :], 1))
        # Collapse runs of the same class into a single occurrence
        out_best = [k for k, g in itertools.groupby(out_best)]
        # 0-25 are 'a'-'z', 26 is space, 27 is CTC blank char (dropped)
        chars = []
        for c in out_best:
            if 0 <= c < 26:
                chars.append(chr(c + ord('a')))
            elif c == 26:
                chars.append(' ')
        ret.append(''.join(chars))
    return ret
    def load_metadata_from_desc_file(self, desc_file, partition='train',
                                     max_duration=10.0):
        """ Read metadata from the description file
            (possibly takes long, depending on the filesize)
        Params:
            desc_file (str):  Path to a JSON-line file that contains labels and
                paths to the audio files
            partition (str): One of 'train', 'validation' or 'test'
            max_duration (float): In seconds, the maximum duration of
                utterances to train or test on
        Raises:
            Exception: If partition is not 'train', 'validation' or 'test'
        """
        # Reject a bad partition before the (possibly long) file scan
        if partition not in ('train', 'validation', 'test'):
            raise Exception("Invalid partition to load metadata. "
                            "Must be train/validation/test")

        logger.info('Reading description file: {} for partition: {}'
                    .format(desc_file, partition))
        audio_paths, durations, texts, arpabets = [], [], [], []
        with open(desc_file, encoding='utf-8') as json_line_file:
            for line_num, json_line in enumerate(json_line_file):
                try:
                    spec = json.loads(json_line)
                    duration = float(spec['duration'])
                    if duration > max_duration:
                        continue
                    textlen = len(text_to_int_sequence(
                        text_normalize(spec['text'])))
                    speclen = len(spectrogram_from_file(spec['key']))
                    # Skip utterances whose label is longer than the features
                    if textlen > speclen:
                        print('label > feats ignore setence')
                        continue
                    if textlen < 2:
                        print('small label ignore setence')
                        continue
                    # Read every field before appending anything, so that a
                    # missing key cannot leave the lists misaligned.
                    key, text = spec['key'], spec['text']
                    arpabet = spec['arpabet'] if self.use_arpabets else None
                except Exception as e:
                    # Best effort: a line that cannot be processed is logged
                    # and skipped rather than aborting the whole load.
                    logger.warning('Error reading line #{}: {}'
                                   .format(line_num, json_line))
                    logger.warning(str(e))
                    continue
                audio_paths.append(key)
                durations.append(duration)
                texts.append(text)
                if self.use_arpabets:
                    arpabets.append(arpabet)

        if not self.use_arpabets:
            # Keep arpabets aligned with the other lists using empty entries
            arpabets = [''] * len(audio_paths)

        if partition == 'train':
            self.train_audio_paths = audio_paths
            self.train_durations = durations
            self.train_texts = texts
            self.train_arpabets = arpabets
        elif partition == 'validation':
            self.val_audio_paths = audio_paths
            self.val_durations = durations
            self.val_texts = texts
            self.val_arpabets = arpabets
        else:  # 'test', guaranteed by the check at the top
            self.test_audio_paths = audio_paths
            self.test_durations = durations
            self.test_texts = texts
            self.test_arpabets = arpabets