def load_demo_data(path):
    """Build the demo training and validation sets from the original train CSV.

    Reads ``../data/original_data/train.csv``, maps every label to an integer
    index, extracts two arrays through ``prepare_data``, runs ``preprocess``
    on the first of them, pickles the mean returned by ``preprocess`` and
    finally splits the samples with a 10-fold ``StratifiedKFold``, keeping
    the fold whose enumeration index is 1.

    Args:
        path: forwarded unchanged to ``prepare_data``.

    Returns:
        ``(train_set, val_set)``: two ``DataSet`` objects. ``features`` comes
        from the second array returned by ``prepare_data`` (with a trailing
        axis added), ``targets`` from ``to_categorical`` and
        ``data_dict['mfcc']`` from the preprocessed first array.
    """
    frame = pd.read_csv("../data/original_data/train.csv")
    class_names = list(frame.label.unique())
    index_of = {name: position for position, name in enumerate(class_names)}
    frame.set_index("fname", inplace=True)
    frame["label_idx"] = frame.label.apply(lambda name: index_of[name])
    # Evaluated exactly as before although the result is not used below.
    verified_rows = frame[frame.manually_verified == 1]

    config = Config(
        sampling_rate_raw=16000,
        audio_duration=2,
        n_folds=10,
        learning_rate=0.001,
        use_mfcc=True,
        n_mfcc=50,
        sampling_rate=16000,
    )
    mfcc_input, feature_input = prepare_data(frame, config, path)
    one_hot = to_categorical(frame.label_idx, num_classes=config.n_classes)
    mfcc_input, train_mean = preprocess(mfcc_input)
    pickle_data(train_mean, '../data/original_data/train_mean.pkl')

    # TODO:
    # Split the samples into a training part and a validation part.
    folds = StratifiedKFold(frame.label_idx, n_folds=10)
    for fold_no, (train_split, val_split) in enumerate(folds):
        if fold_no != 1:
            continue
        chosen_train, chosen_val = train_split, val_split
        break

    feature_input = np.expand_dims(feature_input, axis=-1)

    train_set = DataSet(
        features=feature_input[chosen_train],
        targets=one_hot[chosen_train],
        data_dict={'mfcc': mfcc_input[chosen_train]},
    )
    val_set = DataSet(
        features=feature_input[chosen_val],
        targets=one_hot[chosen_val],
        data_dict={'mfcc': mfcc_input[chosen_val]},
    )
    return train_set, val_set
def load_as_tframe_data(cls, data_dir):
    """Return the data set stored in ``data_dir``, converting raw data if needed.

    If the file ``cls.TFD_FILE_NAME`` exists inside ``data_dir`` it is loaded
    with ``DataSet.load``. Otherwise the raw arrays are read through
    ``cls.load_as_numpy_arrays``, wrapped into a ``DataSet``, optionally
    grouped per class, saved to that file and returned.

    Args:
        data_dir: directory holding the ``.tfd`` file and/or the raw data.

    Returns:
        The loaded or freshly converted ``DataSet``.
    """
    from .dataset import DataSet

    tfd_path = os.path.join(data_dir, cls.TFD_FILE_NAME)
    if os.path.exists(tfd_path):
        return DataSet.load(tfd_path)

    # No .tfd file yet: build the data set from the raw data.
    console.show_status('Trying to convert raw data to tframe DataSet ...')
    images, labels = cls.load_as_numpy_arrays(data_dir)
    converted = DataSet(images, labels, name=cls.DATA_NAME, **cls.PROPERTIES)

    # When the number of classes is known, record the sample indices of
    # every class under the GROUPS property.
    if converted.num_classes is not None:
        dense_labels = misc.convert_to_dense_labels(labels)
        converted.properties[converted.GROUPS] = [
            list(np.argwhere(
                [label == class_id for label in dense_labels]).ravel())
            for class_id in range(converted.num_classes)
        ]

    console.show_status(
        'Successfully converted {} samples'.format(converted.size))

    console.show_status('Saving data set ...')
    converted.save(tfd_path)
    console.show_status('Data set saved to {}'.format(tfd_path))
    return converted
def load_test_data(path, train_mean=None, num_rows=5):
    """Load the files listed in the sample-submission CSV as a test set.

    The previous implementation always called ``DataFrame.head()`` and thus
    silently kept only the first five rows. That limit is now the explicit
    ``num_rows`` parameter; its default keeps the old behaviour.

    Args:
        path: forwarded unchanged to ``prepare_data``.
        train_mean: forwarded to ``preprocess`` as ``train_mean``.
        num_rows: number of leading CSV rows to load. ``None`` loads every
            row of the CSV.

    Returns:
        A ``DataSet`` without targets. ``features`` is the second array
        returned by ``prepare_data`` with a trailing axis added, and
        ``data_dict['mfcc']`` is the preprocessed first array.
    """
    test_csv = pd.read_csv("../data/original_data/sample_submission.csv")
    test_csv.set_index("fname", inplace=True)

    config = Config(
        sampling_rate_raw=16000,
        audio_duration=2,
        n_folds=10,
        learning_rate=0.001,
        use_mfcc=True,
        n_mfcc=50,
        sampling_rate=16000,
    )

    if num_rows is not None:
        test_csv = test_csv.head(num_rows)

    X_test, X_test_t = prepare_data(test_csv, config, path)
    # The second value returned by preprocess is not needed here.
    X_test, _ = preprocess(X_test, train_mean=train_mean)

    # TODO:
    X_test_t = np.expand_dims(X_test_t, axis=-1)
    return DataSet(features=X_test_t, data_dict={'mfcc': X_test})
def emit(self, num_steps):
    """Cut the next block of steps out of every held data set.

    For each key of the first data set's ``data_dict`` a zero array of
    shape ``(self.size, steps, *sample_shape)`` is filled with the slice
    ``[cursor:cursor + steps]`` of every data set, after which the stored
    cursors are advanced by ``steps``.

    Args:
        num_steps: requested number of steps; a negative value requests
            ``self.max_emit_length``. The emitted length never exceeds
            ``self.max_emit_length``.

    Returns:
        A ``DataSet`` built from the filled arrays with ``is_rnn_input=True``.
    """
    assert self.is_ready
    assert isinstance(num_steps, int)
    if num_steps < 0:
        num_steps = self.max_emit_length
    # Never emit more than what is currently available
    steps = min(self.max_emit_length, num_steps)

    template = self._data_sets[0]
    assert isinstance(template, DataSet)

    emitted = {}
    advanced = None
    for key, data in template.data_dict.copy().items():
        assert isinstance(data, np.ndarray)
        buffer = np.zeros(shape=(self.size, steps, *data.shape[1:]))
        # Every key starts again from the cursors stored on the engine
        advanced = self._cursors.copy()
        for i in range(self.size):
            array = self._data_sets[i][key]
            assert isinstance(array, np.ndarray)
            start = advanced[i]
            buffer[i] = array[start:start + steps]
            # Move cursor
            assert 0 < start + steps <= len(array)
            advanced[i] += steps
        emitted[key] = buffer

    # Commit the advanced cursors (requires at least one key)
    assert advanced is not None
    self._cursors = advanced

    return DataSet(data_dict=emitted, is_rnn_input=True)
def gen_batches(self, batch_size, **kwargs):
    """Yield ``self.batches_per_epoch`` randomly generated batches.

    Every batch wraps the matrix and labels returned by
    ``self._random_signal_matrix(batch_size, self.input_size)`` in a
    ``DataSet`` named ``gpat_<k>of<total>``.

    Args:
        batch_size: forwarded to ``self._random_signal_matrix``.
        **kwargs: accepted but not used by this method.
    """
    assert self.is_ready
    checker.check_positive_integer(self.batches_per_epoch)
    total = self.batches_per_epoch
    for number in range(1, total + 1):
        matrix, labels = self._random_signal_matrix(
            batch_size, self.input_size)
        batch = DataSet(matrix, labels)
        batch.name = 'gpat_{}of{}'.format(number, total)
        yield batch
def tfr_view(self):
    """Show ``self.sequence`` in a tframe ``ImageViewer``.

    The sequence is divided by its maximum value before being wrapped into
    a ``DataSet`` and handed to the viewer.
    """
    from tframe.data.images.image_viewer import ImageViewer
    from tframe.data.dataset import DataSet

    frames = self.sequence
    scaled = frames / np.max(frames)
    ImageViewer(DataSet(features=scaled)).show()
def split_data_set(split_indices, data_set):
    """Return a new ``DataSet`` holding only the samples at ``split_indices``.

    Features are picked one by one into a list, targets are picked with a
    single indexing operation.

    Args:
        split_indices: indices of the samples to keep.
        data_set: the ``DataSet`` to take samples from.

    Returns:
        A ``DataSet`` built from the selected features and targets.
    """
    # TODO: only works when features are a list and targets are ndarrays
    assert isinstance(data_set, DataSet)
    picked_features = [data_set.features[index] for index in split_indices]
    picked_targets = data_set.targets[split_indices]
    return DataSet(picked_features, picked_targets)
def stack(self):
    """Concatenate this sequence set (a list of sequences with shape
    [steps, *dim]) into a regular array with shape [sum(steps), *dim].

    The result is cached in ``self.properties`` under ``self.DATA_STACK``
    and reused on later accesses.
    """
    cached = self.properties.get(self.DATA_STACK) \
        if self.DATA_STACK in self.properties.keys() else None
    if self.DATA_STACK in self.properties.keys():
        assert isinstance(cached, DataSet)
        return cached

    stacked = DataSet(
        data_dict=self._apply(np.concatenate, self.merged_data_dict),
        name=self.name + '(stacked)',
        **self.properties)
    self.properties[self.DATA_STACK] = stacked
    return self.stack
def padded_stack(self):
    """Stack this sequence set with zero padding. The output shape is
    (self.size, max_steps, *dim).

    The result is cached in ``self.properties`` under ``self.PADDED_STACK``;
    its ``active_length`` is set to ``self.structure``.
    """
    if self.PADDED_STACK in self.properties.keys():
        cached = self.properties[self.PADDED_STACK]
        assert isinstance(cached, DataSet)
        return cached

    longest = max(self.structure)

    def pad_to_longest(seqs):
        return self._pad_sequences(seqs, longest)

    padded = DataSet(
        data_dict=self._apply(pad_to_longest, self.merged_data_dict),
        name=self.name + '(padded_stack)',
        is_rnn_input=True,
        **self.properties)
    self.properties[self.PADDED_STACK] = padded
    self.padded_stack.active_length = self.structure
    return self.padded_stack
def activate(export_false=False):
    """Train the model, or run it on the images found in ``<data_dir>/test``.

    When ``th.train`` is truthy the model is trained on the loaded training
    set. Otherwise every file in the ``test`` sub-directory of
    ``th.data_dir`` is read in grayscale, divided by 255, resized to
    256x256, fed through ``model.predict`` and the predictions are shown in
    an ``ImageViewer``.

    Args:
        export_false: not used inside this function.
    """
    assert callable(th.model)
    model = th.model(th)
    assert isinstance(model, Predictor)

    train_set, val_set, test_set = load_data(th.data_dir, 600, -1, 1)

    if th.train:
        model.train(train_set, validation_set=val_set, trainer_hub=th,
                    probe=lambda t: probe(t, train_set))
        return

    from tframe.data.images.image_viewer import ImageViewer
    from tframe.data.dataset import DataSet
    import cv2
    import skimage.transform as transform

    test_dir = os.path.join(th.data_dir, 'test')
    tensors = []
    for file_name in os.listdir(test_dir):
        # Flag 0 makes cv2 read the file as a grayscale image
        img = cv2.imread(os.path.join(test_dir, file_name), 0)
        assert isinstance(img, np.ndarray)
        img = transform.resize(img / 255, [256, 256])
        tensors.append(img.reshape(1, 256, 256, 1))

    model_input = DataSet(features=np.concatenate(tensors))
    predictions = model.predict(model_input, batch_size=2)
    predictions = predictions.reshape([-1, 256, 256])
    ImageViewer(DataSet(features=predictions)).show()
def load_data(path, csv_path, fold=0):
    """Load a saved ``DataSet`` and split it into training/validation parts.

    Labels from ``csv_path`` are encoded as integer indices and used to
    build a 10-fold ``StratifiedKFold``; the fold with enumeration index
    ``fold`` decides which samples go where. Validation features are
    adapted to a fixed length with ``GPAT.length_adapted`` and accompanied
    by their MFCCs.

    Changes compared with the previous version:
      * ``librosa.feature.mfcc`` is called with keyword arguments, which is
        accepted by both old and new librosa releases.
      * Per-sample arrays are collected in lists and stacked once instead
        of calling ``np.concatenate`` inside the loop.
      * An out-of-range ``fold`` raises a ``ValueError`` with a clear
        message instead of failing on an unbound local variable.

    Args:
        path: file to load with ``DataSet.load``.
        csv_path: CSV file with ``fname`` and ``label`` columns.
        fold: enumeration index of the fold to use.

    Returns:
        ``(train_set, val_set, test_set, raw_val_set)`` where ``test_set``
        is the same object as ``val_set`` and ``raw_val_set`` is the
        validation split before length adaptation.

    Raises:
        ValueError: if ``fold`` does not match any generated fold.
    """
    # TODO:
    train = pd.read_csv(csv_path)
    labels = list(train.label.unique())
    label_idx = {label: i for i, label in enumerate(labels)}
    train.set_index("fname", inplace=True)
    train["label_idx"] = train.label.apply(lambda x: label_idx[x])

    # Split the train_set and the val_set
    skf = StratifiedKFold(train.label_idx, n_folds=10)
    train_split_0 = val_split_0 = None
    for i, (train_split, val_split) in enumerate(skf):
        if i == fold:
            train_split_0 = train_split
            val_split_0 = val_split
            break
    if train_split_0 is None:
        raise ValueError('!! fold {} is out of range'.format(fold))

    audio_length = 32000
    data_set = DataSet.load(path)
    assert isinstance(data_set, DataSet)
    train_split_data = Gpat_set.split_data_set(train_split_0, data_set)
    raw_val_set = Gpat_set.split_data_set(val_split_0, data_set)
    raw_val_set.properties[raw_val_set.NUM_CLASSES] = 41

    train_set = Gpat_set(features=train_split_data.features,
                         targets=train_split_data.targets, NUM_CLASSES=41)
    train_set.init_groups()

    # Bring every validation sample to the same length and compute MFCCs
    waves, mfccs = [], []
    for raw in raw_val_set.features:
        wave = GPAT.length_adapted(raw, audio_length)
        mfccs.append(librosa.feature.mfcc(y=wave, sr=16000, n_mfcc=50))
        waves.append(np.reshape(wave, -1))

    targets = raw_val_set.targets
    features = np.expand_dims(np.stack(waves), axis=2)
    mfccs = np.expand_dims(np.stack(mfccs), axis=-1)
    val_set = DataSet(features, targets, data_dict={'mfcc': mfccs})
    test_set = val_set
    return train_set, val_set, test_set, raw_val_set
def get_round_length(batch_size, num_steps, lengths, len_f=None):
    """Count how many emissions a ``ParallelEngine`` makes for given lengths.

    A ``ParallelEngine`` of size ``batch_size`` is fed zero-valued dummy
    data sets, one per entry of ``lengths``, and asked to emit
    ``num_steps`` steps until it reports ``flameout``.

    Args:
        batch_size: size handed to ``ParallelEngine``.
        num_steps: number of steps requested from every ``emit`` call.
        lengths: sequence lengths, checked with ``checker.check_type``.
        len_f: optional callable applied to every length before use.

    Returns:
        The number of ``emit`` calls that were made.
    """
    checker.check_type(lengths, int)
    engine = ParallelEngine(batch_size)
    rounds = 0
    next_index = 0
    total = len(lengths)
    while True:
        # Keep feeding the engine until it is ready to emit
        while not engine.is_ready:
            if next_index >= total:
                # Nothing left: feed None
                dummy = None
            else:
                raw_length = lengths[next_index]
                length = raw_length if len_f is None else len_f(raw_length)
                dummy = DataSet(features=np.zeros(shape=(length, 1)))
                next_index += 1
            engine.set_data(dummy)
        if engine.flameout:
            return rounds
        engine.emit(num_steps)
        rounds += 1
def _gen_rnn_batches(self, x, y, num_steps, *args):
    """Yield consecutive chunks of ``x`` and ``y`` along their second axis.

    Args:
        x: array of shape ``(batch, steps, self.input_size)``.
        y: array of shape ``(batch, steps, self.NUM_CLASSES)``.
        num_steps: chunk length; a negative value yields a single chunk
            covering all steps.
        *args: accepted but not used by this method.

    Yields:
        ``DataSet`` objects named ``gpat_<k>of<total>``; the first one has
        ``should_reset_state`` set to True.
    """
    # Sanity check
    assert isinstance(x, np.ndarray) and isinstance(y, np.ndarray)
    assert isinstance(num_steps, int)
    assert len(x.shape) == 3 and x.shape[2] == self.input_size
    total_steps = x.shape[1]
    assert y.shape == (x.shape[0], total_steps, self.NUM_CLASSES)

    if num_steps < 0:
        num_steps = total_steps
    chunk_count = int(np.ceil(total_steps / num_steps))

    for chunk_no in range(chunk_count):
        begin = chunk_no * num_steps
        # Slicing clamps at the end of the axis, so the last chunk may be
        # shorter than num_steps
        end = begin + num_steps
        batch = DataSet(x[:, begin:end], y[:, begin:end],
                        in_rnn_format=True)
        # State should be reset at the beginning of a sequence
        if chunk_no == 0:
            batch.should_reset_state = True
        batch.name = 'gpat_{}of{}'.format(chunk_no + 1, chunk_count)
        yield batch
def load_as_tframe_data(cls, data_dir, file_name=None, size=512,
                        unique_=True):
    """Load the embedded-Reber-grammar data set, creating it when missing.

    Args:
        data_dir: directory in which the data file lives.
        file_name: file name inside ``data_dir``; when None it is obtained
            from ``cls._get_file_name(size, unique_)``.
        size: forwarded to ``ReberGrammar.make_strings`` when creating data.
        unique_: forwarded to ``ReberGrammar.make_strings`` as well.

    Returns:
        The ``DataSet`` loaded from disk, or the newly created and saved one.
    """
    if file_name is None:
        file_name = cls._get_file_name(size, unique_)
    target_path = os.path.join(data_dir, file_name)
    if os.path.exists(target_path):
        return DataSet.load(target_path)

    # Nothing on disk yet, so generate the strings from scratch
    console.show_status('Making data ...')
    erg_list = ReberGrammar.make_strings(
        size, unique_, embedded=True, verbose=True)

    # One feature/target entry per generated string
    one_hots = [erg.one_hot for erg in erg_list]
    transfer_probs = [erg.transfer_prob for erg in erg_list]
    created = DataSet(one_hots, transfer_probs, {'erg_list': erg_list},
                      name='Embedded Reber Grammar')

    console.show_status('Saving data set ...')
    created.save(target_path)
    console.show_status('Data set saved to {}'.format(target_path))
    return created
def _gen_rnn_batches_by_wheel(self, batch_size, num_steps, round_len, L,
                              **_):
    """Each sequence in batch is a sub-sequence of length L of a randomly
    selected sequence. First introduced in sampling LOB data.
    The sub-sequence length L must be specified.

    The ``None`` checks now run before the numeric comparisons; previously
    passing ``None`` raised ``TypeError`` on ``None < 0`` before the
    fallback could apply.

    Args:
        batch_size: number of sub-sequences to sample; ``None`` or a
            negative value falls back to ``self.size``.
        num_steps: steps per yielded batch; ``None`` or a negative value
            falls back to ``L``.
        round_len: expected number of yielded batches.
        L: length of every sampled sub-sequence.

    Yields:
        The batches produced by ``DataSet.gen_rnn_batches`` on the sampled
        sub-sequences.

    Raises:
        AssertionError: if the number of yielded batches differs from
            ``round_len``.
    """
    # Sanity check
    if batch_size is None or batch_size < 0:
        batch_size = self.size
    if num_steps is None or num_steps < 0:
        num_steps = L

    # Generate feature list and target list
    features, targets = [], []
    wheel = Wheel(
        self.structure if th.use_wheel
        else list(np.ones([self.size]) / self.size))
    for _sample in range(batch_size):
        # Choose a sequence to sample from
        index = wheel.spin()
        t = np.random.randint(0, self.structure[index] - L + 1)
        x = self.features[index][t:t + L]
        y = self.targets[index][t:t + L]
        assert len(x) == len(y) == L
        features.append(x)
        targets.append(y)

    # Stack features and targets
    features, targets = np.stack(features), np.stack(targets)
    data_set = DataSet(features, targets, is_rnn_input=True)
    assert data_set.size == batch_size

    # Generate RNN batches using DataSet.gen_rnn_batches
    counter = 0
    for batch in data_set.gen_rnn_batches(
            batch_size, num_steps, is_training=True):
        yield batch
        counter += 1

    # Check round_len
    if counter != round_len:
        raise AssertionError(
            "!! counter = {} while round_len = {}. "
            "(batch_size = {}, num_steps={})"
            "".format(counter, round_len, batch_size, num_steps))
def rnn_batch_generator(data_set, batch_size, num_steps, is_training,
                        round_len):
    """Generated epoch batches are guaranteed to cover all sequences.

    ``batch_size`` sub-sequences of equal length are cut out of the
    sequences in ``data_set``, stacked, and handed to
    ``DataSet.gen_rnn_batches``.

    Args:
        data_set: a ``SequenceSet`` to sample from.
        batch_size: total number of sub-sequences to cut.
        num_steps: forwarded to ``DataSet.gen_rnn_batches``.
        is_training: must be truthy.
        round_len: expected number of yielded batches.

    Raises:
        AssertionError: if the number of yielded batches differs from
            ``round_len``.
    """
    assert isinstance(data_set, SequenceSet) and is_training
    L = int(sum(data_set.structure) / batch_size)
    assert L < min(data_set.structure) and L == th.sub_seq_len
    rad = int(th.random_shift_pct * L)

    # Decide how many sub-sequences each sequence contributes
    # (example structure: [23336, 44874, 38549, 54675, 93316])
    quotas = wise_man.apportion(data_set.structure, batch_size)

    sub_features, sub_targets = [], []
    for quota, x, y in zip(quotas, data_set.features, data_set.targets):
        # Start positions of the sub-sequences inside this sequence
        starts = wise_man.spread(len(x), quota, L, rad)
        assert len(starts) == quota
        sub_features.extend(x[s:s + L] for s in starts)
        sub_targets.extend(y[s:s + L] for s in starts)

    stacked = DataSet(np.stack(sub_features), np.stack(sub_targets),
                      is_rnn_input=True)
    assert stacked.size == batch_size

    # Delegate batching to DataSet.gen_rnn_batches and count the batches
    counter = 0
    for counter, batch in enumerate(
            stacked.gen_rnn_batches(batch_size, num_steps, is_training=True),
            start=1):
        yield batch

    if counter != round_len:
        raise AssertionError(
            '!! counter = {} while round_len = {}. (batch_size = {}, num_steps={})'
            .format(counter, round_len, batch_size, num_steps))
def gen_batches(self, batch_size, shuffle=False):
    """Yield batches of length-adapted, normalized audio with their MFCCs.

    Fixes compared with the previous version:
      * Without ``shuffle`` the sequential index range was computed but
        never assigned to ``indices``, so the first batch failed on an
        unbound name.
      * The inner loop no longer reuses the outer loop variable.
      * ``librosa.feature.mfcc`` is called with keyword arguments, which is
        accepted by both old and new librosa releases.
      * Per-sample arrays are stacked once instead of being concatenated
        inside the loop.

    Args:
        batch_size: number of samples per batch.
        shuffle: when True, indices come from ``self._rand_indices``;
            otherwise batches walk through the samples in order and the
            last batch may be smaller.

    Yields:
        ``DataSet`` objects whose ``features`` have shape
        ``(batch, length, 1)``, with the matching MFCCs stored under
        ``data_dict['mfcc']``.
    """
    round_len = self.get_round_length(batch_size)
    for i in range(round_len):
        if shuffle:
            indices = self._rand_indices(size=batch_size)
        else:
            stop = min((i + 1) * batch_size, len(self.targets))
            indices = list(range(i * batch_size, stop))

        waves, mfccs = [], []
        for index in indices:
            wave = GPAT.length_adapted(self.features[index],
                                       self.audio_length)
            # MFCCs are computed on the signal before normalization
            mfccs.append(librosa.feature.mfcc(y=wave, sr=16000, n_mfcc=50))
            waves.append(np.reshape(GPAT.audio_norm(wave), -1))

        targets = self.targets[indices]
        features = np.expand_dims(np.stack(waves), axis=2)
        mfccs = np.expand_dims(np.stack(mfccs), axis=-1)
        yield DataSet(features, targets, data_dict={'mfcc': mfccs})
def load_as_tframe_data(cls, data_dir, **kwargs):
    """Load the character-level data set, creating it from raw data if needed.

    When the file returned by ``cls._get_data_paths`` is missing, the raw
    data is read through ``cls._load_raw_data``. Each element becomes a
    feature and its successor the matching target. The resulting
    ``DataSet`` is saved to that file.

    Args:
        data_dir: directory holding the data.
        **kwargs: accepted but not used by this method.

    Returns:
        The loaded or newly created ``DataSet``.
    """
    data_path = cls._get_data_paths(data_dir)
    if not os.path.exists(data_path):
        # Nothing saved yet: build the set from the raw data
        console.show_status('Creating data sets ...')
        data, mapping = cls._load_raw_data(data_dir)
        inputs = np.array(data[:-1]).reshape(-1, 1)
        next_items = np.array(data[1:]).reshape(-1, 1)
        data_set = DataSet(inputs, next_items, name='Text8.char',
                           mapping=mapping)
        data_set.save(data_path)
        console.show_status(
            '{} saved to `{}`'.format(data_set.name, data_path))
    else:
        data_set = DataSet.load(data_path)

    # Report how many entries the mapping has
    mapping_size = len(data_set['mapping'])
    console.show_status(
        'Data sets (containing {} different characters) loaded:'.format(
            mapping_size))
    return data_set
# Script fragment: collects every full batch of 64 from the data generator
# into one DataSet and saves it, then adapts the validation features to a
# common length. Module-level names are kept unchanged because code outside
# this fragment may read them.
data_generator = DataGenerator(
    config=config,
    data_dir='../data/original_data/audio_train/',
    list_IDs=train.index,
    labels=train["label_idx"],
)
batches = len(train.index) // 64
for i in range(batches):
    feature, target = data_generator[i]
    if i > 0:
        features = np.concatenate((features, feature), axis=0)
        targets = np.concatenate((targets, target), axis=0)
    else:
        features = feature
        targets = target

demo_data = DataSet(features=features, targets=targets)
demo_data.save('../data/processed_data/demo_data_0')

# Leftover inspection values
a = data_generator[2]
b = a[0]
c = 1

# Adapt every validation feature to audio_length and stack them row-wise
for i in range(len(val_set.features)):
    if i > 0:
        feature = GPAT.length_adapted(val_set.features[i], audio_length)
        feature = np.reshape(feature, (1, -1))
        features = np.concatenate((features, feature), axis=0)
    else:
        features = GPAT.length_adapted(val_set.features[i], audio_length)
        features = np.reshape(features, (1, -1))
targets = val_set.targets
def f(u):
    """Return the flattened prediction of ``model`` for the features ``u``."""
    assert isinstance(model, Predictor)
    prediction = model.predict(DataSet(features=u))
    return np.ravel(prediction)