def testNestedOutputs(self):
    ds = Dataset.zip((Dataset.range(4),
                      Dataset.zip((Dataset.range(4), Dataset.range(4)))))
    total = 0
    # The Iterator will return a nested structure of Tensor objects.
    # Some funkiness to compare against simple integers.
    for (i, x) in enumerate(datasets.Iterator(ds)):
        want = (i, (i, i))
        got = (x[0].numpy(), (x[1][0].numpy(), x[1][1].numpy()))
        self.assertEqual(got, want)
        total += 1
    self.assertEqual(4, total)
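
# For comparison, a minimal sketch (not part of the original test) of the same
# nested-zip check written against the public tf.data API under TF 2.x eager
# execution, where no legacy Iterator wrapper is needed.
import tensorflow as tf

ds = tf.data.Dataset.zip(
    (tf.data.Dataset.range(4),
     tf.data.Dataset.zip((tf.data.Dataset.range(4), tf.data.Dataset.range(4)))))

total = 0
for i, (a, (b, c)) in enumerate(ds):
    # Each element is a nested tuple of scalar int64 tensors.
    assert (int(a), (int(b), int(c))) == (i, (i, i))
    total += 1
assert total == 4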
def deploy_dataset_generator(deploy_root_dir='data/Deploy',
                             batch_size=conf.batch_size,
                             input_size=conf.input_size):
    def _decode_image(im_path):
        im_raw = tf.read_file(im_path)
        # Convert to a grayscale image and downscale x2.
        image = tf.image.decode_jpeg(im_raw, channels=1, ratio=2)
        return image

    def _preprocess(im):
        # Convert to float scaled [0, 1].
        if im.dtype != tf.float32:
            im = tf.image.convert_image_dtype(im, dtype=tf.float32)
        # Resize image to output size.
        im = tf.image.resize_images(im, input_size)
        # H x W x C --> C x H x W
        return tf.transpose(im, perm=(2, 0, 1))

    def _sort(p):
        """'data/Deploy/KLAC/KLAC0003/KLAC0003_86.jpg'
        ==> 'data/Deploy/KLAC/KLAC0003/KLAC0003_0086.jpg'
        """
        prefix, old_name = p.split('_')
        new_name = old_name.zfill(8)
        return '_'.join([prefix, new_name])

    frames_name_list = sorted(glob('{}/*/*/*.jpg'.format(deploy_root_dir)),
                              key=_sort)
    dir_dataset = Dataset.from_tensor_slices(frames_name_list)
    img_dataset = dir_dataset.map(_decode_image)
    img_dataset = img_dataset.map(_preprocess)
    dataset = Dataset.zip((img_dataset, dir_dataset))
    return dataset.batch(batch_size), len(frames_name_list)
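
# A hedged sketch (not from the original) of how the returned pair might be
# consumed. The tf.read_file / tf.image.resize_images calls above suggest
# TF 1.x graph mode, so the batched dataset is driven through a one-shot
# iterator here; `conf.batch_size` and `conf.input_size` are assumed to come
# from the project's config module.
deploy_dataset, n_frames = deploy_dataset_generator('data/Deploy')
iterator = deploy_dataset.make_one_shot_iterator()
images, paths = iterator.get_next()  # images: [batch, C, H, W], paths: [batch]

with tf.Session() as sess:
    batch_images, batch_paths = sess.run([images, paths])
    print(n_frames, batch_images.shape, batch_paths[:2])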
def get_dataset_encoded(dir='train', batch_size=32):
    # Load encoder.
    encoder = tfds.deprecated.text.SubwordTextEncoder.load_from_file('vocab')
    print('Vocab size is', encoder.vocab_size)

    # Load data.
    with open('dataset/' + dir + '/original.txt') as original:
        # Remove newline at the end.
        data_orig = original.readlines()[:-1]
    with open('dataset/' + dir + '/shuffled.txt') as shuffled:
        data_shuffled = shuffled.readlines()[:-1]
    data = data_orig + data_shuffled

    # Get song with max length to know the size for padding.
    max_len = 0
    longest_song = ''
    count = 0
    for i in range(len(data)):
        count += 1
        data[i] = data[i].strip()
        song = data[i]
        data[i] = encoder.encode(data[i])
        if len(data[i]) > max_len:
            max_len = len(data[i])
            longest_song = song
    print('max len is', max_len)
    print('longest song:', longest_song)

    # Create labels.
    labels = [1] * len(data_orig) + [0] * len(data_shuffled)

    # Shuffle.
    random.seed(42)
    random.shuffle(data)
    random.seed(42)
    random.shuffle(labels)

    # Create Dataset objects from generators.
    data_gen = lambda: (d for d in data)
    label_gen = lambda: ([l] for l in labels)
    dataset_data = tf.data.Dataset.from_generator(
        data_gen, output_types=tf.int32, output_shapes=tf.TensorShape([None]))
    dataset_labels = tf.data.Dataset.from_generator(
        label_gen, output_types=tf.int32, output_shapes=tf.TensorShape([1]))
    dataset = Dataset.zip((dataset_data, dataset_labels))

    # Each batch is padded to max_len (the longest sequence in the whole
    # dataset), as given by padded_shapes.
    dataset_batched = dataset.padded_batch(batch_size, padding_values=0,
                                           padded_shapes=(max_len, 1))

    # Debug prints:
    print('{0} dataset: {1}'.format(dir, dataset_batched.cardinality()))
    # for element in dataset:
    #     print(element)
    for text_batch, label_batch in dataset_batched.take(1):
        print(text_batch.shape)
        print(label_batch.shape)
        for i in range(5):
            print(text_batch[i])
            print(label_batch[i])

    return dataset
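
# Self-contained illustration (hypothetical toy data, TF 2.x eager execution
# assumed) of the zip + padded_batch pattern used above: variable-length
# sequences are zero-padded to the given padded_shapes within each batch.
import tensorflow as tf

seqs = tf.data.Dataset.from_generator(
    lambda: [[1, 2, 3], [4, 5], [6]],
    output_types=tf.int32, output_shapes=tf.TensorShape([None]))
labs = tf.data.Dataset.from_generator(
    lambda: [[1], [0], [1]],
    output_types=tf.int32, output_shapes=tf.TensorShape([1]))
pairs = tf.data.Dataset.zip((seqs, labs)).padded_batch(
    2, padded_shapes=([3], [1]))
for x, y in pairs:
    print(x.numpy(), y.numpy())
# First batch: [[1 2 3] [4 5 0]] with labels [[1] [0]].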
def get_dataset(dir='train', batch_size=32, padding_char='_'):
    fixed_len = 275

    # Load data, padding from left to fixed length.
    with open('dataset/' + dir + '/original.txt') as original:
        # Remove newline at the end.
        data_orig = original.readlines()[:-1]
    with open('dataset/' + dir + '/shuffled.txt') as shuffled:
        data_shuffled = shuffled.readlines()[:-1]
    data = data_orig + data_shuffled

    for i in range(len(data)):
        data[i] = data[i].strip()
        # # Add padding
        # data_len = len(data[i].split(' '))
        # padding_len = fixed_len - data_len
        # data[i] = (padding_char + ' ') * padding_len + data[i]

    # Add labels.
    labels = [1] * len(data_orig) + [0] * len(data_shuffled)

    # Shuffle.
    random.seed(42)
    random.shuffle(data)
    random.seed(42)
    random.shuffle(labels)

    # Convert to tensors.
    data_tensor = tf.ragged.constant(data)
    labels_tensor = tf.ragged.constant(labels)

    # Convert to Dataset object.
    features_dataset = Dataset.from_tensor_slices(data_tensor)
    labels_dataset = Dataset.from_tensor_slices(labels_tensor)
    dataset = Dataset.zip((features_dataset, labels_dataset))
    dataset = dataset.batch(batch_size)

    # Debug prints:
    print('{0} dataset: {1}'.format(dir, dataset.cardinality()))
    # for element in dataset:
    #     print(element)
    # for text_batch, label_batch in dataset.take(1):
    #     for i in range(5):
    #         print(text_batch.numpy()[i])
    #         print(label_batch.numpy()[i])

    return dataset
tgt_dataset = Dataset.from_tensor_slices(
    tf.constant(['a b', 'b c', '', 'c c']))

src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant('eos')), tf.int32)
tgt_sos_id = tf.cast(tgt_vocab_table.lookup(tf.constant('sos')), tf.int32)
tgt_eos_id = tf.cast(tgt_vocab_table.lookup(tf.constant('eos')), tf.int32)

src_tgt_dataset = Dataset.zip((src_dataset, tgt_dataset))
print('begin')
print_Dataset(src_tgt_dataset)

src_tgt_dataset = src_tgt_dataset.map(
    lambda src, tgt: (tf.string_split([src]).values,
                      tf.string_split([tgt]).values))
print('string_split')
print_Dataset(src_tgt_dataset)

src_tgt_dataset = src_tgt_dataset.filter(
    lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))
print('Filter zero length input sequences')
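
# The same split-and-filter steps expressed against the TF 2.x API, where
# tf.strings.split replaces the deprecated tf.string_split. This is a sketch
# with inlined toy data, since src_dataset and the vocab tables above are
# defined elsewhere in the original pipeline.
import tensorflow as tf

src = tf.data.Dataset.from_tensor_slices(tf.constant(['f e', 'g', '', 'h h']))
tgt = tf.data.Dataset.from_tensor_slices(tf.constant(['a b', 'b c', '', 'c c']))
pairs = tf.data.Dataset.zip((src, tgt))
# For scalar string inputs, tf.strings.split returns a 1-D string tensor.
pairs = pairs.map(lambda s, t: (tf.strings.split(s), tf.strings.split(t)))
# Drop pairs where either side is empty, mirroring the filter above.
pairs = pairs.filter(lambda s, t: tf.logical_and(tf.size(s) > 0, tf.size(t) > 0))
for s, t in pairs:
    print(s.numpy(), t.numpy())  # the empty-line pair is filtered out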
import os
import pickle

import matplotlib.pyplot as plt
import tensorflow as tf

# examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
#                                as_supervised=True)
# train_examples, val_examples = examples['train'], examples['validation']

from tensorflow.python import keras
from tensorflow.python.data import Dataset
from tensorflow_datasets.core.features.text import TokenTextEncoder

# dataset_x = dataset_x.map(lambda token: token.numpy().decode("utf-8"))
dataset_x = tf.data.TextLineDataset("data/texts_noisy.txt")
dataset_y = tf.data.TextLineDataset("data/texts.txt")
dataset = Dataset.zip((dataset_x, dataset_y))
print("data loaded")

vocab = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
         'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
         'æ', 'ø', 'å', ' ']

tokenizer_pt = None
tokenizer_en = None
if os.path.isfile('tokenizer_pt.pickle'):
    with open('tokenizer_pt.pickle', 'rb') as handle:
        tokenizer_pt = pickle.load(handle)
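
# Quick sanity check (assumes TF 2.x eager execution): Dataset.zip pairs the
# two text files line by line, so the first few noisy/clean pairs from the
# zipped dataset above should line up.
for noisy, clean in dataset.take(3):
    print(noisy.numpy().decode('utf-8'), '->', clean.numpy().decode('utf-8'))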
def get_dataset(dir='train', batch_size=32):
    # Load encoder.
    encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab')

    # Load data.
    with open('dataset/' + dir + '/original.txt') as original:
        # Remove newline at the end.
        data_orig = original.readlines()[:-1]
    with open('dataset/' + dir + '/shuffled.txt') as shuffled:
        data_shuffled = shuffled.readlines()[:-1]
    data = data_orig + data_shuffled

    max_len = 0
    count = 0
    for i in range(len(data)):
        count += 1
        data[i] = data[i].strip()
        data[i] = encoder.encode(data[i])
        if len(data[i]) > max_len:
            max_len = len(data[i])
    print('max len is', max_len)

    # Add padding.
    # for i in range(len(data)):
    #     data[i] += [0] * (max_len - len(data[i]))

    # Add labels.
    labels = [1] * len(data_orig) + [0] * len(data_shuffled)

    # Shuffle.
    random.seed(42)
    random.shuffle(data)
    random.seed(42)
    random.shuffle(labels)

    # Convert to tensors.
    # data_tensor = tf.ragged.constant(data)
    # labels_tensor = tf.ragged.constant(labels)
    # # Convert to Dataset object.
    # features_dataset = Dataset.from_tensor_slices(data_tensor)
    # labels_dataset = Dataset.from_tensor_slices(labels_tensor)
    # dataset = Dataset.zip((features_dataset, labels_dataset))

    # # Convert to numpy array to create Dataset object.
    # dataset = Dataset.from_tensor_slices((data, labels))

    data_gen = lambda: (d for d in data)
    label_gen = lambda: ([l] for l in labels)
    dataset_data = tf.data.Dataset.from_generator(
        data_gen, output_types=tf.int32, output_shapes=tf.TensorShape([None]))
    dataset_labels = tf.data.Dataset.from_generator(
        label_gen, output_types=tf.int32, output_shapes=tf.TensorShape([1]))
    dataset = Dataset.zip((dataset_data, dataset_labels))

    # im_dataset = im_dataset.prefetch(4)
    # print("output data type is ", im_dataset.output_types)
    # print("output data shape is ", im_dataset.output_shapes)
    # iterator = im_dataset.make_initializable_iterator()
    # with tf.Session() as sess:
    #     sess.run(iterator.initializer)
    #     a = sess.run(iterator.get_next())
    #     print("shape of the run results are: ")
    #     print(a[0].shape)
    #     print(a[1].shape)
    #     print(a[2].shape)
    #     print(a[3].shape)

    # for elem, val in dataset:
    #     print(elem)
    #     print(val)
    #     break

    # Each batch is padded to max_len (the longest sequence in the whole
    # dataset), as given by padded_shapes.
    dataset_batched = dataset.padded_batch(batch_size, padding_values=0,
                                           padded_shapes=(max_len, 1))

    # Debug prints:
    print('{0} dataset: {1}'.format(dir, dataset_batched.cardinality()))
    # for element in dataset:
    #     print(element)
    # for text_batch, label_batch in dataset_batched.take(1):
    #     print(text_batch.shape)
    #     print(label_batch.shape)
    #     for i in range(5):
    #         print(text_batch[i])
    #         print(label_batch[i])

    return dataset