def make_tf_dataset(file_path='', batch_size=10):
    loaded_data = np.load(file_path)
    X_train = loaded_data['X_train']
    X_test = loaded_data['X_test']
    Y_train = loaded_data['Y_train']
    Y_test = loaded_data['Y_test']
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, flush=True)

    X_train = tf.cast(X_train, tf.float32)
    X_test = tf.cast(X_test, tf.float32)
    Y_train = tf.cast(Y_train, tf.int32)
    Y_test = tf.cast(Y_test, tf.int32)

    train_dat = Dataset.from_tensor_slices((X_train, Y_train))
    train_dat = train_dat.batch(batch_size)
    test_dat = Dataset.from_tensor_slices((X_test, Y_test))
    test_dat = test_dat.batch(batch_size)

    data_dict = {}
    iterator = Iterator.from_structure(train_dat.output_types, train_dat.output_shapes)
    data_dict['iterator'] = iterator
    data_dict['train_it_init'] = iterator.make_initializer(train_dat)
    data_dict['test_it_init'] = iterator.make_initializer(test_dat)
    # data_dict['train_it'] = train_iterator
    # data_dict['test_it'] = test_iterator
    # data_dict['train_it_init'] = train_iterator.initializer
    # data_dict['test_it_init'] = test_iterator.initializer
    return data_dict
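# A minimal usage sketch for make_tf_dataset (assumptions: TF 1.x graph mode, a module-level
# `import tensorflow as tf`, and an .npz file written with np.savez containing X_train, X_test,
# Y_train and Y_test; the file path below is hypothetical).
def demo_make_tf_dataset(npz_path='data.npz'):
    data = make_tf_dataset(npz_path, batch_size=10)
    features, labels = data['iterator'].get_next()
    with tf.Session() as sess:
        sess.run(data['train_it_init'])   # point the shared iterator at the training split
        train_x, train_y = sess.run([features, labels])
        sess.run(data['test_it_init'])    # re-initialize it for the test split
        test_x, test_y = sess.run([features, labels])
    return train_x, train_y, test_x, test_y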
def run(data_path, image_size=160, epochs=10, batch_size=32, learning_rate=0.0001,
        output='model', dataset=None):
    img_shape = (image_size, image_size, 3)

    info('Loading Data Set')
    # load dataset
    train, test, val, labels = load_dataset(data_path, dataset)

    # training data
    train_data, train_labels = zip(*train)
    train_ds = Dataset.zip((Dataset.from_tensor_slices(list(train_data)),
                            Dataset.from_tensor_slices(list(train_labels))))
    train_ds = train_ds.map(map_func=process_image, num_parallel_calls=5)
    train_ds = train_ds.apply(tf.data.experimental.ignore_errors())
    train_ds = train_ds.batch(batch_size)
    train_ds = train_ds.prefetch(buffer_size=5)
    train_ds = train_ds.repeat()

    # model
    info('Creating Model')
    base_model = tf.keras.applications.ResNet50(input_shape=img_shape,
                                                include_top=False,
                                                weights='imagenet')
    base_model.trainable = True

    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(lr=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.summary()

    # training
    info('Training')
    steps_per_epoch = math.ceil(len(train) / batch_size)
    history = model.fit(train_ds, epochs=epochs, steps_per_epoch=steps_per_epoch)

    # save model
    info('Saving Model')

    # check existence of base model folder
    output = check_dir(output)

    print('Serializing into saved_model format')
    tf.saved_model.save(model, str(output))

    # add time prefix folder
    # stamp = datetime.now().strftime('%y_%m_%d_%H_%M.h5')
    # stamped = str(Path(output).joinpath(stamp))
    file_output = str(Path(output).joinpath('latest.h5'))
    # print('Serializing model to:\n{}\n{}'.format(stamped, output))
    model.save(file_output)
def main():
    dataset_count = 10

    def create_dataset(i):
        return Dataset.range(4 * i, 4 * (i + 1))

    dataset = Dataset.range(dataset_count).map(create_dataset)
    for d in dataset:
        show_dataset(d)

    d = dataset.flat_map(lambda x: x)
    show_dataset(d)

    d = dataset.interleave(lambda x: x, cycle_length=2, block_length=3)
    show_dataset(d)

    # Repeat two datasets of different lengths and interleave them.
    a = Dataset.from_tensor_slices(np.arange(10)).repeat()
    b = Dataset.from_tensor_slices(100 + np.arange(17)).repeat()
    datasets = [a, b]
    n = len(datasets)
    c = Dataset.from_tensor_slices(datasets)
    d = c.interleave(lambda x: x, cycle_length=n).take(50)
    show_dataset(d)
def test_input_fn(x_test, y_test, batch_size):
    if y_test is None:
        ds = tds.from_tensor_slices({'input-features': x_test})
    else:
        ds = tds.from_tensor_slices(({'input-features': x_test},
                                     y_test.reshape(-1, 1)))
    return ds.batch(batch_size)
def getData(mypath, config):
    # get list of filepaths
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    data_dict = [mypath + "\\" + s for s in onlyfiles]

    # create numpy datasets for each stock
    data = []
    for fname in data_dict:
        data.append(
            DataLoader(fname,
                       window=config.experiment.window,
                       threshold=config.experiment.threshold))

    # initialize numpy arrays for training and test data
    X_train = data[0].X_train_std
    Y_train = data[0].Y_train
    X_val = data[0].X_val_std
    Y_val = data[0].Y_val
    X_test = data[0].X_test_std
    Y_test = data[0].Y_test

    # add other stocks to previously initialized numpy arrays
    for i in range(1, len(data)):
        X_train = np.concatenate((X_train, data[i].X_train_std), axis=0)
        Y_train = np.concatenate((Y_train, data[i].Y_train), axis=0)
        X_val = np.concatenate((X_val, data[i].X_val_std), axis=0)
        Y_val = np.concatenate((Y_val, data[i].Y_val), axis=0)
        X_test = np.concatenate((X_test, data[i].X_test_std), axis=0)
        Y_test = np.concatenate((Y_test, data[i].Y_test), axis=0)

    # Save number of features and samples
    num_train_samples = X_train.shape[0]
    num_val_samples = X_val.shape[0]
    num_test_samples = X_test.shape[0]
    num_train_features = X_train.shape[1]

    # Generate TF dataset for Keras model
    logging.info('------Final Training and Test Datasets------')
    logging.info('Size of X_Train: %s', X_train.shape)
    logging.info('Size of Y_Train: %s', Y_train.shape)
    logging.info('Size of X_val: %s', X_val.shape)
    logging.info('Size of Y_val: %s', Y_val.shape)
    logging.info('Size of X_Test: %s', X_test.shape)
    logging.info('Size of Y_Test: %s', Y_test.shape)

    train_dataset = Dataset.from_tensor_slices((X_train, Y_train))
    train_dataset = train_dataset.shuffle(config.model.shuffle).batch(
        config.model.batch_size).repeat()
    val_dataset = Dataset.from_tensor_slices((X_val, Y_val))
    val_dataset = val_dataset.shuffle(config.model.shuffle).batch(
        config.model.batch_size).repeat()
    test_dataset = Dataset.from_tensor_slices((X_test, Y_test))
    test_dataset = test_dataset.shuffle(config.model.shuffle).batch(
        config.model.batch_size).repeat()

    return train_dataset, val_dataset, test_dataset, num_train_features, \
        num_train_samples, num_val_samples, num_test_samples
def train_sa_bilstm(pad_to, lstm_hidden, da, r, lr, loss, savefigto):
    (train_x, train_y), (val_x, val_y), (test_x, test_y) = \
        load_ESOL('data/ESOL-solubility.csv',
                  'data/mol2vec_model_300dim.pkl',
                  pad_to=pad_to)
    _, _, vector_size = train_x.shape

    model = build_sa_bilstm_model(pad_to=pad_to,
                                  vector_size=vector_size,
                                  lstm_hidden=lstm_hidden,
                                  da=da,
                                  r=r)
    model.compile(optimizer=keras.optimizers.Adam(lr=lr),
                  loss=loss,
                  metrics=['mse'])
    print(model.summary())
    print(train_x.shape, train_y.shape)

    train_dataset = Dataset.from_tensor_slices(
        (train_x, train_y)).shuffle(buffer_size=128).batch(64, drop_remainder=True)
    val_dataset = Dataset.from_tensor_slices(
        (val_x, val_y)).batch(32, drop_remainder=True)
    test_dataset = Dataset.from_tensor_slices(
        (test_x, test_y)).batch(32, drop_remainder=True)

    # This eats huge HD space!
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir='./logs',
                                                       histogram_freq=1,
                                                       update_freq='batch')
    earlystop_callback = keras.callbacks.EarlyStopping(patience=10)
    checkpoint_callback = keras.callbacks.ModelCheckpoint(
        f'./checkpoints/model-sa-bilstm-{pad_to}-{lstm_hidden}-{da}-{r}-{lr}-{loss}.ckpt',
        save_best_only=True)

    model.fit(train_dataset,
              epochs=100,
              validation_data=val_dataset,
              callbacks=[
                  tensorboard_callback, earlystop_callback, checkpoint_callback
              ])

    # std, mean
    predict = np.array(model.predict(test_x)).ravel() * 2.0965 - 3.058
    truth = np.array(test_y).ravel() * 2.0965 - 3.058

    plt.figure(figsize=(5, 5))
    plt.scatter(predict, truth)
    plt.plot([-8, 0], [-8, 0], 'r--')
    plt.axis([-8, 0, -8, 0])
    plt.xlabel("Prediction")
    plt.ylabel("Groundtruth")
    MSE = ((predict - truth)**2).mean()
    plt.title(f"MSE = {MSE:.3f}")
    plt.savefig(
        Path(savefigto) /
        f'./solubility_sa_bilstm-{pad_to}-{lstm_hidden}-{da}-{r}-{lr}-{loss}-{MSE:.4f}.png'
    )
    plt.close()
def simple_dataset(config):
    batch_size = config["batch_size"]
    x_train, y_train = linear_dataset(size=NUM_TRAIN_SAMPLES)
    x_test, y_test = linear_dataset(size=NUM_TEST_SAMPLES)

    train_dataset = Dataset.from_tensor_slices((x_train, y_train))
    test_dataset = Dataset.from_tensor_slices((x_test, y_test))
    train_dataset = train_dataset.shuffle(NUM_TRAIN_SAMPLES).repeat().batch(
        batch_size)
    test_dataset = test_dataset.repeat().batch(batch_size)
    return train_dataset, test_dataset
def create_final_datasets(X_train, X_valid, y_train, y_valid):
    train = Dataset.from_tensor_slices(
        (X_train, (y_train.identity_hate.values, y_train.insult.values,
                   y_train.obscene.values, y_train.severe_toxic.values,
                   y_train.threat.values, y_train.toxic.values))).map(
                       custom_loss.preprocess_sample).batch(
                           config.BATCH_SIZE).repeat()
    valid = Dataset.from_tensor_slices(
        (X_valid, (y_valid.identity_hate.values, y_valid.insult.values,
                   y_valid.obscene.values, y_valid.severe_toxic.values,
                   y_valid.threat.values, y_valid.toxic.values))).map(
                       custom_loss.preprocess_sample).batch(
                           config.BATCH_SIZE).repeat()
    return train, valid
def generate_tf_data(enc_input: list, dec_input: list, batch_size: int,
                     train_size: float, val_size: float) -> [Dataset]:
    '''Generates a TensorFlow dataset and splits it into train and validation sets.

    Problem: Feeding in three arrays containing almost two million sequences each
    requires too much main memory.
    Solution: We use the TensorFlow Dataset API, where we can feed the model with
    slices of the whole dataset. Also shuffles the observations.

    Args:
        enc_input: encoder input ids, token ids for each word and each sentence.
        dec_input: used for teacher forcing. Token ids for each word and each
            sentence in the target language. More specifically:
            - decoder input, token sequences (index 0 in dec_input)
            - decoder target output, token sequences (for teacher forcing,
              index 1 in dec_input)
        batch_size: number of observations passed to the Seq2Seq model during training.
        train_size: fraction of all observations reserved for training the model.
        val_size: fraction of all observations reserved for evaluating the model
            performance during training.

    Returns:
        train_data: contains encoder_input, decoder_input, decoder_target_output
            for training the model.
        val_data: contains encoder_input, decoder_input, decoder_target_output
            for evaluating the model.
    '''
    assert train_size + val_size == 1, "Train and validation sizes don't sum up to 1!"

    data_size = enc_input[0].shape[0]

    # Summarize the source language token ids and the decoder input as: model_input
    model_input = Dataset.from_tensor_slices((enc_input[0], dec_input[0]))  # enc_token_ids, dec_token_ids

    # Convert decoder_target_output to a tf.data.Dataset
    decoder_target_output = Dataset.from_tensor_slices(dec_input[1])  # dec_token_ids used as target output (shifted by one position)

    # Combine the model_input and the decoder_target_output into a full dataset, shuffle it
    full_data = Dataset.zip(
        (model_input, decoder_target_output)).shuffle(data_size)

    # Train/validation split
    train_size = int(train_size * data_size)
    val_size = int(val_size * data_size)
    train_data = full_data.take(train_size)
    val_data = full_data.skip(train_size)

    train_data = train_data.batch(batch_size, drop_remainder=True)
    val_data = val_data.batch(batch_size, drop_remainder=True)

    return train_data, val_data
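# A minimal usage sketch for generate_tf_data (assumptions: module-level `import numpy as np`
# and TF 2.x eager mode; the toy shapes below, 100 sentences of length 8 drawn from a
# vocabulary of 50 token ids, are purely illustrative).
def demo_generate_tf_data():
    enc_ids = np.random.randint(0, 50, size=(100, 8), dtype=np.int32)      # encoder token ids
    dec_in_ids = np.random.randint(0, 50, size=(100, 8), dtype=np.int32)   # decoder input ids
    dec_out_ids = np.random.randint(0, 50, size=(100, 8), dtype=np.int32)  # shifted target ids
    train_data, val_data = generate_tf_data([enc_ids], [dec_in_ids, dec_out_ids],
                                            batch_size=16, train_size=0.8, val_size=0.2)
    for (enc_batch, dec_batch), target_batch in train_data.take(1):
        print(enc_batch.shape, dec_batch.shape, target_batch.shape)  # (16, 8) each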
def _input_fn(directory, config, mode):
    print("Fetching {} data...".format(mode))

    all_features = []
    all_labels = []
    if config["cloud"] == 0:
        all_files = os.listdir(directory)
        for file in all_files:
            features, labels = _load_json_file(os.path.join(directory, file), config)
            all_features += features
            all_labels += labels
    else:
        s = sagemaker.Session()
        all_files = s.list_s3_files(config["bucket"], directory)
        for file in all_files[1:]:
            features, labels = _load_json_file(
                s.read_s3_file(config["bucket"], file), config)
            all_features += features
            all_labels += labels

    num_data_points = len(all_features)
    num_batches = math.ceil(len(all_features) / config["batch_size"])

    dataset = Dataset.from_tensor_slices((all_features, all_labels))
    if mode == "train":
        dataset = Dataset.from_tensor_slices((all_features, all_labels))
        dataset = dataset.batch(config["batch_size"]).shuffle(
            10000, seed=12345).repeat(config["num_epoch"])
        num_batches = math.ceil(len(all_features) / config["batch_size"])
    if mode in ("validation", "eval"):
        dataset = dataset.batch(config["batch_size"]).repeat(
            config["num_epoch"])
        num_batches = math.ceil(len(all_features) / config["batch_size"])

    iterator = dataset.make_one_shot_iterator()
    dataset_features, dataset_labels = iterator.get_next()

    return [{
        config["input_tensor_name"]: dataset_features
    }, dataset_labels, {
        "num_data_point": num_data_points,
        "num_batches": num_batches
    }]
def prepare_batch_datasets(x_train, y_train, batch_size):
    logger.info('Preparing train and validation datasets for batches...')

    # Reserve the required samples for validation
    # (the last VALIDATION_DATA_SPLIT fraction of the training data)
    num_val_samples = int(len(x_train) * VALIDATION_DATA_SPLIT)
    x_val = x_train[-num_val_samples:]
    y_val = y_train[-num_val_samples:]

    # Prepare the training dataset with shuffling
    train_dataset = Dataset.from_tensor_slices((x_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

    # Prepare the validation dataset
    val_dataset = Dataset.from_tensor_slices((x_val, y_val))
    val_dataset = val_dataset.batch(batch_size)

    logger.info(
        'Completed preparing train and validation datasets for batches.')
    return x_val, y_val, train_dataset, val_dataset
def ds_rndm() -> Tuple[Dataset, Dataset, int, int, int]:
    # Hardcoded values taken from MNIST
    num_classes = 10
    m_train = 60000
    m_test = 10000

    # Random noise images and labels (maxval is exclusive, so use num_classes
    # to cover all 10 label values)
    ds_image = Dataset.from_tensor_slices(
        (tf.random_uniform([m_train, 28, 28, 1], maxval=255, dtype=tf.int32)))
    ds_label = Dataset.from_tensor_slices(
        (tf.random_uniform([m_train], maxval=num_classes, dtype=tf.int64)))
    ds_train = Dataset.zip((ds_image, ds_label))
    ds_test = ds_train.take(m_test)

    return ds_train, ds_test, num_classes, m_train, m_test
def __init__(self, txt_file, batch_size, num_classes, image_size, buffer_scale=100):
    self.txt_file = txt_file
    self.batch_size = batch_size
    self.num_classes = num_classes
    self.image_size = image_size
    buffer_size = buffer_scale * batch_size

    self.read_txt_file()
    self.dataset_size = len(self.labels)
    print("The dataset has {} data.".format(self.dataset_size))

    # Converting images and labels to tensor
    self.img_paths = convert_to_tensor(self.img_paths, dtype=dtypes.string)
    self.labels = convert_to_tensor(self.labels, dtype=dtypes.int32)

    # Creating dataset
    data = Dataset.from_tensor_slices((self.img_paths, self.labels))
    data = data.map(self.parse_function)
    data = data.repeat(1000)
    data = data.shuffle(buffer_size=buffer_size)

    # Setting data batch
    self.data = data.batch(batch_size)
def setup(self):
    """Setup the dataset."""
    self.num_instances = len(self.image_features_files)
    self.num_batches = int(
        np.ceil(self.num_instances * 1.0 / self.batch_size))
    # self.num_batches = self.num_instances // self.batch_size

    dataset = Dataset.from_tensor_slices(
        (self.image_features_files, self.captions))

    # using map to load the numpy files in parallel
    dataset = dataset.map(lambda item1, item2: tf.numpy_function(
        map_image_features_to_caption, [item1, item2], [tf.float32, tf.int32]),
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # shuffling and batching the train dataset
    if self.shuffle:
        dataset = dataset.shuffle(self.buffer_size).batch(
            self.batch_size, drop_remainder=self.drop_remainder)
    else:
        dataset = dataset.batch(self.batch_size,
                                drop_remainder=self.drop_remainder)

    self.dataset = dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)
def __init__(self, txt_file, mode, batch_size, num_classes, shuffle=True,
             buffer_size=1000):
    self.txt_file = txt_file
    self.num_classes = num_classes

    self._read_txt_file()
    self.data_size = len(self.labels)

    if shuffle:
        self._shuffle_lists()

    self.img_paths = convert_to_tensor(self.img_paths, dtype=dtypes.string)
    self.labels = convert_to_tensor(self.labels, dtype=dtypes.int32)

    # Create dataset
    data = Dataset.from_tensor_slices((self.img_paths, self.labels))
    if mode == 'train':
        data = data.map(self._parse_tensor, num_parallel_calls=4)
        data = data.prefetch(buffer_size=batch_size * 100)
    elif mode == 'inference':
        data = data.map(self._parse_tensor, num_parallel_calls=4)
        data = data.prefetch(buffer_size=batch_size * 100)
    else:
        raise ValueError("Invalid mode '%s'." % (mode))

    self.data = data.batch(batch_size)
def load_and_format_images_for_fitting(folder):
    all_images, all_image_labels = get_all_images(folder)
    ds = Dataset.from_tensor_slices((all_images, all_image_labels))
    ds = ds.shuffle(buffer_size=len(all_images))
    ds = ds.batch(batch_size)
    return ds
def validate(model, examples, labels, features=None):
    '''Check the mse on the validation set.'''
    if not features:
        features = examples.columns
    ds = Ds.from_tensor_slices(
        ({feature: examples[feature] for feature in features}, labels))
    predictions = get_predictions(model, ds)

    plt.figure()
    plt.subplot(1, 2, 1)
    plt.scatter(examples['longitude'], examples['latitude'],
                cmap='coolwarm', c=labels.iloc[:, 0])
    plt.subplot(1, 2, 2)
    plt.scatter(examples['longitude'], examples['latitude'],
                cmap='coolwarm', c=predictions)

    if "classifier" in str(type(model)).casefold():
        print("Validation log loss:", log_loss(labels, predictions))
    else:
        print("Validation mse:", mse(predictions, labels))
    return predictions
def __init__(self, txt_file, batch_size, num_classes, image_size, buffer_scale=100):
    self.image_size = image_size
    self.batch_size = batch_size
    self.txt_file = txt_file  # txt list file, stored as: imagename id
    self.num_classes = num_classes
    buffer_size = batch_size * buffer_scale

    # Read the images
    self.read_txt_file()
    self.dataset_size = len(self.labels)
    print("num of train datas=", self.dataset_size)

    # Convert to tensors
    self.img_paths = convert_to_tensor(self.img_paths, dtype=dtypes.string)
    self.labels = convert_to_tensor(self.labels, dtype=dtypes.int32)

    # Create the dataset
    data = Dataset.from_tensor_slices((self.img_paths, self.labels))
    print("data type=", type(data))
    data = data.map(self.parse_function)
    data = data.repeat(1000)
    data = data.shuffle(buffer_size=buffer_size)

    # Set the self.data batch
    self.data = data.batch(batch_size)
    print("self.data type=", type(self.data))
def __init__(self, txt_file, mode, batch_size, img_size=227, buffer_size=1000):
    self.txt_file = txt_file

    # retrieve the data from the text file
    self._read_txt_file()

    self.img_size = img_size
    # number of samples in the dataset
    self.data_size = len(self.RSAs)
    self.batch_size = batch_size

    # convert lists to TF tensor
    self.img1_paths = convert_to_tensor(self.img1_paths, dtype=dtypes.string)
    self.img2_paths = convert_to_tensor(self.img2_paths, dtype=dtypes.string)
    self.RSAs = convert_to_tensor(self.RSAs, dtype=dtypes.float32)

    # create dataset
    data = Dataset.from_tensor_slices(
        (self.img1_paths, self.img2_paths, self.RSAs))
    data = data.map(self._parse_function_train)
    data = data.batch(batch_size)
    self.data = data
def train(examples, labels, features=None, lr=1e-4, steps=100, batch_size=1,
          model=None):
    '''Create and train a linear regression model.'''
    # Create datasets.
    if not features:
        features = examples.columns
    fcs = [tf.feature_column.numeric_column(feature) for feature in features]
    ds = Ds.from_tensor_slices(
        ({feature: examples[feature] for feature in features}, labels))

    opt = tf.contrib.estimator.clip_gradients_by_norm(
        tf.train.GradientDescentOptimizer(learning_rate=lr), 5.0)
    if not model:
        model = tf.estimator.LinearRegressor(fcs, optimizer=opt)

    # Train in ten chunks and report the error after each one.
    for _ in range(10):
        model.train(train_fn(ds, batch_size=batch_size), steps=steps // 10)
        preds = model.predict(
            lambda: ds.batch(1).make_one_shot_iterator().get_next())
        # np.hstack expects a sequence, not a generator
        predictions = np.hstack([pred['predictions'] for pred in preds])
        print("Mean squared error: ", mse(predictions, labels))
    return model
def build_dataset(series, window_size, batch_size, shuffle_buffer_size):
    dataset = Dataset.from_tensor_slices(series) \
        .window(window_size + 1, shift=1, drop_remainder=True) \
        .flat_map(lambda window: window.batch(window_size + 1)) \
        .shuffle(shuffle_buffer_size) \
        .map(lambda window: (window[:-1], window[-1])) \
        .batch(batch_size) \
        .prefetch(1)
    return dataset
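# A minimal, self-contained sketch (assumptions: TF 2.x eager mode and a module-level
# `import numpy as np`) showing what build_dataset yields: batches of sliding windows of
# `window_size` consecutive values, each paired with the value that immediately follows it.
def demo_build_dataset():
    series = np.arange(10, dtype=np.float32)
    ds = build_dataset(series, window_size=4, batch_size=2, shuffle_buffer_size=10)
    for x, y in ds.take(1):
        print(x.numpy(), y.numpy())  # x: shape (2, 4) windows, y: shape (2,) next values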
def simple_test():
    image_path = ['/home/kamerider/Documents/DataBase/1610763/10000.png',
                  '/home/kamerider/Documents/DataBase/1610763/10000.png',
                  '/home/kamerider/Documents/DataBase/1610763/10000.png',
                  '/home/kamerider/Documents/DataBase/1610763/10000.png',
                  '/home/kamerider/Documents/DataBase/1610763/10000.png']
    # labels must have the same first dimension as image_path for from_tensor_slices
    label = np.array([1, 2, 1, 2, 1])
    data = np.random.uniform(size=(12, 3))

    image_path = convert_to_tensor(image_path, dtype=dtypes.string)
    label = convert_to_tensor(label, dtype=dtypes.int32)

    dataset = Dataset.from_tensor_slices((image_path, label))
    iterator = dataset.make_one_shot_iterator()
    one_element = iterator.get_next()

    with tf.Session() as sess:
        try:
            while True:
                result = sess.run(one_element)
                # print(result[0])
                image_string = tf.read_file(result[0])
                image_decode = tf.image.decode_png(image_string, channels=3)
                image_resize = tf.image.resize_images(image_decode, [64, 64])
                print(image_resize)
        except tf.errors.OutOfRangeError:
            print("end!")
    '''
    with tf.Session() as sess:
        for i in range(3):
            print(sess.run(one_element))
    '''
def load_validation_data(data_folder, batch_size):
    x_validation = np.load(data_folder + '/validation/images.npy')
    x_validation = np.add(x_validation, -127.5, dtype=np.float32) / 127.5
    y1_validation = np.load(data_folder + '/validation/class_labels.npy')
    y2_validation = np.load(data_folder + '/validation/bounding_box_labels.npy')
    y3_validation = np.load(data_folder + '/validation/landmark_labels.npy')

    return Dataset.from_tensor_slices(
        (x_validation,
         (y1_validation, y2_validation, y3_validation))).batch(
             batch_size, drop_remainder=True)
def load_data(self):
    data = GFile(self.file_path, 'rb').read().decode(encoding='UTF-8')

    # Get a list of the unique characters in the text
    vocab = list(sorted(set(data)))
    vocab_size = len(vocab)

    chars_to_ids = StringLookup(vocabulary=vocab)
    self.ids_to_chars_layer = StringLookup(
        vocabulary=chars_to_ids.get_vocabulary(), invert=True)

    # Split the entire text by character
    chars = unicode_split(data, 'UTF-8')
    ids_of_chars = chars_to_ids(chars)

    # Group characters to form sequences (+1 since the targets are shifted by one)
    sequences_ds = Dataset.from_tensor_slices(ids_of_chars)
    sequences_ds = sequences_ds.batch(C.SEQUENCE_LENGTH + 1)

    # Batch the sequences
    ds = sequences_ds.padded_batch(C.BATCH_SIZE)
    ds = ds.map(self._to_inputs_and_targets,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.shuffle(C.BUFFER_SIZE)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds
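# The _to_inputs_and_targets mapping used in load_data is not shown here. A common
# implementation for character language models (this is an assumed sketch, not the
# original method) shifts each sequence by one id: the inputs drop the last character
# and the targets drop the first. Note that it operates on already-batched sequences
# of length SEQUENCE_LENGTH + 1.
def _to_inputs_and_targets_sketch(sequences):
    # sequences: (batch, SEQUENCE_LENGTH + 1) tensor of character ids
    return sequences[:, :-1], sequences[:, 1:]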
def __init__(self, positive_dir, negative_dir, img_size=(12, 12), batch=50):
    # output image size
    self.img_size = img_size
    # output image batch
    self.batch = batch
    # where the positive data comes from
    self.positive_dir = positive_dir
    # where the negative data comes from
    self.negative_dir = negative_dir

    # get the file list
    file_list = os.listdir(positive_dir)
    # need to generate negative samples?
    if negative_dir is not None:
        file_list.extend(os.listdir(negative_dir))

    # data info
    data_label_info = [(file_name.split('_')[1], file_name.split('_')[2][:-4])
                       for file_name in file_list]
    self.data_info = []
    for idx, filename in enumerate(file_list):
        lab, pattern = data_label_info[idx]
        if lab == '0':
            self.data_info.append(
                (os.path.join(negative_dir, filename), int(lab), int(pattern)))
        elif lab == '1':
            self.data_info.append(
                (os.path.join(positive_dir, filename), int(lab), int(pattern)))

    # split the train and test
    self.data_train, self.data_test = train_test_split(self.data_info)

    # make Dataset
    self.dataset_train = Dataset.from_tensor_slices(
        self.conver(self.data_train)).map(self.load_img)
    self.dataset_train = self.dataset_train.shuffle(1000).batch(
        self.batch).repeat()
    self.dataset_test = Dataset.from_tensor_slices(
        self.conver(self.data_test)).map(self.load_img)
    self.dataset_test = self.dataset_test.shuffle(1000).batch(
        self.batch).repeat()
def _input_fn(bucket, directory, config, mode):
    print("Fetching {} data...".format(mode))

    # all_files = os.listdir(directory)
    all_features = []
    all_labels = []
    # for file in all_files:
    #     features, labels = _load_json_file(os.path.join(directory, file), config)
    #     all_features += features
    #     all_labels += labels

    # connect to my S3 bucket and load the file
    content_object = s3.Object(bucket, directory)
    all_features, all_labels = _load_json_file(content_object, config)

    num_data_points = len(all_features)
    num_batches = math.ceil(len(all_features) / config["batch_size"])

    dataset = Dataset.from_tensor_slices((all_features, all_labels))
    if mode == "train":
        dataset = Dataset.from_tensor_slices((all_features, all_labels))
        dataset = dataset.batch(config["batch_size"]).shuffle(
            10000, seed=12345).repeat(config["num_epoch"])
        num_batches = math.ceil(len(all_features) / config["batch_size"])
    if mode in ("validation", "eval"):
        dataset = dataset.batch(config["batch_size"]).repeat(
            config["num_epoch"])
        num_batches = math.ceil(len(all_features) / config["batch_size"])

    iterator = dataset.make_one_shot_iterator()
    dataset_features, dataset_labels = iterator.get_next()

    return [{
        config["input_tensor_name"]: dataset_features
    }, dataset_labels, {
        "num_data_point": num_data_points,
        "num_batches": num_batches
    }]
def generate_labelled_data(x_pl, y_pl, sess, X, y, batch_size, buffer_size=10000):
    dataset = Dataset.from_tensor_slices((x_pl, y_pl))
    dataset = dataset.shuffle(buffer_size=buffer_size)
    dataset = dataset.repeat()  # Repeat the input indefinitely.
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_initializable_iterator()
    sess.run(iterator.initializer, feed_dict={x_pl: X, y_pl: y})
    return iterator.get_next()
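# A minimal usage sketch for generate_labelled_data (assumptions: TF 1.x graph mode,
# module-level `import tensorflow as tf`, and numpy arrays X and y already in memory).
# Feeding the arrays through placeholders keeps the raw data out of the graph definition.
def demo_generate_labelled_data(X, y, batch_size=32):
    x_pl = tf.placeholder(X.dtype, shape=(None,) + X.shape[1:])
    y_pl = tf.placeholder(y.dtype, shape=(None,) + y.shape[1:])
    with tf.Session() as sess:
        next_batch = generate_labelled_data(x_pl, y_pl, sess, X, y, batch_size)
        batch_x, batch_y = sess.run(next_batch)  # each run() yields a fresh shuffled batch
    return batch_x, batch_y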
def _dataset(ids, batch_size=None):
    df = Dataset.from_tensor_slices(ids)
    df = df.shuffle(len(ids))
    df = df.repeat()
    df = df.batch(batch_size)
    df = df.map(lambda x: tf.py_function(PreprocessData, [x],
                                         [tf.int32, tf.int32, tf.int32]))
    return df
def df_to_dataset(dataframe, target_name, shuffle=True, batch_size=100):
    dataframe = dataframe.copy()
    targets = dataframe.pop(target_name)
    ds = Dataset.from_tensor_slices((dict(dataframe), targets))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds
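# A minimal usage sketch for df_to_dataset (assumptions: pandas is available and TF 2.x
# eager mode; the column names and values below are purely illustrative).
def demo_df_to_dataset():
    import pandas as pd
    df = pd.DataFrame({'age': [25, 32, 47, 51],
                       'income': [40.0, 55.5, 61.2, 78.9],
                       'target': [0, 1, 0, 1]})
    ds = df_to_dataset(df, target_name='target', shuffle=False, batch_size=2)
    for features, labels in ds.take(1):
        # features maps each column name to a batched tensor; labels is a batched tensor
        print({name: value.numpy() for name, value in features.items()}, labels.numpy())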
def get_img_ds(self):
    file_names = glob.glob('datasets/imgs/*')
    img_ds = Dataset.from_tensor_slices(file_names)
    img_ds = img_ds.map(self.parse_img, num_parallel_calls=tf.data.AUTOTUNE)
    img_ds = img_ds.batch(C.BATCH_SIZE)
    return img_ds