def run(data_path, image_size=160, epochs=10, batch_size=32,
        learning_rate=0.0001, output='model', dataset=None):
    img_shape = (image_size, image_size, 3)

    info('Loading Data Set')
    # load dataset
    train, test, val, labels = load_dataset(data_path, dataset)

    # training data
    train_data, train_labels = zip(*train)
    train_ds = Dataset.zip((Dataset.from_tensor_slices(list(train_data)),
                            Dataset.from_tensor_slices(list(train_labels))))
    train_ds = train_ds.map(map_func=process_image, num_parallel_calls=5)
    train_ds = train_ds.apply(tf.data.experimental.ignore_errors())
    train_ds = train_ds.batch(batch_size)
    train_ds = train_ds.prefetch(buffer_size=5)
    train_ds = train_ds.repeat()

    # model
    info('Creating Model')
    base_model = tf.keras.applications.ResNet50(input_shape=img_shape,
                                                include_top=False,
                                                weights='imagenet')
    base_model.trainable = True

    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    model.summary()

    # training
    info('Training')
    steps_per_epoch = math.ceil(len(train) / batch_size)
    history = model.fit(train_ds, epochs=epochs, steps_per_epoch=steps_per_epoch)

    # save model
    info('Saving Model')

    # check existence of base model folder
    output = check_dir(output)

    print('Serializing into saved_model format')
    tf.saved_model.save(model, str(output))

    # add time prefix folder
    # stamp = datetime.now().strftime('%y_%m_%d_%H_%M.h5')
    # stamped = str(Path(output).joinpath(stamp))
    file_output = str(Path(output).joinpath('latest.h5'))
    # print('Serializing model to:\n{}\n{}'.format(stamped, output))
    model.save(file_output)
def apply_all(*sources):
    # `func` is taken from the enclosing scope: this helper is meant to be
    # defined inside a function that receives `func` as an argument.
    if len(sources) == 1:
        return func(sources[0])
    res = tuple(func(*source) if isinstance(source, tuple) else func(source)
                for source in sources)
    if all(isinstance(r, Dataset) for r in res):
        res = Dataset.zip(res)
    return res
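# A minimal usage sketch (assumption, not from the original source): since
# apply_all reads `func` from its enclosing scope, a factory like the
# hypothetical make_apply_all below captures one transformation and applies it
# across several datasets, zipping the results back into a single Dataset.
def make_apply_all(func):
    def apply_all(*sources):
        if len(sources) == 1:
            return func(sources[0])
        res = tuple(func(*source) if isinstance(source, tuple) else func(source)
                    for source in sources)
        if all(isinstance(r, Dataset) for r in res):
            res = Dataset.zip(res)
        return res
    return apply_all

# Example: square the elements of two datasets and get back one zipped dataset.
square_all = make_apply_all(lambda ds: ds.map(lambda x: x * x))
zipped = square_all(Dataset.from_tensor_slices([1, 2, 3]),
                    Dataset.from_tensor_slices([4, 5, 6]))
# yields (1, 16), (4, 25), (9, 36)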
def parse_ds(file_name):
    x_train = tfio.v0.IODataset.from_hdf5(file_name, dataset='/x_train', spec=tf.float32)
    y_train = tfio.v0.IODataset.from_hdf5(file_name, dataset='/y_train', spec=tf.float32)
    return Dataset.zip((x_train, y_train))
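# Hedged usage sketch: given an HDF5 file (hypothetical name 'train.h5') that
# contains float '/x_train' and '/y_train' datasets, parse_ds pairs each
# feature row with its label; from_hdf5 streams from disk rather than loading
# the arrays eagerly.
train_ds = parse_ds('train.h5').batch(32).prefetch(1)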
def load_data(self):
    img_ds = self.get_img_ds()
    svg_ds = self.get_svg_ds()

    # Pair the SVG sequences with their images
    ds = Dataset.zip((svg_ds, img_ds))
    ds = ds.shuffle(C.BUFFER_SIZE)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds
def get_dataset():
    def parse_img(file_name):
        img = tf.io.read_file(file_name)
        img = tf.io.decode_jpeg(img, channels=3)
        img = tf.image.resize(img, [224, 224], antialias=True, method='nearest')
        img = tf.cast(img, tf.float32)
        img = vgg19_preprocess_input(img) / 255.0
        return img

    def build_ds(file_names):
        tmp_ds = Dataset.from_tensor_slices(file_names)
        tmp_ds = tmp_ds.shuffle(len(file_names))
        tmp_ds = tmp_ds.map(parse_img, num_parallel_calls=tf.data.AUTOTUNE)
        return tmp_ds

    content_ds = build_ds(
        glob.glob('/home/jephthia/datasets/mscoco/unlabeled2017/train/*')[:1])
    style_ds = build_ds(
        glob.glob('/home/jephthia/datasets/wikiart/train/*')[:1])
    val_content_ds = build_ds(
        glob.glob('/home/jephthia/datasets/mscoco/unlabeled2017/validate/*')[:2500])
    val_style_ds = build_ds(
        glob.glob('/home/jephthia/datasets/wikiart/validate/*')[:2500])

    # Train dataset
    ds = Dataset.zip((content_ds, style_ds))
    ds = ds.batch(BATCH_SIZE)
    # ds = ds.prefetch(2)

    # Validation dataset
    val_ds = Dataset.zip((val_content_ds, val_style_ds))
    val_ds = val_ds.batch(BATCH_SIZE)
    # val_ds = val_ds.cache()
    # val_ds = val_ds.prefetch(2)

    return ds, val_ds
def get_data(dataset):
    if dataset == 'cifar100':
        from tensorflow.keras.datasets import cifar100
        (x_tr, y_tr), (x_te, y_te) = cifar100.load_data()
    elif dataset == 'cifar10':
        from tensorflow.keras.datasets import cifar10
        (x_tr, y_tr), (x_te, y_te) = cifar10.load_data()

    preprocesses = ([todtype, normalize], [ohe])
    x_te, y_te = preprocess(x_te, y_te, preprocesses)
    x_tr, y_tr = preprocess(x_tr, y_tr, preprocesses)

    tr_ds_x = Dataset.from_tensor_slices(x_tr)
    tr_ds_y = Dataset.from_tensor_slices(y_tr)
    te_ds_x = Dataset.from_tensor_slices(x_te)
    te_ds_y = Dataset.from_tensor_slices(y_te)

    tr_ds = Dataset.zip((tr_ds_x, tr_ds_y)).shuffle(1000).batch(128)
    te_ds = Dataset.zip((te_ds_x, te_ds_y)).batch(128)
    return tr_ds, te_ds
def generate_tf_data(enc_input: list, dec_input: list, batch_size: int,
                     train_size: float, val_size: float) -> Tuple[Dataset, Dataset]:
    '''Generates a TensorFlow dataset and splits it into train and validation sets.

    Problem: Feeding in three arrays containing almost two million sequences
    each requires too much main memory.
    Solution: We use the TensorFlow Dataset, which lets us feed the model
    slices of the whole dataset. Also shuffles the observations.

    Args:
        enc_input: encoder input ids, token ids for each word and each sentence
        dec_input: used for teacher forcing. Token ids for each word and each
            sentence in the target language. More specifically:
            - decoder input, token sequences (index 0 in dec_input)
            - decoder target output, token sequences (for teacher forcing,
              index 1 in dec_input)
        batch_size: number of observations passed to the Seq2Seq model during training.
        train_size: fraction of all observations reserved for training the model.
        val_size: fraction of all observations reserved for evaluating model
            performance during training.

    Returns:
        train_data: contains encoder_input, decoder_input, decoder_target_output
            for training the model.
        val_data: contains encoder_input, decoder_input, decoder_target_output
            for evaluating the model.
    '''
    assert train_size + val_size == 1, "Train and validation sizes don't sum up to 1!"
    data_size = enc_input[0].shape[0]

    # Summarize the source language token ids and the decoder input as: model_input
    model_input = Dataset.from_tensor_slices((enc_input[0], dec_input[0]))

    # Convert decoder_target_output (decoder token ids shifted by one step) to a tf.data.Dataset
    decoder_target_output = Dataset.from_tensor_slices(dec_input[1])

    # Combine model_input and decoder_target_output into a full dataset, then shuffle it
    full_data = Dataset.zip((model_input, decoder_target_output)).shuffle(data_size)

    # Train/validation split
    train_size = int(train_size * data_size)
    val_size = int(val_size * data_size)
    train_data = full_data.take(train_size)
    val_data = full_data.skip(train_size)
    train_data = train_data.batch(batch_size, drop_remainder=True)
    val_data = val_data.batch(batch_size, drop_remainder=True)

    return train_data, val_data
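# A minimal sketch (made-up shapes and sizes) of calling generate_tf_data:
# enc_input and dec_input are lists of token-id arrays, with the decoder input
# at dec_input[0] and the one-step-shifted target at dec_input[1].
import numpy as np

enc_ids = np.random.randint(0, 1000, size=(1000, 20))  # 1000 sentences, 20 tokens each
dec_in = np.random.randint(0, 1000, size=(1000, 20))
dec_out = np.roll(dec_in, -1, axis=1)                  # targets shifted by one token

train_data, val_data = generate_tf_data(enc_input=[enc_ids],
                                        dec_input=[dec_in, dec_out],
                                        batch_size=32,
                                        train_size=0.9, val_size=0.1)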
def ds_rndm() -> Tuple[Dataset, Dataset, int, int, int]:
    # Hardcoded values taken from MNIST
    num_classes = 10
    m_train = 60000
    m_test = 10000

    # Random noise
    ds_image = Dataset.from_tensor_slices(
        tf.random_uniform([m_train, 28, 28, 1], maxval=255, dtype=tf.int32))
    ds_label = Dataset.from_tensor_slices(
        tf.random_uniform([m_train], maxval=num_classes, dtype=tf.int64))  # maxval is exclusive
    ds_train = Dataset.zip((ds_image, ds_label))
    ds_test = ds_train.take(m_test)
    return ds_train, ds_test, num_classes, m_train, m_test
def run(self, n_iterations=1):
    for itr in range(n_iterations):
        samples = self.sampler.partial_rollout(max_steps=self.T)

        # Using zip instead of from_tensor_slices because the latter needs
        # all tensors to have the same type, and actions may be integers
        self.train.dataset = Dataset.zip((
            Dataset.from_tensor_slices(samples['observations']),
            Dataset.from_tensor_slices(samples['actions']),
            Dataset.from_tensor_slices(samples['advantages']),
            Dataset.from_tensor_slices(samples['value_targets'])
        )).batch(self.batch_size)

        print('Training...')
        self.train.run(self.n_epochs)
        self.run_callbacks('post-iteration')

        self.old_policy.net.set_weights(self.policy.net.get_weights())
        self.sampler.send('policy_params', self.policy.net.get_weights())
        self.sampler.send('value_fn_params', self.value_fn.get_weights())
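# Illustration of the dtype point in the comment above (toy shapes, not from
# the original code): zipping per-component datasets keeps each component's
# dtype, so float observations and integer actions coexist in one pipeline.
import numpy as np

obs = np.zeros((8, 4), dtype=np.float32)
acts = np.zeros((8,), dtype=np.int64)
ds = Dataset.zip((Dataset.from_tensor_slices(obs),
                  Dataset.from_tensor_slices(acts))).batch(4)
for o, a in ds:
    print(o.dtype, a.dtype)  # float32 int64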
def test_pipeline(self, num_threads):
    real_fname = os.path.join(self.dataset_path, 'test_real.txt')

    # extract directories
    real_dir, inst_dir = self.real_dir, self.inst_dir

    # count lines
    num_real = count_lines(real_fname)

    # dataset creation
    with tf.name_scope('dataset'):
        real = TextLineDataset(real_fname)
        # @see https://www.tensorflow.org/api_docs/python/tf/contrib/data/shuffle_and_repeat
        # synt.apply(shuffle_and_repeat(buffer_size=num_synt))  # , count=1))
        # real.apply(shuffle_and_repeat(buffer_size=num_real))  # , count=ceil(ratio)))
        real = real.shuffle(num_real)  # no repetition! .repeat()

        # real data only
        augment = 0  # self.params.get('augment', 0)

        def name2real(name):
            inst = read_instr(os.path.join(inst_dir, name.decode() + '.png'))
            if augment:
                src_dir = self.params.get('augment_src', 'best')
                full = read_image(os.path.join(real_dir, str(src_dir), 'rgb',
                                               name.decode() + '.jpg'), False)
                pnts = read_points(os.path.join(real_dir, str(src_dir), 'points',
                                                name.decode() + '.txt'))
                if isinstance(src_dir, float):
                    pnts *= src_dir
                self.params['augment_scale'] = 0.
                real = random_crop(full, pnts, self.params)
            else:
                real = read_image(os.path.join(real_dir, '160x160', 'gray',
                                               name.decode() + '.jpg'))
            return real, inst, name.decode()

        real = real.map(lambda name: tuple(tf.py_func(name2real, [name],
                                                      [tf.float32, tf.int32, tf.string])),
                        num_parallel_calls=num_threads)

        # dataset = Dataset.zip((rend, xfer, real, inst_synt, inst_real))
        dataset = Dataset.zip({'real': real})
        dataset = dataset.batch(self.batch_size, drop_remainder=True)  # we need full batches!
        dataset = dataset.prefetch(self.batch_size * 2)
        return dataset
def input(dataset, mode, params, genre=None):
    uid_max = FLAGS.uid_max
    parse_py_fn_ = lambda line: parse_py_fn(line, mode)

    if mode.startswith('train'):
        dataset = dataset.map(
            lambda line: tf.py_func(parse_py_fn_, [line], tf.float32))
    else:
        dataset = dataset.map(
            lambda line: tf.py_func(parse_py_fn_, [line], [tf.float32, tf.float32]))

    if mode.endswith("genre"):
        dataset = dt.zip((dataset, genre))

    dataset = dataset.cache()
    if mode.startswith('train'):
        dataset = dataset.shuffle(buffer_size=100 * params['batch_size'])
    dataset = dataset.batch(params['batch_size'])
    if mode.startswith('train'):
        dataset = dataset.repeat(params['repeat_times'])
    else:
        dataset = dataset.repeat()

    iterator = dataset.make_one_shot_iterator()
    if mode.startswith('train'):
        if mode.endswith("genre"):
            mat, genre_mat = iterator.get_next()
            return mat, genre_mat
        else:
            mat = iterator.get_next()
            return mat
    else:
        if mode.endswith("genre"):
            mat, genre_mat = iterator.get_next()
            mat1, mat2 = mat
            return mat1, mat2, genre_mat
        else:
            mat1, mat2 = iterator.get_next()
            return mat1, mat2
def dataset(self):
    return Dataset.zip((self.images, self.labels))
def pipeline(self, name, num_threads):
    if not self.params.get('training', 1):
        return None

    synt_fname = os.path.join(self.dataset_path, name + '_synt.txt')
    real_fname = os.path.join(self.dataset_path, name + '_real.txt')
    unsup_fname = os.path.join(self.dataset_path, 'train_unsup.txt')
    num_synt, num_real, num_unsup = [count_lines(fname) for fname in
                                     [synt_fname, real_fname, unsup_fname]]
    ratio = num_synt / float(num_real)

    # extract directories
    fake_dirs, real_dir, inst_dir = self.fake_dirs, self.real_dir, self.inst_dir

    # dataset creation
    with tf.name_scope('dataset'):
        synt, real, unsup = [TextLineDataset(name) for name in
                             [synt_fname, real_fname, unsup_fname]]
        # @see https://www.tensorflow.org/api_docs/python/tf/contrib/data/shuffle_and_repeat
        # synt.apply(shuffle_and_repeat(buffer_size=num_synt))  # , count=1))
        # real.apply(shuffle_and_repeat(buffer_size=num_real))  # , count=ceil(ratio)))
        synt = synt.shuffle(num_synt).repeat()
        real = real.shuffle(num_real).repeat()
        unsup = unsup.shuffle(num_unsup).repeat()

        # map to corresponding files

        # synthetic data
        def name2synt(name):
            fakes = [read_image(os.path.join(path, name.decode() + '.jpg'))
                     for path in fake_dirs.values()]
            inst = read_instr(os.path.join(inst_dir, name.decode() + '.png'))
            return fakes + [inst]

        synt_types = [tf.float32 for _ in self.fakes] + [tf.int32]
        synt = synt.map(lambda name: tf.py_func(name2synt, [name], synt_types),
                        num_parallel_calls=num_threads)

        # real data
        augment = self.params.get('augment', 1)

        def name2real(name):
            inst = read_instr(os.path.join(inst_dir, name.decode() + '.png'))
            if augment:
                src_dir = self.params.get('augment_src', 'best')
                full = read_image(os.path.join(real_dir, str(src_dir), 'rgb',
                                               name.decode() + '.jpg'), False)
                pnts = read_points(os.path.join(real_dir, str(src_dir), 'points',
                                                name.decode() + '.txt'))
                if isinstance(src_dir, float):
                    pnts *= src_dir
                real = random_crop(full, pnts, self.params)
                # TODO add mirror augmentation
            else:
                real = read_image(os.path.join(real_dir, name.decode() + '.jpg'))
            return real, inst

        real = real.map(lambda name: tuple(tf.py_func(name2real, [name],
                                                      [tf.float32, tf.int32])),
                        num_parallel_calls=num_threads)

        # unsup data
        def name2unsup(name):
            if augment:
                img = read_image(os.path.join(self.unsup_dir, name.decode() + '.jpg'), False)
                imsz = img.shape  # y,x,c
                # corners: [TL, TR, BR, BL]
                real = random_crop(img,
                                   np.array([[5, 5], [imsz[1] - 5, 5],
                                             [imsz[1] - 5, imsz[0] - 5],
                                             [5, imsz[0] - 5]], dtype=np.float32),
                                   self.params)
            else:
                real = read_image(os.path.join(self.unsup_dir, name.decode() + '.jpg'))
            return real

        # unsup = unsup.map(lambda name: tuple(tf.py_func(name2unsup, [name], [tf.float32])),
        #                   num_parallel_calls=num_threads)

        # zip all, batch and prefetch
        # dataset = Dataset.zip((rend, xfer, real, inst_synt, inst_real))
        dataset = Dataset.zip({'synt': synt, 'real': real})  # , 'unsup': unsup
        dataset = dataset.batch(self.batch_size, drop_remainder=True)  # we need full batches!
        dataset = dataset.prefetch(self.batch_size * 2)
        return dataset
    ict = tf.reshape(ict, [79, 159, 5])
    return ict


source_data = tr_data.map(load_tensor)

# Same for the 6-hours-ahead data
input_file_dir = (("%s/Machine-Learning-experiments/datasets/uk_centred+6h/" +
                   "20CR2c/air.2m/training/") % os.getenv('SCRATCH'))
t2m_files = glob("%s/*.tfd" % input_file_dir)
n_steps = len(t2m_files)
tr_tfd = tf.constant(t2m_files)
tr_data = Dataset.from_tensor_slices(tr_tfd).repeat(n_epochs)
target_data = tr_data.map(load_tensor)

tr_data = Dataset.zip((source_data, target_data))
tr_data = tr_data.shuffle(buffer_size).batch(batch_size)

# Same for the test dataset
input_file_dir = (("%s/Machine-Learning-experiments/datasets/uk_centred/" +
                   "20CR2c/air.2m/test/") % os.getenv('SCRATCH'))
t2m_files = glob("%s/*.tfd" % input_file_dir)
test_steps = len(t2m_files)
test_tfd = tf.constant(t2m_files)
test_data = Dataset.from_tensor_slices(test_tfd).repeat(n_epochs)
test_source = test_data.map(load_tensor)

input_file_dir = (("%s/Machine-Learning-experiments/datasets/uk_centred+6h/" +
                   "20CR2c/air.2m/test/") % os.getenv('SCRATCH'))
t2m_files = glob("%s/*.tfd" % input_file_dir)
test_steps = len(t2m_files)
test_tfd = tf.constant(t2m_files)
# source with data from (window_size-1):(len-forecast_steps)
#   Each source dataset of length window_size
# target with data from (forecast_steps+window_size-1):len
#   Each target dataset of length 1
source_tfd = tf.constant(
    training_files[(window_size - 1):(len(training_files) - forecast_steps)])
source_data = Dataset.from_tensor_slices(source_tfd)
source_data = source_data.repeat(n_epochs)
source_data = source_data.map(load_tensor_window)

target_tfd = tf.constant(
    training_files[(forecast_steps + window_size - 1):len(training_files)])
target_data = Dataset.from_tensor_slices(target_tfd)
target_data = target_data.repeat(n_epochs)
target_data = target_data.map(load_tensor)

# Zip these together into (source, target) tuples for model fitting.
training_data = Dataset.zip((source_data, target_data))
training_data = training_data.batch(n_batch)

# Repeat the whole process with the test data
test_file_dir = (("%s/Machine-Learning-experiments/datasets/" +
                  "DWR/20CR2c/prmsl/test/") % os.getenv('SCRATCH'))
test_files = glob("%s/*.tfd" % test_file_dir)
test_steps = len(test_files) // n_batch
source2_tfd = tf.constant(
    test_files[(window_size - 1):(len(test_files) - forecast_steps)])
source2_data = Dataset.from_tensor_slices(source2_tfd)
source2_data = source2_data.repeat(n_epochs)
source2_data = source2_data.map(load_tensor_window_test)

target2_tfd = tf.constant(
    test_files[(forecast_steps + window_size - 1):len(test_files)])
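# Tiny numeric check of the offsets above (assumed window_size=3,
# forecast_steps=2, ten files): sources cover indices 2..7 and targets 4..9,
# so each length-3 source window ending at a listed index lines up with the
# target file forecast_steps later.
files = list(range(10))
window_size, forecast_steps = 3, 2
src = files[(window_size - 1):(len(files) - forecast_steps)]   # [2, 3, 4, 5, 6, 7]
tgt = files[(forecast_steps + window_size - 1):len(files)]     # [4, 5, 6, 7, 8, 9]
assert len(src) == len(tgt)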
    ax_test[i, 1].imshow(zebra)
plt.show()

# Putting it all together
generator_G = get_resnet_generator(name='generator_G')
generator_F = get_resnet_generator(name='generator_F')
discriminator_X = get_discriminator(name='discriminator_X')
discriminator_Y = get_discriminator(name='discriminator_Y')

cycle_model = CycleGAN(generator_G=generator_G,
                       generator_F=generator_F,
                       discriminator_X=discriminator_X,
                       discriminator_Y=discriminator_Y)

cycle_model.compile(generator_G_opt=Adam(learning_rate=2e-4, beta_1=0.5),
                    generator_F_opt=Adam(learning_rate=2e-4, beta_1=0.5),
                    discriminator_X_opt=Adam(learning_rate=2e-4, beta_1=0.5),
                    discriminator_Y_opt=Adam(learning_rate=2e-4, beta_1=0.5),
                    generator_loss_fn=generator_loss_fn,
                    discriminator_loss_fn=discriminator_loss_fn)

plotter = GANMonitor(data=test_horses)
checkpoint_filepath = "./model_checkpoints/cyclegan_checkpoints.{epoch:03d}"
model_checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath)

cycle_model.fit(
    Dataset.zip((train_horses, train_zebras)),
    epochs=90,
    callbacks=[plotter, model_checkpoint_callback],
)
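# Note on the zipped input above (illustrative, with toy tensors): Dataset.zip
# stops at the end of the shorter dataset, so each CycleGAN training step sees
# one (horse, zebra) pair and surplus images in the longer set are dropped.
a = Dataset.from_tensor_slices(tf.zeros([4, 2]))
b = Dataset.from_tensor_slices(tf.ones([6, 2]))
print(Dataset.zip((a, b)).cardinality().numpy())  # 4, the shorter length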
    file_name = tf.strings.regex_replace(file_name, 'prmsl', 'air.2m')
    sict = tf.read_file(file_name)
    t2m = tf.parse_tensor(sict, numpy.float32)
    t2m = tf.reshape(t2m, [79, 159, 1])
    file_name = tf.strings.regex_replace(file_name, 'air.2m', 'z500')
    sict = tf.read_file(file_name)
    prate = tf.parse_tensor(sict, numpy.float32)
    prate = tf.reshape(prate, [79, 159, 1])
    ict = tf.concat([prmsl, t2m, prate], 2)  # Now [79,159,3]
    ict = tf.reshape(ict, [79, 159, 3])
    return ict


tr_data = tr_data.map(load_tensor)
tr_data = tr_data.shuffle(buffer_size).batch(batch_size)
tr_data = Dataset.zip((tr_data, tr_data))

# Same for the test dataset
input_file_dir = (("%s/Machine-Learning-experiments/datasets/rotated_pole/" +
                   "20CR2c/prmsl/test/") % os.getenv('SCRATCH'))
prmsl_files = glob("%s/*.tfd" % input_file_dir)
test_steps = len(prmsl_files)
test_tfd = tf.constant(prmsl_files)
test_data = Dataset.from_tensor_slices(test_tfd).repeat(n_epochs)
test_data = test_data.map(load_tensor)
test_data = test_data.batch(batch_size)
test_data = Dataset.zip((test_data, test_data))

# reparameterization trick
# instead of sampling from Q(z|X), sample eps = N(0,I)
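# Note on the (data, data) zips above (toy check, not from the original code):
# for an autoencoder the input is also the target, so zipping the batched
# dataset with itself yields the (x, x) pairs that Keras' fit() expects.
xs = Dataset.from_tensor_slices(tf.zeros([4, 2])).batch(2)
for x, y in Dataset.zip((xs, xs)):
    assert x.shape == y.shape  # input and target are identical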
TRAIN_SLICES = len(train_labels)
VAL_SLICES = len(val_labels)
TEST_SLICES = len(test_labels)
print('\nFinished splitting each image within each list into {} 1D slices.'
      .format(SLICES_PER_IMAGE))

# Note the slices are now normalised
train_slices_ds = Dataset.from_tensor_slices(train_slices)
val_slices_ds = Dataset.from_tensor_slices(val_slices)
test_slices_ds = Dataset.from_tensor_slices(test_slices)
print('\nCompleted slice datasets.')

train_labels_ds = Dataset.from_tensor_slices(train_labels)
val_labels_ds = Dataset.from_tensor_slices(val_labels)
test_labels_ds = Dataset.from_tensor_slices(test_labels)
print('Completed label datasets.')

train_ds = Dataset.zip((train_slices_ds, train_labels_ds))
val_ds = Dataset.zip((val_slices_ds, val_labels_ds))
test_ds = Dataset.zip((test_slices_ds, test_labels_ds))
print('\nCompleted datasets of labelled slices.')

AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 256
train_ds = train_ds.shuffle(TRAIN_SLICES).batch(BATCH_SIZE).prefetch(AUTOTUNE)
val_ds = val_ds.shuffle(VAL_SLICES).batch(BATCH_SIZE).prefetch(AUTOTUNE)
test_ds = test_ds.batch(BATCH_SIZE)
print('\nFinished batching and shuffling datasets.')


def build_CNN(train_ds, val_ds, test_ds):
    """Function to build a convolutional neural network with 2 convolutions
    and 1 dense layer.
    mnth = tf.strings.substr(fdte, 5, 2)
    dy = tf.strings.substr(fdte, 8, 2)
    dy = tf.cond(tf.math.equal(mnth + dy, '0229'),
                 lambda: tf.constant('28'), lambda: dy)
    file_name = (tf.strings.substr(file_name, 0, tf.strings.length(file_name) - 17) +
                 '1969-' + mnth + '-' + dy +
                 tf.strings.substr(fdte, tf.strings.length(fdte) - 7, 7))
    sict = tf.read_file(file_name)
    insol = tf.parse_tensor(sict, numpy.float32)
    insol = tf.reshape(insol, [79, 159, 1])
    ict = tf.concat([t2m, prmsl, uwnd, vwnd, insol], 2)  # Now [79,159,5]
    ict = tf.reshape(ict, [79, 159, 5])
    return ict


tr_source = tr_data.map(load_tensor_w_insol)
tr_data = Dataset.zip((tr_source, tr_target))
tr_data = tr_data.shuffle(buffer_size).batch(batch_size)

# Same for the test dataset
input_file_dir = (("%s/Machine-Learning-experiments/datasets/uk_centred/" +
                   "20CR2c/air.2m/test/") % os.getenv('SCRATCH'))
t2m_files = glob("%s/*.tfd" % input_file_dir)
test_steps = len(t2m_files)
test_tfd = tf.constant(t2m_files)
test_data = Dataset.from_tensor_slices(test_tfd).repeat(n_epochs)
test_target = test_data.map(load_tensor)
test_source = test_data.map(load_tensor_w_insol)
test_data = Dataset.zip((test_source, test_target))
test_data = test_data.batch(batch_size)
It is the most recent data and will be most analogous to making future
predictions, given we're not collecting more data. Since we have millions of
data points and most of them are not recent (thus possibly not reflecting
current trends), taking this relatively small slice should be fine.
"""
combine = 5  # should match param from preprocess
window = int(1440 / combine)
print(data.shape[0])
data = Dataset.from_tensor_slices(data).window(window, 1, combine, True)
data = data.flat_map(lambda x: x.batch(window, drop_remainder=True))
print(data)
labels = Dataset.from_tensor_slices(datalabels)
prices = Dataset.from_tensor_slices(dataprices)
ratios = Dataset.from_tensor_slices(dataratios)
ins = Dataset.zip((data, prices))
outs = Dataset.zip((ratios, prices))
data = Dataset.zip((ins, outs))
"""
Choose a window stride number for training that does not share a common
factor with the minutes in a day (1440), so we can pinstripe through every
day while going through the whole timescale, but also still pick many windows
per day. I picked 29. Repeat is done before our pinstriping window, so it
will roll over the end and pinstripe the year as well, as long as 29 is not a
factor of our data size either (which it is not for the bitstamp set, nor for
the one slightly truncated by the first 1440-size window). We'll split off
the last month for testing and the two months before that for validation.
"""
val_test_count = 8760 * 3
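# Toy illustration (hypothetical sizes) of the window/flat_map pattern above:
# window(size, shift, stride, drop_remainder) yields nested sub-datasets, and
# flat_map with batch turns each one back into a dense tensor.
toy = Dataset.range(10).window(3, shift=1, stride=2, drop_remainder=True)
toy = toy.flat_map(lambda w: w.batch(3, drop_remainder=True))
for t in toy:
    print(t.numpy())  # [0 2 4], [1 3 5], [2 4 6], [3 5 7], [4 6 8], [5 7 9]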
def run(dpath, img_size=160, epochs=10, batch_size=32, learning_rate=0.0001,
        output='model', dset=None):
    global g_image_size
    g_image_size = img_size
    img_shape = (img_size, img_size, 3)

    info('Loading Data Set')
    # load dataset
    train = load_dataset(dpath, dset)

    # training data
    train_data, train_labels = zip(*train)
    train_ds = Dataset.zip((Dataset.from_tensor_slices(list(train_data)),
                            Dataset.from_tensor_slices(list(train_labels)),
                            Dataset.from_tensor_slices([img_size] * len(train_data))))
    print(train_ds)
    train_ds = train_ds.map(map_func=process_image, num_parallel_calls=5)
    train_ds = train_ds.apply(tf.data.experimental.ignore_errors())
    train_ds = train_ds.batch(batch_size)
    train_ds = train_ds.prefetch(buffer_size=5)
    train_ds = train_ds.repeat()

    # model
    info('Creating Model')
    base_model = tf.keras.applications.MobileNetV2(input_shape=img_shape,
                                                   include_top=False,
                                                   weights='imagenet')
    base_model.trainable = True

    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    model.summary()

    # training
    info('Training')
    steps_per_epoch = math.ceil(len(train) / batch_size)
    mlflow.tensorflow.autolog()
    model.fit(train_ds, epochs=epochs, steps_per_epoch=steps_per_epoch)

    # Log metric
    # TODO calculate metric from evaluation data.
    # accuracy = model.evaluate()
    accuracy = random()  # dummy score
    metric = {
        'name': 'accuracy-score',
        'numberValue': accuracy,
        'format': "PERCENTAGE",
    }
    metrics = {
        # [doc] https://www.kubeflow.org/docs/pipelines/sdk/pipelines-metrics/
        'metrics': [metric]
    }

    # TODO it would be nice to refactor the infra code below (logging, saving
    # files) out of this method, so it just does the training and returns the
    # model along with the metrics.

    # Log to mlflow
    mlflow.log_metrics({"accuracy": accuracy})

    # Pipeline Metric
    info('Writing Pipeline Metric')
    with file_io.FileIO('/mlpipeline-metrics.json', 'w') as f:
        json.dump(metrics, f)

    # save model
    info('Saving Model')

    # check existence of base model folder
    output = check_dir(output)

    print('Serializing into saved_model format')
    tf.saved_model.save(model, str(output))
    print('Done!')

    # add time prefix folder
    file_output = str(Path(output).joinpath('latest.h5'))
    print('Serializing h5 model to:\n{}'.format(file_output))
    model.save(file_output)
    # mlflow.log_artifact(file_output)

    return generate_hash(file_output, 'kf_pipeline')
def run(dpath, img_size=160, epochs=10, batch_size=32, learning_rate=0.0001,
        output='model', dset=None):
    global g_image_size
    g_image_size = img_size
    img_shape = (img_size, img_size, 3)

    info('Loading Data Set')
    train = load_dataset(dpath, dset)
    train_data, train_labels = zip(*train)
    train_ds = Dataset.zip((Dataset.from_tensor_slices(list(train_data)),
                            Dataset.from_tensor_slices(list(train_labels)),
                            Dataset.from_tensor_slices([img_size] * len(train_data))))
    print(train_ds)
    train_ds = train_ds.map(map_func=process_image, num_parallel_calls=5)
    train_ds = train_ds.apply(tf.data.experimental.ignore_errors())
    train_ds = train_ds.batch(batch_size)
    train_ds = train_ds.prefetch(buffer_size=5)
    train_ds = train_ds.repeat()

    info('Creating Model')
    base_model = tf.keras.applications.MobileNetV2(input_shape=img_shape,
                                                   include_top=False,
                                                   weights='imagenet')
    base_model.trainable = True
    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.summary()

    info('Training')
    steps_per_epoch = math.ceil(len(train) / batch_size)
    mlflow.tensorflow.autolog()
    model.fit(train_ds, epochs=epochs, steps_per_epoch=steps_per_epoch)

    # Log metric
    accuracy = random()  # dummy score
    metric = {
        'name': 'accuracy-score',
        'numberValue': accuracy,
        'format': "PERCENTAGE",
    }
    metrics = {'metrics': [metric]}
    mlflow.log_metrics({"accuracy": accuracy})

    info('Writing Pipeline Metric')
    with file_io.FileIO('/mlpipeline-metrics.json', 'w') as f:
        json.dump(metrics, f)

    info('Saving Model')
    output = check_dir(output)
    print('Serializing into saved_model format')
    tf.saved_model.save(model, str(output))
    print('Done!')

    file_output = str(Path(output).joinpath('latest.h5'))
    print('Serializing h5 model to:\n{}'.format(file_output))
    model.save(file_output)
    return generate_hash(file_output, 'kf_pipeline')
    return ict


obs_data = Dataset.from_tensor_slices(train_tfd)
obs_data = obs_data.repeat(n_epochs)
obs_data = obs_data.map(load_observations)
obs_data = obs_data.batch(1)

# And the test observations
obs_test_data = Dataset.from_tensor_slices(test_tfd)
obs_test_data = obs_test_data.repeat(n_epochs)
obs_test_data = obs_test_data.map(load_observations)
obs_test_data = obs_test_data.batch(1)

# Zip the target and source together for training
training_data = Dataset.zip((obs_data, field_data))
test_data = Dataset.zip((obs_test_data, field_test_data))


# Need to resize data so its dimensions are a multiple of 8 (three 2-fold poolings)
class ResizeLayer(tf.keras.layers.Layer):

    def __init__(self, newsize=None, **kwargs):
        super(ResizeLayer, self).__init__(**kwargs)
        self.resize_newsize = newsize

    def call(self, input):
        return tf.image.resize_images(input, self.resize_newsize,
                                      align_corners=True)

    def get_config(self):