def train(model='fcn5'):
    """Train an FCN model on MNIST with a single device.

    Args:
        model: 'fcn5' selects models.model_fcn5; any other value selects
            models.model_fcn8.
    """
    config = tf.ConfigProto(allow_soft_placement=False,
                            log_device_placement=FLAGS.log_device_placement)
    device_id = FLAGS.device_id
    # A non-negative device id selects that GPU; otherwise fall back to CPU.
    if int(device_id) >= 0:
        device_str = '/gpu:%d' % int(device_id)
    else:
        device_str = '/cpu:0'
    with tf.Graph().as_default(), tf.device(device_str), \
            tf.Session(config=config) as sess:
        feature_dim = models.feature_dim
        label_dim = models.label_dim
        images = tf.placeholder(tf.float32, [None, feature_dim])
        labels = tf.placeholder(tf.float32, [None, label_dim])
        if model == 'fcn5':
            logits = models.model_fcn5(images)
        else:
            logits = models.model_fcn8(images)
        loss = models.loss(logits, labels)
        predictionCorrectness = tf.equal(tf.argmax(logits, 1),
                                         tf.argmax(labels, 1))
        accuracy = tf.reduce_mean(tf.cast(predictionCorrectness, "float"))
        lr = 0.05
        optimizer = tf.train.MomentumOptimizer(lr, 0.9).minimize(loss)
        init = tf.initialize_all_variables()
        sess.run(init)
        tf.train.start_queue_runners(sess=sess)

        # Round up so a partial final batch still counts as one step.
        batch_size_per_epoch = int(
            (EPOCH_SIZE + FLAGS.batch_size - 1) / FLAGS.batch_size)
        iterations = FLAGS.epochs * batch_size_per_epoch
        average_batch_time = 0.0
        epochs_info = []  # one '<epoch>:<accuracy>:<avg loss>' entry per eval
        average_loss = 0.0
        for step in range(iterations):
            start_time = time.time()
            imgs, labs = get_real_batch_data(FLAGS.batch_size, 10)
            _, loss_value = sess.run([optimizer, loss],
                                     feed_dict={images: imgs, labels: labs})
            average_loss += loss_value
            duration = time.time() - start_time
            average_batch_time += float(duration)
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step % FLAGS.log_step == 0:
                examples_per_sec = FLAGS.batch_size / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f '
                              '(%.1f examples/sec; %.3f sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))
            if step > 0 and step % (FLAGS.eval_step * batch_size_per_epoch) == 0:
                average_loss /= FLAGS.eval_step * batch_size_per_epoch
                accuracy_value = accuracy.eval(
                    feed_dict={images: mnist.test.images,
                               labels: mnist.test.labels})
                print("test accuracy %g" % accuracy_value)
                epochs_info.append('%d:%g:%s' % (
                    step / (FLAGS.eval_step * batch_size_per_epoch),
                    accuracy_value, average_loss))
                average_loss = 0.0
        average_batch_time /= iterations
        # Fixed: was a Python 2 print statement, inconsistent with the
        # print() calls used elsewhere in this function.
        print('average_batch_time: ', average_batch_time)
        print('epoch_info: %s' % ','.join(epochs_info))
def train(model='fcn5'):
    """Train fcn5 on MNIST with synchronous data-parallel multi-GPU towers.

    Builds one replica ("tower") per GPU, averages the per-tower gradients
    on the CPU, and applies a single update step per combined batch.
    """
    if FLAGS.num_gpus < 2:
        print("The number of GPU should be 2 or more, if you use one GPU, "
              "please use fcn5_mnist.py to train")
        return
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=FLAGS.log_device_placement)
    with tf.Graph().as_default(), tf.device("/cpu:0"):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        device_ids = FLAGS.device_ids.split(',')
        if len(device_ids) > FLAGS.num_gpus:
            print('The device_ids should have the same number of GPUs with num_gpus')
            return
        lr = 0.05
        optimizer = tf.train.MomentumOptimizer(lr, 0.9)
        tower_grads = []
        feed_vars = []          # (images, labels) placeholders per tower
        average_loss_tensor = []
        for i in range(FLAGS.num_gpus):  # was xrange (Python 2 only)
            with tf.device('/gpu:%s' % device_ids[i]):
                with tf.name_scope('%s_%s' % ('TOWER', device_ids[i])) as scope:
                    feature_dim = models.feature_dim
                    label_dim = models.label_dim
                    images = tf.placeholder(tf.float32, [None, feature_dim],
                                            name='images')
                    labels = tf.placeholder(tf.float32, [None, label_dim],
                                            name='labels')
                    feed_vars.append((images, labels))
                    logits = models.model_fcn5(images)
                    loss = models.loss(logits, labels)
                    tf.add_to_collection('losses', loss)
                    # Only this tower's losses (filtered by name scope).
                    losses = tf.get_collection('losses', scope)
                    total_loss = tf.add_n(losses, name='total_loss')
                    average_loss_tensor.append(total_loss)
                    # Share model variables between all towers.
                    tf.get_variable_scope().reuse_variables()
                    grads = optimizer.compute_gradients(total_loss)
                    tower_grads.append(grads)
        print('tower_grads: ', tower_grads, '\nlen: ', len(tower_grads))
        print('total_loss: ', total_loss)
        grads = average_gradients(tower_grads)
        apply_gradient_op = optimizer.apply_gradients(grads,
                                                      global_step=global_step)
        train_op = apply_gradient_op
        average_op = tf.reduce_mean(average_loss_tensor, 0)
        saver = tf.train.Saver(tf.all_variables())
        init = tf.initialize_all_variables()
        sess = tf.Session(config=config)
        sess.run(init)
        tf.train.start_queue_runners(sess=sess)

        real_batch_size = FLAGS.batch_size * FLAGS.num_gpus
        # Round up so a partial final batch still counts as one step.
        num_batches_per_epoch = int(
            (EPOCH_SIZE + real_batch_size - 1) / real_batch_size)
        iterations = FLAGS.epochs * num_batches_per_epoch
        average_batch_time = 0.0
        epochs_info = []
        step = 0
        average_loss = 0.0
        for step in range(iterations):
            start_time = time.time()
            imgs, labs = get_real_batch_data(real_batch_size, 10)
            feed_dict = {}
            # Each tower gets its own contiguous slice of the combined batch.
            for i in range(FLAGS.num_gpus):
                feed_dict[feed_vars[i][0]] = \
                    imgs[i * FLAGS.batch_size:(i + 1) * FLAGS.batch_size]
                feed_dict[feed_vars[i][1]] = \
                    labs[i * FLAGS.batch_size:(i + 1) * FLAGS.batch_size]
            _, loss_value = sess.run([train_op, average_op],
                                     feed_dict=feed_dict)
            duration = time.time() - start_time
            average_batch_time += float(duration)
            average_loss += loss_value
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step % FLAGS.log_step == 0:
                examples_per_sec = (FLAGS.batch_size * FLAGS.num_gpus) / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f '
                              '(%.1f examples/sec; %.3f sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))
            if step > 0 and step % (FLAGS.eval_step * num_batches_per_epoch) == 0:
                average_loss /= num_batches_per_epoch * FLAGS.eval_step
                print('epoch: %d, loss: %.2f' % (
                    step / (FLAGS.eval_step * num_batches_per_epoch),
                    average_loss))
                epochs_info.append('%d:-:%s' % (
                    step / (FLAGS.eval_step * num_batches_per_epoch),
                    average_loss))
                average_loss = 0.0
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
        average_batch_time /= iterations
        # Fixed: was a Python 2 print statement, inconsistent with the
        # print() calls used elsewhere in this function.
        print('average_batch_time: ', average_batch_time)
        print('epoch_info: %s' % ','.join(epochs_info))
def train(model='fcn5'):
    """Train fcn5 on MNIST with multi-GPU towers, optional XLA and tf.data.

    Variables are placed on FLAGS.local_ps_device (balanced by parameter
    count when that device is a GPU); each listed GPU builds one tower and
    the averaged gradients are applied in a single step.  The tower on the
    first GPU also exposes a test-accuracy op.
    """
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=FLAGS.log_device_placement)
    if FLAGS.xla:
        # Turns on XLA. XLA is not included in the standard build. For
        # single GPU this shows ~5% improvement.
        config.graph_options.optimizer_options.global_jit_level = \
            tf.OptimizerOptions.ON_1
    with tf.Graph().as_default(), \
            tf.device("/" + FLAGS.local_ps_device + ":0"):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        device_ids = FLAGS.device_ids
        if not device_ids:
            device_ids = [str(i) for i in range(FLAGS.num_gpus)]
        else:
            device_ids = device_ids.split(',')

        lr = 0.05
        optimizer = tf.train.MomentumOptimizer(lr, 0.9)

        def assign_to_device(device, ps_device=FLAGS.local_ps_device):
            """Return a device function that keeps variables on the
            parameter-server device(s), greedily balancing them by element
            count, and places every other op on `device`."""
            worker_device = device
            ps_sizes = [0]
            # Fixed: `.lower` (the bound method object) was compared to
            # 'gpu', which is always False, so GPU parameter servers were
            # never balanced across FLAGS.num_gpus devices.  Call it.
            if FLAGS.local_ps_device.lower() == 'gpu':
                ps_sizes = [0] * FLAGS.num_gpus

            def _assign(op):
                if op.device:
                    return op.device
                if op.type not in ['Variable', 'VariableV2']:
                    return worker_device
                # Greedy least-loaded placement of variables.
                device_index, _ = min(enumerate(ps_sizes),
                                      key=operator.itemgetter(1))
                device_name = ('/' + FLAGS.local_ps_device + ':'
                               + str(device_index))
                var_size = op.outputs[0].get_shape().num_elements()
                ps_sizes[device_index] += var_size
                return device_name
            return _assign

        images = None
        labels = None
        if FLAGS.use_dataset:
            with tf.device('/CPU:0'):
                d_features = mnist.train.images
                d_labels = mnist.train.labels
                dataset = tf.contrib.data.Dataset.from_tensor_slices(
                    (d_features, d_labels))
                dataset = dataset.shuffle(buffer_size=60000)
                dataset = dataset.repeat()
                dataset = dataset.batch(FLAGS.batch_size)
                # Trick to get datasets to buffer the next epoch.  Needed
                # because the data loading occurs outside DataSets in
                # python; with in-DataSet preprocessing this identity map
                # would not be required.
                dataset = dataset.map(lambda x, y: (x, y),
                                      num_threads=FLAGS.num_gpus,
                                      output_buffer_size=FLAGS.num_gpus)
                iterator = dataset.make_initializable_iterator()
                images, labels = iterator.get_next()

        tower_grads = []
        feed_vars = []          # per-tower placeholders (placeholder mode)
        average_loss_tensor = []
        reuse_variables = False
        accuracy = None
        for i in range(FLAGS.num_gpus):  # was xrange (Python 2 only)
            with tf.device(assign_to_device('/gpu:%s' % device_ids[i])):
                with tf.name_scope('%s_%s' % ('TOWER', device_ids[i])) as scope:
                    if not FLAGS.use_dataset:
                        feature_dim = models.feature_dim
                        label_dim = models.label_dim
                        images = tf.placeholder(tf.float32,
                                                [None, feature_dim],
                                                name='images')
                        labels = tf.placeholder(tf.int64, [None, label_dim],
                                                name='labels')
                        feed_vars.append((images, labels))
                    with tf.variable_scope(tf.get_variable_scope(),
                                           reuse=reuse_variables):
                        logits = models.model_fcn5(images)
                    if i == 0:
                        # Prediction only on GPU:0
                        predictionCorrectness = tf.equal(
                            tf.argmax(logits, 1), tf.argmax(labels, 1))
                        accuracy = tf.reduce_mean(
                            tf.cast(predictionCorrectness, "float"))
                    loss = models.loss(logits, labels)
                    reuse_variables = True
                    average_loss_tensor.append(loss)
                    grads = optimizer.compute_gradients(loss)
                    tower_grads.append(grads)

        grads = average_gradients(tower_grads)
        apply_gradient_op = optimizer.apply_gradients(grads,
                                                      global_step=global_step)
        train_op = apply_gradient_op
        average_op = tf.reduce_mean(average_loss_tensor)
        saver = tf.train.Saver(tf.global_variables())
        init = tf.global_variables_initializer()
        sess = tf.Session(config=config)
        sess.run(init)
        if FLAGS.use_dataset:
            sess.run(iterator.initializer)

        real_batch_size = FLAGS.batch_size * FLAGS.num_gpus
        # Round up so a partial final batch still counts as one step.
        num_batches_per_epoch = int(
            (EPOCH_SIZE + real_batch_size - 1) / real_batch_size)
        iterations = FLAGS.epochs * num_batches_per_epoch
        average_batch_time = 0.0
        epochs_info = []
        step = 0
        average_loss = 0.0
        for step in range(iterations):
            start_time = time.time()
            feed_dict = {}
            if not FLAGS.use_dataset:
                imgs, labs = get_real_batch_data(real_batch_size, 10)
                # Each tower gets its own slice of the combined batch.
                for i in range(FLAGS.num_gpus):
                    feed_dict[feed_vars[i][0]] = \
                        imgs[i * FLAGS.batch_size:(i + 1) * FLAGS.batch_size]
                    feed_dict[feed_vars[i][1]] = \
                        labs[i * FLAGS.batch_size:(i + 1) * FLAGS.batch_size]
            _, loss_value = sess.run([train_op, average_op],
                                     feed_dict=feed_dict)
            duration = time.time() - start_time
            average_batch_time += float(duration)
            average_loss += loss_value
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step % FLAGS.log_step == 0:
                examples_per_sec = (FLAGS.batch_size * FLAGS.num_gpus) / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f '
                              '(%.1f examples/sec; %.3f sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))
            if step > 0 and step % (FLAGS.eval_step * num_batches_per_epoch) == 0:
                average_loss /= num_batches_per_epoch * FLAGS.eval_step
                print('epoch: %d, loss: %.2f' % (
                    step / (FLAGS.eval_step * num_batches_per_epoch),
                    average_loss))
                epochs_info.append('%d:-:%s' % (
                    step / (FLAGS.eval_step * num_batches_per_epoch),
                    average_loss))
                average_loss = 0.0
                # In dataset mode we can feed the iterator's output tensors
                # directly; TF1 feed_dict overrides any tensor.
                feed_dict = {images: mnist.test.images,
                             labels: mnist.test.labels}
                if not FLAGS.use_dataset:
                    feed_dict = {feed_vars[0][0]: mnist.test.images,
                                 feed_vars[0][1]: mnist.test.labels}
                accuracy_value = accuracy.eval(session=sess,
                                               feed_dict=feed_dict)
                print("test accuracy %g" % accuracy_value)
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
        average_batch_time /= iterations
        # Fixed: was a Python 2 print statement, inconsistent with the
        # print() calls used elsewhere in this function.
        print('average_batch_time: ', average_batch_time)
        print('epoch_info: %s' % ','.join(epochs_info))
def train(model='fcn5'):
    """Single-device MNIST trainer with optional tf.data pipeline and XLA.

    Args:
        model: 'fcn5' selects models.model_fcn5; any other value selects
            models.model_fcn8.
    """
    config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)
    device_id = FLAGS.device_id
    if int(device_id) >= 0:
        device_str = '/gpu:%d' % int(device_id)
        config.allow_soft_placement = True
        config.intra_op_parallelism_threads = 1
        config.inter_op_parallelism_threads = 0
    else:
        device_str = '/cpu:0'
        # Respect OMP_NUM_THREADS for CPU runs; default to a single thread.
        num_threads = os.getenv('OMP_NUM_THREADS', 1)
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=int(num_threads))
    if FLAGS.xla:
        # Turns on XLA. XLA is not included in the standard build. For
        # single GPU this shows ~5% improvement.
        config.graph_options.optimizer_options.global_jit_level = \
            tf.OptimizerOptions.ON_1
    with tf.Graph().as_default(), tf.device(device_str), \
            tf.Session(config=config) as sess:
        feature_dim = models.feature_dim
        label_dim = models.label_dim
        images = None
        labels = None
        iterator = None
        if FLAGS.use_dataset:
            with tf.device('/CPU:0'):
                d_features = mnist.train.images
                d_labels = mnist.train.labels
                dataset = tf.contrib.data.Dataset.from_tensor_slices(
                    (d_features, d_labels))
                dataset = dataset.repeat()
                dataset = dataset.shuffle(buffer_size=60000)
                dataset = dataset.batch(FLAGS.batch_size)
                # Trick to get datasets to buffer the next epoch.  Needed
                # because the data loading occurs outside DataSets in
                # python; with in-DataSet preprocessing this identity map
                # would not be required.
                dataset = dataset.map(lambda x, y: (x, y),
                                      num_threads=1,
                                      output_buffer_size=1)
                iterator = dataset.make_initializable_iterator()
                images, labels = iterator.get_next()
        else:
            images = tf.placeholder(tf.float32, [None, feature_dim],
                                    name="images_placeholder")
            labels = tf.placeholder(tf.int64, [None, label_dim],
                                    name="labels_placeholder")
        if model == 'fcn5':
            logits = models.model_fcn5(images)
        else:
            logits = models.model_fcn8(images)
        loss = models.loss(logits, labels)
        predictionCorrectness = tf.equal(tf.argmax(logits, 1),
                                         tf.argmax(labels, 1))
        accuracy = tf.reduce_mean(tf.cast(predictionCorrectness, "float"))
        lr = 0.05
        optimizer = tf.train.MomentumOptimizer(lr, 0.9).minimize(loss)
        init = tf.global_variables_initializer()
        sess.run(init)
        if FLAGS.use_dataset:
            sess.run(iterator.initializer)

        # Round up so a partial final batch still counts as one step.
        batch_size_per_epoch = int(
            (EPOCH_SIZE + FLAGS.batch_size - 1) / FLAGS.batch_size)
        iterations = FLAGS.epochs * batch_size_per_epoch
        average_batch_time = 0.0
        epochs_info = []
        average_loss = 0.0
        for step in range(iterations):
            start_time = time.time()
            if FLAGS.use_dataset:
                # The iterator feeds the graph directly; nothing to feed.
                _, loss_value = sess.run([optimizer, loss])
            else:
                imgs, labs = get_real_batch_data(FLAGS.batch_size, 10)
                _, loss_value = sess.run([optimizer, loss],
                                         feed_dict={images: imgs,
                                                    labels: labs})
            duration = time.time() - start_time
            average_loss += loss_value
            average_batch_time += float(duration)
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step % FLAGS.log_step == 0:
                examples_per_sec = FLAGS.batch_size / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f '
                              '(%.1f examples/sec; %.3f sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))
            if step > 0 and step % (FLAGS.eval_step * batch_size_per_epoch) == 0:
                average_loss /= FLAGS.eval_step * batch_size_per_epoch
                # Feeding the iterator's output tensors is legal in TF1:
                # feed_dict overrides any tensor, not just placeholders.
                accuracy_value = accuracy.eval(
                    feed_dict={images: mnist.test.images,
                               labels: mnist.test.labels})
                print("test accuracy %g" % accuracy_value)
                epochs_info.append('%d:%g:%s' % (
                    step / (FLAGS.eval_step * batch_size_per_epoch),
                    accuracy_value, average_loss))
                average_loss = 0.0
        average_batch_time /= iterations
        # Fixed: was a Python 2 print statement, inconsistent with the
        # print() calls used elsewhere in this function.
        print('average_batch_time: ', average_batch_time)
        print('epoch_info: %s' % ','.join(epochs_info))
def main():
    """Evaluate every checkpoint on the test set, keep the lowest-MSE one,
    and write sample inference patches next to their ground truth."""
    ckpt_state = tf.train.get_checkpoint_state(CHECKPOINTS_PATH)
    if not ckpt_state or not ckpt_state.model_checkpoint_path:
        print('No check point files are found!')
        return
    ckpt_files = ckpt_state.all_model_checkpoint_paths
    num_ckpt = len(ckpt_files)
    if num_ckpt < 1:
        print('No check point files are found!')
        return

    low_res_holder = tf.placeholder(
        tf.float32, shape=[BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, NUM_CHENNELS])
    high_res_holder = tf.placeholder(
        tf.float32, shape=[BATCH_SIZE, LABEL_SIZE, LABEL_SIZE, NUM_CHENNELS])
    inferences = models.create_model(MODEL_NAME, low_res_holder)
    testing_loss = models.loss(inferences, high_res_holder,
                               name='testing_loss')
    low_res_batch, high_res_batch = batch_queue_for_testing(TESTING_DATA_PATH)

    sess = tf.Session()
    # we still need to initialize all variables even when we use Saver's
    # restore method.
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())
    tf.train.start_queue_runners(sess=sess)

    # Fixed: the old sentinel (100000) would silently select no model if
    # every checkpoint's MSE exceeded it; infinity is safe for any scale.
    best_mse = float('inf')
    best_ckpt = ''
    for ckpt_file in ckpt_files:
        saver.restore(sess, ckpt_file)
        mse = 0
        for i in range(NUM_TESTING_STEPS):
            low_res_images, high_res_images = sess.run(
                [low_res_batch, high_res_batch])
            feed_dict = {
                low_res_holder: low_res_images,
                high_res_holder: high_res_images
            }
            mse += sess.run(testing_loss, feed_dict=feed_dict)
        mse /= NUM_TESTING_STEPS
        print('Model: %s. MSE: %.3f' % (ckpt_file, mse))
        if mse < best_mse:
            best_mse = mse
            best_ckpt = ckpt_file
    print('Best model: %s. MSE: %.3f' % (best_ckpt, best_mse))

    # now, we use the best model to generate some inference patches and
    # compare with the ground truthes
    print('\ngenerating inference patches...')
    saver.restore(sess, best_ckpt)
    for k in range(4):
        low_res_images, high_res_images = sess.run(
            [low_res_batch, high_res_batch])
        feed_dict = {
            low_res_holder: low_res_images,
            high_res_holder: high_res_images
        }
        inference_patches = sess.run(inferences, feed_dict=feed_dict)
        if not os.path.exists(INFERENCES_SAVE_PATH):
            os.mkdir(INFERENCES_SAVE_PATH)
        for i in range(BATCH_SIZE):
            low_res_input = low_res_images[i, ...]  # INPUT_SIZE x INPUT_SIZE
            ground_truth = high_res_images[i, ...]  # LABEL_SIZE x LABEL_SIZE
            inference = inference_patches[i, ...]
            # Center-crop the ground truth to the inference's spatial size.
            crop_begin = (ground_truth.shape[0] - inference.shape[0]) // 2
            crop_end = crop_begin + inference.shape[0]
            ground_truth = ground_truth[crop_begin:crop_end,
                                        crop_begin:crop_end, ...]
            low_res_input = cv.resize(low_res_input,
                                      (LABEL_SIZE, LABEL_SIZE),
                                      interpolation=cv.INTER_CUBIC)
            low_res_input = low_res_input[crop_begin:crop_end,
                                          crop_begin:crop_end, ...]
            # Side-by-side strip: bicubic input | inference | ground truth.
            patch_pair = np.hstack((low_res_input, inference, ground_truth))
            # NOTE(review): this adds a new graph op per patch; fine for
            # 4 * BATCH_SIZE patches, but hoist it if those counts grow.
            patch_pair = tf.image.convert_image_dtype(patch_pair, tf.uint8,
                                                      True)
            save_name = 'inference_%d_%d.png' % (k, i)
            cv.imwrite(join(INFERENCES_SAVE_PATH, save_name),
                       patch_pair.eval(session=sess))
    print('Test Finished!')
def main():
    """Train the super-resolution model, validating every 1000 steps and
    checkpointing every 10000 steps."""
    low_res_holder = tf.placeholder(
        tf.float32, shape=[BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, NUM_CHENNELS])
    high_res_holder = tf.placeholder(
        tf.float32, shape=[BATCH_SIZE, LABEL_SIZE, LABEL_SIZE, NUM_CHENNELS])
    inferences = models.create_model(MODEL_NAME, low_res_holder)
    training_loss = models.loss(inferences, high_res_holder,
                                name='training_loss', weights_decay=0)
    validation_loss = models.loss(inferences, high_res_holder,
                                  name='validation_loss')
    tf.summary.scalar('training_loss', training_loss)
    tf.summary.scalar('validation_loss', validation_loss)

    global_step = tf.Variable(0, trainable=False, name='global_step')
    # Decay: lr = 0.001 / (1 + 2 * global_step / 10000)
    learning_rate = tf.train.inverse_time_decay(0.001, global_step, 10000, 2)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(
        training_loss, global_step=global_step)

    low_res_batch, high_res_batch = batch_queue_for_training(
        TRAINING_DATA_PATH)
    low_res_eval, high_res_eval = batch_queue_for_testing(
        VALIDATION_DATA_PATH)

    init = (tf.global_variables_initializer(),
            tf.local_variables_initializer())
    sess = tf.Session()
    sess.run(init)
    # Start the queue runners (make batches).
    tf.train.start_queue_runners(sess=sess)

    # the saver will restore all model's variables during training
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=MAX_CKPT_TO_KEEP)
    # Merge all the summaries and write them out to TRAINING_DIR
    merged_summary = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(TRAINING_SUMMARY_PATH, sess.graph)

    for step in range(1, NUM_TRAINING_STEPS + 1):
        start_time = time.time()
        low_res_images, high_res_images = sess.run(
            [low_res_batch, high_res_batch])
        feed_dict = {
            low_res_holder: low_res_images,
            high_res_holder: high_res_images
        }
        _, batch_loss = sess.run([train_step, training_loss],
                                 feed_dict=feed_dict)
        duration = time.time() - start_time
        assert not np.isnan(batch_loss), 'Model diverged with loss = NaN'
        if step % 100 == 0:  # show training status
            num_examples_per_step = BATCH_SIZE
            examples_per_sec = num_examples_per_step / duration
            sec_per_batch = float(duration)
            format_str = ('step %d, batch_loss = %.3f '
                          '(%.1f examples/sec; %.3f sec/batch)')
            print(format_str % (step, batch_loss, examples_per_sec,
                                sec_per_batch))
        if step % 1000 == 0:  # run validation and show its result
            low_res_images, high_res_images = sess.run(
                [low_res_eval, high_res_eval])
            feed_dict = {
                low_res_holder: low_res_images,
                high_res_holder: high_res_images
            }
            batch_loss = sess.run(validation_loss, feed_dict=feed_dict)
            print('step %d, validation loss = %.3f' % (step, batch_loss))
            summary = sess.run(merged_summary, feed_dict=feed_dict)
            # Fixed: pass the step so TensorBoard plots the summaries along
            # the x-axis instead of stacking them all at the same point.
            summary_writer.add_summary(summary, step)
        # Save the model checkpoint periodically.
        if step % 10000 == 0 or (step + 1) == NUM_TRAINING_STEPS:
            saver.save(sess, join(CHECKPOINTS_PATH, 'model.ckpt'),
                       global_step=step)
    print('Training Finished!')
def train(training_files,
          testing_files,
          params={'tau': 1e-5, 'priorlengthscale': 1e1},
          learning_rate=1e-4,
          save_dir=None,
          model_name=None,
          batch_size=128,
          device='/gpu:1',
          iterations=int(1e6),
          save_step=int(1e3),
          summary_step=int(1e2),
          N=int(1e9)):
    """Train a concrete-dropout emulator model; return the mean test loss
    over 100 held-out batches.

    NOTE(review): `params` is a mutable default argument; it is read-only
    here (possibly replaced wholesale from disk), so the shared-default
    pitfall does not bite — kept to preserve the signature.
    """
    # Prefer hyper-parameters found by a previous Bayesian-optimisation run.
    params_path = ('./default-checkpoint/bayes_opt/%s_best_parameters_Ax.txt'
                   % model_name)
    if os.path.exists(params_path):
        with open(params_path) as json_file:
            params = json.load(json_file)
        print('----loaded best parameters----')
    tau, priorlengthscale = params['tau'], params['priorlengthscale']

    if save_dir is None:
        save_dir = './default-checkpoint'
    save_dir = os.path.join(
        save_dir,
        "default-%s-tau-%.3E-pls-%s.ckpt" % (model_name, tau,
                                             priorlengthscale))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    em = himawari.EmulatorData()
    train_set = em.make_dataset(training_files, batch_size=batch_size)
    test_set = em.make_dataset(testing_files, batch_size=batch_size)

    # Use CNN
    output_bands = 6
    if model_name == 'DCFC':
        model = DCCNN(layer_sizes=[512] * 3 + [output_bands * 2 + 1],
                      filter_sizes=[1] * 4,
                      output_bands=output_bands,
                      N=N, tau=tau, priorlengthscale=priorlengthscale)
    elif model_name == 'DCCNN':
        model = DCCNN(layer_sizes=[512] * 3 + [output_bands * 2 + 1],
                      filter_sizes=[3] * 4,
                      output_bands=output_bands,
                      N=N, tau=tau, priorlengthscale=priorlengthscale)
    elif model_name == 'DCResNet':
        model = DCResNet(blocks=5, output_bands=output_bands,
                         N=N, tau=tau, priorlengthscale=priorlengthscale)
    elif model_name == 'DCVDSR':
        model = DCVDSR(hidden_layers=[512] * 3, output_bands=output_bands,
                       N=N, tau=tau, priorlengthscale=priorlengthscale)
    else:
        # Fixed: an unknown model_name previously fell through and crashed
        # later with a confusing NameError on `model`.
        raise ValueError('unknown model_name: %r' % model_name)

    optimizer = tf.compat.v2.keras.optimizers.Adam(learning_rate)
    ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=optimizer,
                               net=model)
    manager = tf.train.CheckpointManager(ckpt, save_dir, max_to_keep=3)
    if manager.latest_checkpoint:
        ckpt.restore(manager.latest_checkpoint)
        print("Restoring from checkpoint {}".format(manager.latest_checkpoint))

    summary_writer = tf.summary.create_file_writer(save_dir + '/log')
    with summary_writer.as_default():
        for i in range(iterations):
            element = train_set.get_next()
            x_train, y_train, m_train = (element['AHI05'], element['AHI12'],
                                         element['mask'])
            element = test_set.get_next()
            x_test, y_test, m_test = (element['AHI05'], element['AHI12'],
                                      element['mask'])
            start_time = time.time()
            with tf.GradientTape() as tape:
                loc, logvar, probs, prediction, reg_losses, dropout_probs = \
                    model(x_train, training=True)
                train_loss = loss(y_train, m_train, loc, logvar, probs,
                                  reg_losses=reg_losses,
                                  is_training=tf.constant(True),
                                  step=ckpt.step)
            grads = tape.gradient(train_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            duration = time.time() - start_time
            ckpt.step.assign_add(1)
            if int(ckpt.step) % save_step == 0:
                tf.saved_model.save(model, save_dir)  # only the latest model for now
                manager.save()
            if int(ckpt.step) % summary_step == 0:
                loc, logvar, probs, prediction, reg_losses, dropout_probs = \
                    model(x_test, training=False)
                test_loss = loss(y_test, m_test, loc, logvar, probs,
                                 reg_losses=reg_losses,
                                 is_training=tf.constant(False),
                                 step=ckpt.step)
                print("Step: %d, Examples/sec: %0.5f, Training Loss: %2.4f, Test Loss: %2.4f" % \
                      (int(ckpt.step), batch_size / duration, train_loss,
                       test_loss))
                print("dropout probabilities: ", dropout_probs)
                # Renamed loop variable (was `i`) so it no longer shadows
                # the outer iteration counter.
                for d in range(len(dropout_probs)):
                    tf.compat.v2.summary.scalar(
                        'concrete/dropout_prob_%s' % d,
                        tf.reduce_mean(dropout_probs[d]),
                        step=int(ckpt.step))
                tf.summary.image('input-band0',
                                 tf.expand_dims(x_test[:, :, :, 0], -1),
                                 step=int(ckpt.step))
                tf.summary.image('label-band0',
                                 tf.expand_dims(
                                     tf.nn.relu(y_test[:, :, :, 0]), -1),
                                 step=int(ckpt.step))
                tf.summary.image('output-band0',
                                 tf.expand_dims(
                                     tf.nn.relu(prediction[:, :, :, 0]), -1),
                                 step=int(ckpt.step))

    # Final evaluation over 100 held-out batches.
    test_loss = 0
    for i in range(100):
        element = test_set.get_next()
        x_test, y_test, m_test = (element['AHI05'], element['AHI12'],
                                  element['mask'])
        # Fixed: the model returns six values (see the calls above); the
        # original five-name unpack raised ValueError at runtime here.
        loc, logvar, probs, prediction, reg_losses, dropout_probs = \
            model(x_test, training=False)
        test_loss += loss(y_test, m_test, loc, logvar, probs,
                          reg_losses=reg_losses, step=ckpt.step,
                          is_training=tf.constant(False))
    return test_loss.numpy() / 100.
def train():
    """Train RetinaNet with the settings in Config, saving a checkpoint
    every 10 epochs and a final one at the end."""
    config = Config()
    classNum = config.classnum
    batch = config.batch
    size = config.size
    epoches = config.epoches
    preTrain = config.preTrain
    trainweights = config.trainWeights
    weights = config.weightsSave
    start_lr = config.start_lr
    lr_change = config.lr_change
    lr_decay = config.lr_decay
    os.makedirs(weights, exist_ok=True)

    model = RetinaNet(weights=preTrain, classNum=classNum)
    if t.cuda.is_available():
        print("----GPU-Training----")
        model = model.cuda()
    # Fixed: `not trainweights == None` -> identity check (PEP 8).
    if trainweights is not None:
        print("trainWeights:", trainweights)
        model.load_state_dict(t.load(trainweights))
    model.train()

    optimer = Adam(model.parameters(), lr=start_lr)
    optimer.zero_grad()
    scheduler = lr_scheduler.MultiStepLR(optimer, lr_change, lr_decay)

    datasets = TrainDataset(img_road="datasets/train.txt", size=(size, size))
    dataloader = DataLoader(datasets,
                            batch_size=batch,
                            shuffle=True,
                            collate_fn=datasets.collate_fn,
                            drop_last=True)
    Loss = loss()
    for epoch in range(epoches):
        print("epoch-{}".format(epoch))
        for i, (imgs, labels, paths) in enumerate(dataloader):
            print("--epoch-{}-batch-{}--".format(epoch, i))
            if t.cuda.is_available():
                imgs = imgs.cuda()
                labels = labels.cuda()
            classify, regression, all_anchor = model(imgs)
            all_loss = Loss(classify, regression, labels, all_anchor)
            print("Loss:", all_loss)
            all_loss.backward()
            optimer.step()
            optimer.zero_grad()
        # MultiStepLR milestones count epochs, so advance once per epoch.
        scheduler.step()
        if (epoch + 1) % 10 == 0:
            # NOTE(review): the "+ 49" offset in the file name looks like a
            # resume artifact — confirm the intended checkpoint numbering.
            t.save(model.state_dict(),
                   weights + "epoch{}.pth".format(epoch + 49))
    t.save(model.state_dict(), weights + "finally.pth")
def main():
    """Parse CLI arguments, build the (Small)UNet model and run training."""
    SEED = 42
    torch.manual_seed(SEED)
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--map_dir", default=None, type=str, required=True,
                        help="Folder containing maps")
    parser.add_argument(
        "--goal_dir", default=None, type=str, required=True,
        help="Folder containing goals for maps. See dataset class for info.")
    parser.add_argument(
        "--heuristic_dir", default=None, type=str, required=True,
        help="Folder containing heurisctics for maps. See dataset class for info.")
    parser.add_argument(
        "--map_to_heuristic", default=None, type=str, required=True,
        help="json file with maps names as keys and heuristic files as values. Note that goal and heuristic for one task should have the same names.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: small, big")
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model checkpoints and predictions will be written.")
    parser.add_argument('--alpha', type=float, default=0.0, required=True,
                        help="Weight for gradient loss.")
    parser.add_argument(
        '--alpha1', type=float, default=1.0, required=True,
        help="Weight for component of piece loss where output heuristic is less than minimal cost.")
    parser.add_argument(
        '--alpha2', type=float, default=0.0, required=True,
        help="Weight for component of piece loss where output heuristic is more than target cost.")
    parser.add_argument("--batch_size", default=32, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--learning_rate", default=1e-3, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        '--desired_batch_size', type=int, default=32,
        help="Desired batch size to accumulate before performing a backward/update pass.")
    parser.add_argument("--num_train_epochs", default=10, type=int,
                        help="Total number of training epochs to perform.")
    args = parser.parse_args()

    alpha = args.alpha
    alpha1 = args.alpha1
    alpha2 = args.alpha2
    if args.model_type == 'small':
        model = SmallUNet()
    elif args.model_type == 'big':
        model = UNet()
    else:
        # Fixed: `raise (ValueError, msg)` raises a TypeError in Python 3
        # because a tuple is not an exception; raise the exception itself.
        raise ValueError('Model type should be in [small, big]')

    learning_rate = args.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # Define `device` before the closure that captures it, so correctness
    # no longer depends on the lambda's late binding.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = lambda output, target_map, minimal_cost: loss(
        output, target_map, minimal_cost, device, alpha, alpha1, alpha2)
    exp_name = f'alpha_{alpha}_alpha1_{alpha1}_alpha2_{alpha2}'

    MAP_DIR = args.map_dir
    HEURISTIC_DIR = args.heuristic_dir
    GOAL_DIR = args.goal_dir
    map2heuristic_path = args.map_to_heuristic
    output_dir = args.output_dir
    with open(map2heuristic_path, 'r') as file:
        map2heuristic = json.load(file)
    batch_size = args.batch_size
    num_epochs = args.num_train_epochs
    # Gradient-accumulation target; never below the per-step batch size.
    desired_batch_size = (args.desired_batch_size
                          if args.desired_batch_size > batch_size
                          else batch_size)
    config = {
        'learning_rate': learning_rate,
        'alpha': alpha,
        'alpha1': alpha1,
        'alpha2': alpha2,
        'num_epochs': num_epochs,
        'batch_size': batch_size,
        'desired_batch_size': desired_batch_size
    }
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    with open(os.path.join(output_dir, 'config.json'), 'w') as file:
        json.dump(config, file)

    dataset = MapsDataset(MAP_DIR, HEURISTIC_DIR, GOAL_DIR, map2heuristic,
                          maps_size=(64, 64))
    train_dataset, val_dataset = random_split(dataset, [40000, 10000])
    train_batch_gen = DataLoader(train_dataset, batch_size=batch_size,
                                 shuffle=True, pin_memory=True,
                                 num_workers=cpu_count())
    val_batch_gen = DataLoader(val_dataset, batch_size=batch_size,
                               shuffle=True, pin_memory=True,
                               num_workers=cpu_count())
    _ = train_net(model, criterion, optimizer, train_batch_gen,
                  val_batch_gen, device, num_epochs=num_epochs,
                  output_dir=output_dir,
                  desired_batch_size=desired_batch_size, exp_name=exp_name)
import shutil from torch.utils.tensorboard import SummaryWriter if __name__ == '__main__': device = 'cuda:0' if torch.cuda.is_available() else 'cpu' parser = argparse.ArgumentParser() parser.add_argument('--batch_size', type=int, default=2) parser.add_argument('--lr', type=float, default=0.0001) parser.add_argument('--epoch', type=int, default=250) args = parser.parse_args() train_loader, test_loader = datasets.prepare(batch_size=args.batch_size) model = models.net(num_classes=datasets.num_classes).to(device) criterion = models.loss(num_classes=datasets.num_classes) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) log_dir = 'data/runs' if os.path.exists(log_dir): shutil.rmtree(log_dir) os.makedirs(log_dir) else: os.makedirs(log_dir) writer = SummaryWriter(log_dir=log_dir) epoch_digit = len(list(str(args.epoch))) for epoch in range(args.epoch): model.train() train_loss = 0 train_acc = 0 train_number = 0