def main(argv):
    """Parse command-line arguments and launch training through learn_runner.

    Arguments whose destination starts with ``hp_`` are collected (prefix
    stripped) into the hyperparameter dict; all others are forwarded as
    keyword arguments to ``experiment_fn_with_params``. ``argv`` itself is
    not read: argparse parses ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    # You must accept a --job-dir argument when running on Cloud ML Engine.
    # It specifies where checkpoints should be saved. Additional user
    # arguments have to be specified after an empty arg -- on the command line:
    #   gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args
    #
    # Tuning notes:
    #   no batch norm: lr 0.002-0.0002-2000 is ok, over 10000 iterations
    #                  (final accuracy 0.9937 loss 2.39 job156)
    #   batch norm:    lr 0.02-0.0001-600 conv 16-32-64 trains in 3000 iterations
    #                  (loss 1.466 job 159; the accuracy figure in the original
    #                  note was garbled)
    add('--job-dir', default="checkpoints",
        help='GCS or local path where to store training checkpoints')
    add('--data-dir', default="data",
        help='Where training data will be loaded and unzipped')
    add('--hp-lr0', default=0.02, type=float,
        help='Hyperparameter: initial (max) learning rate')
    add('--hp-lr1', default=0.0001, type=float,
        help='Hyperparameter: target (min) learning rate')
    add('--hp-lr2', default=600, type=float,
        help='Hyperparameter: learning rate decay speed in steps. Learning rate decays by exp(-1) every N steps.')
    add('--hp-dropout', default=0.3, type=float,
        help='Hyperparameter: dropout rate on dense layers.')
    add('--hp-conv1', default=6, type=int,
        help='Hyperparameter: depth of first convolutional layer.')
    add('--hp-conv2', default=12, type=int,
        help='Hyperparameter: depth of second convolutional layer.')
    add('--hp-conv3', default=24, type=int,
        help='Hyperparameter: depth of third convolutional layer.')
    add('--hp-bnexp', default=0.993, type=float,
        help='Hyperparameter: exponential decay for batch norm moving averages.')
    add('--hp-iterations', default=10000, type=int,
        help='Hyperparameter: number of training iterations.')

    # Split the parsed namespace into hyperparameters and everything else.
    hparams = {}
    otherargs = {}
    for name, value in vars(parser.parse_args()).items():
        if name.startswith('hp_'):
            hparams[name[3:]] = value
        else:
            otherargs[name] = value
    logging.log(logging.INFO, "Hyperparameters:" + str(sorted(hparams.items())))

    output_dir = otherargs.pop('job_dir')

    # learn_runner needs an experiment function with a single parameter: the
    # output directory. The remaining arguments travel through this closure.
    def experiment_fn(output_dir):
        return experiment_fn_with_params(output_dir, hparams, **otherargs)

    # Compatibility warning: learn_runner is currently in contrib. It will move in TF 1.2
    tf.contrib.learn.learn_runner.run(experiment_fn, output_dir)
def log(level, message, *args):
    """Log `message % args` at `level`, to TensorBoard and to the platform log.

    The tensorboard_logging verbosity only gates the TensorBoard event; the
    message is always handed to the `logging` module, whatever that
    verbosity is.

    Args:
      level: One of tensorboard_logging.{DEBUG, INFO, WARN, ERROR, FATAL}.
      message: The message template.
      *args: Values interpolated into the template, if any.

    Raises:
      ValueError: If `level` is not a valid logging level.
      RuntimeError: If the `SummaryWriter` to use has not been set.
    """
    writer_unset = _summary_writer is _sentinel_summary_writer
    if writer_unset:
        raise RuntimeError('Must call set_summary_writer before doing any '
                           'logging from tensorboard_logging')
    _check_verbosity(level)

    level_as_proto = _LEVEL_PROTO_MAP[level]
    if level_as_proto >= _LEVEL_PROTO_MAP[_verbosity]:
        # The event is built even when the writer is falsy, so a bad
        # template still fails here exactly as before.
        text = message % args
        payload = event_pb2.LogMessage(level=level_as_proto, message=text)
        event = event_pb2.Event(wall_time=time.time(), log_message=payload)
        if _summary_writer:
            _summary_writer.add_event(event)

    platform_level = _PLATFORM_LOGGING_LEVEL_MAP[level]
    logging.log(platform_level, message, *args)
def log(level, message, *args):
    """Conditionally emit `message % args` as a TensorBoard log event.

    Independently of the tensorboard_logging verbosity check, the message is
    always passed on to the `logging` module as well.

    Args:
      level: Verbosity level; must be one of
        tensorboard_logging.{DEBUG, INFO, WARN, ERROR, FATAL}.
      message: Message template.
      *args: Optional values to interpolate into the template.

    Raises:
      ValueError: If `level` is not a valid logging level.
      RuntimeError: If the `SummaryWriter` to use has not been set.
    """
    if _summary_writer is _sentinel_summary_writer:
        raise RuntimeError('Must call set_summary_writer before doing any '
                           'logging from tensorboard_logging')
    _check_verbosity(level)

    proto_level = _LEVEL_PROTO_MAP[level]
    verbose_enough = proto_level >= _LEVEL_PROTO_MAP[_verbosity]
    if verbose_enough:
        event = event_pb2.Event(
            wall_time=time.time(),
            log_message=event_pb2.LogMessage(level=proto_level,
                                             message=message % args))
        if _summary_writer:
            _summary_writer.add_event(event)

    # Always forward to the regular logging module.
    logging.log(_PLATFORM_LOGGING_LEVEL_MAP[level], message, *args)
def run_data_generation(data, output_dir, record_batch_size, shuffle_buf,
                        tiles_per_gt_roi, rnd_distmax, rnd_orientation, is_eval):
    """Generate image tiles from `data` and write them as TFRecord files.

    Each batch pulled from the dataset becomes one file in `output_dir`,
    named "<record_batch_size>tiles<batch number>_<basename>.tfrecord", where
    the basename comes from the first file name of the batch. Returns early,
    after logging, when `load_file_list(data)` finds no images.
    """
    img_filelist, roi_filelist = load_file_list(data)

    # sanity checks and log messages
    if len(img_filelist) == 0:
        logging.log(logging.INFO, "No image/json pairs found in folder {}. Skipping.".format(data))
        return
    logging.log(logging.INFO, "Generating {} data.".format("eval" if is_eval else "training"))

    # dummy args only used in YOLO box assignments, which will be discarded anyway
    # TODO: refactor these outside of the generate_slice function
    yolo_cfg = YOLOConfig(grid_nn=16, cell_n=2, cell_swarm=True, cell_grow=1.0)

    if is_eval:
        dataset = init_eval_dataset_from_images(img_filelist, roi_filelist,
                                                record_batch_size, yolo_cfg)
    else:
        dataset = init_train_dataset_from_images(img_filelist, roi_filelist,
                                                 record_batch_size, shuffle_buf, yolo_cfg,
                                                 False,  # False = no rnd hue
                                                 rnd_orientation, tiles_per_gt_roi, rnd_distmax)
    dataset = dataset.repeat(1)

    # --- TF graph for JPEG image encoding ---
    def _to_jpeg(image_bytes):
        return tf.image.encode_jpeg(image_bytes, optimize_size=True, chroma_downsampling=False)

    features, labels = dataset.make_one_shot_iterator().get_next()
    image_tiles = features['image']
    fname = labels['fnames']
    target_rois = labels['target_rois']  # shape [n_tiles, MAX_TARGET_ROIS_PER_TILE, 4]
    encoded_jpegs = tf.map_fn(_to_jpeg, image_tiles, dtype=tf.string)
    # --- end of TF graph for image encoding ---

    fetches = [encoded_jpegs, target_rois, fname]
    batch_count = 0
    with tf.Session() as sess:
        while True:
            try:
                jpegs_val, rois_val, fnames_val = sess.run(fetches)
            except (tf.errors.OutOfRangeError, tf.errors.NotFoundError):
                # either error ends the generation loop
                break
            batch_count += 1

            # one output file per batch, named after the first source file of the batch
            stem, _ = os.path.splitext(os.path.basename(fnames_val[0].decode("utf-8")))
            record_name = "{}tiles{:06}_{}.tfrecord".format(record_batch_size, batch_count, stem)
            record_path = os.path.join(output_dir, record_name)

            with tf.python_io.TFRecordWriter(record_path) as writer:
                for tile_jpeg, tile_rois in zip(jpegs_val, rois_val):
                    # keep only ROIs with non-zero width and height; roi format is x1y1x2y2
                    kept = [roi for roi in tile_rois
                            if abs(roi[2] - roi[0]) > 0 and abs(roi[3] - roi[1]) > 0]
                    flat_rois = np.reshape(np.array(kept, np.float32), [-1]).tolist()
                    write_tfrecord_features(writer, tile_jpeg, flat_rois, fnames_val[0])  # write TFRecord
def batch_filter_by_bool(rois, mask, max_n):
    """Apply `filter_by_bool_remove` to every (rois, mask) pair of a batch.

    Returns:
      A tuple `(rois, overflow)`: the per-item results reshaped to
      [-1, max_n, 4], and, per item, how many non-zero mask entries exceed
      `max_n` (0 when none do).
    """
    selected_count = tf.count_nonzero(mask, axis=1)
    overflow = tf.maximum(selected_count - max_n, 0)

    def _filter_item(rois_and_mask):
        item_rois, item_mask = rois_and_mask
        return filter_by_bool_remove(item_rois, item_mask, max_n=max_n)

    filtered = tf.map_fn(_filter_item, (rois, mask), dtype=tf.float32)  # shape [batch, max_n, 4]
    # Tensorflow needs a hint about the shape
    filtered = tf.reshape(filtered, [-1, max_n, 4])
    logging.log(logging.INFO, filtered)
    return filtered, overflow
def datagen_main(argv):
    """Command-line entry point for training and eval tile generation.

    Parses the command line (argparse reads ``sys.argv``; ``argv`` is not
    used), checks that the output folder and its "_eval" sibling both exist,
    then runs `run_data_generation` once for the training data and once for
    the eval data (input folder ``<data>_eval``).

    Raises:
      SystemExit: with status -1 when either output folder is missing.
    """
    # Local import: `sys.exit` replaces the `exit` helper that is only
    # injected by the `site` module and is not guaranteed to be available.
    import sys

    parser = argparse.ArgumentParser()

    def str2bool(v):
        # Only the exact string 'True' is truthy; anything else parses as False.
        return v == 'True'

    parser.add_argument('--job-dir', default="checkpoints",
                        help='Not used in datagen mode but required by ML engine')
    parser.add_argument('--data', default="sample_data/USGS_public_domain_airports",
                        help='Path to data file (can be on Google cloud storage gs://...)')
    parser.add_argument('--output-dir', default="tilecache",
                        help='Folder where generated training and eval tiles will be stored (can be on Google cloud storage gs://...)')
    parser.add_argument('--record-batch-size', default=100, type=int,
                        help='How many tiles per TFRecord file in the output')
    parser.add_argument('--shuffle-buf', default=10000, type=int,
                        help='Size of the shuffle buffer for shuffling tiles. 0 to disable shuffling.')
    parser.add_argument('--hp-data-tiles-per-gt-roi', default=100, type=int,
                        help='Data generation hyperparameter: number of training tiles generated around each ground truth ROI')
    parser.add_argument('--hp-data-rnd-distmax', default=2.0, type=float,
                        help='Data generation hyperparameter: training tiles selection max random distance from ground truth ROI (always 2.0 for eval tiles)')
    parser.add_argument('--hp-data-rnd-orientation', default=True, type=str2bool,
                        help='Data generation hyperparameter: data augmentation by rotating and flipping tiles.')
    args = parser.parse_args()

    # Eval input and output folders are derived from the training ones.
    data_eval = args.data + "_eval"
    output_dir_eval = args.output_dir + "_eval"

    if not gcsfile.file_exists(args.output_dir) or not gcsfile.file_exists(output_dir_eval):
        logging.log(logging.ERROR,
                    "Error: both the output path \"{}\" and the eval "
                    "output path \"{}\" must exist. Please create them "
                    "before starting data generation.".format(args.output_dir, output_dir_eval))
        sys.exit(-1)

    logging.log(logging.INFO, "Training data path: " + args.data)
    logging.log(logging.INFO, "Eval data path: " + data_eval)
    logging.log(logging.INFO,
                "Command-line parameters only affect training data generation. "
                "Eval data is generated with hard-coded parameters so as to offer "
                "a consistent evaluation benchmark.")

    rnd_distmax = args.hp_data_rnd_distmax
    tiles_per_gt_roi = args.hp_data_tiles_per_gt_roi
    rnd_orientation = args.hp_data_rnd_orientation

    # training and eval data generation
    run_data_generation(args.data, args.output_dir, args.record_batch_size,
                        args.shuffle_buf, tiles_per_gt_roi, rnd_distmax,
                        rnd_orientation, is_eval=False)
    run_data_generation(data_eval, output_dir_eval, args.record_batch_size,
                        args.shuffle_buf, tiles_per_gt_roi, rnd_distmax,
                        rnd_orientation, is_eval=True)
def load_data(path):
    """Load the gzipped, pickled dataset at `path` and split it into test/train.

    The images are reshaped to [batch, y, x, rgb], shuffled together with
    the labels using a fixed seed (this reseeds NumPy's global RNG with 0),
    and partitioned. The test set takes the first n // 10 samples, clamped
    to between 5000 and 10000.

    Returns:
      (test_images, test_labels, train_images, train_labels)
    """
    # loads from GCS if gs:// path, loads locally otherwise
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this on trusted data files.
    with gcsfile.FileIO(path, 'rb') as compressed:
        with gzip.GzipFile(fileobj=compressed, mode='rb') as unpacked:
            planesnet = pickle.load(unpacked)

    # unpack dictionary
    images = planesnet['data']
    labels = np.array(planesnet['labels'])
    assert len(images) == len(labels)

    # log message
    logging.log(logging.INFO, "Loaded data file " + path)

    # Images are provided as a single array of ints, by color planes first
    # and in each color plane, first row first. Reshaping to [batch, 3, 20, 20]
    # indexes as [batch, rgb, y, x]; the transpose moves rgb last -> [batch, y, x, rgb].
    images = np.reshape(images, (-1, 3, 20, 20), order="C")
    images = np.transpose(images, (0, 2, 3, 1))

    # shuffle images and labels with the same fixed-seed permutation
    np.random.seed(0)
    n = len(images)
    shuffle_order = np.random.permutation(n)
    images = images[shuffle_order]
    labels = labels[shuffle_order]

    # images stay in their original integer format (no float conversion here)

    # partition training and test data: a tenth of the data, clamped to [5000, 10000]
    test_size = min(max(n // 10, 5000), 10000)
    return (images[:test_size], labels[:test_size],
            images[test_size:], labels[test_size:])
def main(argv):
    """Parse command-line arguments, then train and evaluate the MNIST model.

    Arguments whose destination starts with ``hp_`` become the Estimator
    ``params`` (prefix stripped). ``argv`` is not read: argparse parses
    ``sys.argv``.
    """
    # You must accept a --job-dir argument when running on Cloud ML Engine.
    # It specifies where checkpoints should be saved. Additional user
    # arguments have to be specified after an empty arg -- on the command line:
    #   gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args
    #
    # Tuning notes:
    #   no batch norm: lr 0.002-0.0002-2000 is ok, over 10000 iterations
    #                  (final accuracy 0.9937 loss 2.39 job156)
    #   batch norm:    lr 0.02-0.0001-600 conv 16-32-64 trains in 3000 iteration
    #                  (final accuracy 0.9949 loss 1.466 job 159)
    option_table = [
        ('--job-dir', dict(default="checkpoints",
                           help='GCS or local path where to store training checkpoints')),
        ('--data-dir', dict(default="data",
                            help='Where training data will be loaded and unzipped')),
        ('--hp-lr0', dict(default=0.02, type=float,
                          help='Hyperparameter: initial (max) learning rate')),
        ('--hp-lr1', dict(default=0.0001, type=float,
                          help='Hyperparameter: target (min) learning rate')),
        ('--hp-lr2', dict(default=600, type=float,
                          help='Hyperparameter: learning rate decay speed in steps. Learning rate decays by exp(-1) every N steps.')),
        ('--hp-dropout', dict(default=0.3, type=float,
                              help='Hyperparameter: dropout rate on dense layers.')),
        ('--hp-conv1', dict(default=6, type=int,
                            help='Hyperparameter: depth of first convolutional layer.')),
        ('--hp-conv2', dict(default=12, type=int,
                            help='Hyperparameter: depth of second convolutional layer.')),
        ('--hp-conv3', dict(default=24, type=int,
                            help='Hyperparameter: depth of third convolutional layer.')),
        ('--hp-bnexp', dict(default=0.993, type=float,
                            help='Hyperparameter: exponential decay for batch norm moving averages.')),
        ('--hp-iterations', dict(default=3000, type=int,
                                 help='Hyperparameter: number of training iterations.')),
    ]
    parser = argparse.ArgumentParser()
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    parsed = vars(parser.parse_args())
    hparams = {key[3:]: val for key, val in parsed.items() if key.startswith('hp_')}
    otherargs = {key: val for key, val in parsed.items() if not key.startswith('hp_')}
    logging.log(logging.INFO, "Hyperparameters:" + str(sorted(hparams.items())))

    data_dir = otherargs['data_dir']
    job_dir = otherargs.pop('job_dir')

    (train_images_file, train_labels_file,
     test_images_file, test_labels_file) = load_mnist_data(data_dir)

    # Zero-argument input functions closing over the data file handles.
    train_input_fn = lambda: train_data_input_fn(train_images_file, train_labels_file)
    eval_input_fn = lambda: eval_data_input_fn(test_images_file, test_labels_file)

    training_config = tf.estimator.RunConfig(model_dir=job_dir,
                                             save_summary_steps=10,
                                             save_checkpoints_steps=200)
    estimator = tf.estimator.Estimator(model_fn=conv_model,
                                       model_dir=job_dir,
                                       params=hparams,
                                       config=training_config)

    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=hparams['iterations'])
    export_latest = tf.estimator.LatestExporter("mnist-model",
                                                serving_input_receiver_fn=serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(eval_input_fn, steps=1,
                                      exporters=export_latest, throttle_secs=60)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def main(argv):
    """Entry point: read the command line, build the Estimator, train and evaluate.

    Every ``--hp-*`` option ends up (without its ``hp_`` prefix) in the
    ``params`` dict given to the Estimator. ``argv`` is not read: argparse
    parses ``sys.argv``.
    """
    parser = argparse.ArgumentParser()

    def add_option(flag, default, help_text, kind=None):
        # Register one option; `type=` is only passed when a converter is given.
        if kind is None:
            parser.add_argument(flag, default=default, help=help_text)
        else:
            parser.add_argument(flag, default=default, type=kind, help=help_text)

    # You must accept a --job-dir argument when running on Cloud ML Engine.
    # It specifies where checkpoints should be saved. Additional user
    # arguments have to be specified after an empty arg -- on the command line:
    #   gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args
    #
    # Tuning notes:
    #   no batch norm: lr 0.002-0.0002-2000 is ok, over 10000 iterations
    #                  (final accuracy 0.9937 loss 2.39 job156)
    #   batch norm:    lr 0.02-0.0001-600 conv 16-32-64 trains in 3000 iteration
    #                  (final accuracy 0.9949 loss 1.466 job 159)
    add_option('--job-dir', "checkpoints",
               'GCS or local path where to store training checkpoints')
    add_option('--data-dir', "data",
               'Where training data will be loaded and unzipped')
    add_option('--hp-lr0', 0.02,
               'Hyperparameter: initial (max) learning rate', float)
    add_option('--hp-lr1', 0.0001,
               'Hyperparameter: target (min) learning rate', float)
    add_option('--hp-lr2', 600,
               'Hyperparameter: learning rate decay speed in steps. Learning rate decays by exp(-1) every N steps.', float)
    add_option('--hp-dropout', 0.3,
               'Hyperparameter: dropout rate on dense layers.', float)
    add_option('--hp-conv1', 6,
               'Hyperparameter: depth of first convolutional layer.', int)
    add_option('--hp-conv2', 12,
               'Hyperparameter: depth of second convolutional layer.', int)
    add_option('--hp-conv3', 24,
               'Hyperparameter: depth of third convolutional layer.', int)
    add_option('--hp-bnexp', 0.993,
               'Hyperparameter: exponential decay for batch norm moving averages.', float)
    add_option('--hp-iterations', 3000,
               'Hyperparameter: number of training iterations.', int)

    # Partition the parsed values by the 'hp_' prefix of their names.
    hparams, otherargs = {}, {}
    for name, value in parser.parse_args().__dict__.items():
        if name.startswith('hp_'):
            hparams[name[3:]] = value
        else:
            otherargs[name] = value
    logging.log(logging.INFO, "Hyperparameters:" + str(sorted(hparams.items())))

    data_dir = otherargs['data_dir']
    job_dir = otherargs.pop('job_dir')

    mnist_files = load_mnist_data(data_dir)
    train_images_file, train_labels_file, test_images_file, test_labels_file = mnist_files

    def feed_training_data():
        return train_data_input_fn(train_images_file, train_labels_file)

    def feed_eval_data():
        return eval_data_input_fn(test_images_file, test_labels_file)

    run_config = tf.estimator.RunConfig(
        model_dir=job_dir, save_summary_steps=10, save_checkpoints_steps=200)
    estimator = tf.estimator.Estimator(
        model_fn=conv_model, model_dir=job_dir, params=hparams, config=run_config)

    train_spec = tf.estimator.TrainSpec(feed_training_data, max_steps=hparams['iterations'])
    exporter = tf.estimator.LatestExporter(
        "mnist-model", serving_input_receiver_fn=serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        feed_eval_data, steps=1, exporters=exporter, throttle_secs=60)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def start_training(output_dir, hparams, data, tiledata, **kwargs):
    """Train, evaluate and export the model with a TPUEstimator.

    Exactly one of `data` (folder of full aerial imagery) and `tiledata`
    (folder of TFRecords with individual tiles) must be a non-empty string;
    otherwise an error is logged and the function returns without training.
    Eval data is read from the same location with an "_eval" suffix.

    `kwargs` must provide 'tpu', 'tpu_zone' and 'gcp_project' when
    hparams['use_tpu'] is truthy.
    """
    # YOLO configuration for ROI assignments (eval always uses cell_grow 1.0)
    grid_nn, cell_n, cell_swarm = hparams["grid_nn"], hparams["cell_n"], hparams["cell_swarm"]
    yolo_cfg = datagen.YOLOConfig(grid_nn, cell_n, cell_swarm, hparams["cell_grow"])
    eval_yolo_cfg = datagen.YOLOConfig(grid_nn, cell_n, cell_swarm, 1.0)

    use_tiles = tiledata != "" and data == ""
    use_images = data != "" and tiledata == ""

    # Data source selection. The input functions take `params` because the
    # estimator supplies the batch size through params['batch_size'].
    if use_tiles:
        # training from tfrecords
        tfrec_filelist = gcsfile.get_matching_files(tiledata + "/*.tfrecord")

        def train_data_input_fn(params):
            return datagen.train_dataset_from_tfrecords(
                tfrec_filelist, params['batch_size'], hparams["shuffle_buf"],
                yolo_cfg, hparams["data_rnd_hue"], hparams["data_rnd_orientation"],
                hparams["data_cache_n_epochs"])

        tfrec_filelist_eval = gcsfile.get_matching_files(tiledata + "_eval" + "/*.tfrecord")

        def eval_data_input_fn(params):
            return datagen.eval_dataset_from_tfrecords(
                tfrec_filelist_eval, params['batch_size'], eval_yolo_cfg)
    elif use_images:
        # training from aerial imagery directly
        img_filelist, roi_filelist = datagen.load_file_list(data)

        def train_data_input_fn(params):
            return datagen.train_dataset_from_images(
                img_filelist, roi_filelist, params['batch_size'], hparams["shuffle_buf"],
                yolo_cfg, hparams["data_rnd_hue"], hparams["data_rnd_orientation"],
                hparams["data_tiles_per_gt_roi"], hparams["data_rnd_distmax"],
                hparams["data_cache_n_epochs"])

        img_filelist_eval, roi_filelist_eval = datagen.load_file_list(data + "_eval")

        def eval_data_input_fn(params):
            return datagen.eval_dataset_from_images(
                img_filelist_eval, roi_filelist_eval, params['batch_size'], eval_yolo_cfg)
    else:
        logging.log(
            logging.ERROR,
            "One and only one of parameters 'data' and 'tiledata' must be supplied."
        )
        return

    # Estimator configuration. The earlier tf.estimator.Estimator /
    # train_and_evaluate setup (LatestExporter, TrainSpec, EvalSpec, optional
    # MirroredStrategy for multiple GPUs) was replaced by this TPU setup.
    cluster_resolver = None
    if hparams['use_tpu']:
        cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            kwargs['tpu'], kwargs['tpu_zone'], kwargs['gcp_project'])

    training_config = tf.contrib.tpu.RunConfig(
        model_dir=output_dir,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False),
        tpu_config=tf.contrib.tpu.TPUConfig(hparams['tpu_iterations'], 8),  # 8 cores in a TPU board
        cluster=cluster_resolver)

    # export_to_tpu=False: we do not need the TPU graph in the exported model
    # since we will be serving it from CPUs/GPUs. Also, without
    # export_to_tpu=False, TPUEstimator.export_saved_model crashes (TF1.12 and earlier).
    batch = hparams['batch']  # TPU constraint: batch sizes must be the same (?)
    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=model.model_fn,
        model_dir=output_dir,
        params=hparams,
        train_batch_size=batch,
        eval_batch_size=batch,
        config=training_config,
        use_tpu=hparams['use_tpu'],
        export_to_tpu=False)

    # Train in chunks of at most TPU_EVAL_EVERY_STEPS steps, evaluating after
    # each chunk.
    # NOTE(review): the original formatting was collapsed; the export is
    # assumed to run after every chunk, like the evaluation - confirm.
    TPU_EVAL_EVERY_STEPS = 10000  # only one eval at the end
    steps_left = hparams["iterations"]
    while steps_left > 0:
        chunk = min(TPU_EVAL_EVERY_STEPS, steps_left)
        estimator.train(train_data_input_fn, steps=chunk)
        estimator.evaluate(input_fn=eval_data_input_fn,
                           steps=hparams['eval_iterations'])
        estimator.export_savedmodel(os.path.join(output_dir, "planespotting"),
                                    serving_input_fn)
        steps_left -= chunk
# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np import tensorflow as tf from tensorflow.python.platform import tf_logging as logging from tensorflow.examples.tutorials.mnist import input_data as mnist_data import argparse import math import sys logging.set_verbosity(logging.INFO) logging.log(logging.INFO, "Tensorflow version " + tf.__version__) # # To run this: see README.md # # Called when the model is deployed for online predictions on Cloud ML Engine. def serving_input_fn(): inputs = {'image': tf.placeholder(tf.float32, [None, 28, 28])} # Here, you can transform the data received from the API call features = inputs return tf.estimator.export.ServingInputReceiver(features, inputs) # In memory training data for this simple case. # When data is too large to fit in memory, use Tensorflow queues.
def log(msg): tf_logging.log(tf_logging.FATAL, msg) # FATAL to show up at any TF logging level logging.getLogger('DeepBugHunter').info(msg)
def main(argv):
    """Parse command-line arguments and run training through learn_runner.

    ``--hp-*`` options become the hyperparameter dict (``hp_`` prefix
    stripped); the remaining options except ``--job-dir`` are forwarded as
    keyword arguments to the experiment factory. ``argv`` is not read:
    argparse parses ``sys.argv``.
    """
    training_config = tf.contrib.learn.RunConfig(save_checkpoints_secs=None,
                                                 save_checkpoints_steps=500)

    # Bug, exports_to_keep=None is necessary, otherwise this crashes under Python 3
    make_export_strategy = tf.contrib.learn.utils.saved_model_export_utils.make_export_strategy
    export_strategy = make_export_strategy(serving_input_fn=serving_input_fn,
                                           exports_to_keep=None)

    # The Experiment is an Estimator with data loading functions and other parameters
    def experiment_fn_with_params(output_dir, hparams, data, **kwargs):
        # load data
        test_images, test_labels, train_images, train_labels = load_data(data)

        def feed_train():
            return train_data_input_fn(train_images, train_labels)

        def feed_eval():
            return eval_data_input_fn(test_images, test_labels)

        estimator = tf.estimator.Estimator(model_fn=model.model_fn,
                                           model_dir=output_dir,
                                           config=training_config,
                                           params=hparams)
        # Compatibility warning: Experiment will move out of contrib in 1.4
        return tf.contrib.learn.Experiment(
            estimator=estimator,
            train_input_fn=feed_train,
            eval_input_fn=feed_eval,
            train_steps=hparams["iterations"],
            eval_steps=1,
            min_eval_frequency=100,
            export_strategies=export_strategy)

    # mandatory arguments format for ML Engine:
    #   gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args
    option_table = [
        ('--job-dir', dict(default="checkpoints",
                           help='GCS or local path where to store training checkpoints')),
        ('--data', dict(default="planesnet32K.pklz",
                        help='Path to data file (can be on Google cloud storage gs://...)')),
        ('--hp-iterations', dict(default=80000, type=int,
                                 help='Hyperparameter: number of training iterations')),
        ('--hp-lr0', dict(default=0.01, type=float,
                          help='Hyperparameter: initial (max) learning rate')),
        ('--hp-lr1', dict(default=0.0001, type=float,
                          help='Hyperparameter: target (min) learning rate')),
        ('--hp-lr2', dict(default=800, type=float,
                          help='Hyperparameter: learning rate decay speed in steps. Learning rate decays by exp(-1) every N steps.')),
        ('--hp-dropout', dict(default=0.3, type=float,
                              help='Hyperparameter: dropout rate on dense layers.')),
        ('--hp-filter-sizes', dict(default='S',
                                   help='Hyperparameter: convolutional filter sizes S, M, L.')),
        ('--hp-conv1', dict(default=16, type=int,
                            help='Hyperparameter: depth of first convolutional layer. Depth then doubles at each layer.')),
        ('--hp-bnexp', dict(default=0.993, type=float,
                            help='Hyperparameter: exponential decay for batch norm moving averages.')),
        ('--hp-dense', dict(default=80, type=int,
                            help='Hyperparameter: size of the dense layer')),
    ]
    parser = argparse.ArgumentParser()
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    parsed = vars(parser.parse_args())
    hparams = {name[3:]: value for name, value in parsed.items() if name.startswith('hp_')}
    otherargs = {name: value for name, value in parsed.items() if not name.startswith('hp_')}

    logging.log(logging.INFO, "Hyperparameters:" + str(sorted(hparams.items())))
    logging.log(logging.INFO, "Other parameters:" + str(sorted(otherargs.items())))

    output_dir = otherargs.pop('job_dir')

    # learn_runner expects a function of the output directory only.
    def experiment_fn(output_dir):
        return experiment_fn_with_params(output_dir, hparams, **otherargs)

    tf.contrib.learn.learn_runner.run(experiment_fn, output_dir)
def main(argv):
    """Parse the trainer's command line and hand over to `start_training`.

    ``--hp-*`` options become the hyperparameter dict (``hp_`` prefix
    stripped); ``--job-dir`` becomes the output directory; every other
    option is forwarded to `start_training` as a keyword argument.
    ``argv`` is not read: argparse parses ``sys.argv``.
    """
    def str2bool(v):
        # Only the exact string 'True' is truthy.
        return v == 'True'

    # mandatory arguments format for ML Engine:
    #   gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args
    option_table = [
        ('--job-dir', dict(
            default="checkpoints",
            help='GCS or local path where to store training checkpoints')),
        ('--data', dict(
            default="",
            help='Path to training data folder containing full-scale aerial imagery (can be on Google cloud storage gs://...). Eval data should be in a folder with the same name and and _eval suffix.')),
        ('--tiledata', dict(
            default="",
            help='Path to training data folder containing image tiles (can be on Google cloud storage gs://...). Eval data should be in a folder with the same name and and _eval suffix.')),
        ('--hp-iterations', dict(
            default=25000, type=int,
            help='Hyperparameter: number of training iterations')),
        ('--hp-batch-size', dict(
            default=10, type=int,
            help='Hyperparameter: training batch size')),
        ('--hp-eval-batch-size', dict(
            default=32, type=int,
            help='Hyperparameter: evaluation batch size')),
        # eval dataset is 8380 tiles (262 batches of 32) - larger batch will OOM.
        ('--hp-eval-iterations', dict(
            default=262, type=int,
            help='Hyperparameter: eval iterations')),
        ('--hp-shuffle-buf', dict(
            default=10000, type=int,
            help='Hyperparameter: data shuffle buffer size')),
        ('--hp-layers', dict(
            default=12, type=int,
            help='Hyperparameter: number of layers')),
        ('--hp-first-layer-filter-size', dict(
            default=6, type=int,
            help='Hyperparameter: filter size in first layer')),
        ('--hp-first-layer-filter-stride', dict(
            default=2, type=int,
            help='Hyperparameter: filter stride in first layer')),
        ('--hp-first-layer-filter-depth', dict(
            default=32, type=int,
            help='Hyperparameter: the number of filters in the first and last layers')),
        ('--hp-depth-increment', dict(
            default=5, type=int,
            help='Hyperparameter: increment the decrement filter depth by this amount between first and last layer')),
        ('--hp-grid-nn', dict(
            default=16, type=int,
            help='Hyperparameter: size of YOLO grid: grid-nn x grid-nn')),
        ('--hp-cell-n', dict(
            default=2, type=int,
            help='Hyperparameter: number of ROIs detected per YOLO grid cell')),
        ('--hp-cell-swarm', dict(
            default=True, type=str2bool,
            help='Hyperparameter: ground truth ROIs selection algorithm. The better swarm algorithm is only implemented for cell_n=2')),
        ('--hp-cell-grow', dict(
            default=1.3, type=float,
            help='Hyperparameter: ROIs allowed to be cetered beyond grid cell by this factor')),
        ('--hp-lr0', dict(
            default=0.01, type=float,
            help='Hyperparameter: initial (max) learning rate')),
        ('--hp-lr1', dict(
            default=0.0001, type=float,
            help='Hyperparameter: target (min) learning rate')),
        ('--hp-lr2', dict(
            default=3000, type=float,
            help='Hyperparameter: learning rate decay period in steps. Only used when the decay type is "exponential". For "cosine-restarts", the first decay period is always iterations/8.')),
        ('--hp-decay-type', dict(
            default="exponential", choices=["exponential", "cosine-restarts"],
            help='Hyperparameter: learning rate decay type. "exponential" (default) or "cosine-restarts".')),
        ('--hp-decay-restarts', dict(
            default=3, type=int, choices=range(0, 6),
            help='Hyperparameter: learning rate decay restarts over the entire training. Only used when decay-type is "cosine-restarts". The learning rate always decays to its min value at the end of "iterations" and the first restart is always at iterations/8.')),
        ('--hp-decay-restart-height', dict(
            default=0.99, type=float,
            help='Hyperparameter: learning rate restart value as a fraction of the previous max learning rate. Only used when decay-type is "cosine-restarts"')),
        ('--hp-dropout', dict(
            default=0.0, type=float,
            help='Hyperparameter: dropout rate. It should be between 0.0 and 0.5. 0.0 for no dropout.')),
        ('--hp-spatial-dropout', dict(
            default=True, type=str2bool,
            help='Hyperparameter: dropout type, spatial or ordinary. Spatial works better in convolutional networks.')),
        ('--hp-bnexp', dict(
            default=0.993, type=float,
            help='Hyperparameter: exponential decay for batch norm moving averages.')),
        ('--hp-lw1', dict(
            default=1, type=float,
            help='Hyperparameter: loss weight LW1')),
        ('--hp-lw2', dict(
            default=3, type=float,
            help='Hyperparameter: loss weight LW2')),
        ('--hp-lw3', dict(
            default=30, type=float,
            help='Hyperparameter: loss weight LW3')),
        # hyperparameters for training data generation when training from
        # large photos directly. They do not affect test data.
        ('--hp-data-tiles-per-gt-roi', dict(
            default=166, type=int,
            help='Data generation hyperparameter: number of training tiles generated around each ground truth ROI')),
        ('--hp-data-rnd-distmax', dict(
            default=2.0, type=float,
            help='Data generation hyperparameter: training tiles selection max random distance from ground truth ROI (always 2.0 for eval tiles)')),
        ('--hp-data-rnd-hue', dict(
            default=True, type=str2bool,
            help='Data generation hyperparameter: data augmentation with random hue on training images')),
        ('--hp-data-rnd-orientation', dict(
            default=True, type=str2bool,
            help='Data generation hyperparameter: data augmentation by rotating and flipping tiles.')),
        ('--hp-data-cache-n-epochs', dict(
            default=0, type=int,
            help='Generate random data variations for n epochs then cache and reuse.')),
    ]

    parser = argparse.ArgumentParser()
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    # Partition the parsed values by the 'hp_' prefix of their names.
    hparams, otherargs = {}, {}
    for name, value in vars(parser.parse_args()).items():
        if name.startswith('hp_'):
            hparams[name[3:]] = value
        else:
            otherargs[name] = value

    logging.log(logging.INFO, "Hyperparameters:" + str(sorted(hparams.items())))
    logging.log(logging.INFO, "Other parameters:" + str(sorted(otherargs.items())))

    output_dir = otherargs.pop('job_dir')
    start_training(output_dir, hparams, **otherargs)
def PrintAndLog(msg, lvl=tf_logging.INFO):
    """Record ``msg`` through the TensorFlow logger and echo it on stdout.

    Args:
        msg: the message to log and print.
        lvl: level handed to ``tf_logging.log``; defaults to
            ``tf_logging.INFO``.
    """
    # The logger call comes first, the stdout echo second.
    tf_logging.log(lvl, msg)
    print(msg)
def YOLO_head(x, mode, params, info, grid_nn, cell_n):
    """YOLO (You Only Look Once) bounding box head.

    Divides each image into a grid_nn x grid_nn grid and predicts cell_n
    bounding boxes per grid cell.

    Args:
        x: feature map coming out of the model core. Its channel count must be
            divisible by 6 because it is split into 6 equal slices below.
        mode: forwarded to conv1x1_batch_norm.
        params: hyperparameter dict; forwarded to conv1x1_batch_norm, and
            params["cell_grow"] is read here to scale the x/y offsets.
        info: layer statistics accumulator threaded through _layer_stats.
        grid_nn: grid size; only 48 is accepted (see the assert below).
        cell_n: number of boxes predicted per grid cell.

    Returns:
        Tuple (box_x, box_y, box_w, box_h, box_c, box_c_logits, info) where
        box_c is the softmax of box_c_logits and box_c_logits has been reshaped
        to [-1, grid_nn, grid_nn, cell_n, 2].

    Raises:
        AssertionError: if grid_nn is not 48.
    """
    # Only grid_nn == 48 is supported, which makes pool_size below always 1.
    assert grid_nn == 48
    pool_size = 48 // grid_nn
    # Average pooling down to the grid size.
    # for GRID_N=48, need pool_size=1, strides=1 (no pooling)
    y = tf.layers.average_pooling2d(
        x, pool_size=pool_size, strides=pool_size,
        padding="valid")  # author's note: [batch, grid_nn, grid_nn, cell_n*32] -- TODO confirm channel count
    info = _layer_stats(info, "YOLO head, avg pool", y, 0, 0)
    # for each cell, this has CELL_B predictions of bounding box (x,y,w,h,c)
    # apply tanh for x, y, sigmoid for w,h, softmax for c
    # TODO: idea: batch norm may be bad on this layer
    # TODO: try with a deeper layer as well
    # TODO: try a filtered convolution instead of pooling2d, maybe info from cell sides should be weighted differently
    box_xr, box_yr, box_wr, box_hr, box_c0, box_c1 = tf.split(
        y, 6, axis=-1)  # 6 equal channel slices: raw x, y, w, h and two slices feeding the class logits
    box_x = tf.nn.tanh(conv1x1_batch_norm(
        box_xr, mode, params,
        depth=cell_n))  # shape [batch, grid_nn, grid_nn, cell_n]
    box_y = tf.nn.tanh(conv1x1_batch_norm(
        box_yr, mode, params,
        depth=cell_n))  # shape [batch, grid_nn, grid_nn, cell_n]
    box_w = tf.nn.sigmoid(
        conv1x1_batch_norm(
            box_wr, mode, params,
            depth=cell_n))  # shape [batch, grid_nn, grid_nn, cell_n]
    box_h = tf.nn.sigmoid(
        conv1x1_batch_norm(
            box_hr, mode, params,
            depth=cell_n))  # shape [batch, grid_nn, grid_nn, cell_n]
    box_c = tf.concat([box_c0, box_c1], axis=-1)
    # no batch norm before softmax
    # TODO: really no batch norm here ? What kind of batch norm could work ?
    box_c_logits = conv1x1(
        box_c, depth=cell_n * 2
    )  # shape [batch, grid_nn, grid_nn, cell_n*2], 2 = number of classes (object / no object)
    box_all = tf.concat([box_x, box_y, box_w, box_h, box_c_logits], axis=-1)
    # Weight count reported for this layer: the four box convolutions (counted
    # as 4x the x-branch) plus the class-logit convolution.
    info = _layer_stats(
        info, "YOLO head, box XYWHC", box_all, 1,
        4 * _count_conv_weights(box_xr, box_x, 1) +
        _count_conv_weights(box_c, box_c_logits, 1))
    # Give each of the cell_n boxes its own pair of class logits.
    box_c_logits = tf.reshape(box_c_logits, [-1, grid_nn, grid_nn, cell_n, 2])
    box_c = tf.nn.softmax(
        box_c_logits)  # shape [batch, GRID_N,GRID_N,CELL_B,2]
    #box_c_noplane, box_c_plane = tf.unstack(box_c, axis=-1)
    # Leave some breathing room to the roi sizes so that rois from adjacent cells can reach into this one.
    # This prevents training from punishing cells that do see an object but are not assigned any because
    # the object is centered in an adjacent cell very close to the limit. A ground truth box that is slightly
    # off could change cell ownership of an object while not changing anything about the underlying pixels.
    box_x = box_x * 1.0 * params["cell_grow"]
    box_y = box_y * 1.0 * params["cell_grow"]
    # Debug: log the tensor objects built above (logged when the graph is built).
    logging.log(logging.INFO, y)
    logging.log(logging.INFO, box_x)
    logging.log(logging.INFO, box_y)
    logging.log(logging.INFO, box_w)
    logging.log(logging.INFO, box_h)
    logging.log(logging.INFO, box_c)
    logging.log(logging.INFO, box_c_logits)
    return box_x, box_y, box_w, box_h, box_c, box_c_logits, info
def model_fn(features, labels, mode, params):
    """Estimator model function: the model, with loss, metrics and debug summaries.

    Builds the network core and the YOLO head, derives the predicted and
    detected ROIs and, in every mode except PREDICT, the loss, the train op,
    the TensorBoard summaries and (EVAL only) the evaluation metrics.

    Args:
        features: dict; features["image"] is read as the input image batch.
        labels: dict; labels["target_rois"] and labels["yolo_target_rois"] are
            read when mode is not PREDICT.
        mode: one of the tf.estimator.ModeKeys values.
        params: hyperparameter dict. Keys read here: "grid_nn", "cell_n",
            "lw1", "lw2", "lw3". It is also forwarded to the model core, the
            YOLO head and learn_rate_decay.

    Returns:
        A tf.estimator.EstimatorSpec whose predictions are "rois" and
        "rois_confidence".
    """
    # YOLO parameters
    grid_nn = params["grid_nn"]  # each tile is divided into a grid_nn x grid_nn grid
    cell_n = params["cell_n"]  # each grid cell predicts cell_n bounding boxes.
    info = None

    # Tile edge length in pixels; the input is reshaped to tile_size x tile_size x 3.
    tile_size = 768
    # Cap handed to box.remove_empty_rois for the detected boxes.
    max_detected_rois = 50

    # model inputs
    X = tf.to_float(features["image"]) / 255.0  # input image format is uint8 with range 0 to 255
    X = tf.reshape(X, [-1, tile_size, tile_size, 3])

    # The model itself is here
    # (alternative core: model_core_squeezenet17(X, mode, params, info))
    Y, info = model_core_squeezenet12(X, mode, params, info)
    logging.debug(X.shape)

    # YOLO head: predicts bounding boxes around ships
    box_x, box_y, box_w, box_h, box_c, box_c_logits, info = layer.YOLO_head(
        Y, mode, params, info, grid_nn, cell_n)

    # Debug: print the model structure
    if mode == tf.estimator.ModeKeys.TRAIN:
        logging.log(logging.INFO, info["description"])
        logging.log(logging.INFO,
                    "NN {} layers / {:,d} total weights".format(info["layers"], info["weights"]))

    # Class index 1 of the softmax output is used as the "ship" confidence.
    box_c_sim = box_c[:, :, :, :, 1]
    DETECTION_TRESHOLD = 0.5  # ship "detected" if predicted C>0.5
    # One mask shared by width and height; each branch falls back to zeros
    # shaped like its own tensor.
    is_detected = tf.greater(box_c_sim, DETECTION_TRESHOLD)
    detected_w = tf.where(is_detected, box_w, tf.zeros_like(box_w))
    detected_h = tf.where(is_detected, box_h, tf.zeros_like(box_h))

    # all rois with confidence factors
    predicted_rois = tf.stack([box_x, box_y, box_w, box_h], axis=-1)  # shape [batch, GRID_N, GRID_N, CELL_B, 4]
    predicted_rois = box.grid_cell_to_tile_coords(predicted_rois, grid_nn, tile_size) / tile_size
    predicted_rois = tf.reshape(predicted_rois, [-1, grid_nn*grid_nn*cell_n, 4])
    predicted_c = tf.reshape(box_c_sim, [-1, grid_nn*grid_nn*cell_n])

    # only the rois where a ship was detected
    detected_rois = tf.stack([box_x, box_y, detected_w, detected_h], axis=-1)  # shape [batch, GRID_N, GRID_N, CELL_B, 4]
    detected_rois = box.grid_cell_to_tile_coords(detected_rois, grid_nn, tile_size) / tile_size
    detected_rois = tf.reshape(detected_rois, [-1, grid_nn*grid_nn*cell_n, 4])
    detected_rois, detected_rois_overflow = box.remove_empty_rois(detected_rois, max_detected_rois)

    # These stay None in PREDICT mode.
    loss = train_op = eval_metrics = None
    if mode != tf.estimator.ModeKeys.PREDICT:
        # Target labels
        # Ground truth boxes. Used to compute IOU accuracy and display debug ground truth boxes.
        target_rois = labels["target_rois"]  # author's note: [batch, MAX_TARGET_ROIS_PER_TILE, x1y1x2y2]
        # Ground truth boxes assigned to YOLO grid cells. Used to compute loss.
        target_rois_yolo = labels["yolo_target_rois"]  # author's note: [batch, GRID_N, GRID_N, CELL_B, xywh]
        target_x, target_y, target_w, target_h = tf.unstack(target_rois_yolo, axis=-1)

        # target probability is 1 if there is a corresponding target box, 0 otherwise
        # (a target box is recognised by a width above 0.0001)
        target_is_ship = tf.greater(target_w, 0.0001)
        target_is_ship_onehot = tf.one_hot(tf.cast(target_is_ship, tf.int32), 2, dtype=tf.float32)
        target_is_ship_float = tf.cast(target_is_ship, tf.float32)

        # Mistakes and correct detections for visualisation and debugging.
        # This is computed against the ground truth boxes assigned to YOLO grid cells.
        mistakes, size_correct, position_correct, all_correct = box.compute_mistakes(
            box_x, box_y, box_w, box_h, box_c_sim,
            target_x, target_y, target_w, target_h, target_is_ship, grid_nn)
        debug_img = imgdbg.debug_image(X, mistakes, target_rois, predicted_rois, predicted_c,
                                       size_correct, position_correct, all_correct,
                                       grid_nn, cell_n, tile_size)

        # IOU accuracy is only built in EVAL mode; it is only consumed by the
        # EVAL metrics below.
        if mode == tf.estimator.ModeKeys.EVAL:
            iou_accuracy = box.compute_safe_IOU(target_rois, detected_rois, detected_rois_overflow, tile_size)

        # Loss function: position and size errors only count where a target box exists.
        position_loss = tf.reduce_mean(target_is_ship_float *
                                       (tf.square(box_x - target_x) + tf.square(box_y - target_y)))
        size_loss = tf.reduce_mean(target_is_ship_float * tf.square(box_w - target_w) * 2 +
                                   target_is_ship_float * tf.square(box_h - target_h) * 2)
        obj_loss = tf.losses.softmax_cross_entropy(target_is_ship_onehot, box_c_logits)

        # YOLO trick: weights the different losses differently
        # (lw1 -> detection, lw2 -> position, lw3 -> size, normalised by their sum)
        loss_weight_total = (params['lw1'] + params['lw2'] + params['lw3']) * 1.0  # 1.0 to force conversion to float
        w_obj_loss = obj_loss*(params['lw1'] / loss_weight_total)
        w_position_loss = position_loss*(params['lw2'] / loss_weight_total)
        w_size_loss = size_loss*(params['lw3'] / loss_weight_total)
        loss = w_position_loss + w_size_loss + w_obj_loss

        # Sum of the mistakes tensor over the whole batch.
        nb_mistakes = tf.reduce_sum(mistakes)

        lr = learn_rate_decay(tf.train.get_or_create_global_step(), params)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = tf.contrib.training.create_train_op(loss, optimizer)

        if mode == tf.estimator.ModeKeys.EVAL:
            # metrics removed from training mode because they are not yet supported with MirroredStrategy
            eval_metrics = {"position_error": tf.metrics.mean(w_position_loss),
                            "size_error": tf.metrics.mean(w_size_loss),
                            "ship_cross_entropy_error": tf.metrics.mean(w_obj_loss),
                            "mistakes": tf.metrics.mean(nb_mistakes),
                            'IOU': tf.metrics.mean(iou_accuracy)}

        # Tensorboard summaries for debugging
        tf.summary.scalar("position_error", w_position_loss)
        tf.summary.scalar("size_error", w_size_loss)
        tf.summary.scalar("ship_cross_entropy_error", w_obj_loss)
        tf.summary.scalar("loss", loss)
        tf.summary.image("input_image", debug_img, max_outputs=20)
        tf.summary.scalar("learning_rate", lr)
        # a summary on iou_accuracy would be nice but it goes Out Of Memory

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={"rois": predicted_rois, "rois_confidence": predicted_c},  # name these fields as you like
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metrics,
        export_outputs={'classes': tf.estimator.export.PredictOutput(
            {"rois": predicted_rois, "rois_confidence": predicted_c})}
    )
def model_fn(features, labels, mode, params):
    """Estimator model function: the model, with loss, metrics and debug summaries.

    Args:
        features: dict; features["image"] is read as the input image batch.
        labels: dict; "count", "target_rois" and "yolo_target_rois" are read
            when mode is not PREDICT.
        mode: one of the tf.estimator.ModeKeys values.
        params: hyperparameter dict. Keys read here: "grid_nn", "cell_n",
            "lw1", "lw2", "lw3". It is also forwarded to the model core, the
            YOLO head and learn_rate_decay.

    Returns:
        A tf.estimator.EstimatorSpec whose predictions are "rois" and
        "rois_confidence". The exported "rois" go through box.swap_xy.
    """
    # YOLO parameters
    grid_nn = params["grid_nn"]  # each tile is divided into a grid_nn x grid_nn grid
    cell_n = params["cell_n"]  # each grid cell predicts cell_n bounding boxes.
    info = None

    # model inputs
    X = tf.to_float(features["image"]) / 255.0  # input image format is uint8 with range 0 to 255

    # The model itself is here
    #Y, info = model_core_squeezenet12(X, mode, params, info)
    #Y, info = model_core_squeezenet17(X, mode, params, info)
    #Y, info = model_core_darknet(X, mode, params, info)
    #Y, info = model_core_darknet17(X, mode, params, info)
    Y, info = model_core_configurable_squeezenet(X, mode, params, info)

    # YOLO head: predicts bounding boxes around airplanes
    # NOTE(review): six values are unpacked here (no box_h); confirm that the
    # layer.YOLO_head this file imports returns exactly six.
    box_x, box_y, box_w, box_c, box_c_logits, info = layer.YOLO_head(Y, mode, params, info, grid_nn, cell_n)

    # Debug: print the model structure
    if mode == tf.estimator.ModeKeys.TRAIN:
        logging.log(logging.INFO, info["description"])
        logging.log(logging.INFO, "NN {} layers / {:,d} total weights".format(info["layers"], info["weights"]))

    # TODO: refactor predicted_rois and predicted_c (or keep it to keep the code compatible with confidence factor implem?)
    # with the current softmax implementation, confidence factors are either 0 or 1
    # because they are the argmax over the class axis, cast to float.
    box_c_sim = tf.cast(tf.argmax(box_c, axis=-1), dtype=tf.float32)  # shape [batch, GRID_N,GRID_N,CELL_B]
    DETECTION_TRESHOLD = 0.5  # plane "detected" if predicted C>0.5
    # Zero out the size of every box whose confidence is not above the threshold.
    detected_w = tf.where(tf.greater(box_c_sim, DETECTION_TRESHOLD), box_w, tf.zeros_like(box_w))

    # all rois with confidence factors
    predicted_rois = tf.stack([box_x, box_y, box_w], axis=-1)  # shape [batch, GRID_N, GRID_N, CELL_B, 3]
    # presumably grid_cell_to_tile_coords turns the 3 values into 4 coordinates,
    # since the reshape below expects a last dimension of 4 -- TODO confirm
    predicted_rois = box.grid_cell_to_tile_coords(predicted_rois, grid_nn, settings.TILE_SIZE) / settings.TILE_SIZE
    predicted_rois = tf.reshape(predicted_rois, [-1, grid_nn*grid_nn*cell_n, 4])
    predicted_c = tf.reshape(box_c_sim, [-1, grid_nn*grid_nn*cell_n])

    # only the rois where a plane was detected
    detected_rois = tf.stack([box_x, box_y, detected_w], axis=-1)  # shape [batch, GRID_N, GRID_N, CELL_B, 3]
    detected_rois = box.grid_cell_to_tile_coords(detected_rois, grid_nn, settings.TILE_SIZE) / settings.TILE_SIZE
    detected_rois = tf.reshape(detected_rois, [-1, grid_nn*grid_nn*cell_n, 4])
    detected_rois, detected_rois_overflow = box.remove_empty_rois(detected_rois, settings.MAX_DETECTED_ROIS_PER_TILE)

    # These stay None in PREDICT mode.
    loss = train_op = eval_metrics = None
    if mode != tf.estimator.ModeKeys.PREDICT:
        # Target labels
        target_count = labels["count"]  # not used
        # Ground truth boxes. Used to compute IOU accuracy and display debug ground truth boxes.
        target_rois = labels["target_rois"]  # author's note: [batch, MAX_TARGET_ROIS_PER_TILE, x1y1x2y2]
        # Ground truth boxes assigned to YOLO grid cells. Used to compute loss.
        target_rois_yolo = labels["yolo_target_rois"]  # author's note: [batch, GRID_N, GRID_N, CELL_B, xyw]
        target_x, target_y, target_w = tf.unstack(target_rois_yolo, 3, axis=-1)  # 3 tensors, author's note: each [batch, GRID_N, GRID_N, CELL_B]

        # target probability is 1 if there is a corresponding target box, 0 otherwise
        # (a target box is recognised by a width above 0.0001)
        target_is_plane = tf.greater(target_w, 0.0001)
        target_is_plane_onehot = tf.one_hot(tf.cast(target_is_plane, tf.int32), 2, dtype=tf.float32)
        target_is_plane_float = tf.cast(target_is_plane, tf.float32)  # author's note: [batch, GRID_N, GRID_N, CELL_B]

        # Mistakes and correct detections for visualisation and debugging.
        # This is computed against the ground truth boxes assigned to YOLO grid cells.
        mistakes, size_correct, position_correct, all_correct = box.compute_mistakes(box_x, box_y, box_w, box_c_sim, target_x, target_y, target_w, target_is_plane, grid_nn)
        # Debug image for logging in Tensorboard.
        debug_img = imgdbg.debug_image(X, mistakes, target_rois, predicted_rois, predicted_c, size_correct, position_correct, all_correct, grid_nn, cell_n, settings.TILE_SIZE)

        # IOU (Intersection Over Union) accuracy
        # IOU computation removed from training mode because it used an op not yet supported with MirroredStrategy
        if mode == tf.estimator.ModeKeys.EVAL:
            iou_accuracy = box.compute_safe_IOU(target_rois, detected_rois, detected_rois_overflow, settings.TILE_SIZE)

        # Improvement ideas and experiment results
        # 1) YOLO trick: take square root of predicted size for loss so as not to drown errors on small boxes: tested, no benefit
        # 2) if only one plane in cell, teach all cell_n detectors to detect it: implemented in box.n_experimental_roi_selection_strategy, beneficial
        # 3) TODO: try two or more grids, shifted by 1/2 cell size: This could make it easier to have cells detect planes in their center, if that is an actual problem they have (no idea)
        # 4) try using TC instead of TC_ in position loss and size loss: tested, no benefit
        # 5) TODO: one run without batch norm for comparison
        # 6) TODO: add dropout, tested, weird results: eval accuracy goes up significantly but model performs worse in real life. Probably not enough training data.
        # 7) TODO: idea, compute detection box loss against all ROI, not just assigned ROIs: if neighboring cell detects something that aligns well with ground truth, no reason to penalise
        # 8) TODO: add tile rotations, tile color inversion (data augmentation)

        # Loss function: position and size errors only count where a target box exists.
        position_loss = tf.reduce_mean(target_is_plane_float * (tf.square(box_x - target_x) + tf.square(box_y - target_y)))
        size_loss = tf.reduce_mean(target_is_plane_float * tf.square(box_w - target_w) * 2)
        obj_loss = tf.losses.softmax_cross_entropy(target_is_plane_onehot, box_c_logits)

        # YOLO trick: weights the different losses differently
        # (lw1 -> detection, lw2 -> position, lw3 -> size, normalised by their sum)
        loss_weight_total = (params['lw1'] + params['lw2'] + params['lw3']) * 1.0  # 1.0 to force conversion to float
        w_obj_loss = obj_loss*(params['lw1'] / loss_weight_total)
        w_position_loss = position_loss*(params['lw2'] / loss_weight_total)
        w_size_loss = size_loss*(params['lw3'] / loss_weight_total)
        loss = w_position_loss + w_size_loss + w_obj_loss

        # Sum of the mistakes tensor. NOTE(review): the original comment called
        # this an "average number of mistakes per image", but it is a
        # reduce_sum -- confirm the intended normalisation.
        nb_mistakes = tf.reduce_sum(mistakes)

        lr = learn_rate_decay(tf.train.get_or_create_global_step(), params)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = tf.contrib.training.create_train_op(loss, optimizer)

        if mode == tf.estimator.ModeKeys.EVAL:
            # metrics removed from training mode because they are not yet supported with MirroredStrategy
            eval_metrics = {"position_error": tf.metrics.mean(w_position_loss),
                            "size_error": tf.metrics.mean(w_size_loss),
                            "plane_cross_entropy_error": tf.metrics.mean(w_obj_loss),
                            "mistakes": tf.metrics.mean(nb_mistakes),
                            'IOU': tf.metrics.mean(iou_accuracy)}
        else:
            eval_metrics = None

        # Tensorboard summaries for debugging
        tf.summary.scalar("position_error", w_position_loss)
        tf.summary.scalar("size_error", w_size_loss)
        tf.summary.scalar("plane_cross_entropy_error", w_obj_loss)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("mistakes", nb_mistakes)
        tf.summary.scalar("learning_rate", lr)
        tf.summary.image("input_image", debug_img, max_outputs=20)
        # a summary on iou_accuracy would be nice but it goes Out Of Memory

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={"rois":predicted_rois, "rois_confidence": predicted_c},  # name these fields as you like
        loss=loss, train_op=train_op, eval_metric_ops=eval_metrics,
        export_outputs={'classes': tf.estimator.export.PredictOutput({"rois": box.swap_xy(predicted_rois),  # TODO: the visualisation GUI was coded for swapped coordinates y1 x1 y2 x2
                                                                      "rois_confidence": predicted_c})}  # TODO: remove legacy C
    )
Main file for training the YOLO (You Look Only Once) detection model""" import os import sys import json import argparse import tensorflow as tf from tensorflow.python.client import device_lib as tf_devices from tensorflow.python.lib.io import file_io as gcsfile from tensorflow.python.platform import tf_logging as logging from trainer_yolo import model from trainer_yolo import datagen logging.set_verbosity(logging.INFO) logging.log(logging.INFO, "Tensorflow version " + tf.__version__) def get_available_gpus(): local_device_protos = tf_devices.list_local_devices() return [x.name for x in local_device_protos if x.device_type == 'GPU'] # input function for base64 encoded JPEG in JSON # Called when the model is deployed for online predictions on Cloud ML Engine. def serving_input_fn(): # input expects a list of jpeg images input_bytes = { 'image_bytes': tf.placeholder(
def main(argv):
    """Parse the command line and start training through learn_runner.

    Flags whose destination starts with ``hp_`` become hyperparameters (the
    prefix is stripped); every other flag except ``job_dir`` is forwarded to
    ``experiment_fn_with_params`` as a keyword argument.

    Args:
        argv: accepted but not read here; argparse parses ``sys.argv`` itself.
    """
    cli = argparse.ArgumentParser()

    # You must accept a --job-dir argument when running on Cloud ML Engine. It specifies where checkpoints
    # should be saved. You can define additional user arguments which will have to be specified after
    # an empty arg -- on the command line:
    # gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args

    # no batch norm: lr 0.002-0.0002-2000 is ok, over 10000 iterations (final accuracy 0.9937 loss 2.39 job156)
    # batch norm: lr 0.02-0.0001-600 conv 16-32-64 trains in 3000 iteration (final accuracy 0.0.8849 loss 1.466 job 159)
    cli.add_argument('--job-dir', default="checkpoints",
                     help='GCS or local path where to store training checkpoints')
    cli.add_argument('--data-dir', default="data",
                     help='Where training data will be loaded and unzipped')

    # (flag, default, type, help) rows, registered in this exact order.
    hyperparameter_flags = (
        ('--hp-lr0', 0.02, float,
         'Hyperparameter: initial (max) learning rate'),
        ('--hp-lr1', 0.0001, float,
         'Hyperparameter: target (min) learning rate'),
        ('--hp-lr2', 600, float,
         'Hyperparameter: learning rate decay speed in steps. Learning rate decays by exp(-1) every N steps.'),
        ('--hp-dropout', 0.3, float,
         'Hyperparameter: dropout rate on dense layers.'),
        ('--hp-conv1', 6, int,
         'Hyperparameter: depth of first convolutional layer.'),
        ('--hp-conv2', 12, int,
         'Hyperparameter: depth of second convolutional layer.'),
        ('--hp-conv3', 24, int,
         'Hyperparameter: depth of third convolutional layer.'),
        ('--hp-bnexp', 0.993, float,
         'Hyperparameter: exponential decay for batch norm moving averages.'),
        ('--hp-iterations', 10000, int,
         'Hyperparameter: number of training iterations.'),
    )
    for flag, default_value, value_type, help_text in hyperparameter_flags:
        cli.add_argument(flag, default=default_value, type=value_type, help=help_text)

    parsed = vars(cli.parse_args())

    # Split the parsed flags in one pass: 'hp_xyz' -> hparams['xyz'],
    # anything else -> otherargs unchanged.
    hparams = {}
    otherargs = {}
    for dest, value in parsed.items():
        if dest.startswith('hp_'):
            hparams[dest[3:]] = value
        else:
            otherargs[dest] = value

    logging.log(logging.INFO, "Hyperparameters:" + str(sorted(hparams.items())))

    # job_dir is consumed here so that it is not forwarded twice.
    output_dir = otherargs.pop('job_dir')

    # learn_runner needs an experiment function with a single parameter: the output directory.
    # Here we pass additional command line arguments through a closure.
    def experiment_fn(output_dir):
        return experiment_fn_with_params(output_dir, hparams, **otherargs)

    # Compatibility warning: learn_runner is currently in contrib. It will move in TF 1.2
    tf.contrib.learn.learn_runner.run(experiment_fn, output_dir)
def start_training(output_dir, hparams, data, tiledata, **kwargs):
    """Configure the data pipelines and the Estimator, then train and evaluate.

    Exactly one of ``data`` and ``tiledata`` must be a non-empty string. If
    that is not the case an error is logged and the function returns without
    training.

    Args:
        output_dir: model directory handed to RunConfig and to the Estimator.
        hparams: hyperparameter dict. Keys read here: "grid_nn", "cell_n",
            "cell_swarm", "cell_grow", "batch_size", "eval_batch_size",
            "shuffle_buf", "data_rnd_hue", "data_rnd_orientation",
            "data_cache_n_epochs", "data_tiles_per_gt_roi", "data_rnd_distmax",
            "iterations", "eval_iterations". The whole dict is also passed to
            the Estimator as ``params``.
        data: path of the full-size imagery; evaluation data is loaded from
            ``data + "_eval"``.
        tiledata: path of the folder holding ``*.tfrecord`` tile files;
            evaluation files are looked up in ``tiledata + "_eval"``.
        **kwargs: accepted and ignored.

    Returns:
        None.
    """
    # YOLO configuration for ROI assignments.
    # The evaluation config differs only in its last argument, fixed to 1.0.
    yolo_cfg = datagen.YOLOConfig(hparams["grid_nn"], hparams["cell_n"],
                                  hparams["cell_swarm"], hparams["cell_grow"])
    eval_yolo_cfg = datagen.YOLOConfig(hparams["grid_nn"], hparams["cell_n"],
                                       hparams["cell_swarm"], 1.0)

    # data source selection: full aerial imagery or TFRecords containing individual tiles
    if tiledata != "" and data == "":  # training from tfrecords
        tfrec_filelist = gcsfile.get_matching_files(tiledata + "/*.tfrecord")

        def train_data_input_fn():
            return datagen.train_dataset_from_tfrecords(
                tfrec_filelist, hparams["batch_size"], hparams["shuffle_buf"], yolo_cfg,
                hparams["data_rnd_hue"], hparams["data_rnd_orientation"],
                hparams["data_cache_n_epochs"])

        tfrec_filelist_eval = gcsfile.get_matching_files(tiledata + "_eval" + "/*.tfrecord")

        def eval_data_input_fn():
            return datagen.eval_dataset_from_tfrecords(
                tfrec_filelist_eval, hparams["eval_batch_size"], eval_yolo_cfg)

    elif data != "" and tiledata == "":  # training from aerial imagery directly
        img_filelist, roi_filelist = datagen.load_file_list(data)

        def train_data_input_fn():
            return datagen.train_dataset_from_images(
                img_filelist, roi_filelist, hparams["batch_size"], hparams["shuffle_buf"],
                yolo_cfg, hparams["data_rnd_hue"], hparams["data_rnd_orientation"],
                hparams["data_tiles_per_gt_roi"], hparams["data_rnd_distmax"],
                hparams["data_cache_n_epochs"])

        img_filelist_eval, roi_filelist_eval = datagen.load_file_list(data + "_eval")

        def eval_data_input_fn():
            return datagen.eval_dataset_from_images(
                img_filelist_eval, roi_filelist_eval, hparams["eval_batch_size"], eval_yolo_cfg)

    else:
        # Both or neither source given: report it and give up without raising.
        logging.log(logging.ERROR,
                    "One and only one of parameters 'data' and 'tiledata' must be supplied.")
        return

    # Estimator configuration
    export_latest = tf.estimator.LatestExporter(
        name="planespotting",
        serving_input_receiver_fn=serving_input_fn,
        exports_to_keep=1)
    train_spec = tf.estimator.TrainSpec(input_fn=train_data_input_fn,
                                        max_steps=hparams["iterations"])
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_data_input_fn,
        steps=hparams['eval_iterations'],
        exporters=export_latest,
        start_delay_secs=1,  # Confirmed: this does not work (plane533 for ex.)
        throttle_secs=60)

    # Device filters to prevent unwanted communications between nodes
    # This is necessary for now for running distributed jobs on ML Engine
    # If running long evaluations, workers can be done before master and in that case ML Engine crashes.
    # These device filters prevent unwanted communications from happening and will prevent the crash.
    # This code should be folded into Estimator in Tensorflow v1.9
    tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
    config = None  # stays None without a 'task' entry or for any other task type
    if 'task' in tf_config:
        task = tf_config['task']
        if task['type'] == 'master':
            config = tf.ConfigProto(device_filters=['/job:ps', '/job:master'])
        elif task['type'] == 'worker':
            config = tf.ConfigProto(device_filters=[
                '/job:ps', '/job:worker/task:%d' % task['index']])
    # end of temporary fix code for distributed training on ML Engine

    # Experimental distribution strategy if running on a machine with multiple GPUs.
    # The device list is queried once so that the logged list is the one the
    # decision is based on.
    gpus = get_available_gpus()
    logging.log(logging.INFO, "GPUs found: " + str(gpus))
    distribution = tf.contrib.distribute.MirroredStrategy() if len(gpus) > 1 else None

    training_config = tf.estimator.RunConfig(
        model_dir=output_dir,
        save_summary_steps=100,
        save_checkpoints_steps=2000,
        keep_checkpoint_max=1,
        train_distribute=distribution,
        session_config=config)  # device filters set here

    estimator = tf.estimator.Estimator(model_fn=model.model_fn,
                                       model_dir=output_dir,
                                       config=training_config,
                                       params=hparams)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def main(argv):
    """Parse the command line, then train and evaluate the MNIST model.

    Args:
        argv: accepted but not read here; argparse parses ``sys.argv`` itself.
    """
    parser = argparse.ArgumentParser()

    # You must accept a --job-dir argument when running on Cloud ML Engine. It specifies where checkpoints
    # should be saved. You can define additional user arguments which will have to be specified after
    # an empty arg -- on the command line:
    # gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args

    # no batch norm: lr 0.002-0.0002-2000 is ok, over 10000 iterations (final accuracy 0.9937 loss 2.39 job156)
    # batch norm: lr 0.02-0.0001-600 conv 16-32-64 trains in 3000 iteration (final accuracy 0.9949 loss 1.466 job 159)

    def str2bool(v):
        """Read a boolean flag value; case and surrounding spaces are ignored.

        'True' still maps to True. Spellings such as 'true', '1' or 'yes' are
        accepted as well; anything else maps to False.
        """
        return str(v).strip().lower() in ('true', 't', '1', 'yes', 'y')

    parser.add_argument('--job-dir', default="checkpoints", help='GCS or local path where to store training checkpoints')
    parser.add_argument('--data-dir', default="data", help='Where training data will be loaded and unzipped')
    parser.add_argument('--lr0', default=0.02, type=float, help='Hyperparameter: initial (max) learning rate')
    parser.add_argument('--lr1', default=0.0001, type=float, help='Hyperparameter: target (min) learning rate')
    parser.add_argument('--lr2', default=600, type=float, help='Hyperparameter: learning rate decay speed in steps. Learning rate decays by exp(-1) every N steps.')
    parser.add_argument('--dropout', default=0.3, type=float, help='Hyperparameter: dropout rate on dense layers.')
    parser.add_argument('--conv1', default=6, type=int, help='Hyperparameter: depth of first convolutional layer.')
    parser.add_argument('--conv2', default=12, type=int, help='Hyperparameter: depth of second convolutional layer.')
    parser.add_argument('--conv3', default=24, type=int, help='Hyperparameter: depth of third convolutional layer.')
    parser.add_argument('--bnexp', default=0.993, type=float, help='Hyperparameter: exponential decay for batch norm moving averages.')
    parser.add_argument('--iterations', default=5000, type=int, help='Hyperparameter: number of training iterations.')
    parser.add_argument('--eval-iterations', default=10, type=int, help='Hyperparameter: number of evaluation iterations.')
    parser.add_argument('--batch', default=1024, type=int, help='Global batch size (1/8th of this is the real batch size on one TPU)')
    parser.add_argument('--use-tpu', default=False, type=str2bool, help='Using a TPU or not')
    parser.add_argument('--tpu-iterations', default=100, type=int, help='Iterations per call to the TPU')

    # TPUEstimator also adds the following parameters internally - do not use them
    parser.add_argument('--tpu', default=None, help='(internal) ML Engine uses this argument to apps the IP address of the TPU')
    parser.add_argument('--tpu-zone', default=None, help='(internal) GCP zone where to provision the TPUs')
    parser.add_argument('--gcp-project', default=None, help='(internal) GCP project where to provision the TPUs')
    #parser.add_argument('--batch-size', default=None, help='(internal) Global batch size on TPUs')

    args = parser.parse_args()
    logging.log(logging.INFO, "Parameters:" + str(args))

    train_images_file, train_labels_file, test_images_file, test_labels_file = load_mnist_data(args.data_dir)

    # The input functions take a single params argument; the file handles are
    # bound through these closures.
    def train_input_fn(params):
        return train_data_input_fn(train_images_file, train_labels_file, params)

    def eval_input_fn(params):
        return eval_data_input_fn(test_images_file, test_labels_file, params)

    # TPU variant, currently disabled:
    # training_config = tf.contrib.tpu.RunConfig(
    #     cluster=tf.contrib.cluster_resolver.TPUClusterResolver(args.tpu, args.tpu_zone,args.gcp_project) \
    #         if args.use_tpu else None,
    #     model_dir=args.job_dir,
    #     session_config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True),
    #     tpu_config=tf.contrib.tpu.TPUConfig(args.tpu_iterations, 8)
    # )
    training_config = tf.estimator.RunConfig(model_dir=args.job_dir,
                                             save_summary_steps=100,
                                             save_checkpoints_steps=500,
                                             keep_checkpoint_max=1)

    # estimator = tf.contrib.tpu.TPUEstimator(model_fn=conv_model2, model_dir=args.job_dir, params=args.__dict__,
    #                                         train_batch_size=args.batch,
    #                                         eval_batch_size=args.batch,
    #                                         config=training_config, use_tpu=args.use_tpu)

    # Hand the model a copy of the parsed flags plus "batch_size", so the
    # argparse namespace itself is left untouched.
    params = dict(vars(args), batch_size=args.batch)
    estimator = tf.estimator.Estimator(model_fn=conv_model2, model_dir=args.job_dir,
                                       params=params, config=training_config)

    # max_steps=None puts no step limit on training; --iterations is therefore
    # not applied here. The bounded alternative is kept for reference:
    #train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=args.iterations)
    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=None)

    export_latest = tf.estimator.LatestExporter("mnist-model", serving_input_receiver_fn=serving_input_fn)
    # Honour --eval-iterations (default 10) for the number of evaluation steps.
    eval_spec = tf.estimator.EvalSpec(eval_input_fn, steps=args.eval_iterations,
                                      exporters=export_latest, throttle_secs=2)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)