def _eval_network(self, dataset, data_mode, logs_path, batch_size, metrics,
                  gpu_frac, workers, mem_dequeing, track_summaries=50,
                  steps=None):
    """ Runs and tracks evaluation on the given data subset.
    Args:
        Other arguments can be seen in train_network
        data_mode: Data subset to use for evaluation
        track_summaries: Steps between Tensorboard summaries. Only used
            if steps is None.
        steps: Number of batches the model is evaluated on. If not None,
            a single summary is created at the end. If None, the model is
            evaluated on the whole dataset and summaries are tracked
            periodically.
    Returns:
        loss: Average loss over the evaluated batches
        metrics: Average values for the requested metrics
    """
    # Create reader to get TFRecords
    reader = DataReader(dataset)

    with tf.Graph().as_default() as g:

        # Read data and define operations
        data, labels = reader.read_batch(batch_size, data_mode,
                                         mem_dequeing, workers,
                                         shuffle=True, train_mode=False)
        main_run = self._build_run_settings(data, labels, metrics)

        # Prepare logging for Tensorboard
        saver, summary_ops, writer = mu.prepare_logging(logs_path, g)

        # Supervisor for training. We only want it to deal with the
        # session. Initializes variables
        supervisor = tf.train.Supervisor(graph=g)

        # Initialize session
        session_conf = mu.get_session_config(gpu_frac, log_placement=False)

        with supervisor.managed_session(config=session_conf) as sess:

            # Load model, if existing. Otherwise start from scratch
            step_value, _ = self._initialize_model(sess, saver=saver,
                                                   is_training=False)

            try:
                # Let queues start dequeing examples
                coord, threads = mu.initialize_queues(sess)

                # Initialize loop conditions
                step_counter = 0
                stop = False if steps is None else step_counter >= steps

                while not stop:

                    # Run evaluation
                    res = main_run.test_run(sess, summary_ops, step_value,
                                            data_mode=data_mode, log=True)

                    if steps is not None and steps == step_counter:
                        # Reached max steps, store summary and stop
                        main_run.manual_log(writer, step_value)
                        stop = True
                    elif steps is None \
                            and step_counter % track_summaries == 0:
                        # Periodic storage of summaries
                        mu.store_summaries(writer, step_counter,
                                           res.summary_str)

                    step_counter += 1

                try:
                    mu.finalize_queues(coord, threads)
                except RuntimeError as e:
                    logger.warning('Error stopping coordinator: %s', e)

            except tf.errors.OutOfRangeError:
                logger.info('Queue ran out of evaluation instances')

    return main_run.loss_average(), main_run.metrics_average()
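# The following is a minimal, self-contained sketch (not part of the project)
# of the coordinator/queue-runner pattern that _eval_network presumably relies
# on through mu.initialize_queues / mu.finalize_queues: dequeue batches until
# the input queue raises OutOfRangeError, then shut the threads down. The toy
# input_producer queue stands in for DataReader.
import tensorflow as tf

with tf.Graph().as_default():
    queue = tf.train.input_producer([1.0, 2.0, 3.0, 4.0], num_epochs=1,
                                    shuffle=False)
    value = queue.dequeue()
    loss = tf.square(value)  # stand-in for a real evaluation op

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())  # num_epochs counter

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        total, count = 0.0, 0
        try:
            while True:
                total += sess.run(loss)
                count += 1
        except tf.errors.OutOfRangeError:
            print('Queue ran out of evaluation instances')
        finally:
            coord.request_stop()
            coord.join(threads)

        print('Average loss:', total / max(count, 1))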
def eval_places(self, dataset, data_mode, gpu_frac, workers, mem_dequeing,
                steps=20):
    """ Evaluates the Places output on examples from the given data subset.
    Args:
        Other arguments can be seen in _train_network
        data_mode: Data subset to use for evaluation
        steps: Number of examples the model is evaluated on.
    """
    # Create reader to get TFRecords
    reader = DataReader(dataset)

    with tf.Graph().as_default() as g:

        # Read data and define operations
        data, labels = reader.read_batch(1, data_mode, mem_dequeing,
                                         workers, shuffle=True,
                                         train_mode=False)
        main_run = self._build_run_settings(data, labels, [])

        # Load weight ops for a warm start on training
        assign_ops = self._load_pretrained()

        # Supervisor for training. We only want it to deal with the
        # session. Initializes variables
        supervisor = tf.train.Supervisor(graph=g)

        # Initialize session
        session_conf = mu.get_session_config(gpu_frac, log_placement=False)

        with supervisor.managed_session(config=session_conf) as sess:

            # Load model, if existing. Otherwise start from scratch
            step_value, _ = self._initialize_model(sess,
                                                   saver=mu.get_saver(),
                                                   is_training=True)

            if len(assign_ops) > 0:
                logger.info('Assigning pretrained values to models ...')
                sess.run(assign_ops)
            else:
                raise RuntimeError(
                    'Unexpected error: no assign operations to load weights')

            try:
                # Let queues start dequeing examples
                coord, threads = mu.initialize_queues(sess)

                counter = 0
                while counter < steps:
                    self.test_places_output(sess, data, main_run)
                    counter += 1

                try:
                    mu.finalize_queues(coord, threads)
                except RuntimeError as e:
                    logger.warning('Error stopping coordinator: %s', e)

            except tf.errors.OutOfRangeError:
                logger.info('Queue ran out of evaluation instances')
def _train_network(self, dataset, logs_path, batch_size, metrics,
                   track_summaries, gpu_frac, workers, mem_dequeing,
                   track_models=None, steps=None, max_steps=None,
                   log_steps=10):
    """ Runs training on the defined network for the given number of
    steps and stores a checkpoint at the end.
    Args:
        See train_network for other arguments.
        logs_path: Path where to store the network stats
    Returns:
        step: Step at which training has stopped
        loss: Mean loss in the process
        metrics: Mean metric values in the process
    """
    # Create reader to get TFRecords
    reader = DataReader(dataset)

    with tf.Graph().as_default() as g:

        # Read data and define operations
        data, labels = reader.read_batch(batch_size, DataMode.TRAINING,
                                         mem_dequeing, workers)
        main_run = self._build_run_settings(data, labels, metrics)

        # Load weight ops for a warm start on training
        assign_ops = self._load_pretrained()

        # Prepare logging for Tensorboard
        saver, summary_ops, writer = mu.prepare_logging(logs_path, g)

        # Supervisor for training. We only want it to deal with the
        # session. Initializes variables
        supervisor = tf.train.Supervisor(graph=g)

        # Initialize session
        session_conf = mu.get_session_config(gpu_frac, log_placement=False)

        with supervisor.managed_session(config=session_conf) as sess:

            # Load model, if existing. Otherwise start from scratch
            step_value, start = self._initialize_model(sess, saver=saver,
                                                       is_training=True)

            # Get stopping condition according to mode
            step_limit = step_value + steps if steps is not None \
                else max_steps
            step_limit = step_limit if max_steps is None \
                else min(step_limit, max_steps)
            stop = step_value >= step_limit

            # Assign weights only if model started from scratch
            if start and len(assign_ops) > 0:
                logger.info('Assigning pretrained values to models ...')
                sess.run(assign_ops)

            try:
                # Let queues start dequeing examples
                coord, threads = mu.initialize_queues(sess)

                while not stop:

                    # Run network
                    log_run = step_value % log_steps == 0
                    main_res = main_run.training_run(sess, summary_ops,
                                                     log=log_run)

                    # Track summaries if needed
                    if track_summaries is not None and \
                            step_value % track_summaries == 0:
                        mu.store_summaries(writer, step_value,
                                           main_res.summary_str)

                    # Track models if needed
                    if track_models is not None \
                            and step_value % track_models == 0 \
                            and step_value != 0:
                        mu.store_checkpoint(sess, saver, step_value,
                                            logs_path)

                    # Update current step and stop condition
                    step_value = main_res.step
                    stop = step_value >= step_limit

                # Store model at exit
                mu.store_checkpoint(sess, saver, step_value, logs_path)

                try:
                    mu.finalize_queues(coord, threads)
                except RuntimeError as e:
                    logger.warning('Error stopping coordinator: %s', e)

            except tf.errors.OutOfRangeError as e:
                logger.warning('Input queue exhausted due to '
                               'unexpected reason: %s.', e)

    return (step_value, main_run.loss_average(),
            main_run.metrics_average())
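# Hypothetical sketch of what a checkpoint-tracking helper such as
# mu.store_checkpoint is assumed to do (the helper name and path layout are
# assumptions, not taken from the project): persist the current graph
# variables under logs_path, tagged with the training step.
import os
import tensorflow as tf

def store_checkpoint(sess, saver, step_value, logs_path):
    # Save variables as logs_path/model.ckpt-<step_value>
    ckpt_path = os.path.join(logs_path, 'model.ckpt')
    saver.save(sess, ckpt_path, global_step=step_value)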
if FLAGS.show_image:
    img_spec = DataSpec(batch_size=FLAGS.batch_size, scale_size=32,
                        crop_size=32, channels=3, mean=[0.0, 0.0, 0.0],
                        bgr=False, random_crop=False)
else:
    img_spec = None

dataset = settings_fn(dataset_location=FLAGS.data_location,
                      image_specs=img_spec)
reader = DataReader(dataset)

features, label = reader.read_folded_batch(
    batch_size=FLAGS.batch_size,
    data_mode=DataMode.TRAINING,  # Use whatever here, e.g. training
    folds=[0, 1, 2],
    memory_factor=FLAGS.memory_factor,
    reader_threads=FLAGS.reader_threads,
    train_mode=False)

# Initialize all variables
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())

# Define coordinator to handle all threads
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord, sess=sess)
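# Assumed continuation of the snippet above (not in the original): pull a few
# batches from the queue runners that were just started, assuming `sess` is an
# open tf.Session, then shut the coordinator down cleanly.
try:
    for _ in range(5):
        features_val, label_val = sess.run([features, label])
        print(features_val.shape, label_val.shape)
except tf.errors.OutOfRangeError:
    print('Input queue exhausted')
finally:
    coord.request_stop()
    coord.join(threads)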
def predict_fn(data_settings_fn, data_location, folder, **params):

    store_summaries = params.get('summaries', True)

    with tf.Graph().as_default() as graph:

        step = get_global_step()

        dataset = data_settings_fn(
            dataset_location=data_location,
            image_specs=image_spec_from_params(**params)
        )
        reader = DataReader(dataset)

        test_context = build_run_context(
            dataset=dataset, reader=reader, tag=DataMode.TEST,
            folds=None, step=step, **params
        )

        if store_summaries:
            writer = get_writer(graph, folder, DataMode.TEST)

        saver = tf.train.Saver()

        with tf.train.MonitoredTrainingSession(
                save_checkpoint_secs=None,
                save_summaries_steps=None,
                save_summaries_secs=None) as sess:

            ckpt = tf.train.get_checkpoint_state(folder)
            if ckpt and ckpt.model_checkpoint_path:
                # Restore from checkpoint
                logger.debug(
                    'Restoring {} from {}'.format(
                        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES),
                        ckpt.model_checkpoint_path
                    )
                )
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                raise ValueError('No model found in %s' % folder)

            # Define coordinator to handle all threads
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)

            status, finish = RunStatus(), False

            while not finish:

                try:
                    # Track loss and accuracy until queue exhausted
                    loss, acc, l2 = test_step(sess, test_context)
                    status.update(loss, acc, l2)

                    if store_summaries:
                        summary = sess.run(
                            test_context.summary_op,
                            feed_dict={test_context.is_training_op: False}
                        )
                        writer.add_summary(summary)

                except tf.errors.OutOfRangeError:
                    logger.info('Queue exhausted. Read all instances')
                    finish = True

            coord.request_stop()
            coord.join(threads)

    return {'loss': status.loss(), 'l2': status.l2(),
            'error': status.error()}
def fit(self, train_folds, val_folds, max_epochs, **params):

    max_epochs = int(max_epochs)

    # Parameters with default values
    strip_length = params.get('strip_length', 5)
    progress_thresh = params.get('progress_thresh', 0.1)
    max_successive_strips = params.get('max_successive_strips', 3)
    is_layerwise = params.get('layerwise', False)

    self._initialize_training(is_layerwise, **params)

    with tf.Graph().as_default() as graph:

        step = get_global_step()

        dataset = self._settings_fn(
            dataset_location=self._data_location,
            image_specs=image_spec_from_params(**params)
        )
        reader = DataReader(dataset)

        # Get training operations
        train_context = build_run_context(
            dataset, reader, DataMode.TRAINING, train_folds, step, **params
        )

        early_stop = EarlyStop(
            'global', progress_thresh, max_successive_strips
        )

        # Get validation operations
        val_context = build_run_context(
            dataset, reader, DataMode.VALIDATION, val_folds, step,
            True, **params
        )

        if self._should_save():
            self._init_writers(graph)

        saver = self._init_savers(step, **params)

        with tf.train.MonitoredTrainingSession(
                save_checkpoint_secs=None,
                save_summaries_steps=None,
                save_summaries_secs=None) as sess:

            self._init_session(sess, **params)

            # Define coordinator to handle all threads
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)

            while True:

                if sess.run(step) >= max_epochs:
                    logger.debug('Max epochs %d reached' % max_epochs)
                    break

                train_run = run_training_epoch(
                    sess, train_context, self._layer_idx
                )
                early_stop.epoch_update(train_run.error())

                epoch = train_run.epoch

                if epoch % strip_length == 0 and epoch != 0:

                    # Track training stats
                    logger.debug(
                        '[%d] Training Loss: %f, Error: %f. L2: %f'
                        % (epoch, train_run.loss(), train_run.error(),
                           train_run.l2())
                    )

                    # Track validation stats
                    val_run = eval_epoch(
                        sess, val_context, self._layer_idx
                    )
                    logger.debug(
                        '[%d] Validation loss: %f, Error: %f'
                        % (epoch, val_run.loss(), val_run.error())
                    )

                    if self._should_save():
                        self._epoch_summary(
                            sess, train_context, train_run,
                            val_context, val_run, epoch
                        )

                    is_best, stop, train_errors = early_stop.strip_update(
                        train_run, val_run, epoch
                    )

                    if is_best and self._should_save():
                        save_model(sess, saver, self._folder, epoch)

                    if stop and is_layerwise:
                        self._iterate_layer(epoch, train_errors)
                        early_stop.restart_errors()

                        if self._policy.cycle_ended():
                            _, l_stop, _ = self._layer_stop.strip_update(
                                train_run, val_run, epoch
                            )
                            if l_stop:
                                break

                    elif stop and not is_layerwise:
                        break

            best_model = early_stop.get_best()
            logger.debug('Best model found: {}'.format(best_model))

            coord.request_stop()
            coord.join(threads)

    return best_model
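# Hypothetical sketch of the strip-based early stopping that EarlyStop is
# assumed to implement (this is NOT the project's class): training error is
# accumulated over a strip of epochs, and training stops once validation
# error has failed to improve for max_successive_strips consecutive strips
# while training progress over the strip stays below progress_thresh.

class StripEarlyStop(object):

    def __init__(self, progress_thresh, max_successive_strips):
        self.progress_thresh = progress_thresh
        self.max_strips = max_successive_strips
        self.strip_errors = []
        self.best_val = float('inf')
        self.bad_strips = 0

    def epoch_update(self, train_error):
        # Called once per epoch with the training error
        self.strip_errors.append(train_error)

    def strip_update(self, val_error):
        # Called once per strip with the current validation error.
        # Training progress: how much the strip average exceeds the best
        # training error seen within the strip (Prechelt-style criterion).
        mean_err = sum(self.strip_errors) / len(self.strip_errors)
        progress = 1000.0 * (mean_err / min(self.strip_errors) - 1.0)
        self.strip_errors = []

        is_best = val_error < self.best_val
        if is_best:
            self.best_val = val_error
            self.bad_strips = 0
        else:
            self.bad_strips += 1

        stop = (self.bad_strips >= self.max_strips
                and progress < self.progress_thresh)
        return is_best, stop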
def fit(self, max_epochs, **params):

    max_epochs = int(max_epochs)

    # Parameters with default values
    is_layerwise = params.get('switch_epochs') is not None
    switch_epochs = params.get('switch_epochs').copy() \
        if is_layerwise else None
    summary_epochs = params.get('summary_epochs', 1)

    with tf.Graph().as_default() as graph:

        step = get_global_step()

        dataset = self._settings_fn(
            dataset_location=self._data_location,
            image_specs=image_spec_from_params(**params)
        )
        reader = DataReader(dataset)

        # Get training operations
        train_flds = range(dataset.get_fold_num())
        context = build_run_context(
            dataset, reader, DataMode.TRAINING, train_flds, step, **params
        )

        # Initialize writers and summaries
        writer = tf.summary.FileWriter(self._folder, graph)
        saver = self._init_savers(step, **params)

        self._initialize_fit(is_layerwise, **params)

        with tf.train.MonitoredTrainingSession(
                save_checkpoint_secs=None,
                save_summaries_steps=None,
                save_summaries_secs=None) as sess:

            self._init_session(sess, **params)

            # Define coordinator to handle all threads
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)

            while True:

                if sess.run(step) >= max_epochs:
                    logger.debug('Max epochs %d reached' % max_epochs)
                    break

                run = run_training_epoch(
                    sess, context, self._layer_idx
                )

                epoch = run.epoch

                if epoch % summary_epochs == 0:
                    # Store histogram
                    sum_str = sess.run(
                        context.summary_op,
                        feed_dict={context.is_training_op: True}
                    )
                    writer.add_summary(sum_str, epoch)

                # Store stats from current epoch
                write_epoch(writer, run, epoch)

                logger.debug(
                    '[%d] Training Loss: %f, Error: %f. L2: %f'
                    % (epoch, run.loss(), run.error(), run.l2())
                )

                if switch_epochs is not None and len(switch_epochs) > 0 \
                        and epoch == switch_epochs[0]:
                    self._layer_idx = self._policy.next_layer_id()
                    logger.debug(
                        'Switching to layer %d' % self._layer_idx
                    )
                    switch_epochs = switch_epochs[1:]

            logger.debug('Finished training at step %d' % max_epochs)

            model_path = save_model(sess, saver, self._folder, max_epochs)

            coord.request_stop()
            coord.join(threads)

    return model_path, run.loss(), run.error(), run.l2()