def setUp(self):
    """Reset the default graph and store the 'fasterrcnn' resize limits.

    The min/max sizes are read from the model class' `base_config` and
    exposed on `self.config` as `image_resize_min` / `image_resize_max`.
    """
    tf.reset_default_graph()

    fasterrcnn_cls = get_model('fasterrcnn')
    preprocessing = fasterrcnn_cls.base_config.dataset.image_preprocessing

    resize_limits = dict(
        image_resize_min=preprocessing.min_size,
        image_resize_max=preprocessing.max_size,
    )
    self.config = EasyDict(resize_limits)
def get_config(config_files, override_params=None):
    """Return the model config built from `config_files`.

    The custom config is loaded first; its `model.type` entry selects the
    model class whose base config is then combined with the custom config
    and `override_params` by `get_model_config`.
    """
    file_config = load_config_files(config_files)
    base = get_base_config(get_model(file_config['model']['type']))
    return get_model_config(base, file_config, override_params)
def setUp(self):
    """Reset the default graph and store the 'fasterrcnn' resize limits.

    The min/max sizes come from the base config returned by
    `get_base_config` and are exposed on `self.config`.
    """
    tf.reset_default_graph()

    defaults = get_base_config(get_model("fasterrcnn"))
    preprocessing = defaults.dataset.image_preprocessing

    resize_limits = dict(
        image_resize_min=preprocessing.min_size,
        image_resize_max=preprocessing.max_size,
    )
    self.config = EasyDict(resize_limits)
def get_config(config_files, override_params=None):
    """Load `config_files` and merge them over the model's base config.

    The model class is picked from the `model.type` key of the loaded
    config; `override_params` is forwarded to `get_model_config`.
    """
    loaded = load_config_files(config_files)
    model_type = loaded['model']['type']
    defaults = get_base_config(get_model(model_type))
    return get_model_config(defaults, loaded, override_params)
def setUp(self):
    """Reset the default graph and record the 'fasterrcnn' resize bounds."""
    tf.reset_default_graph()

    fasterrcnn_cls = get_model('fasterrcnn')
    resize_cfg = get_base_config(fasterrcnn_cls).dataset.image_preprocessing

    # Only the two resize bounds are exposed on `self.config`.
    settings = {}
    settings['image_resize_min'] = resize_cfg.min_size
    settings['image_resize_max'] = resize_cfg.max_size
    self.config = EasyDict(settings)
def get_config(self, model_type, override_params=None):
    """Build a config from `self.config.config_files` for `model_type`.

    The base config is always taken from the 'fasterrcnn' model class; the
    resulting config's `model.type` is then overwritten with `model_type`.
    """
    file_config = load_config_files(self.config.config_files)
    defaults = get_base_config(get_model('fasterrcnn'))

    merged = get_model_config(defaults, file_config, override_params)
    merged.model.type = model_type
    return merged
def get_prediction(model_name, image, checkpoint_file=None,
                   classes_file=None):
    """Run the model `model_name` on `image` and return its detections.

    The graph, session and tensors built for a model are cached in
    `LOADED_MODELS` under `model_name`. On a cache hit the cached session
    is reused and `checkpoint_file` is not consulted.

    Args:
        model_name: Name handed to `get_model`; also the cache key.
        image: Image passed to `resize_image` before being fed to the model.
        checkpoint_file: Optional checkpoint restored when the model is
            first built; without it the variables are freshly initialized.
        classes_file: Optional path to a JSON file used to translate the
            predicted labels into class names.

    Returns:
        A dict with the keys 'objects', 'objects_labels',
        'objects_labels_prob', 'inference_time' (seconds spent in
        `session.run`) and 'scale_factor' (as returned by `resize_image`).
    """
    model_class = get_model(model_name)

    if model_name in LOADED_MODELS:
        image_tensor, output, graph, session = LOADED_MODELS[model_name]
    else:
        graph = tf.Graph()
        session = tf.Session(graph=graph)

        with graph.as_default():
            image_tensor = tf.placeholder(tf.float32, (1, None, None, 3))
            model = model_class(model_class.base_config)
            output = model(image_tensor)
            if checkpoint_file:
                saver = tf.train.Saver(sharded=True, allow_empty=True)
                saver.restore(session, checkpoint_file)
            else:
                init_op = tf.group(tf.global_variables_initializer(),
                                   tf.local_variables_initializer())
                session.run(init_op)

        LOADED_MODELS[model_name] = (image_tensor, output, graph, session)

    classification_prediction = output['classification_prediction']
    objects_tf = classification_prediction['objects']
    objects_labels_tf = classification_prediction['labels']
    objects_labels_prob_tf = classification_prediction['probs']

    image_resize_config = model_class.base_config.dataset.image_preprocessing
    image_array, scale_factor = resize_image(
        image,
        float(image_resize_config.min_size),
        float(image_resize_config.max_size))

    # Only the forward pass is timed; resizing is excluded.
    start_time = time.time()
    objects, objects_labels, objects_labels_prob = session.run(
        [objects_tf, objects_labels_tf, objects_labels_prob_tf],
        feed_dict={image_tensor: image_array})
    end_time = time.time()

    if classes_file:
        # Gets the names of the classes. The file is opened in a `with`
        # block so the handle is closed as soon as the JSON is parsed.
        with tf.gfile.GFile(classes_file) as classes_fp:
            class_labels = json.load(classes_fp)
        objects_labels = [class_labels[obj] for obj in objects_labels]
    else:
        objects_labels = objects_labels.tolist()

    return {
        'objects': objects.tolist(),
        'objects_labels': objects_labels,
        'objects_labels_prob': objects_labels_prob.tolist(),
        'inference_time': end_time - start_time,
        'scale_factor': scale_factor,
    }
def get_config(self, model_type, override_params=None):
    """Return a config for `model_type` based on the 'fasterrcnn' defaults.

    Custom values come from `self.config.config_files`; after merging,
    `model.type` is replaced by `model_type`.
    """
    custom = load_config_files(self.config.config_files)
    fasterrcnn_cls = get_model('fasterrcnn')
    base = get_base_config(fasterrcnn_cls)

    result = get_model_config(base, custom, override_params)
    result.model.type = model_type
    return result
def eval(
    dataset_split,
    config_files,
    watch,
    from_global_step,
    override_params,
    files_per_class,
    max_detections,
):
    """Evaluate models using dataset.

    Builds the model graph once and then evaluates the checkpoints found
    in the run directory (`<job_dir>/<run_name>`), writing results through
    a single summary writer.

    Args:
        dataset_split: Dataset split to evaluate; also used to prefix the
            loss and metric names.
        config_files: Config files forwarded to `get_config`.
        watch: When truthy, keep polling for new checkpoints instead of
            returning after the first pass.
        from_global_step: Initial value passed to `get_checkpoints` as the
            last evaluated global step.
        override_params: Overrides forwarded to `get_config`.
        files_per_class: Forwarded to `evaluate_once`.
        max_detections: Maximum number of detections written into the
            model config before the model is built.

    Raises:
        KeyError: If the config lacks `model.type`, `train.job_dir` or
            `train.run_name`.
        ValueError: If the model type is neither 'fasterrcnn' nor 'ssd',
            or (when not watching) if `get_checkpoints` raises it.
    """
    # If the config file is empty, our config will be the base_config for
    # the default model.
    try:
        config = get_config(config_files, override_params=override_params)
    except KeyError:
        raise KeyError("model.type should be set on the custom config.")

    if not config.train.job_dir:
        raise KeyError("`job_dir` should be set.")
    if not config.train.run_name:
        raise KeyError("`run_name` should be set.")

    # `run_dir` is where the actual checkpoint and logs are located.
    run_dir = os.path.join(config.train.job_dir, config.train.run_name)

    # Only activate debug if needed for the debug visualization mode.
    if not config.train.debug:
        config.train.debug = config.eval.image_vis == "debug"

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    # Build the dataset tensors, overriding the default dataset split.
    config.dataset.split = dataset_split
    # Disable data augmentation.
    config.dataset.data_augmentation = []

    # Attempt to get class names, if available. The file is read inside a
    # `with` block so the handle is closed once the JSON has been parsed.
    classes_file = os.path.join(config.dataset.dir, "classes.json")
    if tf.gfile.Exists(classes_file):
        with tf.gfile.GFile(classes_file) as classes_fp:
            class_labels = json.load(classes_fp)
    else:
        class_labels = None

    if config.model.type == "fasterrcnn":
        # Override max detections with specified value.
        if config.model.network.with_rcnn:
            config.model.rcnn.proposals.total_max_detections = max_detections
        else:
            config.model.rpn.proposals.post_nms_top_n = max_detections

        # Also overwrite `min_prob_threshold` in order to use all
        # detections.
        config.model.rcnn.proposals.min_prob_threshold = 0.0
    elif config.model.type == "ssd":
        config.model.proposals.total_max_detections = max_detections
        config.model.proposals.min_prob_threshold = 0.0
    else:
        raise ValueError("Model type '{}' not supported".format(
            config.model.type))

    # Only a single run over the dataset to calculate metrics.
    config.train.num_epochs = 1

    # Seed setup.
    if config.train.seed:
        tf.set_random_seed(config.train.seed)

    # Set pretrained as not training.
    config.model.base_network.trainable = False

    model_class = get_model(config.model.type)
    model = model_class(config)
    dataset_class = get_dataset(config.dataset.type)
    dataset = dataset_class(config)
    train_dataset = dataset()

    train_image = train_dataset["image"]
    train_objects = train_dataset["bboxes"]
    train_filename = train_dataset["filename"]

    # Build the graph of the model to evaluate, retrieving required
    # intermediate tensors.
    prediction_dict = model(train_image, train_objects)

    if config.model.type == "ssd" or config.model.network.with_rcnn:
        pred = prediction_dict["classification_prediction"]
        pred_objects = pred["objects"]
        pred_objects_classes = pred["labels"]
        pred_objects_scores = pred["probs"]
    else:
        # Force the num_classes to 1.
        config.model.network.num_classes = 1

        pred = prediction_dict["rpn_prediction"]
        pred_objects = pred["proposals"]
        pred_objects_scores = pred["scores"]
        # When using only RPN all classes are 0.
        pred_objects_classes = tf.zeros((tf.shape(pred_objects_scores)[0], ),
                                        dtype=tf.int32)

    # Retrieve *all* the losses from the model and calculate their
    # streaming means, so we get the loss over the whole dataset.
    batch_losses = model.loss(prediction_dict, return_all=True)
    losses = {}
    for loss_name, loss_tensor in batch_losses.items():
        loss_mean, _ = tf.metrics.mean(
            loss_tensor,
            name=loss_name,
            metrics_collections="metrics",
            updates_collections="metric_ops",
        )
        full_loss_name = "{}_losses/{}".format(dataset_split, loss_name)
        losses[full_loss_name] = loss_mean

    metric_ops = tf.get_collection("metric_ops")

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    # Using a global saver instead of the one for the model.
    saver = tf.train.Saver(sharded=True, allow_empty=True)

    # Aggregate the required ops to evaluate into a dict.
    ops = {
        "init_op": init_op,
        "metric_ops": metric_ops,
        "pred_objects": pred_objects,
        "pred_objects_classes": pred_objects_classes,
        "pred_objects_scores": pred_objects_scores,
        "train_objects": train_objects,
        "losses": losses,
        "prediction_dict": prediction_dict,
        "filename": train_filename,
        "train_image": train_image,
    }

    metrics_scope = "{}_metrics".format(dataset_split)

    # Use global writer for all checkpoints. We don't want to write
    # different files for each checkpoint.
    writer = tf.summary.FileWriter(run_dir)

    files_to_visualize = {}

    last_global_step = from_global_step
    while True:
        # Get the checkpoint files to evaluate.
        try:
            checkpoints = get_checkpoints(run_dir, last_global_step,
                                          last_only=not watch)
        except ValueError as e:
            if not watch:
                tf.logging.error("Missing checkpoint.")
                raise e

            tf.logging.warning(
                "Missing checkpoint; Checking again in a moment")
            time.sleep(5)
            continue

        for checkpoint in checkpoints:
            # Always returned in order, so it's safe to assign directly.
            tf.logging.info(
                "Evaluating global_step {} using checkpoint '{}'".format(
                    checkpoint["global_step"], checkpoint["file"]))
            try:
                start = time.time()
                evaluate_once(
                    config,
                    writer,
                    saver,
                    ops,
                    checkpoint,
                    class_labels=class_labels,
                    metrics_scope=metrics_scope,
                    image_vis=config.eval.image_vis,
                    files_per_class=files_per_class,
                    files_to_visualize=files_to_visualize,
                )
                last_global_step = checkpoint["global_step"]
                tf.logging.info("Evaluated in {:.2f}s".format(time.time() -
                                                              start))
            except tf.errors.NotFoundError:
                # The checkpoint is not ready yet. It was written in the
                # checkpoints file, but it still hasn't been completely
                # saved.
                tf.logging.info("Checkpoint {} is not ready yet. "
                                "Checking again in a moment.".format(
                                    checkpoint["file"]))
                time.sleep(5)
                continue

        # If no watching was requested, finish the execution.
        if not watch:
            return

        # Sleep for a moment and check for new checkpoints.
        tf.logging.info("All checkpoints evaluated; sleeping for a moment")
        time.sleep(5)
def evaluate(dataset_split, config_files, job_dir, watch, from_global_step,
             override_params, files_per_class):
    """Evaluate models using dataset.

    Builds the model graph once, then evaluates every checkpoint returned
    by `get_checkpoints`, writing through one summary writer rooted at
    `config.train.job_dir`.

    Args:
        dataset_split: Dataset split to evaluate; also used to prefix the
            loss and metric names.
        config_files: Config files forwarded to `get_config`.
        job_dir: When truthy, replaces `config.train.job_dir`.
        watch: When truthy, keep polling for new checkpoints (every 60s)
            instead of returning after the first pass.
        from_global_step: Initial "last evaluated" global step passed to
            `get_checkpoints`.
        override_params: Overrides forwarded to `get_config`.
        files_per_class: Forwarded to `evaluate_once`.

    Raises:
        KeyError: If `get_config` raises `KeyError` (re-raised with a
            message about `model.type`).
        ValueError: Re-raised from `get_checkpoints` when not watching.
    """
    # If the config file is empty, our config will be the base_config for
    # the default model.
    try:
        config = get_config(config_files, override_params=override_params)
    except KeyError:
        raise KeyError('model.type should be set on the custom config.')

    # An explicit `job_dir` argument takes precedence over the config value.
    config.train.job_dir = job_dir or config.train.job_dir

    # Only activate debug if needed for the debug visualization mode.
    if not config.train.debug:
        config.train.debug = config.eval.image_vis == 'debug'

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    # Build the dataset tensors, overriding the default dataset split.
    config.dataset.split = dataset_split
    # Disable data augmentation.
    config.dataset.data_augmentation = []

    # Only a single run over the dataset to calculate metrics.
    config.train.num_epochs = 1

    # Seed setup.
    if config.train.seed:
        tf.set_random_seed(config.train.seed)

    # Set pretrained as not training.
    config.model.base_network.trainable = False

    model_class = get_model(config.model.type)
    model = model_class(config)
    dataset_class = get_dataset(config.dataset.type)
    dataset = dataset_class(config)
    train_dataset = dataset()

    train_image = train_dataset['image']
    train_objects = train_dataset['bboxes']
    train_filename = train_dataset['filename']

    # Build the graph of the model to evaluate, retrieving required
    # intermediate tensors.
    prediction_dict = model(
        train_image, train_objects
    )

    if config.model.network.with_rcnn:
        pred = prediction_dict['classification_prediction']
        pred_objects = pred['objects']
        pred_objects_classes = pred['labels']
        pred_objects_scores = pred['probs']
    else:
        # Force the num_classes to 1.
        config.model.network.num_classes = 1

        pred = prediction_dict['rpn_prediction']
        pred_objects = pred['proposals']
        pred_objects_scores = pred['scores']
        # When using only RPN all classes are 0.
        pred_objects_classes = tf.zeros(
            (tf.shape(pred_objects_scores)[0],), dtype=tf.int32
        )

    # Retrieve *all* the losses from the model and calculate their
    # streaming means, so we get the loss over the whole dataset.
    batch_losses = model.loss(prediction_dict, return_all=True)
    losses = {}
    for loss_name, loss_tensor in batch_losses.items():
        loss_mean, _ = tf.metrics.mean(
            loss_tensor, name=loss_name,
            metrics_collections='metrics',
            updates_collections='metric_ops',
        )
        full_loss_name = '{}_losses/{}'.format(dataset_split, loss_name)
        losses[full_loss_name] = loss_mean

    # The update ops were registered above via `updates_collections`.
    metric_ops = tf.get_collection('metric_ops')

    init_op = tf.group(
        tf.global_variables_initializer(),
        tf.local_variables_initializer()
    )

    # Using a global saver instead of the one for the model.
    saver = tf.train.Saver(sharded=True, allow_empty=True)

    # Aggregate the required ops to evaluate into a dict.
    ops = {
        'init_op': init_op,
        'metric_ops': metric_ops,
        'pred_objects': pred_objects,
        'pred_objects_classes': pred_objects_classes,
        'pred_objects_scores': pred_objects_scores,
        'train_objects': train_objects,
        'losses': losses,
        'prediction_dict': prediction_dict,
        'filename': train_filename,
        'train_image': train_image
    }

    metrics_scope = '{}_metrics'.format(dataset_split)

    # Use global writer for all checkpoints. We don't want to write
    # different files for each checkpoint.
    writer = tf.summary.FileWriter(config.train.job_dir)

    # Passed to every `evaluate_once` call, so the same dict object is
    # shared across checkpoints.
    files_to_visualize = {}

    last_global_step = from_global_step
    while True:
        # Get the checkpoint files to evaluate.
        try:
            checkpoints = get_checkpoints(config, last_global_step)
        except ValueError as e:
            if not watch:
                tf.logging.error('Missing checkpoint.')
                raise e

            tf.logging.warning(
                'Missing checkpoint; Checking again in a minute')
            time.sleep(60)
            continue

        for checkpoint in checkpoints:
            # Always returned in order, so it's safe to assign directly.
            tf.logging.info(
                'Evaluating global_step {} using checkpoint \'{}\''.format(
                    checkpoint['global_step'], checkpoint['file']
                )
            )
            try:
                start = time.time()
                evaluate_once(
                    config, writer, saver, ops, checkpoint,
                    metrics_scope=metrics_scope,
                    image_vis=config.eval.image_vis,
                    files_per_class=files_per_class,
                    files_to_visualize=files_to_visualize
                )
                # Only advanced after a successful evaluation.
                last_global_step = checkpoint['global_step']
                tf.logging.info('Evaluated in {:.2f}s'.format(
                    time.time() - start
                ))
            except tf.errors.NotFoundError:
                # The checkpoint is not ready yet. It was written in the
                # checkpoints file, but it still hasn't been completely
                # saved.
                tf.logging.info(
                    'Checkpoint {} is not ready yet. '
                    'Checking again in a minute.'.format(
                        checkpoint['file']
                    )
                )
                time.sleep(60)
                continue

        # If no watching was requested, finish the execution.
        if not watch:
            return

        # Sleep for a minute and check for new checkpoints.
        tf.logging.info('All checkpoints evaluated; sleeping for a minute')
        time.sleep(60)
def evaluate(model_type, dataset_split, config_file, job_dir, watch,
             from_global_step, override_params, image_vis, files_per_class):
    """Evaluate models using dataset.

    Builds the graph for `model_type` once, then evaluates every checkpoint
    returned by `get_checkpoints`, writing through one summary writer
    rooted at `config.train.job_dir`.

    Args:
        model_type: Name handed to `get_model`.
        dataset_split: Dataset split to evaluate; also used to prefix the
            loss and metric names.
        config_file: Custom config forwarded to `get_model_config`.
        job_dir: When truthy, replaces `config.train.job_dir`.
        watch: When truthy, keep polling for new checkpoints (every 60s)
            instead of returning after the first pass.
        from_global_step: Initial "last evaluated" global step passed to
            `get_checkpoints`.
        override_params: Overrides forwarded to `get_model_config`.
        image_vis: Stored as `config.train.debug` and forwarded to
            `evaluate_once`.
        files_per_class: Forwarded to `evaluate_once`.

    Raises:
        ValueError: Re-raised from `get_checkpoints` when not watching.
    """
    model_cls = get_model(model_type)
    # The merged config is the only one used from here on.
    config = get_model_config(model_cls.base_config, config_file,
                              override_params)

    # An explicit `job_dir` argument takes precedence over the config value.
    config.train.job_dir = job_dir or config.train.job_dir

    # Only activate debug for image visualizations.
    config.train.debug = image_vis

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    # Build the dataset tensors, overriding the default dataset split.
    config.dataset.split = dataset_split
    # Disable data augmentation.
    config.dataset.data_augmentation = []

    # Only a single run over the dataset to calculate metrics.
    config.train.num_epochs = 1

    # Seed setup.
    if config.train.seed:
        tf.set_random_seed(config.train.seed)

    # Set pretrained as not training.
    config.base_network.trainable = False

    model = model_cls(config)
    dataset = TFRecordDataset(config)
    train_dataset = dataset()

    train_image = train_dataset['image']
    train_objects = train_dataset['bboxes']
    train_filename = train_dataset['filename']

    # TODO: This is not the best place to configure rank? Why is rank not
    # transmitted through the queue
    train_image.set_shape((None, None, 3))
    # We add fake batch dimension to train data. TODO: DEFINITELY NOT THE
    # BEST PLACE
    train_image = tf.expand_dims(train_image, 0)

    # Build the graph of the model to evaluate, retrieving required
    # intermediate tensors.
    prediction_dict = model(train_image, train_objects)

    pred = prediction_dict['classification_prediction']
    pred_objects = pred['objects']
    pred_objects_classes = pred['labels']
    pred_objects_scores = pred['probs']

    # Retrieve *all* the losses from the model and calculate their
    # streaming means, so we get the loss over the whole dataset.
    batch_losses = model.loss(prediction_dict, return_all=True)
    losses = {}
    for loss_name, loss_tensor in batch_losses.items():
        loss_mean, _ = tf.metrics.mean(
            loss_tensor, name=loss_name,
            metrics_collections='metrics',
            updates_collections='metric_ops',
        )
        full_loss_name = '{}_losses/{}'.format(dataset_split, loss_name)
        losses[full_loss_name] = loss_mean

    metric_ops = tf.get_collection('metric_ops')

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    # Using a global saver instead of the one for the model.
    saver = tf.train.Saver(sharded=True, allow_empty=True)

    # Aggregate the required ops to evaluate into a dict.
    ops = {
        'init_op': init_op,
        'metric_ops': metric_ops,
        'pred_objects': pred_objects,
        'pred_objects_classes': pred_objects_classes,
        'pred_objects_scores': pred_objects_scores,
        'train_objects': train_objects,
        'losses': losses,
        'prediction_dict': prediction_dict,
        'filename': train_filename
    }

    metrics_scope = '{}_metrics'.format(dataset_split)

    # Use global writer for all checkpoints. We don't want to write
    # different files for each checkpoint.
    writer = tf.summary.FileWriter(config.train.job_dir)

    files_to_visualize = {}

    last_global_step = from_global_step
    while True:
        # Get the checkpoint files to evaluate.
        try:
            checkpoints = get_checkpoints(config, last_global_step)
        except ValueError as e:
            if not watch:
                tf.logging.error('Missing checkpoint.')
                raise e

            tf.logging.warning(
                'Missing checkpoint; Checking again in a minute')
            time.sleep(60)
            continue

        for checkpoint in checkpoints:
            # Always returned in order, so it's safe to assign directly.
            tf.logging.info(
                'Evaluating global_step {} using checkpoint \'{}\''.format(
                    checkpoint['global_step'], checkpoint['file']))
            try:
                start = time.time()
                evaluate_once(writer, saver, ops,
                              config.network.num_classes, checkpoint,
                              metrics_scope=metrics_scope,
                              image_vis=image_vis,
                              files_per_class=files_per_class,
                              files_to_visualize=files_to_visualize)
                # Only advanced after a successful evaluation.
                last_global_step = checkpoint['global_step']
                tf.logging.info('Evaluated in {:.2f}s'.format(time.time() -
                                                              start))
            except tf.errors.NotFoundError:
                # The checkpoint is not ready yet. It was written in the
                # checkpoints file, but it still hasn't been completely
                # saved.
                tf.logging.info('Checkpoint {} is not ready yet. '
                                'Checking again in a minute.'.format(
                                    checkpoint['file']))
                time.sleep(60)
                continue

        # If no watching was requested, finish the execution.
        if not watch:
            return

        # Sleep for a minute and check for new checkpoints.
        tf.logging.info('All checkpoints evaluated; sleeping for a minute')
        time.sleep(60)
def run_local(config, environment=None):
    """Train the configured model in a single local process.

    One forward/backward pass is built per GPU device
    (`config.train.num_gpus`, defaulting to 1). The per-device gradients
    are combined through `batch_allreduce` and applied on every device.
    Training runs inside a `MonitoredTrainingSession` until the input
    pipeline raises `OutOfRangeError` or the coordinator stops.

    Args:
        config: Full config; `config.model`, `config.dataset` and
            `config.train` are read.
        environment: Forwarded to `save_run` after the first step.

    Returns:
        The last global step value fetched from the session, or -1 if no
        step was run.
    """
    model_class = get_model(config.model.type)

    image_vis = config.train.get('image_vis')
    var_vis = config.train.get('var_vis')

    # Used only in the end-of-training log message. This function never
    # defined it, so reaching that message relied on a name from outside
    # this function.
    log_prefix = ''

    if config.train.get('seed') is not None:
        tf.set_random_seed(config.train.seed)

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    model = model_class(config)

    num_gpus = config.train.get('num_gpus')
    if num_gpus is None:
        num_gpus = 1
    gpu_devices = ['gpu:{}'.format(i) for i in range(num_gpus)]
    gpu_indices = list(range(num_gpus))

    global_step = tf.train.get_or_create_global_step()

    optimizer = get_optimizer(config.train, global_step)

    def forward_pass_and_gradients(train_dataset):
        """Create forward loss and grads on each device."""
        train_image = train_dataset['image']
        train_bboxes = train_dataset['bboxes']

        prediction_dict = model(train_image, train_bboxes, is_training=True)
        total_loss = model.loss(prediction_dict)

        # TODO: Is this necessary? Couldn't we just get them from the
        # trainable vars collection? We should probably improve our
        # usage of collections.
        trainable_vars = model.get_trainable_vars()

        # Compute and (optionally) clip gradients.
        with tf.name_scope('gradients'):
            grads_and_vars = optimizer.compute_gradients(
                total_loss, trainable_vars)
            if config.train.clip_by_norm:
                grads_and_vars = clip_gradients_by_norm(grads_and_vars)

        return prediction_dict, total_loss, grads_and_vars

    def build_train_ops(device_grads):
        """Reduce gradients across devices and group the apply ops."""
        training_ops = []
        # Strip the variables so only gradients are sent to the all-reduce.
        grads_to_reduce = [[g for g, _ in grad_vars]
                           for grad_vars in device_grads]
        algorithm = batch_allreduce.AllReduceSpecAlgorithm(
            'nccl', gpu_indices, 0, 10)
        reduced_grads, _ = algorithm.batch_all_reduce(grads_to_reduce, 0, 0,
                                                      0)
        # Pair every reduced gradient with its original variable again.
        reduced_device_grads = [[
            (g, v) for g, (_, v) in zip(grads, grad_vars)
        ] for grads, grad_vars in zip(reduced_grads, device_grads)]

        for i, device in enumerate(gpu_devices):
            with tf.device(device):
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_op = optimizer.apply_gradients(
                        reduced_device_grads[i], global_step=global_step)
                training_ops.append(train_op)
        train_ops = tf.group(*(training_ops), name='train_ops_group')
        return train_ops

    try:
        dataset_class = get_dataset(config.dataset.type)
        dataset = dataset_class(config)
    except InvalidDataDirectory as exc:
        tf.logging.error("Error while reading dataset, {}".format(exc))
        sys.exit(1)

    device_losses = []
    device_gradvars = []
    for device in gpu_devices:
        train_dataset = dataset()
        with tf.device(device):
            prediction_dict, loss, gradvars = forward_pass_and_gradients(
                train_dataset)
            device_losses.append(loss)
            device_gradvars.append(gradvars)

    # NOTE: `train_dataset` and `prediction_dict` refer to the last device
    # built in the loop above.
    train_filename = train_dataset['filename']

    train_op = build_train_ops(device_gradvars)

    # Average losses.
    average_loss = tf.reduce_mean(device_losses)

    # Create custom init for slots in optimizer, as we don't save them to
    # our checkpoints. An example of slots in an optimizer are the Momentum
    # variables in MomentumOptimizer. We do this because slot variables can
    # effectively duplicate the size of your checkpoint!
    trainable_vars = model.get_trainable_vars()
    slot_variables = [
        optimizer.get_slot(var, name)
        for name in optimizer.get_slot_names()
        for var in trainable_vars
    ]
    slot_init = tf.variables_initializer(slot_variables,
                                         name='optimizer_slots_initializer')

    # Create saver for saving/restoring model.
    model_saver = tf.train.Saver(
        set(tf.global_variables()) - set(slot_variables),
        name='model_saver',
        max_to_keep=config.train.get('checkpoints_max_keep', 1),
    )

    # Create saver for loading pretrained checkpoint into base network.
    base_checkpoint_vars = model.get_base_network_checkpoint_vars()
    checkpoint_file = model.get_checkpoint_file()
    if base_checkpoint_vars and checkpoint_file:
        base_net_checkpoint_saver = tf.train.Saver(
            base_checkpoint_vars, name='base_net_checkpoint_saver')

        # We'll send this fn to Scaffold init_fn.
        def load_base_net_checkpoint(_, session):
            base_net_checkpoint_saver.restore(session, checkpoint_file)
    else:
        load_base_net_checkpoint = None

    tf.logging.info('Starting training for {}'.format(model))

    run_options = None
    if config.train.full_trace:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

    # Create custom Scaffold to make sure we run our own init_op when model
    # is not restored from checkpoint.
    summary_op = [model.summary]
    summaries = tf.summary.merge_all()
    if summaries is not None:
        summary_op.append(summaries)
    summary_op = tf.summary.merge(summary_op)

    # `ready_for_local_init_op` is hardcoded to 'ready' as local init
    # doesn't depend on global init and `local_init_op` only runs when it
    # is set as 'ready' (an empty string tensor sets it as ready).
    is_chief = True
    local_var_init_op = tf.local_variables_initializer()
    table_init_ops = tf.tables_initializer()
    variable_mgr_init_ops = [local_var_init_op]
    variable_mgr_init_ops.extend([table_init_ops])
    variable_mgr_init_ops.extend([slot_init])
    local_var_init_op_group = tf.group(*variable_mgr_init_ops)

    scaffold = tf.train.Scaffold(
        saver=model_saver,
        init_op=tf.global_variables_initializer() if is_chief else tf.no_op(),
        local_init_op=local_var_init_op_group,
        ready_for_local_init_op=tf.constant([], dtype=tf.string),
        summary_op=summary_op,
        init_fn=load_base_net_checkpoint,
    )

    # Custom hooks for our session.
    hooks = []
    chief_only_hooks = []

    if config.train.tf_debug:
        debug_hook = tf_debug.LocalCLIDebugHook()
        debug_hook.add_tensor_filter('has_inf_or_nan',
                                     tf_debug.has_inf_or_nan)
        hooks.extend([debug_hook])

    if not config.train.job_dir:
        tf.logging.warning(
            '`job_dir` is not defined. Checkpoints and logs will not be saved.'
        )
        checkpoint_dir = None
    elif config.train.run_name:
        # Use run_name when available.
        checkpoint_dir = os.path.join(config.train.job_dir,
                                      config.train.run_name)
    else:
        checkpoint_dir = config.train.job_dir

    # The visualization hooks write into `checkpoint_dir`, so they are only
    # added when a display interval is set AND there is a directory.
    # Parenthesized explicitly: `a or b and c` parses as `a or (b and c)`,
    # which let hooks through with `output_dir=None`.
    should_add_hooks = ((config.train.display_every_steps
                         or config.train.display_every_secs)
                        and checkpoint_dir is not None)
    if should_add_hooks:
        if not config.train.debug and image_vis == 'debug':
            tf.logging.warning('ImageVisHook will not run without debug mode.')
        elif image_vis is not None:
            # ImageVis only runs on the chief.
            chief_only_hooks.append(
                ImageVisHook(prediction_dict,
                             image=train_dataset['image'],
                             gt_bboxes=train_dataset['bboxes'],
                             config=config.model,
                             output_dir=checkpoint_dir,
                             every_n_steps=config.train.display_every_steps,
                             every_n_secs=config.train.display_every_secs,
                             image_visualization_mode=image_vis))

        if var_vis is not None:
            # VarVis only runs on the chief.
            chief_only_hooks.append(
                VarVisHook(
                    every_n_steps=config.train.display_every_steps,
                    every_n_secs=config.train.display_every_secs,
                    mode=var_vis,
                    output_dir=checkpoint_dir,
                    vars_summary=model.vars_summary,
                ))

    step = -1
    target = ''
    config_proto = tf.ConfigProto()
    config_proto.allow_soft_placement = True
    with tf.train.MonitoredTrainingSession(
            master=target,
            is_chief=is_chief,
            checkpoint_dir=checkpoint_dir,
            scaffold=scaffold,
            hooks=hooks,
            chief_only_hooks=chief_only_hooks,
            save_checkpoint_secs=config.train.save_checkpoint_secs,
            save_summaries_steps=config.train.save_summaries_steps,
            save_summaries_secs=config.train.save_summaries_secs,
            config=config_proto,
    ) as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop():
                before = time.time()
                _, train_loss, step, filename = sess.run(
                    [train_op, average_loss, global_step, train_filename],
                    options=run_options)

                # TODO: Add image summary every once in a while.

                tf.logging.info(
                    'step: {}, file: {}, train_loss: {}, in {:.2f}s'.format(
                        step, filename, train_loss,
                        time.time() - before))

                if is_chief and step == 1:
                    # We save the run after first batch to make sure
                    # everything works properly.
                    save_run(config, environment=environment)

        except tf.errors.OutOfRangeError:
            tf.logging.info('{}finished training after {} epoch limit'.format(
                log_prefix, config.train.num_epochs))

            # TODO: Print summary
        finally:
            coord.request_stop()

        # Wait for all threads to stop.
        coord.join(threads)

        return step
def get_prediction(image, config, total=None, session=None, fetches=None,
                   image_tensor=None, class_labels=None,
                   return_tf_vars=False):
    """
    Gets the prediction given by the model `model_type` of the image `image`.
    If a checkpoint exists in the job's directory, load it.
    The names of the classes will be obtained from the dataset directory.
    Returns a dictionary with the objects, their labels and probabilities,
    the inference time and the scale factor.
    Also if the `return_tf_vars` is True, returns the image tensor, the entire
    prediction of the model and the session.

    `session`, `fetches` and `image_tensor` must either all be None (a new
    graph and session are built from `config`) or all be provided (they are
    reused as-is); any other combination raises `ValueError`. When `total`
    is not None, only the first `total` results are kept.
    """
    if session is None and fetches is None and image_tensor is None:
        # Don't use data augmentation in predictions.
        config.dataset.data_augmentation = None

        dataset_class = get_dataset(config.dataset.type)
        model_class = get_model(config.model.type)
        dataset = dataset_class(config)
        model = model_class(config)

        graph = tf.Graph()
        session = tf.Session(graph=graph)

        with graph.as_default():
            image_tensor = tf.placeholder(tf.float32, (None, None, 3))
            image_tf, _, process_meta = dataset.preprocess(image_tensor)
            pred_dict = model(image_tf)

            # Restore checkpoint.
            if config.train.job_dir:
                job_dir = config.train.job_dir
                if config.train.run_name:
                    job_dir = os.path.join(job_dir, config.train.run_name)
                ckpt = tf.train.get_checkpoint_state(job_dir)
                if not ckpt or not ckpt.all_model_checkpoint_paths:
                    raise ValueError(
                        'Could not find checkpoint in {}.'.format(job_dir))
                # Use the last entry of the recorded checkpoint paths.
                ckpt = ckpt.all_model_checkpoint_paths[-1]
                saver = tf.train.Saver(sharded=True, allow_empty=True)
                saver.restore(session, ckpt)
                tf.logging.info('Loaded checkpoint.')
            else:
                # A prediction without checkpoint is just used for testing.
                tf.logging.warning(
                    'Could not load checkpoint. Using initialized model.')
                init_op = tf.group(tf.global_variables_initializer(),
                                   tf.local_variables_initializer())
                session.run(init_op)

            if config.model.network.with_rcnn:
                cls_prediction = pred_dict['classification_prediction']
                objects_tf = cls_prediction['objects']
                objects_labels_tf = cls_prediction['labels']
                objects_labels_prob_tf = cls_prediction['probs']
            else:
                rpn_prediction = pred_dict['rpn_prediction']
                objects_tf = rpn_prediction['proposals']
                objects_labels_prob_tf = rpn_prediction['scores']
                # All labels without RCNN are zero.
                objects_labels_tf = tf.zeros(tf.shape(objects_labels_prob_tf),
                                             dtype=tf.int32)

            fetches = {
                'objects': objects_tf,
                'labels': objects_labels_tf,
                'probs': objects_labels_prob_tf,
                'scale_factor': process_meta['scale_factor']
            }

            # If in debug mode, return the full prediction dictionary.
            if config.train.debug:
                fetches['_debug'] = pred_dict

    elif session is None or fetches is None or image_tensor is None:
        raise ValueError(
            'Either all `session`, `fetches` and `image_tensor` are None, '
            'or neither of them are.')

    # Only the `session.run` call is timed.
    start_time = time.time()
    fetched = session.run(fetches, feed_dict={image_tensor: np.array(image)})
    end_time = time.time()

    objects = fetched['objects']
    objects_labels = fetched['labels']
    objects_labels_prob = fetched['probs']
    scale_factor = fetched['scale_factor']

    objects_labels = objects_labels.tolist()

    # Translate label values through `class_labels` when provided.
    if class_labels is not None:
        objects_labels = [class_labels[obj] for obj in objects_labels]

    # Scale objects to original image dimensions.
    objects /= scale_factor

    objects = objects.tolist()
    objects_labels_prob = objects_labels_prob.tolist()

    if total is not None:
        objects = objects[:total]
        objects_labels = objects_labels[:total]
        objects_labels_prob = objects_labels_prob[:total]

    res = {
        'objects': objects,
        'objects_labels': objects_labels,
        'objects_labels_prob': objects_labels_prob,
        'inference_time': end_time - start_time,
    }

    # Hand back the TF handles so a later call can reuse them.
    if return_tf_vars:
        res['image_tensor'] = image_tensor
        res['fetches'] = fetches
        res['session'] = session

    return res
def train(job_id, service_account_json, bucket_name, region, config_files,
          dataset, scale_tier, master_type, worker_type, worker_count,
          parameter_server_type, parameter_server_count):
    """Package the project, upload the final config and submit a cloud job.

    Args:
        job_id: Identifier of the job. When empty, one is generated from
            the current date and time (`train_YYYYmmdd_HHMMSS`).
        service_account_json: Service account file used to obtain the
            project id, the client id, the credentials and the bucket.
        bucket_name: Bucket used for the package, config and job output.
            When None, `luminoth-<client_id>` is used.
        region: Region where the job runs; validated before anything is
            uploaded.
        config_files: Custom config files merged over the model's base
            config.
        dataset: Location of the dataset. `gs://` is prepended when it is
            missing.
        scale_tier: Value sent as `scaleTier`. The machine types and counts
            below are only sent when it is 'CUSTOM'.
        master_type: Value sent as `masterType`.
        worker_type: Value sent as `workerType`.
        worker_count: Value sent as `workerCount`.
        parameter_server_type: Value sent as `parameterServerType`.
        parameter_server_count: Value sent as `parameterServerCount`; the
            parameter server settings are only sent when it is positive.

    Raises:
        ValueError: If the service account file has no project id.
    """
    project_id = get_project_id(service_account_json)
    if project_id is None:
        raise ValueError(
            'Missing "project_id" in service_account_json "{}"'.format(
                service_account_json))

    if bucket_name is None:
        client_id = get_client_id(service_account_json)
        bucket_name = 'luminoth-{}'.format(client_id)
        click.echo(
            'Bucket name not specified. Using "{}".'.format(bucket_name))

    credentials = get_credentials(service_account_json)
    validate_region(region, project_id, credentials)

    # Creates bucket for logs and models if it doesn't exist
    bucket = get_bucket(service_account_json, bucket_name)

    if not job_id:
        job_id = 'train_{}'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))

    # Define path in bucket to store job's config, logs, etc.
    base_path = 'lumi_{}'.format(job_id)

    package_path = build_package(bucket, base_path)

    # Check if absolute or relative dataset path
    if not dataset.startswith('gs://'):
        dataset = 'gs://{}'.format(dataset)

    # The dataset location is baked into the uploaded config, so the job
    # itself only needs to be pointed at that config file.
    override_params = [
        'dataset.dir={}'.format(dataset),
    ]

    custom_config = load_config(config_files)
    model_class = get_model(custom_config.model.type)
    config = get_model_config(
        model_class.base_config, custom_config, override_params,
    )
    # We should validate config before submitting job

    # Update final config file to job bucket
    config_path = os.path.join(base_path, DEFAULT_CONFIG_FILENAME)
    upload_data(bucket, config_path, dump_config(config))

    args = ['--config', 'gs://{}/{}'.format(bucket_name, config_path)]

    cloudml = cloud_service(credentials, 'ml')

    training_inputs = {
        'scaleTier': scale_tier,
        'packageUris': [
            'gs://{}/{}'.format(bucket_name, package_path)
        ],
        'pythonModule': 'luminoth.train',
        'args': args,
        'region': region,
        'jobDir': 'gs://{}/{}/'.format(bucket_name, base_path),
        'runtimeVersion': RUNTIME_VERSION
    }

    if scale_tier == 'CUSTOM':
        training_inputs['masterType'] = master_type
        training_inputs['workerType'] = worker_type
        training_inputs['workerCount'] = worker_count
        # A missing count (None) is treated like zero instead of failing
        # on the comparison.
        if parameter_server_count and parameter_server_count > 0:
            training_inputs['parameterServerCount'] = parameter_server_count
            training_inputs['parameterServerType'] = parameter_server_type

    job_spec = {
        'jobId': job_id,
        'trainingInput': training_inputs
    }

    jobrequest = cloudml.projects().jobs().create(
        body=job_spec, parent='projects/{}'.format(project_id))

    try:
        click.echo('Submitting training job.')
        res = jobrequest.execute()
        click.echo('Job {} submitted successfully.'.format(job_id))
        click.echo('state = {}, createTime = {}'.format(
            res.get('state'), res.get('createTime')))

        save_run(config, environment='gcloud', extra_config=job_spec)

    except Exception as err:
        # Not every exception exposes `_get_reason`; calling it blindly
        # would raise AttributeError here and hide the actual failure.
        get_reason = getattr(err, '_get_reason', None)
        reason = get_reason() if callable(get_reason) else err
        click.echo(
            'There was an error creating the training job. '
            'Check the details: \n{}'.format(reason)
        )
def eval(dataset_split, config_files, watch, from_global_step,
         override_params, files_per_class, iou_threshold, min_probability):
    """Evaluate models using dataset."""
    # Overview: the model and dataset graph is built once, and then the
    # checkpoints found in the run directory are evaluated with it.
    #   - dataset_split: split to evaluate; also prefixes the loss names
    #     and the metrics scope.
    #   - config_files / override_params: used to build the configuration.
    #   - watch: when falsy, `get_checkpoints` is called with
    #     `last_only=True` and the function returns after one pass. When
    #     truthy, it keeps polling every 5 seconds.
    #   - from_global_step: initial value of the step passed to
    #     `get_checkpoints`.
    #   - files_per_class, iou_threshold: forwarded to `evaluate_once`.
    #   - min_probability: written into the proposals' minimum probability
    #     threshold and forwarded to `evaluate_once`.
    # Raises KeyError when the model type, `job_dir` or `run_name` are
    # missing from the configuration, and re-raises the ValueError of
    # `get_checkpoints` when not watching.

    # If the config file is empty, our config will be the base_config for the
    # default model.
    try:
        config = get_config(config_files, override_params=override_params)
    except KeyError:
        raise KeyError('model.type should be set on the custom config.')

    if not config.train.job_dir:
        raise KeyError('`job_dir` should be set.')
    if not config.train.run_name:
        raise KeyError('`run_name` should be set.')

    # `run_dir` is where the actual checkpoint and logs are located.
    run_dir = os.path.join(config.train.job_dir, config.train.run_name)

    # Only activate debug for if needed for debug visualization mode.
    if not config.train.debug:
        config.train.debug = config.eval.image_vis == 'debug'

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    # Build the dataset tensors, overriding the default dataset split.
    config.dataset.split = dataset_split
    # Disable data augmentation.
    config.dataset.data_augmentation = []

    # Only a single run over the dataset to calculate metrics.
    config.train.num_epochs = 1

    if config.model.network.with_rcnn:
        config.model.rcnn.proposals.min_prob_threshold = min_probability
    else:
        config.model.rpn.proposals.min_prob_threshold = min_probability

    # Seed setup
    if config.train.seed:
        tf.set_random_seed(config.train.seed)

    # Set pretrained as not training
    config.model.base_network.trainable = False

    model_class = get_model(config.model.type)
    model = model_class(config)
    dataset_class = get_dataset(config.dataset.type)
    dataset = dataset_class(config)
    train_dataset = dataset()

    train_image = train_dataset['image']
    train_objects = train_dataset['bboxes']
    train_filename = train_dataset['filename']

    # Build the graph of the model to evaluate, retrieving required
    # intermediate tensors.
    prediction_dict = model(train_image, train_objects)

    if config.model.network.with_rcnn:
        pred = prediction_dict['classification_prediction']
        pred_objects = pred['objects']
        pred_objects_classes = pred['labels']
        pred_objects_scores = pred['probs']
    else:
        # Force the num_classes to 1
        config.model.network.num_classes = 1

        pred = prediction_dict['rpn_prediction']
        pred_objects = pred['proposals']
        pred_objects_scores = pred['scores']
        # When using only RPN all classes are 0.
        pred_objects_classes = tf.zeros(
            (tf.shape(pred_objects_scores)[0],), dtype=tf.int32
        )

    # Retrieve *all* the losses from the model and calculate their streaming
    # means, so we get the loss over the whole dataset.
    batch_losses = model.loss(prediction_dict, return_all=True)
    losses = {}
    for loss_name, loss_tensor in batch_losses.items():
        loss_mean, _ = tf.metrics.mean(
            loss_tensor, name=loss_name,
            metrics_collections='metrics',
            updates_collections='metric_ops',
        )
        full_loss_name = '{}_losses/{}'.format(dataset_split, loss_name)
        losses[full_loss_name] = loss_mean

    metric_ops = tf.get_collection('metric_ops')

    init_op = tf.group(
        tf.global_variables_initializer(),
        tf.local_variables_initializer()
    )

    # Using a global saver instead of the one for the model.
    saver = tf.train.Saver(sharded=True, allow_empty=True)

    # Aggregate the required ops to evaluate into a dict..
    ops = {
        'init_op': init_op,
        'metric_ops': metric_ops,
        'pred_objects': pred_objects,
        'pred_objects_classes': pred_objects_classes,
        'pred_objects_scores': pred_objects_scores,
        'train_objects': train_objects,
        'losses': losses,
        'prediction_dict': prediction_dict,
        'filename': train_filename,
        'train_image': train_image
    }

    metrics_scope = '{}_metrics'.format(dataset_split)

    # Use global writer for all checkpoints. We don't want to write different
    # files for each checkpoint.
    writer = tf.summary.FileWriter(run_dir)

    files_to_visualize = {}

    last_global_step = from_global_step
    # The writer buffers events; close it on every way out of the loop
    # (normal return or error) so pending events are flushed to disk.
    try:
        while True:
            # Get the checkpoint files to evaluate.
            try:
                checkpoints = get_checkpoints(
                    run_dir, last_global_step, last_only=not watch
                )
            except ValueError:
                if not watch:
                    tf.logging.error('Missing checkpoint.')
                    raise

                tf.logging.warning(
                    'Missing checkpoint; Checking again in a moment')
                time.sleep(5)
                continue

            for checkpoint in checkpoints:
                # Always returned in order, so it's safe to assign directly.
                tf.logging.info(
                    'Evaluating global_step {} using checkpoint \'{}\''
                    .format(checkpoint['global_step'], checkpoint['file'])
                )
                try:
                    start = time.time()
                    evaluate_once(
                        config, writer, saver, ops, checkpoint,
                        metrics_scope=metrics_scope,
                        image_vis=config.eval.image_vis,
                        files_per_class=files_per_class,
                        files_to_visualize=files_to_visualize,
                        iou_threshold=iou_threshold,
                        min_probability=min_probability
                    )
                    last_global_step = checkpoint['global_step']
                    tf.logging.info('Evaluated in {:.2f}s'.format(
                        time.time() - start
                    ))
                except tf.errors.NotFoundError:
                    # The checkpoint is not ready yet. It was written in the
                    # checkpoints file, but it still hasn't been completely
                    # saved.
                    tf.logging.info(
                        'Checkpoint {} is not ready yet. '
                        'Checking again in a moment.'.format(
                            checkpoint['file']
                        )
                    )
                    time.sleep(5)
                    continue

            # If no watching was requested, finish the execution.
            if not watch:
                return

            # Sleep for a moment and check for new checkpoints.
            tf.logging.info(
                'All checkpoints evaluated; sleeping for a moment')
            time.sleep(5)
    finally:
        writer.close()
def get_prediction(model_type, image, config_file, session=None,
                   prediction_dict=None, image_tensor=None,
                   return_tf_vars=False):
    """
    Gets the prediction given by the model `model_type` of the image `image`.
    If both `train.job_dir` and `train.run_name` are set in the config, the
    last checkpoint listed in that run directory is loaded; otherwise the
    variables are only initialized, which is just useful for testing.
    When `dataset.dir` is set, the names of the classes are read from its
    `classes.json` file.

    Unless `session`, `prediction_dict` and `image_tensor` are all given, a
    new graph and session are built.

    Returns a dictionary with the objects, their labels and probabilities,
    the inference time and the scale factor. Also if the `return_tf_vars` is
    True, returns the image tensor, the entire prediction of the model and
    the session.

    Raises ValueError if a checkpoint is expected but cannot be found.
    """
    model_class = get_model(model_type)
    config = get_model_config(
        model_class.base_config, config_file, None
    )

    if session is None or prediction_dict is None or image_tensor is None:
        graph = tf.Graph()
        session = tf.Session(graph=graph)

        try:
            with graph.as_default():
                image_tensor = tf.placeholder(
                    tf.float32, (1, None, None, 3))
                # NOTE(review): the model (and the resize bounds below) come
                # from `model_class.base_config`, not from the merged
                # `config`; confirm this is intended.
                model = model_class(model_class.base_config)
                prediction_dict = model(image_tensor)

                # Restore checkpoint
                if config.train.job_dir and config.train.run_name:
                    run_dir = os.path.join(
                        config.train.job_dir, config.train.run_name)
                    ckpt = tf.train.get_checkpoint_state(run_dir)
                    if not ckpt or not ckpt.all_model_checkpoint_paths:
                        # Report the directory that was actually searched.
                        raise ValueError(
                            'Could not find checkpoint in {}.'.format(
                                run_dir))

                    ckpt = ckpt.all_model_checkpoint_paths[-1]
                    ckpt_dir = os.path.join('.', ckpt)
                    saver = tf.train.Saver(sharded=True, allow_empty=True)
                    saver.restore(session, ckpt_dir)
                # A prediction without checkpoint is just used for testing
                else:
                    init_op = tf.group(
                        tf.global_variables_initializer(),
                        tf.local_variables_initializer()
                    )
                    session.run(init_op)
        except Exception:
            # This session was created above and never reaches the caller
            # when setup fails, so release it before propagating the error.
            session.close()
            raise

    classification_prediction = prediction_dict['classification_prediction']
    objects_tf = classification_prediction['objects']
    objects_labels_tf = classification_prediction['labels']
    objects_labels_prob_tf = classification_prediction['probs']

    image_resize_config = model_class.base_config.dataset.image_preprocessing
    image_array, scale_factor = resize_image(
        image, float(image_resize_config.min_size),
        float(image_resize_config.max_size)
    )

    start_time = time.time()
    objects, objects_labels, objects_labels_prob = session.run([
        objects_tf, objects_labels_tf, objects_labels_prob_tf
    ], feed_dict={
        image_tensor: image_array
    })
    end_time = time.time()

    if config.dataset.dir:
        # Gets the names of the classes
        classes_file = os.path.join(config.dataset.dir, 'classes.json')
        # Close the file handle as soon as the labels are parsed.
        with tf.gfile.GFile(classes_file) as classes_fp:
            class_labels = json.load(classes_fp)
        objects_labels = [class_labels[obj] for obj in objects_labels]
    else:
        objects_labels = objects_labels.tolist()

    res = {
        'objects': objects.tolist(),
        'objects_labels': objects_labels,
        'objects_labels_prob': objects_labels_prob.tolist(),
        'inference_time': end_time - start_time,
        'scale_factor': scale_factor,
    }

    if return_tf_vars:
        res['image_tensor'] = image_tensor
        res['prediction_dict'] = prediction_dict
        res['session'] = session

    return res
def __init__(self, config):
    """Build the prediction graph and session described by `config`.

    Sets the following attributes:
        class_labels: Content of `<config.dataset.dir>/classes.json`, or
            None when the directory is not set or the file does not exist.
        session: Session holding the restored (or just initialized) model.
        image_placeholder: Placeholder of shape (None, None, 3) through
            which images are fed.
        fetches: Dictionary with the 'objects', 'labels', 'probs' and
            'scale_factor' tensors, plus the whole prediction dictionary
            under '_debug' when `config.train.debug` is set.

    `config.dataset.data_augmentation` is set to None in place.

    Raises:
        ValueError: If `config.train.job_dir` is set but no checkpoint is
            found there, or if the model type is neither 'ssd' nor
            'fasterrcnn'.
    """
    # Always define the attribute, so it also exists when no dataset
    # directory is configured.
    self.class_labels = None
    if config.dataset.dir:
        # Gets the names of the classes
        classes_file = os.path.join(config.dataset.dir, 'classes.json')
        if tf.gfile.Exists(classes_file):
            # Close the file handle as soon as the labels are parsed.
            with tf.gfile.GFile(classes_file) as classes_fp:
                self.class_labels = json.load(classes_fp)

    # Don't use data augmentation in predictions
    config.dataset.data_augmentation = None

    dataset_class = get_dataset(config.dataset.type)
    model_class = get_model(config.model.type)
    dataset = dataset_class(config)
    model = model_class(config)

    graph = tf.Graph()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    self.session = tf.Session(config=tf_config, graph=graph)

    try:
        with graph.as_default():
            self.image_placeholder = tf.placeholder(
                tf.float32, (None, None, 3)
            )
            image_tf, _, process_meta = dataset.preprocess(
                self.image_placeholder
            )
            pred_dict = model(image_tf)

            # Restore checkpoint
            if config.train.job_dir:
                job_dir = config.train.job_dir
                if config.train.run_name:
                    job_dir = os.path.join(job_dir, config.train.run_name)
                ckpt = tf.train.get_checkpoint_state(job_dir)
                if not ckpt or not ckpt.all_model_checkpoint_paths:
                    raise ValueError(
                        'Could not find checkpoint in {}.'.format(job_dir)
                    )
                ckpt = ckpt.all_model_checkpoint_paths[-1]
                saver = tf.train.Saver(sharded=True, allow_empty=True)
                saver.restore(self.session, ckpt)
                tf.logging.info('Loaded checkpoint.')
            else:
                # A prediction without checkpoint is just used for testing
                tf.logging.warning(
                    'Could not load checkpoint. Using initialized model.')
                init_op = tf.group(
                    tf.global_variables_initializer(),
                    tf.local_variables_initializer()
                )
                self.session.run(init_op)

            if config.model.type == 'ssd':
                cls_prediction = pred_dict['classification_prediction']
                objects_tf = cls_prediction['objects']
                objects_labels_tf = cls_prediction['labels']
                objects_labels_prob_tf = cls_prediction['probs']
            elif config.model.type == 'fasterrcnn':
                if config.model.network.get('with_rcnn', False):
                    cls_prediction = pred_dict['classification_prediction']
                    objects_tf = cls_prediction['objects']
                    objects_labels_tf = cls_prediction['labels']
                    objects_labels_prob_tf = cls_prediction['probs']
                else:
                    rpn_prediction = pred_dict['rpn_prediction']
                    objects_tf = rpn_prediction['proposals']
                    objects_labels_prob_tf = rpn_prediction['scores']
                    # All labels without RCNN are zero
                    objects_labels_tf = tf.zeros(
                        tf.shape(objects_labels_prob_tf), dtype=tf.int32
                    )
            else:
                raise ValueError(
                    "Model type '{}' not supported".format(
                        config.model.type)
                )
    except Exception:
        # The instance is unusable when setup fails, so do not leave its
        # session open.
        self.session.close()
        raise

    self.fetches = {
        'objects': objects_tf,
        'labels': objects_labels_tf,
        'probs': objects_labels_prob_tf,
        'scale_factor': process_meta['scale_factor']
    }

    # If in debug mode, return the full prediction dictionary.
    if config.train.debug:
        self.fetches['_debug'] = pred_dict
def detect_tile_nuclei(slide_path, tile_position, args, it_kwargs,
                       src_mu_lab=None, src_sigma_lab=None, debug=False):
    """Detect nuclei in one tile of a slide, timing every stage.

    The tile is loaded, color-normalized and deconvolved into two stain
    channels. Their pixel-wise minimum is replicated into a 3-channel image
    and fed to a 'fasterrcnn' model restored from `CKPT_DIR`. The detections
    are finally converted into annotations.

    Args:
        slide_path: Path of the slide opened with `large_image`.
        tile_position: Position of the tile, passed to `getSingleTile`.
        args: Options object. The attributes read here are `deconv_method`
            ('ruifrok' or 'macenko'), `stain_1`, `stain_2`, `max_det`,
            `min_prob`, `ignore_border_nuclei`, `analysis_tile_size` and
            `nuclei_annotation_format`; it is also handed to
            `cli_utils.get_stain_matrix`.
        it_kwargs: Extra keyword arguments for `getSingleTile`.
        src_mu_lab: Forwarded to the Reinhard normalization as `src_mu`.
        src_sigma_lab: Forwarded to the Reinhard normalization as
            `src_sigma`.
        debug: Currently unused.

    Returns:
        A dictionary with the timings in seconds, rounded to 3 decimals
        ('PreparationTime', 'ColorDeconvTime', 'TotalTileLoadingTime',
        'CKPTLoadingTime', 'ModelInfernceTime', 'DetectionTime',
        'AnnotationWritingTime'), the 'ROIShape', the predictions sorted by
        decreasing probability ('ObjectsDict') and their count
        ('NumObjects'), and the 'AnnotationDict' and 'AnalysisDict' lists.

    Raises:
        ValueError: If `args.deconv_method` is not a supported method.
    """
    # =========================================================================
    # ======================= Tile Loading ====================================
    # =========================================================================
    print('\n>> Loading Tile ... \n')

    # Every key starts as an empty list and is overwritten once its stage
    # has run; the order of the keys is kept as is.
    csv_dict = {
        key: [] for key in (
            'PreparationTime', 'ColorDeconvTime', 'TotalTileLoadingTime',
            'CKPTLoadingTime', 'ModelInfernceTime', 'DetectionTime',
            'ROIShape', 'ObjectsDict', 'NumObjects',
            'AnnotationWritingTime', 'AnnotationDict', 'AnalysisDict',
        )
    }

    start_time = time.time()
    total_tileloading_start_time = time.time()

    ts = large_image.getTileSource(slide_path)
    tile_info = ts.getSingleTile(
        tile_position=tile_position,
        format=large_image.tilesource.TILE_FORMAT_NUMPY,
        **it_kwargs)

    # Keep the first three channels only.
    im_tile = tile_info['tile'][:, :, :3]
    csv_dict['ROIShape'] = im_tile.shape[:2]

    prep_time = time.time() - start_time
    csv_dict['PreparationTime'] = round(prep_time, 3)

    # =========================================================================
    # =================Img Normalization & Color Deconv========================
    # =========================================================================
    print('\n>> Color Deconvolving ... \n')
    start_time = time.time()

    im_nmzd = htk_cnorm.reinhard(
        im_tile,
        REFERENCE_MU_LAB,
        REFERENCE_STD_LAB,
        src_mu=src_mu_lab,
        src_sigma=src_sigma_lab
    )

    # perform color deconvolution
    # `float` replaces the `np.float` alias, which newer NumPy releases no
    # longer provide; both name the same dtype.
    if args.deconv_method == 'ruifrok':
        w = cli_utils.get_stain_matrix(args)
        im_stains = htk_cdeconv.color_deconvolution(
            im_nmzd, w).Stains.astype(float)[:, :, :2]
    elif args.deconv_method == 'macenko':
        w_est = htk_cdeconv.rgb_separate_stains_macenko_pca(im_tile, 255)
        im_stains = htk_cdeconv.color_deconvolution(
            im_tile, w_est, 255).Stains.astype(float)
        ch1 = htk_cdeconv.find_stain_index(
            htk_cdeconv.stain_color_map[args.stain_1], w_est)
        ch2 = htk_cdeconv.find_stain_index(
            htk_cdeconv.stain_color_map[args.stain_2], w_est)
        im_stains = im_stains[:, :, [ch1, ch2]]
    else:
        raise ValueError('Invalid deconvolution method parameter.')

    # =========================================================================
    # ====================== Fuse the stain1 & stain2 pix======================
    # =========================================================================
    # The detector input is the pixel-wise minimum of both stain channels.
    im_nuc_det_input = np.squeeze(np.min(im_stains[:, :, :2], axis=2))
    print('---> Fusing 2 Stains')

    deconv_time = time.time() - start_time
    csv_dict['ColorDeconvTime'] = round(deconv_time, 3)

    # =========================================================================
    # ================= Nuclei Detection Deep Learning Block ==================
    # =========================================================================
    total_tileloading_time = time.time() - total_tileloading_start_time
    csv_dict['TotalTileLoadingTime'] = round(total_tileloading_time, 3)

    start_time = time.time()

    config = get_config(CONFIG)
    config.model.rcnn.proposals.total_max_detections = args.max_det
    config.model.rcnn.proposals.min_prob_threshold = args.min_prob

    # Replicate the single channel three times to get an (H, W, 3) input.
    im_nuc_det_input = np.stack((im_nuc_det_input,) * 3, axis=-1)

    tf.reset_default_graph()

    # NOTE(review): `dataset` is not used by the inference below; it is
    # still built in case its construction matters. Confirm before
    # removing.
    dataset_class = get_dataset('object_detection')
    model_class = get_model('fasterrcnn')
    dataset = dataset_class(config)
    model = model_class(config)

    graph = tf.Graph()
    session = tf.Session(graph=graph)

    # A new session is created on every call; close it once the results
    # have been fetched (or on failure) so sessions do not pile up across
    # tiles.
    try:
        with graph.as_default():
            image_placeholder = tf.placeholder(
                tf.float32, (None, None, 3), name='Input_Placeholder'
            )
            pred_dict = model(image_placeholder)

            ckpt_loading_start_time = time.time()

            saver = tf.train.Saver(sharded=True, allow_empty=True)
            saver.restore(session, CKPT_DIR)
            tf.logging.info('Loaded checkpoint.')

            ckpt_loading_time = time.time() - ckpt_loading_start_time
            csv_dict['CKPTLoadingTime'] = round(ckpt_loading_time, 3)

            inference_start_time = time.time()

            cls_prediction = pred_dict['classification_prediction']
            objects_tf = cls_prediction['objects']
            objects_labels_tf = cls_prediction['labels']
            objects_labels_prob_tf = cls_prediction['probs']

            fetches = {
                'objects': objects_tf,
                'labels': objects_labels_tf,
                'probs': objects_labels_prob_tf,
            }

            fetched = session.run(fetches, feed_dict={
                image_placeholder: np.array(im_nuc_det_input)
            })
    finally:
        session.close()

    inference_time = time.time() - inference_start_time
    csv_dict['ModelInfernceTime'] = round(inference_time, 3)

    objects = fetched['objects']
    labels = fetched['labels'].tolist()
    probs = fetched['probs'].tolist()

    # Cast to int to consistently return the same type in Python 2 and 3
    objects = [
        [int(round(coord)) for coord in obj]
        for obj in objects.tolist()
    ]

    predictions = sorted([
        {
            'bbox': obj,
            'label': label,
            'prob': round(prob, 4),
        } for obj, label, prob in zip(objects, labels, probs)
    ], key=lambda x: x['prob'], reverse=True)

    print('\n>> Finishing Detection ... \n')
    print('***** Number of Detected Cells ****** : ', len(predictions))

    detection_time = time.time() - start_time
    csv_dict['DetectionTime'] = round(detection_time, 3)
    csv_dict['NumObjects'] = len(predictions)
    csv_dict['ObjectsDict'] = predictions

    # =========================================================================
    # ======================= TODO: Implement border deletion =================
    # =========================================================================

    # =========================================================================
    # ======================= Write Annotations ===============================
    # =========================================================================
    start_time = time.time()

    objects_df = pd.DataFrame(objects)

    formatted_annot_list, formatter_analysis_list = (
        cli_utils.convert_preds_to_utilformat(
            objects_df,
            probs,
            args.ignore_border_nuclei,
            im_tile_size=args.analysis_tile_size)
    )

    nuclei_annot_list = cli_utils.create_tile_nuclei_annotations(
        formatted_annot_list, tile_info, args.nuclei_annotation_format)

    csv_dict['AnnotationDict'] = nuclei_annot_list
    csv_dict['AnalysisDict'] = formatter_analysis_list

    anot_time = time.time() - start_time
    csv_dict['AnnotationWritingTime'] = round(anot_time, 3)

    return csv_dict