def _path(self, key, is_saving):
    directory = self._directory(is_saving)
    log.debug('Using search path {!r} for checkpoints.'.format(directory))
    if isinstance(key, int):
        cp_name = '{}-{}'.format(self._checkpoint_basename, key)
    elif key == self._checkpoint_latest and not is_saving:
        manifest_file = os.path.join(directory, 'checkpoint')
        try:
            with open(manifest_file, 'r') as f:
                manifest = yaml.load(f)
        except FileNotFoundError:
            raise CheckpointManifestNotFoundError(
                'Manifest for the latest checkpoint cannot be found.')
        cp_name = manifest['model_checkpoint_path']
    else:
        cp_name = key
    if is_saving:
        # ensure the directory exists
        os.makedirs(directory, exist_ok=True)
        return os.path.join(directory, cp_name)
    # loading
    path = os.path.join(directory, cp_name)
    log.info('Loading checkpoint from {!r}...'.format(path))
    if not os.path.exists(path + '.index'):
        raise CheckpointNotFoundError(
            'Checkpoint {!r} not found.'.format(path))
    return path
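# Note (illustration, not part of the original code): the `checkpoint`
# manifest read by `_path` above is the plain-text index that
# `tf.train.Saver` writes next to its checkpoint files, and it is simple
# enough to parse as YAML, e.g.:
#
#   model_checkpoint_path: "checkpoint-100"
#   all_model_checkpoint_paths: "checkpoint-50"
#   all_model_checkpoint_paths: "checkpoint-100"
#
# so `manifest['model_checkpoint_path']` yields the name of the most recent
# checkpoint; the repeated `all_model_checkpoint_paths` keys collapse under
# YAML parsing but are not needed here.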
def cli_eval_all(self):
    """Evaluates all checkpoints for accuracy."""
    result = self._get_session('validate').eval_all()
    file_name = 'eval_all.csv'
    with open(file_name, 'w') as f:
        f.write(result.csv())
    log.info('Evaluation results saved in {!r}.'.format(file_name))
def run(self, ops, batch=False, **kwargs):
    # ensure variables are initialized
    uninit_vars = []
    for var in self.global_variables():
        if var not in self.initialized_variables:
            uninit_vars.append(var)
    if uninit_vars:
        desc = '\n '.join(v.op.name for v in uninit_vars)
        log.warn('Variables are not initialized:\n {}'.format(desc))
        self.raw_run(tf.variables_initializer(uninit_vars))
        self.initialized_variables += uninit_vars
    # assign overrider hyperparameters
    self._overrider_assign_parameters()
    # session run
    if batch:
        results, statistics = self.raw_run(
            (ops, self.estimator.operations), **kwargs)
        # update statistics
        self.estimator.append(statistics)
        text = self.estimator.format(batch_size=self.batch_size)
        log.info(text, update=True)
        if log.is_enabled('debug'):
            self.estimator.debug()
    else:
        results = self.raw_run(ops, **kwargs)
    return results
def global_kernel(self):
    for node, info in self.targets.items():
        node_name = node.formatted_name()
        value = self._step_forward(
            info['from'], info['to'], info['step'],
            info['min_step'], info['type'])
        var = info['variable']
        if value is False:
            log.debug(
                'Stopping because of {!r}, as we cannot further '
                'increment/decrement {!r}.'.format(node_name, var))
            return False
        self.assign(var, value)
        info['from'] = value
        log.info(
            'Updated hyperparameter {!r} in layer {!r} with a new '
            'value {}.'.format(var.op.name, node_name, value))
    # fine-tuning with the updated hyperparameters
    tolerable = self.fine_tune()
    if tolerable:
        return True
    # fine-tuned accuracy not tolerable: backtrack and shrink step sizes
    self.backtrack()
    for node, info in self.targets.items():
        node_name = node.formatted_name()
        new_step = self._reduce_step(info['step'], info['type'])
        if abs(new_step) < abs(info['min_step']):
            log.debug(
                'Stopping because of {!r}, as we cannot use a smaller '
                'increment/decrement.'.format(node_name))
            return False
        info['step'] = new_step
    return True
def _run(self, max_epochs, reset=False):
    log.info('Start profiling...')
    self.config.system.checkpoint.save = False
    # reset num_epochs and stop at 1 epoch
    if reset:
        self.reset_num_epochs()
    # start training
    self.train(max_epochs=max_epochs)
def save(self, key):
    cp_path = self._path(key, True)
    if isinstance(key, int):
        log.info('Saving checkpoint at epoch {} to {!r}...'.format(
            key, cp_path))
    else:
        log.info('Saving checkpoint to {!r}...'.format(cp_path))
    saver = tf.train.Saver(self._global_variables())
    saver.save(self.tf_session, cp_path, write_meta_graph=False)
def cont_list(self, load_doct=None):
    self.cont = []
    if load_doct is not None:
        for variable in self.members:
            if load_doct.get(variable.name):
                self.cont.append(variable.name)
    else:
        self.cont = [variable.name for variable in self.members]
    log.info('Continuing on layers: {}'.format(self.cont))
def pick_layer(self, session, start=False):
    if self.priority_list == [] and not start:
        log.info('Priority list is empty.')
        return None
    else:
        self.sort_layers(session)
        if start:
            log.info('First time picking targets: {}'.format(
                self.priority_list))
        return self.priority_list.pop()
def plot_parameters(self, variables):
    for node, name_value_map in variables.items():
        for name, value in name_value_map.items():
            layer_name = node.formatted_name()
            log.info('Plotting parameter {} in layer {}'.format(
                name, layer_name))
            name = '{}-{}'.format(layer_name, name)
            name = name.replace('/', '-')
            # {root}/{layer_name}-{variable_name}.{ext}
            var_path = os.path.join(self._path, name)
            self._plot_histogram(value, var_path)
def profile(self):
    log.debug('Profiling starts.')
    try:
        self.profile_multi_epochs()
    except KeyboardInterrupt:
        log.info('Stopped.')
        save = self.config.system.checkpoint.get('save', {})
        if save:
            countdown = save.get('countdown', 0)
            if log.countdown('Saving checkpoint', countdown):
                self.save_checkpoint('latest')
def test(self, names, inputs, predictions):
    results = {}
    for name, image, prediction in zip(names, inputs, predictions):
        name = name.decode()
        label = self.class_names[np.argmax(prediction)]
        log.info('{} labeled as {}.'.format(name, label))
        results[name] = label
    output_dir = self.config.system.search_path.run.outputs[0]
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, 'predictions.yaml')
    with open(filename, 'w') as f:
        yaml.dump(results, f)
def save(self, key):
    cp_path = self._path(key, True)
    if isinstance(key, int):
        log.info('Saving checkpoint at epoch {} to {!r}...'.format(
            key, cp_path))
    else:
        log.info('Saving checkpoint to {!r}...'.format(cp_path))
    try:
        saver = tf.train.Saver(self._global_variables())
        saver.save(self.tf_session, cp_path, write_meta_graph=False)
    except tf.errors.ResourceExhaustedError:
        log.warn('Unable to save a checkpoint because we have '
                 'no space left on device.')
def plot(self):
    input_tensor = self.task.inputs[0]
    label_tensor = self.task.truths[0]
    layer_tensors = self.net.layers()
    variable_tensors = self.net.variables
    input_image, label, layers, variables = self.session.run(
        [input_tensor, label_tensor, layer_tensors, variable_tensors])
    try:
        if self.config.system.plot.get('features'):
            self.plot_features(input_image, label, layers)
        if self.config.system.plot.get('parameters'):
            # overridden variable histograms
            self.plot_parameters(variables)
    except KeyboardInterrupt:
        log.info('Aborted.')
def train(self, max_epochs=None):
    # final debug outputs
    lr = self.run(self.learning_rate)
    log.info('Training starts with a learning rate of {}.'.format(lr))
    try:
        # train iterations
        while self._iteration(max_epochs=max_epochs):
            pass
    except KeyboardInterrupt:
        log.info('Stopped.')
        save = self.config.system.checkpoint.get('save', {})
        if save:
            countdown = save.get('countdown', 0)
            if log.countdown('Saving checkpoint', countdown):
                self.save_checkpoint('latest')
def search(self):
    # profile training accuracy for a given number of epochs
    self._profile()
    # initialize search
    self._init_search()
    # main procedure
    max_steps = self.config.search.max_steps
    step = 0
    while True:
        if max_steps and step > max_steps:
            break
        step += 1
        if not self.kernel():
            break
    log.info('Automated hyperparameter optimization complete.')
def _profile(self):
    baseline = self.config.search.accuracy.get('baseline')
    if baseline:
        return baseline
    self.reset_num_epochs()
    log.info('Profiling baseline accuracy...')
    total_accuracy = step = epoch = 0
    while epoch < self.config.search.max_epochs.profile:
        epoch = self.run(self.num_epochs, batch=True)
        total_accuracy += self.estimator.get_value('accuracy', 'train')
        step += 1
    self.baseline = total_accuracy / step
    tolerance = self.config.search.accuracy.tolerance
    self.tolerable_baseline = self.baseline * (1 - tolerance)
    log.info('Baseline accuracy: {}, tolerable accuracy: {}.'.format(
        self.baseline, self.tolerable_baseline))
    self.reset_num_epochs()
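# Worked example with hypothetical numbers: if the profiled baseline
# accuracy above is 0.70 and `search.accuracy.tolerance` is 0.05, then
# `tolerable_baseline = 0.70 * (1 - 0.05) = 0.665`, i.e. fine-tuned
# accuracies down to 66.5% are still treated as tolerable by the search.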
def _test(self, name, corners, scores, classes, count):
    image = Image.open(name)
    width, height = image.size
    image = image.convert('RGBA')
    thickness = int((height + width) / 300)
    font = os.path.join(os.path.split(__file__)[0], 'opensans.ttf')
    font = ImageFont.truetype(font, 5 * thickness)
    log.info('{}: {} detections.'.format(name.decode(), count))
    max_score = max(scores)
    corners = corners[:count]
    iterer = list(zip(corners, scores, classes))
    iterer = reversed(sorted(iterer, key=lambda v: v[1]))
    for corner, score, cls in iterer:
        layer = Image.new('RGBA', image.size, (255, 255, 255, 0))
        draw = ImageDraw.ImageDraw(layer)
        top, left, bottom, right = corner
        top = round(max(0, top * height))
        left = round(max(0, left * width))
        bottom = round(min(height, bottom * height))
        right = round(min(width, right * width))
        transparency = 127 + int(128 * score / max_score)
        color = self._colors[cls] + (transparency, )
        for i in range(thickness):
            draw.rectangle(
                (left + i, top + i, right - i, bottom - i), outline=color)
        # draw label
        cls_name = self.class_names[cls]
        label = ' {} {:.2f} '.format(cls_name, score)
        label_width, label_height = draw.textsize(label, font=font)
        label_pos = [left, top]
        label_rect = [left + label_width, top + label_height]
        draw.rectangle(label_pos + label_rect, fill=color)
        draw.text(label_pos, label, fill=(0, 0, 0, 127), font=font)
        image = Image.alpha_composite(image, layer)
        box = [int(v) for v in (left, top, right, bottom)]
        log.info(
            ' Confidence: {:f}, class: {}, box: ({}, {}) ({}, {})'.format(
                score, cls_name, *box))
    path = self.session.config.system.search_path.run.outputs[0]
    path = os.path.join(path, 'detect')
    os.makedirs(path, exist_ok=True)
    name = os.path.split(str(name))[1]
    name, ext = os.path.splitext(name)
    path = os.path.join(path, '{}.png'.format(name))
    image.save(path, quality=90)
def post_eval(self):
    stats = {}
    num_examples = self.session.num_examples
    num_remaining = num_examples % self.session.batch_size
    for key in ('top1', 'top5'):
        history = self.estimator.get_history(key, 'eval')
        if num_remaining:
            # the final batch is padded; keep only the valid entries
            history[-1] = history[-1][:num_remaining]
        valids = total = 0
        for h in history:
            valids += np.sum(h)
            total += len(h)
        stats[key] = Percent(valids / total)
        self.estimator.flush(key, 'eval')
    self._formatted_history = {}
    log.info(
        ' top1: {}, top5: {} [{} images]'
        .format(stats['top1'], stats['top5'], num_examples))
    return stats
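# Worked example with hypothetical numbers: with batch_size = 4 and
# num_examples = 10, the last batch is padded, so only 10 % 4 == 2 of its
# entries are kept; if the per-batch correct counts are then 4, 3 and 2,
# top1 = (4 + 3 + 2) / 10 = 90%.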
def _get_session(self, action=None):
    if not action:
        if self.session:
            return self.session
        keys = self._train_keys
        if self._validate_config(keys, 'train', test=True):
            self.session = self._get_session('train')
        else:
            self.session = self._get_session('validate')
        return self.session
    keys = self._model_keys + self._dataset_keys
    try:
        cls = self._session_map[action]
        keys += self._keys_map[action]
    except KeyError:
        raise TypeError('Action {!r} not recognized.'.format(action))
    self._validate_config(keys, action)
    if not isinstance(self.session, cls):
        log.info('Starting a {} session...'.format(action))
        self.session = cls(self.config)
    return self.session
def _setup_gpus(system):
    gpus = system.visible_gpus
    if gpus != 'auto':
        if isinstance(gpus, list):
            gpus = ','.join(str(g) for g in gpus)
        else:
            gpus = str(gpus)
    else:
        # system.visible_gpus == 'auto': automatically select GPUs
        gpus = _auto_select_gpus(system.num_gpus, system.gpu_memory_bound)
    if gpus:
        log.info('Using GPUs: {}'.format(gpus))
    else:
        log.info('No GPUs available, using one clone of the network.')
        # FIXME doesn't work; hacky way to make it instantiate only one tower
        system.num_gpus = 1
    # force ordering to match PCIe bus IDs, hopefully the same
    # ordering seen in nvidia-smi
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    # set the visible GPUs
    os.environ['CUDA_VISIBLE_DEVICES'] = gpus
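# For illustration (assumed config values): with `visible_gpus: [0, 2]` the
# function above sets
#   CUDA_DEVICE_ORDER='PCI_BUS_ID'
#   CUDA_VISIBLE_DEVICES='0,2'
# so the process only sees GPUs 0 and 2, numbered in PCIe bus order to match
# the device ordering reported by nvidia-smi.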
def cli_profile_timeline(self):
    """Performs training profiling to produce timeline.json."""
    # TODO integrate this into Profile.
    from tensorflow.python.client import timeline
    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    session = self._get_session('train')
    # run 100 iterations to warm up
    max_iterations = 100
    for i in range(max_iterations):
        log.info(
            'Running {}/{} iterations to warm up...'
            .format(i, max_iterations), update=True)
        session.run(session._train_op)
    log.info('Running the final iteration to generate timeline...')
    session.run(
        session._train_op, options=options, run_metadata=run_metadata)
    fetched_timeline = timeline.Timeline(run_metadata.step_stats)
    chrome_trace = fetched_timeline.generate_chrome_trace_format()
    with open('timeline.json', 'w') as f:
        f.write(chrome_trace)
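# Note: the generated timeline.json is in the Chrome trace event format and
# can be inspected by loading it into chrome://tracing in a Chromium-based
# browser.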
def plot_features(self, input_image, label, layers):
    num = self.config.system.batch_size_per_gpu
    for i in range(num):
        log.info(
            '{}% Plotting image #{}...'.format(int(i / num * 100.0), i),
            update=True)
        path = os.path.join(self._path, str(i))
        os.makedirs(path, exist_ok=True)
        # input image
        # {root}/{index}/input.{ext}
        input_path = os.path.join(path, 'input-{}'.format(label[i]))
        self._plot_rgb_image(input_image[i], input_path)
        # layer activations
        for node, value in layers.items():
            if value.ndim != 4:
                # value is not in a (N x H x W x C) layout
                continue
            name = node.formatted_name().replace('/', '-')
            # {root}/{index}/{layer_name}.{ext}
            layer_path = os.path.join(path, name)
            self._plot_images(value[i], layer_path)
def priority_kernel(self):
    priority = self._priority(self.blacklist)
    if not priority:
        log.debug('All nodes blacklisted.')
        return False
    node, node_priority = priority[0]
    info = self.targets[node]
    node_name = node.formatted_name()
    log.debug('Prioritizing layer {!r} with importance {}.'.format(
        node_name, node_priority))
    value = self._step_forward(
        info['from'], info['to'], info['step'],
        info['min_step'], info['type'])
    var = info['variable']
    if value is False:
        log.debug('Blacklisting {!r} as we cannot further '
                  'increment/decrement {!r}.'.format(node_name, var))
        self.blacklist.add(node)
        return True
    self.assign(var, value)
    info['from'] = value
    log.info(
        'Updated hyperparameter {!r} in layer {!r} with a new value {}.'
        .format(var.op.name, node_name, value))
    # fine-tuning with the updated hyperparameter
    tolerable = self.fine_tune()
    if tolerable:
        # satisfies the budget constraint
        return True
    # failed to satisfy: backtrack and decrement the step size
    self.backtrack()
    info = self.targets[node]
    new_step = self._reduce_step(info['step'], info['type'])
    if new_step < info['min_step']:
        self.blacklist.add(node)
        log.debug('Blacklisting {!r} as we cannot use a smaller '
                  'increment/decrement.'.format(node_name))
        return True
    info['step'] = new_step
    return True
def eval(self, key=None, keyboard_interrupt=True):
    # load checkpoint
    if key is None:
        key = self.config.system.checkpoint.load
    self.load_checkpoint(key)
    self.run(self.imgs_seen.initializer)
    # evaluation
    log.info('Starting evaluation...')
    num_iterations = math.ceil(self.num_examples / self.batch_size)
    try:
        for step in range(num_iterations):
            self.run([], batch=True)
    except KeyboardInterrupt as e:
        log.info('Evaluation aborted.')
        if not keyboard_interrupt:
            raise e
    else:
        log.info('Evaluation complete.')
    return self.task.post_eval()
def _iteration(self, max_epochs=None):
    system = self.config.system
    epoch = self.once()
    floor_epoch = math.floor(epoch)
    cp_interval = system.checkpoint.get('save.interval', 0)
    if self.change.every('checkpoint.epoch', floor_epoch, cp_interval):
        log.info(
            'Saving checkpoint at epoch {}...'.format(epoch), update=True)
        with log.demote():
            self.save_checkpoint(floor_epoch)
        self._checkpoint_epoch = floor_epoch
    max_epochs = max_epochs or system.max_epochs
    if max_epochs and epoch >= max_epochs:
        log.info('Maximum epoch count {} reached.'.format(max_epochs))
        if self._checkpoint_epoch and floor_epoch > self._checkpoint_epoch:
            log.info('Saving final checkpoint...')
            self.save_checkpoint(floor_epoch)
        return False
    return True
def eval_all(self):
    log.info('Evaluating all checkpoints...')
    epochs = list(self._range(self.checkpoint.list_epochs()))
    epochs_to_eval = ', '.join(str(e) for e in epochs)
    log.info('Checkpoints to evaluate: {}'.format(epochs_to_eval))
    table = None
    # ensures imgs_seen is initialized and loaded
    try:
        for e in epochs:
            with log.demote():
                stats = self.eval(e, keyboard_interrupt=False)
            table = table or Table(['epoch'] + list(sorted(stats)))
            table.add_row(dict({'epoch': e}, **stats))
            infos = ['epoch: {}'.format(e)]
            infos += ['{}: {}'.format(k, v) for k, v in stats.items()]
            log.info(', '.join(infos))
    except KeyboardInterrupt:
        pass
    return table
def overriders_reset(self):
    log.info('Resetting overrider internal variables...')
    self._overriders_call('reset')
def overriders_update(self):
    log.info('Updating overrider internal variables...')
    self._overriders_call('update')
def _purge_session(self):
    if not self.session:
        return
    log.info('Purging current session because the config was updated...')
    del self.session
    self.session = None
def cli_export(self):
    """Exports the current config."""
    name = 'export.yaml'
    with open(name, 'w') as f:
        f.write(self.config.to_yaml())
    log.info('Config successfully exported to {!r}.'.format(name))