Example #1
0
 def _path(self, key, is_saving):
     """Resolve a checkpoint key into a checkpoint path.

     Args:
         key: An integer, formatted as ``<basename>-<key>``; the value
             of `self._checkpoint_latest`, resolved through the
             manifest file when loading; or any other value, which is
             used as the checkpoint name as-is.
         is_saving: True when the path is used for saving, False when
             it is used for loading.

     Returns:
         The search directory joined with the resolved checkpoint name.

     Raises:
         CheckpointManifestNotFoundError: The latest checkpoint is
             requested for loading but the manifest file is missing.
         CheckpointNotFoundError: No ``.index`` file exists for the
             checkpoint to be loaded.
     """
     directory = self._directory(is_saving)
     log.debug('Using search path {!r} for checkpoints.'.format(directory))
     if isinstance(key, int):
         cp_name = '{}-{}'.format(self._checkpoint_basename, key)
     elif key == self._checkpoint_latest and not is_saving:
         manifest_file = os.path.join(directory, 'checkpoint')
         try:
             with open(manifest_file, 'r') as f:
                 # safe_load never constructs arbitrary Python objects
                 # from the file, and unlike a bare yaml.load(f) it
                 # does not need an explicit Loader argument, which
                 # recent PyYAML releases require.
                 manifest = yaml.safe_load(f)
         except FileNotFoundError:
             raise CheckpointManifestNotFoundError(
                 'Manifest for the latest checkpoint cannot be found.')
         cp_name = manifest['model_checkpoint_path']
     else:
         cp_name = key
     path = os.path.join(directory, cp_name)
     if is_saving:
         # ensure directory exists
         os.makedirs(directory, exist_ok=True)
         return path
     # loading
     log.info('Loading checkpoint from {!r}...'.format(path))
     if not os.path.exists(path + '.index'):
         raise CheckpointNotFoundError(
             'Checkpoint {!r} not found.'.format(path))
     return path
Example #2
0
 def cli_eval_all(self):
     """Evaluates all checkpoints for accuracy.  """
     # Writes the evaluation table of the 'validate' session to
     # 'eval_all.csv' in the current working directory.
     result = self._get_session('validate').eval_all()
     if result is None:
         # eval_all() may hand back no table at all (nothing was
         # evaluated); avoid crashing and leaving an empty CSV behind.
         log.warn('No evaluation results to save.')
         return
     # render before opening the file so a failure cannot truncate it
     content = result.csv()
     file_name = 'eval_all.csv'
     with open(file_name, 'w') as f:
         f.write(content)
     log.info('Evaluation results saved in {!r}.'.format(file_name))
Example #3
0
    def run(self, ops, batch=False, **kwargs):
        """Run `ops` through `raw_run`, initializing variables first.

        Args:
            ops: The operations handed to `self.raw_run`.
            batch: When True, `self.estimator.operations` are fetched
                alongside `ops` and the resulting statistics are
                appended to the estimator and logged.
            **kwargs: Forwarded unchanged to `self.raw_run`.

        Returns:
            The result of running `ops`.
        """
        # initialize every global variable we have not tracked yet
        pending = [
            v for v in self.global_variables()
            if v not in self.initialized_variables]
        if pending:
            names = '\n    '.join(v.op.name for v in pending)
            log.warn('Variables are not initialized:\n    {}'.format(names))
            self.raw_run(tf.variables_initializer(pending))
            self.initialized_variables += pending

        # assign overrider hyperparameters
        self._overrider_assign_parameters()

        # plain run: nothing to record
        if not batch:
            return self.raw_run(ops, **kwargs)

        # batch run: fetch the estimator statistics together with ops
        fetches = (ops, self.estimator.operations)
        results, statistics = self.raw_run(fetches, **kwargs)
        self.estimator.append(statistics)
        progress = self.estimator.format(batch_size=self.batch_size)
        log.info(progress, update=True)
        if log.is_enabled('debug'):
            self.estimator.debug()
        return results
Example #4
0
 def global_kernel(self):
     """Step every target hyperparameter once, then fine-tune.

     Each entry of `self.targets` is advanced with `_step_forward` and
     assigned.  If fine-tuning is not tolerable, the change is rolled
     back with `backtrack` and every step size is reduced.

     Returns:
         False when the search should stop: either a hyperparameter
         cannot be stepped any further, or a reduced step would be
         smaller in magnitude than its minimum step.  True otherwise.
     """
     for node, info in self.targets.items():
         node_name = node.formatted_name()
         value = self._step_forward(info['from'], info['to'], info['step'],
                                    info['min_step'], info['type'])
         var = info['variable']
         if value is False:
             log.debug('Stopping because of {!r}, as we cannot further '
                       'increment/decrement {!r}.'.format(node_name, var))
             return False
         self.assign(var, value)
         info['from'] = value
         log.info('Updated hyperparameter {!r} in layer {!r} with a new '
                  'value {}.'.format(var.op.name, node_name, value))
     # fine-tuning with updated hyperparameter
     tolerable = self.fine_tune()
     if tolerable:
         return True
     # fail to satisfy, backtrack and decrement step sizes
     self.backtrack()
     for node, info in self.targets.items():
         new_step = self._reduce_step(info['step'], info['type'])
         if abs(new_step) < abs(info['min_step']):
             # Name the node examined by *this* loop; reusing the name
             # left over from the loop above would always report the
             # last target regardless of which one hit its minimum.
             log.debug('Stopping because of {!r}, as we cannot use smaller '
                       'increment/decrement.'.format(node.formatted_name()))
             return False
         info['step'] = new_step
     return True
Example #5
0
 def _run(self, max_epochs, reset=False):
     """Train for profiling with checkpoint saving switched off.

     Args:
         max_epochs: Forwarded to `self.train` as `max_epochs`.
         reset: When True, `self.reset_num_epochs()` is called before
             training starts.
     """
     log.info('Start profiling ...')
     # turn off checkpoint saving in the config
     checkpoint_config = self.config.system.checkpoint
     checkpoint_config.save = False
     if reset:
         # restart the epoch counter
         self.reset_num_epochs()
     self.train(max_epochs=max_epochs)
Example #6
0
 def save(self, key):
     """Save the variables from `_global_variables()` as a checkpoint.

     Args:
         key: Checkpoint key resolved to a path by `self._path`; an
             integer key is reported as an epoch number in the log.
     """
     cp_path = self._path(key, True)
     if isinstance(key, int):
         message = 'Saving checkpoint at epoch {} to {!r}...'.format(
             key, cp_path)
     else:
         message = 'Saving checkpoint to {!r}...'.format(cp_path)
     log.info(message)
     variables = self._global_variables()
     tf.train.Saver(variables).save(
         self.tf_session, cp_path, write_meta_graph=False)
Example #7
0
 def cont_list(self, load_doct=None):
     """Populate `self.cont` with the names of members to continue on.

     Args:
         load_doct: Optional mapping queried with `.get(name)`.  When
             given, only members whose name maps to a truthy value are
             kept; when None, every member name is kept.
     """
     if load_doct is None:
         selected = [member.name for member in self.members]
     else:
         selected = [
             member.name for member in self.members
             if load_doct.get(member.name)]
     self.cont = selected
     log.info('continue on layers: {}'.format(self.cont))
Example #8
0
 def pick_layer(self, session, start=False):
     """Pop the next entry from the priority list after re-sorting it.

     Args:
         session: Passed to `self.sort_layers`.
         start: True on the first pick; the sorted priority list is
             then logged, and an empty list does not short-circuit.

     Returns:
         The last element of `self.priority_list`, or None when the
         list is empty and `start` is False.
     """
     exhausted = self.priority_list == []
     if exhausted and not start:
         log.info('priority list is empty!!')
         return None
     self.sort_layers(session)
     if start:
         targets = self.priority_list
         log.info('First time picking targets: {}'.format(targets))
     return self.priority_list.pop()
Example #9
0
 def plot_parameters(self, variables):
     """Plot a histogram for every parameter of every layer.

     Args:
         variables: Mapping of node -> {parameter name: value}.  Each
             node must provide `formatted_name()`.
     """
     for node, parameters in variables.items():
         for param_name, param_value in parameters.items():
             layer = node.formatted_name()
             log.info('Plotting parameter {} in layer {}'.format(
                 param_name, layer))
             # {root}/{layer_name}-{variable_name}.{ext}
             # slashes are replaced so the name stays a single file name
             stem = '{}-{}'.format(layer, param_name).replace('/', '-')
             target = os.path.join(self._path, stem)
             self._plot_histogram(param_value, target)
Example #10
0
 def profile(self):
     """Run multi-epoch profiling; offer to save on Ctrl-C.

     On KeyboardInterrupt, a 'latest' checkpoint is saved if the
     checkpoint `save` config is truthy and `log.countdown` returns a
     truthy value.
     """
     log.debug('Profiling starts.')
     try:
         self.profile_multi_epochs()
     except KeyboardInterrupt:
         log.info('Stopped.')
         save_config = self.config.system.checkpoint.get('save', {})
         if not save_config:
             return
         delay = save_config.get('countdown', 0)
         if log.countdown('Saving checkpoint', delay):
             self.save_checkpoint('latest')
Example #11
0
 def test(self, names, inputs, predictions):
     """Label each input and dump the labels to 'predictions.yaml'.

     Args:
         names: Byte-string names, one per example.
         inputs: Iterated in lockstep with `names`; the values
             themselves are not used.
         predictions: Per-example score vectors; the argmax indexes
             into `self.class_names`.
     """
     results = {}
     for raw_name, _, scores in zip(names, inputs, predictions):
         example = raw_name.decode()
         predicted = self.class_names[np.argmax(scores)]
         log.info('{} labeled as {}.'.format(example, predicted))
         results[example] = predicted
     # write into the first configured output directory
     output_dir = self.config.system.search_path.run.outputs[0]
     os.makedirs(output_dir, exist_ok=True)
     with open(os.path.join(output_dir, 'predictions.yaml'), 'w') as f:
         yaml.dump(results, f)
Example #12
0
 def save(self, key):
     """Save a checkpoint, tolerating a full storage device.

     Args:
         key: Checkpoint key resolved to a path by `self._path`; an
             integer key is reported as an epoch number in the log.

     A `tf.errors.ResourceExhaustedError` raised while saving is
     logged as a warning instead of being propagated.
     """
     cp_path = self._path(key, True)
     message = (
         'Saving checkpoint at epoch {} to {!r}...'.format(key, cp_path)
         if isinstance(key, int)
         else 'Saving checkpoint to {!r}...'.format(cp_path))
     log.info(message)
     try:
         variables = self._global_variables()
         tf.train.Saver(variables).save(
             self.tf_session, cp_path, write_meta_graph=False)
     except tf.errors.ResourceExhaustedError:
         log.warn('Unable to save a checkpoint because we have '
                  'no space left on device.')
Example #13
0
 def plot(self):
     """Evaluate inputs, layers and variables once, then plot them.

     Features and parameters are plotted only when the matching key in
     `config.system.plot` is truthy.  A KeyboardInterrupt raised while
     plotting is logged and swallowed.
     """
     fetches = [
         self.task.inputs[0],
         self.task.truths[0],
         self.net.layers(),
         self.net.variables,
     ]
     input_image, label, layers, variables = self.session.run(fetches)
     try:
         wants_features = self.config.system.plot.get('features')
         if wants_features:
             self.plot_features(input_image, label, layers)
         wants_parameters = self.config.system.plot.get('parameters')
         if wants_parameters:
             # overridden variable histogram
             self.plot_parameters(variables)
     except KeyboardInterrupt:
         log.info('Abort.')
Example #14
0
 def train(self, max_epochs=None):
     """Iterate until `_iteration` returns a falsy value or Ctrl-C.

     Args:
         max_epochs: Forwarded to every `self._iteration` call.

     On KeyboardInterrupt, a 'latest' checkpoint is saved if the
     checkpoint `save` config is truthy and `log.countdown` returns a
     truthy value.
     """
     # report the learning rate we start with
     lr = self.run(self.learning_rate)
     log.info('Training start with a learning rate {}.'.format(lr))
     try:
         keep_going = True
         while keep_going:
             keep_going = self._iteration(max_epochs=max_epochs)
     except KeyboardInterrupt:
         log.info('Stopped.')
         save_config = self.config.system.checkpoint.get('save', {})
         if not save_config:
             return
         delay = save_config.get('countdown', 0)
         if log.countdown('Saving checkpoint', delay):
             self.save_checkpoint('latest')
Example #15
0
 def search(self):
     """Run the automated hyperparameter optimization loop.

     After profiling and initializing the search, `self.kernel()` is
     called repeatedly until it returns a falsy value or it has been
     called `config.search.max_steps` times.  A falsy `max_steps`
     means there is no step limit.
     """
     # profile training accuracy for a given number of epochs
     self._profile()
     # initialize search
     self._init_search()
     # main procedure
     max_steps = self.config.search.max_steps
     step = 0
     # `step` counts completed kernel calls, so the limit is reached
     # once step == max_steps; testing `step > max_steps` before the
     # increment would allow one kernel call too many.
     while not max_steps or step < max_steps:
         step += 1
         if not self.kernel():
             break
     log.info('Automated hyperparameter optimization complete.')
Example #16
0
 def _profile(self):
     baseline = self.config.search.accuracy.get('baseline')
     if baseline:
         return baseline
     self.reset_num_epochs()
     log.info('Profiling baseline accuracy...')
     total_accuracy = step = epoch = 0
     while epoch < self.config.search.max_epochs.profile:
         epoch = self.run(self.num_epochs, batch=True)
         total_accuracy += self.estimator.get_value('accuracy', 'train')
         step += 1
     self.baseline = total_accuracy / step
     tolerance = self.config.search.accuracy.tolerance
     self.tolerable_baseline = self.baseline * (1 - tolerance)
     log.info('Baseline accuracy: {}, tolerable accuracy: {}.'.format(
         self.baseline, self.tolerable_baseline))
     self.reset_num_epochs()
Example #17
0
 def _test(self, name, corners, scores, classes, count):
     """Draw the detections onto an image and save the result as PNG.

     The annotated image is written to ``<outputs[0]>/detect/<stem>.png``
     where ``<stem>`` is the source file name without its extension.

     Args:
         name: Path of the source image, as bytes or str.
         corners: Boxes as (top, left, bottom, right).  Values are
             multiplied by the image height/width, so they are
             presumably normalized coordinates -- TODO confirm.
         scores: Confidence score of each detection.
         classes: Class index of each detection, used to look up
             `self._colors` and `self.class_names`.
         count: Number of detections; only the first `count` corners
             are drawn.
     """
     # Decode the path once.  Calling str() on a bytes path yields its
     # "b'...'" repr, which leaks the b' prefix and escape sequences
     # into the output file name.
     file_name = os.fsdecode(name)
     image = Image.open(name)
     width, height = image.size
     image = image.convert('RGBA')
     thickness = int((height + width) / 300)
     font = os.path.join(os.path.split(__file__)[0], 'opensans.ttf')
     font = ImageFont.truetype(font, 5 * thickness)
     log.info('{}: {} detections.'.format(file_name, count))
     max_score = max(scores)
     corners = corners[:count]
     # zip stops at the shortest input, so only `count` items remain
     iterer = list(zip(corners, scores, classes))
     # draw in descending score order
     iterer = reversed(sorted(iterer, key=lambda v: v[1]))
     for corner, score, cls in iterer:
         layer = Image.new('RGBA', image.size, (255, 255, 255, 0))
         draw = ImageDraw.ImageDraw(layer)
         top, left, bottom, right = corner
         # scale to pixels and clamp to the image bounds
         top = round(max(0, top * height))
         left = round(max(0, left * width))
         bottom = round(min(height, bottom * height))
         right = round(min(width, right * width))
         # higher-scoring boxes are drawn more opaque
         transparency = 127 + int(128 * score / max_score)
         color = self._colors[cls] + (transparency, )
         # nested 1px rectangles emulate a thick outline
         for i in range(thickness):
             draw.rectangle((left + i, top + i, right - i, bottom - i),
                            outline=color)
         # draw label
         cls_name = self.class_names[cls]
         label = ' {} {:.2f} '.format(cls_name, score)
         label_width, label_height = draw.textsize(label, font=font)
         label_pos = [left, top]
         label_rect = [left + label_width, top + label_height]
         draw.rectangle(label_pos + label_rect, fill=color)
         draw.text(label_pos, label, fill=(0, 0, 0, 127), font=font)
         image = Image.alpha_composite(image, layer)
         box = [int(v) for v in (left, top, right, bottom)]
         log.info(
             '  Confidence: {:f}, class: {}, box: ({}, {}) ({}, {})'.format(
                 score, cls_name, *box))
     path = self.session.config.system.search_path.run.outputs[0]
     path = os.path.join(path, 'detect')
     os.makedirs(path, exist_ok=True)
     stem, _ = os.path.splitext(os.path.basename(file_name))
     path = os.path.join(path, '{}.png'.format(stem))
     image.save(path, quality=90)
Example #18
0
 def post_eval(self):
     """Aggregate the 'top1' and 'top5' evaluation histories.

     Returns:
         A dict mapping 'top1' and 'top5' to a `Percent` of the summed
         history values over the number of history entries.
     """
     stats = {}
     num_examples = self.session.num_examples
     num_remaining = num_examples % self.session.batch_size
     for key in ('top1', 'top5'):
         history = self.estimator.get_history(key, 'eval')
         if num_remaining:
             # Keep only the first `num_remaining` entries of the last
             # batch.  When the example count is an exact multiple of
             # the batch size the remainder is 0 and the last batch
             # must be kept whole, not truncated to nothing.
             history[-1] = history[-1][:num_remaining]
         valids = total = 0
         for h in history:
             valids += np.sum(h)
             total += len(h)
         stats[key] = Percent(valids / total)
         self.estimator.flush(key, 'eval')
         self._formatted_history = {}
     log.info(
         '    top1: {}, top5: {} [{} images]'
         .format(stats['top1'], stats['top5'], num_examples))
     return stats
Example #19
0
File: cli.py Project: zaf05/mayo
 def _get_session(self, action=None):
     if not action:
         if self.session:
             return self.session
         keys = self._train_keys
         if self._validate_config(keys, 'train', test=True):
             self.session = self._get_session('train')
         else:
             self.session = self._get_session('validate')
         return self.session
     keys = self._model_keys + self._dataset_keys
     try:
         cls = self._session_map[action]
         keys += self._keys_map[action]
     except KeyError:
         raise TypeError('Action {!r} not recognized.'.format(action))
     self._validate_config(keys, action)
     if not isinstance(self.session, cls):
         log.info('Starting a {} session...'.format(action))
         self.session = cls(self.config)
     return self.session
Example #20
0
def _setup_gpus(system):
    """Choose the visible GPUs and export them to the environment.

    Args:
        system: Config object providing `visible_gpus`, `num_gpus` and
            `gpu_memory_bound`.  `num_gpus` is overwritten with 1 when
            no GPU ends up selected.
    """
    gpus = system.visible_gpus
    if gpus == 'auto':
        # system.visible_gpus == 'auto' -> auto select GPUs
        gpus = _auto_select_gpus(system.num_gpus, system.gpu_memory_bound)
    elif isinstance(gpus, list):
        gpus = ','.join(str(g) for g in gpus)
    else:
        gpus = str(gpus)
    if not gpus:
        log.info('No GPUs available, using one clone of the network.')
        # FIXME doesn't work. hacky way to make it instantiate only one tower
        system.num_gpus = 1
    else:
        log.info('Using GPUs: {}'.format(gpus))
    environment = os.environ
    # force ordering to match PCIE bus id, hopefully the same
    # ordering seen in nvidia-smi
    environment['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    # sets the visible GPUs
    environment['CUDA_VISIBLE_DEVICES'] = gpus
Example #21
0
File: cli.py Project: zaf05/mayo
 def cli_profile_timeline(self):
     """Performs training profiling to produce timeline.json.  """
     # TODO integrate this into Profile.
     from tensorflow.python.client import timeline
     session = self._get_session('train')
     # run 100 iterations to warm up
     max_iterations = 100
     warmup_message = 'Running {}/{} iterations to warm up...'
     for i in range(max_iterations):
         log.info(warmup_message.format(i, max_iterations), update=True)
         session.run(session._train_op)
     log.info('Running the final iteration to generate timeline...')
     # the traced run fills `run_metadata` with step statistics
     options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
     run_metadata = tf.RunMetadata()
     session.run(
         session._train_op, options=options, run_metadata=run_metadata)
     trace = timeline.Timeline(
         run_metadata.step_stats).generate_chrome_trace_format()
     with open('timeline.json', 'w') as f:
         f.write(trace)
Example #22
0
 def plot_features(self, input_image, label, layers):
     """Plot the input image and 4-D layer outputs of each example.

     Args:
         input_image: Indexable by example index; each item is passed
             to `_plot_rgb_image`.
         label: Indexable by example index; used in the input file name.
         layers: Mapping of node -> value.  Only values with
             `ndim == 4` are plotted.
     """
     num = self.config.system.batch_size_per_gpu
     for index in range(num):
         percent = int(index / num * 100.0)
         log.info(
             '{}% Plotting image #{}...'.format(percent, index),
             update=True)
         # every example gets its own directory: {root}/{index}
         path = os.path.join(self._path, str(index))
         os.makedirs(path, exist_ok=True)
         # input image
         # {root}/{index}/input-{label}.{ext}
         input_name = 'input-{}'.format(label[index])
         self._plot_rgb_image(
             input_image[index], os.path.join(path, input_name))
         # layer activations
         for node, value in layers.items():
             # only (N x H x W x C) layouts are plotted
             if value.ndim == 4:
                 layer_name = node.formatted_name().replace('/', '-')
                 # {root}/{index}/{layer_name}.{ext}
                 self._plot_images(
                     value[index], os.path.join(path, layer_name))
Example #23
0
 def priority_kernel(self):
     """Step the highest-priority target hyperparameter and fine-tune.

     The first entry returned by `_priority` is advanced with
     `_step_forward`.  If fine-tuning is not tolerable, the change is
     rolled back with `backtrack` and the step size is reduced.  A
     node that cannot be stepped further, or whose reduced step would
     be smaller in magnitude than its minimum step, is blacklisted.

     Returns:
         False when `_priority` yields no candidates; True otherwise.
     """
     priority = self._priority(self.blacklist)
     if not priority:
         log.debug('All nodes blacklisted.')
         return False
     node, node_priority = priority[0]
     info = self.targets[node]
     node_name = node.formatted_name()
     log.debug('Prioritize layer {!r} with importance {}.'.format(
         node_name, node_priority))
     value = self._step_forward(info['from'], info['to'], info['step'],
                                info['min_step'], info['type'])
     var = info['variable']
     if value is False:
         log.debug('Blacklisting {!r} as we cannot further '
                   'increment/decrement {!r}.'.format(node_name, var))
         self.blacklist.add(node)
         return True
     self.assign(var, value)
     info['from'] = value
     log.info(
         'Updated hyperparameter {!r} in layer {!r} with a new value {}.'.
         format(var.op.name, node_name, value))
     # fine-tuning with updated hyperparameter
     tolerable = self.fine_tune()
     if tolerable:
         # satisfies the budget constraint
         return True
     # fail to satisfy, backtrack and decrement step size
     self.backtrack()
     # look the entry up again after backtracking
     info = self.targets[node]
     new_step = self._reduce_step(info['step'], info['type'])
     # Compare magnitudes, as global_kernel does: with a negative
     # (decrementing) step a signed comparison would blacklist the
     # node while the step is still larger than the minimum.
     if abs(new_step) < abs(info['min_step']):
         self.blacklist.add(node)
         log.debug('Blacklisting {!r} as we cannot use smaller '
                   'increment/decrement.'.format(node_name))
         return True
     info['step'] = new_step
     return True
Example #24
0
 def eval(self, key=None, keyboard_interrupt=True):
     """Load a checkpoint and run one pass over the examples.

     Args:
         key: Checkpoint key; when None, `config.system.checkpoint.load`
             is used instead.
         keyboard_interrupt: When True a KeyboardInterrupt only aborts
             the evaluation loop; when False it is re-raised.

     Returns:
         The result of `self.task.post_eval()`.
     """
     # load checkpoint
     checkpoint_key = (
         self.config.system.checkpoint.load if key is None else key)
     self.load_checkpoint(checkpoint_key)
     self.run(self.imgs_seen.initializer)
     # evaluation
     log.info('Starting evaluation...')
     # round up so the final, possibly partial batch is covered
     num_iterations = math.ceil(self.num_examples / self.batch_size)
     try:
         for _ in range(num_iterations):
             self.run([], batch=True)
     except KeyboardInterrupt as interrupt:
         log.info('Evaluation aborted.')
         if not keyboard_interrupt:
             raise interrupt
     else:
         log.info('Evaluation complete.')
     return self.task.post_eval()
Example #25
0
 def _iteration(self, max_epochs=None):
     system = self.config.system
     epoch = self.once()
     floor_epoch = math.floor(epoch)
     cp_interval = system.checkpoint.get('save.interval', 0)
     if self.change.every('checkpoint.epoch', floor_epoch, cp_interval):
         log.info('Saving checkpoint at epoch {}...'.format(epoch),
                  update=True)
         with log.demote():
             self.save_checkpoint(floor_epoch)
         self._checkpoint_epoch = floor_epoch
     max_epochs = max_epochs or system.max_epochs
     if max_epochs and epoch >= max_epochs:
         log.info('Maximum epoch count {} reached.'.format(max_epochs))
         if self._checkpoint_epoch and floor_epoch > self._checkpoint_epoch:
             log.info('Saving final checkpoint...')
             self.save_checkpoint(floor_epoch)
         return False
     return True
Example #26
0
 def eval_all(self):
     """Evaluate every listed checkpoint epoch and tabulate the stats.

     Returns:
         A `Table` with one row per evaluated epoch, or None when no
         epoch was evaluated.  A KeyboardInterrupt stops the loop and
         the rows collected so far are returned.
     """
     log.info('Evaluating all checkpoints...')
     epochs = list(self._range(self.checkpoint.list_epochs()))
     log.info('Checkpoints to evaluate: {}'.format(
         ', '.join(str(epoch) for epoch in epochs)))
     table = None
     try:
         for epoch in epochs:
             with log.demote():
                 stats = self.eval(epoch, keyboard_interrupt=False)
             if not table:
                 # columns come from the first stats seen
                 table = Table(['epoch'] + sorted(stats))
             table.add_row(dict({'epoch': epoch}, **stats))
             parts = ['epoch: {}'.format(epoch)]
             parts.extend(
                 '{}: {}'.format(k, v) for k, v in stats.items())
             log.info(', '.join(parts))
     except KeyboardInterrupt:
         # stop early but still return what has been collected
         pass
     return table
Example #27
0
 def overriders_reset(self):
     """Dispatch the 'reset' action through `_overriders_call`."""
     action = 'reset'
     log.info('Resetting overriders internal variables...')
     self._overriders_call(action)
Example #28
0
 def overriders_update(self):
     """Dispatch the 'update' action through `_overriders_call`."""
     action = 'update'
     log.info('Updating overrider internal variables...')
     self._overriders_call(action)
Example #29
0
File: cli.py Project: zaf05/mayo
 def _purge_session(self):
     if not self.session:
         return
     log.info('Purging current session because config is updated...')
     del self.session
     self.session = None
Example #30
0
File: cli.py Project: zaf05/mayo
 def cli_export(self):
     """Exports the current config.  """
     # The config is serialized with `to_yaml()` and written to
     # 'export.yaml' in the current working directory.
     target = 'export.yaml'
     with open(target, 'w') as out:
         out.write(self.config.to_yaml())
     log.info('Config successfully exported to {!r}.'.format(target))