Example #1
0
 def before_run(self, run_context):
     """Synchronise offset and state once, right before the next run call.

     Does nothing unless ``self._need_sync`` is set.  Otherwise it syncs the
     offset through ``self._do_sync_offset``, runs ``self._sync_state_op`` on
     the run context's session (logging an event on either side), clears the
     flag and finally calls ``self._profiler.end()``.

     Args:
         run_context: object exposing the active session as ``.session``.
     """
     if not self._need_sync:
         return

     self._do_sync_offset(run_context.session)

     _log_event('BEFORE _sync_state_op')
     run_context.session.run(self._sync_state_op)
     _log_event('AFTER _sync_state_op')

     # Clear the flag so later calls skip the sync until it is set again.
     self._need_sync = False
     self._profiler.end()
Example #2
0
 def _do_sync_offset(self, sess):
     """Run the offset-sync op and adopt the offset it returns.

     The current ``self._trained_samples`` is fed to the op through
     ``self._trained_samples_place``; the value returned by the op then
     replaces ``self._trained_samples``.

     Args:
         sess: session whose ``run`` executes ``self._sync_offset_op``.
     """
     _log_event('BEFORE _sync_offset_op(%s)' % self._trained_samples)
     synced_offset = sess.run(
         self._sync_offset_op,
         feed_dict={self._trained_samples_place: self._trained_samples},
     )
     # Report the transition before the old value is overwritten.
     report = 'sync offset %d -> %d on step %d' % (
         self._trained_samples, synced_offset, self._step)
     print(report)
     self._trained_samples = synced_offset
def main():
    """Parse the command line and run the selected TensorFlow entry point.

    ``args.tf_method`` picks one of ``'simple'``, ``'monitored'`` or
    ``'estimator'``; any other value raises ``KeyError`` from the lookup.
    Events are logged at the start and at the end of the run.
    """
    _log_event('BEGIN :: main')
    args = parse_args()

    # Dispatch table: method name -> runner taking the parsed args.
    runners = {
        'simple': run_simple_session,
        'monitored': run_with_session_and_hooks,
        'estimator': run_with_estimator,
    }
    run = runners[args.tf_method]
    run(args)

    _log_event('END :: main')
Example #4
0
    def before_run(self, run_context):
        """Synchronise the step counter and run the sync op when required.

        When ``self._need_sync`` is set, the step is replaced by the result
        of ``self._sync_step_op`` (fed the current step through
        ``self._step_place``), then ``self._sync_op`` is run and the flag is
        cleared.  Events around these ops are logged only when the step was
        0 on entry.

        Args:
            run_context: object exposing the active session as ``.session``.
        """
        if self._step >= self._max_step:  # shouldn't happen
            print('request_stop before kungfu_step: %d' % (self._step))
            # run_context.request_stop()
            # FIXME: force quit

        if not self._need_sync:
            return

        # Captured before the step is overwritten by the sync below.
        first_step = self._step == 0

        def log_if_first(message):
            if first_step:
                _log_event(message)

        log_if_first('BEFORE first _sync_step_op')
        self._step = run_context.session.run(
            self._sync_step_op, feed_dict={self._step_place: self._step})

        log_if_first('BEFORE first _sync_op')
        run_context.session.run(self._sync_op)
        log_if_first('AFTER first _sync_op')

        self._need_sync = False
def run_with_estimator(args):
    """Build an estimator from ``args`` and train it with debug hooks.

    A ``LogStepHook`` is always attached, and a ``LogPerfHook`` is added when
    ``args.show_training_throughput`` is set.  With ``args.elastic`` the
    elastic and resize-profiling hooks are added and training runs without a
    ``max_steps`` limit; otherwise training is bounded by
    ``args.train_steps`` and a ``SyncStepHook`` is attached when
    ``args.sync_step`` is set.

    Args:
        args: parsed command-line options read as attributes.
    """
    _log_event('BEGIN :: run_with_estimator')

    _log_event('BEGIN :: build_estimator')
    classifier = build_estimator(args)
    _log_event('END :: build_estimator')

    hooks = [debug_hooks.LogStepHook()]
    if args.show_training_throughput:
        hooks.append(debug_hooks.LogPerfHook(args.batch_size))

    if not args.elastic:
        input_fn = build_input_fn(args.batch_size, args.train_steps)

        # The hook is always constructed; it is only attached on request.
        sync_step_hook = debug_hooks.SyncStepHook()
        if args.sync_step:
            hooks.append(sync_step_hook)

        _log_event('BEGIN :: classifier.train')
        classifier.train(input_fn, hooks=hooks, max_steps=args.train_steps)
        _log_event('END :: classifier.train')
    else:
        # Imported here so the dependency is only needed for elastic runs.
        from kungfu.tensorflow.experimental.hook import ElasticHook
        hooks.append(
            ElasticHook(args.batch_size, args.epochs, args.epoch_size))

        resize_schedule = parse_scheule(args.resize_schedule)
        hooks.append(debug_hooks.ProfileResizeHook(resize_schedule))

        input_fn = build_input_fn(args.batch_size)
        classifier.train(input_fn, hooks=hooks)

    _log_event('END :: run_with_estimator')
Example #6
0
 def before_run(self, run_context):
     """Print the current step count, logging an event on step 0.

     Args:
         run_context: unused.
     """
     if self._step == 0:
         _log_event('before_run_step_0')

     fields = ('LogStepHook', 'before_run', self._step)
     print('%s::%s %d steps' % fields)
Example #7
0
 def end(self, run_context):
     """Log that this hook's ``end`` callback fired.

     Args:
         run_context: unused.  NOTE(review): if this class is a TensorFlow
             SessionRunHook, the framework passes the session here - confirm
             the parameter name against the base class.
     """
     event = 'SyncStepHook::end'
     _log_event(event)
Example #8
0
 def after_create_session(self, sess, coord):
     """Run the step-sync and state-sync ops on the given session.

     Args:
         sess: session used to run ``self._sync_step_op`` and then
             ``self._sync_state_op``.
         coord: unused.
     """
     synced_step = sess.run(self._sync_step_op)
     sess.run(self._sync_state_op)
     print('_sync_step_op result %d' % synced_step)
     # The event is logged only after both ops have been run.
     _log_event('AFTER _sync_step_op')