def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
  # The env variable is on deprecation path, default is set to off.
  os.environ['TF_SYNC_ON_FINISH'] = '0'
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  print('hparams:', hparams)

  # Session configuration.
  sess_config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=log_device_placement,
      intra_op_parallelism_threads=num_intra_threads,
      gpu_options=tf.GPUOptions(force_gpu_compatible=True))

  config = cifar10_utils.RunConfig(
      session_config=sess_config,
      model_dir=job_dir,
      save_checkpoints_steps=hparams["eval_steps"])
  tf.contrib.learn.learn_runner.run(
      get_experiment_fn(data_dir, num_gpus, variable_strategy,
                        use_distortion_for_training),
      run_config=config,
      schedule="train_and_evaluate",
      hparams=tf.contrib.training.HParams(is_chief=config.is_chief, **hparams))
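# A minimal sketch (assumed, not from the original file) of a command-line
# driver for the variant above. The real cifar10_main.py builds a much larger
# argparse parser; only enough flags are shown to satisfy main()'s signature,
# plus --eval-steps, which feeds the hparams["eval_steps"] lookup used for
# save_checkpoints_steps. All flag defaults here are illustrative.
import argparse

if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument('--job-dir', type=str, default='/tmp/cifar10')
  parser.add_argument('--data-dir', type=str, required=True)
  parser.add_argument('--num-gpus', type=int, default=1)
  parser.add_argument('--variable-strategy', type=str, default='CPU')
  parser.add_argument('--use-distortion-for-training', action='store_true')
  parser.add_argument('--log-device-placement', action='store_true')
  parser.add_argument('--num-intra-threads', type=int, default=0)
  parser.add_argument('--eval-steps', type=int, default=100)
  main(**vars(parser.parse_args()))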
def main(job_dir, data_dir, num_gpus, variable_strategy, data_format,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
  # The env variable is on deprecation path, default is set to off.
  os.environ['TF_SYNC_ON_FINISH'] = '0'
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # channels_first (NCHW) is normally optimal on GPU and channels_last (NHWC)
  # on CPU. The exception is Intel MKL on CPU, which is optimal with
  # channels_last.
  if not data_format:
    if num_gpus == 0:
      data_format = 'channels_last'
    else:
      data_format = 'channels_first'

  # Session configuration.
  sess_config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=log_device_placement,
      intra_op_parallelism_threads=num_intra_threads,
      gpu_options=tf.GPUOptions(force_gpu_compatible=True))

  config = cifar10_utils.RunConfig(
      session_config=sess_config, model_dir=job_dir)
  tf.contrib.learn.learn_runner.run(
      get_experiment_fn(data_dir, num_gpus, variable_strategy, data_format,
                        use_distortion_for_training),
      run_config=config,
      hparams=tf.contrib.training.HParams(**hparams))
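# A self-contained sketch (not from the original source) of what the
# channels_last/channels_first choice above means for a CIFAR-10 batch:
# NHWC stores [batch, height, width, channels], NCHW stores
# [batch, channels, height, width]. The helper name to_data_format is
# hypothetical.
import numpy as np

def to_data_format(batch_nhwc, data_format):
  # CIFAR-10 batches arrive as NHWC, e.g. [N, 32, 32, 3].
  if data_format == 'channels_first':
    return np.transpose(batch_nhwc, [0, 3, 1, 2])  # -> [N, 3, 32, 32]
  return batch_nhwc

batch = np.zeros([128, 32, 32, 3], dtype=np.float32)
assert to_data_format(batch, 'channels_first').shape == (128, 3, 32, 32)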
def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
  # The env variable is on deprecation path, default is set to off.
  os.environ['TF_SYNC_ON_FINISH'] = '0'
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Session configuration.
  sess_config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=log_device_placement,
      intra_op_parallelism_threads=num_intra_threads,
      gpu_options=tf.GPUOptions(force_gpu_compatible=True))
  sess_config.gpu_options.allow_growth = True
  # run_config = tf.estimator.RunConfig().replace(session_config=session_config)

  config = cifar10_utils.RunConfig(
      session_config=sess_config, model_dir=job_dir)
  tf.contrib.learn.learn_runner.run(
      get_experiment_fn(data_dir, num_gpus, variable_strategy,
                        use_distortion_for_training),
      run_config=config,
      hparams=tf.contrib.training.HParams(is_chief=config.is_chief, **hparams))
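# For reference, a sketch (not in the original) of the other common way to
# bound GPU memory in TF1: instead of growing on demand with allow_growth,
# cap the fraction claimed up front. The 0.5 value is illustrative.
import tensorflow as tf

capped_config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.5))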
def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
  # The env variable is on deprecation path, default is set to off.
  os.environ['TF_SYNC_ON_FINISH'] = '0'
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Session configuration.
  sess_config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=log_device_placement,
      intra_op_parallelism_threads=num_intra_threads,
      gpu_options=tf.GPUOptions(force_gpu_compatible=True))

  config = cifar10_utils.RunConfig(
      session_config=sess_config, model_dir=job_dir)
  run_config = config

  # Superseded by the tf.estimator.train_and_evaluate call below:
  # tf.contrib.learn.learn_runner.run(
  #     get_experiment_fn(data_dir, num_gpus, variable_strategy,
  #                       use_distortion_for_training),
  #     run_config=config,
  #     hparams=tf.contrib.training.HParams(is_chief=config.is_chief,
  #                                         **hparams))

  hparams = tf.contrib.training.HParams(is_chief=config.is_chief, **hparams)

  train_input_fn = functools.partial(
      input_fn,
      data_dir,
      subset='train',
      num_shards=num_gpus,
      batch_size=hparams.train_batch_size,
      use_distortion_for_training=use_distortion_for_training)

  eval_input_fn = functools.partial(
      input_fn,
      data_dir,
      subset='eval',
      batch_size=hparams.eval_batch_size,
      num_shards=num_gpus)

  num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval')
  if num_eval_examples % hparams.eval_batch_size != 0:
    raise ValueError('validation set size must be multiple of eval_batch_size')

  train_steps = hparams.train_steps
  eval_steps = num_eval_examples // hparams.eval_batch_size

  # Comment the below line when training locally.
  classifier = tf.estimator.Estimator(
      model_fn=get_model_fn(num_gpus, variable_strategy,
                            run_config.num_worker_replicas or 1),
      config=config,
      params=hparams)

  train_spec = tf.estimator.TrainSpec(
      input_fn=train_input_fn, max_steps=train_steps)
  # exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
  eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=eval_steps)
  tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
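# Worked example of the eval_steps arithmetic above: CIFAR-10's eval split has
# 10,000 images, so with eval_batch_size=100 the divisibility check passes and
# eval_steps = 10000 // 100 = 100. A batch size of 128 would trip the
# ValueError, since 10000 % 128 != 0.
num_eval_examples = 10000
eval_batch_size = 100
assert num_eval_examples % eval_batch_size == 0
eval_steps = num_eval_examples // eval_batch_size  # 100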
def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
  # The env variable is on deprecation path, default is set to off.
  os.environ['TF_SYNC_ON_FINISH'] = '0'
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Session configuration.
  sess_config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=log_device_placement,
      intra_op_parallelism_threads=num_intra_threads,
      gpu_options=tf.GPUOptions(force_gpu_compatible=True))

  # Override checkpoint saver to not do anything:
  # from tensorflow.python.training import training
  # def dummy(*args, **kwargs): return
  # training.CheckpointSaverHook._save = dummy

  # Fix the random seeds for reproducibility.
  np.random.seed(1)
  tf.set_random_seed(1)

  # Change event flush seconds to 1.
  from tensorflow.python.summary.writer.writer import FileWriter
  old_init = FileWriter.__init__

  def newinit(*args, **kwargs):
    print("Overriding FileWriter flush_secs to 1")
    kwargs['flush_secs'] = 1
    old_init(*args, **kwargs)

  FileWriter.__init__ = newinit

  config = cifar10_utils.RunConfig(
      session_config=sess_config, model_dir=job_dir, tf_random_seed=1)
  tf.contrib.learn.learn_runner.run(
      get_experiment_fn(data_dir, num_gpus, variable_strategy,
                        use_distortion_for_training),
      run_config=config,
      hparams=tf.contrib.training.HParams(is_chief=config.is_chief, **hparams))
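# The FileWriter override above is one instance of a generic monkey-patching
# pattern: wrap a class's __init__ to force a keyword argument. A toy,
# self-contained version (the Writer class here is illustrative only):
class Writer(object):
  def __init__(self, path, flush_secs=120):
    self.path, self.flush_secs = path, flush_secs

_old_init = Writer.__init__

def _new_init(*args, **kwargs):
  kwargs['flush_secs'] = 1  # force the override, as done above
  _old_init(*args, **kwargs)

Writer.__init__ = _new_init
assert Writer('/tmp/events').flush_secs == 1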
def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
  # The env variable is on deprecation path, default is set to off.
  os.environ['TF_SYNC_ON_FINISH'] = '0'
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Session configuration.
  sess_config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=log_device_placement,
      intra_op_parallelism_threads=num_intra_threads,
      gpu_options=tf.GPUOptions(force_gpu_compatible=True))

  # Override the default of 100 steps: at 122 examples/sec (about 4
  # steps/second), summaries are saved every 10 steps instead.
  config = cifar10_utils.RunConfig(
      session_config=sess_config, model_dir=job_dir, save_summary_steps=10)

  # Change event flush seconds to the value passed in hparams.
  from tensorflow.python.summary.writer.writer import FileWriter
  old_init = FileWriter.__init__

  def newinit(*args, **kwargs):
    new_flush_secs = hparams['event_flush_secs']
    print("Overriding FileWriter flush_secs to " + str(new_flush_secs))
    kwargs['flush_secs'] = new_flush_secs
    old_init(*args, **kwargs)

  FileWriter.__init__ = newinit

  tf.contrib.learn.learn_runner.run(
      get_experiment_fn(data_dir, num_gpus, variable_strategy,
                        use_distortion_for_training),
      run_config=config,
      hparams=tf.contrib.training.HParams(is_chief=config.is_chief, **hparams))
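# Back-of-envelope check for the save_summary_steps=10 choice above. The
# 122 examples/sec figure is the author's measurement; the batch size of 32
# is an assumption chosen to match the quoted "4 steps/second".
examples_per_sec = 122.0
batch_size = 32
steps_per_sec = examples_per_sec / batch_size   # ~3.8 steps/sec
secs_between_summaries = 10 / steps_per_sec     # ~2.6 s between summaries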
def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
  # The env variable is on deprecation path, default is set to off.
  os.environ['TF_SYNC_ON_FINISH'] = '0'
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Session configuration.
  sess_config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=log_device_placement,
      intra_op_parallelism_threads=num_intra_threads,
      gpu_options=tf.GPUOptions(force_gpu_compatible=True))

  config = cifar10_utils.RunConfig(
      session_config=sess_config, model_dir=job_dir)
  hparams = tf.contrib.training.HParams(is_chief=config.is_chief, **hparams)

  # Edit this for the steps you want to log:
  # for i in range(41, 51):
  #   hparams.train_steps = i * 2000
  #   tf.contrib.learn.learn_runner.run(
  #       get_experiment_fn(data_dir, num_gpus, variable_strategy,
  #                         use_distortion_for_training),
  #       run_config=config,
  #       hparams=hparams)

  def evaluate_with_censor():
    """Evaluate the model on a censored image.

    Goal: examine one image only and evaluate it with different sections
    censored (i.e., blacked out). Create a heat map of the most
    important/distinguishing pixels for analysis.
    """
    # Create the estimator.
    eval_input_fn = functools.partial(
        input_fn,
        data_dir,
        subset='eval',
        batch_size=hparams.eval_batch_size,
        num_shards=num_gpus)

    classifier = tf.estimator.Estimator(
        model_fn=get_model_fn(num_gpus, variable_strategy,
                              config.num_worker_replicas or 1),
        config=config,
        params=hparams)

    num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval')
    if num_eval_examples % hparams.eval_batch_size != 0:
      raise ValueError(
          'validation set size must be multiple of eval_batch_size')
    eval_steps = num_eval_examples // hparams.eval_batch_size

    experiment = tf.contrib.learn.Experiment(
        classifier,
        train_input_fn=None,
        eval_input_fn=eval_input_fn,
        train_steps=1,
        eval_steps=eval_steps)
    tf.contrib.learn.learn_runner.run(
        lambda x, y: experiment, run_config=config, hparams=hparams)

  evaluate_with_censor()
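# The censoring itself is not implemented above; this is a minimal numpy
# sketch of the occlusion idea the docstring describes. All names are
# illustrative, and predict_prob stands in for a single-image prediction
# (e.g. one Estimator.predict call returning the true-class probability).
import numpy as np

def occlusion_heatmap(image, predict_prob, patch=4):
  """Slide a black patch over `image` (HWC) and record how much the true-class
  probability drops; a bigger drop marks a more important region."""
  h, w, _ = image.shape
  base = predict_prob(image)
  heat = np.zeros((h // patch, w // patch))
  for i in range(0, h, patch):
    for j in range(0, w, patch):
      censored = image.copy()
      censored[i:i + patch, j:j + patch, :] = 0.0  # black out one section
      heat[i // patch, j // patch] = base - predict_prob(censored)
  return heat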
def _setup(self, config):
    # The env variable is on deprecation path, default is set to off.
    os.environ["TF_SYNC_ON_FINISH"] = "0"
    os.environ["TF_ENABLE_WINOGRAD_NONFUSED"] = "1"

    # Session configuration.
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=args.log_device_placement,
        intra_op_parallelism_threads=args.num_intra_threads,
        gpu_options=tf.GPUOptions(force_gpu_compatible=True,
                                  allow_growth=True),
    )

    # Convert the grid's discrete indices to actual hyperparameter values.
    hparams["train_batch_size"] = 2**(int(config["batch_size"]) + 5)
    hparams["momentum"] = 0.4 + (0.55 * int(config["momentum"]) /
                                 args.precision)
    hparams["weight_decay"] = 1e-4 + (1e-4 * int(config["weight_decay"]) /
                                      args.precision)
    hparams["batch_norm_decay"] = 0.8 + (
        0.199 * int(config["batch_norm_decay"]) / args.precision)
    hparams["batch_norm_epsilon"] = 1e-5 + (
        0.00099 * int(config["batch_norm_epsilon"]) / args.precision)
    hparams["learning_rate"] = 0.01 + (0.1 * int(config["learning_rate"]) /
                                       args.precision)

    opt = int(config["optimizer"])
    if opt == 0:
        hparams["optimizer"] = "momentum"
    elif opt == 1:
        hparams["optimizer"] = "adam"
    elif opt == 2:
        hparams["optimizer"] = "adagrad"
    elif opt == 3:
        hparams["optimizer"] = "adadelta"
    elif opt == 4:
        hparams["optimizer"] = "sgd"
    else:
        hparams["optimizer"] = "rmsprop"

    # Calculate the number of steps in one epoch.
    self.train_steps = cifar10.Cifar10DataSet.num_examples_per_epoch(
        "train") // hparams["train_batch_size"]

    # TODO: Fix checkpoint dir
    run_config = cifar10_utils.RunConfig(
        session_config=sess_config,
        model_dir=None,
        save_checkpoints_secs=None,
        save_checkpoints_steps=self.train_steps,
        keep_checkpoint_max=None,
        keep_checkpoint_every_n_hours=None,
    )
    self.run_config = run_config

    self.train_input_fn, self.eval_input_fn, self.estimator = build_estimator(
        data_dir=args.data_dir,
        num_gpus=args.num_gpus,
        variable_strategy=args.variable_strategy,
        use_distortion_for_training=args.use_distortion_for_training,
        run_config=run_config,
        hparams=tf.contrib.training.HParams(is_chief=run_config.is_chief,
                                            **hparams),
    )

    self.logger = logging.getLogger("metrics")
    self.logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler(args.log_path)
    self.logger.addHandler(file_handler)
    self.logger.info(f"[CONFIG] ID={self._experiment_id} config={hparams}")
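# Worked example of the grid-to-hyperparameter mapping above, assuming
# args.precision == 10 (the actual value is set elsewhere):
precision = 10
grid = {"batch_size": "2", "momentum": "5", "learning_rate": "0"}
train_batch_size = 2**(int(grid["batch_size"]) + 5)                  # 128
momentum = 0.4 + 0.55 * int(grid["momentum"]) / precision            # 0.675
learning_rate = 0.01 + 0.1 * int(grid["learning_rate"]) / precision  # 0.01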
def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
  # The env variable is on deprecation path, default is set to off.
  os.environ['TF_SYNC_ON_FINISH'] = '0'
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  if FLAGS.num_gpus < 0:
    raise ValueError(
        'Invalid GPU count: "num_gpus" must be 0 or a positive integer.')
  if FLAGS.num_gpus == 0 and not FLAGS.is_cpu_ps:
    raise ValueError(
        'No GPU available for use, must use CPU as parameter server.')
  if (FLAGS.num_layers - 2) % 6 != 0:
    raise ValueError('Invalid num_layers parameter.')
  if FLAGS.num_gpus != 0 and FLAGS.train_batch_size % FLAGS.num_gpus != 0:
    raise ValueError('train_batch_size must be multiple of num_gpus.')

  # Session configuration.
  sess_config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=log_device_placement,
      intra_op_parallelism_threads=num_intra_threads,
      gpu_options=tf.GPUOptions(force_gpu_compatible=True))

  config = cifar10_utils.RunConfig(
      session_config=sess_config, model_dir=job_dir)
  tf.contrib.learn.learn_runner.run(
      get_experiment_fn(data_dir, num_gpus, variable_strategy,
                        use_distortion_for_training),
      run_config=config,
      hparams=tf.contrib.training.HParams(**hparams))