Example #1
    def __init__(self, params):
        """Initialize BenchmarkCNN.

        Args:
          params: Params tuple, typically created by make_params or
                  make_params_from_flags.
        Raises:
          ValueError: Unsupported params settings.
        """
        self.params = params
        if FLAGS.deterministic:
            assert self.params.data_dir is None
            self.dataset = datasets.create_dataset(None,
                                                   self.params.data_name)
        else:
            self.dataset = datasets.create_dataset(self.params.data_dir,
                                                   self.params.data_name)
        self.model = model_config.get_model_config(self.params.model,
                                                   self.dataset)
        self.data_format = self.params.data_format
        self.resize_method = self.params.resize_method
        self.use_synthetic_gpu_images = self.dataset.use_synthetic_gpu_images()
        self.num_batches_for_eval = self.params.num_batches_for_eval

        if ((self.params.num_epochs_per_decay or
             self.params.learning_rate_decay_factor) and
                not (self.params.learning_rate and
                     self.params.num_epochs_per_decay and
                     self.params.learning_rate_decay_factor)):
            raise ValueError('If one of num_epochs_per_decay or '
                             'learning_rate_decay_factor is set, both must be '
                             'set and learning_rate must be set')
        if (self.params.minimum_learning_rate and
                not (self.params.learning_rate and
                     self.params.num_epochs_per_decay and
                     self.params.learning_rate_decay_factor)):
            raise ValueError('minimum_learning_rate requires learning_rate, '
                             'num_epochs_per_decay, and '
                             'learning_rate_decay_factor to be set')

        # Use the batch size from the command line if specified, otherwise use
        # the model's default batch size. (This variant does not scale the
        # batch size by the number of GPUs.)
        if self.params.batch_size > 0:
            self.model.set_batch_size(self.params.batch_size)
        self.batch_size = self.model.get_batch_size()
        self.batch_group_size = self.params.batch_group_size
        self.loss_scale = None
        self.loss_scale_normal_steps = None
        self.image_preprocessor = self.get_image_preprocessor()
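
A sketch of how this constructor is typically driven, using the make_params helper named in the docstring; the flag values below are illustrative, not defaults:

# Hypothetical driver for the constructor above; make_params is the helper
# mentioned in the docstring, and the flag values are illustrative.
import benchmark_cnn

params = benchmark_cnn.make_params(model='resnet50',
                                   batch_size=64,
                                   data_format='NCHW')
bench = benchmark_cnn.BenchmarkCNN(params)
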
Example #2
    def __init__(self):
        self.model = FLAGS.model
        self.model_conf = model_config.get_model_config(self.model)
        self.trace_filename = FLAGS.trace_file
        self.data_format = FLAGS.data_format
        self.num_batches = FLAGS.num_batches
        autotune_threshold = FLAGS.autotune_threshold or 1
        min_autotune_warmup = 5 * autotune_threshold * autotune_threshold
        self.num_warmup_batches = (FLAGS.num_warmup_batches or
                                   max(10, min_autotune_warmup))
        self.graph_file = FLAGS.graph_file
        self.resize_method = FLAGS.resize_method
        self.sync_queue_counter = 0
        self.num_gpus = FLAGS.num_gpus

        # Use the batch size from the command line if specified, otherwise use the
        # model's default batch size.  Scale the benchmark's batch size by the
        # number of GPUs.
        if FLAGS.batch_size > 0:
            self.model_conf.set_batch_size(FLAGS.batch_size)
        self.batch_size = self.model_conf.get_batch_size() * FLAGS.num_gpus

        # Use the learning rate from the command line if specified, otherwise use
        # the model's default learning rate, which must always be set.
        assert self.model_conf.get_learning_rate() > 0.0
        if FLAGS.learning_rate is not None:
            self.model_conf.set_learning_rate(FLAGS.learning_rate)

        self.job_name = FLAGS.job_name  # "" for local training
        self.ps_hosts = FLAGS.ps_hosts.split(',')
        self.worker_hosts = FLAGS.worker_hosts.split(',')
        self.dataset = None
        self.data_name = FLAGS.data_name
        if FLAGS.data_dir is not None:
            if self.data_name is None:
                if 'imagenet' in FLAGS.data_dir:
                    self.data_name = 'imagenet'
                elif 'flowers' in FLAGS.data_dir:
                    self.data_name = 'flowers'
                else:
                    raise ValueError('Could not identify name of dataset. '
                                     'Please specify with --data_name option.')
            if self.data_name == 'imagenet':
                self.dataset = datasets.ImagenetData(FLAGS.data_dir)
            elif self.data_name == 'flowers':
                self.dataset = datasets.FlowersData(FLAGS.data_dir)
            else:
                raise ValueError(
                    'Unknown dataset. Must be one of imagenet or flowers.')

        self.local_parameter_device_flag = FLAGS.local_parameter_device
        if self.job_name:
            self.task_index = FLAGS.task_index
            self.cluster = tf.train.ClusterSpec({
                'ps': self.ps_hosts,
                'worker': self.worker_hosts
            })
            self.server = tf.train.Server(self.cluster,
                                          job_name=self.job_name,
                                          task_index=self.task_index,
                                          config=create_config_proto(),
                                          protocol=FLAGS.server_protocol)
            worker_prefix = '/job:worker/task:%s' % self.task_index
            self.param_server_device = tf.train.replica_device_setter(
                worker_device=worker_prefix + '/cpu:0', cluster=self.cluster)
            # The device on which the queues for managing synchronization
            # between servers should be stored.
            num_ps = len(self.ps_hosts)
            self.sync_queue_devices = [
                '/job:ps/task:%s/cpu:0' % i for i in range(num_ps)
            ]
        else:
            self.task_index = 0
            self.cluster = None
            self.server = None
            worker_prefix = ''
            self.param_server_device = '/%s:0' % FLAGS.local_parameter_device
            self.sync_queue_devices = [self.param_server_device]

        # Device to use for ops that need to always run on the local worker's CPU.
        self.cpu_device = '%s/cpu:0' % worker_prefix

        # Device to use for ops that need to always run on the local worker's
        # compute device, and never on a parameter server device.
        self.raw_devices = [
            '%s/%s:%i' % (worker_prefix, FLAGS.device, i)
            for i in range(FLAGS.num_gpus)
        ]

        if FLAGS.staged_vars and FLAGS.variable_update != 'parameter_server':
            raise ValueError('staged_vars is currently only supported with '
                             '--variable_update=parameter_server')

        if FLAGS.variable_update == 'parameter_server':
            if self.job_name:
                if not FLAGS.staged_vars:
                    self.variable_mgr = variable_mgr.VariableMgrDistributedFetchFromPS(
                        self)
                else:
                    self.variable_mgr = (
                        variable_mgr.VariableMgrDistributedFetchFromStagedPS(
                            self))
            else:
                if not FLAGS.staged_vars:
                    self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromPS(
                        self)
                else:
                    self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromStagedPS(
                        self)
        elif FLAGS.variable_update == 'replicated':
            if self.job_name:
                raise ValueError(
                    'Invalid --variable_update in distributed mode: %s' %
                    FLAGS.variable_update)
            self.variable_mgr = variable_mgr.VariableMgrLocalReplicated(
                self, FLAGS.use_nccl)
        elif FLAGS.variable_update == 'distributed_replicated':
            if not self.job_name:
                raise ValueError(
                    'Invalid --variable_update in local mode: %s' %
                    FLAGS.variable_update)
            self.variable_mgr = variable_mgr.VariableMgrDistributedReplicated(
                self)
        elif FLAGS.variable_update == 'independent':
            if self.job_name:
                raise ValueError(
                    'Invalid --variable_update in distributed mode: %s' %
                    FLAGS.variable_update)
            self.variable_mgr = variable_mgr.VariableMgrIndependent(self)
        else:
            raise ValueError('Invalid --variable_update: %s' %
                             FLAGS.variable_update)

        # Devices to use for running on the local worker's compute devices,
        # with variables assigned to parameter server devices.
        self.devices = self.variable_mgr.get_devices()
        if self.job_name:
            self.global_step_device = self.param_server_device
        else:
            self.global_step_device = self.cpu_device
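
The distributed branch above relies on tf.train.replica_device_setter to pin variables to the ps tasks while ops stay on the worker. A standalone sketch of that placement (TF1 API; the host:port addresses are hypothetical):

# Standalone sketch of the device placement used above (TF1 API).
# The cluster addresses are hypothetical.
import tensorflow as tf

cluster = tf.train.ClusterSpec({
    'ps': ['ps0:2222', 'ps1:2222'],
    'worker': ['worker0:2222', 'worker1:2222'],
})
with tf.device(tf.train.replica_device_setter(
        worker_device='/job:worker/task:0/cpu:0', cluster=cluster)):
    # Variables created in this scope are placed round-robin on the ps
    # tasks; other ops run on the worker device given above.
    v = tf.Variable(tf.zeros([10]), name='v')
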
Example #3
if args.inference not in ['True', 'False']:
    print('inference only accepts True or False as a parameter')
    sys.exit(1)

inference = args.inference == 'True'

# Load the json config file for the requested mode.
with open(args.pwd + "/config.json", 'r') as config_file:
    config = json.load(config_file)[args.mode]


def get_backend_version():
    if keras.backend.backend() == "tensorflow":
        return tf.__version__
    if keras.backend.backend() == "mxnet":
        return mxnet.__version__
    return "undefined"


model = model_config.get_model_config(args.model_name)

use_dataset_tensors = False
if args.epochs:
    model.run_benchmark(gpus=config['gpus'], inference=inference,
                        use_dataset_tensors=use_dataset_tensors,
                        epochs=int(args.epochs))
else:
    model.run_benchmark(gpus=config['gpus'], inference=inference,
                        use_dataset_tensors=use_dataset_tensors)
if args.dry_run:
    print("Model:", model.test_name, "total_time:", model.total_time)
Example #4
# Imports assumed for this snippet (the tf_cnn_benchmarks modules must be
# on the path).
from types import SimpleNamespace

import tensorflow as tf

import datasets
import model_config

in_dir = None  # no data directory, so create_dataset falls back to synthetic images
in_name = 'imagenet'
model_name = 'inception3'
batch_size = 64

# Minimal stand-in for the full tf_cnn_benchmarks Params tuple; only the
# dataset-related fields consumed by the input pipeline are filled in.
params = SimpleNamespace(
    model=model_name,
    use_datasets=True,
    datasets_repeat_cached_sample=False,
    datasets_num_private_threads=None,
    datasets_use_caching=False,
    datasets_parallel_interleave_cycle_length=None,
    datasets_sloppy_parallel_interleave=False,
    datasets_parallel_interleave_prefetch=None,
)

dataset = datasets.create_dataset(in_dir, in_name)
model = model_config.get_model_config(model_name, in_name)()

output_shape = model.get_input_shapes(subset='train')
print(output_shape)

reader = dataset.get_input_preprocessor('default')(batch_size,
                                                   output_shape,
                                                   1,
                                                   dtype=model.data_type,
                                                   train=False,
                                                   distortions=True,
                                                   resize_method='bilinear')
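# The positional arguments below mirror the dataset-related params fields set
# above (splits, subset, train flag, caching and parallel-interleave settings).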
ds = reader.create_dataset(batch_size, 1, batch_size, dataset, 'train', False,
                           False, None, False, None, False, None)
it = tf.compat.v1.data.make_initializable_iterator(ds)
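
To pull an actual batch, the initializable iterator has to be run in a TF1-style (graph-mode) session; a minimal sketch, assuming each dataset element is an (images, labels) pair:

# Minimal sketch: run the iterator in graph mode and fetch one batch.
# Assumes eager execution is disabled (TF1-style) and that each element
# is an (images, labels) pair.
with tf.compat.v1.Session() as sess:
    sess.run(it.initializer)
    images, labels = sess.run(it.get_next())
    print(images.shape, labels.shape)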