def profile(self, device_name, options, executor=None):
    """Profile every layer of the network on the named device.

    The FLOP-based profiler is run for each layer in topological order.
    When the device is a GPU and ``executor`` is ``'cudnn'`` or
    ``'tensorflow'``, the matching executor profiler is run on the layer
    as well.

    Args:
        device_name: Key into ``device.DEVICES`` selecting the device spec.
        options: Profiler options handed to every profiler. Note that it
            is modified in place: ``use_cudnn_heuristics`` is set to
            False for each profiled layer when ``executor`` is
            ``'tensorflow'``.
        executor: ``'cudnn'``, ``'tensorflow'`` or None. With any other
            value no executor profiler is run.

    Returns:
        A list holding one tuple per layer that an executor profiler
        measured:
          (layer name, flops total time, executor total time, 0,
           flops message, executor message)
        The constant 0 occupies the ``executor_std`` slot. Layers without
        an executor measurement (non-GPU device, or no recognised
        executor) are logged but not included in the list.
    """
    target = device.DEVICES[device_name]
    logger.info('Profiling for device %s' % target.name)

    records = []
    for node in self.graph.topology_order:
        op = node.layer_op

        if executor == 'tensorflow':
            # Disable the cudnn heuristics: Tensorflow requires creating a
            # cuda stream and does not allow multiple contexts under one
            # process, and the cuda stream cannot be used because of the
            # python wrapper.
            options.use_cudnn_heuristics = False

        # The FLOP-based estimate is always computed.
        estimator = profilers.FlopsProfiler(options, target)
        estimate = estimator.profile(op)

        logger.info('Layer: %s' % node.name)
        flops_line = '- %s: %s %s' % (estimator.name, estimate,
                                      estimator.message)
        logger.info(flops_line)

        # Executor-based measurements only exist for GPU devices.
        if not target.is_gpu:
            continue

        # The executor profilers are imported lazily, only when requested.
        if executor == 'cudnn':
            from profilers.cudnn_profiler import CudnnProfiler
            runner = CudnnProfiler(options)
        elif executor == 'tensorflow':
            from profilers.tensorflow_profiler import (
                TensorFlowProfiler)
            runner = TensorFlowProfiler(options)
        else:
            runner = None

        if not runner:
            continue

        measured = runner.profile(op)
        executor_line = '- %s: %s %s' % (runner.name, measured,
                                         runner.message)
        logger.info(executor_line)

        record = (node.name, estimate.total_time, measured.total_time, 0,
                  estimator.message, runner.message)
        records.append(record)

    return records
def _profile_for_batch_size(layer_list, direction, device, batch_size,
                            use_only_gemm, ppp_comp, ppp_comm,
                            cross_device_bandwidth=None):
    """Estimate per-layer execution time with the FLOP-based profiler.

    Args:
        layer_list: Layer specs to profile, in order. Each one supplies
            ``layer_op``, ``device_id`` and ``parents``.
        direction: Pass direction stored on the profiler options.
        device: Device spec handed to ``profilers.FlopsProfiler``.
        batch_size: When truthy, written to ``batch_size`` of every layer
            op before it is profiled (the layer op is modified in place).
        use_only_gemm: When truthy, cudnn heuristics are switched off in
            the profiler options.
        ppp_comp: Stored unchanged on the profiler options.
        ppp_comm: Stored unchanged on the profiler options.
        cross_device_bandwidth: Forwarded to ``FlopsProfiler.profile``.

    Returns:
        A ``(times, params_in_bytes)`` tuple: the list of per-layer
        results returned by the profiler, and the sum of
        ``weights_in_bytes`` over all layers.
    """
    banner = ('Profile for\n pass: %s\n device: %s\n batch size: %s' %
              (direction, device.name, batch_size))
    logger.debug(banner)

    layer_times = []
    total_weight_bytes = 0

    # Estimate the time of each layer for the requested pass.
    for spec in layer_list:
        op = spec.layer_op
        if batch_size:
            op.batch_size = batch_size

        opts = profilers.ProfilerOptions()
        opts.direction = direction
        opts.gradient_wrt = None
        if use_only_gemm:
            opts.use_cudnn_heuristics = False
        # FIXME: we don't include bias and activation for simplicity.
        opts.include_bias_and_activation = False
        opts.ppp_comp = ppp_comp
        opts.ppp_comm = ppp_comm

        # Open question carried over from the original code: why is a new
        # profiler instantiated for every layer?
        estimator = profilers.FlopsProfiler(opts, device)
        parent_device_ids = [parent.device_id for parent in spec.parents]
        elapsed = estimator.profile(op, spec.device_id, parent_device_ids,
                                    cross_device_bandwidth)

        total_weight_bytes += op.weights_in_bytes
        layer_times.append(elapsed)

    return layer_times, total_weight_bytes
def _profile_for_apply_updates(params_in_bytes, device):
    """Estimate the apply-updates step with the FLOP-based profiler.

    Args:
        params_in_bytes: Parameter size in bytes, forwarded to
            ``FlopsProfiler.profile_apply_updates``.
        device: Device spec handed to ``profilers.FlopsProfiler``.

    Returns:
        Whatever ``FlopsProfiler.profile_apply_updates`` returns for the
        given parameter size, using default ``ProfilerOptions``.
    """
    default_options = profilers.ProfilerOptions()
    estimator = profilers.FlopsProfiler(default_options, device)
    update_time = estimator.profile_apply_updates(params_in_bytes)
    return update_time