def test_transpose(self, dtype, ndims, seed, null_axes, engine, gc, dc):
    if gc.device_type == caffe2_pb2.CUDA and engine == "CUDNN":
        # cuDNN 5.1 does not support int32.
        assume(workspace.GetCuDNNVersion() >= 6000 or dtype != np.int32)

    dims = (np.random.rand(ndims) * 16 + 1).astype(np.int32)
    X = (np.random.rand(*dims) * 16).astype(dtype)

    if null_axes:
        axes = None
        op = core.CreateOperator(
            "Transpose",
            ["input"], ["output"],
            engine=engine)
    else:
        np.random.seed(int(seed))
        axes = [int(v) for v in list(np.random.permutation(X.ndim))]
        op = core.CreateOperator(
            "Transpose",
            ["input"], ["output"],
            axes=axes,
            engine=engine)

    def transpose_ref(x, axes):
        return (np.transpose(x, axes),)

    self.assertReferenceChecks(gc, op, [X, axes], transpose_ref)
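# A minimal sketch (not part of the test above) of what transpose_ref
# computes: np.transpose with an explicit axes permutation, or a full axis
# reversal when axes is None. Shapes below are illustrative only.
#
#   x = np.zeros((2, 3, 4))
#   np.transpose(x, (2, 0, 1)).shape   # -> (4, 2, 3)
#   np.transpose(x, None).shape        # -> (4, 3, 2), same as x.T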
def _cudnn_supports(dilation=False, nhwc=False):
    """Return True if cuDNN supports this configuration."""
    v = workspace.GetCuDNNVersion()
    if dilation and v < 6000:
        # Dilation not supported until v6.
        return False
    if dilation and nhwc:
        # Dilation and NHWC not supported together.
        return False
    return True
def _cudnn_supports(dilation=False, nhwc=False, backward=False):
    """Return True if cuDNN supports this configuration."""
    v = workspace.GetCuDNNVersion()
    if backward:
        if nhwc:
            # NHWC isn't supported in backward ops.
            return False
    else:
        # Forward mode.
        if dilation and v < 6000:
            # Dilation not supported until v6.
            return False
        if dilation and nhwc:
            # Dilation and NHWC not supported together.
            return False
    return True
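# A hedged usage sketch: tests typically gate unsupported cuDNN
# configurations with hypothesis' assume() so that those parameter draws are
# skipped rather than reported as failures. The test name and parameters
# below are illustrative, not taken from the files above.
#
#   from hypothesis import assume
#
#   def test_conv_cudnn(self, dilation, order, ...):
#       assume(_cudnn_supports(dilation=(dilation > 1),
#                              nhwc=(order == "NHWC"),
#                              backward=True))
#       ...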
def test_global_pooling(self, size, input_channels, batch_size,
                        order, op_type, engine, gc, dc):
    # CuDNN 5 does not support deterministic max pooling.
    assume(workspace.GetCuDNNVersion() >= 6000 or op_type != "MaxPool")
    op = core.CreateOperator(
        op_type,
        ["X"],
        ["Y"],
        order=order,
        engine=engine,
        global_pooling=True,
    )
    X = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32)
    if order == "NCHW":
        X = X.transpose((0, 3, 1, 2))

    self.assertDeviceChecks(dc, op, [X], [0])
    if 'MaxPool' not in op_type:
        self.assertGradientChecks(gc, op, [X], 0, [0])
def test_convolution_gradients(self, stride, pad, kernel, dilation, size,
                               input_channels, output_channels, batch_size,
                               order, engine, use_bias, gc, dc):
    # Effective (dilated) kernel extent.
    dkernel = dilation * (kernel - 1) + 1

    # cuDNN v6+ supports dilated convolutions.
    if workspace.GetCuDNNVersion() < 6000:
        assume("" == engine or 1 == dilation)
    assume(engine != "MKLDNN" or use_bias is True)

    op = core.CreateOperator(
        "Conv",
        ["X", "w", "b"] if use_bias else ["X", "w"],
        ["Y"],
        stride=stride,
        kernel=kernel,
        dilation=dilation,
        pad=pad,
        order=order,
        engine=engine,
    )
    X = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels, kernel, kernel, input_channels).astype(np.float32) \
        - 0.5
    b = np.random.rand(output_channels).astype(np.float32) - 0.5
    if order == "NCHW":
        X = X.transpose((0, 3, 1, 2))
        w = w.transpose((0, 3, 1, 2))

    inputs = [X, w, b] if use_bias else [X, w]
    # Error handling path: the padded input is smaller than the dilated
    # kernel, so the op must raise. The input is square, so a single check
    # covers both spatial dimensions.
    if size + pad + pad < dkernel:
        with self.assertRaises(RuntimeError):
            self.assertDeviceChecks(dc, op, inputs, [0])
        return

    self.assertDeviceChecks(dc, op, inputs, [0])
    for i in range(len(inputs)):
        self.assertGradientChecks(gc, op, inputs, i, [0])
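# A small worked example of the dilated-kernel arithmetic above (illustrative
# numbers, not drawn from the test parameters): with kernel=3 and dilation=2,
#
#   dkernel = dilation * (kernel - 1) + 1 = 2 * (3 - 1) + 1 = 5
#
# so a square input with size + 2 * pad < 5 cannot fit a single kernel
# application, and the Conv op is expected to raise.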
def test_global_max_pool_nchw(self, op_type, sz, batch_size, engine, gc, dc):
    '''Special test to stress the fast path of NCHW max pool.'''
    # CuDNN 5 does not support deterministic max pooling.
    assume(workspace.GetCuDNNVersion() >= 6000 or engine != "CUDNN")
    op = core.CreateOperator(
        op_type,
        ["X"],
        ["Y"],
        stride=1,
        kernel=sz,
        pad=0,
        order="NCHW",
        engine=engine,
        deterministic=1,
    )

    np.random.seed(1234)
    X = np.random.rand(batch_size, 3, sz, sz).astype(np.float32)

    self.assertDeviceChecks(dc, op, [X], [0])
    self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=1e-4)
def main():
    args = parse_args()
    if args.dtype == 'float32':
        args.dtype = 'float'
    # Report some available info.
    if args.device == 'gpu':
        assert args.num_gpus > 0, "Number of GPUs must be specified in GPU mode"
        print("__caffe2.cuda_version__=%s" % (json.dumps(workspace.GetCUDAVersion())))
        print("__caffe2.cudnn_version__=%s" % (json.dumps(workspace.GetCuDNNVersion())))

    try:
        opts = vars(args)
        opts['phase'] = 'inference' if args.forward_only else 'training'
        model_title, times = benchmark(opts)
    except Exception as err:
        # TODO: this is not happening; the program terminates earlier.
        # For now, do not rely on __results.status__=...
        times = np.zeros(0)
        model_title = 'Unk'
        print("Critical error while running benchmarks (%s). See stacktrace below."
              % (str(err)))
        traceback.print_exc(file=sys.stdout)

    if len(times) > 0:
        mean_time = np.mean(times)  # seconds
        # Compute mean throughput.
        num_local_devices = 1 if args.device == 'cpu' else args.num_gpus  # Compute devices per node
        num_devices = num_local_devices * args.num_workers  # Global number of devices
        replica_batch = args.batch_size  # Input is a replica batch
        mean_throughput = num_devices * replica_batch / mean_time  # images / sec
        #print("__results.time__=%s" % (json.dumps(1000.0 * mean_time)))
        print("__results.throughput__=%s" % (json.dumps(int(mean_throughput))))
        print("__exp.model_title__=%s" % (json.dumps(model_title)))
        print("__results.time_data__=%s" % (json.dumps((1000.0 * times).tolist())))
    else:
        print("__results.status__=%s" % (json.dumps("failure")))
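# A worked example of the throughput arithmetic above (illustrative numbers,
# not from any real run): with 4 GPUs per node, 2 workers, a replica batch of
# 64, and a mean step time of 0.25 s,
#
#   num_devices     = 4 * 2         = 8
#   mean_throughput = 8 * 64 / 0.25 = 2048 images / sec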
def test_convolution_layout(self, stride, pad, kernel, dilation, size,
                            input_channels, output_channels, batch_size,
                            use_bias, gc, dc):
    assume(size >= dilation * (kernel - 1) + 1)

    X = np.random.rand(
        batch_size, size, size, input_channels).astype(np.float32) - 0.5
    w = np.random.rand(
        output_channels, kernel, kernel, input_channels).astype(np.float32) \
        - 0.5
    b = np.random.rand(output_channels).astype(np.float32) - 0.5
    Output = collections.namedtuple("Output", ["Y", "engine", "order"])
    outputs = []

    cudnn_v6p = workspace.GetCuDNNVersion() >= 6000
    no_dilation = dilation == 1
    # cuDNN v6+ supports dilated convolutions; older versions only get the
    # CUDNN engine when the convolution is not dilated.
    engine_list = ["", "CUDNN"] if cudnn_v6p or no_dilation else [""]
    for order in ["NCHW", "NHWC"]:
        for engine in engine_list:
            op = core.CreateOperator(
                "Conv",
                ["X", "w", "b"] if use_bias else ["X", "w"],
                ["Y"],
                stride=stride,
                kernel=kernel,
                dilation=dilation,
                pad=pad,
                order=order,
                engine=engine,
                device_option=gc,
            )
            if order == "NCHW":
                X_f = X.transpose((0, 3, 1, 2))
                w_f = w.transpose((0, 3, 1, 2))
            else:
                X_f = X
                w_f = w
            self.assertDeviceChecks(
                dc, op, [X_f, w_f, b] if use_bias else [X_f, w_f], [0])
            self.ws.create_blob("X").feed(X_f, device_option=gc)
            self.ws.create_blob("w").feed(w_f, device_option=gc)
            self.ws.create_blob("b").feed(b, device_option=gc)
            self.ws.run(op)
            outputs.append(Output(
                Y=self.ws.blobs["Y"].fetch(), engine=engine, order=order))

    def canonical(o):
        # Compare all outputs in NCHW layout.
        if o.order == "NHWC":
            return o.Y.transpose((0, 3, 1, 2))
        else:
            return o.Y

    for o in outputs:
        np.testing.assert_allclose(canonical(outputs[0]),
                                   canonical(o),
                                   atol=1e-4,
                                   rtol=1e-4)
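# Layout note (illustrative shapes only): an NHWC tensor of shape
# (N, H, W, C) transposed by (0, 3, 1, 2) becomes NCHW with shape
# (N, C, H, W), which is why canonical() can compare outputs produced under
# either order directly.
#
#   y_nhwc = np.zeros((2, 5, 5, 8))
#   y_nhwc.transpose((0, 3, 1, 2)).shape   # -> (2, 8, 5, 5)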
def get_nvidia_info():
    return (
        get_nvidia_smi_output(),
        workspace.GetCUDAVersion(),
        workspace.GetCuDNNVersion(),
    )
    action='store_true',
    help='Enable Volta\'s tensor ops (requires CUDA >= 9, cuDNN >= 7 and an NVIDIA Volta GPU)'
)
args = parser.parse_args()
if args.dtype == 'float32':
    args.dtype = 'float'
# Report some available info.
if args.device == 'gpu':
    assert args.num_gpus > 0, "Number of GPUs must be specified in GPU mode"
    print("__caffe2.cuda_version__=%s" % (json.dumps(workspace.GetCUDAVersion())))
    print("__caffe2.cudnn_version__=%s" % (json.dumps(workspace.GetCuDNNVersion())))

try:
    opts = vars(args)
    opts['phase'] = 'inference' if args.forward_only else 'training'
    model_title, times = benchmark(opts)
except Exception as err:
    # TODO: this is not happening; the program terminates earlier.
    # For now, do not rely on __results.status__=...
    times = np.zeros(0)
    model_title = 'Unk'
    print("Critical error while running benchmarks (%s). See stacktrace below."
          % (str(err)))
    traceback.print_exc(file=sys.stdout)
parser.add_argument(
    '--num_decode_threads', type=int, required=False, default=1,
    help='Number of image decode threads. For high throughput models such as '
         'AlexNetOWT set to 6-8 for 4 Voltas.'
)
parser.add_argument(
    '--float16_compute', nargs='?', const=True, default=False, type=str2bool,
    help='If true, use FP16 SGD optimizer else use multi-precision SGD optimizer'
)
# These parameters affect the ModelHelper behaviour and are now applied for GPU benchmarks.
parser.add_argument(
    '--cudnn_workspace_limit_mb', type=int, required=False, default=64,
    help='CuDNN workspace limit in MBs'
)
parser.add_argument(
    '--use_cudnn', nargs='?', const=True, default=True, type=str2bool,
    help='Use NVIDIA cuDNN library.'
)
parser.add_argument(
    '--cudnn_exhaustive_search', nargs='?', const=True, default=True, type=str2bool,
    help='If true, use exhaustive search to pick cuDNN convolution algorithms.'
)
args = parser.parse_args()
if args.dtype == 'float32':
    args.dtype = 'float'
# Report some available info.
if args.device == 'gpu':
    assert args.num_gpus > 0, "Number of GPUs must be specified in GPU mode"
    print("__caffe2.cuda_version__=%s" % (json.dumps(workspace.GetCUDAVersion())))
    print("__caffe2.cudnn_version__=%s" % (json.dumps(workspace.GetCuDNNVersion())))

try:
    opts = vars(args)
    opts['phase'] = 'inference' if args.forward_only else 'training'
    model_title, times = benchmark(opts)
except Exception as err:
    # TODO: this is not happening; the program terminates earlier.
    # For now, do not rely on __results.status__=...
    times = np.zeros(0)
    model_title = 'Unk'
    print("Critical error while running benchmarks (%s). See stacktrace below."
          % (str(err)))
    traceback.print_exc(file=sys.stdout)

if len(times) > 0:
    mean_time = np.mean(times)  # seconds