def testShapeInferenceSimpleFC(self):
    """Check shape inference for two chained FC layers.

    Builds ``data -> fc1 -> fc2`` and runs ``workspace.InferShapesAndTypes``
    with ``data`` declared as ``[64, 96]``, then checks the inferred shape
    of the input, of each layer's ``_w`` / ``_b`` blobs, and of each output.
    """
    m = cnn.CNNModelHelper()
    m.FC("data", "fc1", dim_in=96, dim_out=32)
    m.FC("fc1", "fc2", dim_in=32, dim_out=55)

    # Only the shapes are checked here; the inferred types are not used.
    (shapes, _types) = workspace.InferShapesAndTypes(
        [m.param_init_net, m.net],
        {'data': [64, 96]},
    )

    # assertEqual is used instead of the deprecated assertEquals alias,
    # which was removed from unittest in Python 3.12.
    self.assertEqual(shapes['data'], [64, 96])
    self.assertEqual(shapes['fc1_w'], [32, 96])
    self.assertEqual(shapes['fc1_b'], [32])
    self.assertEqual(shapes['fc1'], [64, 32])
    self.assertEqual(shapes['fc2_w'], [55, 32])
    self.assertEqual(shapes['fc2_b'], [55])
    self.assertEqual(shapes['fc2'], [64, 55])
def testShapeInferenceTranspose(self):
    """Run the infer-and-compare check on Transpose, without and with axes.

    Returns whatever ``self.InferTensorRunAndCompare`` returns for the
    second (explicit axes) configuration.
    """
    net_helper = cnn.CNNModelHelper()

    input_tensor = np.random.rand(4, 2, 3, 3, 5).astype(np.float32)
    workspace.FeedBlob("tensor", input_tensor)

    # Case 1: the ``axes`` argument is left unspecified.
    net_helper.Transpose(["tensor"], "transpose")
    self.InferTensorRunAndCompare(net_helper)

    # Case 2: ``axes`` is a random permutation of the five dimensions.
    permutation = np.random.permutation(5)
    net_helper.Transpose(["tensor"], "transpose", axes=permutation)

    outcome = self.InferTensorRunAndCompare(net_helper)
    return outcome
def gen_test_resnet50(_order, _cudnn_ws):
    """Build a ResNet-50 test model (``is_test=False``) and return ``(model, 227)``.

    Both arguments are accepted but ignored: the model is always created
    with ``order="NCHW"``.
    """
    net_helper = cnn.CNNModelHelper(
        order="NCHW",
        name="resnet_50_test",
        cudnn_exhaustive_search=True,
    )

    data_blob = net_helper.net.AddExternalInput("data")
    label_blob = net_helper.net.AddExternalInput("label")

    # The two returned blobs are not needed by the caller of this generator.
    _softmax_out, _loss_out = resnet.create_resnet50(
        net_helper,
        data_blob,
        num_input_channels=3,
        num_labels=1000,
        label=label_blob,
        is_test=False,
    )

    # NOTE(review): 227 is presumably the expected input image size -- confirm
    # against the caller.
    return net_helper, 227
def testShapeInferencePad(self):
    """Run the infer-and-compare check on a PadImage op with uneven padding."""
    pad_model = cnn.CNNModelHelper(name="padtest")
    pad_model.PadImage(
        "data",
        'padded',
        pad_t=100,
        pad_l=37,
        pad_b=28,
        pad_r=20,
        mode="constant",
        order="NCHW",
    )

    image_batch = np.random.rand(16, 3, 228, 228).astype(np.float32)
    workspace.FeedBlob("data", image_batch)

    self.InferTensorRunAndCompare(pad_model)
def test_forward_only_fast_simplenet(
    create_model,
    last_out_blob,
    data_blob="gpu_0/data",
    num_labels=1000,
):
    """Compare a forward-only net against its ``optimize_inference_fast`` version.

    Args:
        create_model: callable invoked as ``create_model(model, data,
            num_input_channels=3, num_labels=..., is_test=True)``.
        last_out_blob: name of the blob fetched after each run.
        data_blob: name of the external input blob that receives the data.
        num_labels: forwarded to ``create_model``.

    Returns:
        ``[(count_after, count_before), num_shared_blobs,
        (loss1, optimized_loss1)]``.
    """
    net_helper = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        input_ref = net_helper.net.AddExternalInput(data_blob)
        create_model(
            net_helper,
            input_ref,
            num_input_channels=3,
            num_labels=num_labels,
            is_test=True,
        )

    count_before = count_blobs(net_helper.net.Proto())

    # Time the optimization pass; the input, the output and every external
    # input are passed as the second argument of optimize_inference_fast.
    started = time.time()
    static_blobs = {data_blob, last_out_blob} | set(
        net_helper.net.Proto().external_input
    )
    optim_proto = memonger.optimize_inference_fast(
        net_helper.net.Proto(),
        static_blobs,
    )
    print("Optimization took {} secs".format(time.time() - started))

    count_after = count_blobs(optim_proto)
    num_shared_blobs = count_shared_blobs(optim_proto)
    print(count_after, count_before, num_shared_blobs)

    # Run the original net, then the optimized one, on the same input.
    workspace.RunNetOnce(net_helper.param_init_net)
    batch = np.random.rand(4, 3, 227, 227).astype(np.float32)
    workspace.FeedBlob(data_blob, batch)

    net_helper.net.Proto().type = 'simple'
    workspace.RunNetOnce(net_helper.net)
    loss1 = workspace.FetchBlob(last_out_blob)

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob(last_out_blob)

    blob_counts = (count_after, count_before)
    losses = (loss1, optimized_loss1)
    return [blob_counts, num_shared_blobs, losses]
def test_copy_gradient_multiple_gpus(self):
    """Check Copy ops and their gradients across CPU, CUDA 0 and CUDA 1.

    Verifies that the copied blobs and their gradients are equal, and that
    the ops producing ``x_gpu_2_grad`` / ``x_cpu_grad`` carry the expected
    device option.
    """

    def find_single_output_op(net_model, blob_name):
        # First op whose one and only output is ``blob_name``, else None.
        for candidate in net_model.net.Proto().op:
            if len(candidate.output) == 1 and candidate.output[0] == blob_name:
                return candidate
        return None

    copy_model = cnn.CNNModelHelper(name="copy_test")

    cpu_option = core.DeviceOption(caffe2_pb2.CPU, 0)
    with core.DeviceScope(cpu_option):
        x_cpu = copy_model.net.AddExternalInputs("x_cpu")

    first_gpu = core.DeviceOption(caffe2_pb2.CUDA, 0)
    with core.DeviceScope(first_gpu):
        x_gpu_1 = copy_model.CopyCPUToGPU(x_cpu, "x_gpu_1")

    second_gpu = core.DeviceOption(caffe2_pb2.CUDA, 1)
    with core.DeviceScope(second_gpu):
        x_gpu_2 = copy_model.Copy(x_gpu_1, "x_gpu_2")
        loss = copy_model.AveragedLoss(x_gpu_2, "loss")
        gradient_map = copy_model.AddGradientOperators([loss])

    workspace.FeedBlob("x_cpu", np.random.rand(32).astype(np.float32))
    workspace.RunNetOnce(copy_model.param_init_net)
    workspace.RunNetOnce(copy_model.net)
    print(copy_model.net.Proto())

    # Forward values must survive the GPU-to-GPU copy unchanged.
    values_match = np.array_equal(
        workspace.FetchBlob("x_gpu_1"),
        workspace.FetchBlob("x_gpu_2"),
    )
    self.assertTrue(values_match)

    # The same must hold for the corresponding gradients.
    grads_match = np.array_equal(
        workspace.FetchBlob(gradient_map["x_gpu_1"]),
        workspace.FetchBlob(gradient_map["x_gpu_2"]),
    )
    self.assertTrue(grads_match)

    self.assertEqual(
        find_single_output_op(copy_model, "x_gpu_2_grad").device_option,
        core.DeviceOption(caffe2_pb2.CUDA, 1),
    )
    self.assertEqual(
        find_single_output_op(copy_model, "x_cpu_grad").device_option,
        core.DeviceOption(caffe2_pb2.CUDA, 0),
    )
def run(comm_rank, comm_size, tmpdir):
    """Run an empty CPU data-parallel model twice, delaying rank 0 in between.

    Args:
        comm_rank: shard id of this participant in the rendezvous.
        comm_size: total number of shards.
        tmpdir: path handed to the ``FileStoreHandlerCreate`` operator.
    """

    def _no_input_ops(model):
        pass

    def _no_model_ops(model, loss_scale):
        return []

    def _no_optimizer(model):
        pass

    workspace.ResetWorkspace()

    store_handler = "store_handler"
    create_store_op = core.CreateOperator(
        "FileStoreHandlerCreate", [], [store_handler], path=tmpdir)
    workspace.RunOperatorOnce(create_store_op)

    rendezvous = {
        "kv_handler": store_handler,
        "shard_id": comm_rank,
        "num_shards": comm_size,
        "engine": 'GLOO',
    }

    model = cnn.CNNModelHelper(order="NHWC", name="test")

    # Lower the module-level default timeout to 2 seconds. Rank 0 later
    # sleeps for that same duration between the two RunNet calls; the second
    # RunNet is expected to go through without timing out.
    data_parallel_model._DEFAULT_TIMEOUT_SEC = 2
    data_parallel_model.Parallelize_CPU(
        model,
        input_builder_fun=_no_input_ops,
        forward_pass_builder_fun=_no_model_ops,
        optimizer_builder_fun=_no_optimizer,
        devices=[1, 2, 3],
        rendezvous=rendezvous,
        barrier_net_timeout_sec=5,
    )
    data_parallel_model.RunInitNet(model)

    data_parallel_model.RunNet(model, 2)
    if comm_rank == 0:
        time.sleep(data_parallel_model._DEFAULT_TIMEOUT_SEC)
    data_parallel_model.RunNet(model, 2)
def test_leaky_relu_cnn_helper(self, N, C, H, W, order, alpha, seed):
    """Run LeakyRelu through CNNModelHelper and check the output shape.

    The input is generated as (N, C, H, W); for ``order == 'NHWC'`` it is
    transposed before feeding and the output is transposed back before the
    shape check.
    """
    np.random.seed(seed)
    channels_last = order == 'NHWC'

    helper = cnn.CNNModelHelper(order=order)
    helper.LeakyRelu('input', 'output', alpha=alpha)

    features = np.random.rand(N, C, H, W).astype(np.float32)
    if channels_last:
        features = np.transpose(features, axes=(0, 2, 3, 1))

    self.ws.create_blob('input').feed(features)

    init_net = self.ws.create_net(helper.param_init_net)
    init_net.run()
    main_net = self.ws.create_net(helper.net)
    main_net.run()

    result = self.ws.blobs['output'].fetch()
    if channels_last:
        result = np.transpose(result, axes=(0, 3, 1, 2))

    assert result.shape == (N, C, H, W)
def test_simple_cnnmodel(self):
    """Build a small conv/FC model on MKL and convert it to a graph def.

    The graph is produced with ``tb.model_to_graph_def`` and a sorted,
    node-by-node text form of both the expected file and the actual graph
    is computed.
    """
    model = cnn.CNNModelHelper("NCHW", name="overfeat")
    workspace.FeedBlob("data", np.random.randn(1, 3, 64, 64).astype(np.float32))
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin yields the same dtype.
    workspace.FeedBlob("label", np.random.randn(1, 1000).astype(int))

    with core.NameScope("conv1"):
        conv1 = model.Conv("data", "conv1", 3, 96, 11, stride=4)
        relu1 = model.Relu(conv1, conv1)
        pool1 = model.MaxPool(relu1, "pool1", kernel=2, stride=2)
    with core.NameScope("classifier"):
        fc = model.FC(pool1, "fc", 4096, 1000)
        pred = model.Softmax(fc, "pred")
        xent = model.LabelCrossEntropy([pred, "label"], "xent")
        loss = model.AveragedLoss(xent, "loss")

    model.net.RunAllOnMKL()
    model.param_init_net.RunAllOnMKL()
    model.AddGradientOperators([loss], skip=1)

    blob_name_tracker = {}
    graph = tb.model_to_graph_def(
        model,
        blob_name_tracker=blob_name_tracker,
        shapes={},
        show_simplified=False,
    )
    # The blob-name assertion below is disabled in the original test.
    #self.assertEqual(
    #    blob_name_tracker['GRADIENTS/conv1/conv1_b_grad'],
    #    'conv1/conv1_b_grad',
    #)
    self.maxDiff = None

    # We can't guarantee the order in which the nodes appear, so both sides
    # are split on the node separator and sorted.
    with open('tests/expect/caffe_overfeat.expect') as f:
        EXPECTED_CNN = f.read()
    sep = "node {"
    expected = "\n".join(sorted(
        sep + "\n " + part.strip()
        for part in EXPECTED_CNN.strip().split(sep)
        if part.strip()
    ))
    actual = "\n".join(sorted(
        sep + "\n " + part.strip()
        for part in str(graph).strip().split(sep)
        if part.strip()
    ))
    # NOTE(review): ``expected`` and ``actual`` are computed but never
    # compared in this test -- confirm whether the comparison was disabled
    # on purpose before adding an assertMultiLineEqual here.
def test_net_gradient_checker(self):
    """Run NetGradientChecker on an FC layer feeding two averaged L2 losses."""
    model = cnn.CNNModelHelper(name="test")
    const = model.net.AddExternalInputs("const1", "const2")
    fc = model.FC(dim_in=3, dim_out=4, blob_in="X", blob_out="Y", axis=0)

    # One squared-L2 distance per constant, each reduced to its own loss,
    # so the checker sees two outputs with gradients.
    distances = []
    for constant in const:
        distances.append(model.net.SquaredL2Distance([fc, constant]))
    losses = []
    for distance in distances:
        losses.append(model.net.AveragedLoss(distance))

    workspace.RunNetOnce(model.param_init_net)

    feed = {
        "X": np.array([1, 2, 3], dtype="float32"),
        const[0]: np.array([1, 1, 1, 1], dtype="float32"),
        const[1]: np.array([2, 2, 2, 2], dtype="float32"),
    }
    gradient_checker.NetGradientChecker.Check(
        model.net,
        outputs_with_grad=losses,
        input_values=feed,
        input_to_check="X",
    )
def _createDense(self):
    """Create a single-FC regression model together with synthetic data.

    Returns:
        ``(model, perfect_model, data, label)`` where ``label`` is the dot
        product of ``data`` with ``perfect_model``, as a column.
    """
    target_weights = np.array([2, 6, 5, 0, 1]).astype(np.float32)
    num_features = target_weights.size

    # Fixed seed keeps the generated data, and thus the test, deterministic.
    np.random.seed(123)
    features = np.random.randint(2, size=(20, num_features)).astype(np.float32)
    targets = np.dot(features, target_weights)[:, np.newaxis]

    model = cnn.CNNModelHelper("NCHW", name="test")
    prediction = model.FC(
        'data',
        'fc',
        num_features,
        1,
        ('ConstantFill', {}),
        ('ConstantFill', {}),
        axis=0,
    )
    squared_error = model.SquaredL2Distance([prediction, 'label'])
    loss = model.AveragedLoss(squared_error, "avg_loss")

    grad_map = model.AddGradientOperators([loss])
    self.assertIsInstance(grad_map['fc_w'], core.BlobReference)

    return (model, target_weights, features, targets)
def add_parameter_update_ops(model):
    """Add an ITER blob, a constant LR and a MomentumSGDUpdate per parameter."""
    model.Iter("ITER")
    LR = model.param_init_net.ConstantFill([], 'LR', shape=[1], value=0.1)
    for param in model.GetParams():
        param_grad = model.param_to_grad[param]
        # Momentum blob is created from the parameter and filled with zeros.
        param_momentum = model.param_init_net.ConstantFill(
            [param], param + '_momentum', value=0.0)
        model.net.MomentumSGDUpdate(
            [param_grad, param_momentum, LR, param],
            [param_grad, param_momentum, param],
        )

# The statements below belong to the enclosing test body; ``self``,
# ``add_input_ops`` and ``add_model_ops`` are defined outside this snippet.
model = cnn.CNNModelHelper(
    order="NHWC",
    name="test",
)
data_parallel_model.Parallelize_GPU(
    model,
    input_builder_fun=add_input_ops,
    forward_pass_builder_fun=add_model_ops,
    param_update_builder_fun=add_parameter_update_ops,
    devices=[1, 2, 3],
)

# Only gpu_1 params should be returned (gpu_1 is the first gpu)
checkpoint_params = data_parallel_model.GetCheckpointParams(model)
for p in model.GetParams("gpu_1/"):
    self.assertIn(p, checkpoint_params)
    self.assertIn(p + "_momentum", checkpoint_params)
# Parameters of the other GPUs must NOT be part of the checkpoint; the
# previous positive assertion contradicted the intent stated above.
for p in model.GetParams("gpu_2/"):
    self.assertNotIn(p, checkpoint_params)
for c in model.GetComputedParams("gpu_1/"):
    self.assertNotIn(c, checkpoint_params)
for c in model.GetComputedParams("gpu_2/"):
    self.assertNotIn(c, checkpoint_params)
# Input data is not checkpointed, while the iteration counter is.
self.assertNotIn(core.BlobReference("gpu_1/data"), checkpoint_params)
self.assertIn(core.BlobReference("gpu_1/ITER"), checkpoint_params)
def test_simple_cnnmodel(self):
    """Build a small conv/FC model on GPU and compare its graph def to EXPECTED_CNN."""

    def sorted_nodes(text, separator):
        # Split on the node separator, drop empty parts, re-attach the
        # separator and sort, so node order does not affect the comparison.
        return "\n".join(sorted(
            separator + "\n " + chunk.strip()
            for chunk in text.strip().split(separator)
            if chunk.strip()
        ))

    model = cnn.CNNModelHelper("NCHW", name="overfeat")
    data, label = model.ImageInput(["db"], ["data", "label"], is_test=0)

    with core.NameScope("conv1"):
        conv1 = model.Conv(data, "conv1", 3, 96, 11, stride=4)
        relu1 = model.Relu(conv1, conv1)
        pool1 = model.MaxPool(relu1, "pool1", kernel=2, stride=2)
    with core.NameScope("classifier"):
        fc = model.FC(pool1, "fc", 4096, 1000)
        pred = model.Softmax(fc, "pred")
        xent = model.LabelCrossEntropy([pred, label], "xent")
        loss = model.AveragedLoss(xent, "loss")

    model.net.RunAllOnGPU()
    model.param_init_net.RunAllOnGPU()
    model.AddGradientOperators([loss], skip=1)

    with SummaryWriter(filename_suffix='.test') as writer:
        writer.add_graph(model)

    tracker = {}
    graph = tb.model_to_graph_def(
        model,
        blob_name_tracker=tracker,
        shapes={},
        show_simplified=False,
    )
    self.assertEqual(
        tracker['GRADIENTS/conv1/conv1_b_grad'],
        'conv1/conv1_b_grad',
    )

    self.maxDiff = None
    sep = "node {"
    expected = sorted_nodes(EXPECTED_CNN, sep)
    actual = sorted_nodes(str(graph), sep)
    self.assertMultiLineEqual(actual, expected)
def run_test_copy_gradient(self, device_opt):
    """Check that Copy preserves values and gradients under ``device_opt``.

    Args:
        device_opt: device option used as the scope for building, feeding
            and running the net.
    """
    copy_model = cnn.CNNModelHelper(name="copy_test")
    with core.DeviceScope(device_opt):
        source = copy_model.net.AddExternalInputs("x")
        copied = copy_model.Copy(source, "y")
        loss = copy_model.AveragedLoss(copied, "loss")
        grads = copy_model.AddGradientOperators([loss])

        workspace.FeedBlob(source, np.random.rand(32).astype(np.float32))
        workspace.RunNetOnce(copy_model.param_init_net)
        workspace.RunNetOnce(copy_model.net)

        # Forward values of input and copy must match ...
        same_values = np.array_equal(
            workspace.FetchBlob(source),
            workspace.FetchBlob(copied),
        )
        self.assertTrue(same_values)

        # ... and so must their gradients.
        same_grads = np.array_equal(
            workspace.FetchBlob(grads[source]),
            workspace.FetchBlob(grads[copied]),
        )
        self.assertTrue(same_grads)
def testShapeInferenceConvNet(self):
    """Run the infer-and-compare check on a small conv net fed NHWC data."""
    conv_model = cnn.CNNModelHelper(name="convtest", order="NCHW")

    # The fed data is NHWC, so it is converted before the first convolution.
    conv_model.NHWC2NCHW("data", "data_nchw")
    conv_model.Conv(
        "data_nchw",
        'conv1',
        3,
        64,
        weight_init=("MSRAFill", {}),
        kernel=7,
        stride=2,
        pad=3,
        no_bias=0,
    )
    conv_model.SpatialBN('conv1', 'conv1_spatbn_relu', 64, epsilon=1e-3)
    conv_model.Relu('conv1_spatbn_relu', 'conv1_spatbn_relu')
    conv_model.MaxPool('conv1_spatbn_relu', 'pool1', kernel=3, stride=2)
    conv_model.FC('pool1', 'fc', dim_in=(64 * 56 * 56), dim_out=100)
    conv_model.Sigmoid('fc', 'fc_sigm')
    conv_model.Softmax('fc_sigm', 'softmax')

    nhwc_batch = np.random.rand(16, 227, 227, 3).astype(np.float32)
    workspace.FeedBlob("data", nhwc_batch)

    # Automatic comparison: the helper runs the net and compares the
    # inferred tensor information against the actual blobs.
    self.InferTensorRunAndCompare(conv_model)
def test_resnet_forward_only(self):
    """Check ``optimize_inference_for_dag`` on a forward-only ResNet-50.

    The optimized net must be graph-equal to the original, use fewer blobs,
    have fewer than 7 shared blobs and produce the same final output.
    """
    input_name = "gpu_0/data"
    output_name = "gpu_0/last_out_L1000"

    net_helper = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        input_ref = net_helper.net.AddExternalInput("gpu_0/data")
        resnet.create_resnet50(
            net_helper,
            input_ref,
            num_input_channels=3,
            num_labels=1000,
            is_test=True,
        )

    count_before = count_blobs(net_helper.net.Proto())
    optim_proto = memonger.optimize_inference_for_dag(
        net_helper.net, ["gpu_0/data"], "gpu_0/"
    )
    count_after = count_blobs(optim_proto)
    num_shared_blobs = count_shared_blobs(optim_proto)

    # Run the original net first and remember its output.
    workspace.RunNetOnce(net_helper.param_init_net)
    batch = np.random.rand(4, 3, 227, 227).astype(np.float32)
    workspace.FeedBlob(input_name, batch)
    workspace.RunNetOnce(net_helper.net)

    original_proto = net_helper.net.Proto()
    original_proto.type = 'dag'
    original_proto.num_workers = 4
    loss1 = workspace.FetchBlob(output_name)

    graphs_equal = memonger.verify_graph_equality(
        net_helper.net.Proto(), optim_proto)
    self.assertTrue(graphs_equal)

    # Then run the optimized net on the same workspace and compare.
    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob(output_name)

    self.assertTrue(count_after < count_before)
    self.assertTrue(num_shared_blobs < 7)
    np.testing.assert_almost_equal(loss1, optimized_loss1)
def test_topological_sort_longest_path(self):
    """Compare plain and longest-path topological orders on a four-op net."""
    helper = cnn.CNNModelHelper()

    # Ops are referred to below by the order in which they are added.
    helper.Copy("conv0_w_comp", "conv0_w")                     # op 0
    first_conv = helper.Conv("data", "conv0", 32, 32, 4)       # op 1
    helper.Copy("conv2_w", "conv2_w")                          # op 2
    helper.Conv(first_conv, "conv2", 16, 32, 4)                # op 3

    graph = memonger.compute_interference_graph(helper.net.Proto().op)

    plain_order = memonger.topological_sort_traversal(graph)
    expected_plain_order = [2, 0, 1, 3]
    self.assertEqual(expected_plain_order, plain_order)

    # With the longest-path variant the longer chain comes first.
    longest_path_order = memonger.topological_sort_traversal_longest_path(graph)
    expected_longest_path_order = [0, 1, 2, 3]
    self.assertEqual(expected_longest_path_order, longest_path_order)
def run(comm_rank, comm_size, tmpdir):
    """Parallelize an empty CPU model over a rendezvous and synchronize twice.

    Args:
        comm_rank: shard id of this participant in the rendezvous.
        comm_size: total number of shards.
        tmpdir: path handed to the ``FileStoreHandlerCreate`` operator.
    """

    def _no_input_ops(model):
        pass

    def _no_model_ops(model, loss_scale):
        return []

    def _no_optimizer(model):
        pass

    store_handler = "store_handler"
    create_store_op = core.CreateOperator(
        "FileStoreHandlerCreate", [], [store_handler], path=tmpdir)
    workspace.RunOperatorOnce(create_store_op)

    rendezvous = {
        "kv_handler": store_handler,
        "shard_id": comm_rank,
        "num_shards": comm_size,
        "engine": 'GLOO',
    }

    model = cnn.CNNModelHelper(order="NHWC", name="test")
    data_parallel_model.Parallelize_CPU(
        model,
        input_builder_fun=_no_input_ops,
        forward_pass_builder_fun=_no_model_ops,
        optimizer_builder_fun=_no_optimizer,
        devices=[1, 2, 3],
        rendezvous=rendezvous,
    )
    data_parallel_model.RunInitNet(model)

    # Synchronize is called twice in a row.
    for _round in range(2):
        data_parallel_model.Synchronize(model)
def test_read_from_db(self):
    """Load video clips from an LMDB built off a sample video and check them.

    A list file with 16 copies of one ``<video> 0 <label>`` line is written,
    converted into a video DB, and read back through ``VideoInput``. The
    fetched labels must equal the random label and the data must have shape
    ``[10, 3, 8, 112, 112]``.

    The temporary list file and DB directory are removed even when the test
    fails part-way.
    """
    random_label = np.random.randint(0, 100)
    VIDEO = "/mnt/vol/gfsdataswarm-oregon/users/trandu/sample.avi"

    # Only the path is needed; close the handle right away instead of
    # leaving that to garbage collection.
    with tempfile.NamedTemporaryFile(delete=False) as list_file:
        temp_list = list_file.name
    video_db_dir = None

    try:
        line_str = '{} 0 {}\n'.format(VIDEO, random_label)
        self.create_a_list(temp_list, line_str, 16)

        video_db_dir = tempfile.mkdtemp()
        self.create_video_db(temp_list, video_db_dir)

        model = cnn.CNNModelHelper(name="Video Loader from LMDB")
        reader = model.CreateDB("sample", db=video_db_dir, db_type="lmdb")
        model.VideoInput(
            reader,
            ["data", "label"],
            name="data",
            batch_size=10,
            width=171,
            height=128,
            crop=112,
            length=8,
            sampling_rate=2,
            mirror=1,
            use_local_file=0,
            temporal_jitter=1,
        )

        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(model.net)
        data = workspace.FetchBlob("data")
        label = workspace.FetchBlob("label")

        np.testing.assert_equal(label, random_label)
        np.testing.assert_equal(data.shape, [10, 3, 8, 112, 112])
    finally:
        # Clean up regardless of the outcome so failed runs do not leave
        # temporary files behind.
        try:
            os.remove(temp_list)
        finally:
            if video_db_dir is not None:
                shutil.rmtree(video_db_dir)
def test_gradient_optim(self, input_dim, output_dim, batch_size):
    """Check that ``share_grad_blobs`` cuts blobs but keeps loss and gradient.

    Args:
        input_dim: input width of the first FC layer.
        output_dim: width of every FC output (and number of label classes).
        batch_size: number of rows in the generated data.
    """
    m = cnn.CNNModelHelper()
    with core.NameScope("name_x"):
        fc1 = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
        fc2 = m.FC(fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
        fc3 = m.FC(fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
        fc4 = m.FC(fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
        fc5 = m.FC(fc4, "fc5", dim_in=output_dim, dim_out=output_dim)

        # Relu -> Softmax -> LabelCrossEntropy -> AveragedLoss, spelled out
        # step by step.
        activated = fc5.Relu([], fc5)
        pred = activated.Softmax([], "pred")
        xent = pred.LabelCrossEntropy(["label"], ["xent"])
        xent.AveragedLoss([], "loss")
    input_to_grad = m.AddGradientOperators(["name_x/loss"])

    blobs_before = count_blobs(m.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        m.net,
        ["name_x/loss"],
        set(m.param_to_grad.values()),
        "name_x/",
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    # Both nets have to yield the same loss and the same fc1_w gradient.
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size, )).astype(np.int32)
    grad_blob_name = str(input_to_grad["name_x/fc1_w"])

    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)

    workspace.RunNetOnce(m.net)
    loss = workspace.FetchBlob("name_x/loss")
    grad = workspace.FetchBlob(grad_blob_name)

    workspace.RunNetOnce(optim_proto)
    optimized_loss = workspace.FetchBlob("name_x/loss")
    optimized_grad = workspace.FetchBlob(grad_blob_name)

    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)
def add_model_ops(model):
    """Build a small conv net with loss and gradient ops on a fresh helper.

    NOTE(review): the ``model`` argument is never used -- a new
    ``CNNModelHelper`` is created and all ops are added to it, and nothing
    is returned. Confirm that this is intended.
    """
    conv_model = cnn.CNNModelHelper(name="convtest", order="NCHW")

    conv_model.NHWC2NCHW("data", "data_nchw")
    conv_model.Conv(
        "data_nchw",
        'conv1',
        3,
        64,
        weight_init=("MSRAFill", {}),
        kernel=7,
        stride=2,
        pad=3,
        no_bias=0,
    )
    conv_model.SpatialBN('conv1', 'conv1_spatbn_relu', 64, epsilon=1e-3)
    conv_model.Relu('conv1_spatbn_relu', 'conv1_spatbn_relu')
    conv_model.MaxPool('conv1_spatbn_relu', 'pool1', kernel=3, stride=2)
    conv_model.FC('pool1', 'fc', dim_in=(64 * 56 * 56), dim_out=100)
    conv_model.Sigmoid('fc', 'fc_sigm')
    conv_model.Softmax('fc_sigm', 'softmax')
    conv_model.LabelCrossEntropy(['softmax', 'label'], 'xent')

    averaged_loss = conv_model.AveragedLoss('xent', 'loss')
    conv_model.AddGradientOperators([averaged_loss])
def test_forward_only(
    create_model,
    last_out_blob,
    data_blob='gpu_0/data',
    num_labels=1000,
):
    """Compare a forward-only net against its ``optimize_inference_for_dag`` version.

    Args:
        create_model: callable invoked as ``create_model(model, data,
            num_input_channels=3, num_labels=..., is_test=True)``.
        last_out_blob: name of the blob fetched after each run.
        data_blob: name of the external input blob that receives the data.
        num_labels: forwarded to ``create_model``.

    Returns:
        ``[(count_after, count_before), num_shared_blobs,
        (loss1, optimized_loss1)]``.
    """
    net_helper = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        input_ref = net_helper.net.AddExternalInput(data_blob)
        create_model(
            net_helper,
            input_ref,
            num_input_channels=3,
            num_labels=num_labels,
            is_test=True,
        )

    count_before = count_blobs(net_helper.net.Proto())
    optim_proto = memonger.optimize_inference_for_dag(
        net_helper.net, [data_blob], "gpu_0/")
    count_after = count_blobs(optim_proto)
    num_shared_blobs = count_shared_blobs(optim_proto)

    # Run the original net, then the optimized one, on the same input.
    workspace.RunNetOnce(net_helper.param_init_net)
    batch = np.random.rand(4, 3, 227, 227).astype(np.float32)
    workspace.FeedBlob(data_blob, batch)
    workspace.RunNetOnce(net_helper.net)

    original_proto = net_helper.net.Proto()
    original_proto.type = 'dag'
    original_proto.num_workers = 4
    loss1 = workspace.FetchBlob(last_out_blob)

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob(last_out_blob)

    blob_counts = (count_after, count_before)
    losses = (loss1, optimized_loss1)
    return [blob_counts, num_shared_blobs, losses]
def test_caffe2_simple_cnnmodel(self):
    """Convert a small conv/FC model to a graph def and compare it via ``compare_proto``."""
    model = cnn.CNNModelHelper("NCHW", name="overfeat")
    workspace.FeedBlob("data", np.random.randn(1, 3, 64, 64).astype(np.float32))
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin yields the same dtype.
    workspace.FeedBlob("label", np.random.randn(1, 1000).astype(int))

    with core.NameScope("conv1"):
        conv1 = model.Conv("data", "conv1", 3, 96, 11, stride=4)
        relu1 = model.Relu(conv1, conv1)
        pool1 = model.MaxPool(relu1, "pool1", kernel=2, stride=2)
    with core.NameScope("classifier"):
        fc = model.FC(pool1, "fc", 4096, 1000)
        pred = model.Softmax(fc, "pred")
        xent = model.LabelCrossEntropy([pred, "label"], "xent")
        # The loss op is added for the graph; its handle is not needed here.
        model.AveragedLoss(xent, "loss")

    blob_name_tracker = {}
    graph = c2_graph.model_to_graph_def(
        model,
        blob_name_tracker=blob_name_tracker,
        shapes={},
        show_simplified=False,
    )

    compare_proto(graph, self)
def MLP(order, cudnn_ws):
    """Build a multi-branch MLP benchmark model and return ``(model, 256)``.

    Three parallel chains of twenty 256-wide FC layers start from ``data``,
    are summed, and feed a final 1000-way FC with a cross-entropy loss.
    ``order`` and ``cudnn_ws`` are accepted but not used.
    """
    model = cnn.CNNModelHelper()
    d = 256
    depth = 20
    width = 3

    def layer_name(level, branch):
        return "fc_{}_{}".format(level, branch)

    for level in range(depth):
        for branch in range(width):
            # The first layer of every branch reads the shared input blob.
            source = "data" if level == 0 else layer_name(level, branch)
            target = layer_name(level + 1, branch)
            model.FC(
                source,
                target,
                dim_in=d,
                dim_out=d,
                weight_init=model.XavierInit,
                bias_init=model.XavierInit,
            )

    branch_outputs = [layer_name(depth, branch) for branch in range(width)]
    model.Sum(branch_outputs, ["sum"])
    model.FC(
        "sum",
        "last",
        dim_in=d,
        dim_out=1000,
        weight_init=model.XavierInit,
        bias_init=model.XavierInit,
    )
    xent = model.LabelCrossEntropy(["last", "label"], "xent")
    model.AveragedLoss(xent, "loss")
    return model, d
def testReLUConsistencyWithCPU(self):
    """Check that the MKL-DNN Relu matches the default one and is not slower."""
    mkl_option = core.DeviceOption(caffe2_pb2.MKLDNN)
    inputs = np.random.randn(128, 4096).astype(np.float32)

    # Feeding must work both with the default and with the MKL device option.
    workspace.FeedBlob("X", inputs)
    workspace.FeedBlob("X_mkl", inputs, device_option=mkl_option)

    # One Relu per device option in the same net.
    relu_model = cnn.CNNModelHelper()
    relu_model.Relu("X", "Y")
    relu_model.Relu("X_mkl", "Y_mkl", device_option=mkl_option)
    workspace.CreateNet(relu_model.net)
    workspace.RunNet(relu_model.net)

    # Both outputs have to agree within tight tolerances.
    default_out = workspace.FetchBlob("Y")
    mkl_out = workspace.FetchBlob("Y_mkl")
    np.testing.assert_allclose(default_out, mkl_out, atol=1e-10, rtol=1e-10)

    # BenchmarkNet returns [whole_net, cpu_op, mkl_op]; the MKL op is
    # assumed to be at least as fast as the CPU op.
    runtime = workspace.BenchmarkNet(relu_model.net.Proto().name, 1, 10, True)
    cpu_time = runtime[1]
    mkl_time = runtime[2]
    self.assertTrue(cpu_time >= mkl_time)
    print("CPU runtime {}, MKL runtime {}.".format(cpu_time, mkl_time))
def testNonParallelModel(self):
    """Drive data input workers on a non-parallel model for 500 iterations."""
    batch_size = 32
    workspace.ResetWorkspace()

    model = cnn.CNNModelHelper(name="test")

    # Creating the workers is expected to advance the fetcher id by two.
    seq_id_before = data_workers.global_coordinator._fetcher_id_seq
    coordinator = data_workers.init_data_input_workers(
        model,
        ["data", "label"],
        dummy_fetcher,
        32,
        2,
        input_source_name="unittest",
    )
    seq_id_after = data_workers.global_coordinator._fetcher_id_seq
    self.assertEqual(seq_id_after, seq_id_before + 2)

    coordinator.start()

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    net_name = model.net.Proto().name

    for _iteration in range(500):
        # Each net run is guarded by a 5 second timeout.
        with timeout_guard.CompleteInTimeOrDie(5):
            workspace.RunNet(net_name)

        data = workspace.FetchBlob("data")
        labels = workspace.FetchBlob("label")

        self.assertEqual(data.shape[0], labels.shape[0])
        self.assertEqual(data.shape[0], batch_size)

        # Every row's label must equal its first three data columns.
        for row in range(batch_size):
            for column in (0, 1, 2):
                self.assertEqual(labels[row], data[row, column])

    coordinator.stop_coordinator("unittest")
    self.assertEqual(coordinator._coordinators, [])
def bmuf_process(filestore_dir, process_id, shared_results,
                 cpu_device=False, nesterov=False):
    """Run one participant of a two-process BMUF test and record its blobs.

    Args:
        filestore_dir: path handed to the ``FileStoreHandlerCreate`` operator.
        process_id: shard id; 0 uses devices [0, 1], anything else [2, 3].
        shared_results: mapping that receives this process's result dict
            under the key ``process_id``.
        cpu_device: run on CPU devices instead of GPUs.
        nesterov: forwarded to ``Parallelize_BMUF``.

    Returns early (without storing results) when GPUs are requested but
    unavailable or fewer than four.
    """
    # We need to import caffe2 in every process to initialize CUDA
    # independently.
    from caffe2.python import core, cnn, data_parallel_model, dyndep, workspace
    from caffe2.proto import caffe2_pb2
    dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")

    if not cpu_device:
        if not (workspace.has_gpu_support or workspace.has_hip_support):
            log.info('No GPU support test is Ignored.')
            return
        if workspace.NumGpuDevices() < 4:
            log.info('Not enough GPU support, test IGNORED')
            return

    model = cnn.CNNModelHelper(order="NHWC", name="test")

    if cpu_device:
        device_type = caffe2_pb2.CPU
        device_prefix = "cpu"
    else:
        device_type = workspace.GpuDeviceType
        device_prefix = "gpu"

    devices = [0, 1] if process_id == 0 else [2, 3]

    def _device_pid(device, pid):
        # Process 1 owns devices shifted by two relative to process 0.
        if pid == 1:
            return device + 2
        return device

    def _fetch(blob_suffix, local_device):
        # Fetch ``<prefix>_<device>/<suffix>`` for this process's n-th device.
        blob_name = "{}_{}/{}".format(
            device_prefix, _device_pid(local_device, process_id), blob_suffix)
        return workspace.FetchBlob(blob_name)

    def _model_build_fun(model, loss_scale):
        fc = model.FC(
            "data", "fc", 16, 1, ("ConstantFill", {}), ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)

        # For testing explicit sync
        model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
        return [loss]

    def _input_builder_fun(model):
        return None

    def _param_update_fun(model):
        ITER = model.Iter("ITER")
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=(-0.1),
            policy="fixed",
        )
        ONE = model.param_init_net.ConstantFill(
            [],
            "ONE",
            shape=[1],
            value=1.0,
        )
        for param in model.GetParams():
            grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, grad, LR], param)

    def _generate_data(devices, process_id, device_type, device_prefix):
        np.random.seed(26 + process_id * 10)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for _ in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(devices)

            for (j, g) in enumerate(devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(device_type, g)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(device_prefix, g), data)
                    workspace.FeedBlob(
                        "{}_{}/label".format(device_prefix, g), labels)

    _generate_data(devices, process_id, device_type, device_prefix)

    create_store_op = core.CreateOperator(
        "FileStoreHandlerCreate", [], ["store_handler"], path=filestore_dir)
    workspace.RunOperatorOnce(create_store_op)

    rendezvous = {
        "kv_handler": "store_handler",
        "shard_id": process_id,
        "num_shards": 2,
        "engine": "GLOO",
        "exit_nets": None,
    }

    data_parallel_model.Parallelize_BMUF(
        model,
        _input_builder_fun,
        _model_build_fun,
        _param_update_fun,
        devices=devices,
        rendezvous=rendezvous,
        nesterov=nesterov,
        add_blobs_to_sync=["sync_num"],
        cpu_device=cpu_device,
    )

    data_parallel_model.RunInitNet(model)

    # Right after initialization the fc_w_v blob must be all zeros.
    np.testing.assert_equal(
        _fetch("fc_w_v", 0),
        np.zeros(16).astype(np.float32).reshape(1, 16))

    # Run the algorithm for one iteration to have non-zero params.
    data_parallel_model.RunNet(model, 1)

    # Save iteration momentum and post local update params
    results = {}
    results['v_b_'] = _fetch("fc_b_v", 0)
    results['v_w_'] = _fetch("fc_w_v", 0)

    workspace.RunNetOnce(model.net)

    results['b_0_'] = _fetch("fc_b", 0)
    results['w_0_'] = _fetch("fc_w", 0)
    results['b_1_'] = _fetch("fc_b", 1)
    results['w_1_'] = _fetch("fc_w", 1)

    # Test sync
    if process_id == 0:
        workspace.FeedBlob(
            device_prefix + "_0/sync_num",
            np.array([2603]).astype(np.float32),
            device_option=core.DeviceOption(device_type, 0))

    # Compute block gradients.
    results['b_g_'] = _fetch("fc_b_g", 0)
    results['w_g_'] = _fetch("fc_w_g", 0)

    workspace.RunNetOnce(model._global_model_param_updates_net)

    #  g_b = (b_0_ + b_1_) / 2 - b_g_
    #  g_w = (w_0_ + w_1_) / 2 - w_g_
    results['v_b'] = _fetch("fc_b_v", 0)
    results['v_w'] = _fetch("fc_w_v", 0)
    results['w_g'] = _fetch("fc_w_g", 0)
    results['b_g'] = _fetch("fc_b_g", 0)
    results['w_0'] = _fetch("fc_w", 0)
    results['b_0'] = _fetch("fc_b", 0)
    results['w_1'] = _fetch("fc_w", 1)
    results['b_1'] = _fetch("fc_b", 1)

    # Test add_blobs_to_sync
    for j in devices:
        sync = workspace.FetchBlob(
            device_prefix + "_{}/sync_num".format(j))[0]
        results['sync_{}'.format(j)] = sync

    shared_results[process_id] = results
def VGGA(order, cudnn_ws):
    """Build the 'vgg-a' benchmark model and return ``(model, 231)``.

    Args:
        order: storage order passed to ``CNNModelHelper``.
        cudnn_ws: passed to ``CNNModelHelper`` as ``ws_nbytes_limit``.
    """
    model = cnn.CNNModelHelper(
        order,
        name='vgg-a',
        use_cudnn=True,
        cudnn_exhaustive_search=True,
        ws_nbytes_limit=cudnn_ws,
    )

    def conv_relu(blob_in, name, dim_in, dim_out):
        # 3x3 convolution (pad 1) followed by a Relu written to ``name``.
        conv = model.Conv(
            blob_in, name, dim_in, dim_out, 3,
            ('XavierFill', {}), ('ConstantFill', {}), pad=1)
        return model.Relu(conv, name)

    def max_pool(blob_in, name):
        return model.MaxPool(blob_in, name, kernel=2, stride=2)

    def fully_connected(blob_in, name, dim_in, dim_out):
        return model.FC(
            blob_in, name, dim_in, dim_out,
            ('XavierFill', {}), ('ConstantFill', {}))

    # Convolutional trunk.
    relu1 = conv_relu("data", "conv1", 3, 64)
    pool1 = max_pool(relu1, "pool1")
    relu2 = conv_relu(pool1, "conv2", 64, 128)
    pool2 = max_pool(relu2, "pool2")
    relu3 = conv_relu(pool2, "conv3", 128, 256)
    relu4 = conv_relu(relu3, "conv4", 256, 256)
    pool4 = max_pool(relu4, "pool4")
    relu5 = conv_relu(pool4, "conv5", 256, 512)
    relu6 = conv_relu(relu5, "conv6", 512, 512)
    pool6 = max_pool(relu6, "pool6")
    relu7 = conv_relu(pool6, "conv7", 512, 512)
    relu8 = conv_relu(relu7, "conv8", 512, 512)
    pool8 = max_pool(relu8, "pool8")

    # Classifier head; the last FC has no Relu.
    fcix = fully_connected(pool8, "fcix", 512 * 7 * 7, 4096)
    reluix = model.Relu(fcix, "fcix")
    fcx = fully_connected(reluix, "fcx", 4096, 4096)
    relux = model.Relu(fcx, "fcx")
    fcxi = fully_connected(relux, "fcxi", 4096, 1000)

    pred = model.Softmax(fcxi, "pred")
    xent = model.LabelCrossEntropy([pred, "label"], "xent")
    model.AveragedLoss(xent, "loss")

    # NOTE(review): 231 is presumably the expected input image size --
    # confirm against the caller.
    return model, 231
def create_model(args, queue, label_queue, input_shape):
    """Build the LSTM benchmark model and return ``(model, output)``.

    Args:
        args: options object; reads ``implementation`` ("own" or "cudnn"),
            ``input_dim``, ``hidden_dim``, ``batch_size``, ``forward_only``
            and, for "own", ``memory_optimization``.
        queue: queue the input data is dequeued from.
        label_queue: queue the labels are dequeued from.
        input_shape: shape of the placeholder input (used for "cudnn" only).
    """
    model = cnn.CNNModelHelper(name="LSTM_bench")
    seq_lengths, hidden_init, cell_init, target = model.net.AddExternalInputs(
        'seq_lengths',
        'hidden_init',
        'cell_init',
        'target',
    )

    input_blob = model.DequeueBlobs(queue, "input_data")
    labels = model.DequeueBlobs(label_queue, "label")

    implementation = args.implementation
    if implementation == "own":
        output, last_hidden, _, last_state = rnn_cell.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=seq_lengths,
            initial_states=(hidden_init, cell_init),
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="lstm1",
            memory_optimization=args.memory_optimization,
            forward_only=args.forward_only,
            drop_states=True,
        )
    elif implementation == "cudnn":
        # A placeholder input has to be fed so that RecurrentInitOp can
        # infer the dimensions.
        model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
        output, last_hidden, _ = rnn_cell.cudnn_LSTM(
            model=model,
            input_blob=input_blob,
            initial_states=(hidden_init, cell_init),
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="cudnnlstm",
            num_layers=1,
        )
    else:
        assert False, "Unknown implementation"

    weights = model.UniformFill(labels, "weights")
    flat_output = model.Flatten(output)
    softmax, loss = model.SoftmaxWithLoss(
        [flat_output, labels, weights],
        ['softmax', 'loss'],
    )

    if not args.forward_only:
        model.AddGradientOperators([loss])

    # Carry states over to the next run.
    # NOTE(review): ``last_hidden`` is copied into both initial-state blobs,
    # including ``cell_init`` -- confirm this is intended.
    initial_states = (hidden_init, cell_init)
    for state_blob in initial_states:
        model.net.Copy(last_hidden, state_blob)

    # Both initial states start out as zeros.
    state_shape = [1, args.batch_size, args.hidden_dim]
    for state_blob in initial_states:
        workspace.FeedBlob(state_blob, np.zeros(state_shape, dtype=np.float32))

    return model, output
def run_model(self, V, gpu_devices, cpu_indices):
    """Build, parallelize and run a small sparse-gather model for 10 steps.

    Args:
        V: Number of rows in the ``vecs`` table (16 columns per row). Each
            step draws ``64 * 16`` distinct row indices, so V must be at
            least 1024 for the reshape to succeed.
        gpu_devices: GPU ids handed to ``Parallelize_GPU``; each device gets
            ``64 // len(gpu_devices)`` rows of every batch.
        cpu_indices: If true, rows are gathered on CPU from ``self.vecs`` and
            copied to the GPU, and ``vecs`` is updated on CPU with
            ``ScatterWeightedSum``. Otherwise ``vecs`` is copied to each GPU
            as "gpuvecs", gathered there, and copied back from gpu_0.

    Returns:
        A two-element list: the fetched vectors blob (``self.vecs`` when
        ``cpu_indices`` else "gpu_0/gpuvecs") and the fetched "gpu_0/fc_w".
    """

    def input_builder_fun(model):
        # Inputs are fed directly into the workspace in the loop below.
        return None

    def model_build_fun(model, loss_scale):
        if cpu_indices:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                gathered_cpu = model.net.Gather(
                    [self.vecs, 'indices'], 'gathered_cpu')

            gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")
        else:
            gpu_vecs = model.param_init_net.CopyCPUToGPU(
                self.vecs, "gpuvecs",
            )
            model.params.append(gpu_vecs)
            gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
        flattened = model.Flatten(gathered, "flattened")
        fc = model.FC(flattened, "fc", 16 * 16, 1,
                      ("ConstantFill", {}), ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)
        return [loss]

    def param_update_fun(model):
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        LR = model.CopyCPUToGPU(self.LR, "LR")
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            if not isinstance(param_grad, core.GradientSlice):
                # Dense gradient: param = param + LR * grad
                # (base_lr is negative).
                model.WeightedSum([param, ONE, param_grad, LR], param)
            else:
                # Sparse gradient: momentum SGD on the touched rows only.
                param_momentum = model.param_init_net.ConstantFill(
                    [param],
                    '{}_momentum'.format(param),
                    value=0.0,
                )
                model.net.SparseMomentumSGDUpdate(
                    [
                        param_grad.values,
                        param_momentum,
                        LR,
                        param,
                        param_grad.indices,
                    ],
                    [param_grad.values, param_momentum, param],
                    momentum=0.1,
                    nesterov=0,
                )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="sparse_test{}".format(gpu_devices),
    )

    with core.NameScope("cpu"):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.ITER = model.Iter("ITER")
            self.LR = model.net.LearningRate(
                [self.ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            self.vecs = model.param_init_net.UniformFill(
                [], "vecs", shape=[V, 16])
            if cpu_indices:
                model.params.append(self.vecs)
            self.ONE_CPU = model.param_init_net.ConstantFill(
                [], "ONE_CPU", shape=[1], value=1.0,
            )

    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices,
    )

    # Update the vecs
    if cpu_indices:
        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                for param in model.GetParams():
                    param_grad = model.param_to_grad[param]
                    model.ScatterWeightedSum(
                        [
                            param,
                            self.ONE_CPU,
                            param_grad.indices,
                            param_grad.values,
                            self.LR
                        ],
                        self.vecs,
                    )
    else:
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)

    np.random.seed(2603)

    # Each run has same input, independent of number of gpus
    batch_size = 64
    for i in range(0, 10):
        full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
            batch_size, 16)
        full_labels = full_indices[:, 0] % 2
        batch_per_device = batch_size // len(gpu_devices)

        for (j, g) in enumerate(gpu_devices):
            st = j * batch_per_device
            en = st + batch_per_device
            indices = full_indices[st:en, :].astype(np.int32)
            labels = full_labels[st:en].astype(np.float32)

            device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
            if not cpu_indices:
                device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)

            with core.DeviceScope(device_for_indices):
                workspace.FeedBlob("gpu_{}/indices".format(g), indices)

            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                workspace.FeedBlob("gpu_{}/label".format(g), labels)

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            # Force vecs to be same on all runs
            orig_vecs = np.random.rand(V, 16).astype(np.float32)
            workspace.FeedBlob(self.vecs, orig_vecs)
            if not cpu_indices:
                for g in gpu_devices:
                    workspace.FeedBlob(
                        "gpu_{}/gpuvecs".format(g),
                        orig_vecs,
                        device_option=core.DeviceOption(
                            caffe2_pb2.CUDA, g),
                    )
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)
        if len(gpu_devices) == 2:
            # Debug dump of the net; the context manager closes the handle
            # on every iteration instead of leaking it.
            with open("dump.txt", "w") as dump_file:
                dump_file.write(str(model.net.Proto()))
            if not cpu_indices:
                idx = workspace.FetchBlob("gpu_0/indices")
                idx = list(idx.flatten())
                n = len(idx)
                nu = len(set(idx))
                # assertEqual (unlike a bare assert) survives ``python -O``.
                self.assertEqual(
                    n, nu, "We cannot have duplicate indices")

    # Sanity check to see the vecs were updated
    self.assertFalse(
        np.allclose(workspace.FetchBlob(self.vecs), orig_vecs))
    return [
        workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"),
        workspace.FetchBlob("gpu_0/fc_w")
    ]