def test_forward_optim_tree_enforce_inplace_op_invalid(self):
    """The DAG optimizer must raise when an enforced-inplace op is built
    with distinct input/output blobs (IndexFreeze "A" -> "B" here)."""
    helper = model_helper.ModelHelper()
    proto = helper.Proto()
    proto.type = "dag"
    proto.num_workers = 4

    net = helper.net
    net.IndexFreeze("A", "B")  # enforce inplace op, but output != input
    net.Sum(["B", "B"], "C")
    net.Relu("C", "D")
    net.Sum(["D", "D"], "E")

    with self.assertRaises(RuntimeError):
        memonger.optimize_inference_for_dag(net, ["A"], "")
def test_forward_optim_tree_dag_traversal(self):
    """Build a branchy FC net and optimize it for inference, passing a
    non-root blob (name_x/fc5_w) among the heads; the optimizer must still
    traverse from root ops and reduce the total blob count.

    Fix: removed the unused local ``batch_size`` — this test never feeds
    data, so the constant was dead.
    """
    input_dim = 4
    output_dim = 4

    m = model_helper.ModelHelper()
    m.Proto().type = "dag"
    m.Proto().num_workers = 4

    with core.NameScope("name_x"):
        fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
        fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
        fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
        fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
        fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)

        # Branch off fc2
        fc3b = brew.fc(m, fc2, "fc3b", dim_in=output_dim, dim_out=output_dim)
        fc4b = brew.fc(m, fc3b, "fc4b", dim_in=output_dim, dim_out=output_dim)
        fc5b = brew.fc(m, fc4b, "fc5b", dim_in=output_dim, dim_out=output_dim)

        fc5sum = brew.sum(m, [fc5, fc5b], "fc5sum")
        fc5.Relu([], fc5sum) \
           .Softmax([], "pred1") \
           .LabelCrossEntropy(["label"], ["xent1"]) \
           .AveragedLoss([], "loss1")

        fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
        fc6.Relu([], fc6) \
           .Softmax([], "pred2") \
           .LabelCrossEntropy(["label"], ["xent2"]) \
           .AveragedLoss([], "loss2")

    blobs_before = count_blobs(m.net.Proto())
    # adding name_x/fc5_w as heads (which belongs to non-root op)
    # to make sure that dag traversal always starts from root ops
    optim_proto = memonger.optimize_inference_for_dag(
        m.net, ["name_x/fc5_w", "name_x/data"], "name_x")
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)
def test_forward_optim_tree_harder(self, input_dim, output_dim, batch_size):
    """Optimize a branchy FC tree (CNNModelHelper flavor) for inference:
    the optimized net must use fewer blobs and produce numerically
    identical losses to the unoptimized net.

    Fix: removed a stray debug ``print(str(optim_proto))`` that spammed
    the full proto to stdout on every test run.
    """
    m = cnn.CNNModelHelper()
    m.net.Proto().type = "dag"
    m.net.Proto().num_workers = 4
    m.net.AddExternalInput("label")
    m.net.AddExternalInput("data")

    with core.NameScope("name_x"):
        fc1 = m.FC("data", "fc1", dim_in=input_dim, dim_out=output_dim)
        fc2 = m.FC(fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
        fc3 = m.FC(fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
        fc4 = m.FC(fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
        fc5 = m.FC(fc4, "fc5", dim_in=output_dim, dim_out=output_dim)

        # Branch off fc2
        fc3b = m.FC(fc2, "fc3b", dim_in=output_dim, dim_out=output_dim)
        fc4b = m.FC(fc3b, "fc4b", dim_in=output_dim, dim_out=output_dim)
        fc5b = m.FC(fc4b, "fc5b", dim_in=output_dim, dim_out=output_dim)

        fc5sum = m.Sum([fc5, fc5b], "fc5sum")
        fc5sum.Relu([], "relu1") \
           .Softmax([], "pred1") \
           .LabelCrossEntropy(["label"], ["xent1"]) \
           .AveragedLoss([], "loss1")

        fc6 = m.FC(fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
        fc6.Relu([], fc6) \
           .Softmax([], "pred2") \
           .LabelCrossEntropy(["label"], ["xent2"]) \
           .AveragedLoss([], "loss2")

    blobs_before = count_blobs(m.net.Proto())
    optim_proto = memonger.optimize_inference_for_dag(
        m.net, ["name_x/data"], "name_x/")
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    # Test networks produce exactly same results
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)

    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)
    workspace.RunNetOnce(m.net)
    loss1 = workspace.FetchBlob("name_x/loss1")
    loss2 = workspace.FetchBlob("name_x/loss2")

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("name_x/loss1")
    optimized_loss2 = workspace.FetchBlob("name_x/loss2")

    np.testing.assert_almost_equal(loss1, optimized_loss1)
    np.testing.assert_almost_equal(loss2, optimized_loss2)
def test_forward_optim_tree_daggy(self, input_dim, output_dim, batch_size):
    """Same branchy FC tree via brew: after inference optimization the
    net must use fewer blobs while both losses match the original net."""
    model = model_helper.ModelHelper()
    model.Proto().type = "dag"
    model.Proto().num_workers = 4

    with core.NameScope("name_x"):
        fc1 = brew.fc(model, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
        fc2 = brew.fc(model, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
        fc3 = brew.fc(model, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
        fc4 = brew.fc(model, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
        fc5 = brew.fc(model, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)

        # Branch off fc2
        fc3b = brew.fc(model, fc2, "fc3b", dim_in=output_dim, dim_out=output_dim)
        fc4b = brew.fc(model, fc3b, "fc4b", dim_in=output_dim, dim_out=output_dim)
        fc5b = brew.fc(model, fc4b, "fc5b", dim_in=output_dim, dim_out=output_dim)

        fc5sum = brew.sum(model, [fc5, fc5b], "fc5sum")
        fc5.Relu([], fc5sum) \
           .Softmax([], "pred1") \
           .LabelCrossEntropy(["label"], ["xent1"]) \
           .AveragedLoss([], "loss1")

        fc6 = brew.fc(model, fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
        fc6.Relu([], fc6) \
           .Softmax([], "pred2") \
           .LabelCrossEntropy(["label"], ["xent2"]) \
           .AveragedLoss([], "loss2")

    blobs_before = count_blobs(model.net.Proto())
    optim_proto = memonger.optimize_inference_for_dag(
        model.net, ["name_x/data"], "name_x"
    )
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    # Test networks produce exactly same results
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)

    workspace.RunNetOnce(model.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)
    workspace.RunNetOnce(model.net)
    loss1 = workspace.FetchBlob("name_x/loss1")
    loss2 = workspace.FetchBlob("name_x/loss2")

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("name_x/loss1")
    optimized_loss2 = workspace.FetchBlob("name_x/loss2")

    np.testing.assert_almost_equal(loss1, optimized_loss1)
    np.testing.assert_almost_equal(loss2, optimized_loss2)
def test_forward_optim_tree_enforce_inplace_op_valid_and_as_head(self):
    """A properly inplace enforced op (IndexFreeze "A" -> "A") whose blob
    is also a head must not block optimization; blob count should drop."""
    helper = model_helper.ModelHelper()
    proto = helper.Proto()
    proto.type = "dag"
    proto.num_workers = 4

    net = helper.net
    net.IndexFreeze("A", "A")  # enforce inplace op, input == output
    net.Sum(["A", "A"], "B")
    net.Relu("B", "C")
    net.Relu("C", "D")
    net.Sum(["D", "D"], "E")

    blobs_before = count_blobs(helper.net.Proto())
    optim_proto = memonger.optimize_inference_for_dag(net, ["A"], "")
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)
def test_resnet_forward_only(self):
    """Inference-optimize ResNet-50: the optimized proto must be
    graph-equal to the original, use fewer blobs (and < 7 shared ones),
    and produce the same final output.

    NOTE(review): another test with this exact name appears in this file;
    if both live in the same TestCase, the later definition shadows this
    one — confirm and rename if so.
    """
    model = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        data = model.net.AddExternalInput("gpu_0/data")
        resnet.create_resnet50(
            model,
            data,
            num_input_channels=3,
            num_labels=1000,
            is_test=True
        )

    count_before = count_blobs(model.net.Proto())
    optim_proto = memonger.optimize_inference_for_dag(
        model.net, ["gpu_0/data"], "gpu_0/"
    )
    count_after = count_blobs(optim_proto)
    num_shared_blobs = count_shared_blobs(optim_proto)

    # Run model and compare results
    workspace.RunNetOnce(model.param_init_net)
    batch = np.random.rand(4, 3, 227, 227).astype(np.float32)
    workspace.FeedBlob("gpu_0/data", batch)
    workspace.RunNetOnce(model.net)

    model.net.Proto().type = 'dag'
    model.net.Proto().num_workers = 4
    loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")

    self.assertTrue(memonger.verify_graph_equality(
        model.net.Proto(), optim_proto))
    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")

    self.assertTrue(count_after < count_before)
    self.assertTrue(num_shared_blobs < 7)
    np.testing.assert_almost_equal(loss1, optimized_loss1)
def test_forward_only(
    create_model,
    last_out_blob,
    data_blob='gpu_0/data',
    num_labels=1000,
):
    """Shared driver: build a model with ``create_model``, optimize it for
    inference, run both nets once, and hand back the raw measurements.

    Returns a 3-element list:
      [0] (count_after, count_before) — blob counts post/pre optimization
      [1] num_shared_blobs — a bare int (parens in the original are not a
          tuple); kept as-is since callers unpack this shape
      [2] (loss1, optimized_loss1) — outputs of original vs optimized net
    """
    model = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        data = model.net.AddExternalInput(data_blob)
        create_model(model, data, num_input_channels=3,
                     num_labels=num_labels, is_test=True)

    count_before = count_blobs(model.net.Proto())
    optim_proto = memonger.optimize_inference_for_dag(
        model.net, [data_blob], "gpu_0/")
    count_after = count_blobs(optim_proto)
    num_shared_blobs = count_shared_blobs(optim_proto)

    # Run model and compare results
    workspace.RunNetOnce(model.param_init_net)
    batch = np.random.rand(4, 3, 227, 227).astype(np.float32)
    workspace.FeedBlob(data_blob, batch)
    workspace.RunNetOnce(model.net)

    model.net.Proto().type = 'dag'
    model.net.Proto().num_workers = 4
    loss1 = workspace.FetchBlob(last_out_blob)

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob(last_out_blob)
    return [(count_after, count_before), (num_shared_blobs),
            (loss1, optimized_loss1)]
def test_resnet_forward_only(self):
    """Inference-optimize ResNet-50 and check: fewer blobs, a nonzero
    shared-blob count below 7, and identical final output.

    NOTE(review): this file contains another test with the same name
    (the variant that also calls ``verify_graph_equality``); if both are
    methods of one TestCase, only the later definition runs — confirm.
    """
    model = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        data = model.net.AddExternalInput("gpu_0/data")
        resnet.create_resnet50(
            model,
            data,
            num_input_channels=3,
            num_labels=1000,
            is_test=True
        )

    count_before = count_blobs(model.net.Proto())
    optim_proto = memonger.optimize_inference_for_dag(
        model.net, ["gpu_0/data"], "gpu_0/"
    )
    count_after = count_blobs(optim_proto)
    num_shared_blobs = count_shared_blobs(optim_proto)

    # Run model and compare results
    workspace.RunNetOnce(model.param_init_net)
    batch = np.random.rand(4, 3, 227, 227).astype(np.float32)
    workspace.FeedBlob("gpu_0/data", batch)
    workspace.RunNetOnce(model.net)

    model.net.Proto().type = 'dag'
    model.net.Proto().num_workers = 4
    loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")

    self.assertTrue(count_after < count_before)
    self.assertTrue(num_shared_blobs < 7 and num_shared_blobs > 0)
    np.testing.assert_almost_equal(loss1, optimized_loss1)
def optimize_inference_memory(model):
    """Apply memonger's inference DAG optimization to ``model.net`` once
    per GPU, rewriting the underlying net proto in place for each
    ``gpu_N/`` namescope (heads are the per-GPU teacher data blobs)."""
    for gpu_id in range(cfg.NUM_GPUS):
        scope = 'gpu_{}/'.format(gpu_id)
        model.net._net = memonger.optimize_inference_for_dag(
            model.net, ["{}teacher/data".format(scope)], scope)