def test_verify_graph_inequality(self, input_dim, output_dim, batch_size):
    m = model_helper.ModelHelper()
    m.Proto().type = "dag"
    m.Proto().num_workers = 4
    with core.NameScope("name_x"):
        fc1 = brew.fc(m, "data", "x", dim_in=input_dim, dim_out=output_dim)
        fc2 = brew.fc(m, fc1, "y", dim_in=output_dim, dim_out=output_dim)
        fc3 = brew.fc(m, fc1, "z", dim_in=output_dim, dim_out=output_dim)
        brew.sum(m, [fc2, fc3], "out")

    m2 = model_helper.ModelHelper()
    m2.Proto().type = "dag"
    m2.Proto().num_workers = 4
    with core.NameScope("name_x"):
        fc1 = brew.fc(m2, "data", "x", dim_in=input_dim, dim_out=output_dim)
        fc2 = brew.fc(m2, fc1, "y", dim_in=output_dim, dim_out=output_dim)
        # Unlike m, both branches write to "y", so the two graphs must not
        # be considered equal
        fc3 = brew.fc(m2, fc1, "y", dim_in=output_dim, dim_out=output_dim)
        brew.sum(m2, [fc2, fc3], "out")

    self.assertFalse(
        memonger.verify_graph_equality(m.net.Proto(), m2.net.Proto()))
def test_resnet_forward_only(self):
    model = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        data = model.net.AddExternalInput("gpu_0/data")
        resnet.create_resnet50(
            model,
            data,
            num_input_channels=3,
            num_labels=1000,
            is_test=True,
        )

    count_before = count_blobs(model.net.Proto())
    optim_proto = memonger.optimize_inference_for_dag(
        model.net, ["gpu_0/data"], "gpu_0/"
    )
    count_after = count_blobs(optim_proto)
    num_shared_blobs = count_shared_blobs(optim_proto)

    # Run model and compare results
    workspace.RunNetOnce(model.param_init_net)
    data = np.random.rand(4, 3, 227, 227).astype(np.float32)
    workspace.FeedBlob("gpu_0/data", data)

    workspace.RunNetOnce(model.net)
    model.net.Proto().type = 'dag'
    model.net.Proto().num_workers = 4
    loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")

    self.assertTrue(memonger.verify_graph_equality(
        model.net.Proto(), optim_proto))

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
    self.assertLess(count_after, count_before)
    self.assertLess(num_shared_blobs, 7)
    np.testing.assert_almost_equal(loss1, optimized_loss1)
def test_gradient_optim(self, input_dim, output_dim, batch_size):
    m = model_helper.ModelHelper()
    with core.NameScope("name_x"):
        fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
        fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
        fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
        fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
        fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
        fc5.Relu([], fc5) \
            .Softmax([], "pred") \
            .LabelCrossEntropy(["label"], ["xent"]) \
            .AveragedLoss([], "loss")
    input_to_grad = m.AddGradientOperators(["name_x/loss"])

    blobs_before = count_blobs(m.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        m.net,
        ["name_x/loss"],
        set(m.param_to_grad.values()),
        "name_x/",
        share_activations=False,
    )
    self.assertTrue(memonger.verify_graph_equality(m.Proto(), optim_proto))
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    optim_proto_wacts = memonger.share_grad_blobs(
        m.net,
        ["name_x/loss"],
        set(m.param_to_grad.values()),
        "name_x/",
        share_activations=True,
        dont_share_blobs=set([str(input_to_grad["name_x/fc1_w"])]),
    )
    self.assertTrue(
        memonger.verify_graph_equality(m.Proto(), optim_proto_wacts))
    blobs_wact_optim = count_blobs(optim_proto_wacts)
    self.assertLessEqual(blobs_wact_optim, blobs_after)

    # Check that the last activations are not shared
    self.assertTrue(has_blob(optim_proto, "name_x/fc5"))
    self.assertTrue(
        has_blob(optim_proto_wacts, "name_x/fc5"),
        "Don't remap final activation",
    )

    # Test networks produce exactly same gradients
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)
    workspace.RunNetOnce(m.net)
    loss = workspace.FetchBlob("name_x/loss")
    grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
    workspace.RunNetOnce(optim_proto)
    optimized_loss = workspace.FetchBlob("name_x/loss")
    optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)

    workspace.FeedBlob(str(input_to_grad["name_x/fc1_w"]), np.array([0.0]))

    # Run with the forward optimization
    workspace.RunNetOnce(optim_proto_wacts)
    optimized_loss = workspace.FetchBlob("name_x/loss")
    optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
    np.testing.assert_almost_equal(loss, optimized_loss)
    np.testing.assert_almost_equal(grad, optimized_grad)
def test_forward_optim_tree_harder(self, input_dim, output_dim, batch_size):
    m = model_helper.ModelHelper()
    m.net.Proto().type = "dag"
    m.net.Proto().num_workers = 4
    m.net.AddExternalInput("label")
    m.net.AddExternalInput("data")

    with core.NameScope("name_x"):
        fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
        fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
        fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
        fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
        fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)

        # Branch
        fc3b = brew.fc(m, fc2, "fc3b", dim_in=output_dim, dim_out=output_dim)
        fc4b = brew.fc(m, fc3b, "fc4b", dim_in=output_dim, dim_out=output_dim)
        fc5b = brew.fc(m, fc4b, "fc5b", dim_in=output_dim, dim_out=output_dim)

        fc5sum = brew.sum(m, [fc5, fc5b], "fc5sum")
        fc5sum.Relu([], "relu1") \
            .Softmax([], "pred1") \
            .LabelCrossEntropy(["label"], ["xent1"]) \
            .AveragedLoss([], "loss1")

        fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
        fc6.Relu([], fc6) \
            .Softmax([], "pred2") \
            .LabelCrossEntropy(["label"], ["xent2"]) \
            .AveragedLoss([], "loss2")

    blobs_before = count_blobs(m.net.Proto())
    optim_proto = memonger.optimize_inference_for_dag(
        m.net, ["name_x/data"], "name_x/")
    self.assertTrue(
        memonger.verify_graph_equality(m.net.Proto(), optim_proto))
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)

    # Test networks produce exactly same results
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)
    workspace.RunNetOnce(m.net)
    loss1 = workspace.FetchBlob("name_x/loss1")
    loss2 = workspace.FetchBlob("name_x/loss2")
    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("name_x/loss1")
    optimized_loss2 = workspace.FetchBlob("name_x/loss2")
    np.testing.assert_almost_equal(loss1, optimized_loss1)
    np.testing.assert_almost_equal(loss2, optimized_loss2)
def test_gradient_optim_tree(self, input_dim, output_dim, batch_size):
    m = model_helper.ModelHelper()
    with core.NameScope("name_x"):
        fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim)
        fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim)
        fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim)
        fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim)
        fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim)
        fc5.Relu([], fc5) \
            .Softmax([], "pred1") \
            .LabelCrossEntropy(["label"], ["xent1"]) \
            .AveragedLoss([], "loss1")
        fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim)
        fc6.Relu([], fc6) \
            .Softmax([], "pred2") \
            .LabelCrossEntropy(["label"], ["xent2"]) \
            .AveragedLoss([], "loss2")

    input_to_grad = m.AddGradientOperators(["name_x/loss1", "name_x/loss2"])

    blobs_before = count_blobs(m.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        m.net,
        ["name_x/loss1", "name_x/loss2"],
        set(m.param_to_grad.values()),
        "name_x",  # "name_x//shared_gradinp_0_shared" if using "name_x/"
        share_activations=True,
        dont_share_blobs=set([
            'name_x/fc6',
            'name_x/fc5',
            str(input_to_grad["name_x/fc1_w"]),
        ]),
    )
    self.assertTrue(
        memonger.verify_graph_equality(m.net.Proto(), optim_proto))
    blobs_after = count_blobs(optim_proto)
    self.assertLess(blobs_after, blobs_before)
    self.assertTrue(has_blob(optim_proto, "name_x/fc6"))

    # Test networks produce exactly same gradients
    data = np.random.randn(batch_size, input_dim).astype(np.float32)
    label = np.random.randint(
        low=0, high=output_dim, size=(batch_size,)).astype(np.int32)
    workspace.RunNetOnce(m.param_init_net)
    workspace.FeedBlob("name_x/data", data)
    workspace.FeedBlob("name_x/label", label)
    workspace.RunNetOnce(m.net)
    loss1 = workspace.FetchBlob("name_x/loss1")
    loss2 = workspace.FetchBlob("name_x/loss2")
    grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
    workspace.FeedBlob(str(input_to_grad["name_x/fc1_w"]), np.array([0.0]))
    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("name_x/loss1")
    optimized_loss2 = workspace.FetchBlob("name_x/loss2")
    optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"]))
    np.testing.assert_almost_equal(loss1, optimized_loss1)
    np.testing.assert_almost_equal(loss2, optimized_loss2)
    np.testing.assert_almost_equal(grad, optimized_grad)
def share_freeze_blobs(
    net,
    namescope,
):
    log.warning("NOTE: Executing memonger to optimize activation memory")

    # Normalize the namescope so blob-name matching below is consistent
    if namescope != "" and not namescope.endswith("/"):
        namescope += "/"

    new_net = copy.deepcopy(net)
    activations = []
    external_output = set(new_net.Proto().external_output)

    # Find the op span between the first Conv and the StopGradient; only
    # activations produced inside this span are eligible for sharing
    start_idx = -1
    end_idx = -1
    for idx, op in enumerate(new_net._net.op):
        if namescope not in op.input[0]:
            continue
        if op.type == 'Conv' and start_idx < 0:
            start_idx = idx
        if op.type == 'StopGradient':
            end_idx = idx

    # Hacky way to get activations, think of a better way
    for op in new_net._net.op[start_idx:end_idx]:
        if namescope not in op.input[0]:
            continue
        for b in op.output:
            if b not in external_output:
                activations.append(b)

    used_activations = []
    for a in activations:
        if a in used_activations:
            continue
        share_pool = [
            namescope + '_shared_' + str(i) for i in range(1000, 10000)
        ]

        # Locate the first op that touches this activation
        first_idx = -1
        for idx, op in enumerate(new_net._net.op):
            if namescope not in op.input[0]:
                continue
            if a in list(op.input) + list(op.output):
                first_idx = idx
                break
        assert first_idx >= 0, first_idx

        # Remove from the pool any shared blob already in use from that
        # point onwards, so we never alias two live blobs
        for op in new_net._net.op[first_idx:]:
            if namescope not in op.input[0]:
                continue
            for b in list(op.input) + list(op.output):
                if b in share_pool:
                    share_pool.remove(b)

        # Rewrite every occurrence of the activation to the shared blob
        for idx, op in enumerate(new_net._net.op):
            if namescope not in op.input[0]:
                continue
            op_input = copy.deepcopy(op.input)
            is_found = False
            for i, b in enumerate(op_input):
                if a == b:
                    op_input[i] = share_pool[-1]
                    is_found = True
            if is_found:
                del new_net._net.op[idx].input[:]
                new_net._net.op[idx].input.extend(op_input)

            op_output = copy.deepcopy(op.output)
            is_found = False
            for i, b in enumerate(op_output):
                if a == b:
                    op_output[i] = share_pool[-1]
                    is_found = True
            if is_found:
                del new_net._net.op[idx].output[:]
                new_net._net.op[idx].output.extend(op_output)
        used_activations.append(a)

    assert verify_graph_equality(net.Proto(), new_net.Proto()), \
        "Memonger graph is not equal to original."
    assert verify_inplace_blobs(net.Proto(), new_net.Proto()), \
        "Inplace assignments differ in memonger net."

    # Record the last op index at which each shared blob is used
    share_pool = [namescope + '_shared_' + str(i) for i in range(1000, 10000)]
    share_pool_used = {}
    for idx, op in enumerate(new_net._net.op):
        if namescope not in op.input[0]:
            continue
        for b in list(op.input) + list(op.output):
            if b in share_pool:
                share_pool_used[b] = idx

    # Shared blobs still referenced after the frozen section must stay alive
    for op in new_net._net.op[end_idx:]:
        if namescope not in op.input[0]:
            continue
        for b in list(op.input) + list(op.output):
            if b in share_pool_used.keys():
                share_pool_used.pop(b)

    # Insert a Free op right after the last use of each shared blob. Insert
    # in descending index order so earlier insertions do not shift the
    # positions of later ones.
    ops = list(new_net._net.op)
    for inp, idx in sorted(share_pool_used.items(), key=lambda kv: -kv[1]):
        ops.insert(idx + 1, core.CreateOperator("Free", [inp], [inp]))
    del new_net._net.op[:]
    new_net._net.op.extend(ops)
    return new_net.Proto()
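# A minimal usage sketch for share_freeze_blobs, assuming a net whose frozen
# section starts at a Conv op and ends at a StopGradient op (the two markers
# the function scans for). The model, blob names, and dimensions below are
# illustrative assumptions, not part of the test suite.
def _example_share_freeze_blobs():
    from caffe2.python import brew, core, model_helper

    m = model_helper.ModelHelper(name="freeze_example")
    with core.NameScope("name_x"):
        conv1 = brew.conv(m, "data", "conv1", dim_in=3, dim_out=8, kernel=3)
        conv2 = brew.conv(m, conv1, "conv2", dim_in=8, dim_out=8, kernel=3)
        m.net.StopGradient(conv2, conv2)
    # Activations between the first Conv and the StopGradient are remapped
    # into the "name_x/_shared_*" pool and freed after their last use
    return share_freeze_blobs(m.net, "name_x")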
def share_freeze_blobs_c2(
    net,
    namescope,
):
    log.warning("NOTE: Executing memonger to optimize activation memory")

    # Normalize the namescope so blob-name matching below is consistent
    if namescope != "" and not namescope.endswith("/"):
        namescope += "/"

    netproto = copy.deepcopy(net.Proto())
    activations = []
    external_output = set(net.Proto().external_output)

    # Find the op span between the first Conv and the StopGradient; only
    # activations produced inside this span are eligible for sharing
    start_idx = -1
    end_idx = -1
    for idx, op in enumerate(netproto.op):
        if namescope not in op.input[0]:
            continue
        if op.type == 'Conv' and start_idx < 0:
            start_idx = idx
        if op.type == 'StopGradient':
            end_idx = idx

    # Hacky way to get activations, think of a better way
    for op in netproto.op[start_idx:end_idx]:
        for b in op.output:
            if b not in external_output:
                activations.append(b)

    heads = [namescope + 'data']

    # Remove last activations, as they are usually accessed externally
    activations = set(activations[:-1])

    shared_blobs = activations
    dont_share_blobs = None
    blob_shapes = None

    op_indices = [
        index for index, op in enumerate(netproto.op[start_idx:end_idx + 2])
    ]

    optim_str = C.memonger_compute_blob_recycling_for_dag(
        netproto.SerializeToString(),
        [str(s).encode('utf-8') for s in heads],
        op_indices,
        set(str(s).encode('utf-8') for s in shared_blobs),
        namescope.encode('utf-8'),
        set() if dont_share_blobs is None else dont_share_blobs,
        {} if blob_shapes is None else blob_shapes)

    optim = caffe2_pb2.NetDef()
    optim.ParseFromString(optim_str)

    assert verify_graph_equality(net.Proto(), optim), \
        "Memonger graph is not equal to original."
    assert verify_inplace_blobs(net.Proto(), optim), \
        "Inplace assignments differ in memonger net."
    return optim
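# A similar sketch for share_freeze_blobs_c2, which routes the same kind of
# net through the C++ blob-recycling pass instead of doing the remapping in
# Python. Note that it hardcodes namescope + 'data' as the head blob, so the
# external input must be named accordingly. As above, the model, blob names,
# and dimensions are illustrative assumptions.
def _example_share_freeze_blobs_c2():
    from caffe2.python import brew, core, model_helper

    m = model_helper.ModelHelper(name="freeze_example_c2")
    with core.NameScope("name_x"):
        conv1 = brew.conv(m, "data", "conv1", dim_in=3, dim_out=8, kernel=3)
        conv2 = brew.conv(m, conv1, "conv2", dim_in=8, dim_out=8, kernel=3)
        m.net.StopGradient(conv2, conv2)
    return share_freeze_blobs_c2(m.net, "name_x")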
def test_resnet_shared_grads(self, with_shapes, gc, dc):
    model = cnn.CNNModelHelper(
        order="NCHW",
        name="test",
        cudnn_exhaustive_search=True,
    )
    with core.NameScope("gpu_0"):
        data = model.net.AddExternalInput("gpu_0/data")
        label = model.net.AddExternalInput("gpu_0/label")
        (_softmax, loss) = resnet.create_resnet50(
            model,
            data,
            num_input_channels=3,
            num_labels=1000,
            label=label,
            is_test=False,
        )

    param_to_grad = model.AddGradientOperators([loss])

    (shapes, types) = workspace.InferShapesAndTypes(
        [model.param_init_net, model.net],
        {'gpu_0/data': [4, 3, 227, 227],
         'gpu_0/label': [4]},
    )

    count_before = count_blobs(model.net.Proto())
    optim_proto = memonger.share_grad_blobs(
        model.net,
        ["gpu_0/loss"],
        set(model.param_to_grad.values()),
        "gpu_0/",
        share_activations=True,
        dont_share_blobs=set([str(param_to_grad["gpu_0/conv1_w"])]),
        blob_shapes=shapes if with_shapes else None,
    )
    self.assertTrue(memonger.verify_graph_equality(model.net.Proto(),
                                                   optim_proto))
    count_after = count_blobs(optim_proto)
    self.assertLess(count_after, count_before)

    # Run model and compare results. We check that the loss is the same
    # and that the final gradient (conv1_w_grad) is the same.
    workspace.RunNetOnce(model.param_init_net)
    data = np.random.rand(4, 3, 227, 227).astype(np.float32)
    label = (np.random.rand(4) * 1000).astype(np.int32)

    workspace.FeedBlob("gpu_0/data", data)
    workspace.FeedBlob("gpu_0/label", label)

    workspace.RunNetOnce(model.net)
    model.net.Proto().type = 'dag'
    model.net.Proto().num_workers = 4
    loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
    conv1_w_grad = workspace.FetchBlob(param_to_grad["gpu_0/conv1_w"])
    workspace.FeedBlob(param_to_grad["gpu_0/conv1_w"], np.array([0.0]))

    workspace.RunNetOnce(optim_proto)
    optimized_loss1 = workspace.FetchBlob("gpu_0/last_out_L1000")
    optim_conv1_w_grad = workspace.FetchBlob(param_to_grad["gpu_0/conv1_w"])

    print("before: {} after: {}".format(count_before, count_after))

    np.testing.assert_almost_equal(loss1, optimized_loss1)
    np.testing.assert_almost_equal(conv1_w_grad, optim_conv1_w_grad)