def FeedBlobWrapper(self, tag, val):
    """Feed ``val`` into the workspace blob named ``tag``.

    When ``self.gpu_en`` is truthy the blob is fed with a CUDA device option
    for GPU 0; otherwise it is fed without an explicit device option.
    """
    if not self.gpu_en:
        workspace.FeedBlob(tag, val)
        return
    cuda_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
    workspace.FeedBlob(tag, val, device_option=cuda_option)
def testFeedFetchBlobIDEEP(self):
    """Feed a random 2x3 float32 array with an IDEEP device option and
    check that fetching the blob returns an equal array."""
    blob_name = "testblob_ideep"
    expected = np.random.randn(2, 3).astype(np.float32)
    ideep_option = core.DeviceOption(caffe2_pb2.IDEEP)
    workspace.FeedBlob(blob_name, expected, ideep_option)
    actual = workspace.FetchBlob(blob_name)
    np.testing.assert_array_equal(expected, actual)
def _run(self, net, param_init_net, param_info):
    """Add the SGD update operators for one parameter.

    Reads the parameter blob and its gradient from ``param_info`` and adds,
    in order: an optional LARS multiplier (dense gradients only), the
    learning-rate blob from ``self.build_lr``, a per-device constant ``ONE``
    blob, an optional momentum blob, and finally the update op itself
    (sparse or dense, with or without momentum).

    Returns ``None``. Nothing is added when ``self.base_learning_rate`` is 0.

    Raises:
        AssertionError: if the base learning rate is negative, or if LARS is
            applied with a negative ``self.lars`` offset.
    """
    param = param_info.blob
    grad = param_info.grad
    if self.base_learning_rate == 0:
        return
    assert self.base_learning_rate > 0, (
        "Expect positive base learning rate, got {}".format(
            self.base_learning_rate))

    grad_is_sparse = isinstance(grad, core.GradientSlice)

    # TODO(zqq): support LARS for sparse parameters
    if self.lars is not None and not grad_is_sparse:
        assert self.lars >= 0, (
            'Lars offset must be nonnegative, got {}'.format(self.lars))
        lars_multiplier = net.Lars(
            [param, grad],
            self.make_unique_blob_name(str(param) + "_lars"),
            offset=self.lars)
        active_scope = scope.CurrentDeviceScope()
        on_gpu = (active_scope is not None
                  and active_scope.device_type == caffe2_pb2.CUDA)
        self.add_lr_multiplier(lars_multiplier, is_gpu_blob=on_gpu)

    # We need negative sign for LR when used directly with WeightedSum
    # below.
    if self.momentum:
        sign = -1
    else:
        sign = 1
    lr, _ = self.build_lr(
        net, param_init_net,
        base_learning_rate=self.base_learning_rate * sign,
        policy=self.policy,
        **(self.init_kwargs))

    device = scope.CurrentDeviceScope()
    if device is None:
        device = core.DeviceOption(caffe2_pb2.CPU)

    # Each GPU/CPU must have its own ONE blob, thus modify the name
    # to include device information.
    ONE = param_init_net.ConstantFill(
        [],
        "ONE_{}_{}{}".format(
            device.device_type, device.cuda_gpu_id, device.node_name),
        shape=[1],
        value=1.0)
    self._aux_params.shared.append(ONE)

    use_momentum = self.momentum > 0
    if use_momentum:
        momentum_data = param_init_net.ConstantFill(
            param, str(param) + "_momentum", value=0.)
        self._aux_params.local.append(momentum_data)

    if not grad_is_sparse:
        # Dense gradient.
        if use_momentum:
            net.MomentumSGDUpdate(
                [grad, momentum_data, lr, param],
                [grad, momentum_data, param],
                momentum=self.momentum,
                nesterov=self.nesterov)
        else:
            net.WeightedSum([param, ONE, grad, lr], param)
        return

    # Sparse gradient: deduplicate before applying the update.
    grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
    if use_momentum:
        net.SparseMomentumSGDUpdate(
            [grad.values, momentum_data, lr, param, grad.indices],
            [grad.values, momentum_data, param],
            momentum=self.momentum,
            nesterov=self.nesterov)
    else:
        net.ScatterWeightedSum(
            [param, ONE, grad.indices, grad.values, lr],
            param)
def CudaDevice(gpu_id):
    """Return a CUDA ``DeviceOption`` built for ``gpu_id``."""
    cuda_option = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
    return cuda_option
# NOTE(review): `num_images`, `frames`, `dets` and `args` are not defined in
# the visible code; this loop presumably belongs to a function whose header
# precedes this chunk -- confirm its indentation when merging.
# Writes one visualization image per frame, numbered from 1, into
# "<out_path>/<vid_name>_vis/".
for i in range(num_images):
    vis_im = _generate_visualizations(frames[i], i, dets['all_boxes'],
                                      dets['all_keyps'], dets['all_tracks'])
    cv2.imwrite(osp.join(args.out_path, args.vid_name + '_vis',
                         '%08d.jpg' % (i + 1)), vis_im)


if __name__ == '__main__':
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    args = parse_args()
    # Compare against None by identity (PEP 8), not with `==`.
    if args.out_path is None:
        args.out_path = args.video_path
    # Video name = last path component up to its first '.'.
    args.vid_name = args.video_path.split('/')[-1].split('.')[0]

    utils.c2.import_custom_ops()
    utils.c2.import_detectron_ops()
    utils.c2.import_contrib_ops()

    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)
    if args.opts is not None:
        cfg_from_list(args.opts)
    assert_and_infer_cfg()

    # Start from an empty visualization directory on every run.
    vis_dir = osp.join(args.out_path, args.vid_name + '_vis')
    if osp.exists(vis_dir):
        shutil.rmtree(vis_dir)
    os.makedirs(vis_dir)

    num_images = _read_video(args)
    gpu_dev = core.DeviceOption(caffe2_pb2.CUDA, cfg.ROOT_GPU_ID)
    name_scope = 'gpu_{}'.format(cfg.ROOT_GPU_ID)
    main(name_scope, gpu_dev, num_images, args)
class TestGradientCalculation(test_util.TestCase):
    """Tests for ``GradientRegistry.GetBackwardPass`` and for gradient
    generation through ``core.Net.AddGradientOperators``.

    Most tests build a short list of forward operators, request the backward
    pass for a given gradient source, and compare the generated operators
    against a hand-written expected list.
    """

    def assertOperatorListEqual(self, operatorDefList1, operatorDefList2):
        """Assert two operator lists are equal, ignoring debug metadata.

        Both lists are modified in place: ``debug_info`` is blanked and
        ``device_option.extra_info`` is emptied before the comparison.
        """
        for op in operatorDefList1:
            op.debug_info = ""
            if op.device_option:
                del op.device_option.extra_info[:]
        for op in operatorDefList2:
            op.debug_info = ""
            if op.device_option:
                del op.device_option.extra_info[:]
        self.assertEqual(operatorDefList1, operatorDefList2)

    @given(device_option=st.sampled_from(
        [None, core.DeviceOption(workspace.GpuDeviceType, 1)]))
    def testDirect(self, device_option):
        """Two chained Direct ops yield two DirectGradient ops in reverse
        order; when a device option is given it is expected on the gradient
        ops as well."""
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('Direct', 'hidden', 'out'),
        ]
        if device_option:
            for op in operators:
                op.device_option.CopyFrom(device_option)
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'out_grad', 'hidden_grad'),
            CreateOperator('DirectGradient', 'hidden_grad', 'in_grad'),
        ]
        if device_option:
            for op in desired_grad_operators:
                op.device_option.CopyFrom(device_option)
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testDirectImplicitGradientSource(self):
        """Passing the gradient source as a list of blob names is expected to
        generate a ConstantFill (value 1.0) for ``out_autogen_grad``."""
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('Direct', 'hidden', 'out'),
        ]
        desired_grad_operators = [
            CreateOperator(
                "ConstantFill", 'out', "out_autogen_grad", value=1.0),
            CreateOperator(
                'DirectGradient', 'out_autogen_grad', 'hidden_grad'),
            CreateOperator('DirectGradient', 'hidden_grad', 'in_grad'),
        ]
        for op in desired_grad_operators:
            op.debug_info = ""
        gradients, _ = GradientRegistry.GetBackwardPass(operators, ['out'])
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testDoesNotGenerateUnnecessaryGradients(self):
        """With the gradient supplied at ``hidden``, only the first op's
        gradient is expected."""
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('Direct', 'hidden', 'out'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'hidden_grad', 'in_grad'),
        ]
        for op in desired_grad_operators:
            op.debug_info = ""
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'hidden': 'hidden_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testDirectButNoOutputGradientGiven(self):
        """An empty gradient source is expected to give no gradient ops."""
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('Direct', 'hidden', 'out'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(operators, {})
        self.assertOperatorListEqual(gradients, [])

    def testDirectInPlace(self):
        """First op runs in place (``in -> in``); the expected gradient ops
        reuse ``in_grad``."""
        operators = [
            CreateOperator('Direct', 'in', 'in'),
            CreateOperator('Direct', 'in', 'out'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'out_grad', 'in_grad'),
            CreateOperator('DirectGradient', 'in_grad', 'in_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testVersionMismatch(self):
        """The backward pass is expected to raise a RuntimeError whose message
        mentions "version" for this x/y overwrite pattern."""
        operators = [
            CreateOperator('Direct', 'x', 'x'),
            CreateOperator('Direct', 'y', 'x'),
            CreateOperator('Direct', 'x', 'y'),
        ]
        try:
            gradients, _ = GradientRegistry.GetBackwardPass(
                operators, {'y': 'y_grad'})
            # Reaching this line means no exception was raised; fail the test.
            self.assertFalse(True, "Should raise exception of incorrect version")
        except RuntimeError as e:
            print(e)
            self.assertTrue("version" in str(e))
            pass

    def testUseOutput(self):
        """UseOutput gradients are expected to take the forward output and
        the output gradient as inputs."""
        operators = [
            CreateOperator('UseOutput', 'in', 'hidden'),
            CreateOperator('UseOutput', 'hidden', 'out'),
            CreateOperator('Direct', 'out', 'sink'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'sink_grad', 'out_grad'),
            CreateOperator(
                'UseOutputGradient',
                ['out', 'out_grad'], 'hidden_grad'),
            CreateOperator(
                'UseOutputGradient',
                ['hidden', 'hidden_grad'], 'in_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'sink': 'sink_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testUseOutputInPlace(self):
        """Same as testUseOutput but with the first UseOutput running in
        place on ``in``."""
        operators = [
            CreateOperator('UseOutput', 'in', 'in'),
            CreateOperator('UseOutput', 'in', 'out'),
            CreateOperator('Direct', 'out', 'sink'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'sink_grad', 'out_grad'),
            CreateOperator(
                'UseOutputGradient',
                ['out', 'out_grad'], 'in_grad'),
            CreateOperator(
                'UseOutputGradient',
                ['in', 'in_grad'], 'in_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'sink': 'sink_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testUseOutputButOutputHasBeenChanged(self):
        """Overwriting an output needed by an earlier op's gradient is
        expected to raise RuntimeError."""
        operators = [
            CreateOperator('UseOutput', 'in', 'hidden'),
            # Note here: we overwrite hidden, but hidden will be needed by the
            # gradient calculation of the first operator, so the gradient
            # registry should return an error.
            CreateOperator('Direct', 'hidden', 'hidden'),
            CreateOperator('UseOutput', 'hidden', 'out'),
            CreateOperator('Direct', 'out', 'sink'),
        ]
        with self.assertRaises(RuntimeError):
            gradients, _ = GradientRegistry.GetBackwardPass(
                operators, {'sink': 'sink_grad'})

    def testUseInput(self):
        """The UseInput gradient is expected to take the forward input and
        the output gradient as inputs."""
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('UseInput', 'hidden', 'out'),
            CreateOperator('Direct', 'out', 'sink'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'sink_grad', 'out_grad'),
            CreateOperator(
                'UseInputGradient',
                ['hidden', 'out_grad'], 'hidden_grad'),
            CreateOperator(
                'DirectGradient',
                'hidden_grad', 'in_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'sink': 'sink_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testUseInputButInputHasBeenChanged(self):
        """Test gradient for the following case:

        in -> out, with UseInput
        in -> in

        Since we overwrite in in op#1, but in will be needed by the gradient
        calculation of op#0, the gradient registry should raise an error.
        """
        operators = [
            CreateOperator('UseInput', 'in', 'out'),
            CreateOperator('Direct', 'in', 'in'),
        ]
        with self.assertRaises(RuntimeError):
            gradients, _ = GradientRegistry.GetBackwardPass(
                operators, {'out': 'out_grad'})

    @given(device_option=st.sampled_from(
        [None, core.DeviceOption(workspace.GpuDeviceType, 1)]))
    def testMultiUseInput(self, device_option):
        """Test gradient for the following case:

        in -> hidden1
        in -> hidden2
        hidden1, hidden2 -> out
        """
        operators = [
            CreateOperator('Direct', 'in', 'hidden1'),
            CreateOperator('Direct', 'in', 'hidden2'),
            CreateOperator('Direct', ['hidden1', 'hidden2'], 'out'),
        ]
        if device_option:
            for op in operators:
                op.device_option.CopyFrom(device_option)
        desired_grad_operators = [
            CreateOperator(
                'DirectGradient',
                'out_grad', ['hidden1_grad', 'hidden2_grad']),
            CreateOperator(
                'DirectGradient',
                'hidden2_grad', 'in_grad'),
            CreateOperator(
                'DirectGradient',
                'hidden1_grad', '_in_grad_autosplit_0'),
            CreateOperator(
                'Sum',
                ['in_grad', '_in_grad_autosplit_0'], 'in_grad'),
        ]
        if device_option:
            for op in desired_grad_operators:
                op.device_option.CopyFrom(device_option)
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {"out": "out_grad"})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testMultiUseInputButWithNoGradient(self):
        """Test gradient for the following case:

        in -> hidden1
        in -(no gradient)-> hidden2
        hidden1, hidden2 -> out
        """
        operators = [
            CreateOperator('Direct', 'in', 'hidden1'),
            CreateOperator('Nogradient', 'in', 'hidden2'),
            CreateOperator('Direct', ['hidden1', 'hidden2'], 'out'),
        ]
        desired_grad_operators = [
            CreateOperator(
                'DirectGradient',
                'out_grad', ['hidden1_grad', 'hidden2_grad']),
            CreateOperator(
                'DirectGradient',
                'hidden1_grad', 'in_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testMultiUseInputAndMultipleVersions(self):
        """Test gradient for the following case:

        in -> in
        in -> hidden1, hidden2
        hidden1, hidden2 -> out
        """
        operators = [
            CreateOperator('Direct', 'in', 'in'),
            CreateOperator('Direct', 'in', 'hidden1'),
            CreateOperator('Direct', 'in', 'hidden2'),
            CreateOperator('Direct', ['hidden1', 'hidden2'], 'out'),
        ]
        desired_grad_operators = [
            CreateOperator(
                'DirectGradient',
                'out_grad', ['hidden1_grad', 'hidden2_grad']),
            CreateOperator(
                'DirectGradient',
                'hidden2_grad', 'in_grad'),
            CreateOperator(
                'DirectGradient',
                'hidden1_grad', '_in_grad_autosplit_0'),
            CreateOperator(
                'Sum',
                ['in_grad', '_in_grad_autosplit_0'], 'in_grad'),
            CreateOperator(
                'DirectGradient',
                'in_grad', 'in_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testMultiUseInputAndMultipleVersionsBig(self):
        """Test gradient for the following case:

        in -> in
        in -> hidden1, hidden2
        hidden1, hidden2 -> in
        in -> hidden3, hidden4, hidden5
        hidden3, hidden4, hidden5 -> out
        """
        operators = [
            CreateOperator('Direct', 'in', 'in'),
            CreateOperator('Direct', 'in', 'hidden1'),
            CreateOperator('Direct', 'in', 'hidden2'),
            CreateOperator('Direct', ['hidden1', 'hidden2'], 'in'),
            CreateOperator('Direct', 'in', 'hidden3'),
            CreateOperator('Direct', 'in', 'hidden4'),
            CreateOperator('Direct', 'in', 'hidden5'),
            CreateOperator('Direct', ['hidden3', 'hidden4', 'hidden5'], 'out'),
        ]
        desired_grad_operators = [
            CreateOperator(
                'DirectGradient',
                'out_grad', ['hidden3_grad', 'hidden4_grad', 'hidden5_grad']),
            CreateOperator(
                'DirectGradient',
                'hidden5_grad', 'in_grad'),
            CreateOperator(
                'DirectGradient',
                'hidden4_grad', '_in_grad_autosplit_0'),
            CreateOperator(
                'DirectGradient',
                'hidden3_grad', '_in_grad_autosplit_1'),
            CreateOperator(
                'Sum',
                ['in_grad', '_in_grad_autosplit_0',
                 '_in_grad_autosplit_1'],
                'in_grad'),
            CreateOperator(
                'DirectGradient',
                'in_grad', ['hidden1_grad', 'hidden2_grad']),
            CreateOperator(
                'DirectGradient',
                'hidden2_grad', 'in_grad'),
            CreateOperator(
                'DirectGradient',
                'hidden1_grad', '_in_grad_autosplit_0'),
            CreateOperator(
                'Sum',
                ['in_grad', '_in_grad_autosplit_0'],
                'in_grad'),
            CreateOperator(
                'DirectGradient',
                'in_grad', 'in_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        for s in gradients:
            print(str(s))
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testGradientMappingUsingSumOp(self):
        """Since Sum is used in accumulating gradients, we will test if
        it is OK to also explicitly use it in the graph."""
        operators = [
            CreateOperator('FC', ['in', 'w', 'b'], 'fc'),
            CreateOperator('Sum', 'fc', 'agg'),
            CreateOperator('AveragedLoss', 'agg', 'loss'),
        ]
        # This should run correctly.
        gradient_ops, _ = GradientRegistry.GetBackwardPass(
            operators, {'loss': 'loss_grad'})
        for s in gradient_ops:
            print(str(s))

    def testGradientCalculationWithPrint(self):
        """Test a common use case where we have Print in the forward pass."""
        operators = [
            CreateOperator('FC', ['in', 'w', 'b'], 'fc'),
            CreateOperator('Print', 'fc', []),
            CreateOperator('AveragedLoss', 'fc', 'loss'),
        ]
        desired_grad_operators = [
            CreateOperator('AveragedLossGradient',
                           ['fc', 'loss_grad'], 'fc_grad'),
            CreateOperator('FCGradient', ['in', 'w', 'fc_grad'],
                           ['w_grad', 'b_grad', 'in_grad']),
        ]
        for g in desired_grad_operators:
            g.is_gradient_op = 1
        # This should run correctly.
        gradient_ops, _ = GradientRegistry.GetBackwardPass(
            operators, {'loss': 'loss_grad'})
        for s in gradient_ops:
            print(str(s))
        self.assertOperatorListEqual(gradient_ops, desired_grad_operators)

    def testStopGradient(self):
        """Only the op after StopGradient is expected to get a gradient
        op."""
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('StopGradient', 'hidden', 'hidden2'),
            CreateOperator('Direct', 'hidden2', 'out'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'out_grad', 'hidden2_grad'),
        ]
        gradients, _ = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)

    def testStopGradientOrphan(self):
        """A StopGradient whose output is not consumed is expected to raise
        ValueError."""
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('StopGradient', 'hidden', 'auto_blobx'),
            CreateOperator('Direct', 'hidden', 'out'),
        ]
        with self.assertRaises(ValueError):
            # This should complain about incorrect use of StopGradient
            gradients, _ = GradientRegistry.GetBackwardPass(
                operators, {'out': 'out_grad'})

    def testStopGradientInplace(self):
        """In-place StopGradient: only the last op gets a gradient op and the
        returned gradient map contains only ``out``."""
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('StopGradient', 'hidden', 'hidden'),
            CreateOperator('Direct', 'hidden', 'out'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'out_grad', 'hidden_grad'),
        ]
        gradients, grad_map = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)
        self.assertEqual(grad_map, {'out': 'out_grad'})

    def testStopGradientWithMultiUseOperators(self):
        """``hidden`` feeds both a Direct op and a StopGradient op; the
        expected gradient ops follow the Direct branch only."""
        operators = [
            CreateOperator('Direct', 'in', 'hidden'),
            CreateOperator('Direct', 'hidden', 'hidden2'),
            CreateOperator('StopGradient', 'hidden', 'hidden3'),
            CreateOperator('Direct', ['hidden2', 'hidden3'], 'out'),
        ]
        desired_grad_operators = [
            CreateOperator('DirectGradient', 'out_grad',
                           ['hidden2_grad', 'hidden3_grad']),
            CreateOperator('DirectGradient', 'hidden2_grad', 'hidden_grad'),
            CreateOperator('DirectGradient', 'hidden_grad', 'in_grad'),
        ]
        gradients, grad_map = GradientRegistry.GetBackwardPass(
            operators, {'out': 'out_grad'})
        self.assertOperatorListEqual(gradients, desired_grad_operators)
        self.assertEqual(
            grad_map, {
                'out': 'out_grad',
                'hidden2': 'hidden2_grad',
                'hidden3': 'hidden3_grad',
                'hidden': 'hidden_grad',
                'in': 'in_grad'
            })

    def test_zero_gradient(self):
        """Adding gradient operators for ``hidden`` is expected to fail until
        ``ZeroGradient`` has been added for ``cell``."""
        net = core.Net("zero_grad_test")

        hidden_prev, cell, gates, seq_lengths, timestep =\
            net.AddExternalInput("h", "c", "g", "s", "t")
        hidden, cell = net.LSTMUnit(
            [hidden_prev, cell, gates, seq_lengths, timestep],
            ["hidden_t", "cell_t"])
        with self.assertRaises(Exception):
            net.AddGradientOperators([hidden])
        net.ZeroGradient(cell, [])
        net.AddGradientOperators([hidden])

    def test_two_grads(self):
        """Request gradients for both ``mul_2`` and ``mul_1``, run the net
        with inputs 1, 2 and 3, and check the gradient of ``input`` is 8."""
        net = core.Net("test_two_grads")
        input, two, three = net.AddExternalInput("input", "two", "three")

        m1 = net.Mul([input, two], "mul_1")
        m2 = net.Mul([m1, three], "mul_2")
        grad_map = net.AddGradientOperators([m2, m1])
        workspace.ResetWorkspace()
        workspace.blobs[input] = np.array([1]).astype(np.float32)
        workspace.blobs[two] = np.array([2]).astype(np.float32)
        workspace.blobs[three] = np.array([3]).astype(np.float32)
        workspace.RunNetOnce(net)
        print(net.Proto())
        for blob in workspace.blobs:
            print(blob, workspace.blobs[blob])
        print("Input grad: ", workspace.blobs[grad_map[str(input)]])
        assert workspace.blobs[grad_map[str(input)]] == 8.0
def rewrite_run_net_simple_xrayocr_lstm(net, ideep=True):
    """Rewrite ``net`` in place so its non-LSTM prefix runs on IDEEP/MKL.

    For the xrayocr model with LSTM, only the non-LSTM part of the net is
    rewritten to enable MKL. The temporary output blob at the break point and
    all external inputs of the LSTM part are then copied to CPU, and the rest
    of the net (two LSTMs) is executed on CPU.

    This only works for the xrayocr LSTM model, which uses the first 'Shape'
    op to decide the break point; after the two LSTMs the result is the
    external output directly, so there is no need to copy back to IDEEP/MKL.

    Args:
        net: net definition to rewrite; its ``op`` list is replaced in place.
        ideep: when true, use the IDEEP copy ops and device type, otherwise
            the MKL copy ops and the MKLDNN device type.

    Raises:
        Exception: if the first external input is not the first input of the
            first op.
        AssertionError: if an external input is also produced by an op, or if
            no 'Shape' op is found.
    """

    def mkl_tmp(name):
        return "{}__MKL__".format(name)

    def cpu_tmp(name):
        return "{}__CPU__".format(name)

    input_blob = net.external_input[0]
    if input_blob != net.op[0].input[0]:
        raise Exception(
            "Input blob: {} is not consumed by first op: {}".format(
                input_blob, net.op[0]))
    # Modify input/outputs to point to copied MKL blobs.
    from_cpu = "CopyCPUToIDEEP" if ideep else "CopyCPUToMKL"
    to_cpu = "CopyIDEEPToCPU" if ideep else "CopyMKLToCPU"
    copy_input_op = core.CreateOperator(
        from_cpu, input_blob, mkl_tmp(input_blob))
    net.op[0].input[0] = mkl_tmp(input_blob)

    # The net may contain some external_inputs falsely added during
    # ONNX->Caffe2. This should be taken care of in early steps during
    # pytorch_to_caffe2, but if not it can cause issues in follow-up steps,
    # so check here to confirm that no external input is the output of an op.
    for ext_blob in net.external_input:
        for op in net.op:
            assert ext_blob not in op.output

    external_output = None
    external_inputs_to_cpu = set()
    find_first_shape_op = False
    cpu_op_start_idx = -1
    for op_idx, op in enumerate(net.op):
        # The first Shape op marks the starting point of the LSTM chunk.
        if not find_first_shape_op:
            if op.type == 'Shape':
                external_output = op.input
                find_first_shape_op = True
                cpu_op_start_idx = op_idx
        else:
            # Any external input in the LSTM part needs to be copied to CPU.
            for in_blob in op.input:
                if in_blob in net.external_input:
                    external_inputs_to_cpu.add(in_blob)

    # Make sure we found the expected break point of the net.
    assert external_output is not None

    def cpu_name(blob):
        # Name to use on the CPU side: the CPU copy for blobs that are copied
        # over, the unchanged name for everything else.
        return cpu_tmp(blob) if blob in external_inputs_to_cpu else blob

    # Create ops to copy the external input blobs used in the LSTM part from
    # IDEEP to CPU.
    copy_extra_input_ops = [
        core.CreateOperator(to_cpu, in_blob, cpu_tmp(in_blob))
        for in_blob in external_inputs_to_cpu
    ]

    # Rename input blobs in the LSTM part to use the CPU copy. Every blob in
    # external_inputs_to_cpu is renamed; the previous form compared against a
    # single `in_blob` left over from an earlier loop.
    for op in net.op[cpu_op_start_idx:]:
        op.input[:] = [cpu_name(blob) for blob in op.input]

    copy_output_ops = [
        core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob)
        for output_blob in external_output]

    # Make the last producer of each break-point blob write the MKL temporary
    # instead, so the copy op above recreates the original name on CPU.
    for output_blob in external_output:
        last_producer_idx = last_producer(net.op, output_blob)
        renamed_outputs = [
            blob if blob != output_blob else mkl_tmp(blob)
            for blob in net.op[last_producer_idx].output
        ]
        net.op[last_producer_idx].output[:] = renamed_outputs

    # Rearrange all ops in the correct order.
    ops = [copy_input_op] + net.op[:cpu_op_start_idx] \
        + copy_output_ops + copy_extra_input_ops + net.op[cpu_op_start_idx:]
    del net.op[:]
    net.op.extend(ops)

    device = caffe2_pb2.IDEEP if ideep else caffe2_pb2.MKLDNN
    for op in net.op:
        # The first Shape op marks the starting point of the LSTM chunk.
        if op.type == 'Shape':
            # All LSTM ops should run on CPU.
            device = caffe2_pb2.CPU
        op.device_option.MergeFrom(core.DeviceOption(device_type=device))
        op.engine = ""

        # RecurrentNetwork has a nested step_net that needs special treatment.
        if op.type == 'RecurrentNetwork':
            for arg in op.arg:
                if arg.name == 'step_net':
                    for nested_op in arg.n.op:
                        # Set device to CPU.
                        nested_op.device_option.MergeFrom(
                            core.DeviceOption(device_type=device))
                        nested_op.engine = ""

                        # Rename inputs of ops in the nested net.
                        nested_op.input[:] = [
                            cpu_name(blob) for blob in nested_op.input]

                    # Rename external inputs of the nested net.
                    arg.n.external_input[:] = [
                        cpu_name(blob) for blob in arg.n.external_input]

    if ideep:
        # Temporarily disable conv+relu fusion until we verify further
        # net.ParseFromString(
        #     C.transform_optimizeForIDEEP(net.SerializeToString()))
        fix_BoxWithNMSLimit(net)
def run_model(self, devices, gpu):
    """Helper function for test_equiv.

    Builds a small FC model, parallelizes it over ``devices`` (CPU devices
    when ``gpu`` is falsy), runs 10 iterations on seeded random data and
    returns the fetched ``<device_prefix>_0/fc_w`` blob.
    """
    def input_builder_fun(model):
        return None

    def model_build_fun(model, loss_scale):
        fc = model.FC(
            "data", "fc", 16, 1,
            ("ConstantFill", {}), ("ConstantFill", {}),
        )
        flat = model.FlattenToVec(fc, "fc_fl")
        activated = model.Sigmoid(flat, "sigm")
        distance = model.SquaredL2Distance([activated, "label"], "sq")
        averaged = model.AveragedLoss(distance, "loss")
        scaled = model.Scale(averaged, scale=loss_scale)

        # For testing explicit sync
        model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
        return [scaled]

    def add_optimizer(model):
        return optimizer.build_sgd(
            model,
            0.1,
            policy="fixed",
            max_gradient_norm=5.0,
            allow_lr_injection=True,
        )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(order="NHWC", name="test{}".format(devices))
    on_cpu = not gpu
    data_parallel_model.Parallelize(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        optimizer_builder_fun=add_optimizer,
        devices=devices,
        cpu_device=on_cpu,
        shared_model=on_cpu,
        combine_spatial_bn=on_cpu,
    )
    data_parallel_model.AddBlobSync(model, ["sync_num"])

    # Light test for LR names
    lr_names = data_parallel_model.GetLearningRateBlobNames(model)
    self.assertGreater(len(lr_names), 0)

    np.random.seed(2603)

    # Each run has same input, independent of number of gpus
    batch_size = 64
    for step in range(10):
        full_data = np.random.rand(batch_size, 16)
        full_labels = np.round(full_data[:, 0])
        per_device = batch_size // len(devices)

        for position, device_id in enumerate(devices):
            lo = position * per_device
            hi = lo + per_device
            data = full_data[lo:hi, :].astype(np.float32)
            labels = full_labels[lo:hi].astype(np.float32)
            device_option = core.DeviceOption(model._device_type, device_id)
            with core.DeviceScope(device_option):
                workspace.FeedBlob(
                    "{}_{}/data".format(model._device_prefix, device_id),
                    data)
                workspace.FeedBlob(
                    "{}_{}/label".format(model._device_prefix, device_id),
                    labels)

        if step == 0:
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        expected_sync = step * 2
        workspace.FeedBlob(
            model._device_prefix + "_0/sync_num",
            np.array([expected_sync]).astype(np.float32),
            device_option=core.DeviceOption(model._device_type, 0))
        workspace.RunNet(model.net.Proto().name)

        # Test AddBlobSync
        for device_id in model._devices:
            sync_blob = workspace.FetchBlob(
                model._device_prefix + "_{}/sync_num".format(device_id))
            sync = sync_blob[0]
            self.assertTrue(abs(sync - expected_sync) < 0.01)

    return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
def run_model(self, V, gpu_devices, cpu_indices):
    """Build, parallelize and run the sparse-gather test model.

    ``V`` is the number of rows of the ``vecs`` blob, ``gpu_devices`` the
    device ids passed to ``Parallelize_GPU``. When ``cpu_indices`` is truthy
    the gather happens on CPU and ``vecs`` itself is the parameter; otherwise
    each GPU gets a ``gpuvecs`` copy that is registered as a parameter.

    Returns a two-element list: the fetched vecs blob (``self.vecs`` or
    ``gpu_0/gpuvecs``) and the fetched ``gpu_0/fc_w`` blob.
    """
    def input_builder_fun(model):
        return None

    def model_build_fun(model, loss_scale):
        if cpu_indices:
            # Gather on CPU, then copy the gathered rows to the GPU.
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                gathered_cpu = model.net.Gather(
                    [self.vecs, 'indices'], 'gathered_cpu')

            gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")
        else:
            # Copy vecs to the GPU once at init time and gather there.
            gpu_vecs = model.param_init_net.CopyCPUToGPU(
                self.vecs, "gpuvecs",
            )
            model.params.append(gpu_vecs)
            gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
        flattened = model.Flatten(gathered, "flattened")
        fc = model.FC(flattened, "fc", 16 * 16, 1,
                      ("ConstantFill", {}), ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)
        return [loss]

    def param_update_fun(model):
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        LR = model.CopyCPUToGPU(self.LR, "LR")
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            if not isinstance(param_grad, core.GradientSlice):
                # Dense gradient: plain weighted-sum update.
                model.WeightedSum([param, ONE, param_grad, LR], param)
            else:
                # Sparse gradient: momentum SGD update on the touched rows.
                param_momentum = model.param_init_net.ConstantFill(
                    [param],
                    param + '_momentum',
                    value=0.0,
                )
                model.net.SparseMomentumSGDUpdate(
                    [
                        param_grad.values,
                        param_momentum,
                        LR,
                        param,
                        param_grad.indices,
                    ],
                    [
                        param_grad.values, param_momentum, param
                    ],
                    momentum=0.1,
                    nesterov=0,
                )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="sparse_test{}".format(gpu_devices),
    )

    # Blobs shared by all devices are created under the "cpu" name scope with
    # a CPU device option.
    with core.NameScope("cpu"):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.ITER = model.Iter("ITER")
            self.LR = model.net.LearningRate(
                [self.ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            self.vecs = model.param_init_net.UniformFill(
                [], "vecs", shape=[V, 16])
            if cpu_indices:
                model.params.append(self.vecs)
            self.ONE_CPU = model.param_init_net.ConstantFill(
                [], "ONE_CPU", shape=[1], value=1.0,
            )

    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices,
    )

    # Update the vecs
    if cpu_indices:
        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                for param in model.GetParams():
                    param_grad = model.param_to_grad[param]
                    model.ScatterWeightedSum([param, self.ONE_CPU,
                                              param_grad.indices,
                                              param_grad.values,
                                              self.LR],
                                             self.vecs)
    else:
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)

    np.random.seed(2603)

    # Each run has same input, independent of number of gpus
    batch_size = 64
    for i in range(0, 10):
        full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
            batch_size, 16
        )
        full_labels = full_indices[:, 0] % 2
        batch_per_device = batch_size // len(gpu_devices)

        # Feed each device its contiguous slice of the batch.
        for (j, g) in enumerate(gpu_devices):
            st = j * batch_per_device
            en = st + batch_per_device
            indices = full_indices[st:en, :].astype(np.int32)
            labels = full_labels[st:en].astype(np.float32)

            # Indices are fed with a CPU or CUDA device option, matching
            # where the Gather op was built.
            device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
            if not cpu_indices:
                device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)

            with core.DeviceScope(device_for_indices):
                workspace.FeedBlob("gpu_{}/indices".format(g), indices)

            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                workspace.FeedBlob("gpu_{}/label".format(g), labels)

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            # Force vecs to be same on all runs
            orig_vecs = np.random.rand(V, 16).astype(np.float32)
            workspace.FeedBlob(
                self.vecs,
                orig_vecs
            )
            if not cpu_indices:
                for g in gpu_devices:
                    workspace.FeedBlob(
                        "gpu_{}/gpuvecs".format(g),
                        orig_vecs,
                        device_option=core.DeviceOption(caffe2_pb2.CUDA, g),
                    )
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)
        if len(gpu_devices) == 2:
            if not cpu_indices:
                idx = workspace.FetchBlob("gpu_0/indices")
                idx = list(idx.flatten())
                n = len(idx)
                nu = len(set(idx))
                assert n == nu, "We cannot have duplicate indices"

    # Sanity check to see the vecs were updated
    self.assertFalse(
        np.allclose(workspace.FetchBlob(self.vecs), orig_vecs))
    return [workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"),
            workspace.FetchBlob("gpu_0/fc_w")]
def test_device_scope_check(self):
    """Expect ``Parallelize_GPU`` (called with ``None`` arguments) to raise
    AssertionError when invoked inside a CUDA device scope."""
    with self.assertRaises(AssertionError), \
            core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
        data_parallel_model.Parallelize_GPU(None, None, None)
def run_model(self, devices, gpu):
    """Helper function for test_equiv.

    Builds an LSTM model, parallelizes it over ``devices`` (CPU devices when
    ``gpu`` is falsy), runs 10 iterations on seeded random data and returns
    the fetched ``<device_prefix>_0/partest/i2h_w`` blob.
    """
    def input_builder_fun(model):
        return None

    def model_build_fun(model, loss_scale):
        state_shape = [1, self.batch_per_device, self.hidden_dim]
        workspace.FeedBlob(
            core.ScopedBlobReference("seq_lengths"),
            np.array([self.T] * self.batch_per_device, dtype=np.int32)
        )
        model.param_init_net.ConstantFill(
            [], "hidden_init", value=0.0, shape=state_shape)
        model.param_init_net.ConstantFill(
            [], "cell_init", value=0.0, shape=state_shape)

        output, _, _, _ = rnn_cell.LSTM(
            model=model,
            input_blob="data",
            seq_lengths="seq_lengths",
            initial_states=("hidden_init", "cell_init"),
            dim_in=self.input_dim,
            dim_out=self.hidden_dim,
            scope="partest",
        )

        # A silly loss function
        dist = model.Sub([output, "target"], "dist")
        averaged = model.AveragedLoss(dist, "loss")
        scaled = model.Scale(averaged, "loss_scaled", scale=loss_scale)
        return [scaled]

    def param_update_fun(model):
        ITER = model.Iter("ITER")
        LR = model.net.LearningRate(
            [ITER], "LR", base_lr=(-0.1), policy="fixed")
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0)
        for param in model.GetParams():
            grad_blob = model.param_to_grad[param]
            model.WeightedSum([param, ONE, grad_blob, LR], param)

        assert len(model.GetParams()) == len(model.params) // len(model._devices)

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(name="recurrent_test{}".format(devices))

    self.T = 8
    self.batch_size = 64
    self.input_dim = 8
    self.hidden_dim = 31
    self.batch_per_device = self.batch_size // len(devices)

    data_parallel_model.Parallelize(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=devices,
        optimize_gradient_memory=True,
        cpu_device=not gpu,
    )

    # Change all initialization to be ConstantFills so that
    # the everything is deterministic
    for init_op in model.param_init_net.Proto().op:
        if init_op.type.endswith('Fill'):
            init_op.type = 'ConstantFill'

    # Each run has same input, independent of number of gpus
    np.random.seed(20150210)
    for step in range(10):
        full_data = np.random.rand(self.T, self.batch_size, self.input_dim)
        full_target = np.random.rand(
            self.T, self.batch_size, self.hidden_dim)

        for position, device_id in enumerate(devices):
            lo = position * self.batch_per_device
            hi = lo + self.batch_per_device
            data = full_data[:, lo:hi, :].astype(np.float32)
            targets = full_target[:, lo:hi, :].astype(np.float32)
            device_option = core.DeviceOption(model._device_type, device_id)
            with core.DeviceScope(device_option):
                workspace.FeedBlob(
                    "{}_{}/data".format(model._device_prefix, device_id),
                    data)
                workspace.FeedBlob(
                    "{}_{}/target".format(model._device_prefix, device_id),
                    targets)

        if step == 0:
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)

    return workspace.FetchBlob("{}_0/partest/i2h_w".format(model._device_prefix))
def initialize_master_xpu_model_params(model, weights_file, opts, reset_epoch):
    """Load a pickled checkpoint and feed its weights to the root device.

    :param model: Model whose parameters (``model.GetAllParams()``) should
        be initialized; when None only the metadata is read.
    :param weights_file: Path to the pickled checkpoint. It may either be
        the blob dictionary itself or wrap it under a 'blobs' key.
    :param opts: Options dictionary; reads
        ``opts['model_param']['base_learning_rate']`` and
        ``opts['distributed']['first_xpu_id' | 'device']``.
    :param reset_epoch: When true, ignore the stored epoch and best metric.
    :return: Tuple ``(start_epoch, lr, best_metric)``.
    """
    log.info("Initializing model params from file: {}".format(weights_file))
    # Pickle streams are binary; text mode breaks on Python 3 and on any
    # platform that translates newlines.
    # NOTE: pickle.load can execute arbitrary code -- only load checkpoints
    # that come from a trusted source.
    with open(weights_file, 'rb') as fopen:
        blobs = pickle.load(fopen)
    if 'blobs' in blobs:
        blobs = blobs['blobs']

    start_epoch = 0
    best_metric = float('-inf')
    if 'epoch' in blobs:
        log.info('epoch {} is found in model file'.format(blobs['epoch']))
        if not reset_epoch:
            start_epoch = blobs['epoch']
        else:
            log.info('Reset epoch')
    else:
        log.info('no epoch is found in model file')

    # A learning rate stored in the checkpoint overrides the configured one.
    lr = opts['model_param']['base_learning_rate']
    if 'lr' in blobs:
        lr = blobs['lr']
    if 'best_metric' in blobs and not reset_epoch:
        best_metric = blobs['best_metric']

    if model is not None:
        log.info('initialize model parameters using weights file: {}'.format(
            weights_file))
        ws_blobs = workspace.Blobs()
        # Ordered, de-duplicated set of parameter names without device scope.
        unscoped_blob_names = OrderedDict()
        for blob in model.GetAllParams():
            unscoped_blob_names[unscope_name(str(blob))] = True

        root_xpu_id = opts['distributed']['first_xpu_id']
        device = opts['distributed']['device']
        caffe2_pb2_DEVICE = (
            caffe2_pb2.CUDA if device == 'gpu' else caffe2_pb2.CPU)
        with core.NameScope('{}_{}'.format(device, root_xpu_id)):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2_DEVICE, 0)):
                for unscoped_blob_name in unscoped_blob_names:
                    scoped_blob_name = scoped_name(unscoped_blob_name)
                    if unscoped_blob_name not in blobs:
                        log.info('{:s} not found'.format(unscoped_blob_name))
                        continue
                    log.info(
                        '{:s} loaded from weights file into: {:s}'.format(
                            unscoped_blob_name, scoped_blob_name))
                    # Only blobs that already exist in the workspace are
                    # overwritten, and only when the shapes agree.
                    if scoped_blob_name not in ws_blobs:
                        continue
                    saved_blob = blobs[unscoped_blob_name]
                    ws_blob = workspace.FetchBlob(scoped_blob_name)
                    if ws_blob.shape != saved_blob.shape:
                        log.info(
                            ('Workspace blob {} with shape {} does '
                             'not match weights file shape {}').format(
                                unscoped_blob_name, ws_blob.shape,
                                saved_blob.shape))
                    else:
                        workspace.FeedBlob(
                            scoped_blob_name,
                            saved_blob.astype(np.float32, copy=False))
    else:
        log.info('Skip initializing model parameters from file: {}'.format(
            weights_file))
    log.info('Complete initialize_master_xpu_model_params')
    return start_epoch, lr, best_metric
def normalize_dense_matrix(
    self,
    input_matrix: str,
    features: List[int],
    normalization_parameters: Dict[int, NormalizationParameters],
    blobname_prefix: str,
    split_expensive_feature_groups: bool,
) -> Tuple[str, List[str]]:
    """
    Normalizes a dense input matrix, one feature type at a time.

    The columns of ``input_matrix`` are sliced per feature type (using the
    boundaries from ``_get_type_boundaries``), every slice goes through
    ``preprocess_blob`` and the results are concatenated along axis 1.
    All operators are created inside a CPU device scope; the original
    note gives the reason: the Caffe2 BatchBoxCox operator isn't
    implemented on CUDA GPU.

    :param input_matrix: Input matrix to normalize.
    :param features: Feature ids, ordered like the columns of input_matrix.
    :param normalization_parameters: Mapping from feature id to
        NormalizationParameters.
    :param blobname_prefix: Prefix for input blobs to norm_net.
    :param split_expensive_feature_groups: Forwarded to
        ``_should_split_feature_group`` to decide whether a feature type
        is processed in several column intervals.
    :return: The concatenated blob and the collected parameter blob names.
    """
    cpu_option = core.DeviceOption(caffe2_pb2.CPU)
    with core.DeviceScope(cpu_option):
        feature_starts = self._get_type_boundaries(
            features, normalization_parameters)

        normalized_input_blobs = []
        parameters: List[str] = []
        last_type_index = len(FEATURE_TYPES) - 1
        for type_index, feature_type in enumerate(FEATURE_TYPES):
            start_index = feature_starts[type_index]
            end_index = (
                len(normalization_parameters)
                if type_index == last_type_index
                else feature_starts[type_index + 1]
            )
            if start_index == end_index:
                continue  # No features of this type

            do_split, split_intervals = self._should_split_feature_group(
                split_expensive_feature_groups, start_index, end_index,
                feature_type)

            # Phase 1: create every Slice op for this feature type.
            column_slices = []
            if do_split:
                bounds = zip(split_intervals[:-1], split_intervals[1:])
                for part, (lo, hi) in enumerate(bounds):
                    part_blob = self._get_input_blob_indexed(
                        blobname_prefix, feature_type, part)
                    C2.net().Slice(
                        [input_matrix],
                        [part_blob],
                        starts=[0, lo],
                        ends=[-1, hi],
                    )
                    column_slices.append((part_blob, lo, hi))
            else:
                whole_blob = self._get_input_blob(
                    blobname_prefix, feature_type)
                C2.net().Slice(
                    [input_matrix],
                    [whole_blob],
                    starts=[0, start_index],
                    ends=[-1, end_index],
                )
                column_slices.append((whole_blob, start_index, end_index))

            # Phase 2: normalize each slice with its features' parameters.
            for sliced_blob, lo, hi in column_slices:
                slice_params = [
                    normalization_parameters[feature_id]
                    for feature_id in features[lo:hi]
                ]
                normalized_blob, blob_parameters = self.preprocess_blob(
                    sliced_blob, slice_params)
                logger.info(
                    "Processed split ({}, {}) for feature type {}".format(
                        lo, hi, feature_type))
                parameters.extend(blob_parameters)
                normalized_input_blobs.append(normalized_blob)

        for position, blob in enumerate(normalized_input_blobs):
            logger.info("input# {}: {}".format(position, blob))
        concatenated_input_blob, _concat_dim = C2.Concat(
            *normalized_input_blobs, axis=1)
    return concatenated_input_blob, parameters
def __init__(
    self,
    cli_args,
    model=None,
    tag=None,
    enable_prof=False,
):
    """Create the input queues and the wrapped Wide_and_Deep model.

    cli_args supplies use_gpu, arch_embedding_size (a "-"-separated
    string, one entry per embedding table) and queue; model, tag and
    enable_prof are forwarded unchanged to Wide_and_Deep.
    """
    super(Wide_and_Deep_Wrapper, self).__init__()
    self.args = cli_args

    # GPU Enable Flags
    gpu_en = self.args.use_gpu
    if not gpu_en:
        device_opt = core.DeviceOption(caffe2_pb2.CPU)
        print("(Wrapper) Using CPU...")
    else:
        device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
        # NOTE(review): this is read as an attribute, not called -- confirm
        # that C.num_cuda_devices is a value and not a function.
        ngpus = C.num_cuda_devices  # 1
        print("(Wrapper) Using {} GPU(s)...".format(ngpus))
    self.gpu_en = gpu_en

    num_tables = len(cli_args.arch_embedding_size.split("-"))

    # We require 3 datastructures in caffe2 to enable non-blocking inputs
    # for Wide_and_Deep: a queue, an input blob and an input net per input.
    # Inputs are enqueued when they arrive on the "server" or "core" and
    # dequeued by the model's inference engine.
    # Input Blob -> Input Net -> ID Q ===> Wide_and_Deep model
    self.id_qs, self.id_input_blobs, self.id_input_nets = [], [], []
    # Same thing for the lengths inputs
    self.len_qs, self.len_input_blobs, self.len_input_nets = [], [], []

    sparse_inputs = (
        ("id", self.id_qs, self.id_input_blobs, self.id_input_nets),
        ("len", self.len_qs, self.len_input_blobs, self.len_input_nets),
    )
    for table in range(num_tables):
        for queue_tag, queues, blobs, nets in sparse_inputs:
            queue, blob, net = self.build_wnd_sparse_queue(
                tag=queue_tag, qid=table)
            queues.append(queue)
            blobs.append(blob)
            nets.append(net)

    fc_queue = self.build_wnd_fc_queue()
    self.fc_q, self.fc_input_blob, self.fc_input_net = fc_queue

    # The queues are only handed to the model when queueing is enabled.
    queue_kwargs = {}
    if self.args.queue:
        queue_kwargs = dict(
            id_qs=self.id_qs, len_qs=self.len_qs, fc_q=self.fc_q)
    with core.DeviceScope(device_opt):
        self.wnd = Wide_and_Deep(
            cli_args, model, tag, enable_prof, **queue_kwargs)
def test_lstm_extract_predictor_net(self):
    """Extract a predictor net from an LSTM model and check that it runs.

    The model is built under DeviceOption(CPU, 0) and exported with
    DeviceOption(CPU, 1); the ops of the recurrent step net must carry the
    export device option.
    """
    model = ModelHelper(name="lstm_extract_test")

    build_device = core.DeviceOption(caffe2_pb2.CPU, 0)
    with core.DeviceScope(build_device):
        output, _, _, _ = rnn_cell.LSTM(
            model=model,
            input_blob="input",
            seq_lengths="seqlengths",
            initial_states=("hidden_init", "cell_init"),
            dim_in=20,
            dim_out=40,
            scope="test",
            drop_states=True,
            return_last_layer_only=True,
        )

    # Run param init net to get the shapes for all inputs
    workspace.RunNetOnce(model.param_init_net)
    shapes = {}
    for blob_name in workspace.Blobs():
        shapes[blob_name] = workspace.FetchBlob(blob_name).shape

    # But export in CPU
    export_device = core.DeviceOption(caffe2_pb2.CPU, 1)
    (predict_net, export_blobs) = ExtractPredictorNet(
        net_proto=model.net.Proto(),
        input_blobs=["input"],
        output_blobs=[output],
        device=export_device,
    )

    # Create the net and run once to see it is valid
    # Populate external inputs with correctly shaped random input
    # and also ensure that the export_blobs was constructed correctly.
    workspace.ResetWorkspace()
    shapes['input'] = [10, 4, 20]
    shapes['cell_init'] = [1, 4, 40]
    shapes['hidden_init'] = [1, 4, 40]

    print(predict_net.Proto().external_input)
    self.assertTrue('seqlengths' in predict_net.Proto().external_input)

    for ext_input in predict_net.Proto().external_input:
        if ext_input == 'seqlengths':
            workspace.FeedBlob(
                "seqlengths",
                np.array([10] * 4, dtype=np.int32)
            )
            continue
        zeros = np.zeros(shapes[ext_input]).astype(np.float32)
        workspace.FeedBlob(ext_input, zeros)
        if ext_input != 'input':
            self.assertTrue(ext_input in export_blobs)

    print(str(predict_net.Proto()))
    self.assertTrue(workspace.CreateNet(predict_net.Proto()))
    self.assertTrue(workspace.RunNet(predict_net.Proto().name))

    # Validate device options set correctly for the RNNs
    import google.protobuf.text_format as protobuftx
    for op in predict_net.Proto().op:
        if op.type != 'RecurrentNetwork':
            continue
        for arg in op.arg:
            if arg.name == "step_net":
                step_proto = caffe2_pb2.NetDef()
                protobuftx.Merge(arg.s, step_proto)
                for step_op in step_proto.op:
                    step_device = step_op.device_option
                    self.assertEqual(0, step_device.device_type)
                    self.assertEqual(1, step_device.cuda_gpu_id)
            elif arg.name == 'backward_step_net':
                # The backward step net is expected to be empty here.
                self.assertEqual("", arg.s)
def run_model(self, V, gpu_devices):
    """Build and run a data-parallel model with Gather-ed (sparse) params.

    :param V: Number of rows of the gathered blobs; also the range the
        random indices are drawn from.
    :param gpu_devices: GPU ids handed to Parallelize_GPU; the batch of 32
        is split evenly between them.

    Runs ten iterations and, after each one, asserts that every fetched
    ``gpu_vec_gathered_{0,1}_grad`` blob has as many rows as the index blob
    on gpu 0. Returns nothing.
    """
    def input_builder_fun(model):
        # Inputs are fed directly into the workspace in the loop below.
        return None

    def model_build_fun(model, loss_scale):
        gpu_vecs_gathered = []
        gpu_vecs = []
        # Copy each CPU blob to the GPU; all but the third copy
        # (num == 2) are registered as parameters.
        for num, vec in enumerate(self.vecs):
            gpu_vec = model.param_init_net.CopyCPUToGPU(
                vec, 'gpuvec_{}'.format(num),
            )
            if num != 2:
                model.params.append(gpu_vec)
            gpu_vecs.append(gpu_vec)
        for num, gpu_vec in enumerate(gpu_vecs):
            gpu_vec_gathered = model.net.Gather(
                [gpu_vec, 'indices'],
                ['gpu_vec_gathered_{}'.format(num)]
            )
            gpu_vecs_gathered.append(gpu_vec_gathered)

        assert len(gpu_vecs_gathered) == 3

        # FC inputs are (input, weights, bias) = gathered blobs 2, 0, 1.
        fc = model.net.FC(
            [
                gpu_vecs_gathered[2],
                gpu_vecs_gathered[0],
                gpu_vecs_gathered[1],
            ],
            ['fc'],
        )
        _, loss = model.net.SoftmaxWithLoss(
            [fc, 'label'],
            ['ce_loss', 'avg_loss'],
            only_loss=True,
        )
        loss = model.Scale(loss, scale=loss_scale)
        model.net.Print(loss, [], limit=10)
        return [loss]

    def param_update_fun(model):
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        # The learning rate is computed once on the CPU (self.LR below)
        # and copied to each device.
        LR = model.CopyCPUToGPU(self.LR, "LR")
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            if not isinstance(param_grad, core.GradientSlice):
                model.WeightedSum([param, ONE, param_grad, LR], param)
            else:
                # Sparse gradient: scatter the gradient rows into param.
                # Note the weight used here is ONE, not LR.
                model.net.ScatterWeightedSum(
                    [
                        param,
                        ONE,
                        param_grad.indices,
                        param_grad.values,
                        ONE,
                    ],
                    param,
                )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="sparse_test{}".format(gpu_devices),
    )
    batch_size = 32
    batch_per_device = batch_size // len(gpu_devices)

    # Everything in this scope is created under the "cpu" name scope with
    # a CPU device option.
    with core.NameScope("cpu"):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.ITER = model.Iter("ITER")
            self.LR = model.net.LearningRate(
                [self.ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            '''
            self.vecs consists of 3 big blobs on which we call Gather:
            1) FC weights, shape=(V, 16)
            2) FC bias, shape=(V)
            3) FC input, shape=(batch_per_device, 16)
            '''
            # NOTE(review): the fills below create vec_0 and vec_1 with
            # shape [V, 16] and vec_2 with [batch_per_device, 16]; the
            # arrays fed over them in the first iteration have shapes
            # (V, 16), (V,) and (V, 16) -- confirm which is intended.
            self.vecs = [
                model.param_init_net.UniformFill(
                    [], "vec_{}".format(num), shape=[V, 16])
                for num in range(2)
            ]
            self.vecs.append(
                model.param_init_net.UniformFill(
                    [],
                    "vec_2",
                    shape=[batch_per_device, 16]
                )
            )
            self.ONE_CPU = model.param_init_net.ConstantFill(
                [], "ONE_CPU", shape=[1], value=1.0,
            )

    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices,
    )

    # Update the vecs
    # (copies gpu_0's first two gpuvec blobs back into the CPU blobs)
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
        for num, vec in enumerate(self.vecs[:-1]):
            model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec)

    # Each run has same input, independent of number of gpus
    for i in range(0, 10):
        # Re-seeding inside the loop makes every iteration draw the same
        # indices (and, on i == 0, the same initial vectors).
        np.random.seed(2603)
        full_indices = np.random.permutation(V)[:batch_size].reshape(
            batch_size
        )
        full_labels = full_indices[:] % batch_per_device

        # Give each device its contiguous share of the batch.
        for (j, g) in enumerate(gpu_devices):
            st = j * batch_per_device
            en = st + batch_per_device
            indices = full_indices[st:en].astype(np.int32)
            labels = full_labels[st:en].astype(np.int32)

            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                workspace.FeedBlob("gpu_{}/indices".format(g), indices)
                workspace.FeedBlob("gpu_{}/label".format(g), labels)

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            # Force vecs to be same on all runs
            orig_vecs = [
                np.random.rand(V, 16).astype(np.float32),
                np.random.rand(V).astype(np.float32),
                np.random.rand(V, 16).astype(np.float32),
            ]
            for vec, orig_vec in zip(self.vecs, orig_vecs):
                workspace.FeedBlob(
                    vec,
                    orig_vec
                )
            # Overwrite the per-device copies as well.
            for g in gpu_devices:
                for num, orig_vec in enumerate(orig_vecs):
                    workspace.FeedBlob(
                        "gpu_{}/gpuvec_{}".format(g, num),
                        orig_vec,
                        device_option=core.DeviceOption(
                            caffe2_pb2.CUDA, g),
                    )
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)
        idx = workspace.FetchBlob('gpu_0/indices')
        # Gradients of the two gathered parameter blobs on every device.
        grad_slices = [
            workspace.FetchBlob(
                'gpu_{}/gpu_vec_gathered_{}_grad'.format(g, num))
            for g in gpu_devices for num in range(2)
        ]
        for grad_slice in grad_slices:
            # print (len(idx), len(grad_slice))
            assert len(idx) == len(grad_slice), (
                'Number of indices {} is not same as number of gradient '
                'slices {}. This might lead to illegal memory access'.format(
                    len(idx), len(grad_slice)
                )
            )
def test_resnet50_core(self):
    """Compare resnet50 outputs of the plain Caffe2 net and its TRT cut.

    Both variants are warmed up, timed over the same number of runs on
    CUDA device 0, and their outputs must agree within rtol=1e-3.
    """
    N = 2
    warmup = 20
    repeat = 100

    def _timed_runs(net_name):
        # Warm up, then return the wall-clock seconds of `repeat` runs.
        for _ in range(warmup):
            workspace.RunNet(net_name)
        begin = time.time()
        for _ in range(repeat):
            workspace.RunNet(net_name)
        return time.time() - begin

    def _fetch_outputs(output_names):
        values = [workspace.FetchBlob(name) for name in output_names]
        return namedtupledict('Outputs', output_names)(*values)

    print("Batch size: {}, repeat inference {} times, warmup {} times".
          format(N, repeat, warmup))
    init_net, pred_net, _ = self._get_c2_model('resnet50')
    self._add_head_tail(pred_net, 'real_data', 'real_softmax')
    input_blob_dims = (N, 3, 224, 224)
    input_name = "real_data"

    # Pin both nets and every predictor op to CUDA device 0 with CUDNN.
    device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
    init_net.device_option.CopyFrom(device_option)
    pred_net.device_option.CopyFrom(device_option)
    for op in pred_net.op:
        op.device_option.CopyFrom(device_option)
        op.engine = 'CUDNN'
    net_outputs = pred_net.external_output
    Y_c2 = None
    data = np.random.randn(*input_blob_dims).astype(np.float32)
    c2_time = 1

    # Reference run with the unmodified net.
    workspace.SwitchWorkspace("gpu_test", True)
    with core.DeviceScope(device_option):
        workspace.FeedBlob(input_name, data)
        workspace.RunNetOnce(init_net)
        workspace.CreateNet(pred_net)
        c2_time = _timed_runs(pred_net.name)
        Y_c2 = _fetch_outputs(net_outputs)
    workspace.ResetWorkspace()

    # Fill the workspace with the weights
    with core.DeviceScope(device_option):
        workspace.RunNetOnce(init_net)

    # Cut the graph
    conversion_start = time.time()
    pred_net_cut = transform_caffe2_net(pred_net,
                                        {input_name: input_blob_dims},
                                        build_serializable_op=True)
    del init_net, pred_net
    #_print_net(pred_net_cut)

    Y_trt = None
    input_name = pred_net_cut.external_input[0]
    print("C2 runtime: {}s".format(c2_time))
    with core.DeviceScope(device_option):
        workspace.FeedBlob(input_name, data)
        workspace.CreateNet(pred_net_cut)
        conversion_time = time.time() - conversion_start
        print("Conversion time: {:.2f}s".format(conversion_time))

        trt_time = _timed_runs(pred_net_cut.name)
        print("TRT runtime: {}s, improvement: {}%".format(
            trt_time, (c2_time - trt_time) / c2_time * 100))
        Y_trt = _fetch_outputs(net_outputs)
    np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)
def caffe2_yellowfin(self, zero_debias, grad_coef, n_dim, n_iter, gpu):
    """Run the Caffe2 YellowFin optimizer and collect its statistics.

    The gradient fed to the optimizer is grad_coef times the iteration
    counter, broadcast over n_dim elements. Returns a dict keyed by
    iteration index (index 0 is skipped) holding 'h_max', 'h_min', 'var',
    'dist', 'lr' and 'mu'.
    """
    alpha = 1.0
    mu = 0.0
    beta = 0.999
    curv_win_width = 20
    epsilon = 1e-6

    results = {}
    net = core.Net("net")
    param_init_net = core.Net("param_init_net")
    workspace.ResetWorkspace()

    # The iteration counter and its mutex are created in a CPU scope.
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
        iteration = param_init_net.ConstantFill(
            [],
            "iteration",
            shape=[1],
            value=0,
            dtype=core.DataType.INT64)
        iter_mutex = param_init_net.CreateMutex([], ["iteration_mutex"])
        net.AtomicIter([iter_mutex, iteration], [iteration])
    pre_grad = param_init_net.ConstantFill(
        [],
        "pre_grad",
        shape=[n_dim],
        value=grad_coef
    )
    if gpu:
        iteration = net.CopyCPUToGPU(
            [iteration],
            "iteration_cpu"
        )
    iteration_float = net.Cast([iteration], "iteration_float")
    grad = net.Mul([pre_grad, iteration_float], "grad", broadcast=True)
    w = param_init_net.ConstantFill([], "w", shape=[n_dim], value=0.0)

    # a hack to create an object with __dict__
    param_info = lambda: None
    param_info.blob = w
    param_info.grad = grad

    yellowfin = optimizer.YellowFinOptimizer(
        alpha=alpha,
        mu=mu,
        beta=beta,
        curv_win_width=curv_win_width,
        epsilon=epsilon,
        zero_debias=zero_debias
    )
    yellowfin._run(net, param_init_net, param_info)

    workspace.RunNetOnce(param_init_net)
    workspace.CreateNet(net, overwrite=True)
    for step in range(n_iter):
        workspace.RunNet(net)
        scalars = workspace.FetchBlob("w_scalars_memory")
        g_norm2_avg = scalars[1]
        g_norm2_min_avg = scalars[2]
        g_norm2_max_avg = scalars[3]
        distance_avg = scalars[4]
        g_avg_blob = workspace.FetchBlob("w_g_avg")
        res_lr = workspace.FetchBlob("w_lr_avg")[0]
        res_mu = workspace.FetchBlob("w_mu_avg")[0]

        seen = step + 1  # number of iterations run so far
        g_deb = self.deb(g_avg_blob, beta, seen, zero_debias)
        raw_variance = (
            self.deb(g_norm2_avg, beta, seen, zero_debias)
            - g_deb.dot(g_deb)
        )
        variance = max(raw_variance, epsilon)
        if step == 0:
            continue
        h_max = np.exp(self.deb(g_norm2_max_avg, beta, seen, zero_debias))
        h_min = np.exp(self.deb(g_norm2_min_avg, beta, seen, zero_debias))
        dist = self.deb(distance_avg, beta, seen, zero_debias)
        results[step] = {
            'h_max': h_max,
            'h_min': h_min,
            'var': variance,
            'dist': dist,
            'lr': res_lr,
            'mu': res_mu
        }
    return results
import sys
sys.path.insert(0, '/home/ernie/caffe2/build')
from caffe2.python import cnn, workspace, core
from caffe2.proto import caffe2_pb2
import numpy as np
import time

# Runs the SmoothL1LossGradient operator once on the arrays stored in
# data1.npy / data2.npy (read from the current directory) and fetches the
# resulting 'loss' blob.

# To run on the GPU instead, build the device option like this:
#device_opts = caffe2_pb2.DeviceOption()
#device_opts.device_type = caffe2_pb2.CUDA
#device_opts.cuda_gpu_id = 0
device_opts = core.DeviceOption(caffe2_pb2.CPU, 0)

net = core.Net("smoothL1Loss_test")
net.SmoothL1LossGradient(
    ["data1", "data2", "avg_loss"], "loss", device_option=device_opts)
# Function-call form works on both Python 2 and Python 3; the bare print
# statement is a syntax error on Python 3.
print(net.Proto())

data1 = np.load('data1.npy')
data2 = np.load('data2.npy')
# Third operator input: a single float32 one.
avg_loss = np.ones(1, dtype=np.float32)

workspace.FeedBlob("data1", data1, device_option=device_opts)
workspace.FeedBlob("data2", data2, device_option=device_opts)
workspace.FeedBlob("avg_loss", avg_loss, device_option=device_opts)

workspace.CreateNet(net.Proto())
workspace.RunNet("smoothL1Loss_test", 1)
caffe2_out = workspace.FetchBlob('loss')
def feature_extractor(load_model_path=None, test_data=None, gpu_list=None,
                      num_gpus=0, batch_size=4, clip_per_video=1,
                      decode_type=2, clip_length_rgb=4, sampling_rate_rgb=1,
                      scale_h=128, scale_w=171, crop_size=112,
                      video_res_type=0, num_decode_threads=4, multi_label=0,
                      num_labels=101, input_type=0, clip_length_of=8,
                      sampling_rate_of=2, frame_gap_of=2,
                      do_flow_aggregation=0, flow_data_type=0,
                      get_video_id=1, get_start_frame=0, use_local_file=1,
                      crop_per_clip=1, db_type='pickle',
                      model_name='r2plus1d', model_depth=18, num_channels=3,
                      output_path=None, use_cudnn=1, layers='final_avg',
                      num_iterations=1, channel_multiplier=1.0,
                      bottleneck_multiplier=1.0, use_pool1=0,
                      use_convolutional_pred=0, use_dropout=0, **kwargs):
    """
    Run a saved video model over ``test_data`` and collect blob activations.

    :param gpu_list: list of gpu ids to use
    :param num_gpus: Number of GPUs to use (ids 0..num_gpus-1) when gpu_list
        is None
    :param batch_size: batch size
    :param clip_per_video: When clips_per_video > 1, sample this many
        clips uniformly in time
    :param decode_type: 0: random, 1: uniform sampling, 2: use starting frame
    :param clip_length_rgb: Length of input clips
    :param sampling_rate_rgb: Frame sampling rate
    :param scale_h: Scale image height to
    :param scale_w: Scale image width to
    :param crop_size: Input image size (to crop to)
    :param video_res_type: Video frame scaling option,
        0: scaled by height x width; 1: scaled by short edge
    :param num_decode_threads: number of decoding threads
    :param multi_label: Multiple label csv file input
    :param num_labels: Number of labels
    :param input_type: 0=rgb, 1=optical flow
    :param clip_length_of: Frames of optical flow data
    :param sampling_rate_of: Sampling rate for optical flows
    :param frame_gap_of: Frame gap of optical flows
    :param do_flow_aggregation: whether to aggregate optical flow across
        multiple frames
    :param flow_data_type: 0=Flow2C, 1=Flow3C, 2=FlowWithGray, 3=FlowWithRGB
    :param get_video_id: Output video id
    :param get_start_frame: Output clip start frame
    :param use_local_file: use local file
    :param crop_per_clip: number of spatial crops per clip
    :param db_type: Db type of the testing model ('pickle' or 'minidb')
    :param model_name: Model name
    :param model_depth: Model depth
    :param num_channels: Number of channels
    :param load_model_path: Load saved model for testing
    :param test_data: Input data handed to the data reader
    :param output_path: Path the activations are written to; when empty the
        activations are returned instead
    :param use_cudnn: Use CuDNN. NOTE(review): currently not read; the model
        is always created with use_cudnn=True.
    :param layers: Blob name(s) to fetch: a list, or a comma-separated
        string. 'video_id' is always added.
    :param num_iterations: Run only this many iterations. NOTE(review): the
        value is currently recomputed from the number of examples and the
        argument is ignored -- confirm which behaviour is wanted.
    :param channel_multiplier: Channel multiplier
    :param bottleneck_multiplier: Bottleneck multiplier
    :param use_pool1: use pool1 layer
    :param use_convolutional_pred: using convolutional predictions
    :param use_dropout: Use dropout at the prediction layer
    :param kwargs: Optional ``save_h5`` (default False): write an HDF5 file
        instead of a pickle. Other keys are ignored.
    :return: Dict mapping blob name to the concatenated activations when
        ``output_path`` is empty, otherwise None.
    :raises Exception: if the model path or test data is missing, or no GPU
        is specified.
    """
    if load_model_path is None or test_data is None:
        raise Exception('Model path AND test data need to be specified')

    # Initialize Caffe2
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=2'])

    if gpu_list is None:
        if num_gpus == 0:
            raise Exception('Must specify GPUs')
        gpus = list(range(num_gpus))
    else:
        gpus = gpu_list
        num_gpus = len(gpus)

    # Blobs live under "gpu_<id>/"; the CPU fallback reuses the "gpu_0"
    # scope (see below), so it is treated as the single device 0.
    device_ids = list(gpus) if num_gpus > 0 else [0]

    # Previously `save_h5` was referenced without ever being defined in
    # this function; it is now an optional keyword.
    save_h5 = bool(kwargs.get('save_h5', False))

    # Normalize `layers` into a fresh list so the caller's list is never
    # mutated and a comma-separated string is honoured as documented.
    if isinstance(layers, str):
        layers = [name.strip() for name in layers.split(',') if name.strip()]
    elif isinstance(layers, (list, tuple)):
        layers = list(layers)
    else:
        layers = [layers]
    if 'video_id' not in layers:
        layers.append('video_id')
    assert len(layers) > 0

    my_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True
    }

    model = cnn.CNNModelHelper(name="Extract features", **my_arg_scope)

    video_input_args = dict(
        batch_size=batch_size,
        clip_per_video=clip_per_video,
        decode_type=decode_type,
        length_rgb=clip_length_rgb,
        sampling_rate_rgb=sampling_rate_rgb,
        scale_h=scale_h,
        scale_w=scale_w,
        crop_size=crop_size,
        video_res_type=video_res_type,
        short_edge=min(scale_h, scale_w),
        num_decode_threads=num_decode_threads,
        do_multi_label=multi_label,
        num_of_class=num_labels,
        random_mirror=False,
        random_crop=False,
        input_type=input_type,
        length_of=clip_length_of,
        sampling_rate_of=sampling_rate_of,
        frame_gap_of=frame_gap_of,
        do_flow_aggregation=do_flow_aggregation,
        flow_data_type=flow_data_type,
        get_rgb=input_type == 0,
        get_optical_flow=input_type == 1,
        get_video_id=get_video_id,
        get_start_frame=get_start_frame,
        use_local_file=use_local_file,
        crop_per_clip=crop_per_clip,
    )

    reader_args = dict(
        name="extract_features" + '_reader',
        input_data=test_data,
    )

    reader, num_examples = reader_utils.create_data_reader(
        model,
        **reader_args)

    def input_fn(model):
        model_helper.AddVideoInput(model, reader, **video_input_args)

    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=model_name,
            model_depth=model_depth,
            num_labels=num_labels,
            batch_size=batch_size,
            num_channels=num_channels,
            crop_size=crop_size,
            clip_length=(clip_length_of if input_type == 1
                         else clip_length_rgb),
            loss_scale=loss_scale,
            is_test=1,
            multi_label=multi_label,
            channel_multiplier=channel_multiplier,
            bottleneck_multiplier=bottleneck_multiplier,
            use_dropout=use_dropout,
            use_convolutional_pred=use_convolutional_pred,
            use_pool1=use_pool1,
        )

    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,  # 'None' since we aren't training
            devices=gpus,
            optimize_gradient_memory=True,
        )
    else:
        model._device_type = caffe2_pb2.CPU
        model._devices = [0]
        device_opt = core.DeviceOption(model._device_type, 0)
        with core.DeviceScope(device_opt):
            # Because our loaded models are named with "gpu_x", keep the
            # naming for now.
            # TODO: Save model using `data_parallel_model.ExtractPredictorNet`
            # to extract the model for "gpu_0". It also renames
            # the input and output blobs by stripping the "gpu_x/" prefix
            with core.NameScope("{}_{}".format("gpu", 0)):
                input_fn(model)
                create_model_ops(model, 1.0)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    if db_type == 'pickle':
        model_loader.LoadModelFromPickleFile(model, load_model_path)
    elif db_type == 'minidb':
        if num_gpus > 0:
            model_helper.LoadModel(load_model_path, db_type)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(load_model_path, db_type)
    else:
        log.warning("Unsupported db_type: {}".format(db_type))

    data_parallel_model.FinalizeAfterCheckpoint(model)

    def fetchActivations(model, outputs, num_iterations):
        all_activations = {}
        for _ in range(num_iterations):
            workspace.RunNet(model.net.Proto().name)

            # Use the real device ids: with gpu_list=[2, 3] the blobs are
            # named gpu_2/... and gpu_3/..., not gpu_0/... and gpu_1/....
            for device_id in device_ids:
                for output_name in outputs:
                    blob_name = 'gpu_{}/'.format(device_id) + output_name
                    activations = workspace.FetchBlob(blob_name)
                    all_activations.setdefault(
                        output_name, []).append(activations)

        # each key holds a list of activations obtained from each minibatch.
        # we now concatenate these lists to get the final arrays.
        # concatenating during the loop requires a realloc and can get slow.
        for key in all_activations:
            all_activations[key] = np.concatenate(all_activations[key])

        return all_activations

    # One iteration consumes one batch per device; counting devices rather
    # than GPUs keeps the CPU path from dividing by zero.
    examples_per_iteration = batch_size * len(device_ids)
    num_iterations = num_examples // examples_per_iteration

    activations = fetchActivations(model, layers, num_iterations)

    # saving extracted layers
    for layer_name in layers:
        log.info(
            "Read '{}' with shape {}".format(
                layer_name,
                activations[layer_name].shape
            )
        )

    if output_path:
        log.info('Writing to {}'.format(output_path))
        if save_h5:
            with h5py.File(output_path, 'w') as handle:
                for name, activation in activations.items():
                    handle.create_dataset(name, data=activation)
        else:
            with open(output_path, 'wb') as handle:
                pickle.dump(activations, handle)
    else:
        return activations
def CpuScope():
    """Create a CPU device scope.

    Generator that yields once while a CPU ``core.DeviceScope`` is active.
    NOTE(review): presumably meant to be wrapped with
    ``contextlib.contextmanager``; no decorator is visible here -- confirm.
    """
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
        yield
def get_device_option(device):
    """Translate ``device`` (with .type and .device_id) to a DeviceOption.

    Only DeviceType.CPU and DeviceType.CUDA are mapped; any other type
    raises KeyError.
    """
    caffe2_device_types = {
        DeviceType.CPU: caffe2_pb2.CPU,
        DeviceType.CUDA: workspace.GpuDeviceType,
    }
    caffe2_type = caffe2_device_types[device.type]
    return core.DeviceOption(caffe2_type, device.device_id)
def load_save(self, src_device_type, src_gpu_id,
              dst_device_type, dst_gpu_id):
    """Round-trip blobs of several dtypes through the Save and Load ops.

    Blobs are fed on the source device, saved to a temporary db and then
    loaded back several times -- keeping the stored device
    (keep_device=1) or overriding it with the destination device
    (keep_device=0), by explicit blob list and with load_all -- checking
    dtype, contents and device of every blob each time.

    :param src_device_type: Device type the blobs are fed (and saved) on.
    :param src_gpu_id: GPU id of the source device; must be 0 unless the
        source device type is CUDA.
    :param dst_device_type: Device type used when the stored device is
        overridden.
    :param dst_gpu_id: GPU id of the destination device; must be 0 unless
        the destination device type is CUDA.
    """
    workspace.ResetWorkspace()
    # np.bool_ instead of the np.bool alias, which newer NumPy releases
    # no longer provide; both give the same boolean dtype.
    dtypes = [np.float16, np.float32, np.float64, np.bool_, np.int8,
              np.int16, np.int32, np.int64, np.uint8, np.uint16]
    arrays = [np.random.permutation(6).reshape(2, 3).astype(T)
              for T in dtypes]
    blob_names = [str(i) for i in range(len(arrays))]

    # A non-zero GPU id only makes sense for CUDA devices.
    assume(src_device_type == caffe2_pb2.CUDA or src_gpu_id == 0)
    assume(dst_device_type == caffe2_pb2.CUDA or dst_gpu_id == 0)
    src_device_option = core.DeviceOption(src_device_type, src_gpu_id)
    dst_device_option = core.DeviceOption(dst_device_type, dst_gpu_id)

    for name, arr in zip(blob_names, arrays):
        self.assertTrue(workspace.FeedBlob(name, arr, src_device_option))
        self.assertTrue(workspace.HasBlob(name))

    # Create the directory before the try block: if mkdtemp itself fails
    # there is nothing to clean up, and the finally clause must not hit an
    # undefined name and hide the real error.
    tmp_folder = tempfile.mkdtemp()
    try:
        db_path = os.path.join(tmp_folder, "db")

        # Saves the blobs to a local db.
        op = core.CreateOperator(
            "Save",
            blob_names, [],
            absolute_path=1,
            db=db_path, db_type=self._db_type)
        self.assertTrue(workspace.RunOperatorOnce(op))

        # Reset the workspace so that anything we load is surely loaded
        # from the serialized proto.
        workspace.ResetWorkspace()
        self.assertEqual(len(workspace.Blobs()), 0)

        def _LoadTest(keep_device, device_type, gpu_id, blobs, loadAll):
            """A helper subfunction to test keep and not keep."""
            op = core.CreateOperator(
                "Load",
                [], blobs,
                absolute_path=1,
                db=db_path, db_type=self._db_type,
                device_option=dst_device_option,
                keep_device=keep_device,
                load_all=loadAll)
            self.assertTrue(workspace.RunOperatorOnce(op))
            for name, arr in zip(blob_names, arrays):
                self.assertTrue(workspace.HasBlob(name))
                fetched = workspace.FetchBlob(name)
                self.assertEqual(fetched.dtype, arr.dtype)
                np.testing.assert_array_equal(fetched, arr)
                # The serialized blob records the device it lives on.
                proto = caffe2_pb2.BlobProto()
                proto.ParseFromString(workspace.SerializeBlob(name))
                self.assertTrue(proto.HasField('tensor'))
                self.assertEqual(proto.tensor.device_detail.device_type,
                                 device_type)
                if device_type == caffe2_pb2.CUDA:
                    self.assertEqual(
                        proto.tensor.device_detail.cuda_gpu_id, gpu_id)

        blobs = blob_names
        # Load using device option stored in the proto, i.e.
        # src_device_option
        _LoadTest(1, src_device_type, src_gpu_id, blobs, 0)
        # Load again, but this time load into dst_device_option.
        _LoadTest(0, dst_device_type, dst_gpu_id, blobs, 0)
        # Load back to the src_device_option to see if both paths are able
        # to reallocate memory.
        _LoadTest(1, src_device_type, src_gpu_id, blobs, 0)
        # Reset the workspace, and load directly into the dst_device_option.
        workspace.ResetWorkspace()
        _LoadTest(0, dst_device_type, dst_gpu_id, blobs, 0)

        # Test load all which loads all blobs in the db into the workspace.
        workspace.ResetWorkspace()
        _LoadTest(1, src_device_type, src_gpu_id, [], 1)
        # Load again making sure that overwrite functionality works.
        _LoadTest(1, src_device_type, src_gpu_id, [], 1)
        # Load again with different device.
        _LoadTest(0, dst_device_type, dst_gpu_id, [], 1)
        workspace.ResetWorkspace()
        _LoadTest(0, dst_device_type, dst_gpu_id, [], 1)
    finally:
        # clean up temp folder.
        try:
            shutil.rmtree(tmp_folder)
        except OSError as e:
            # A folder that is already gone is fine; anything else is not.
            if e.errno != errno.ENOENT:
                raise
def CpuDevice():
    """Create a CPU device option."""
    return core.DeviceOption(caffe2_pb2.CPU)
action="store_true", help="Whether to use memory optimized LSTM or not", ) parser.add_argument("--forward_only", action="store_true", help="Whether to run only forward pass") parser.add_argument( "--num_layers", type=int, default=1, help="Number of LSTM layers. All output dimensions are going to be" "of hidden_dim size", ) return parser if __name__ == '__main__': args = GetArgumentParser().parse_args() workspace.GlobalInit([ 'caffe2', '--caffe2_log_level=0', '--caffe2_print_blob_sizes_at_exit=0', '--caffe2_gpu_memory_tracking=1' ]) device = core.DeviceOption(caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU, 0) with core.DeviceScope(device): Benchmark(args)
def ExtractFeatures(args):
    """Run a test-mode model over ``args.test_data`` and save selected blobs.

    The function builds the model (on the GPUs listed in ``args.gpus`` /
    counted by ``args.num_gpus``, or on CPU when that count is 0), loads
    weights from ``args.load_model_path``, runs the net for a number of
    iterations, fetches the blobs named in the comma-separated
    ``args.features`` after every iteration, concatenates them per blob name
    and writes the resulting dict to ``args.output_path`` (pickle, or HDF5
    when ``args.save_h5`` is set). An optional sanity check is logged at the
    end, selected by ``args.sanity_check``.

    Args:
        args: parsed command-line namespace; every option used is read as an
            attribute of this object.

    Returns:
        None. Results are written to disk and reported through ``log``.
    """
    # An explicit GPU id list takes precedence over a plain GPU count.
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    if num_gpus > 0:
        log.info("Running on GPUs: {}".format(gpus))
    else:
        log.info("Running on CPU")

    my_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True
    }

    model = cnn.CNNModelHelper(name="Extract Features", **my_arg_scope)

    # Keyword arguments forwarded to model_helper.AddVideoInput by input_fn.
    # Random mirroring and cropping are switched off here.
    video_input_args = dict(
        batch_size=args.batch_size,
        clip_per_video=args.clip_per_video,
        decode_type=args.decode_type,
        length_rgb=args.clip_length_rgb,
        sampling_rate_rgb=args.sampling_rate_rgb,
        scale_h=args.scale_h,
        scale_w=args.scale_w,
        crop_size=args.crop_size,
        video_res_type=args.video_res_type,
        short_edge=min(args.scale_h, args.scale_w),
        num_decode_threads=args.num_decode_threads,
        do_multi_label=args.multi_label,
        num_of_class=args.num_labels,
        random_mirror=False,
        random_crop=False,
        input_type=args.input_type,
        length_of=args.clip_length_of,
        sampling_rate_of=args.sampling_rate_of,
        frame_gap_of=args.frame_gap_of,
        do_flow_aggregation=args.do_flow_aggregation,
        flow_data_type=args.flow_data_type,
        get_rgb=args.input_type == 0,
        get_optical_flow=args.input_type == 1,
        get_video_id=args.get_video_id,
        get_start_frame=args.get_start_frame,
        use_local_file=args.use_local_file,
        crop_per_clip=args.crop_per_clip,
    )

    reader_args = dict(
        name="extract_features" + '_reader',
        input_data=args.test_data,
    )

    # num_examples is used further down to derive the iteration count.
    reader, num_examples = reader_utils.create_data_reader(
        model, **reader_args)

    def input_fn(model):
        """Add the video input to ``model`` using the shared reader."""
        model_helper.AddVideoInput(model, reader, **video_input_args)

    def create_model_ops(model, loss_scale):
        """Build the network in test mode (``is_test=1``) and return the
        result of ``model_builder.build_model``."""
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            batch_size=args.batch_size,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(args.clip_length_of if args.input_type == 1 else args.clip_length_rgb),
            loss_scale=loss_scale,
            is_test=1,
            multi_label=args.multi_label,
            channel_multiplier=args.channel_multiplier,
            bottleneck_multiplier=args.bottleneck_multiplier,
            use_dropout=args.use_dropout,
            use_convolutional_pred=args.use_convolutional_pred,
            use_pool1=args.use_pool1,
        )

    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,  # 'None' since we aren't training
            devices=gpus,
            optimize_gradient_memory=True,
        )
    else:
        # CPU path: the model is built by hand under a CPU device scope.
        model._device_type = caffe2_pb2.CPU
        model._devices = [0]
        device_opt = core.DeviceOption(model._device_type, 0)
        with core.DeviceScope(device_opt):
            # Because our loaded models are named with "gpu_x", keep the naming for now.
            # TODO: Save model using `data_parallel_model.ExtractPredictorNet`
            # to extract the model for "gpu_0". It also renames
            # the input and output blobs by stripping the "gpu_x/" prefix
            with core.NameScope("{}_{}".format("gpu", 0)):
                input_fn(model)
                create_model_ops(model, 1.0)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    # Load weights. An unrecognised db_type only logs a warning; execution
    # continues with whatever param_init_net produced.
    if args.db_type == 'pickle':
        model_loader.LoadModelFromPickleFile(model, args.load_model_path)
    elif args.db_type == 'minidb':
        if num_gpus > 0:
            model_helper.LoadModel(args.load_model_path, args.db_type)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(args.load_model_path, args.db_type)
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))

    data_parallel_model.FinalizeAfterCheckpoint(model)

    def fetchActivations(model, outputs, num_iterations):
        """Run the net ``num_iterations`` times and collect ``outputs``.

        After each run, every name in ``outputs`` is fetched from the
        workspace under the prefix ``gpu_<g>/`` for each device index ``g``
        (a single index 0 on CPU). Returns a dict mapping each output name to
        the ``np.concatenate`` of all fetched arrays for that name.
        """
        all_activations = {}
        for counter in range(num_iterations):
            workspace.RunNet(model.net.Proto().name)

            num_devices = 1  # default for cpu
            if num_gpus > 0:
                num_devices = num_gpus
            # NOTE(review): blobs are addressed by position 0..num_devices-1,
            # not by the ids in `gpus`; confirm this matches the blob prefixes
            # when args.gpus is not a contiguous 0-based list.
            for g in range(num_devices):
                for output_name in outputs:
                    blob_name = 'gpu_{}/'.format(g) + output_name
                    activations = workspace.FetchBlob(blob_name)
                    if output_name not in all_activations:
                        all_activations[output_name] = []
                    all_activations[output_name].append(activations)

            if counter % 20 == 0:
                log.info('{}/{} iterations'.format(counter, num_iterations))

        # each key holds a list of activations obtained from each minibatch.
        # we now concatenate these lists to get the final arrays.
        # concatenating during the loop requires a realloc and can get slow.
        for key in all_activations:
            all_activations[key] = np.concatenate(all_activations[key])

        return all_activations

    outputs = [name.strip() for name in args.features.split(',')]
    assert len(outputs) > 0

    # An explicit positive iteration count wins; otherwise derive it from the
    # reader's example count. int() truncates, so a trailing partial batch is
    # not processed.
    if args.num_iterations > 0:
        num_iterations = args.num_iterations
    else:
        if num_gpus > 0:
            examples_per_iteration = args.batch_size * num_gpus
        else:
            examples_per_iteration = args.batch_size
        num_iterations = int(num_examples / examples_per_iteration)

    activations = fetchActivations(model, outputs, num_iterations)

    # saving extracted features
    for index in range(len(outputs)):
        log.info("Read '{}' with shape {}".format(
            outputs[index], activations[outputs[index]].shape))

    # The default file name is 'features.pickle' next to the test data; it is
    # used for the HDF5 branch below as well.
    if args.output_path:
        output_path = args.output_path
    else:
        output_path = os.path.dirname(args.test_data) + '/features.pickle'

    log.info('Writing to {}'.format(output_path))
    if args.save_h5:
        with h5py.File(output_path, 'w') as handle:
            for name, activation in activations.items():
                handle.create_dataset(name, data=activation)
    else:
        with open(output_path, 'wb') as handle:
            pickle.dump(activations, handle)

    # perform sanity check
    # Both checks index `activations` by fixed names, so those names must be
    # among the requested features.
    if args.sanity_check == 1:  # check clip accuracy
        assert args.multi_label == 0
        clip_acc = 0
        softmax = activations['softmax']
        label = activations['label']
        for i in range(len(softmax)):
            # argsort is ascending; reversing puts the top-scoring index first.
            sorted_preds = \
                np.argsort(softmax[i])
            sorted_preds[:] = sorted_preds[::-1]
            if sorted_preds[0] == label[i]:
                clip_acc += 1
        # NOTE(review): with Python 2 integer division this ratio would
        # truncate; confirm the interpreter / __future__ imports in use.
        log.info('Sanity check --- clip accuracy: {}'.format(clip_acc / len(softmax)))
    elif args.sanity_check == 2:  # check auc
        assert args.multi_label == 1
        prob = activations['prob']
        label = activations['label']
        mean_auc, mean_ap, mean_wap, _ = metric.mean_ap_metric(prob, label)
        log.info('Sanity check --- AUC: {}, mAP: {}, mWAP: {}'.format(
            mean_auc,
            mean_ap, mean_wap))
def apply_over_sequence(
    self,
    model,
    inputs,
    seq_lengths,
    initial_states,
    outputs_with_grads=None,
):
    """Apply ``self.cell`` to ``self.T`` explicitly unrolled timesteps.

    The prepared input is split along axis 0 into ``self.T`` blobs, the cell
    is applied once per timestep inside a ``timestep_<t>`` name scope (with
    parameters shared across timesteps), and the per-timestep states are
    concatenated along axis 0.

    Args:
        model: model helper; operators are added to ``model.net`` and
            ``model.param_init_net``.
        inputs: input blob, passed through ``self.cell.prepare_input``.
        seq_lengths: forwarded unchanged to ``self.cell._apply``.
        initial_states: states fed to the first timestep.
        outputs_with_grads: iterable of indices into the returned ``outputs``
            tuple. Every other output gets a ``ZeroGradient`` operator.
            Despite the ``None`` default, a value is required.

    Returns:
        ``(final_output, outputs)`` where ``outputs`` interleaves, per state,
        the concatenation over all timesteps and the state of the last
        timestep, and ``final_output`` is
        ``self.cell._prepare_output_sequence(model, outputs)``.

    Raises:
        TypeError: if ``outputs_with_grads`` is ``None``.
    """
    # Fail before touching the nets. Previously None surfaced as a TypeError
    # from set(None) only after every operator had already been added.
    if outputs_with_grads is None:
        raise TypeError(
            "outputs_with_grads must be an iterable of output indices, "
            "got None")

    inputs = self.cell.prepare_input(model, inputs)

    # Now they are blob references - outputs of splitting the input sequence
    split_inputs = model.net.Split(
        inputs,
        [str(inputs) + "_timestep_{}".format(i) for i in range(self.T)],
        axis=0)
    # With a single requested output, Split's result is wrapped in a list so
    # it can be indexed by timestep like the multi-output case.
    if self.T == 1:
        split_inputs = [split_inputs]

    states = initial_states
    all_states = []
    for t in range(0, self.T):
        scope_name = "timestep_{}".format(t)
        # Parameters of all timesteps are shared
        with ParameterSharing({scope_name: ''}),\
                scope.NameScope(scope_name):
            # Constant INT32 blob holding the step index, created on CPU.
            timestep = model.param_init_net.ConstantFill(
                [], "timestep", value=t, shape=[1],
                dtype=core.DataType.INT32,
                device_option=core.DeviceOption(caffe2_pb2.CPU))
            states = self.cell._apply(
                model=model,
                input_t=split_inputs[t],
                seq_lengths=seq_lengths,
                states=states,
                timestep=timestep,
            )
        all_states.append(states)

    # Transpose: one tuple per state, holding that state at every timestep.
    all_states = zip(*all_states)
    # Concatenate each state over time. The blob name drops the first
    # len("timestep_0/") characters of the first timestep's blob name
    # (assumes that name carries the "timestep_0/" scope prefix).
    all_states = [
        model.net.Concat(
            list(full_output),
            [
                str(full_output[0])[len("timestep_0/"):] + "_concat",
                str(full_output[0])[len("timestep_0/"):] + "_concat_info"
            ],
            axis=0)[0]
        for full_output in all_states
    ]

    # Interleave (state over all timesteps, state at last timestep) pairs.
    # The earlier form called next() inside a generator expression and relied
    # on the leaked StopIteration to stop, which is a RuntimeError under
    # PEP 479 (Python 3.7+). zip() yields the same sequence because
    # all_states is never longer than states.
    outputs = tuple(
        itertools.chain.from_iterable(zip(all_states, states)))

    outputs_without_grad = set(range(len(outputs))) - set(
        outputs_with_grads)
    for i in outputs_without_grad:
        model.net.ZeroGradient(outputs[i], [])
    # The message needs a placeholder for its argument; without one the
    # logging module reports a formatting error when DEBUG is enabled.
    logging.debug("Added 0 gradients for blobs: %s",
                  [outputs[i] for i in outputs_without_grad])

    final_output = self.cell._prepare_output_sequence(model, outputs)

    return final_output, outputs
def get_device_option_cpu():
    """Return ``core.DeviceOption(caffe2_pb2.CPU)``."""
    return core.DeviceOption(caffe2_pb2.CPU)
def _run(self, net, param_init_net, param_info):
    """Add the YellowFin state blobs and update operator for one parameter.

    State blobs are created in ``param_init_net`` with ``ConstantFill``, the
    shared iteration counter is created on first use (or looked up when it
    already exists), every auxiliary blob is recorded in
    ``self._aux_params``, and a ``YellowFin`` operator is added to ``net``.

    Args:
        net: net that receives the ``YellowFin`` operator (and, when the
            counter is first created, the ``AtomicIter`` operator).
        param_init_net: net that receives the initialisation operators.
        param_info: object exposing the parameter as ``.blob`` and its dense
            gradient as ``.grad``.

    Raises:
        AssertionError: if ``self.alpha`` is not positive or the gradient is
            a ``core.GradientSlice``.
    """
    # Note: This is number of persistent scalars in YellowFin optimizer.
    # It should always be the number of scalars being used. The same
    # number should be used in class for the operation.
    SCALARS_MEMORY_SIZE = 5

    param = param_info.blob
    grad = param_info.grad

    # Per-parameter state. The three fills that take [param] as input pass no
    # explicit shape; the rest are fixed-size vectors.
    moment = param_init_net.ConstantFill(
        [param], param + "_moment", value=0.0)
    curv_win = param_init_net.ConstantFill(
        [], param + "_curv_win", shape=[self.curv_win_width], value=0.0)
    g_avg = param_init_net.ConstantFill(
        [param], param + "_g_avg", value=0.0)
    g2_avg = param_init_net.ConstantFill(
        [param], param + "_g2_avg", value=0.0)
    lr_avg = param_init_net.ConstantFill(
        [], param + "_lr_avg", shape=[1], value=self.alpha)
    mu_avg = param_init_net.ConstantFill(
        [], param + "_mu_avg", shape=[1], value=self.mu)
    scalars_memory = param_init_net.ConstantFill(
        [],
        param + "_scalars_memory",
        shape=[SCALARS_MEMORY_SIZE],
        value=0.0)

    assert self.alpha > 0
    assert not isinstance(grad, core.GradientSlice), \
        "YellowFin does not support sparse gradients"

    if param_init_net.BlobIsDefined(_OPTIMIZER_ITERATION_NAME):
        # The counter already exists in the init net; reuse it.
        iteration = param_init_net.GetBlobRef(_OPTIMIZER_ITERATION_NAME)
    else:
        # Add training operators.
        cpu_scope = core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU))
        with cpu_scope:
            iteration = param_init_net.ConstantFill(
                [],
                _OPTIMIZER_ITERATION_NAME,
                shape=[1],
                value=0,
                dtype=core.DataType.INT64)
            iter_mutex = param_init_net.CreateMutex([], ["iteration_mutex"])
            net.AtomicIter([iter_mutex, iteration], [iteration])

    self._aux_params.shared.append(iteration)

    # Local auxiliary blobs, in the order the operator expects them after the
    # parameter itself.
    optimizer_state = [
        moment,
        lr_avg,
        mu_avg,
        curv_win,
        g_avg,
        g2_avg,
        scalars_memory,
    ]
    for state_blob in optimizer_state:
        self._aux_params.local.append(state_blob)

    # These blobs are both inputs and outputs of the operator.
    state_io = [param] + optimizer_state
    net.YellowFin(
        state_io + [grad, iteration],
        state_io,
        beta=self.beta,
        epsilon=self.epsilon,
        curv_win_width=self.curv_win_width,
        zero_debias=self.zero_debias)
sys.path.append("..") # data generation from data_generator.wnd_data_caffe2 import Wide_and_DeepDataGenerator from utils.utils import cli args = cli() ### some basic setup ### np.random.seed(args.numpy_rand_seed) np.set_printoptions(precision=args.print_precision) use_gpu = args.use_gpu if use_gpu: device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0) ngpus = C.num_cuda_devices # 1 print("Using {} GPU(s)...".format(ngpus)) else: device_opt = core.DeviceOption(caffe2_pb2.CPU) print("Using CPU...") ### prepare training data ### ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-") # TODO: Make this Wide_and_Deep generator dc = Wide_and_DeepDataGenerator(args) if args.data_generation == "dataset": print("Error we have disabled this function currently....") sys.exit() # input and target data