def test_small_sls_acc32(self, seed):
    """Compare the lowered 8-bit rowwise weighted SLS op with its FP32 reference.

    A two-row table is quantized with ``FloatToFused8BitRowwiseQuantized``,
    then ``SparseLengthsWeightedSumFused8BitRowwise`` (run through
    ``onnxifi_caffe2_net``) is compared with
    ``SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI``.  Any non-zero
    relative error dumps debug information and fails the test.

    Args:
        seed: value handed to ``np.random.seed`` before inputs are generated.
    """
    workspace.GlobalInit([
        "caffe2",
        "--glow_global_fp16=0",
        "--glow_global_fused_scale_offset_fp16=0",
        "--glow_global_force_sls_fp16_accum=0",
    ])
    np.random.seed(seed)
    workspace.ResetWorkspace()

    n = 2
    DIM = 3
    # Random draws happen in the same order as before: table first, weights second.
    data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32)
    lengths = np.array([n], dtype=np.int32)
    indices = np.arange(n, dtype=np.int64)
    weights = np.random.uniform(low=0.01, high=0.5, size=[n]).astype(np.float32)

    sls_inputs = ["quantized_data", "weights", "indices", "lengths"]

    def single_op_net(net_name, op_type):
        # Both nets share inputs/outputs and differ only in name and operator.
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.extend(list(sls_inputs))
        net.external_output.append("Y")
        net.op.add().CopyFrom(
            core.CreateOperator(op_type, list(sls_inputs), ["Y"]))
        return net

    pred_net = single_op_net(
        "pred", "SparseLengthsWeightedSumFused8BitRowwise")
    ref_net = single_op_net(
        "ref", "SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI")

    # Quantize the table inside the workspace and keep a copy for the debug dump.
    workspace.FeedBlob("data", data)
    quantize_op = core.CreateOperator(
        "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"])
    workspace.RunOperatorOnce(quantize_op)
    quantized_data = workspace.FetchBlob("quantized_data")

    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=1,
        max_seq_size=n,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)
    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(ref_net)

    # Both nets write "Y", so fetch after each run.
    workspace.RunNet(onnxified_net.name)
    Y_glow = workspace.FetchBlob("Y")
    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchBlob("Y")

    diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
    max_err = np.max(diff, axis=1)
    num_offenders = (max_err > 0).sum()
    if not num_offenders > 0:
        return

    np.set_printoptions(precision=12)
    print(
        "ref",
        Y_ref.astype(np.float16).astype(np.float32),
        "glow",
        Y_glow.astype(np.float16).astype(np.float32),
    )
    debug_payload = {
        "seed": seed,
        "indices": indices,
        "data": data,
        "quantized_data": quantized_data,
        "lengths": lengths,
        "weights": weights,
        "Y_glow": Y_glow,
        "Y_ref": Y_ref,
        "diff": diff,
        "rowwise_diff": max_err,
    }
    print_test_debug_info("test_small_sls_acc32", debug_payload)
    assert 0
def _test_layernorm(self):
    """Compare ``LayerNorm`` run through onnxifi with ``LayerNormFakeFP16``.

    Random input of shape ``(batch_size, input_channels, size, size)`` is
    fed to both nets; the test fails (after dumping debug info) when the
    ``Y`` outputs differ after casting to float16.  Only ``Y`` gates the
    failure; ``mean`` and ``rstd`` differences are reported for debugging.

    The seed comes from the wall clock and is included in the debug dump so
    a failure can be reproduced.
    """
    size = 3
    input_channels = 2
    batch_size = 4
    seed = int(time.time())
    np.random.seed(seed)

    epsilon = 1e-3

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["X"])
    pred_net.external_output.extend(["Y", "mean", "rstd"])
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "LayerNorm",
            ["X"],
            ["Y", "mean", "rstd"],
            # axis=-1,
            epsilon=epsilon,
        )
    )

    pred_net_ref = caffe2_pb2.NetDef()
    # The reference net needs its own name: both nets are registered in the
    # same workspace and later addressed by name, so sharing "pred" would
    # make the two CreateNet/RunNet calls refer to a single net.
    pred_net_ref.name = "ref"
    pred_net_ref.external_input.extend(["X"])
    pred_net_ref.external_output.extend(["Y", "mean", "rstd"])
    pred_net_ref.op.add().CopyFrom(
        core.CreateOperator(
            "LayerNormFakeFP16",
            ["X"],
            ["Y", "mean", "rstd"],
            # axis=-1,
            epsilon=epsilon,
        )
    )

    # Inputs are centred on zero: uniform in [-0.5, 0.5).
    X = np.random.rand(batch_size, input_channels, size, size).astype(
        np.float32) - 0.5

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {"X": [batch_size, input_channels, size, size]},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    # The whole net is expected to collapse into exactly one Onnxifi op.
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("X", X)
    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(pred_net_ref)

    # Both nets write the same blob names, so fetch right after each run.
    workspace.RunNet(pred_net_ref.name)
    Y_c2 = workspace.FetchBlob("Y")
    mean_c2 = workspace.FetchBlob("mean")
    std_c2 = workspace.FetchBlob("rstd")

    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob("Y")
    mean_glow = workspace.FetchBlob("mean")
    std_glow = workspace.FetchBlob("rstd")

    if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
        diff_Y = np.abs(Y_glow - Y_c2).astype(np.float16)
        diff_std = np.abs(std_glow - std_c2).astype(np.float16)
        diff_mean = np.abs(mean_glow - mean_c2).astype(np.float16)
        print_test_debug_info(
            "layernorm",
            {
                "seed": seed,
                "X": X,
                "Y_glow": Y_glow,
                "Y_c2": Y_c2,
                "Y": diff_Y,
                "mean": diff_mean,
                "std": diff_std,
            },
        )
        assert (0)
def _test_binary_op_graph(self, name, seed):
    """Compare a binary elementwise op run through onnxifi with its FakeFp16 twin.

    Args:
        name: Caffe2 operator type taking inputs ``A`` and ``B``; the
            reference operator is ``name + "FakeFp16"``.
        seed: value handed to ``np.random.seed`` before inputs are generated.

    The nets are created once and then run for ten iterations on fresh
    random inputs.  A mismatch on any element whose float16 quotient
    ``A / B`` is finite dumps debug info and fails the test.
    """
    np.random.seed(seed)
    workspace.ResetWorkspace()
    # First dimension is the batch size
    dims = np.concatenate((np.array([1]), np.random.randint(1, 20, size=3)))
    A = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
    B = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
    # Avoid dividing by 0
    B[np.abs(B) < 1e-3] = 1e-3
    print(A.shape, B.shape)

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["A", "B"])
    pred_net.external_output.append("C")
    pred_net.op.add().CopyFrom(core.CreateOperator(name, ["A", "B"], ["C"]))

    pred_net_ref = caffe2_pb2.NetDef()
    pred_net_ref.name = "ref"
    pred_net_ref.external_input.extend(["A", "B"])
    pred_net_ref.external_output.append("C_ref")
    pred_net_ref.op.add().CopyFrom(
        core.CreateOperator(
            name + "FakeFp16",
            ["A", "B"],
            ["C_ref"],
        )
    )

    shape_hints = {"A": A.shape, "B": B.shape}
    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        shape_hints,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )
    print(pred_net_onnxified)
    # The whole net is expected to collapse into exactly one Onnxifi op.
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.SwitchWorkspace("glow_test_ws", True)
    workspace.FeedBlob("A", A)
    workspace.FeedBlob("B", B)
    workspace.CreateNet(pred_net_ref)
    workspace.CreateNet(pred_net_onnxified)

    fp16_limits = np.finfo(np.float16)
    num_iterations = 10
    for iteration in range(num_iterations):
        A = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
        B = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
        # Avoid dividing by 0
        B[np.abs(B) < 1e-3] = 1e-3

        workspace.FeedBlob("A", A)
        workspace.FeedBlob("B", B)
        # Run caffe2 net
        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob("C_ref")

        # Run Glow net
        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob("C")

        # Clamp infinities to the float16 range.  np.inf / -np.inf replace the
        # np.Inf / np.NINF aliases, which no longer exist in NumPy 2.0.
        Y_glow[Y_glow == np.inf] = fp16_limits.max
        Y_glow[Y_glow == -np.inf] = fp16_limits.min

        # Ignore mismatches solely due to difference in precision
        fp16_finite = np.isfinite(
            A.astype(np.float16) / B.astype(np.float16))

        # Results should be identical since we are comparing with the C2 emulation
        if not np.allclose(Y_c2[fp16_finite], Y_glow[fp16_finite]):
            diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon))
            print_test_debug_info(
                name,
                {
                    "dims": dims,
                    "iter": iteration,
                    "seed": seed,
                    "A": A,
                    "B": B,
                    "Y_glow": Y_glow,
                    "Y_c2": Y_c2,
                    "diff": diff,
                },
            )
            assert (0)
def run_model(self, devices, gpu):
    '''
    Helper function for test_equiv

    Builds a one-layer FC -> Sigmoid -> squared-L2 model, parallelizes it
    over ``devices`` (on CPU when ``gpu`` is false), trains it for ten
    iterations on a fixed-seed dataset and returns the ``fc_w`` blob of
    device 0.
    '''
    def build_inputs(model):
        return None

    def build_forward(model, loss_scale):
        fc = model.FC("data", "fc", 16, 1,
                      ("ConstantFill", {}), ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        return [model.Scale(loss, scale=loss_scale)]

    def build_optimizer(model):
        optimizer.build_sgd(model, 0.1, policy="fixed")

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="test{}".format(devices),
    )
    data_parallel_model.Parallelize(
        model,
        input_builder_fun=build_inputs,
        forward_pass_builder_fun=build_forward,
        optimizer_builder_fun=build_optimizer,
        devices=devices,
        cpu_device=not gpu,
    )

    np.random.seed(2603)

    # Each run has same input, independent of number of gpus
    batch_size = 64
    rows_per_device = batch_size // len(devices)
    for step in range(10):
        full_data = np.random.rand(batch_size, 16)
        full_labels = np.round(full_data[:, 0])

        # Hand each device its contiguous slice of the global batch.
        for slot, device_id in enumerate(devices):
            lo = slot * rows_per_device
            hi = lo + rows_per_device
            device_data = full_data[lo:hi, :].astype(np.float32)
            device_labels = full_labels[lo:hi].astype(np.float32)
            scope = core.DeviceOption(model._device_type, device_id)
            with core.DeviceScope(scope):
                workspace.FeedBlob(
                    "{}_{}/data".format(model._device_prefix, device_id),
                    device_data,
                )
                workspace.FeedBlob(
                    "{}_{}/label".format(model._device_prefix, device_id),
                    device_labels,
                )

        # Nets can only be initialised once the input blobs exist.
        if step == 0:
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)

    return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
def run_model(self, V, gpu_devices, cpu_indices):
    """Train a small sparse-gather model on ``gpu_devices`` and return its weights.

    Args:
        V: number of rows in the ``vecs`` table (each row has 16 columns).
        gpu_devices: GPU ids handed to ``Parallelize_GPU``; the global batch
            of 64 is split evenly between them.
        cpu_indices: when true, ``vecs`` stays on CPU and is gathered there;
            otherwise every GPU gets its own ``gpuvecs`` copy and the
            indices are fed on the GPU.

    Returns:
        A two-element list: the vecs blob (``self.vecs`` when
        ``cpu_indices`` else ``"gpu_0/gpuvecs"``) and ``"gpu_0/fc_w"``.

    Side effects: resets the workspace, sets ``self.ITER``, ``self.LR``,
    ``self.vecs`` and ``self.ONE_CPU``, and writes the net proto to
    ``/tmp/dump.txt`` when exactly two devices are used.
    """
    def input_builder_fun(model):
        # Inputs are fed directly into the workspace further down.
        return None

    def model_build_fun(model, loss_scale):
        # Gather rows of vecs by 'indices', either on CPU (then copy to GPU)
        # or from a per-GPU copy of the table.
        if cpu_indices:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                gathered_cpu = model.net.Gather([self.vecs, 'indices'], 'gathered_cpu')
            gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")
        else:
            gpu_vecs = model.param_init_net.CopyCPUToGPU(
                self.vecs, "gpuvecs",
            )
            model.params.append(gpu_vecs)
            gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
        flattened = model.Flatten(gathered, "flattened")
        fc = model.FC(flattened, "fc", 16 * 16, 1, ("ConstantFill", {}), ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)
        return [loss]

    def param_update_fun(model):
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        LR = model.CopyCPUToGPU(self.LR, "LR")
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            # Dense gradients get a plain weighted-sum update; sparse
            # (GradientSlice) gradients go through SparseMomentumSGDUpdate.
            if not isinstance(param_grad, core.GradientSlice):
                model.WeightedSum([param, ONE, param_grad, LR], param)
            else:
                param_momentum = model.param_init_net.ConstantFill(
                    [param], param + '_momentum', value=0.0,
                )
                model.net.SparseMomentumSGDUpdate(
                    [
                        param_grad.values, param_momentum, LR, param, param_grad.indices,
                    ],
                    [param_grad.values, param_momentum, param],
                    momentum=0.1,
                    nesterov=0,
                )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC", name="sparse_test{}".format(gpu_devices),
    )

    # Shared CPU-side state: iteration counter, learning rate and the table.
    with core.NameScope("cpu"):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.ITER = model.Iter("ITER")
            self.LR = model.net.LearningRate(
                [self.ITER], "LR", base_lr=(-0.1), policy="fixed",
            )
            self.vecs = model.param_init_net.UniformFill([], "vecs", shape=[V, 16])
            if cpu_indices:
                model.params.append(self.vecs)
            self.ONE_CPU = model.param_init_net.ConstantFill(
                [], "ONE_CPU", shape=[1], value=1.0,
            )

    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices,
    )

    # Update the vecs
    if cpu_indices:
        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                for param in model.GetParams():
                    param_grad = model.param_to_grad[param]
                    model.ScatterWeightedSum([
                        param, self.ONE_CPU, param_grad.indices, param_grad.values, self.LR
                    ], self.vecs)
    else:
        # Mirror GPU 0's copy of the table back into the CPU blob.
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)

    np.random.seed(2603)

    # Each run has same input, independent of number of gpus
    batch_size = 64
    for i in range(0, 10):
        # A permutation is used so that no index repeats within a batch.
        full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
            batch_size, 16)
        full_labels = full_indices[:, 0] % 2
        batch_per_device = batch_size // len(gpu_devices)

        for (j, g) in enumerate(gpu_devices):
            st = j * batch_per_device
            en = st + batch_per_device
            indices = full_indices[st:en, :].astype(np.int32)
            labels = full_labels[st:en].astype(np.float32)

            # Indices live on CPU or on the GPU depending on cpu_indices.
            device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
            if not cpu_indices:
                device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)

            with core.DeviceScope(device_for_indices):
                workspace.FeedBlob("gpu_{}/indices".format(g), indices)

            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                workspace.FeedBlob("gpu_{}/label".format(g), labels)

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            # Force vecs to be same on all runs
            orig_vecs = np.random.rand(V, 16).astype(np.float32)
            workspace.FeedBlob(self.vecs, orig_vecs)
            if not cpu_indices:
                for g in gpu_devices:
                    workspace.FeedBlob(
                        "gpu_{}/gpuvecs".format(g),
                        orig_vecs,
                        device_option=core.DeviceOption(
                            caffe2_pb2.CUDA, g),
                    )
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)
        if len(gpu_devices) == 2:
            # NOTE(review): debugging aid; dumps the net on every iteration
            # of a two-device run to a fixed path.
            with open("/tmp/dump.txt", "w") as f:
                f.write(str(model.net.Proto()))
            if not cpu_indices:
                idx = workspace.FetchBlob("gpu_0/indices")
                idx = list(idx.flatten())
                n = len(idx)
                nu = len(set(idx))
                assert n == nu, "We cannot have duplicate indices"

    # Sanity check to see the vecs were updated
    self.assertFalse(np.allclose(workspace.FetchBlob(self.vecs), orig_vecs))
    return [
        workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"),
        workspace.FetchBlob("gpu_0/fc_w")
    ]
def testPartialClone(self):
    """Exercise ``Net.ClonePartial`` on a small graph with gradient and in-place ops.

    Each clone is checked for operator count, external inputs/outputs and
    the complete set of blobs it touches, and is then created in the
    workspace to prove it is a valid net.
    """
    params = core.Net('params')
    p1 = params.ConstantFill([], ['p1'])
    workspace.CreateNet(params)
    workspace.RunNetOnce(params)

    original = core.Net('original')
    a1 = original.AddExternalInput('a1')
    a2 = original.AddExternalInput('a2')
    b1, b2 = original.Concat([a1, a2], ['b1', 'b2'], axis=0)
    c1 = original.Sum([b1, p1], ['c1'])
    c2 = original.Sum([b2], ['c2'])
    d = original.Sum([c1, c2], ['d'])

    # test that gradient ops are ignored when partial-cloning
    original.AddGradientOperators([d])

    # test some in-place ops
    k = original.Sum([p1], ['k'])
    e = original.Sum([d], ['e'])
    e = original.Sum([e, k], [e])
    e = original.Sum([e], [e])
    f = original.Sum(e, ['f'])

    def check_clone(net, expected_ops, expected_in, expected_out, expected_internal):
        self.assertEqual(len(net.Proto().op), expected_ops)
        self.assertEqual(set(net.Proto().external_input), expected_in)
        self.assertEqual(set(net.Proto().external_output), expected_out)
        # Collect every blob the net mentions anywhere.
        touched = set(net.Proto().external_input) | set(net.Proto().external_output)
        for op in net.Proto().op:
            touched.update(op.input)
            touched.update(op.output)
        self.assertEqual(
            touched, expected_in | expected_out | expected_internal)
        # create net to make sure its valid
        for blob_name in expected_in:
            workspace.FeedBlob(blob_name, np.array([]))
        workspace.CreateNet(net)

    # Clone from the (renamed) external inputs up to d.
    clone, (out_d, ) = original.ClonePartial('f1', {a1: 'a11', a2: 'a22'}, [d])
    check_clone(
        clone, 4, {'p1', 'a11', 'a22'}, {'f1/d'},
        {'f1/b1', 'f1/b2', 'f1/c1', 'f1/c2', 'p1'})
    self.assertTrue(isinstance(out_d, core.BlobReference))
    self.assertEqual(out_d.Net(), clone)
    self.assertEqual(str(out_d), 'f1/d')

    # Clone starting from the intermediate blobs b1, b2.
    clone, (out_d, ) = original.ClonePartial('f2', [b1, b2], [d])
    check_clone(
        clone, 3, {'p1', 'b1', 'b2'}, {'f2/d'}, {'f2/c1', 'f2/c2', 'p1'})
    self.assertEqual(str(out_d), 'f2/d')

    # Single-op clone.
    clone, (out_c, ) = original.ClonePartial('f3', [b1], [c1])
    check_clone(clone, 1, {'p1', 'b1'}, {'f3/c1'}, {'p1'})
    self.assertEqual(str(out_c), 'f3/c1')

    # Two independent outputs.
    clone, (out_c1, out_c2) = original.ClonePartial('f4', [b1, b2], [c1, c2])
    check_clone(
        clone, 2, {'p1', 'b1', 'b2'}, {'f4/c1', 'f4/c2'}, {'p1'})
    self.assertEqual(str(out_c1), 'f4/c1')
    self.assertEqual(str(out_c2), 'f4/c2')

    # This input set is expected to be rejected with an AssertionError.
    with self.assertRaises(AssertionError):
        original.ClonePartial('f4', [a1, a2, c2], [d])

    # In-place chain ending in e.
    clone, (out_e, ) = original.ClonePartial('f5', [d], [e])
    check_clone(clone, 4, {'p1', 'd'}, {'f5/e'}, {'f5/k', 'p1'})
    self.assertEqual(str(out_e), 'f5/e')

    # In-place chain plus the downstream f.
    clone, (out_e, out_f) = original.ClonePartial('f7', [d], [e, f])
    check_clone(
        clone, 5, {'p1', 'd'}, {'f7/e', 'f7/f'}, {'p1', 'f7/k'})
    self.assertEqual(str(out_e), 'f7/e')
    self.assertEqual(str(out_f), 'f7/f')

    params._CheckLookupTables()
    original._CheckLookupTables()
def Train(args):
    """Build and train a data-parallel ResNet-50 model as described by ``args``.

    Reads, among others, ``args.gpus`` / ``args.num_gpus``,
    ``args.batch_size``, ``args.num_shards`` / ``args.shard_id``,
    ``args.epoch_size`` (rounded down in place to a multiple of the global
    batch size), ``args.train_data`` / ``args.test_data``,
    ``args.load_model_path`` and ``args.num_epochs``.

    After every epoch the model is saved and the file saved for the
    previous epoch, if present, is removed.

    Raises:
        AssertionError: if the number of GPUs does not divide the batch size.
    """
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        # Materialize the range so the log line shows the ids and consumers
        # receive a real list on Python 3 as well.
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        # Spelled to match the key used by the test arg scope below; the
        # former 'cudnn_exhaustice_search' was a typo.
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(
        name="resnet50", arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id
    if num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                )
            )
        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            exit_nets=None)
    else:
        rendezvous = None

    # Model building functions
    def create_resnet50_model_ops(model, loss_scale):
        [softmax, loss] = resnet.create_resnet50(
            model,
            "data",
            num_input_channels=args.num_channels,
            num_labels=args.num_labels,
            label="label",
            no_bias=True,
        )
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    # SGD
    def add_parameter_update_ops(model):
        brew.add_weight_decay(model, args.weight_decay)
        ITER = brew.iter(model, "ITER")
        # Step size expressed in iterations: 30 epochs' worth per shard.
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=args.base_learning_rate,
            policy="step",
            stepsize=stepsz,
            gamma=0.1,
        )
        AddMomentumParameterUpdate(model, LR)

    # Input. Note that the reader must be shared with all GPUS.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
        )

    # Create parallelized model
    data_parallel_model.Parallelize_GPU(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet50_model_ops,
        param_update_builder_fun=add_parameter_update_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=True,
    )

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="resnet50_test", arg_scope=test_arg_scope)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
            )

        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnet50_model_ops,
            param_update_builder_fun=None,
            devices=gpus,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(
            train_model,
            GetCheckpointParams(train_model),
        )

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog,
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (args.file_store_path, args.save_model_name)
        # remove the saved model from the previous epoch if it exists
        previous_model = model_path + str(epoch - 1) + ".mdl"
        if os.path.isfile(previous_model):
            os.remove(previous_model)
def export_actor(
    cls,
    trainer,
    state_normalization_parameters,
    action_feature_ids,
    min_action_range_tensor_serving,
    max_action_range_tensor_serving,
    model_on_gpu=False,
):
    """Export caffe2 preprocessor net and pytorch actor forward pass as one
    caffe2 net.

    :param trainer DDPGTrainer
    :param state_normalization_parameters state NormalizationParameters
    :param action_feature_ids feature ids written (tiled once per example)
        to the ``output/float_features.keys`` blob
    :param min_action_range_tensor_serving pytorch tensor that specifies
        min action value for each dimension
    :param max_action_range_tensor_serving pytorch tensor that specifies
        max action value for each dimension
    :param model_on_gpu boolean indicating if the model is a GPU model or
        CPU model
    :return DDPGPredictor built from the combined net, the torch init net
        and the list of parameter blob names
    """
    model = model_helper.ModelHelper(name="predictor")
    net = model.net
    C2.set_model(model)
    parameters: List[str] = []

    # Feed placeholder inputs so the blobs exist in the workspace.
    workspace.FeedBlob("input/float_features.lengths", np.zeros(1, dtype=np.int32))
    workspace.FeedBlob("input/float_features.keys", np.zeros(1, dtype=np.int64))
    workspace.FeedBlob("input/float_features.values", np.zeros(1, dtype=np.float32))

    input_feature_lengths = "input_feature_lengths"
    input_feature_keys = "input_feature_keys"
    input_feature_values = "input_feature_values"

    C2.net().Copy(["input/float_features.lengths"], [input_feature_lengths])
    C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
    C2.net().Copy(["input/float_features.values"], [input_feature_values])

    # Sparse input -> dense matrix -> normalized dense matrix.
    preprocessor = PreprocessorNet()
    sparse_to_dense_processor = Caffe2SparseToDenseProcessor()
    sorted_features, _ = sort_features_by_normalization(
        state_normalization_parameters)
    state_dense_matrix, new_parameters = sparse_to_dense_processor(
        sorted_features,
        StackedAssociativeArray(input_feature_lengths, input_feature_keys, input_feature_values),
    )
    parameters.extend(new_parameters)
    state_normalized_dense_matrix, new_parameters = preprocessor.normalize_dense_matrix(
        state_dense_matrix,
        sorted_features,
        state_normalization_parameters,
        "state_norm",
        False,
    )
    parameters.extend(new_parameters)

    torch_init_net, torch_predict_net, new_parameters, actor_input_blob, actor_output_blob, min_action_training_blob, max_action_training_blob, min_action_serving_blob, max_action_serving_blob = DDPGPredictor.generate_train_net(
        trainer,
        model,
        min_action_range_tensor_serving,
        max_action_range_tensor_serving,
        model_on_gpu,
    )
    parameters.extend(new_parameters)
    # Wire the normalized state into the actor network's input blob.
    net.Copy([state_normalized_dense_matrix], [actor_input_blob])

    workspace.RunNetOnce(model.param_init_net)
    workspace.RunNetOnce(torch_init_net)

    net.AppendNet(torch_predict_net)

    # Scale actors actions from [-1, 1] to serving range
    prev_range = C2.Sub(max_action_training_blob, min_action_training_blob)
    new_range = C2.Sub(max_action_serving_blob, min_action_serving_blob)
    subtract_prev_min = C2.Sub(actor_output_blob, min_action_training_blob)
    div_by_prev_range = C2.Div(subtract_prev_min, prev_range)
    scaled_for_serving_actions = C2.Add(
        C2.Mul(div_by_prev_range, new_range), min_action_serving_blob)

    # Output lengths: a constant (the actor's last-layer out_features) shaped
    # like the flattened ArgMax of the actor output.
    output_lengths = "output/float_features.lengths"
    workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
    C2.net().ConstantFill(
        [C2.FlattenToVec(C2.ArgMax(actor_output_blob))],
        [output_lengths],
        value=trainer.actor.layers[-1].out_features,
        dtype=caffe2_pb2.TensorProto.INT32,
    )

    action_feature_ids_blob = C2.NextBlob("action_feature_ids")
    workspace.FeedBlob(action_feature_ids_blob, np.array(action_feature_ids, dtype=np.int64))
    parameters.append(action_feature_ids_blob)

    # Output keys: the action feature ids tiled by the size of the input
    # lengths blob.
    output_keys = "output/float_features.keys"
    workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int64))
    num_examples, _ = C2.Reshape(C2.Size("input/float_features.lengths"), shape=[1])
    C2.net().Tile([action_feature_ids_blob, num_examples], [output_keys], axis=0)

    # Output values: the serving-scaled actions flattened to a vector.
    output_values = "output/float_features.values"
    workspace.FeedBlob(output_values, np.zeros(1, dtype=np.float32))
    C2.net().FlattenToVec([scaled_for_serving_actions], [output_values])

    workspace.CreateNet(net)
    return DDPGPredictor(net, torch_init_net, parameters)
def main():
    """Fit a two-weight linear regression with hand-written SGD and print the result.

    An init net creates the ground-truth and learnable parameters; a train
    net generates noisy samples, computes an averaged squared-L2 loss, adds
    gradient operators and applies weighted-sum updates.  The train net is
    run 100 times and the learned values are printed next to the ground truth.
    """
    setup = core.Net("init")
    # Ground-truth weight and bias the regression should recover.
    true_w = setup.GivenTensorFill(
        [], "W_gt", shape=[1, 2], values=[2.0, 1.5])
    true_b = setup.GivenTensorFill([], "B_gt", shape=[1], values=[0.5])
    # Multiplier for the parameter itself in the weighted-sum update.
    one = setup.ConstantFill([], "ONE", shape=[1], value=1.)
    # Iteration counter.
    iteration = setup.ConstantFill(
        [], "ITER", shape=[1], value=0, dtype=core.DataType.INT32)
    # Learnable parameters: weight uniform in [-1, 1], bias starts at zero.
    weight = setup.UniformFill([], "W", shape=[1, 2], min=-1., max=1.)
    bias = setup.ConstantFill([], "B", shape=[1], value=0.0)
    print('Created init net.')

    trainer = core.Net("train")
    # Fresh Gaussian samples each run, pushed through the ground-truth model.
    samples = trainer.GaussianFill(
        [], "X", shape=[64, 2], mean=0.0, std=1.0, run_once=0)
    clean_target = samples.FC([true_w, true_b], "Y_gt")
    # Perturb the targets with Gaussian noise.
    noise = trainer.GaussianFill(
        [], "noise", shape=[64, 1], mean=0.0, std=1.0, run_once=0)
    noisy_target = clean_target.Add(noise, "Y_noise")
    # No gradient should flow back into the target path.
    noisy_target = noisy_target.StopGradient([], "Y_noise")

    # Prediction from the learnable parameters.
    prediction = samples.FC([weight, bias], "Y_pred")
    # Squared L2 distance averaged over the minibatch.
    distance = trainer.SquaredL2Distance([noisy_target, prediction], "dist")
    loss = distance.AveragedLoss([], ["loss"])
    # Gradients for everything above.
    grads = trainer.AddGradientOperators([loss])
    # Advance the iteration counter in place.
    trainer.Iter(iteration, iteration)
    # Step-decayed (negative) learning rate derived from the counter.
    learning_rate = trainer.LearningRate(
        iteration, "LR", base_lr=-0.1, policy="step", stepsize=20, gamma=0.9)
    # param <- param * ONE + grad * LR
    trainer.WeightedSum([weight, one, grads[weight], learning_rate], weight)
    trainer.WeightedSum([bias, one, grads[bias], learning_rate], bias)

    workspace.RunNetOnce(setup)
    workspace.CreateNet(trainer)

    print("Before training, W is: {}".format(workspace.FetchBlob("W")))
    print("Before training, B is: {}".format(workspace.FetchBlob("B")))

    train_net_name = trainer.Proto().name
    for _ in range(100):
        workspace.RunNet(train_net_name)

    print("After training, W is: {}".format(workspace.FetchBlob("W")))
    print("After training, B is: {}".format(workspace.FetchBlob("B")))

    print("Ground truth W is: {}".format(workspace.FetchBlob("W_gt")))
    print("Ground truth B is: {}".format(workspace.FetchBlob("B_gt")))
def export(
    cls,
    trainer,
    actions,
    state_normalization_parameters,
    int_features=False,
    model_on_gpu=False,
):
    """Export caffe2 preprocessor net and pytorch DQN forward pass as one
    caffe2 net.

    :param trainer DQNTrainer
    :param actions action names; fed to the workspace as the
        ``action_names`` blob and passed to ``RLPredictor._forward_pass``
    :param state_normalization_parameters state NormalizationParameters;
        when None the ``input/image`` blob is used as the network input
    :param int_features boolean indicating if int features blob will be present
    :param model_on_gpu boolean indicating if the model is a GPU model or CPU model
    :return DQNPredictor built from the combined net, the torch init net,
        the parameter blob names and ``int_features``
    """
    input_dim = trainer.num_features
    buffer = PytorchCaffe2Converter.pytorch_net_to_buffer(
        trainer.q_network, input_dim, model_on_gpu
    )
    qnet_input_blob, qnet_output_blob, caffe2_netdef = PytorchCaffe2Converter.buffer_to_caffe2_netdef(
        buffer
    )
    torch_workspace = caffe2_netdef.workspace

    # Copy every blob of the converted network into the current workspace.
    parameters = torch_workspace.Blobs()
    for blob_str in parameters:
        workspace.FeedBlob(blob_str, torch_workspace.FetchBlob(blob_str))

    torch_init_net = core.Net(caffe2_netdef.init_net)
    torch_predict_net = core.Net(caffe2_netdef.predict_net)

    model = model_helper.ModelHelper(name="predictor")
    net = model.net
    C2.set_model(model)

    # Feed placeholder inputs so the blobs exist in the workspace.
    workspace.FeedBlob("input/image", np.zeros([1, 1, 1, 1], dtype=np.int32))
    workspace.FeedBlob("input/float_features.lengths", np.zeros(1, dtype=np.int32))
    workspace.FeedBlob("input/float_features.keys", np.zeros(1, dtype=np.int64))
    workspace.FeedBlob("input/float_features.values", np.zeros(1, dtype=np.float32))

    input_feature_lengths = "input_feature_lengths"
    input_feature_keys = "input_feature_keys"
    input_feature_values = "input_feature_values"

    if int_features:
        # Cast int feature values to float and merge them with the float
        # features into a single lengths/keys/values triple.
        workspace.FeedBlob(
            "input/int_features.lengths", np.zeros(1, dtype=np.int32)
        )
        workspace.FeedBlob("input/int_features.keys", np.zeros(1, dtype=np.int64))
        workspace.FeedBlob("input/int_features.values", np.zeros(1, dtype=np.int32))
        C2.net().Cast(
            ["input/int_features.values"],
            ["input/int_features.values_float"],
            dtype=caffe2_pb2.TensorProto.FLOAT,
        )
        C2.net().MergeMultiScalarFeatureTensors(
            [
                "input/float_features.lengths",
                "input/float_features.keys",
                "input/float_features.values",
                "input/int_features.lengths",
                "input/int_features.keys",
                "input/int_features.values_float",
            ],
            [input_feature_lengths, input_feature_keys, input_feature_values],
        )
    else:
        C2.net().Copy(["input/float_features.lengths"], [input_feature_lengths])
        C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
        C2.net().Copy(["input/float_features.values"], [input_feature_values])

    if state_normalization_parameters is not None:
        preprocessor = PreprocessorNet(clip_anomalies=True)
        state_normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
            input_feature_lengths,
            input_feature_keys,
            input_feature_values,
            state_normalization_parameters,
            blobname_prefix="state_norm",
            split_sparse_to_dense=False,
            split_expensive_feature_groups=False,
            normalize=True,
        )
        parameters.extend(new_parameters)
    else:
        # Image input. Note: Currently this does the wrong thing if
        # more than one image is passed at a time.
        state_normalized_dense_matrix = "input/image"

    # Wire the (normalized) state into the Q-network's input blob.
    net.Copy([state_normalized_dense_matrix], [qnet_input_blob])

    workspace.RunNetOnce(model.param_init_net)
    workspace.RunNetOnce(torch_init_net)

    net.AppendNet(torch_predict_net)

    new_parameters, q_values = RLPredictor._forward_pass(
        model, trainer, state_normalized_dense_matrix, actions, qnet_output_blob
    )
    parameters.extend(new_parameters)

    # Get 1 x n action index tensor under the max_q policy
    max_q_act_idxs = "max_q_policy_actions"
    C2.net().Flatten([C2.ArgMax(q_values)], [max_q_act_idxs], axis=0)
    shape_of_num_of_states = "num_states_shape"
    C2.net().FlattenToVec([max_q_act_idxs], [shape_of_num_of_states])
    num_states, _ = C2.Reshape(C2.Size(shape_of_num_of_states), shape=[1])

    # Get 1 x n action index tensor under the softmax policy
    temperature = C2.NextBlob("temperature")
    parameters.append(temperature)
    workspace.FeedBlob(
        temperature, np.array([trainer.rl_temperature], dtype=np.float32)
    )
    tempered_q_values = C2.Div(q_values, temperature, broadcast=1)
    softmax_values = C2.Softmax(tempered_q_values)
    softmax_act_idxs_nested = "softmax_act_idxs_nested"
    C2.net().WeightedSample([softmax_values], [softmax_act_idxs_nested])
    softmax_act_idxs = "softmax_policy_actions"
    C2.net().Flatten([softmax_act_idxs_nested], [softmax_act_idxs], axis=0)

    action_names = C2.NextBlob("action_names")
    parameters.append(action_names)
    workspace.FeedBlob(action_names, np.array(actions))

    # Concat action index tensors to get 2 x n tensor - [[max_q], [softmax]]
    # transpose & flatten to get [a1_maxq, a1_softmax, a2_maxq, a2_softmax, ...]
    max_q_act_blob = C2.Cast(max_q_act_idxs, to=caffe2_pb2.TensorProto.INT32)
    softmax_act_blob = C2.Cast(softmax_act_idxs, to=caffe2_pb2.TensorProto.INT32)
    C2.net().Append([max_q_act_blob, softmax_act_blob], [max_q_act_blob])
    transposed_action_idxs = C2.Transpose(max_q_act_blob)
    flat_transposed_action_idxs = C2.FlattenToVec(transposed_action_idxs)
    # Output values: action names looked up by the interleaved indices.
    workspace.FeedBlob(OUTPUT_SINGLE_CAT_VALS_NAME, np.zeros(1, dtype=np.int64))
    C2.net().Gather(
        [action_names, flat_transposed_action_idxs], [OUTPUT_SINGLE_CAT_VALS_NAME]
    )

    # Output lengths: constant 2 per state (one entry per policy).
    workspace.FeedBlob(OUTPUT_SINGLE_CAT_LENGTHS_NAME, np.zeros(1, dtype=np.int32))
    C2.net().ConstantFill(
        [shape_of_num_of_states],
        [OUTPUT_SINGLE_CAT_LENGTHS_NAME],
        value=2,
        dtype=caffe2_pb2.TensorProto.INT32,
    )

    # Output keys: the pair [0, 1] tiled num_states times and flattened.
    workspace.FeedBlob(OUTPUT_SINGLE_CAT_KEYS_NAME, np.zeros(1, dtype=np.int64))
    output_keys_tensor, _ = C2.Concat(
        C2.ConstantFill(shape=[1, 1], value=0, dtype=caffe2_pb2.TensorProto.INT64),
        C2.ConstantFill(shape=[1, 1], value=1, dtype=caffe2_pb2.TensorProto.INT64),
        axis=0,
    )
    output_key_tile = C2.Tile(output_keys_tensor, num_states, axis=0)
    C2.net().FlattenToVec([output_key_tile], [OUTPUT_SINGLE_CAT_KEYS_NAME])

    workspace.CreateNet(net)
    return DQNPredictor(net, torch_init_net, parameters, int_features)
# Load the source image as grayscale.
# NOTE(review): newer scikit-image releases spell this keyword `as_gray`;
# confirm against the installed version.
srcimg = skimage.io.imread(IMAGE_LOCATION, as_grey=True)

# Shift and scale by 127.5, which maps [0, 255] onto [-1, 1].
# NOTE(review): earlier notes here suggested the loaded image may already be
# in [0, 1]; verify the input range.
img = srcimg - 127.5
img = img / 127.5
img = skimage.transform.resize(img, (width, height))
# Prepend two singleton axes and convert to float32 for the predictor.
img = img[np.newaxis, :, :].astype(np.float32)
img = img[np.newaxis, :, :, :].astype(np.float32)

# The net files are read as raw bytes: text mode would try to decode them on
# Python 3.
with open(INIT_NET, "rb") as f:
    init_net = f.read()
with open(PREDICT_NET, "rb") as f:
    predict_net = f.read()

workspace.RunNetOnce(init_net)
workspace.CreateNet(predict_net)
p = workspace.Predictor(init_net, predict_net)
results = p.run([img])

img_out = workspace.FetchBlob(output)
print(type(img_out), img_out.size, img_out.shape)

# Dump the first batch entry, 16 values per row with blank lines between
# rows.  Each row is joined into one string so the code no longer relies on
# the Python 2 `print x,` statement and runs on both Python 2 and 3.
print("\n")
for start in range(0, img_out.shape[1], 16):
    row = img_out[0][start:start + 16]
    print(" ".join(str(value) for value in row) + "\n")
def Skip_test_tanhquantize(self, scale, zp, size, rand_seed):
    """Compare lowered Tanh + Int8Quantize with TanhQuantFakeFp16NNPI.

    A two-op net (Tanh followed by Int8Quantize) is transformed with
    onnxifi_caffe2_net and run. Its Int8 output must match the output of the
    single reference operator exactly: data, scale and zero point.

    Args:
        scale: Y_scale given to both quantizing operators.
        zp: Y_zero_point given to both quantizing operators.
        size: number of input samples, evenly spaced over [-1, 1].
        rand_seed: seed for numpy's global random state.
    """
    np.random.seed(rand_seed)
    workspace.ResetWorkspace()

    # Both nets are created in the same workspace, so they need distinct
    # names; this one was previously also called "ref".
    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.append("X")
    pred_net.external_output.append("Y_q")
    pred_net.op.add().CopyFrom(core.CreateOperator("Tanh", ["X"], ["Y"]))
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "Int8Quantize", ["Y"], ["Y_q"], Y_scale=scale, Y_zero_point=zp
        )
    )

    # Round-trip through float16 so every input value is exactly
    # representable in half precision.
    X = np.linspace(-1, 1, size).astype(np.float16).astype(np.float32)
    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {"X": X.shape},
        debug=True,
        adjust_batch=False,
        use_onnx=False,
    )
    # The whole net is expected to collapse into a single Onnxifi op.
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
    )
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("X", X)
    workspace.CreateNet(pred_net_onnxified)
    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchInt8Blob("Y_q")

    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.append("X")
    ref_net.external_output.append("Y_q")
    ref_net.op.add().CopyFrom(
        core.CreateOperator(
            "TanhQuantFakeFp16NNPI",
            ["X"],
            ["Y_q"],
            Y_scale=scale,
            Y_zero_point=zp,
        )
    )
    workspace.CreateNet(ref_net)
    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchInt8Blob("Y_q")

    mismatch = (
        not np.array_equal(Y_ref.data, Y_glow.data)
        or Y_ref.scale != Y_glow.scale
        or Y_ref.zero_point != Y_glow.zero_point
    )
    if mismatch:
        # Dump everything needed to reproduce the failure before asserting.
        print_test_debug_info(
            "tanhfusion",
            {
                "scale": scale,
                "zp": zp,
                "input": X,
                "ideal nonquant": np.tanh(X),
                "Y_glow": Y_glow,
                "Y_c2": Y_ref,
            },
        )
        assert 0
def test_batch_matmul(self, M, K, N, rand_seed, trans_a, trans_b, run_ints):
    """Compare the lowered BatchMatMul with BatchMatMulFP16Acc16Fake.

    Args:
        M, K, N: matrix dimensions; X is (..., M, K) and Y is (..., K, N)
            before any transposition.
        rand_seed: seed for numpy's global random state.
        trans_a: if true, the last two axes of X are swapped and the
            operator is told to transpose it back.
        trans_b: same as trans_a, for Y.
        run_ints: if true, use small integer-valued inputs with one leading
            batch axis instead of random floats.
    """
    np.random.seed(rand_seed)
    workspace.ResetWorkspace()

    C = 0  # TODO
    # With C == 0 this is an empty list, i.e. no extra batch dimensions.
    batch_dims = np.random.randint(
        low=1, high=3, size=C, dtype=np.int64).tolist()

    if run_ints:
        X = np.random.randint(
            low=1, high=3, size=(1, M, K)).astype(np.float32)
    else:
        X = 100 * (np.random.rand(*(batch_dims + [M, K])).astype(
            np.float32) - 0.5)
    if trans_a:
        X = X.swapaxes(-1, -2)

    if run_ints:
        Y = np.random.randint(
            low=1, high=3, size=(1, K, N)).astype(np.float32)
    else:
        Y = 100 * (np.random.rand(*(batch_dims + [K, N])).astype(
            np.float32) - 0.5)
    if trans_b:
        Y = Y.swapaxes(-1, -2)

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(["X", "Y"])
    pred_net.external_output.append("out")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            'BatchMatMul', ['X', 'Y'], 'out',
            trans_a=trans_a, trans_b=trans_b))

    pred_net_ref = core.Net("pred_net_ref")
    pred_net_ref.BatchMatMulFP16Acc16Fake(
        ["X", "Y"], ['out'], trans_a=trans_a, trans_b=trans_b)

    print("dims", batch_dims, X.shape, Y.shape)
    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {"X": X.shape, "Y": Y.shape},
        debug=True,
        adjust_batch=False,
        use_onnx=False)
    # The whole net is expected to collapse into a single Onnxifi op.
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("X", X)
    workspace.FeedBlob("Y", Y)
    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(pred_net_ref)

    # Run Glow net
    workspace.RunNet(pred_net_onnxified.name)
    out_glow = workspace.FetchBlob('out')

    # Run caffe2 net
    workspace.RunNet(pred_net_ref)
    out_c2_fakefp16 = workspace.FetchBlob('out')

    # Relative error; the epsilon guards against division by zero.
    diff = np.abs((out_c2_fakefp16 - out_glow) / (out_c2_fakefp16 + 1e-8))
    rowdiff = np.max(diff, axis=1)

    if not np.allclose(out_glow, out_c2_fakefp16):
        print_test_debug_info(
            "bmm",
            {
                "seed": rand_seed,
                "m": M,
                "k": K,
                "n": N,
                "X": X,
                "Y": Y,
                "out_glow": out_glow,
                "out_c2_fakefp16": out_c2_fakefp16,
                "diff": diff,
                # Previously computed but never reported.
                "rowwise_diff": rowdiff,
            })
        assert 0
def test_slws_fused_8bit_rowwise_acc32_nnpi(self, seed, num_rows,
                                            embedding_dim, batch_size,
                                            max_weight):
    """Compare lowered SLWS 8-bit rowwise (fp32 accumulation) with its fake op.

    Random data is quantized with FloatToFused8BitRowwiseQuantized, then
    SparseLengthsWeightedSumFused8BitRowwise (transformed by
    onnxifi_caffe2_net) is compared with
    SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI. Any non-zero
    relative error fails the test.

    Args:
        seed: seed for numpy's global random state.
        num_rows: number of rows in the embedding table.
        embedding_dim: width of each embedding row.
        batch_size: number of segments (entries in `lengths`).
        max_weight: upper bound of the uniformly sampled weights.
    """
    workspace.GlobalInit([
        "caffe2",
        "--glow_global_fp16=0",
        "--glow_global_fused_scale_offset_fp16=0",
        "--glow_global_force_sls_fp16_accum=0",
    ])
    workspace.ResetWorkspace()
    np.random.seed(seed)

    data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
    lengths = np.random.choice(
        np.arange(1, num_rows), batch_size).astype(np.int32)

    indices = []
    for length in lengths:
        indices.extend(np.random.choice(np.arange(1, num_rows), length))
    indices = np.asarray(indices).astype(np.int64)

    weights = np.random.uniform(
        low=0, high=max_weight, size=[len(indices)]).astype(np.float32)

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwise",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    ref_net.external_output.append("Y")
    ref_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    workspace.FeedBlob("data", data)
    workspace.RunOperatorOnce(
        core.CreateOperator(
            "FloatToFused8BitRowwiseQuantized",
            ["data"],
            ["quantized_data"]))

    onnxified_net = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=batch_size,
        max_seq_size=batch_size * np.max(lengths),
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(onnxified_net)
    workspace.CreateNet(ref_net)

    # Both nets write "Y", so fetch after each run.
    workspace.RunNet(onnxified_net.name)
    Y_glow = workspace.FetchBlob("Y")

    workspace.RunNet(ref_net.name)
    Y_ref = workspace.FetchBlob("Y")

    # Relative error; the epsilon guards against division by zero.
    diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
    max_err = np.max(diff, axis=1)
    num_offenders = (max_err > 0).sum()
    if num_offenders > 0:
        print_test_debug_info(
            "test_slws_fused_8bit_rowwise_acc32_nnpi",
            {
                # The seed is needed to reproduce a failing example.
                "seed": seed,
                "indices": indices,
                "data": data.shape,
                "lengths": lengths,
                "weights": weights,
                "Y_glow": Y_glow,
                "Y_ref": Y_ref,
                "diff": diff,
                "rowwise_diff": max_err,
            },
        )
        assert 0
def run_model(self, gpu_devices):
    '''
    Helper function for test_equiv.

    Builds a small FC -> Sigmoid -> squared-L2 model parallelized over
    `gpu_devices`, feeds it ten fixed-seed batches split evenly across the
    devices, and returns the "gpu_0/fc_w" blob after the last run.
    '''
    def input_builder_fun(model):
        # Inputs are fed by hand in the loop below.
        return None

    def model_build_fun(model, loss_scale):
        fc_out = model.FC(
            "data", "fc", 16, 1,
            ("ConstantFill", {}), ("ConstantFill", {}),
        )
        flattened = model.FlattenToVec(fc_out, "fc_fl")
        activated = model.Sigmoid(flattened, "sigm")
        distance = model.SquaredL2Distance([activated, "label"], "sq")
        averaged = model.AveragedLoss(distance, "loss")
        return [model.Scale(averaged, scale=loss_scale)]

    def param_update_fun(model):
        iteration = model.Iter("ITER")
        learning_rate = model.net.LearningRate(
            [iteration], "LR", base_lr=(-0.1), policy="fixed",
        )
        one = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        for param in model.GetParams():
            model.WeightedSum(
                [param, one, model.param_to_grad[param], learning_rate],
                param,
            )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="test{}".format(gpu_devices),
    )
    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices,
    )

    np.random.seed(2603)

    # Each run has same input, independent of number of gpus
    batch_size = 64
    per_device = batch_size // len(gpu_devices)
    for step in range(10):
        full_data = np.random.rand(batch_size, 16)
        full_labels = np.round(full_data[:, 0])

        for slot, gpu_id in enumerate(gpu_devices):
            begin = slot * per_device
            end = begin + per_device
            device_data = full_data[begin:end, :].astype(np.float32)
            device_labels = full_labels[begin:end].astype(np.float32)
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, gpu_id)):
                workspace.FeedBlob("gpu_{}/data".format(gpu_id), device_data)
                workspace.FeedBlob(
                    "gpu_{}/label".format(gpu_id), device_labels)

        # Inputs must exist before the nets are initialized and created.
        if step == 0:
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        print(step, workspace.FetchBlob("gpu_0/fc_w").flatten()[:5])
        workspace.RunNet(model.net.Proto().name)

    return workspace.FetchBlob("gpu_0/fc_w")
def load_feature_map(params_file, is_train):
    """Run the test network and write the collected features out as an LFB.

    Builds a test-mode model, loads weights from `params_file`, runs the net
    for the configured number of iterations, collects the blobs named in
    cfg.FEATURE_MAP_LOADER.NAME_LIST plus metadata/labels/proposals/boxes,
    and passes them to construct_lfb / write_lfb.

    Args:
        params_file: path of the model parameters to load; must be non-empty.
        is_train: selects the train or test/val variant (blob suffix, LFB
            file name, cfg.GET_TRAIN_LFB).

    Raises:
        AssertionError: if `params_file` or cfg.FEATURE_MAP_LOADER.OUT_DIR
            is empty.
        Exception: if cfg.DATASET is not "ava".
    """
    assert params_file, 'FEATURE_MAP_LOADER.MODEL_PARAMS_FILE is not specified.'
    assert cfg.FEATURE_MAP_LOADER.OUT_DIR, 'FEATURE_MAP_LOADER.OUT_DIR is not specified.'
    logger.info('Inferring feature map from %s' % params_file)

    # NOTE(review): "ENALBE" looks like a misspelling of "ENABLE". Left
    # unchanged because the config schema is not visible here -- confirm.
    cfg.FEATURE_MAP_LOADER.ENALBE = True
    cfg.GET_TRAIN_LFB = is_train

    timer = Timer()

    test_model = model_builder_video.ModelBuilder(
        train=False,
        use_cudnn=True,
        cudnn_exhaustive_search=True,
        split=cfg.TEST.DATA_TYPE,
    )

    suffix = 'infer_{}'.format('train' if is_train else 'test')

    if cfg.LFB.ENABLED:
        lfb_path = os.path.join(
            cfg.LFB.LOAD_LFB_PATH,
            'train_lfb.pkl' if is_train else 'val_lfb.pkl')
        logger.info('Loading LFB from %s' % lfb_path)
        # Pickle data is binary: text mode ('r') fails on Python 3.
        # NOTE: pickle.load can execute arbitrary code; only load trusted
        # LFB files.
        with open(lfb_path, 'rb') as f:
            lfb = pickle.load(f)
    else:
        lfb = None

    test_model.build_model(
        lfb=lfb,
        suffix=suffix,
        shift=1,
    )

    if cfg.PROF_DAG:
        test_model.net.Proto().type = 'prof_dag'
    else:
        test_model.net.Proto().type = 'dag'

    workspace.RunNetOnce(test_model.param_init_net)
    workspace.CreateNet(test_model.net)

    total_test_net_iters = misc.get_total_test_iters(test_model)

    test_model.start_data_loader()

    checkpoints.load_model_from_params_file_for_test(test_model, params_file)

    all_features = {
        feat_name: [] for feat_name in cfg.FEATURE_MAP_LOADER.NAME_LIST
    }
    all_metadata = []
    all_labels = []
    all_proposals = []
    all_original_boxes = []

    # A positive TEST_ITERS overrides the iteration count derived above.
    if cfg.FEATURE_MAP_LOADER.TEST_ITERS > 0:
        total_test_net_iters = cfg.FEATURE_MAP_LOADER.TEST_ITERS

    for test_iter in range(total_test_net_iters):
        timer.tic()
        workspace.RunNet(test_model.net.Proto().name)
        timer.toc()

        if test_iter == 0:
            misc.print_net(test_model)
            os.system('nvidia-smi')
        if test_iter % 10 == 0:
            logger.info("Iter {}/{} Time: {}".format(
                test_iter, total_test_net_iters, timer.diff))

        if cfg.DATASET == "ava":
            for feat_name in cfg.FEATURE_MAP_LOADER.NAME_LIST:
                all_features[feat_name].append(get_features(feat_name))
            all_metadata.append(get_features('metadata{}'.format(suffix)))
            all_labels.append(get_features('labels{}'.format(suffix)))
            all_proposals.append(get_features('proposals{}'.format(suffix)))
            all_original_boxes.append(
                get_features('original_boxes{}'.format(suffix)))
        else:
            raise Exception("Dataset {} not recognized.".format(cfg.DATASET))

    lfb = construct_lfb(
        all_features, all_metadata, all_labels, all_proposals,
        all_original_boxes, test_model.input_db, is_train)
    write_lfb(lfb, is_train)

    logger.info("Shutting down data loader...")
    test_model.shutdown_data_loader()

    workspace.ResetWorkspace()
    logger.info("Done ResetWorkspace...")

    cfg.GET_TRAIN_LFB = False
def Test(args):
    """Evaluate a video model on the test set and log clip/video accuracy.

    Builds a data-parallel test model on the requested GPUs, loads weights
    from `args.load_model_path`, runs `num_iter` iterations, and accumulates
    clip-level top-1 and video-level top-1 / top-k accuracy. Video-level
    predictions aggregate the `args.clip_per_video` clips of each video with
    PredictionAggregation. Finally logs FLOPs and parameter counts.

    Args:
        args: parsed command-line namespace. `args.gpus` is either None or a
            comma-separated string of GPU ids; when None, GPUs
            0..args.num_gpus-1 are used.
    """
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        # A real list, so it logs and indexes the same way as the parsed one.
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    total_batch_size = args.batch_size * num_gpus

    # Model building functions
    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(
                args.clip_length_of if args.input_type == 1
                else args.clip_length_rgb
            ),
            loss_scale=loss_scale,
            is_test=1,
            pred_layer_name=args.pred_layer_name,
        )

    test_model = cnn.CNNModelHelper(
        order="NCHW",
        name="video_model_test",
        use_cudnn=(True if args.use_cudnn == 1 else False),
        cudnn_exhaustive_search=True,
    )

    test_reader, number_of_examples = model_builder.create_data_reader(
        test_model,
        name="test_reader",
        input_data=args.test_data,
    )

    # A non-positive num_iter means "one pass over the whole test set".
    if args.num_iter <= 0:
        num_iter = int(number_of_examples / total_batch_size)
    else:
        num_iter = args.num_iter

    def test_input_fn(model):
        model_helper.AddVideoInput(
            test_model,
            test_reader,
            batch_size=args.batch_size,
            clip_per_video=args.clip_per_video,
            decode_type=1,
            length_rgb=args.clip_length_rgb,
            sampling_rate_rgb=args.sampling_rate_rgb,
            scale_h=args.scale_h,
            scale_w=args.scale_w,
            crop_size=args.crop_size,
            num_decode_threads=4,
            num_of_class=args.num_labels,
            random_mirror=False,
            random_crop=False,
            input_type=args.input_type,
            length_of=args.clip_length_of,
            sampling_rate_of=args.sampling_rate_of,
            frame_gap_of=args.frame_gap_of,
            do_flow_aggregation=args.do_flow_aggregation,
            flow_data_type=args.flow_data_type,
            get_rgb=(args.input_type == 0),
            get_optical_flow=(args.input_type == 1),
            get_video_id=args.get_video_id,
            use_local_file=args.use_local_file,
        )

    data_parallel_model.Parallelize_GPU(
        test_model,
        input_builder_fun=test_input_fn,
        forward_pass_builder_fun=create_model_ops,
        param_update_builder_fun=None,
        devices=gpus,
    )

    workspace.RunNetOnce(test_model.param_init_net)
    workspace.CreateNet(test_model.net)

    if args.db_type == 'minidb':
        model_helper.LoadModel(args.load_model_path, args.db_type)
    elif args.db_type == 'pickle':
        model_loader.LoadModelFromPickleFile(
            test_model,
            args.load_model_path,
            root_gpu_id=gpus[0]
        )
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))

    data_parallel_model.FinalizeAfterCheckpoint(test_model)

    # metric counters for classification
    clip_acc = 0
    video_top1 = 0
    video_topk = 0
    video_count = 0
    clip_count = 0

    for i in range(num_iter):
        workspace.RunNet(test_model.net.Proto().name)
        for g in test_model._devices:
            # get labels
            label = workspace.FetchBlob(
                "gpu_{}".format(g) + '/label'
            )
            # get predictions
            predicts = workspace.FetchBlob("gpu_{}".format(g) + '/softmax')
            assert predicts.shape[0] == args.batch_size * args.clip_per_video

            for j in range(args.batch_size):
                # get label for one video
                sample_label = label[j * args.clip_per_video]
                # get clip accuracy
                for k in range(args.clip_per_video):
                    c1, _ = metric.accuracy_metric(
                        predicts[j * args.clip_per_video + k, :],
                        label[j * args.clip_per_video + k])
                    clip_acc = clip_acc + c1
                # get all clip predictions for one video
                all_clips = predicts[
                    j * args.clip_per_video:(j + 1) * args.clip_per_video, :]
                # aggregate predictions into one
                video_pred = PredictionAggregation(all_clips, args.aggregation)
                c1, ck = metric.accuracy_metric(
                    video_pred, sample_label, args.top_k)
                video_top1 = video_top1 + c1
                video_topk = video_topk + ck

            video_count = video_count + args.batch_size
            clip_count = clip_count + label.shape[0]

        if i > 0 and i % args.display_iter == 0:
            log.info('Iter {}/{}: clip: {}, top1: {}, top 5: {}'.format(
                i,
                num_iter,
                clip_acc / clip_count,
                video_top1 / video_count,
                video_topk / video_count))

    log.info("Test accuracy: clip: {}, top 1: {}, top{}: {}".format(
        clip_acc / clip_count,
        video_top1 / video_count,
        args.top_k,
        video_topk / video_count
    ))

    # Use the parsed GPU id: args.gpus is a raw string (or None), so
    # args.gpus[0] is a single character or raises TypeError.
    flops, params = model_helper.GetFlopsAndParams(test_model, gpus[0])
    log.info('FLOPs: {}, params: {}'.format(flops, params))
db=os.path.join(data_folder, 'mnist-test-nchw-lmdb'), db_type='lmdb') softmax = AddModel(test_model, data) # Deployment model. We simply need the main AddModel part. deploy_model = model_helper.ModelHelper(name="mnist_deploy", arg_scope=arg_scope, init_params=False) AddModel(deploy_model, "data") # The parameter initialization network only needs to be run once. # Now all the parameter blobs are going to be initialized in the workspace. workspace.RunNetOnce(train_model.param_init_net) # overwrite=True allows you to run this cell several times and avoid errors workspace.CreateNet(train_model.net, overwrite=True) # Set the iterations number and track the accuracy & loss total_iters = 200 accuracy = np.zeros(total_iters) loss = np.zeros(total_iters) print("The blobs in the workspace pre-train: {}".format(workspace.Blobs())) # Now, we will manually run the network for 200 iterations. for i in range(total_iters): workspace.RunNet(train_model.net) accuracy[i] = workspace.blobs['accuracy'] loss[i] = workspace.blobs['loss'] print("The blobs in the workspace post-train: {}".format(workspace.Blobs()))
def InferTensorRunAndCompare(self, model, expected_uninferred_blobs=None):
    '''
    Run shape/type inference on the model, then execute it and check that
    every inferred shape and type matches what is actually in the workspace.

    'expected_uninferred_blobs' is the list of blobs for which type and
    shape cannot be inferred; those blobs are skipped in the comparison.
    '''
    (shapes, types) = workspace.InferShapesAndTypes(
        [model.param_init_net, model.net],
    )

    # Initialize parameters, then create and run the main net.
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net, True)
    workspace.RunNet(model.Proto().name)

    # numpy dtype -> TensorProto data type, checked in this order.
    dtype_table = (
        (np.dtype('float32'), caffe2_pb2.TensorProto.FLOAT),
        (np.dtype('int32'), caffe2_pb2.TensorProto.INT32),
        # BYTE
        # STRING
        (np.dtype('bool'), caffe2_pb2.TensorProto.BOOL),
        (np.dtype('uint8'), caffe2_pb2.TensorProto.UINT8),
        (np.dtype('int8'), caffe2_pb2.TensorProto.INT8),
        (np.dtype('uint16'), caffe2_pb2.TensorProto.UINT16),
        (np.dtype('int16'), caffe2_pb2.TensorProto.INT16),
        (np.dtype('int64'), caffe2_pb2.TensorProto.INT64),
        (np.dtype('float16'), caffe2_pb2.TensorProto.FLOAT16),
        (np.dtype('float64'), caffe2_pb2.TensorProto.DOUBLE),
    )

    # Record the actual shape and type of every blob in the workspace.
    correct_shapes = {}
    correct_types = {}
    for blob_name in workspace.Blobs():
        fetched = workspace.FetchBlob(blob_name)
        correct_shapes[blob_name] = fetched.shape
        if type(fetched) is not np.ndarray:
            correct_types[blob_name] = str(type(fetched))
            continue
        for known_dtype, proto_type in dtype_table:
            if fetched.dtype == known_dtype:
                correct_types[blob_name] = proto_type
                break
        else:
            correct_types[blob_name] = "unknown {}".format(fetched.dtype)

    if expected_uninferred_blobs is None:
        expected_uninferred_blobs = []

    for blob_name in correct_shapes:
        # skip blobs for which shape couldn't be inferred
        if blob_name in expected_uninferred_blobs:
            continue

        inferred_shape = np.array(shapes[blob_name]).astype(np.int32)
        actual_shape = np.array(correct_shapes[blob_name]).astype(np.int32)
        self.assertTrue(
            np.array_equal(inferred_shape, actual_shape),
            "Shape {} mismatch: {} vs. correct {}".format(
                blob_name, shapes[blob_name], correct_shapes[blob_name]),
        )

        type_missing = blob_name not in types and blob_name in correct_types
        self.assertFalse(
            type_missing,
            "Type for {} not defined".format(blob_name),
        )

        self.assertEqual(
            types[blob_name],
            correct_types[blob_name],
            "Type {} mismatch: {} vs. {}".format(
                blob_name, types[blob_name], correct_types[blob_name],
            ),
        )
def test_dataset_ops(self):
    """Walk through the dataset ops end to end.

    Covers, in order: defining a schema, writing records, sequential
    reading, processing inside a single plan, slicing the schema, random
    access (with and without loop_over), sort-and-shuffle, and trimming.
    """
    # 1. Defining the schema of our dataset.
    #
    # This example schema could represent, for example, a search query log.
    schema = Struct(
        # fixed size vector, which will be stored as a matrix when batched
        ('dense', Scalar((np.float32, 3))),
        # could represent a feature map from feature ID to float value
        ('floats', Map(Scalar(np.int32), Scalar(np.float32))),
        # could represent a multi-valued categorical feature map
        ('int_lists', Map(
            Scalar(np.int32),
            List(Scalar(np.int64)),
        )),
        # could represent a multi-valued, weighted categorical feature map
        ('id_score_pairs', Map(
            Scalar(np.int32),
            Map(Scalar(np.int64), Scalar(np.float32),
                keys_name='ids', values_name='scores'),
        )),
        # additional scalar information
        ('metadata', Struct(
            ('user_id', Scalar(np.int64)),
            ('user_embed', Scalar((np.float32, 2))),
            ('query', Scalar(str)),
        )),
    )

    # This is what the flattened fields for this schema look like, along
    # with its type. Each one of these fields will be stored, read and
    # written as a tensor.
    expected_fields = [
        ('dense', (np.float32, 3)),
        ('floats:lengths', np.int32),
        ('floats:values:keys', np.int32),
        ('floats:values:values', np.float32),
        ('int_lists:lengths', np.int32),
        ('int_lists:values:keys', np.int32),
        ('int_lists:values:values:lengths', np.int32),
        ('int_lists:values:values:values', np.int64),
        ('id_score_pairs:lengths', np.int32),
        ('id_score_pairs:values:keys', np.int32),
        ('id_score_pairs:values:values:lengths', np.int32),
        ('id_score_pairs:values:values:values:ids', np.int64),
        ('id_score_pairs:values:values:values:scores', np.float32),
        ('metadata:user_id', np.int64),
        ('metadata:user_embed', (np.float32, 2)),
        ('metadata:query', str),
    ]
    zipped = zip(expected_fields, schema.field_names(), schema.field_types())
    for (ref_name, ref_type), name, dtype in zipped:
        self.assertEqual(ref_name, name)
        self.assertEqual(np.dtype(ref_type), dtype)

    # 2. The contents of our dataset.
    #
    # Contents as defined below could represent, for example, a log of
    # search queries along with dense, sparse features and metadata.
    # The dataset below has 3 top-level entries.
    contents_raw = [
        # dense
        [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
        # floats
        [1, 2, 3],  # len
        [11, 21, 22, 31, 32, 33],  # key
        [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
        # int lists
        [2, 0, 1],  # len
        [11, 12, 31],  # key
        [2, 4, 3],  # value:len
        [111, 112, 121, 122, 123, 124, 311, 312, 313],  # value:value
        # id score pairs
        [1, 2, 2],  # len
        [11, 21, 22, 31, 32],  # key
        [1, 1, 2, 2, 3],  # value:len
        [111, 211, 221, 222, 311, 312, 321, 322, 323],  # value:ids
        [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3],  # val:score
        # metadata
        [123, 234, 456],  # user_id
        [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]],  # user_embed
        ['dog posts', 'friends who like to', 'posts about ca'],  # query
    ]
    # convert the above content to ndarrays, checking against the schema
    contents = from_blob_list(schema, contents_raw)

    # 3. Creating and appending to the dataset.
    #
    # We first create an empty dataset with the given schema.
    # Then, a Writer is used to append these entries to the dataset.
    ds = dataset.Dataset(schema)
    net = core.Net('init')
    with core.NameScope('init'):
        ds.init_empty(net)

        content_blobs = NewRecord(net, contents)
        FeedRecord(content_blobs, contents)
        writer = ds.writer(init_net=net)
        writer.write_record(net, content_blobs)
    workspace.RunNetOnce(net)

    # 4. Iterating through the dataset contents.
    #
    # If we were to iterate through the top level entries of our dataset,
    # this is what we should expect to see:
    entries_raw = [
        (
            [[1.1, 1.2, 1.3]],  # dense
            [1], [11], [1.1],  # floats
            [2], [11, 12], [2, 4], [111, 112, 121, 122, 123, 124],  # intlst
            [1], [11], [1], [111], [11.1],  # id score pairs
            [123], [[0.2, 0.8]], ['dog posts'],  # metadata
        ),
        (
            [[2.1, 2.2, 2.3]],  # dense
            [2], [21, 22], [2.1, 2.2],  # floats
            [0], [], [], [],  # int list
            [2], [21, 22], [1, 2], [211, 221, 222], [21.1, 22.1, 22.2],
            [234], [[0.5, 0.5]], ['friends who like to'],  # metadata
        ),
        (
            [[3.1, 3.2, 3.3]],  # dense
            [3], [31, 32, 33], [3.1, 3.2, 3.3],  # floats
            [1], [31], [3], [311, 312, 313],  # int lst
            [2], [31, 32], [2, 3], [311, 312, 321, 322, 323],
            [31.1, 31.2, 32.1, 32.2, 32.3],  # id score list
            [456], [[0.7, 0.3]], ['posts about ca'],  # metadata
        ),
        # after the end of the dataset, we will keep getting empty vectors
        ([], ) * 16,
        ([], ) * 16,
    ]
    entries = [from_blob_list(schema, e) for e in entries_raw]

    # Let's go ahead and create the reading nets.
    # We will run `read` net multiple times and assert that we are reading
    # the entries the way we stated above.
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')
    reader = ds.reader(read_init_net)
    should_continue, batch = reader.read_record(read_next_net)

    workspace.RunNetOnce(read_init_net)
    workspace.CreateNet(read_next_net, True)

    for entry in entries:
        workspace.RunNet(str(read_next_net))
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)

    # 5. Reading/writing in a single plan
    #
    # If all of operations on the data are expressible as Caffe2 operators,
    # we don't need to load the data to python, iterating through the
    # dataset in a single Plan.
    #
    # Where we will process the dataset a little and store it in a second
    # dataset. We can reuse the same Reader since it supports reset.
    reset_net = core.Net('reset_net')
    reader.reset(reset_net)
    read_step, batch = reader.execution_step()

    # We will add the line number * 1000 to the feature ids.
    process_net = core.Net('process')
    line_no = Const(process_net, 0, dtype=np.int32)
    const_one = Const(process_net, 1000, dtype=np.int32)
    process_net.Add([line_no, const_one], [line_no])
    field = batch.floats.keys.get()
    process_net.Print(field, [])
    process_net.Add([field, line_no], field, broadcast=1, axis=0)

    # Lets create a second dataset and append to it.
    ds2 = dataset.Dataset(schema, name='dataset2')
    ds2.init_empty(reset_net)
    writer = ds2.writer(reset_net)
    writer.write_record(process_net, batch)
    # commit is not necessary for DatasetWriter but will add it for
    # generality of the example
    commit_net = core.Net('commit')
    writer.commit(commit_net)

    # Time to create and run a plan which will do the processing
    plan = core.Plan('process')
    plan.AddStep(core.execution_step('reset', reset_net))
    plan.AddStep(read_step.AddNet(process_net))
    plan.AddStep(core.execution_step('commit', commit_net))
    workspace.RunPlan(plan)

    # Now we should have dataset2 populated. Undo the per-line offsets so
    # the record can be compared with the original contents.
    ds2_data = FetchRecord(ds2.content())
    field = ds2_data.floats.keys
    field.set(blob=field.get() - [1000, 2000, 2000, 3000, 3000, 3000])
    _assert_records_equal(contents, ds2_data)

    # 6. Slicing a dataset
    #
    # You can create a new schema from pieces of another schema and reuse
    # the same data.
    subschema = Struct(('top_level', schema.int_lists.values))
    int_list_contents = contents.int_lists.values.field_names()
    self.assertEqual(len(subschema.field_names()), len(int_list_contents))

    # 7. Random Access a dataset
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')

    idx = np.array([2, 1, 0])
    indices_blob = Const(read_init_net, idx, name='indices')
    reader = ds.random_reader(read_init_net, indices_blob)
    reader.computeoffset(read_init_net)

    should_stop, batch = reader.read_record(read_next_net)

    workspace.CreateNet(read_init_net, True)
    workspace.RunNetOnce(read_init_net)

    workspace.CreateNet(read_next_net, True)

    for i in range(len(entries)):
        k = idx[i] if i in idx else i
        entry = entries[k]
        workspace.RunNet(str(read_next_net))
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)
    workspace.RunNet(str(read_next_net))
    self.assertEqual(True, workspace.FetchBlob(should_stop))

    # 8. Random Access a dataset with loop_over = true
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')

    idx = np.array([2, 1, 0])
    indices_blob = Const(read_init_net, idx, name='indices')
    reader = ds.random_reader(read_init_net, indices_blob, loop_over=True)
    reader.computeoffset(read_init_net)

    should_stop, batch = reader.read_record(read_next_net)

    workspace.CreateNet(read_init_net, True)
    workspace.RunNetOnce(read_init_net)

    workspace.CreateNet(read_next_net, True)

    for _ in range(len(entries) * 3):
        workspace.RunNet(str(read_next_net))
        self.assertEqual(False, workspace.FetchBlob(should_stop))

    # 9. Sort and shuffle a dataset
    #
    # This sort the dataset using the score of a certain column,
    # and then shuffle within each chunk of size batch_size * shuffle_size
    # before shuffling the chunks.
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')

    reader = ds.random_reader(read_init_net)
    reader.sort_and_shuffle(read_init_net, 'int_lists:lengths', 1, 2)
    reader.computeoffset(read_init_net)

    should_continue, batch = reader.read_record(read_next_net)

    workspace.CreateNet(read_init_net, True)
    workspace.RunNetOnce(read_init_net)

    workspace.CreateNet(read_next_net, True)

    expected_idx = np.array([2, 1, 0])
    for i in range(len(entries)):
        k = expected_idx[i] if i in expected_idx else i
        entry = entries[k]
        workspace.RunNet(str(read_next_net))
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)

    # Trim a dataset
    trim_net = core.Net('trim_ds')
    ds.trim(trim_net, multiple_of=2)
    workspace.RunNetOnce(trim_net)
    trimmed = FetchRecord(ds.content())
    EXPECTED_SIZES = [2, 2, 3, 3, 2, 2, 2, 6, 2, 3, 3, 4, 4, 2, 2, 2]
    actual_sizes = [d.shape[0] for d in trimmed.field_blobs()]
    self.assertEqual(EXPECTED_SIZES, actual_sizes)
def testEqualToCudnn(self):
    """Check that rnn_cell.LSTM and rnn_cell.cudnn_LSTM produce equal results.

    Both models share the same input blob and target. The cuDNN model's
    initial parameters are extracted and copied into the other model, each
    net is run three times (with parameter updates), and the outputs, last
    hidden states and losses are compared with np.allclose.
    """
    with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType)):
        T = 8
        batch_size = 4
        input_dim = 8
        hidden_dim = 31

        def zero_state():
            # Initial hidden/cell state for a single layer.
            return np.zeros([1, batch_size, hidden_dim], dtype=np.float32)

        workspace.FeedBlob(
            "seq_lengths", np.array([T] * batch_size, dtype=np.int32))
        workspace.FeedBlob(
            "target",
            np.zeros([T, batch_size, hidden_dim], dtype=np.float32))
        workspace.FeedBlob("hidden_init", zero_state())
        workspace.FeedBlob("cell_init", zero_state())

        own_model = model_helper.ModelHelper(name="own_lstm")

        input_shape = [T, batch_size, input_dim]
        cudnn_model = model_helper.ModelHelper(name="cudnn_lstm")
        input_blob = cudnn_model.param_init_net.UniformFill(
            [], "input", shape=input_shape)
        workspace.FeedBlob("CUDNN/hidden_init_cudnn", zero_state())
        workspace.FeedBlob("CUDNN/cell_init_cudnn", zero_state())

        cudnn_results = rnn_cell.cudnn_LSTM(
            model=cudnn_model,
            input_blob=input_blob,
            initial_states=("hidden_init_cudnn", "cell_init_cudnn"),
            dim_in=input_dim,
            dim_out=hidden_dim,
            scope="CUDNN",
            return_params=True,
        )
        (cudnn_output, cudnn_last_hidden,
         cudnn_last_state, param_extract) = cudnn_results
        cudnn_dist = cudnn_model.SquaredL2Distance(
            [cudnn_output, "target"], "CUDNN/dist")
        cudnn_loss = cudnn_model.AveragedLoss(cudnn_dist, "CUDNN/loss")

        own_results = rnn_cell.LSTM(
            model=own_model,
            input_blob=input_blob,
            seq_lengths="seq_lengths",
            initial_states=("hidden_init", "cell_init"),
            dim_in=input_dim,
            dim_out=hidden_dim,
            scope="OWN",
            return_params=True,
        )
        (own_output, own_last_hidden, _,
         own_last_state, own_params) = own_results
        own_dist = own_model.SquaredL2Distance(
            [own_output, "target"], "OWN/dist")
        own_loss = own_model.AveragedLoss(own_dist, "OWN/loss")

        # Add gradients
        cudnn_model.AddGradientOperators([cudnn_loss])
        own_model.AddGradientOperators([own_loss])

        # Add parameter updates; both constants are created by the cuDNN
        # model's init net and shared by both models.
        LR = cudnn_model.param_init_net.ConstantFill(
            [], shape=[1], value=0.01)
        ONE = cudnn_model.param_init_net.ConstantFill(
            [], shape=[1], value=1.0)
        for mdl in (cudnn_model, own_model):
            for param in mdl.GetParams():
                mdl.WeightedSum(
                    [param, ONE, mdl.param_to_grad[param], LR], param)

        # Copy states over
        own_model.net.Copy(own_last_hidden, "hidden_init")
        own_model.net.Copy(own_last_state, "cell_init")
        cudnn_model.net.Copy(cudnn_last_hidden, "CUDNN/hidden_init_cudnn")
        cudnn_model.net.Copy(cudnn_last_state, "CUDNN/cell_init_cudnn")

        workspace.RunNetOnce(cudnn_model.param_init_net)
        workspace.CreateNet(cudnn_model.net)

        ##
        ## CUDNN LSTM MODEL EXECUTION
        ##
        # Get initial values from CuDNN LSTM so we can feed them
        # to our own.
        (param_extract_net, param_extract_mapping) = param_extract
        workspace.RunNetOnce(param_extract_net)
        cudnn_lstm_params = {}
        for input_type, pars in viewitems(param_extract_mapping):
            cudnn_lstm_params[input_type] = {
                k: workspace.FetchBlob(v[0]) for k, v in viewitems(pars)
            }

        # Run the model 3 times, so that some parameter updates are done
        workspace.RunNet(cudnn_model.net.Proto().name, 3)

        ##
        ## OWN LSTM MODEL EXECUTION
        ##
        # Map the cuDNN parameters to our own
        workspace.RunNetOnce(own_model.param_init_net)
        rnn_cell.InitFromLSTMParams(own_params, cudnn_lstm_params)

        # Run the model 3 times, so that some parameter updates are done
        workspace.CreateNet(own_model.net)
        workspace.RunNet(own_model.net.Proto().name, 3)

        ##
        ## COMPARE RESULTS
        ##
        # Then compare that final results after 3 runs are equal
        own_values = [
            workspace.FetchBlob(blob)
            for blob in (own_output, own_last_hidden, own_loss)
        ]
        cudnn_values = [
            workspace.FetchBlob(blob)
            for blob in (cudnn_output, cudnn_last_hidden, cudnn_loss)
        ]
        for own_value, cudnn_value in zip(own_values, cudnn_values):
            self.assertTrue(np.allclose(own_value, cudnn_value))
def test_slws_fused_4bit_rowwise(self, seed, num_rows, embedding_dim,
                                 batch_size, max_weight):
    """Compare the onnxified 4-bit rowwise SLWS op against the FakeFP16 op.

    Random embedding data is quantized with
    FloatToFused4BitRowwiseQuantized, then the same quantized table,
    weights, indices and lengths are run through the onnxified prediction
    net and the reference net.  On a mismatch (``np.allclose``) debug
    information is dumped and the test fails.
    """
    workspace.ResetWorkspace()
    np.random.seed(seed)

    data = np.random.rand(num_rows, embedding_dim).astype(np.float32) * 1e-3

    # Segment lengths and row indices are both drawn from [1, num_rows).
    candidates = np.arange(1, num_rows)
    lengths = np.random.choice(candidates, batch_size).astype(np.int32)
    picked_rows = []
    for segment_len in lengths:
        picked_rows.extend(np.random.choice(candidates, segment_len))
    indices = np.asarray(picked_rows).astype(np.int64)

    # Weights are centred around zero: uniform(0, max_weight) - max_weight/2.
    weights = np.random.uniform(
        low=0, high=max_weight,
        size=[len(indices)]).astype(np.float32) - max_weight / 2.0

    op_inputs = ["quantized_data", "weights", "indices", "lengths"]

    def single_op_net(net_name, op_type):
        net = caffe2_pb2.NetDef()
        net.name = net_name
        net.external_input.extend(op_inputs)
        net.external_output.append("Y")
        net.op.add().CopyFrom(
            core.CreateOperator(op_type, op_inputs, ["Y"]))
        return net

    pred_net = single_op_net(
        "pred", "SparseLengthsWeightedSumFused4BitRowwise")
    ref_net = single_op_net(
        "ref", "SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI")

    workspace.FeedBlob("data", data)
    quantize_op = core.CreateOperator(
        "FloatToFused4BitRowwiseQuantized", ["data"], ["quantized_data"])
    workspace.RunOperatorOnce(quantize_op)

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=batch_size,
        max_seq_size=np.max(lengths),
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )

    # The whole prediction net must have been lowered into one Onnxifi op.
    num_onnxified_ops = sum(
        op.type == "Onnxifi" for op in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(ref_net)

    # Both nets write to "Y", so fetch after each run.
    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob('Y')

    workspace.RunNet(ref_net.name)
    Y_c2 = workspace.FetchBlob('Y')

    if np.allclose(Y_c2, Y_glow):
        return

    delta = Y_glow - Y_c2
    print_test_debug_info(
        "slws_fused_4bit_rowwise",
        {
            "seed": seed,
            "indices": indices,
            "data": data.shape,
            "lengths": lengths,
            "weights": weights,
            "Y_c2": Y_c2.shape,
            "Y_glow": Y_glow.shape,
            "diff": delta,
            "rowwise_diff": delta[:, 0]
        })
    assert 0
def run_model(self, devices, gpu):
    '''
    Helper function for test_equiv.

    Builds a data-parallel LSTM model over ``devices`` (CPU devices when
    ``gpu`` is false), forces every ``*Fill`` initialiser to ConstantFill,
    runs ten iterations on seeded random data and returns the fetched
    ``<prefix>_0/partest/i2h_w`` blob.
    '''
    def input_builder_fun(model):
        return None

    def model_build_fun(model, loss_scale):
        seq_lengths = np.array(
            [self.T] * self.batch_per_device, dtype=np.int32)
        workspace.FeedBlob(
            core.ScopedBlobReference("seq_lengths"), seq_lengths)

        state_shape = [1, self.batch_per_device, self.hidden_dim]
        model.param_init_net.ConstantFill(
            [], "hidden_init", value=0.0, shape=state_shape)
        model.param_init_net.ConstantFill(
            [], "cell_init", value=0.0, shape=state_shape)

        lstm_outputs = rnn_cell.LSTM(
            model=model,
            input_blob="data",
            seq_lengths="seq_lengths",
            initial_states=("hidden_init", "cell_init"),
            dim_in=self.input_dim,
            dim_out=self.hidden_dim,
            scope="partest",
        )
        output, _last_hidden, _, _last_state = lstm_outputs

        # A silly loss function
        dist = model.Sub([output, "target"], "dist")
        loss = model.AveragedLoss(dist, "loss")
        scaled_loss = model.Scale(loss, "loss_scaled", scale=loss_scale)
        return [scaled_loss]

    def param_update_fun(model):
        ITER = model.Iter("ITER")
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=(-0.1),
            policy="fixed",
        )
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        for param in model.GetParams():
            model.WeightedSum(
                [param, ONE, model.param_to_grad[param], LR], param)

        # GetParams() should report one device's share of model.params.
        per_device_params = len(model.params) // len(model._devices)
        assert len(model.GetParams()) == per_device_params

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(name="recurrent_test{}".format(devices))

    self.T = 8
    self.batch_size = 64
    self.input_dim = 8
    self.hidden_dim = 31
    self.batch_per_device = self.batch_size // len(devices)

    data_parallel_model.Parallelize(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=devices,
        optimize_gradient_memory=True,
        cpu_device=not gpu,
    )

    # Turn every *Fill initialiser into a ConstantFill so that
    # initialisation is deterministic.
    for init_op in model.param_init_net.Proto().op:
        if init_op.type.endswith('Fill'):
            init_op.type = 'ConstantFill'

    # Every run sees the same input, whatever the number of devices.
    np.random.seed(20150210)
    per_device = self.batch_per_device
    for step in range(10):
        full_data = np.random.rand(
            self.T, self.batch_size, self.input_dim)
        full_target = np.random.rand(
            self.T, self.batch_size, self.hidden_dim)

        for slot, device_id in enumerate(devices):
            # Each device gets its own contiguous slice of the batch axis.
            lo = slot * per_device
            hi = lo + per_device
            data = full_data[:, lo:hi, :].astype(np.float32)
            targets = full_target[:, lo:hi, :].astype(np.float32)

            device_opt = core.DeviceOption(model._device_type, device_id)
            with core.DeviceScope(device_opt):
                workspace.FeedBlob(
                    "{}_{}/data".format(model._device_prefix, device_id),
                    data)
                workspace.FeedBlob(
                    "{}_{}/target".format(model._device_prefix, device_id),
                    targets)

        if step == 0:
            # Inputs exist now; initialise parameters and create the net.
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)

    weight_blob = "{}_0/partest/i2h_w".format(model._device_prefix)
    return workspace.FetchBlob(weight_blob)
def test_slws_fused_4bit_rowwise_all_same(self, seed):
    """Check 4-bit rowwise SLWS on a table whose entries are all equal.

    A 1x2 table filled with one constant is quantized with
    FloatToFused4BitRowwiseQuantized.  A random number of segments (1 to
    5), each of random length (0 to 100), all index row 0 with unit
    weights.  The onnxified prediction net must match the
    SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI reference net
    within the default ``np.allclose`` tolerances.

    :param seed: value passed to ``np.random.seed`` for reproducibility.
    """
    np.random.seed(seed)
    workspace.ResetWorkspace()
    n = 1
    m = 2
    data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1

    max_segments = 5
    max_segment_length = 100
    # number of segments to run
    num_lengths = np.random.randint(1, max_segments + 1)
    lengths = np.random.randint(
        0, max_segment_length + 1, size=num_lengths).astype(np.int32)
    num_indices = np.sum(lengths)
    # Every lookup hits row 0, the only row in the table.
    indices = np.zeros(num_indices, dtype=np.int64)
    # Unit weights.  The earlier random draw was overwritten before use,
    # so it has been dropped; nothing after this point reads the RNG.
    weights = np.ones(len(indices)).astype(np.float32)

    pred_net = caffe2_pb2.NetDef()
    pred_net.name = "pred"
    pred_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    pred_net.external_output.append("Y")
    pred_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused4BitRowwise",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    ref_net = caffe2_pb2.NetDef()
    ref_net.name = "ref"
    ref_net.external_input.extend(
        ["quantized_data", "weights", "indices", "lengths"])
    ref_net.external_output.append("Y")
    ref_net.op.add().CopyFrom(
        core.CreateOperator(
            "SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI",
            ["quantized_data", "weights", "indices", "lengths"],
            ["Y"],
        ))

    workspace.FeedBlob("data", data)
    workspace.RunOperatorOnce(
        core.CreateOperator("FloatToFused4BitRowwiseQuantized",
                            ['data'], ['quantized_data']))

    print("quantized", workspace.FetchBlob("quantized_data"))

    pred_net_onnxified = onnxifi_caffe2_net(
        pred_net,
        {},
        max_batch_size=max_segments,
        max_seq_size=max_segments * max_segment_length,
        debug=True,
        adjust_batch=True,
        use_onnx=False,
    )

    # The whole prediction net must have been lowered into one Onnxifi op.
    num_onnxified_ops = sum(
        1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
    np.testing.assert_equal(num_onnxified_ops, 1)

    workspace.FeedBlob("indices", indices)
    workspace.FeedBlob("lengths", lengths)
    workspace.FeedBlob("weights", weights)

    workspace.CreateNet(pred_net_onnxified)
    workspace.CreateNet(ref_net)

    # Both nets write to "Y", so fetch after each run.
    workspace.RunNet(pred_net_onnxified.name)
    Y_glow = workspace.FetchBlob('Y')

    workspace.RunNet(ref_net.name)
    Y_c2 = workspace.FetchBlob('Y')

    if not np.allclose(Y_c2, Y_glow):
        # Label the dump with this test's own name; it previously reused
        # the label of test_slws_fused_4bit_rowwise.
        print_test_debug_info(
            "slws_fused_4bit_rowwise_all_same",
            {
                "seed": seed,
                "indices": indices,
                "data": data,
                "lengths": lengths,
                "weights": weights,
                "Y_c2": Y_c2,
                "Y_glow": Y_glow,
                "diff": Y_glow - Y_c2,
                "rowwise_diff": (Y_glow - Y_c2)[:, 0]
            })
        # Raise explicitly so the failure survives ``python -O`` and
        # carries a message.
        raise AssertionError(
            "Glow output differs from the Caffe2 reference output")
def run_model(self, V, gpu_devices):
    """Build and run a data-parallel sparse (Gather-based) model on GPUs.

    Three CPU blobs (``self.vecs``) are copied to each GPU, gathered with
    per-device ``indices`` and fed to an FC + SoftmaxWithLoss.  The net is
    run ten times; after every run the length of each gathered-gradient
    blob is asserted to equal the number of indices fed to gpu_0.

    :param V: size of the first dimension of the weight/bias blobs and the
        range the random indices are drawn from.
    :param gpu_devices: list of CUDA device ids to parallelize over.
    """
    def input_builder_fun(model):
        # Inputs are fed directly through the workspace below.
        return None

    def model_build_fun(model, loss_scale):
        gpu_vecs_gathered = []
        gpu_vecs = []
        for num, vec in enumerate(self.vecs):
            gpu_vec = model.param_init_net.CopyCPUToGPU(
                vec, 'gpuvec_{}'.format(num),
            )
            # Only the first two vecs (FC weights and bias) are registered
            # as parameters; index 2 is the FC input.
            if num != 2:
                model.params.append(gpu_vec)
            gpu_vecs.append(gpu_vec)
        for num, gpu_vec in enumerate(gpu_vecs):
            gpu_vec_gathered = model.net.Gather(
                [gpu_vec, 'indices'],
                ['gpu_vec_gathered_{}'.format(num)])
            gpu_vecs_gathered.append(gpu_vec_gathered)

        assert len(gpu_vecs_gathered) == 3

        # FC inputs are ordered (input, weights, bias) = vecs 2, 0, 1.
        fc = model.net.FC(
            [
                gpu_vecs_gathered[2],
                gpu_vecs_gathered[0],
                gpu_vecs_gathered[1],
            ],
            ['fc'],
        )
        _, loss = model.net.SoftmaxWithLoss(
            [fc, 'label'],
            ['ce_loss', 'avg_loss'],
            only_loss=True,
        )
        loss = model.Scale(loss, scale=loss_scale)
        model.net.Print(loss, [], limit=10)
        return [loss]

    def param_update_fun(model):
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        LR = model.CopyCPUToGPU(self.LR, "LR")
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            if not isinstance(param_grad, core.GradientSlice):
                # Dense gradient: in-place weighted sum.
                model.WeightedSum([param, ONE, param_grad, LR], param)
            else:
                # Sparse gradient: scatter only the touched rows.
                # NOTE(review): the slice is weighted by ONE here, not LR
                # -- confirm this is intentional.
                model.net.ScatterWeightedSum(
                    [
                        param,
                        ONE,
                        param_grad.indices,
                        param_grad.values,
                        ONE,
                    ],
                    param,
                )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="sparse_test{}".format(gpu_devices),
    )
    batch_size = 32
    batch_per_device = batch_size // len(gpu_devices)

    # Learning rate, iteration counter and the source vecs are created
    # under the "cpu" name scope with a CPU device option.
    with core.NameScope("cpu"):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.ITER = model.Iter("ITER")
            self.LR = model.net.LearningRate(
                [self.ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            '''
            self.vecs consists of 3 big blobs on which we call Gather:
            1) FC weights, shape=(V, 16)
            2) FC bias, shape=(V)
            3) FC input, shape=(batch_per_device, 16)
            '''
            # NOTE(review): both of the first two fills below use
            # shape=[V, 16], although the note above gives the bias shape
            # as (V); the values fed on the first iteration use (V).
            self.vecs = [
                model.param_init_net.UniformFill(
                    [], "vec_{}".format(num), shape=[V, 16])
                for num in range(2)
            ]
            self.vecs.append(
                model.param_init_net.UniformFill(
                    [],
                    "vec_2",
                    shape=[batch_per_device, 16]))
            self.ONE_CPU = model.param_init_net.ConstantFill(
                [], "ONE_CPU", shape=[1], value=1.0,
            )

    data_parallel_model.Parallelize_GPU(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=gpu_devices,
    )

    # Update the vecs: copy gpu_0's copies of the two parameter vecs back
    # into the CPU blobs.
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
        for num, vec in enumerate(self.vecs[:-1]):
            model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec)

    # Each run has same input, independent of number of gpus
    for i in range(0, 10):
        # Re-seeding every iteration makes the indices identical each run.
        np.random.seed(2603)
        full_indices = np.random.permutation(V)[:batch_size].reshape(
            batch_size)
        full_labels = full_indices[:] % batch_per_device

        for (j, g) in enumerate(gpu_devices):
            # Each device gets its own contiguous slice of the batch.
            st = j * batch_per_device
            en = st + batch_per_device
            indices = full_indices[st:en].astype(np.int32)
            labels = full_labels[st:en].astype(np.int32)

            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                workspace.FeedBlob("gpu_{}/indices".format(g), indices)
                workspace.FeedBlob("gpu_{}/label".format(g), labels)

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            # Force vecs to be same on all runs
            # NOTE(review): the third array is (V, 16) while vec_2 was
            # declared as [batch_per_device, 16] -- confirm intended.
            orig_vecs = [
                np.random.rand(V, 16).astype(np.float32),
                np.random.rand(V).astype(np.float32),
                np.random.rand(V, 16).astype(np.float32),
            ]
            for vec, orig_vec in zip(self.vecs, orig_vecs):
                workspace.FeedBlob(vec, orig_vec)
            for g in gpu_devices:
                for num, orig_vec in enumerate(orig_vecs):
                    workspace.FeedBlob(
                        "gpu_{}/gpuvec_{}".format(g, num),
                        orig_vec,
                        device_option=core.DeviceOption(
                            caffe2_pb2.CUDA, g),
                    )
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)

        # Every gathered-gradient blob must have one entry per index.
        idx = workspace.FetchBlob('gpu_0/indices')
        grad_slices = [
            workspace.FetchBlob('gpu_{}/gpu_vec_gathered_{}_grad'.format(
                g, num)) for g in gpu_devices for num in range(2)
        ]
        for grad_slice in grad_slices:
            # print (len(idx), len(grad_slice))
            assert len(idx) == len(grad_slice), (
                'Number of indices {} is not same as number of gradient '
                'slices {}. This might lead to illegal memory access'.
                format(len(idx), len(grad_slice)))
# Tail of a model-building script: `m` and `fc_1` are defined before this
# fragment.  Add the output activation and loss, render the forward graph,
# add gradient operators, train on random data, then render the graph
# again with the gradient operators included.
pred = m.net.Sigmoid(fc_1, "pred")
softmax, loss = m.net.SoftmaxWithLoss([pred, "label"], ["softmax", "loss"])

# print(m.net.Proto())
# print(m.param_init_net.Proto())

# Save the forward-only model as a graph image.
graph = net_drawer.GetPydotGraph(m.net, rankdir="BT")
graph.write_png("hello.png")

# init, create and run
m.AddGradientOperators([loss])  # add gradient
# print(m.net.Proto())  # observe gradient
workspace.RunNetOnce(m.param_init_net)
workspace.CreateNet(m.net)

for ii in range(100):
    # Fresh random batch: 16 rows of 100 features, integer labels in [0, 10).
    data = np.random.rand(16, 100).astype(np.float32)
    label = (np.random.rand(16) * 10).astype(np.int32)
    workspace.FeedBlob("data", data)
    workspace.FeedBlob("label", label)
    # NOTE(review): the net is looked up by m.name -- presumably the net
    # shares the model helper's name; confirm.
    workspace.RunNet(m.name, 10)  # run for 10 times
    # print("Run: ", ii)

# Save the model again, now including the gradient operators.
graph = net_drawer.GetPydotGraph(m.net, rankdir="BT")
graph.write_png("hello_with_grad.png")
def Train(args):
    """Build, optionally restore, and train a data-parallel ResNet-50 model.

    Reads its whole configuration from ``args``: the device list (``gpus``
    / ``num_gpus``), batch and epoch sizes, sharding and rendezvous
    settings, data sources, dtype and optimizer options, and checkpoint
    paths.  After every epoch the model is saved and the checkpoint of the
    previous epoch is deleted if it exists.

    :param args: parsed command-line namespace.  ``args.epoch_size`` is
        overwritten with the value rounded down to a multiple of the
        global batch size.
    :raises AssertionError: if the batch size is not divisible by the
        number of GPUs, or the epoch size is smaller than the global
        batch size.
    """
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(
        name="resnet50", arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Default to single-shard training.  Without this default, launching
    # under mpirun with exactly one rank left `rendezvous` unbound and the
    # Parallelize call below raised UnboundLocalError.
    rendezvous = None

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                ))

        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)

    # Model building functions
    def create_resnet50_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = resnet.create_resnet50(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                no_bias=True,
                no_loss=True,
            )

        # The loss is computed on float32 predictions.
        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        # Learning-rate step size: 30 epochs' worth of iterations per shard.
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            # TODO: merge with multi-precision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,   # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1)
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                policy="step",
                stepsize=stepsz,
                gamma=0.1)
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    if args.train_data == "null":
        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT])

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet50_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="resnet50_test",
            arg_scope=test_arg_scope,
            init_params=False)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnet50_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    # NOTE(review): this uses args.num_gpus, which can differ from
    # len(gpus) when an explicit --gpus list is given; left unchanged so
    # existing experiment names stay stable -- confirm intended.
    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog)

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name)
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
def testIf(self):
    """Train two linear models selected at run time by an If/Else net.

    A random ``switch`` value picks, on every run, which (W, B) pair is
    used for both the ground-truth and the predicted FC.  After 1000
    training runs each trained parameter must be within 0.01 of its
    ground-truth value.
    """
    W_a_values = [2.0, 1.5]
    B_a_values = [0.5]
    W_b_values = [7.0, 3.5]
    B_b_values = [1.5]

    with NetBuilder(_use_control_ops=True) as init_nb:
        # Trainable parameters.
        W_a = ops.UniformFill([], "W_a", shape=[1, 2], min=-1., max=1.)
        B_a = ops.ConstantFill([], "B_a", shape=[1], value=0.0)
        W_b = ops.UniformFill([], "W_b", shape=[1, 2], min=-1., max=1.)
        B_b = ops.ConstantFill([], "B_b", shape=[1], value=0.0)

        # Ground-truth parameters.
        W_gt_a = ops.GivenTensorFill(
            [], "W_gt_a", shape=[1, 2], values=W_a_values)
        B_gt_a = ops.GivenTensorFill(
            [], "B_gt_a", shape=[1], values=B_a_values)
        W_gt_b = ops.GivenTensorFill(
            [], "W_gt_b", shape=[1, 2], values=W_b_values)
        B_gt_b = ops.GivenTensorFill(
            [], "B_gt_b", shape=[1], values=B_b_values)

    params = [W_gt_a, B_gt_a, W_a, B_a, W_gt_b, B_gt_b, W_b, B_b]

    with NetBuilder(_use_control_ops=True,
                    initial_scope=params) as train_nb:
        Y_pred = ops.ConstantFill([], "Y_pred", shape=[1], value=0.0)
        Y_noise = ops.ConstantFill([], "Y_noise", shape=[1], value=0.0)

        switch = ops.UniformFill(
            [], "switch", shape=[1], min=-1., max=1., run_once=0)
        zero = ops.ConstantFill([], "zero", shape=[1], value=0.0)
        X = ops.GaussianFill(
            [], "X", shape=[4096, 2], mean=0.0, std=1.0, run_once=0)
        noise = ops.GaussianFill(
            [], "noise", shape=[4096, 1], mean=0.0, std=1.0, run_once=0)

        def add_branch(W_gt, B_gt, W, B):
            # Noisy ground truth and prediction for one parameter pair.
            Y_gt = ops.FC([X, W_gt, B_gt], "Y_gt")
            ops.Add([Y_gt, noise], Y_noise)
            ops.FC([X, W, B], Y_pred)

        with ops.IfNet(ops.LT([switch, zero])):
            add_branch(W_gt_a, B_gt_a, W_a, B_a)
        with ops.Else():
            add_branch(W_gt_b, B_gt_b, W_b, B_b)

        dist = ops.SquaredL2Distance([Y_noise, Y_pred], "dist")
        loss = dist.AveragedLoss([], ["loss"])

    assert len(init_nb.get()) == 1, "Expected a single init net produced"
    assert len(train_nb.get()) == 1, "Expected a single train net produced"

    train_net = train_nb.get()[0]
    gradient_map = train_net.AddGradientOperators([loss])

    init_net = init_nb.get()[0]
    ITER = init_net.ConstantFill(
        [], "ITER", shape=[1], value=0, dtype=core.DataType.INT32)
    train_net.Iter(ITER, ITER)
    LR = train_net.LearningRate(ITER, "LR", base_lr=-0.1,
                                policy="step", stepsize=20, gamma=0.9)
    ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)

    # In-place update for every trainable parameter.
    for param in (W_a, B_a, W_b, B_b):
        train_net.WeightedSum([param, ONE, gradient_map[param], LR], param)

    workspace.RunNetOnce(init_net)
    workspace.CreateNet(train_net)

    train_net_name_runs = 1000
    for _epoch in range(train_net_name_runs):
        workspace.RunNet(train_net.Proto().name)

    # To inspect training, fetch and print W_a / B_a / W_b / B_b (and the
    # *_gt_* blobs) from the workspace here.
    values_map = {
        "W_a": W_a_values,
        "B_a": B_a_values,
        "W_b": W_b_values,
        "B_b": B_b_values,
    }

    train_eps = 0.01

    for blob_name, values in values_map.items():
        trained_values = workspace.FetchBlob(blob_name)
        if trained_values.ndim == 2:
            # Weight blobs are [1, 2]; compare their single row.
            self.assertEqual(trained_values.shape[0], 1)
            trained_values = trained_values[0]
        else:
            self.assertEqual(trained_values.ndim, 1)

        self.assertEqual(trained_values.size, len(values))
        for trained, expected in zip(trained_values, values):
            self.assertTrue(abs(trained - expected) < train_eps)
def TrainModel(self):
    """Train the character-level LSTM on ``self.text`` indefinitely.

    The text is split into ``self.batch_size`` blocks, one per batch
    entry.  Each iteration feeds a one-hot encoded window of
    ``self.seq_length`` characters per entry with the next character as
    target, and runs the model net.  Every ``self.iters_to_report``
    iterations throughput is printed, sample text is generated and the
    losses are logged.  The loop has no exit condition, so this method
    does not return normally.
    """
    log.debug("Training model")

    workspace.RunNetOnce(self.model.param_init_net)

    # As though we predict the same probability for each character
    smooth_loss = -np.log(1.0 / self.D) * self.seq_length
    last_n_iter = 0
    last_n_loss = 0.0
    num_iter = 0
    N = len(self.text)

    # We split the text into batch_size pieces. Each piece will be used only
    # by the corresponding batch entry during the training process
    text_block_positions = np.zeros(self.batch_size, dtype=np.int32)
    text_block_size = N // self.batch_size
    text_block_starts = range(0, N, text_block_size)
    text_block_sizes = [text_block_size] * self.batch_size
    # The last block absorbs the remainder of the division.
    text_block_sizes[self.batch_size - 1] += N % self.batch_size
    assert sum(text_block_sizes) == N

    # Writing to output states which will be copied to input
    # states within the loop below
    workspace.FeedBlob(
        self.hidden_output,
        np.zeros([1, self.batch_size, self.hidden_size], dtype=np.float32))
    workspace.FeedBlob(
        self.cell_state,
        np.zeros([1, self.batch_size, self.hidden_size], dtype=np.float32))
    workspace.CreateNet(self.prepare_state)

    # We iterate over the text in a loop many times. Each time we pick a
    # seq_length segment and feed it to the LSTM as a sequence
    last_time = datetime.now()
    progress = 0
    while True:
        workspace.FeedBlob(
            "seq_lengths",
            np.array([self.seq_length] * self.batch_size, dtype=np.int32))
        workspace.RunNet(self.prepare_state.Name())

        input = np.zeros([self.seq_length, self.batch_size,
                          self.D]).astype(np.float32)
        target = np.zeros([self.seq_length * self.batch_size
                           ]).astype(np.int32)

        for e in range(self.batch_size):
            for i in range(self.seq_length):
                pos = text_block_starts[e] + text_block_positions[e]
                # One-hot encode the current character; the target is the
                # character that follows it (wrapping at the end of text).
                input[i][e][self._idx_at_pos(pos)] = 1
                target[i * self.batch_size + e] =\
                    self._idx_at_pos((pos + 1) % N)
                # Advance within this entry's block, wrapping at its end.
                text_block_positions[e] = (text_block_positions[e] +
                                           1) % text_block_sizes[e]
                # Counts characters consumed since the last report.
                progress += 1

        workspace.FeedBlob('input_blob', input)
        workspace.FeedBlob('target', target)

        CreateNetOnce(self.model.net)
        workspace.RunNet(self.model.net.Name())

        num_iter += 1
        last_n_iter += 1

        if num_iter % self.iters_to_report == 0:
            new_time = datetime.now()
            print("Characters Per Second: {}".format(
                int(progress / (new_time - last_time).total_seconds())))
            print("Iterations Per Second: {}".format(
                int(self.iters_to_report /
                    (new_time - last_time).total_seconds())))

            last_time = new_time
            progress = 0

            print("{} Iteration {} {}".format('-' * 10, num_iter, '-' * 10))

        # Scale the fetched loss by the sequence length and fold it into
        # an exponential moving average.
        loss = workspace.FetchBlob(self.loss) * self.seq_length
        smooth_loss = 0.999 * smooth_loss + 0.001 * loss
        last_n_loss += loss

        if num_iter % self.iters_to_report == 0:
            self.GenerateText(500, np.random.choice(self.vocab))

            log.debug("Loss since last report: {}".format(last_n_loss /
                                                          last_n_iter))
            log.debug("Smooth loss: {}".format(smooth_loss))

            last_n_loss = 0.0
            last_n_iter = 0
def from_trainers(cls, trainer, features, actions, normalization_parameters):
    """ Creates DiscreteActionPredictor from a list of action trainers

    Builds a "predictor" net that turns the sparse float-feature inputs
    into a dense matrix, slices and normalizes each feature, concatenates
    them, lets ``trainer`` add the ops producing ``q_values``, and reshapes
    the result into the string-weighted multi-categorical output blobs.

    :param trainer DiscreteActionTrainer
    :param features list of state feature names
    :param actions list of action names
    :param normalization_parameters mapping indexed by feature name, passed
        to the preprocessor for each feature
    :returns an instance of ``cls`` wrapping the created net
    """
    feature_ids = [int(name) for name in features]

    inputs = [
        'input/float_features.lengths', 'input/float_features.keys',
        'input/float_features.values'
    ]
    # Placeholder values so the input blobs exist in the workspace.
    for input_blob, input_dtype in zip(
            inputs, (np.int32, np.int32, np.float32)):
        workspace.FeedBlob(input_blob, np.zeros(1, dtype=input_dtype))

    model = model_helper.ModelHelper(name="predictor")
    net = model.net

    dense_input = net.NextBlob('dense_input')
    workspace.FeedBlob(dense_input, np.zeros(1, dtype=np.float32))

    default_input_value = net.NextBlob('default_input_value')
    workspace.FeedBlob(
        default_input_value, np.array([MISSING_VALUE], dtype=np.float32))
    net.GivenTensorFill(
        [], [default_input_value], shape=[], values=[MISSING_VALUE])

    # Sparse (keys, values, lengths) -> dense matrix, one column per feature.
    net.SparseToDenseMask(
        [
            'input/float_features.keys',
            'input/float_features.values',
            default_input_value,
            'input/float_features.lengths',
        ],
        [dense_input],
        mask=feature_ids,
    )

    # One single-column blob per feature, named after the feature.
    for column, feature in enumerate(features):
        net.Slice(
            [dense_input],
            [feature],
            starts=[0, column],
            ends=[-1, (column + 1)],
        )

    normalizer = PreprocessorNet(net, True)
    parameters = list(normalizer.parameters[:])
    parameters.append(default_input_value)

    zero = "ZERO_from_trainers"
    workspace.FeedBlob(zero, np.array(0))
    parameters.append(zero)

    normalized_input_blobs = []
    for feature in features:
        normalized_blob, extra_parameters = normalizer.preprocess_blob(
            feature,
            normalization_parameters[feature],
        )
        parameters.extend(extra_parameters)
        normalized_input_blobs.append(normalized_blob)

    concatenated_input_blob = "PredictorInput"
    output_dim = "PredictorOutputDim"
    for position, blob in enumerate(normalized_input_blobs):
        logger.info("input# {}: {}".format(position, blob))
    net.Concat(
        normalized_input_blobs,
        [concatenated_input_blob, output_dim],
        axis=1)
    net.NanCheck(concatenated_input_blob, concatenated_input_blob)

    q_values = "q_values"
    workspace.FeedBlob(q_values, np.zeros(1, dtype=np.float32))
    trainer.build_predictor(model, concatenated_input_blob, q_values)
    parameters.extend(model.GetAllParams())

    action_names = net.NextBlob("action_names")
    parameters.append(action_names)
    workspace.FeedBlob(action_names, np.array(actions))

    action_range = net.NextBlob("action_range")
    parameters.append(action_range)
    action_ids = list(range(len(actions)))
    workspace.FeedBlob(action_range, np.array(action_ids))

    output_shape = net.NextBlob("output_shape")
    workspace.FeedBlob(output_shape, np.zeros(1, dtype=np.int64))
    net.Shape([q_values], [output_shape])

    output_shape_row_count = net.NextBlob("output_shape_row_count")
    net.Slice(
        [output_shape], [output_shape_row_count], starts=[0], ends=[1])

    # First column of q_values; used as a shape template for the fills below.
    output_row_shape = net.NextBlob("output_row_shape")
    workspace.FeedBlob(output_row_shape, np.zeros(1, dtype=np.int64))
    net.Slice([q_values], [output_row_shape], starts=[0, 0], ends=[-1, 1])

    output_feature_keys = \
        'output/string_weighted_multi_categorical_features.keys'
    workspace.FeedBlob(output_feature_keys, np.zeros(1, dtype=np.int64))
    output_feature_keys_matrix = net.NextBlob('output_feature_keys_matrix')
    net.ConstantFill(
        [output_row_shape], [output_feature_keys_matrix],
        value=0,
        dtype=caffe2_pb2.TensorProto.INT64)
    net.FlattenToVec(
        [output_feature_keys_matrix],
        [output_feature_keys],
    )

    output_feature_lengths = \
        'output/string_weighted_multi_categorical_features.lengths'
    workspace.FeedBlob(output_feature_lengths, np.zeros(1, dtype=np.int32))
    output_feature_lengths_matrix = net.NextBlob(
        'output_feature_lengths_matrix')
    net.ConstantFill(
        [output_row_shape], [output_feature_lengths_matrix],
        value=1,
        dtype=caffe2_pb2.TensorProto.INT32)
    net.FlattenToVec(
        [output_feature_lengths_matrix],
        [output_feature_lengths],
    )

    output_keys = \
        'output/string_weighted_multi_categorical_features.values.keys'
    workspace.FeedBlob(output_keys, np.array(['a']))
    net.Tile([action_names, output_shape_row_count], [output_keys], axis=1)

    output_lengths_matrix = net.NextBlob('output_lengths_matrix')
    net.ConstantFill(
        [output_row_shape], [output_lengths_matrix],
        value=len(actions),
        dtype=caffe2_pb2.TensorProto.INT32)
    output_lengths = \
        'output/string_weighted_multi_categorical_features.values.lengths'
    workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
    net.FlattenToVec(
        [output_lengths_matrix],
        [output_lengths],
    )

    output_values = \
        'output/string_weighted_multi_categorical_features.values.values'
    workspace.FeedBlob(output_values, np.array([1.0]))
    net.FlattenToVec([q_values], [output_values])

    output_blobs = [
        output_feature_keys,
        output_feature_lengths,
        output_keys,
        output_lengths,
        output_values,
    ]

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(net)
    return cls(
        net, inputs, output_blobs, parameters,
        workspace.CurrentWorkspace())