def test_weight_decay(self):
    """Check that weight decay touches every weight gradient and nothing else.

    Builds a conv -> fc -> fc model with a softmax loss, applies
    ``add_weight_decay`` followed by ``build_sgd``, then scans the net proto
    for the ``WeightedSum`` ops that consume the ``wd_0_0`` blob.
    """
    from caffe2.python import brew
    from caffe2.python.model_helper import ModelHelper

    model = ModelHelper(name="test", arg_scope={'order': 'NCHW'})
    cnv = brew.conv(model, 'data', 'cnv', 32, 32, 4)
    a = brew.fc(model, cnv, 'a', 100, 200)
    pred = brew.fc(model, a, 'b', 200, 5)
    (softmax, loss) = model.SoftmaxWithLoss(
        [pred, 'label'],
        ['softmax', 'loss'],
    )
    model.AddGradientOperators([loss])

    add_weight_decay(model, weight_decay=1e-4)
    build_sgd(model, 0.11)

    # Gradients that still have to show up as the output of a decay op.
    pending_grads = {'b_w_grad', 'a_w_grad', 'cnv_w_grad'}

    # Only the WeightedSum ops fed by the decay blob are of interest: every
    # one of them must write a weight gradient, and none a non-weight blob.
    decay_ops = (
        op for op in model.net.Proto().op
        if op.type == 'WeightedSum' and 'wd_0_0' in op.input
    )
    for decay_op in decay_ops:
        decayed = decay_op.output[0]
        if decayed not in pending_grads:
            print("Unexpected param for weight_decay: {}".format(decayed))
        self.assertTrue(decayed in pending_grads)
        pending_grads.remove(decayed)

    self.assertEqual(
        pending_grads,
        set(),
        "Not all weights were decayed: {}".format(pending_grads))
def test_caffe2_simple_model(self):
    """Convert a small conv/pool/fc caffe2 model to a graph def and compare it.

    The model is only built and handed to ``c2_graph.model_to_graph_def``;
    no net is run in this test. The resulting graph is checked with
    ``compare_proto``.
    """
    model = ModelHelper(name="mnist")
    # how come those inputs don't break the forward pass =.=a
    # NOTE(review): presumably because no net is executed here -- the fed
    # data is (1, 3, 64, 64) while conv1 below declares dim_in=1.
    workspace.FeedBlob("data", np.random.randn(1, 3, 64, 64).astype(np.float32))
    # `np.int` was a deprecated alias of the builtin `int` and has been
    # removed from NumPy; the builtin yields the same dtype.
    workspace.FeedBlob("label", np.random.randn(1, 1000).astype(int))

    with core.NameScope("conv1"):
        conv1 = brew.conv(model, "data", 'conv1', dim_in=1, dim_out=20, kernel=5)
        # Image size: 24 x 24 -> 12 x 12
        pool1 = brew.max_pool(model, conv1, 'pool1', kernel=2, stride=2)
        # Image size: 12 x 12 -> 8 x 8
        conv2 = brew.conv(model, pool1, 'conv2', dim_in=20, dim_out=100, kernel=5)
        # Image size: 8 x 8 -> 4 x 4
        pool2 = brew.max_pool(model, conv2, 'pool2', kernel=2, stride=2)
    with core.NameScope("classifier"):
        # 100 * 4 * 4 stands for dim_out from previous layer multiplied by the image size
        fc3 = brew.fc(model, pool2, 'fc3', dim_in=100 * 4 * 4, dim_out=500)
        # ReLU is applied in place: input and output are both the fc3 blob.
        relu = brew.relu(model, fc3, fc3)
        pred = brew.fc(model, relu, 'pred', 500, 10)
        softmax = brew.softmax(model, pred, 'softmax')
        xent = model.LabelCrossEntropy([softmax, "label"], 'xent')
        # compute the expected loss
        loss = model.AveragedLoss(xent, "loss")

    model.net.RunAllOnMKL()
    model.param_init_net.RunAllOnMKL()
    model.AddGradientOperators([loss], skip=1)

    blob_name_tracker = {}
    graph = c2_graph.model_to_graph_def(
        model,
        blob_name_tracker=blob_name_tracker,
        shapes={},
        show_simplified=False,
    )
    compare_proto(graph, self)
def _createDense(self, dtype=core.DataType.FLOAT):
    """Build a one-layer linear regression model with gradient ops attached.

    :param dtype: ``core.DataType.FLOAT`` selects float32 data and the plain
        ``Initializer``; any other value selects float16 data and
        ``pFP16Initializer``.
    :return: tuple ``(model, perfect_model, data, label)`` where ``label`` is
        ``data`` multiplied by ``perfect_model``, as a column.
    """
    perfect_model = np.array([2, 6, 5, 0, 1]).astype(np.float32)
    np.random.seed(123)  # make test deterministic

    if dtype == core.DataType.FLOAT:
        numpy_dtype, initializer = np.float32, Initializer
    else:
        numpy_dtype, initializer = np.float16, pFP16Initializer

    num_features = perfect_model.size
    data = np.random.randint(2, size=(20, num_features)).astype(numpy_dtype)
    label = np.dot(data, perfect_model)[:, np.newaxis]

    model = ModelHelper(name="test", arg_scope={'order': 'NCHW'})
    out = brew.fc(
        model, 'data', 'fc', num_features, 1,
        ('ConstantFill', {}), ('ConstantFill', {}),
        axis=0,
        WeightInitializer=initializer,
        BiasInitializer=initializer,
    )
    if dtype == core.DataType.FLOAT16:
        # The loss below is fed the float32 copy of the half-precision output.
        out = model.HalfToFloat(out, out + "_fp32")

    squared_dist = model.SquaredL2Distance([out, 'label'])
    avg_loss = model.AveragedLoss(squared_dist, "avg_loss")
    grad_map = model.AddGradientOperators([avg_loss])
    self.assertIsInstance(grad_map['fc_w'], core.BlobReference)
    return (model, perfect_model, data, label)
def test_optimizer_context(self):
    """Check per-parameter optimizers selected through ``UseOptimizer``.

    The conv layer, and the weight and bias of the first fc layer, are
    created under explicit SGD optimizers; the last fc layer is created
    outside any ``UseOptimizer`` scope. The test then verifies which
    gradients receive weight decay and the ``base_lr`` of every
    ``LearningRate`` op.
    """
    from caffe2.python import brew, optimizer
    from caffe2.python.model_helper import ModelHelper

    model = ModelHelper(name="test", arg_scope={'order': 'NCHW'})
    # Read the counter before creating any optimizer so the expected
    # learning-rate blob names can be derived from it.
    count = optimizer._optimizer_instance_count['SgdOptimizer']
    cnv_optim = SgdOptimizer(0.15)
    weight_optim = SgdOptimizer(0.2)
    bias_optim = SgdOptimizer(0.1)

    with UseOptimizer(cnv_optim):
        cnv = brew.conv(model, 'data', 'cnv', 32, 32, 4)
    with UseOptimizer({'WEIGHT': weight_optim, 'BIAS': bias_optim}):
        a = brew.fc(model, cnv, 'a', 100, 200)
    pred = brew.fc(model, a, 'b', 200, 5)
    (softmax, loss) = model.SoftmaxWithLoss(
        [pred, 'label'],
        ['softmax', 'loss'],
    )
    model.AddGradientOperators([loss])

    add_weight_decay(model, weight_decay=1e-4)
    # use the following optimizer if none specified in param_info
    build_sgd(model, 0.11)

    pending_grads = {'b_w_grad', 'a_w_grad', 'cnv_w_grad'}
    expected_learning_rate = {
        "SgdOptimizer_{}_lr_cpu".format(count + offset): -rate
        for offset, rate in enumerate((0.15, 0.2, 0.1, 0.11))
    }

    for op in model.net.Proto().op:
        # Check the proto that all weights are decayed and not non-weights
        # are decayed.
        if op.type == 'WeightedSum' and 'wd_0_0' in op.input:
            decayed = op.output[0]
            if decayed not in pending_grads:
                print("Unexpected param for weight_decay: {}".format(decayed))
            self.assertTrue(decayed in pending_grads)
            pending_grads.remove(decayed)

        # Check the learning rate for each parameter
        if op.type == 'LearningRate':
            base_lrs = [arg.f for arg in op.arg if arg.name == 'base_lr']
            # Falls back to 0 when the op carries no base_lr argument.
            val = base_lrs[-1] if base_lrs else 0
            self.assertAlmostEqual(val, expected_learning_rate[op.output[0]])

    self.assertEqual(
        pending_grads,
        set(),
        "Not all weights were decayed: {}".format(pending_grads)
    )
def createTrainModel(lmdb_path):
    """Create and return a training model, complete with training ops.

    :param lmdb_path: passed as ``db`` to the ``CreateDB`` reader, which is
        created with ``db_type='lmdb'``.
    :return: the ``ModelHelper`` whose init net has been run once and whose
        main net has been created in the workspace.
    """
    train_model = ModelHelper(name='train', arg_scope={'order': 'NCHW'})

    # Input pipeline: an lmdb reader feeding batches of BATCH_SIZE.
    db_reader = train_model.CreateDB(
        'train_reader',
        db=lmdb_path,
        db_type='lmdb',
    )
    AddInputOps(train_model, db_reader, BATCH_SIZE)

    # Forward pass, backward pass, then the parameter-update ops.
    train_model.AddGradientOperators(AddForwardPassOps(train_model))
    AddOptimizerOps(train_model)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)
    return train_model
def main(opt_name):
    """Build a two-layer FC model, attach an optimizer, and run it once.

    :param opt_name: one of ``"manual"``, ``"sgd"``, ``"adagrad"`` or
        ``"adam"``; selects how the parameter-update ops are added.
    :raises AssertionError: if ``opt_name`` is not one of the supported
        names.
    """
    workspace.FeedBlob('input', np.random.randn(2, 16).astype(np.float32))
    workspace.FeedBlob('label', np.array([0, 1]).astype(np.float32))

    helper = ModelHelper("sample_model")
    fc = brew.fc(helper, "input", "fc", dim_in=16, dim_out=8)
    relu = helper.Relu(fc, 'relu')
    fc2 = brew.fc(helper, relu, "fc2", dim_in=8, dim_out=1)
    label_ex = helper.ExpandDims("label", "label_ex", dims=[1])
    xent = helper.SigmoidCrossEntropyWithLogits([fc2, label_ex], 'xent')
    loss = helper.AveragedLoss(xent, 'loss')
    helper.AddGradientOperators([loss])

    if opt_name == "manual":
        # Hand-written SGD step: param <- 1.0 * param + (-0.03) * grad.
        ONE = helper.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0)
        LR = helper.param_init_net.ConstantFill(
            [], "LR", shape=[1], value=-0.03)
        for param in helper.params:
            param_grad = helper.param_to_grad[param]
            helper.WeightedSum([param, ONE, param_grad, LR], param)
    elif opt_name == "sgd":
        optimizer.build_sgd(helper, 0.03)
    elif opt_name == "adagrad":
        optimizer.build_adagrad(helper, 0.03)
    # caffe2 does not support rowwise adagrad for dense parameters
    # caffe2 seems not have lamb support yet
    elif opt_name == "adam":
        optimizer.build_adam(helper, 0.03)
    else:
        # Raised explicitly rather than via `assert False`: an assert is
        # stripped under `python -O`, which would let the net run with no
        # update ops at all. The exception type is unchanged.
        raise AssertionError(f"Unsupported optimizer {opt_name}")

    workspace.RunNetOnce(helper.param_init_net)
    workspace.RunNetOnce(helper.net)

    # NOTE(review): hard breakpoint kept from the original -- presumably
    # intentional so the workspace can be inspected after the run; confirm
    # before using this function non-interactively.
    import pdb
    pdb.set_trace()
def test_multiple_optimizers(self):
    """Check how learning-rate blobs are shared between optimizer instances.

    Three fc layers are each updated by a different optimizer instance
    (two SGD instances and one Adagrad). For every layer the weight and the
    bias must use the same learning-rate blob, and that blob must differ
    from the ones used by the previously applied optimizers.
    """
    from caffe2.python import brew, core, optimizer
    from caffe2.python.model_helper import ModelHelper

    model = ModelHelper(name="test")
    fc1 = brew.fc(model, 'data', 'fc1', 100, 50)
    fc2 = brew.fc(model, fc1, 'fc2', 50, 25)
    pred = brew.fc(model, fc2, 'fc3', 25, 10)
    (softmax, loss) = model.SoftmaxWithLoss(
        [pred, 'label'],
        ['softmax', 'loss'],
    )
    model.AddGradientOperators([loss])

    param_to_device = optimizer._get_param_to_device(model)

    def infer_blob_device(blob_name):
        return optimizer.get_param_device(
            blob_name, "{}_grad".format(blob_name), param_to_device
        )

    def apply_optimizer(optim, layer):
        # Apply `optim` to the weight, then the bias, of `layer`.
        for suffix in ('w', 'b'):
            param = "{}_{}".format(layer, suffix)
            with core.DeviceScope(infer_blob_device(param)):
                optim(model.net, model.param_init_net,
                      param, "{}_grad".format(param))

    def collect_lr_blobs(op_type, layer):
        # Fourth input of every `op_type` op that updates a parameter of
        # `layer`. The membership test keeps the op-type filter applied to
        # both parameters; the previous `A and B or C` form parsed as
        # `(A and B) or C` and so skipped the type check for the bias.
        params = ("{}_w".format(layer), "{}_b".format(layer))
        return [
            op.input[3]
            for op in model.net.Proto().op
            if op.type == op_type and op.input[0] in params
        ]

    sgd_1 = optimizer.SgdOptimizer(base_learning_rate=0.1)
    sgd_2 = optimizer.SgdOptimizer(base_learning_rate=0.2)
    adagrad = optimizer.AdagradOptimizer()

    # Check same optimizer share the same learning rate.
    apply_optimizer(sgd_1, 'fc1')
    fc1_lr_blobs = collect_lr_blobs('WeightedSum', 'fc1')
    self.assertEqual(fc1_lr_blobs[0], fc1_lr_blobs[1])

    # Check different instance of the same optimizer has a different lr.
    apply_optimizer(sgd_2, 'fc2')
    fc2_lr_blobs = collect_lr_blobs('WeightedSum', 'fc2')
    for lr_blob in fc2_lr_blobs:
        self.assertNotIn(lr_blob, fc1_lr_blobs)
    self.assertEqual(fc2_lr_blobs[0], fc2_lr_blobs[1])

    # Check different optimizer type case
    apply_optimizer(adagrad, 'fc3')
    fc3_lr_blobs = collect_lr_blobs('Adagrad', 'fc3')
    for lr_blob in fc3_lr_blobs:
        self.assertNotIn(lr_blob, fc2_lr_blobs)
        self.assertNotIn(lr_blob, fc1_lr_blobs)
    self.assertEqual(fc3_lr_blobs[0], fc3_lr_blobs[1])
def test_convolution_sync(self, net_type, num_workers, engine, gc, dc):
    """Check that conv gradients are identical across repeated runs.

    A binary tree of 3x3 convolutions (summed pairwise at each node) is
    built with constant-filled weights, run several times on identically
    seeded inputs, and the input gradients of every run are compared with
    those of the first run and with a fixed sum of squares.
    """
    m = ModelHelper(name="test_model")
    batch = 1
    channels = 2
    tree_depth = 3
    repeats = 5
    height = 5
    width = 5

    workspace.ResetWorkspace()

    use_cudnn = (engine == 'CUDNN')

    np.random.seed(1701)
    # Build a binary tree of conv layers, summing at each node.
    # Settings common to both convolutions of a node.
    shared_conv_args = dict(
        dim_in=channels,
        dim_out=channels,
        kernel=3,
        stride=1,
        pad=1,
        deterministic=1,
        use_cudnn=use_cudnn,
        engine=engine,
    )
    for level in reversed(range(tree_depth)):
        for node in range(2 ** level):
            left_idx = 2 * node
            right_idx = 2 * node + 1
            bottom_1 = "{}_{}".format(level + 1, left_idx)
            bottom_2 = "{}_{}".format(level + 1, right_idx)
            mid_1 = "{}_{}_m".format(level + 1, left_idx)
            mid_2 = "{}_{}_m".format(level + 1, right_idx)
            top = "{}_{}".format(level, node)
            # The random draws happen in a fixed order: four fill values,
            # then one cudnn_state per convolution.
            w1, b1, w2, b2 = np.random.randn(4).tolist()
            brew.conv(
                m, bottom_1, mid_1,
                weight_init=('ConstantFill', dict(value=w1)),
                bias_init=('ConstantFill', dict(value=b1)),
                cudnn_state=np.random.randint(0, 3),
                **shared_conv_args
            )
            brew.conv(
                m, bottom_2, mid_2,
                weight_init=('ConstantFill', dict(value=w2)),
                bias_init=('ConstantFill', dict(value=b2)),
                cudnn_state=np.random.randint(0, 3),
                **shared_conv_args
            )
            m.net.Sum([mid_1, mid_2], top)

    m.net.Flatten(["0_0"], ["0_0_flat"])
    m.net.SquaredL2Distance(["0_0_flat", "label"], "xent")
    m.net.AveragedLoss("xent", "loss")
    input_to_grad = m.AddGradientOperators(["loss"])
    m.Proto().device_option.CopyFrom(gc)
    m.param_init_net.Proto().device_option.CopyFrom(gc)
    m.Proto().type = net_type
    m.Proto().num_workers = num_workers
    self.ws.run(m.param_init_net)

    def run():
        import numpy as np
        # Reseed so every invocation feeds the same inputs and label.
        np.random.seed(1701)
        input_blobs = [
            "{}_{}".format(tree_depth, leaf)
            for leaf in range(2 ** tree_depth)
        ]
        for input_blob in input_blobs:
            leaf_data = np.random.randn(
                batch, channels, height, width).astype(np.float32)
            self.ws.create_blob(input_blob).feed(
                leaf_data, device_option=gc)
        label_data = np.random.randn(
            batch, channels * height * width).astype(np.float32)
        self.ws.create_blob("label").feed(label_data, device_option=gc)
        self.ws.run(m.net)
        return [
            self.ws.blobs[str(input_to_grad[input_blob])].fetch()
            for input_blob in input_blobs
        ]

    outputs = [run() for _ in range(repeats)]
    reference = outputs[0]
    for output in outputs[1:]:
        np.testing.assert_array_equal(reference, output)
        np.testing.assert_allclose(
            np.sum(np.square(output)),
            1763719461732352.0,
            rtol=1e-5)
class MLTrainer:
    """
    This is meant to be a generic neural net trainer. It trains on
    minibatches; the optimizer, learning rate and learning-rate policy are
    taken from the ``TrainingParameters`` given at construction.

    Two caffe2 models are kept side by side over the same weight and bias
    blobs: a score model used by :meth:`score` and a train model used by
    :meth:`train_batch`.
    """

    def __init__(
        self,
        name: str,
        parameters: "TrainingParameters",
    ) -> None:
        """
        :param name: A unique name for this trainer used to create the data on
            the caffe2 workspace
        :param parameters: The set of training parameters
        :raises ValueError: if ``parameters.layers`` and
            ``parameters.activations`` are inconsistent.
        """
        self.model_id = name

        self.optimizer = parameters.optimizer
        self.layers = parameters.layers
        self.activations = parameters.activations
        self.learning_rate = parameters.learning_rate
        self.gamma = parameters.gamma
        self.lr_policy = parameters.lr_policy
        self.dropout_ratio = parameters.dropout_ratio

        self._validate_inputs()
        self._setup_initial_blobs()

        self._build_fwd_pass_score_model()

        self._build_fwd_pass_train_model()
        self._generate_train_model_loss()
        self._update_train_model()

        workspace.RunNetOnce(self.score_model.param_init_net)
        workspace.CreateNet(self.score_model.net)

        workspace.RunNetOnce(self.train_model.param_init_net)
        workspace.CreateNet(self.train_model.net)

    def _validate_inputs(self) -> None:
        """
        Check that ``layers`` and ``activations`` describe a consistent net.

        :raises ValueError: if there is not exactly one activation per pair
            of consecutive layers, or if any layer size is not a positive
            integer value.
        """
        num_layers = len(self.layers)
        num_activations = len(self.activations)
        if num_activations != num_layers - 1:
            raise ValueError(
                "Incompatible input `layers` and `activations` sizes.")
        if not all(x > 0 and int(x) == x for x in self.layers):
            raise ValueError(
                "All values in `layers` should be positive integers.")

    def _generate_train_model_loss(self) -> None:
        """Add the loss ops to the train model via ``GenerateLossOps``."""
        GenerateLossOps(self.train_model, self.model_id + "_train",
                        self.output_blob, self.labels_blob, self.loss_blob)

    def _build_fwd_pass_train_model(self) -> None:
        """Add the forward-pass ops to the train model."""
        # The labels blob is passed through StopGradient in place before
        # the forward pass is built.
        self.train_model.StopGradient(self.labels_blob, self.labels_blob)
        MakeForwardPassOps(self.train_model, self.model_id + "_train",
                           self.input_blob, self.output_blob, self.weights,
                           self.biases, self.activations, self.layers,
                           self.dropout_ratio, False)

    def _build_fwd_pass_score_model(self) -> None:
        """Add the forward-pass ops to the score model."""
        MakeForwardPassOps(self.score_model, self.model_id + "_score",
                           self.input_blob, self.output_blob, self.weights,
                           self.biases, self.activations, self.layers,
                           self.dropout_ratio, True)

    def _setup_initial_blobs(self) -> None:
        """Create both models and feed all initial blobs into the workspace."""
        # Define models
        self.score_model = ModelHelper(name="score_" + self.model_id)
        self.train_model = ModelHelper(name="train_" + self.model_id)

        # Create input, output, labels, and loss blobs
        self.input_blob = "ModelInput_" + self.model_id
        workspace.FeedBlob(self.input_blob, np.zeros(1, dtype=np.float32))

        self.output_blob = "ModelOutput_" + self.model_id
        workspace.FeedBlob(self.output_blob, np.zeros(1, dtype=np.float32))

        self.labels_blob = "ModelLabels_" + self.model_id
        workspace.FeedBlob(self.labels_blob, np.zeros(1, dtype=np.float32))

        # The loss blob name does not include the model id.
        self.loss_blob = "loss"  # "ModelLoss_" + self.model_id
        workspace.FeedBlob(self.loss_blob, np.zeros(1, dtype=np.float32))

        # Create blobs for model parameters
        self.weights: List[str] = []
        self.biases: List[str] = []

        for x in range(len(self.layers) - 1):
            dim_in = self.layers[x]
            dim_out = self.layers[x + 1]

            weight_name = "Weights_" + str(x) + "_" + self.model_id
            bias_name = "Biases_" + str(x) + "_" + self.model_id
            self.weights.append(weight_name)
            self.biases.append(bias_name)

            bias = np.zeros(shape=[dim_out, ], dtype=np.float32)
            workspace.FeedBlob(bias_name, bias)

            # Weights are drawn from a zero-mean normal whose standard
            # deviation is gain * sqrt(1 / dim_in); gain is sqrt(2) for
            # 'relu' layers and 1 otherwise.
            gain = np.sqrt(2) if self.activations[x] == 'relu' else 1
            weights = scipy.stats.norm(0, gain * np.sqrt(1 / dim_in)).rvs(
                size=[dim_out, dim_in]).astype(np.float32)
            workspace.FeedBlob(weight_name, weights)

    def _update_train_model(self) -> None:
        """Add gradient, NaN-check and parameter-update ops to the train model."""
        self.train_model.AddGradientOperators([self.loss_blob])
        for param in self.train_model.params:
            param_grad = self.train_model.param_to_grad[param]
            # Each parameter gradient goes through a NanCheck op in place.
            self.train_model.net.NanCheck([param_grad], [param_grad])
        AddParameterUpdateOps(
            self.train_model,
            optimizer_input=self.optimizer,
            base_learning_rate=self.learning_rate,
            gamma=self.gamma,
            policy=self.lr_policy,
        )

    def build_predictor(self, model, input_blob, output_blob) -> List[str]:
        """
        Add this trainer's forward pass to ``model`` in test mode.

        :param model: model that receives the forward-pass ops.
        :param input_blob: name of the blob the forward pass reads from.
        :param output_blob: name of the blob the forward pass writes to.
        :return: the names of all weight blobs followed by all bias blobs.
        """
        MakeForwardPassOps(model, self.model_id + "_score", input_blob,
                           output_blob, self.weights, self.biases,
                           self.activations, self.layers, self.dropout_ratio,
                           is_test=True)
        return self.weights + self.biases

    def score(self, inputs: np.ndarray) -> np.ndarray:
        """
        Runs the net on a set of data and returns the outputs.

        :param inputs: Numpy array containing examples to score.
        """
        workspace.FeedBlob(self.input_blob, inputs)
        workspace.RunNet(self.score_model.net)
        return workspace.FetchBlob(self.output_blob)

    def train_batch(self, inputs: np.ndarray, labels: np.ndarray) -> None:
        """
        Trains net on inputs and labels. Please ensure that inputs are
        batched to an appropriate size and are shuffled.

        :param inputs: Numpy array containing training examples.
        :param labels: Numpy array containing training labels.
        """
        workspace.FeedBlob(self.input_blob, inputs)
        workspace.FeedBlob(self.labels_blob, labels)
        workspace.RunNet(self.train_model.net)

    @property
    def output(self) -> np.ndarray:
        """Current contents of the output blob in the workspace."""
        return workspace.FetchBlob(self.output_blob)

    @property
    def loss(self) -> np.ndarray:
        """Current contents of the loss blob in the workspace."""
        return workspace.FetchBlob(self.loss_blob)
print("\n************* Init Net *************")
print(regression_model.param_init_net.Proto())


# #### Add the training operators and prime the workspace
#
# In this **very important** step, we specify the loss function, set up the SGD training algorithm, prime and initialize the workspace, and initialize our model's weights and biases.

# In[5]:


# The loss function is computed by a squared L2 distance,
# and then averaged over all items.
dist = regression_model.SquaredL2Distance(['Y_gt', y_pred], "dist")
loss = regression_model.AveragedLoss(dist, "loss")

# Add the gradient operators and set up the SGD algorithm
regression_model.AddGradientOperators([loss])
optimizer.build_sgd(regression_model, base_learning_rate=learning_rate)

# Prime the workspace with some data
workspace.FeedBlob("Y_gt", Y_gt.astype(np.float32))
workspace.FeedBlob("X", X.astype(np.float32))

# Run the init net to prepare the workspace then create the net
workspace.RunNetOnce(regression_model.param_init_net)
workspace.CreateNet(regression_model.net)

# Inject our desired initial weights and bias.
# This is done after the init net has run, so these values replace whatever
# the init net wrote into the "y_pred_w" and "y_pred_b" blobs.
workspace.FeedBlob("y_pred_w", np.array([initial_weights]).astype(np.float32))
workspace.FeedBlob("y_pred_b", np.array([0.]).astype(np.float32))


# #### Run the training