def AddParameterUpdateOps(
    model,
    optimizer_input="SGD",
    base_learning_rate=0.01,
    *args,
    **kwargs
):
    if optimizer_input not in OPTIMIZER_DICT:
        raise Exception(
            "Optimizer {} unknown. Valid choices are {}"
            .format(optimizer_input, ', '.join(OPTIMIZER_DICT.keys()))
        )
    optimizer_rule = OPTIMIZER_DICT[optimizer_input]

    if optimizer_rule == GRAD_OPTIMIZER.SGD:
        build_sgd(
            model,
            base_learning_rate,
            gamma=kwargs['gamma'],
            policy=kwargs['policy'],
            stepsize=1
        )
    elif optimizer_rule == GRAD_OPTIMIZER.ADAGRAD:
        build_adagrad(model, base_learning_rate)
    elif optimizer_rule == GRAD_OPTIMIZER.ADAM:
        build_adam(model, base_learning_rate)
    elif optimizer_rule == GRAD_OPTIMIZER.FTRL:
        build_ftrl(model, base_learning_rate)
    else:
        print(
            "Unrecognized in caffe2 setting, using default SGD",
            optimizer_rule
        )
        build_sgd(model, base_learning_rate)
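# Hedged usage sketch for AddParameterUpdateOps above. It assumes
# OPTIMIZER_DICT/GRAD_OPTIMIZER and the caffe2 build_* helpers are available
# as in the snippet; the blob names, network name, and hyperparameter values
# below are illustrative only.
from caffe2.python import brew, model_helper, workspace
import numpy as np

demo = model_helper.ModelHelper(name="param_update_demo")
fc_out = brew.fc(demo, "data", "fc_out", dim_in=4, dim_out=1)
dist = demo.net.SquaredL2Distance([fc_out, "label"], "dist")
demo_loss = demo.AveragedLoss(dist, "demo_loss")
demo.AddGradientOperators([demo_loss])

# The SGD branch reads 'gamma' and 'policy' from kwargs, so both must be
# supplied when optimizer_input == "SGD".
AddParameterUpdateOps(
    demo,
    optimizer_input="SGD",
    base_learning_rate=0.01,
    gamma=0.999,
    policy="step",
)

workspace.FeedBlob("data", np.random.randn(2, 4).astype(np.float32))
workspace.FeedBlob("label", np.zeros((2, 1), dtype=np.float32))
workspace.RunNetOnce(demo.param_init_net)
workspace.RunNetOnce(demo.net)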
def addParameterUpdateOps(self, model):
    if self.optimizer not in OPTIMIZER_DICT:
        raise Exception(
            "Optimizer {} unknown. Valid choices are {}".format(
                self.optimizer, ", ".join(OPTIMIZER_DICT.keys())))
    optimizer_rule = OPTIMIZER_DICT[self.optimizer]

    if optimizer_rule == GRAD_OPTIMIZER.SGD:
        build_sgd(
            model,
            self.learning_rate,
            gamma=self.lr_decay,
            policy=self.lr_policy,
            stepsize=1,
        )
    elif optimizer_rule == GRAD_OPTIMIZER.ADAGRAD:
        build_adagrad(model, self.learning_rate)
    elif optimizer_rule == GRAD_OPTIMIZER.ADAM:
        build_adam(model, self.learning_rate)
    elif optimizer_rule == GRAD_OPTIMIZER.FTRL:
        build_ftrl(model, self.learning_rate)
    else:
        print(
            "Unrecognized in caffe2 setting, using default SGD",
            optimizer_rule
        )
        build_sgd(model, self.learning_rate)
def AddOptimizerOps_adam(model, base_learning_rate):
    # Use Adam as the optimization function
    optimizer.build_adam(
        model,
        base_learning_rate=base_learning_rate
        # policy="step",
        # momentum=0.9,
        # weight_decay=0.004
    )
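# Hedged usage sketch for AddOptimizerOps_adam. Assumes `optimizer` refers to
# caffe2.python.optimizer and that the model's gradient operators have already
# been added; the model name and learning rate are illustrative.
from caffe2.python import model_helper

adam_demo = model_helper.ModelHelper(name="adam_demo")
# ... forward pass and adam_demo.AddGradientOperators([loss]) go here ...
AddOptimizerOps_adam(adam_demo, base_learning_rate=0.001)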
def build_optimizer(self, model, **kwargs):
    self._skip_gpu = False
    kwargs['beta1'] = 0.0
    return build_adam(
        model,
        base_learning_rate=0.1,
        use_smart_decay=True,
        **kwargs
    )
def add_training_operators(output_segmentation, model, device_opts):
    with core.DeviceScope(device_opts):
        loss = model.SigmoidCrossEntropyWithLogits(
            [output_segmentation, "gt_segmentation"], 'loss')
        avg_loss = model.AveragedLoss(loss, "avg_loss")
        model.AddGradientOperators([loss])
        opt = optimizer.build_adam(model, base_learning_rate=0.01)
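# Hedged wiring sketch for add_training_operators above. The blob names
# ("data", "gt_segmentation"), the 1x1 conv head, and the CPU device option
# are illustrative assumptions, not taken from the original snippet.
from caffe2.python import brew, core, model_helper, optimizer
from caffe2.proto import caffe2_pb2

seg_model = model_helper.ModelHelper(name="seg_demo")
cpu_opts = core.DeviceOption(caffe2_pb2.CPU, 0)
with core.DeviceScope(cpu_opts):
    # stand-in segmentation head: one logit per pixel
    logits = brew.conv(seg_model, "data", "logits",
                       dim_in=3, dim_out=1, kernel=1)
add_training_operators(logits, seg_model, cpu_opts)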
def main(opt_name):
    workspace.FeedBlob('input', np.random.randn(2, 16).astype(np.float32))
    workspace.FeedBlob('label', np.array([0, 1]).astype(np.float32))

    helper = ModelHelper("sample_model")
    fc = brew.fc(helper, "input", "fc", dim_in=16, dim_out=8)
    relu = helper.Relu(fc, 'relu')
    fc2 = brew.fc(helper, relu, "fc2", dim_in=8, dim_out=1)
    label_ex = helper.ExpandDims("label", "label_ex", dims=[1])
    xent = helper.SigmoidCrossEntropyWithLogits([fc2, label_ex], 'xent')
    loss = helper.AveragedLoss(xent, 'loss')
    helper.AddGradientOperators([loss])

    if opt_name == "manual":
        ONE = helper.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
        LR = helper.param_init_net.ConstantFill([], "LR", shape=[1], value=-0.03)
        for param in helper.params:
            param_grad = helper.param_to_grad[param]
            helper.WeightedSum([param, ONE, param_grad, LR], param)
    elif opt_name == "sgd":
        optimizer.build_sgd(helper, 0.03)
    elif opt_name == "adagrad":
        # caffe2 does not support rowwise adagrad for dense parameters
        optimizer.build_adagrad(helper, 0.03)
    # caffe2 does not seem to have LAMB support yet
    elif opt_name == "adam":
        optimizer.build_adam(helper, 0.03)
    else:
        assert False, f"Unsupported optimizer {opt_name}"

    workspace.RunNetOnce(helper.param_init_net)
    workspace.RunNetOnce(helper.net)

    import pdb
    pdb.set_trace()
def build_optimizer(self, model, **kwargs):
    self._skip_gpu = False
    return build_adam(model, base_learning_rate=0.1, **kwargs)
def build_net(
        self,
        base_learning_rate=0.1  # base_learning_rate * seq_size
):
    log.debug('>>> Building Mask-RNN')
    model = model_helper.ModelHelper(name="mask_rnn")

    hidden_init = model.net.AddExternalInputs('hidden_init', )

    # TODO: do I still need this?
    model.net.AddExternalInputs(
        'input_blob',
        'seq_lengths',
        'target',
    )

    # Add external inputs (read directly from the database)
    # the dimension of class_target_mask: [BATCH_SIZE, SEQ_LEN, 1]
    # the dimension of regre_target_mask: [BATCH_SIZE, SEQ_LEN, regre_output_dim]
    (seq_lengths, _input_blob, _class_target, _regre_target,
     _class_target_mask, _regre_target_mask) = build_input_reader(
        model, self.db_name, 'minidb', [
            'seq_lengths', 'input_blob_batch_first',
            'class_target_batch_first', 'regre_target_batch_first',
            'class_target_mask_batch_first', 'regre_target_mask_batch_first'
        ],
        batch_size=self.batch_size, data_type='train')

    # In order to put into batches, the input_blob is
    # [BATCH_SIZE, SEQ_LEN, INPUT_DIM]
    # i.e. the first dim is the batch size
    # However the required input dim is:
    # [SEQ_LEN, BATCH_SIZE, INPUT_DIM]
    input_blob = model.net.Transpose(
        [_input_blob], 'input_blob', axes=[1, 0, 2])
    class_target = model.net.Transpose(
        [_class_target], 'class_target', axes=[1, 0, 2])
    regre_target = model.net.Transpose(
        [_regre_target], 'regre_target', axes=[1, 0, 2])
    class_target_mask = model.net.Transpose(
        [_class_target_mask], 'class_target_mask', axes=[1, 0, 2])
    regre_target_mask = model.net.Transpose(
        [_regre_target_mask], 'regre_target_mask', axes=[1, 0, 2])

    hidden_output_all, self.hidden_output = MaskGRU(
        model, input_blob, seq_lengths, (hidden_init, ),
        self.input_dim, self.hidden_size, scope="MaskRNN")

    # axis is 2 as first two are T (time) and N (batch size)
    # multi-task learning: regression
    regre_output = brew.fc(
        model,
        hidden_output_all,
        None,
        dim_in=self.hidden_size,
        dim_out=self.regre_output_dim,
        axis=2)
    # multi-task learning: classification
    class_output = brew.fc(
        model,
        hidden_output_all,
        None,
        dim_in=self.hidden_size,
        dim_out=self.class_output_dim,
        axis=2)
    # softmax head for testing only
    class_softmax_output = model.net.Softmax(
        class_output, 'class_softmax_output', axis=2)

    # Get the predict net
    (self.net_store['predict'],
     self.external_inputs) = model_helper.ExtractPredictorNet(
        model.net.Proto(),
        [input_blob, seq_lengths, hidden_init],
        [class_softmax_output, regre_output],
    )

    # Then, we add loss and gradient ops.
    # We treat them as one big batch of size T * N
    # we use the logit of classification head
    # class_output_reshaped, _ = model.net.Reshape(
    #     class_output, ['class_output_reshaped', '_class_output_shape'],
    #     shape=[-1, self.class_output_dim])
    class_softmax_output_reshaped, _ = model.net.Reshape(
        class_softmax_output,
        ['class_softmax_output_reshaped', '_class_output_shape'],
        shape=[-1, self.class_output_dim])
    regre_output_reshaped, _ = model.net.Reshape(
        regre_output, ['regre_output_reshaped', '_regre_output_shape'],
        shape=[-1, self.regre_output_dim])
    class_target_reshaped, _ = model.net.Reshape(
        class_target, ['class_target_reshaped', '_class_target_shape'],
        shape=[-1, self.class_output_dim])
    regre_target_reshaped, _ = model.net.Reshape(
        regre_target, ['regre_target_reshaped', '_regre_target_shape'],
        shape=[-1, self.regre_output_dim])
    class_target_mask_reshaped, _ = model.net.Reshape(
        class_target_mask,
        ['class_target_mask_reshaped', '_class_target_mask_shape'],
        shape=[-1, 1])
    regre_target_mask_reshaped, _ = model.net.Reshape(
        regre_target_mask,
        ['regre_target_mask_reshaped', '_regre_target_mask_shape'],
        shape=[-1, self.regre_output_dim])

    # stop gradient to label and mask
    class_target_reshaped = model.net.StopGradient(
        class_target_reshaped, 'stopped_class_target_reshaped')
    regre_target_reshaped = model.net.StopGradient(
        regre_target_reshaped, 'stopped_regre_target_reshaped')
    class_target_mask_reshaped = model.net.StopGradient(
        class_target_mask_reshaped, 'stopped_class_target_mask_reshaped')
    regre_target_mask_reshaped = model.net.StopGradient(
        regre_target_mask_reshaped, 'stopped_regre_target_mask_reshaped')

    # model.net.Print([class_output_reshaped], 'print', to_file=0)

    # classification error
    # combined softmax and log likelihood for numerical stability
    # weighted by class_target_mask_reshaped
    #
    # _, class_average_loss = model.net.SoftmaxWithLoss(
    #     [class_output_reshaped, class_target_reshaped,
    #      class_target_mask_reshaped],
    #     ['_train_softmax_ouput', 'class_average_loss'], label_prob=1
    # )
    class_l2_dist = model.net.SquaredL2Distance(
        [class_softmax_output_reshaped, class_target_reshaped],
        'class_l2_dist')
    class_target_mask_reshaped = model.net.Squeeze(
        class_target_mask_reshaped, 'squeezed_class_target_mask', dims=[1])
    masked_class_l2_dist = model.net.Mul(
        [class_target_mask_reshaped, class_l2_dist], 'masked_class_l2_dist')
    class_average_loss = model.net.AveragedLoss(
        masked_class_l2_dist, 'class_average_loss')

    # regression error
    # mask needs to be applied to *each* individual dimension of output vector
    regre_output_reshaped_list = model.net.Split(
        [regre_output_reshaped],
        [
            'regre_output_reshaped_' + str(i)
            for i in range(self.regre_output_dim)
        ],
        axis=1,  # has been reshaped to 2D tensor
    )
    regre_target_reshaped_list = model.net.Split(
        [regre_target_reshaped],
        [
            'regre_target_reshaped_' + str(i)
            for i in range(self.regre_output_dim)
        ],
        axis=1,  # has been reshaped to 2D tensor
    )
    regre_target_mask_reshaped_list = model.net.Split(
        [regre_target_mask_reshaped],
        [
            'regre_target_mask_reshaped_' + str(i)
            for i in range(self.regre_output_dim)
        ],
        axis=1,  # has been reshaped to 2D tensor
    )

    regre_average_loss_lst = []
    i = 0
    for o, t, m in zip(regre_output_reshaped_list,
                       regre_target_reshaped_list,
                       regre_target_mask_reshaped_list):
        l2_dist = model.net.SquaredL2Distance([o, t], 'l2_dist_' + str(i))
        m = model.net.Squeeze(
            m, 'squeezed_regre_target_mask_' + str(i), dims=[1])
        masked_l2_dist = model.net.Mul(
            [m, l2_dist], 'masked_l2_dist_' + str(i))
        # masked_l2_dist = l2_dist
        regre_average_loss_lst.append(
            model.net.AveragedLoss(
                masked_l2_dist, 'regre_average_loss_' + str(i)))
        i += 1
    assert i == self.regre_output_dim, 'output dim != # of loss split'

    # Training net
    model.AddGradientOperators([class_average_loss] + regre_average_loss_lst)
    build_adam(
        model,
        base_learning_rate=base_learning_rate * self.seq_size,
    )

    self.model = model
    self.predictions = [class_softmax_output, regre_output]
    self.loss = [class_average_loss] + regre_average_loss_lst
    for loss in self.loss:
        loss = str(loss)
        self.reports[loss] = []

    # Create a net to copy hidden_output to hidden_init
    prepare_state = core.Net("prepare_state")
    prepare_state.Copy(self.hidden_output, hidden_init)
    self.net_store['prepare'] = prepare_state

    self.net_store['train'] = core.Net(model.net.Proto())
def build_optimizer(self, model, **kwargs):
    self._skip_gpu = True
    return build_adam(model, base_learning_rate=0.1, enableRAdam=True, **kwargs)
def add_training_operators(self, model, output, label, device_opts, loss,
                           opt_type, base_learning_rate, policy, stepsize,
                           epsilon, beta1, beta2, gamma, momentum):
    with core.DeviceScope(device_opts):
        if loss == 'cross_entropy':
            xent = model.LabelCrossEntropy([output, label], 'xent')
            loss = model.AveragedLoss(xent, "loss")
        elif loss == 'euclidean':
            dist = model.net.SquaredL2Distance([label, output], 'dist')
            loss = dist.AveragedLoss([], ['loss'])

        model.AddGradientOperators([loss])

        if opt_type == 'adam':
            if policy == 'step':
                opt = optimizer.build_adam(
                    model,
                    base_learning_rate=base_learning_rate,
                    policy=policy,
                    stepsize=stepsize,
                    beta1=beta1,
                    beta2=beta2,
                    epsilon=epsilon)
            elif policy == 'fixed' or policy == 'inv':
                opt = optimizer.build_adam(
                    model,
                    base_learning_rate=base_learning_rate,
                    policy=policy,
                    beta1=beta1,
                    beta2=beta2,
                    epsilon=epsilon)
            print("adam optimizer selected")
        elif opt_type == 'sgd':
            if policy == 'step':
                opt = optimizer.build_sgd(
                    model,
                    base_learning_rate=base_learning_rate,
                    policy=policy,
                    stepsize=stepsize,
                    gamma=gamma,
                    momentum=momentum)
            elif policy == 'fixed' or policy == 'inv':
                opt = optimizer.build_sgd(
                    model,
                    base_learning_rate=base_learning_rate,
                    policy=policy,
                    gamma=gamma,
                    momentum=momentum)
            print("sgd optimizer selected")
        elif opt_type == 'rmsprop':
            if policy == 'step':
                opt = optimizer.build_rms_prop(
                    model,
                    base_learning_rate=base_learning_rate,
                    policy=policy,
                    stepsize=stepsize,
                    decay=gamma,
                    momentum=momentum,
                    epsilon=epsilon)
            elif policy == 'fixed' or policy == 'inv':
                opt = optimizer.build_rms_prop(
                    model,
                    base_learning_rate=base_learning_rate,
                    policy=policy,
                    decay=gamma,
                    momentum=momentum,
                    epsilon=epsilon)
            print("rmsprop optimizer selected")
        elif opt_type == 'adagrad':
            if policy == 'step':
                opt = optimizer.build_adagrad(
                    model,
                    base_learning_rate=base_learning_rate,
                    policy=policy,
                    stepsize=stepsize,
                    decay=gamma,
                    epsilon=epsilon)
            elif policy == 'fixed' or policy == 'inv':
                opt = optimizer.build_adagrad(
                    model,
                    base_learning_rate=base_learning_rate,
                    policy=policy,
                    decay=gamma,
                    epsilon=epsilon)
            print("adagrad optimizer selected")
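# Hedged call sketch for add_training_operators above; it cannot run on its
# own because the method needs its owning class instance. The blob names,
# device option, and hyperparameter values are illustrative assumptions:
#
#   from caffe2.python import core
#   from caffe2.proto import caffe2_pb2
#
#   device_opts = core.DeviceOption(caffe2_pb2.CPU, 0)
#   self.add_training_operators(
#       model, output="softmax", label="label", device_opts=device_opts,
#       loss='cross_entropy', opt_type='adam',
#       base_learning_rate=1e-3, policy='fixed', stepsize=1,
#       epsilon=1e-8, beta1=0.9, beta2=0.999, gamma=0.999, momentum=0.9)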
def build_optimizer(self, model):
    build_adam(model, base_learning_rate=0.1)