Exemple #1
0
def f1_loss(y_true, y_pred):
    
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)
Exemple #2
0
def eval_loss(y_true, y_pred):
    target = y_true
    ######output times weights here###############
    mask = K.cast(K.expand_dims(weights, axis=-1), dtype='float32')
    tensor_shape = y_pred.get_shape()
    # x = tf.add(x, tf.constant(1, shape=x.shape))
    yPred_stack = []
    for i in range(tensor_shape[1]):
        mask_i = K.cast(K.expand_dims(mask[i], axis=-1), dtype='float32')
        yPred_i = K.cast(K.expand_dims(y_pred[:, i], axis=-1), dtype='float32')
        yPred_stack.append(K.dot(yPred_i, mask_i))
    output = tf.reshape(tf.stack(yPred_stack, axis=1, name='stack'),
                        [-1, tensor_shape[1]])

    return y_pred[0, 7]
Exemple #3
0
def K_sparse_categorical_crossentropy(target,
                                      output,
                                      from_logits=False,
                                      axis=-1):
    output_dimensions = list(range(len(output.get_shape())))
    if axis != -1 and axis not in output_dimensions:
        raise ValueError('{}{}{}'.format(
            'Unexpected channels axis {}. '.format(axis),
            'Expected to be -1 or one of the axes of `output`, ',
            'which has {} dimensions.'.format(len(output.get_shape()))))
    # If the channels are not in the last axis, move them to be there:
    if axis != -1 and axis != output_dimensions[-1]:
        permutation = output_dimensions[:axis] + output_dimensions[axis + 1:]
        permutation += [axis]
        output = tf.transpose(output, perm=permutation)

    # Note: tf.nn.sparse_softmax_cross_entropy_with_logits
    # expects logits, Keras expects probabilities.
    if not from_logits:
        _epsilon = _to_tensor(epsilon(), output.dtype.base_dtype)
        output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
        output = tf.log(output)

    output_shape = output.get_shape()
    targets = cast(flatten(target), 'int64')
    logits = tf.reshape(output, [-1, tf.shape(output)[-1]])
    res = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets,
                                                         logits=logits)
    if len(output_shape) >= 3:
        # if our output includes timestep dimension
        # or spatial dimensions we need to reshape
        return tf.reshape(res, tf.shape(output)[:-1])
    else:
        return res
Exemple #4
0
 def get_lr_decay(self):
     lr = self.model.optimizer.lr
     if self.model.optimizer.initial_decay > 0:
         lr = lr * (1. / (1. + self.model.optimizer.decay * backend.cast(
             self.model.optimizer.iterations,
             backend.dtype(self.model.optimizer.decay))))
     return backend.eval(lr), backend.eval(self.model.optimizer.decay)
Exemple #5
0
def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
    y_pred = K.cast(y_pred >= threshold, 'float32')
    # P = total number of positive labels
    P = K.sum(y_true)
    # TP = total number of correct alerts, alerts from the positive class labels
    TP = K.sum(y_pred * y_true)
    return TP / P
Exemple #6
0
def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
    y_pred = K.cast(y_pred >= threshold, 'float32')
    # N = total number of negative labels
    N = K.sum(1 - y_true)
    # FP = total number of false alerts, alerts from the negative class labels
    TN = K.sum((1 - y_pred) * (1 - y_true))
    return TN / N
Exemple #7
0
 def loop_body(b, ignore_mask):
     true_box = tf.boolean_mask(y_true[l][b, ..., 0:4],
                                object_mask_bool[b, ..., 0])
     iou = box_iou(pred_box[b], true_box)
     best_iou = K.max(iou, axis=-1)
     ignore_mask = ignore_mask.write(
         b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
     return b + 1, ignore_mask
 def __metrics_base(self, y_true, y_pred):
     """ Base for all the metrics defined below """
     y_true, y_pred = K.flatten(tf.math.argmax(y_true, axis=-1)), K.flatten(
         tf.math.argmax(y_pred, axis=-1))
     con_mat = K.cast(tf.math.confusion_matrix(y_true, y_pred), K.floatx())
     correct = tf.linalg.diag_part(con_mat)
     total = K.sum(con_mat, axis=-1)
     return correct, total, con_mat
Exemple #9
0
 def step(self, inputs, states):
     states = list(states)
     if self.teacher_force:
         readout = states.pop()
         ground_truth = states.pop()
         assert K.ndim(ground_truth) == 3, K.ndim(ground_truth)
         counter = states.pop()
         if K.backend() == 'tensorflow':
             with tf.control_dependencies(None):
                 zero = K.cast(K.zeros((1, ))[0], 'int32')
                 one = K.cast(K.zeros((1, ))[0], 'int32')
         else:
             zero = K.cast(K.zeros((1, ))[0], 'int32')
             one = K.cast(K.zeros((1, ))[0], 'int32')
         slices = [
             slice(None), counter[0] - K.switch(counter[0], one, zero)
         ] + [slice(None)] * (K.ndim(ground_truth) - 2)
         ground_truth_slice = ground_truth[slices]
         readout = K.in_train_phase(
             K.switch(counter[0], ground_truth_slice, readout), readout)
         states.append(readout)
     if self.decode:
         model_input = states
     else:
         model_input = [inputs] + states
     shapes = []
     for x in model_input:
         if hasattr(x, '_keras_shape'):
             shapes.append(x._keras_shape)
             del x._keras_shape  # Else keras internals will get messed up.
     model_output = _to_list(self.model.call(model_input))
     for x, s in zip(model_input, shapes):
         setattr(x, '_keras_shape', s)
     if self.decode:
         model_output.insert(1, model_input[0])
     for tensor in model_output:
         tensor._uses_learning_phase = self.uses_learning_phase
     states = model_output[1:]
     output = model_output[0]
     if self.readout:
         states += [output]
         if self.teacher_force:
             states.insert(-1, counter + 1)
             states.insert(-1, ground_truth)
     return output, states
def softmax_cross_entropy(target,
                          output,
                          from_logits=True,
                          axis=-1,
                          normalize=False):
    """ Compute Softmax cross entropy loss for sparse target.

    Args:
        target (tensor): Target label. If 2D, shape is (w, h).
        output (tensor): Logits or Probabilities. If 2D, shape is (w, h, ch).
        from_logits (bool, optional): logits or softmax outputs? Defaults to True.
        axis (int, optional): Specifying the channels axis. Defaults to -1.
        normalize (bool, optional): Normalize loss across all instances. Defaults to False.
    """
    _check_dtype(target, 'int32')
    _check_dtype(output, 'float32')

    output_dimensions = list(range(len(output.get_shape())))
    if axis != -1 and axis not in output_dimensions:
        raise ValueError('{}{}{}'.format(
            'Unexpected channels axis {}. '.format(axis),
            'Expected to be -1 or one of the axes of `output`, ',
            'which has {} dimensions.'.format(len(output.get_shape()))))

    # move the channels to be in the last axis:
    if axis != -1 and axis != output_dimensions[-1]:
        permutation = output_dimensions[:axis] + output_dimensions[axis + 1:]
        permutation += [axis]
        output = tf.transpose(output, perm=permutation)

    # convert to the logits
    if not from_logits:
        _epsilon = _to_tensor(epsilon(), output.dtype.base_dtype)
        output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
        output = tf.log(output)  # NOTE: log(exp(x)) = x
    logits = output

    # softmax_cross_entropy
    output_shape = output.get_shape()
    targets = cast(tf.reshape(target,
                              tf.shape(output)[:-1]), 'int32')  # NOTE: cast...

    res = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets,
                                                         logits=logits)

    # reduce
    if normalize:
        return tf.reduce_mean(res)
    else:
        return tf.reduce_sum(tf.reduce_mean(res, axis=0))  # only batch-axis
Exemple #11
0
    def yolo_head(self,
                  feats,
                  anchors,
                  num_classes,
                  input_shape,
                  calc_loss=False):
        """Convert final layer features to bounding box parameters."""
        num_anchors = len(anchors)
        # Reshape to batch, height, width, num_anchors, box_params.
        anchors_tensor = K.reshape(K.constant(anchors),
                                   [1, 1, 1, num_anchors, 2])

        grid_shape = K.shape(feats)[1:3]  # height, width
        grid_y = K.tile(
            K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]),
            [1, grid_shape[1], 1, 1])
        grid_x = K.tile(
            K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]),
            [grid_shape[0], 1, 1, 1])
        grid = K.concatenate([grid_x, grid_y])
        grid = K.cast(grid, K.dtype(feats))

        feats = K.reshape(
            feats,
            [-1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5])

        # Adjust preditions to each spatial grid point and anchor size.
        box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(
            grid_shape[::-1], K.dtype(feats))
        box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(
            input_shape[::-1], K.dtype(feats))
        box_confidence = K.sigmoid(feats[..., 4:5])
        box_class_probs = K.sigmoid(feats[..., 5:])

        if calc_loss == True:
            return grid, feats, box_xy, box_wh
        return box_xy, box_wh, box_confidence, box_class_probs
    def call(self, inputs, mask=None, **kwargs):
        input_len = K.shape(inputs)[1]

        if self.attention_type == Attention.ATTENTION_TYPE_ADD:
            e = self._call_additive_emission(inputs)
        elif self.attention_type == Attention.ATTENTION_TYPE_MUL:
            e = self._call_multiplicative_emission(inputs)

        if self.attention_activation is not None:
            e = self.attention_activation(e)
        if self.attention_width is not None:
            if self.history_only:
                lower = K.arange(0, input_len) - (self.attention_width - 1)
            else:
                lower = K.arange(0, input_len) - self.attention_width // 2
            lower = K.expand_dims(lower, axis=-1)
            upper = lower + self.attention_width
            indices = K.expand_dims(K.arange(0, input_len), axis=0)
            e -= 10000.0 * (1.0 - K.cast(lower <= indices, K.floatx()) *
                            K.cast(indices < upper, K.floatx()))
        if mask is not None:
            mask = K.expand_dims(K.cast(mask, K.floatx()), axis=-1)
            e -= 10000.0 * ((1.0 - mask) *
                            (1.0 - K.permute_dimensions(mask, (0, 2, 1))))

        # a_{t} = \text{softmax}(e_t)
        e = K.exp(e - K.max(e, axis=-1, keepdims=True))
        a = e / K.sum(e, axis=-1, keepdims=True)

        # l_t = \sum_{t'} a_{t, t'} x_{t'}
        v = K.batch_dot(a, inputs)
        if self.attention_regularizer_weight > 0.0:
            self.add_loss(self._attention_regularizer(a))

        if self.return_attention:
            return [v, a]
        return v
Exemple #13
0
    def yolo_correct_boxes(self, box_xy, box_wh, input_shape, image_shape):
        '''Get corrected boxes'''
        box_yx = box_xy[..., ::-1]
        box_hw = box_wh[..., ::-1]
        input_shape = K.cast(input_shape, K.dtype(box_yx))
        image_shape = K.cast(image_shape, K.dtype(box_yx))
        new_shape = K.round(image_shape * K.min(input_shape / image_shape))
        offset = (input_shape - new_shape) / 2. / input_shape
        scale = input_shape / new_shape
        box_yx = (box_yx - offset) * scale
        box_hw *= scale

        box_mins = box_yx - (box_hw / 2.)
        box_maxes = box_yx + (box_hw / 2.)
        boxes = K.concatenate([
            box_mins[..., 0:1],  # y_min
            box_mins[..., 1:2],  # x_min
            box_maxes[..., 0:1],  # y_max
            box_maxes[..., 1:2]  # x_max
        ])

        # Scale boxes back to original image shape.
        boxes *= K.concatenate([image_shape, image_shape])
        return boxes
Exemple #14
0
    def yolo_loss(self,
                  args,
                  anchors,
                  num_classes,
                  ignore_thresh=.5,
                  print_loss=False):
        '''Return yolo_loss tensor

        Parameters
        ----------
        yolo_outputs: list of tensor, the output of yolo_body or tiny_yolo_body
        y_true: list of array, the output of preprocess_true_boxes
        anchors: array, shape=(N, 2), wh
        num_classes: integer
        ignore_thresh: float, the iou threshold whether to ignore object confidence loss

        Returns
        -------
        loss: tensor, shape=(1,)

        '''

        num_layers = len(anchors) // 3  # default setting

        yolo_outputs = args[:num_layers]
        y_true = args[num_layers:]

        anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]
                       ] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]
        input_shape = K.cast(
            K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))
        grid_shapes = [
            K.cast(K.shape(yolo_outputs[l])[1:3], K.dtype(y_true[0]))
            for l in range(num_layers)
        ]
        loss = 0
        m = K.shape(yolo_outputs[0])[0]  # batch size, tensor
        mf = K.cast(m, K.dtype(yolo_outputs[0]))

        for l in range(num_layers):
            object_mask = y_true[l][..., 4:5]
            true_class_probs = y_true[l][..., 5:]

            grid, raw_pred, pred_xy, pred_wh = self.yolo_head(
                yolo_outputs[l],
                anchors[anchor_mask[l]],
                num_classes,
                input_shape,
                calc_loss=True)
            pred_box = K.concatenate([pred_xy, pred_wh])

            # Darknet raw box to calculate loss.
            raw_true_xy = y_true[l][..., :2] * grid_shapes[l][::-1] - grid
            raw_true_wh = K.log(y_true[l][..., 2:4] / anchors[anchor_mask[l]] *
                                input_shape[::-1])
            raw_true_wh = K.switch(
                object_mask, raw_true_wh,
                K.zeros_like(raw_true_wh))  # avoid log(0)=-inf
            box_loss_scale = 2 - y_true[l][..., 2:3] * y_true[l][..., 3:4]

            # Find ignore mask, iterate over each of batch.
            ignore_mask = tf.TensorArray(K.dtype(y_true[0]),
                                         size=1,
                                         dynamic_size=True)
            object_mask_bool = K.cast(object_mask, 'bool')

            def loop_body(b, ignore_mask):
                true_box = tf.boolean_mask(y_true[l][b, ..., 0:4],
                                           object_mask_bool[b, ..., 0])
                iou = box_iou(pred_box[b], true_box)
                best_iou = K.max(iou, axis=-1)
                ignore_mask = ignore_mask.write(
                    b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
                return b + 1, ignore_mask

            _, ignore_mask = K.control_flow_ops.while_loop(
                lambda b, *args: b < m, loop_body, [0, ignore_mask])
            ignore_mask = ignore_mask.stack()
            ignore_mask = K.expand_dims(ignore_mask, -1)

            # K.binary_crossentropy is helpful to avoid exp overflow.
            xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(
                raw_true_xy, raw_pred[..., 0:2], from_logits=True)
            wh_loss = object_mask * box_loss_scale * 0.5 * K.square(
                raw_true_wh - raw_pred[..., 2:4])
            confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True)+ \
                (1-object_mask) * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True) * ignore_mask
            class_loss = object_mask * K.binary_crossentropy(
                true_class_probs, raw_pred[..., 5:], from_logits=True)

            xy_loss = K.sum(xy_loss) / mf
            wh_loss = K.sum(wh_loss) / mf
            confidence_loss = K.sum(confidence_loss) / mf
            class_loss = K.sum(class_loss) / mf
            loss += xy_loss + wh_loss + confidence_loss + class_loss
            if print_loss:
                loss = tf.Print(loss, [
                    loss, xy_loss, wh_loss, confidence_loss, class_loss,
                    K.sum(ignore_mask)
                ],
                                message='loss: ')
        return loss
Exemple #15
0
    def call(self,
             inputs,
             initial_state=None,
             initial_readout=None,
             ground_truth=None,
             mask=None,
             training=None):
        # input shape: `(samples, time (padded with zeros), input_dim)`
        # note that the .build() method of subclasses MUST define
        # self.input_spec and self.state_spec with complete input shapes.
        if type(mask) is list:
            mask = mask[0]
        if self.model is None:
            raise Exception('Empty RecurrentModel.')
        num_req_states = self.num_states
        if self.readout:
            num_actual_states = num_req_states - 1
        else:
            num_actual_states = num_req_states
        if type(inputs) is list:
            inputs_list = inputs[:]
            inputs = inputs_list.pop(0)
            initial_states = inputs_list[:num_actual_states]
            if len(initial_states) > 0:
                if self._is_optional_input_placeholder(initial_states[0]):
                    initial_states = self.get_initial_state(inputs)
            inputs_list = inputs_list[num_actual_states:]
            if self.readout:
                initial_readout = inputs_list.pop(0)
                if self.teacher_force:
                    ground_truth = inputs_list.pop()
        else:
            if initial_state is not None:
                if not isinstance(initial_state, (list, tuple)):
                    initial_states = [initial_state]
                else:
                    initial_states = list(initial_state)
                if self._is_optional_input_placeholder(initial_states[0]):
                    initial_states = self.get_initial_state(inputs)

            elif self.stateful:
                initial_states = self.states
            else:
                initial_states = self.get_initial_state(inputs)
        if self.readout:
            if initial_readout is None or self._is_optional_input_placeholder(
                    initial_readout):
                output_shape = K.int_shape(_to_list((self.model.output))[0])
                output_ndim = len(output_shape)
                input_ndim = K.ndim(inputs)
                initial_readout = K.zeros_like(inputs)
                slices = [slice(None)] + [0] * (input_ndim - 1)
                initial_readout = initial_readout[slices]  # (batch_size,)
                initial_readout = K.reshape(initial_readout,
                                            (-1, ) + (1, ) * (output_ndim - 1))
                initial_readout = K.tile(initial_readout,
                                         (1, ) + tuple(output_shape[1:]))
            initial_states.append(initial_readout)
            if self.teacher_force:
                if ground_truth is None or self._is_optional_input_placeholder(
                        ground_truth):
                    raise Exception(
                        'ground_truth must be provided for RecurrentModel with teacher_force=True.'
                    )
                if K.backend() == 'tensorflow':
                    with tf.control_dependencies(None):
                        counter = K.zeros((1, ))
                else:
                    counter = K.zeros((1, ))
                counter = K.cast(counter, 'int32')
                initial_states.insert(-1, counter)
                initial_states[-2]
                initial_states.insert(-1, ground_truth)
                num_req_states += 2
        if len(initial_states) != num_req_states:
            raise ValueError('Layer requires ' + str(num_req_states) +
                             ' states but was passed ' +
                             str(len(initial_states)) + ' initial states.')
        input_shape = K.int_shape(inputs)
        if self.unroll and input_shape[1] is None:
            raise ValueError('Cannot unroll a RNN if the '
                             'time dimension is undefined. \n'
                             '- If using a Sequential model, '
                             'specify the time dimension by passing '
                             'an `input_shape` or `batch_input_shape` '
                             'argument to your first layer. If your '
                             'first layer is an Embedding, you can '
                             'also use the `input_length` argument.\n'
                             '- If using the functional API, specify '
                             'the time dimension by passing a `shape` '
                             'or `batch_shape` argument to your Input layer.')
        preprocessed_input = self.preprocess_input(inputs, training=None)
        constants = self.get_constants(inputs, training=None)
        if self.decode:
            initial_states.insert(0, inputs)
            preprocessed_input = K.zeros((1, self.output_length, 1))
            input_length = self.output_length
        else:
            input_length = input_shape[1]
        if self.uses_learning_phase:
            with learning_phase_scope(0):
                last_output_test, outputs_test, states_test, updates = rnn(
                    self.step,
                    preprocessed_input,
                    initial_states,
                    go_backwards=self.go_backwards,
                    mask=mask,
                    constants=constants,
                    unroll=self.unroll,
                    input_length=input_length)
            with learning_phase_scope(1):
                last_output_train, outputs_train, states_train, updates = rnn(
                    self.step,
                    preprocessed_input,
                    initial_states,
                    go_backwards=self.go_backwards,
                    mask=mask,
                    constants=constants,
                    unroll=self.unroll,
                    input_length=input_length)

            last_output = K.in_train_phase(last_output_train,
                                           last_output_test,
                                           training=training)
            outputs = K.in_train_phase(outputs_train,
                                       outputs_test,
                                       training=training)
            states = []
            for state_train, state_test in zip(states_train, states_test):
                states.append(
                    K.in_train_phase(state_train,
                                     state_test,
                                     training=training))

        else:
            last_output, outputs, states, updates = rnn(
                self.step,
                preprocessed_input,
                initial_states,
                go_backwards=self.go_backwards,
                mask=mask,
                constants=constants,
                unroll=self.unroll,
                input_length=input_length)
        states = list(states)
        if self.decode:
            states.pop(0)
        if self.readout:
            states.pop()
            if self.teacher_force:
                states.pop()
                states.pop()
        if len(updates) > 0:
            self.add_update(updates)
        if self.stateful:
            updates = []
            for i in range(len(states)):
                updates.append((self.states[i], states[i]))
            self.add_update(updates, inputs)

        # Properly set learning phase
        if 0 < self.dropout + self.recurrent_dropout:
            last_output._uses_learning_phase = True
            outputs._uses_learning_phase = True

        if self.return_sequences:
            y = outputs
        else:
            y = last_output
        if self.return_states:
            return [y] + states
        else:
            return y
Exemple #16
0
    def get_updates(self, params, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. /
                   (1. +
                    self.decay * K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (
            1. - K.pow(self.beta_1, t))

        shapes = [K.get_variable_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):

            # if a weight tensor (len > 1) use weight normalized parameterization
            # this is the only part changed w.r.t. keras.optimizers.Adam
            ps = K.get_variable_shape(p)
            if len(ps) > 1:

                # get weight normalization parameters
                V, V_norm, V_scaler, g_param, grad_g, grad_V = get_weightnorm_params_and_grads(
                    p, g)

                # Adam containers for the 'g' parameter
                V_scaler_shape = K.get_variable_shape(V_scaler)
                m_g = K.zeros(V_scaler_shape)
                v_g = K.zeros(V_scaler_shape)

                # update g parameters
                m_g_t = (self.beta_1 * m_g) + (1. - self.beta_1) * grad_g
                v_g_t = (self.beta_2 *
                         v_g) + (1. - self.beta_2) * K.square(grad_g)
                new_g_param = g_param - lr_t * m_g_t / (K.sqrt(v_g_t) +
                                                        self.epsilon)
                self.updates.append(K.update(m_g, m_g_t))
                self.updates.append(K.update(v_g, v_g_t))

                # update V parameters
                m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad_V
                v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad_V)
                new_V_param = V - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
                self.updates.append(K.update(m, m_t))
                self.updates.append(K.update(v, v_t))

                # if there are constraints we apply them to V, not W
                # if p in constraints:
                #     c = constraints[p]
                #     new_V_param = c(new_V_param)

                # wn param updates --> W updates
                add_weightnorm_param_updates(self.updates, new_V_param,
                                             new_g_param, p, V_scaler)

            else:  # do optimization normally
                m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
                v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

                self.updates.append(K.update(m, m_t))
                self.updates.append(K.update(v, v_t))

                new_p = p_t
                # apply constraints
                # if p in constraints:
                #     c = constraints[p]
                #     new_p = c(new_p)
                self.updates.append(K.update(p, new_p))
        return self.updates