def svm_loss(x, y):
  """
  Computes the loss and gradient for multiclass SVM classification.

  Inputs:
  - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
    for the ith input.
  - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
    0 <= y[i] < C

  Returns:
  - loss: Scalar giving the loss
  """

  N = x.shape[0]
  correct_class_scores = x[np.arange(N), y]
  
  # Broadcasting a (N,) vector against an (N, C) matrix is not supported here,
  # so compute the margins on the transposed (C, N) view and transpose back.
  # With full broadcasting this would simply be:
  #   margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0)
  margins = np.transpose(np.maximum(0, np.transpose(x) - np.transpose(correct_class_scores) + 1.0))

  loss = (np.sum(margins) - np.sum(margins[np.arange(N), y])) / N

  return loss
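
A minimal usage sketch, assuming svm_loss above is in scope and np is NumPy or minpy.numpy; the scores and labels are made up for illustration:

scores = np.array([[3.2, 5.1, -1.7],
                   [1.3, 4.9, 2.0]])
labels = np.array([0, 1])
# Row 0 contributes max(0, 5.1 - 3.2 + 1) = 2.9, row 1 contributes 0,
# so the averaged hinge loss is 2.9 / 2 = 1.45.
print(svm_loss(scores, labels))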
Example #2
def softmax_loss(x, y):
    """
  Computes the loss and gradient for softmax classification.

  Inputs:
  - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
    for the ith input.
  - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
    0 <= y[i] < C

  Returns a tuple of:
  - loss: Scalar giving the loss
  - dx: Gradient of the loss with respect to x
  """
    # np.max(..., keepdims=True) was reported buggy in this environment, so the
    # row-wise max is subtracted via expand_dims to keep a (N, 1) shape that
    # broadcasts against x. This keeps the exponentials numerically stable.
    probs = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1))
    probs /= np.expand_dims(np.sum(probs, axis=1), axis=1)
    N = x.shape[0]
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N

    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N

    return loss, dx
def softmax_loss(x, y):
  """
  Computes the loss and gradient for softmax classification.

  Inputs:
  - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
    for the ith input.
  - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
    0 <= y[i] < C

  Returns a tuple of:
  - loss: Scalar giving the loss
  - dx: Gradient of the loss with respect to x
  """
  # np.max(..., keepdims=True) was reported buggy in this environment, so the
  # row-wise max is subtracted via expand_dims to keep a (N, 1) shape that
  # broadcasts against x. This keeps the exponentials numerically stable.
  probs = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1))
  probs /= np.expand_dims(np.sum(probs, axis=1), axis=1)
  N = x.shape[0]
  loss = -np.sum(np.log(probs[np.arange(N), y])) / N

  dx = probs.copy()
  dx[np.arange(N), y] -= 1
  dx /= N

  return loss, dx
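
A quick sanity check, assuming the function above is in scope and np is NumPy or minpy.numpy: with identical scores for every class the loss should land near log(C), and the gradient has one row per example:

N, C = 4, 10
x = np.zeros((N, C))
y = np.arange(N) % C
loss, dx = softmax_loss(x, y)
print(loss)      # roughly log(10) = 2.3026
print(dx.shape)  # (4, 10)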
Example #4
def svm_loss(x, y):
    """
  Computes the loss and gradient for multiclass SVM classification.

  Inputs:
  - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
    for the ith input.
  - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
    0 <= y[i] < C

  Returns:
  - loss: Scalar giving the loss
  """

    N = x.shape[0]
    correct_class_scores = x[np.arange(N), y]

    # Broadcasting a (N,) vector against an (N, C) matrix is not supported here,
    # so compute the margins on the transposed (C, N) view and transpose back.
    # With full broadcasting this would simply be:
    #   margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0)
    margins = np.transpose(
        np.maximum(0,
                   np.transpose(x) - np.transpose(correct_class_scores) + 1.0))

    loss = (np.sum(margins) - np.sum(margins[np.arange(N), y])) / N

    return loss
Example #5
def softmax_loss(x, y):
    """
    Computes the loss and gradient for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
      for the ith input.
    - y: Either of the following:
      - One hot encoding of labels, of shape (N, C)
      - Label index of shape (N, ), each y[i] is the label of i^th example
        (0 <= y[i] < C)

    Returns:
    - loss: Scalar giving the loss
    """
    N = x.shape[0]
    C = x.shape[1]
    if len(y.shape) == 1:
        #convert it to one hot encoding
        onehot_y = np.zeros([N, C])
        np.onehot_encode(y, onehot_y)
    else:
        onehot_y = y
    probs = x - np.max(x, axis=1, keepdims=True)
    loss = -np.sum(probs * onehot_y) / N
    loss += np.sum(np.log(np.sum(np.exp(probs), axis=1, keepdims=True))) / N
    return loss
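
A hedged usage sketch: the index-label branch relies on minpy.numpy's onehot_encode, so with plain NumPy it is safer to pass one-hot labels directly; the numbers are made up:

x = np.array([[2.0, 1.0, 0.1],
              [0.5, 2.5, 0.0]])
onehot = np.array([[1., 0., 0.],
                   [0., 1., 0.]])
print(softmax_loss(x, onehot))   # average cross-entropy, roughly 0.31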
Example #6
def quick_grad_check(fun, arg0, extra_args=(), kwargs={}, verbose=True,
                     eps=EPS, rtol=RTOL, atol=ATOL, rs=None):
    """Checks the gradient of a function (w.r.t. to its first arg) in a random direction"""

    if verbose:
        print("Checking gradient of {0} at {1}".format(fun, arg0))

    if rs is None:
        rs = nnp.random.RandomState()
    
    random_dir = rs.standard_normal(nnp.shape(arg0))
    random_dir = random_dir / nnp.sqrt(nnp.sum(random_dir * random_dir))
 
    if not extra_args == ():
        unary_fun = lambda x: fun(arg0 + x * random_dir, extra_args)
        numeric_grad = (unary_fun(eps / 2) - unary_fun(-eps / 2)) / eps
        analytic_grad = np.sum(grad(fun)(arg0, extra_args) * random_dir)
    else:
        unary_fun = lambda x: fun(arg0 + x * random_dir)
        numeric_grad = (unary_fun(eps / 2) - unary_fun(-eps / 2)) / eps
        analytic_grad = np.sum(grad(fun)(arg0) * random_dir)
  
    if isinstance(numeric_grad, minpy.array.Number):
        assert abs((analytic_grad - numeric_grad).get_data(None)) < atol and abs((analytic_grad - numeric_grad).get_data(None)) < abs((analytic_grad * rtol).get_data(None)), \
            "Check failed! nd={0}, ad={1}".format(numeric_grad, analytic_grad)
    elif isinstance(numeric_grad, minpy.array.Array):
        assert nnp.prod(nnp.shape(analytic_grad.asnumpy())[:]) == 1, "Currently only support check loss"
        assert abs((analytic_grad - numeric_grad).asnumpy()) < atol and abs((analytic_grad - numeric_grad).asnumpy()) < abs((analytic_grad * rtol).asnumpy()), \
            "Check failed! nd={0}, ad={1}".format(numeric_grad, analytic_grad)
    else:
        assert False
    if verbose:
        print("Gradient projection OK (numeric grad: {0}, analytic grad: {1})".format(
            numeric_grad, analytic_grad))
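
The idea behind the check can be reproduced with a few lines of plain NumPy. This standalone sketch (the toy function, direction, and step size are illustrative, not part of the snippet above) compares a central-difference estimate against the analytic gradient projected onto a random unit direction:

import numpy as nnp

def f(w):                                   # toy loss with known gradient w
    return 0.5 * nnp.sum(w ** 2)

w0 = nnp.random.randn(4)
d = nnp.random.randn(4)
d /= nnp.sqrt(nnp.sum(d * d))               # unit-norm random direction
eps = 1e-4
numeric = (f(w0 + eps / 2 * d) - f(w0 - eps / 2 * d)) / eps
analytic = nnp.sum(w0 * d)                  # grad f(w) = w, projected onto d
print(abs(numeric - analytic))              # should be close to zero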
def svm_loss(x, y, mode):
  """
  Computes the loss and gradient for multiclass SVM classification.

  Inputs:
  - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
    for the ith input.
  - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
    0 <= y[i] < C

  Returns a tuple of:
  - loss: Scalar giving the loss
  - dx: Gradient of the loss with respect to x
  """
  if mode == 'cpu':
    np.set_policy(policy.OnlyNumpyPolicy())
  else:
    np.set_policy(policy.PreferMXNetPolicy())

  N = x.shape[0]
  correct_class_scores = x[np.arange(N), y]
  
  # np.newaxis indexing may not be supported here, so expand_dims produces the
  # (N, 1) column needed for broadcasting.
  margins = np.maximum(0, x - np.expand_dims(correct_class_scores, axis=1) + 1.0)

  margins[np.arange(N), y] = 0
  loss = np.sum(margins) / N
  num_pos = np.sum(margins > 0, axis=1)
  dx = np.zeros_like(x)
  dx[margins > 0] = 1
  dx[np.arange(N), y] -= num_pos
  dx /= N

  return loss, dx
Example #8
 def loss(self, ps, as_, vs, rs, advs):
     ps = np.maximum(1.0e-5, np.minimum(1.0 - 1e-5, ps))
     policy_grad_loss = -np.sum(np.log(ps) * as_ * advs)
     vf_loss = 0.5*np.sum((vs - rs)**2)
     entropy = -np.sum(ps*np.log(ps))
     loss_ = policy_grad_loss + self.config.vf_wt*vf_loss - self.config.entropy_wt*entropy
     return loss_
Example #9
def softmax_loss(x, y):
    """
    Computes the loss and gradient for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
      for the ith input.
    - y: Either of the following:
      - One hot encoding of labels, of shape (N, C)
      - Label index of shape (N, ), each y[i] is the label of i^th example
        (0 <= y[i] < C)

    Returns:
    - loss: Scalar giving the loss
    """
    N = x.shape[0]
    C = x.shape[1]
    if len(y.shape) == 1:
        #convert it to one hot encoding
        onehot_y = np.zeros([N, C])
        np.onehot_encode(y, onehot_y)
    else:
        onehot_y = y
    probs = x - np.max(x, axis=1, keepdims=True)
    loss = -np.sum(probs * onehot_y) / N
    loss += np.sum(np.log(np.sum(np.exp(probs), axis=1, keepdims=True))) / N
    return loss
Example #10
 def _softmax_loss(self, X, y, *args):
     N = X.shape[0]
     scores = self._forward(X, *args)
     scores = np.exp(scores - np.max(scores, axis=1, keepdims=True))
     prob = scores / np.sum(scores, axis=1, keepdims=True)
     loss = np.sum(-np.log(prob[np.arange(N), y])) / float(N)
     return loss
def softmax(x, y):
    import numpy as np
    y = y.astype(int)
    probs = np.exp(x - np.max(x, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    N = x.shape[0]
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N
    return loss
Example #12
def layer_normalization(X, gamma, beta, epsilon=0.001):
    N, D = X.shape
    mean = np.sum(X, axis=1).reshape((N, 1)) / float(D)
    X = X - mean
    variance = np.sum(X**2, axis=1).reshape((N, 1)) / float(D)
    std = variance**0.5
    X = X / (std + epsilon)
    return X * gamma + beta
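
A quick check, assuming the function above is in scope and np is NumPy or minpy.numpy: before gamma and beta are applied, every row should come out with roughly zero mean:

X = np.random.randn(4, 6) * 3.0 + 1.0
out = layer_normalization(X, gamma=1.0, beta=0.0)
print(np.sum(out, axis=1))   # close to zero for every row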
Example #13
def softmax_crossentropy(x, y):
    # x should be (batch, prob)
    # y should be (batch, )

    x_dev = x - np.max(x, axis=1, keepdims=True) # minpy doesn't support x.max()
    sm = x_dev - np.log(np.sum(np.exp(x_dev), axis=1, keepdims=True))
    ids = np.arange(0, y.shape[0])*x.shape[1] + y
    ce = -np.sum(sm.reshape((sm.shape[0]*sm.shape[1],))[ids])/(1.0*y.shape[0])  # minpy doesn't support -1 in shape inference
    return ce
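
A small sanity check, assuming np is NumPy or minpy.numpy: with uniform scores the cross entropy should be about the log of the number of classes:

x = np.zeros((3, 5))
y = np.array([0, 2, 4])
print(softmax_crossentropy(x, y))   # roughly log(5) = 1.609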
Example #14
def softmax_probability(p, channel):
    N, C = p.shape
    p -= np.max(p, axis=1).reshape((N, 1))
    code = np.zeros((N, C))
    np.onehot_encode(channel, code)
    p = np.exp(p)
    selected_p = p * code
    total_p = np.sum(p, axis=1).reshape((N, 1))
    return np.sum(selected_p / total_p, axis=1)
Example #15
 def _loss_function(*args):
     normal_loss = model.loss(model.forward(X, 'train'), Y)
     noisy_output = model.forward(noisy_X, 'train')
     noisy_output -= np.max(noisy_output, axis=1).reshape((K, 1))
     noisy_output = np.exp(noisy_output)
     model_p_noisy_X = noisy_output / np.sum(noisy_output, axis=1).reshape(
         (K, 1))
     kl = KL(1.0 / N_CLASSES, model_p_noisy_X)
     noisy_loss = gamma * np.sum(kl) / float(K)
     return gamma * normal_loss + (1 - gamma) * noisy_loss
Example #16
 def train_loss(X, y, W1, W2, b1, b2):
     l1 = affine_relu_forward(X, W1, b1)
     l2 = affine_forward(l1, W2, b2)
     scores = l2
     if y is None:
         # Test mode: no labels, just return the class scores.
         return scores

     #[TODO]: softmax is not supported yet
     # loss, d_scores = softmax_loss(scores, y)
     loss = svm_loss(scores, y)
     loss_with_reg = loss + np.sum(W1**2) * 0.5 * self.reg + np.sum(
         W2**2) * 0.5 * self.reg
     return loss_with_reg
Example #17
    def train_loss(X, y, W1, W2, b1, b2):
      l1, l1_cache = affine_relu_forward(X, W1, b1)
      l2, l2_cache = affine_forward(l1, W2, b2)
      scores = l2 

      if y is None:
        return scores
   
      loss, d_scores = softmax_loss(scores, y)
      loss += np.sum(W1 ** 2) * 0.5 * self.reg
      loss += np.sum(W2 ** 2) * 0.5 * self.reg
      return loss
Example #18
        def train_loss(X, y, W1, W2, b1, b2):
            l1 = affine_relu_forward(X, W1, b1)
            l2 = affine_forward(l1, W2, b2)
            scores = l2

            if y is None:
                return scores

            #[TODO]: softmax is not supported yet
            # loss, d_scores = softmax_loss(scores, y)
            loss = svm_loss(scores, y)
            loss_with_reg = loss + np.sum(W1**2) * 0.5 * self.reg + np.sum(
                W2**2) * 0.5 * self.reg

            return loss_with_reg
    def train_loss(*args):
      inputs = args[0]
      softmax_label = args[1]
      probs = self.symbol_func(**self.make_mxnet_weight_dict(inputs, softmax_label, args[self.data_target_cnt:len(args)]))
      if softmax_label is None:
        return probs 

      samples_num = X.shape[0]
      targets = np.zeros((samples_num, self.num_classes))
      targets[np.arange(samples_num), softmax_label] = 1
      loss = -np.sum(targets * np.log(probs)) / samples_num
      for i in self.get_index_reg_weight():
        loss = loss + np.sum(0.5*args[i]**2*self.reg)

      return loss
Example #20
    def check_accuracy(self, dataiter, num_samples=None):
        """
        Check accuracy of the model on the provided data.

        Inputs:
        - dataiter: data iterator that can produce batches.
        - num_samples: If not None and dataiter has more than num_samples datapoints,
          subsample the data and only test the model on num_samples datapoints.

        Returns:
        - acc: Scalar giving the fraction of instances that were correctly
          classified by the model.
        """

        # Maybe subsample the data
        N = dataiter.num_data
        check_dataiter = dataiter
        if num_samples is not None and N > num_samples:
            # Sample a sub iter
            check_dataiter = dataiter.getsubiter(num_samples)
        else:
            # Use the entire dataiter otherwise.
            check_dataiter.reset()

        acc_count = 0
        num_samples = 0
        for each_batch in check_dataiter:
            predict = self.model.forward_batch(each_batch,
                                               mode='test').asnumpy()
            # TODO(minjie): multiple labels.
            acc_count += np.sum(
                np.argmax(predict, axis=1) == each_batch.label[0])
            num_samples += check_dataiter.batch_size
        return float(acc_count.asnumpy()) / num_samples
Example #21
    def check_accuracy(self, dataiter, num_samples=None):
        """
        Check accuracy of the model on the provided data.

        Inputs:
        - dataiter: data iterator that can produce batches.
        - num_samples: If not None and dataiter has more than num_samples datapoints,
          subsample the data and only test the model on num_samples datapoints.

        Returns:
        - acc: Scalar giving the fraction of instances that were correctly
          classified by the model.
        """

        # Maybe subsample the data
        N = dataiter.num_data
        check_dataiter = dataiter
        if num_samples is not None and N > num_samples:
            # Sample a sub iter
            check_dataiter = dataiter.getsubiter(num_samples)
        else:
            # Use the entire dataiter otherwise.
            check_dataiter.reset()

        acc_count = 0
        num_samples = 0
        for each_batch in check_dataiter:
            predict = self.model.forward_batch(
                each_batch, mode='test').asnumpy()
            # TODO(minjie): multiple labels.
            acc_count += np.sum(
                np.argmax(
                    predict, axis=1) == each_batch.label[0])
            num_samples += check_dataiter.batch_size
        return float(acc_count.asnumpy()) / num_samples
Example #22
def softmax_cross_entropy(prob, label):
    """
    Computes the cross entropy for softmax activation.

    Inputs:
    - prob: Probability, of shape (N, C) where x[i, j] is the probability for the jth class
      for the ith input.
    - label: Either of the following:
      - One hot encoding of labels, of shape (N, C)
      - Label index of shape (N, ), each y[i] is the label of i^th example
        (0 <= y[i] < C)

    Returns a Value:
    - cross_entropy
    """

    N = prob.shape[0]
    C = prob.shape[1]
    if len(label.shape) == 1:
        #convert it to one hot encoding
        onehot_label = np.zeros([N, C])
        np.onehot_encode(label, onehot_label)
    else:
        onehot_label = label
    return -np.sum(np.log(prob) * onehot_label) / N
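
A hedged sketch with made-up probabilities; one-hot labels are passed so it also works with plain NumPy (the index-label branch needs minpy.numpy's onehot_encode):

prob = np.array([[0.7, 0.2, 0.1],
                 [0.1, 0.8, 0.1]])
onehot = np.array([[1., 0., 0.],
                   [0., 1., 0.]])
print(softmax_cross_entropy(prob, onehot))   # -(log 0.7 + log 0.8) / 2, roughly 0.29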
Example #23
 def gan_loss(*args):
   p_X = dmodel.forward(X, 'train')
   random_X = gmodel.forward(noise, 'train')
   p_random_X = dmodel.forward(random_X, 'train')
   value = np.log(clip(p_X, lower, upper)) + np.log(clip(1 - p_random_X, lower, upper))
   loss = np.sum(value) / float(N)
   return loss
Example #24
 def loss(self, predict, y):
     # Add L2 regularization for all the weights.
     reg_loss = 0.0
     for name, weight in self.params.items():
         reg_loss += np.sum(weight**2) * 0.5
     return layers.softmax_cross_entropy(predict,
                                         y) + weight_decay * reg_loss
Example #25
def softmax_cross_entropy(prob, label):
    """
    Computes the cross entropy for softmax activation.

    Inputs:
    - prob: Probability, of shape (N, C) where x[i, j] is the probability for the jth class
      for the ith input.
    - label: Either of the following:
      - One hot encoding of labels, of shape (N, C)
      - Label index of shape (N, ), each y[i] is the label of i^th example
        (0 <= y[i] < C)

    Returns a Value:
    - cross_entropy
    """

    N = prob.shape[0]
    C = prob.shape[1]
    if len(label.shape) == 1:
        #convert it to one hot encoding
        onehot_label = np.zeros([N, C])
        np.onehot_encode(label, onehot_label)
    else:
        onehot_label = label
    return -np.sum(np.log(prob) * onehot_label) / N
Example #26
 def log_likelihood(weights, inputs, targets):
     logprobs = outputs(weights, inputs)
     loglik = 0.0
     num_time_steps, num_examples, _ = inputs.shape
     for t in range(num_time_steps):
         loglik += np.sum(logprobs[t] * targets[t])
     return loglik / (num_time_steps * num_examples)
Example #27
def accuracy(p, l):
    import minpy.numpy as np
    if len(l.shape) == 1:
        return 1 - np.count_nonzero(p - l).val / float(p.shape[0])
    else:
        inputs, labels = p, l
        return np.mean(np.sum((inputs - labels)**2, axis=1))
 def getLaplacian(self, W):
     D = np.zeros((W.shape[0], W.shape[1]))
     L = np.zeros((W.shape[0], W.shape[1]))
     for i in range(W.shape[1]):
         D[i][i] = np.sum(W[:, i])
     L = D - W
     return [D, L]
Example #29
def affine_backward(dout, cache):
    """
  Computes the backward pass for an affine layer.

  Inputs:
  - dout: Upstream derivative, of shape (N, M)
  - cache: Tuple of:
    - x: Input data, of shape (N, d_1, ... d_k)
    - w: Weights, of shape (D, M)
    - b: Biases, of shape (M,)

  Returns a tuple of:
  - dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
  - dw: Gradient with respect to w, of shape (D, M)
  - db: Gradient with respect to b, of shape (M,)
  """
    x, w, b = cache
    x_plain = np.reshape(x, (x.shape[0], -1))

    db = np.sum(dout, axis=0)

    dx_plain = np.dot(dout, np.transpose(w))

    dx = np.reshape(dx_plain, x.shape)
    dw = np.dot(np.transpose(x_plain), dout)

    return dx, dw, db
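
A shape-level sanity check, assuming np is NumPy: the cache is built by hand from made-up arrays to confirm the gradients come back in the documented shapes:

x = np.random.randn(4, 2, 3)         # N = 4, d_1 = 2, d_2 = 3, so D = 6
w = np.random.randn(6, 5)            # M = 5
b = np.random.randn(5)
dout = np.random.randn(4, 5)
dx, dw, db = affine_backward(dout, (x, w, b))
print(dx.shape, dw.shape, db.shape)  # (4, 2, 3) (6, 5) (5,)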
Example #30
 def log_likelihood(weights, inputs, targets):
     logprobs = outputs(weights, inputs)
     loglik = 0.0
     num_time_steps, num_examples, _ = inputs.shape
     for t in range(num_time_steps):
         loglik += np.sum(logprobs[t] * targets[t])
     return loglik / (num_time_steps * num_examples)
Example #31
        def train_loss(*args):
            inputs = args[0]
            softmax_label = args[1]
            probs = self.symbol_func(**self.make_mxnet_weight_dict(
                inputs, softmax_label, args[self.data_target_cnt:len(args)]))
            if softmax_label is None:
                return probs

            samples_num = X.shape[0]
            targets = np.zeros((samples_num, self.num_classes))
            targets[np.arange(samples_num), softmax_label] = 1
            loss = -np.sum(targets * np.log(probs)) / samples_num
            for i in self.get_index_reg_weight():
                loss = loss + np.sum(0.5 * args[i]**2 * self.reg)

            return loss
def affine_backward(dout, cache):
  """
  Computes the backward pass for an affine layer.

  Inputs:
  - dout: Upstream derivative, of shape (N, M)
  - cache: Tuple of:
    - x: Input data, of shape (N, d_1, ... d_k)
    - w: Weights, of shape (D, M)
    - b: Biases, of shape (M,)

  Returns a tuple of:
  - dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
  - dw: Gradient with respect to w, of shape (D, M)
  - db: Gradient with respect to b, of shape (M,)
  """
  x, w, b = cache
  x_plain = np.reshape(x, (x.shape[0], -1))

  db = np.sum(dout, axis=0)

  dx_plain = np.dot(dout, np.transpose(w))

  dx = np.reshape(dx_plain, x.shape)
  dw = np.dot(np.transpose(x_plain), dout)

  return dx, dw, db
Example #33
 def forward(self, X):
     a = np.dot(self.params['fc1'], X.T)
     h = np.maximum(0, a)
     logits = np.dot(h.T, self.params['policy_fc_last'].T)
     ps = np.exp(logits - np.max(logits, axis=1, keepdims=True))
     ps /= np.sum(ps, axis=1, keepdims=True)
     vs = np.dot(h.T, self.params['vf_fc_last'].T) + self.params['vf_fc_last_bias']
     return ps, vs
Example #34
def probs(scores):
    """
    Calculates the probabilities from a neural network's class scores.
    :param scores: The score matrix of form (N x K), where N is the number of observations and K is the number of classes.
    :return: The probabilities of the same form as the input scores.
    """
    exp_scores = np.exp(scores)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
Example #35
def blob_normalization(X,
                       settings,
                       gamma,
                       beta,
                       mode='train',
                       epsilon=1e-5,
                       momentum=0.9,
                       running_mean=None,
                       running_variance=None):
    N, D = map(int, X.shape)
    size = N * D

    if running_mean is None:
        running_mean = np.zeros(1)
    if running_variance is None:
        running_variance = np.zeros(1)

    if mode == 'train':
        if 'shared_mean' in settings:
            mean = np.sum(X) / size
        else:
            mean = np.sum(X, axis=0) / N
            mean = np.reshape(mean, (1, D))

        centered_X = X - mean

        if 'shared_deviation' in settings:
            variance = np.sum(centered_X**2) / size
        else:
            variance = np.sum(centered_X**2, axis=0) / N
            variance = np.reshape(variance, (1, D))

        deviation = variance**0.5
        rescaled_X = centered_X / deviation

        out = gamma * rescaled_X + beta

        running_mean = momentum * running_mean + (1.0 - momentum) * mean
        running_variance = momentum * running_variance + (1.0 -
                                                          momentum) * variance

    elif mode == 'test':
        X_hat = (X - running_mean) / np.sqrt(running_variance + epsilon)
        out = gamma * X_hat + beta

    return out, running_mean, running_variance
Example #36
def softmax_crossentropy(x, y):
    EPSI = 1e-6
    batch_size, seq_len, prob_dim = x.shape
    x = x.reshape((x.shape[0] * x.shape[1], x.shape[2]))
    y = y.reshape((y.shape[0] * y.shape[1], ))

    #print x.shape, y.shape
    # x should be (batch, prob)
    # y should be (batch, )

    x_dev = x - np.max(x, axis=1,
                       keepdims=True)  # minpy doesn't support x.max()
    sm = x_dev - np.log(EPSI + np.sum(np.exp(x_dev), axis=1, keepdims=True))
    ids = np.arange(0, y.shape[0]) * prob_dim + y  # flat row stride is prob_dim, not seq_len
    ce = -np.sum(sm.reshape((sm.shape[0] * sm.shape[1], ))[ids]) / (
        1.0 * y.shape[0])  # minpy doesn't support -1 in shape inference
    return ce
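
A hedged shape sketch with made-up data: two sequences of length three over a four-word vocabulary, so with uniform scores every position costs about log(4):

x = np.zeros((2, 3, 4))
y = np.array([[0, 1, 2],
              [3, 0, 1]])
print(softmax_crossentropy(x, y))   # roughly log(4) = 1.386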
Example #37
    def loss(caffe_layer_specs, X, T):
        # original code:
        # log_prior = -L2_reg * np.dot(W_vect, W_vect)
        log_prior = 0
        for caffe_layer in caffe_layer_specs:
            log_prior += -L2_reg * np.dot(caffe_layer.get_learnable_params()[0], caffe_layer.get_learnable_params()[0])

        log_lik = np.sum(predictions(caffe_layer_specs, X) * T)
        return - log_prior - log_lik
Example #38
def regularization_loss(w, lambda_):
    """
    Calculates the regularization loss of the output weights.
    Regularization loss is used to favor smaller magnitudes of weights.
    :param w: The weight matrix of the output layer of form (H x K),
    where H is the size of the previous layer and K is the number of classes.
    :param lambda_: A hyperparameter used to control the magnitude of the weight.
    :return: The regularization loss.
    """
    return .5 * lambda_ * np.sum(np.square(w))
Example #39
def softmax_cross_entropy(prob, label):
    N = prob.shape[0]
    C = prob.shape[1]
    if len(label.shape) == 1:
        #convert it to one hot encoding
        onehot_label = np.zeros([N, C])
        np.onehot_encode(label, onehot_label)
    else:
        onehot_label = label
    return -np.sum(np.log(prob) * onehot_label) / N
 def grad(g):
     import numpy as np
     y = label.astype(int)
     probs = np.exp(x - np.max(x, axis=1, keepdims=True))
     probs /= np.sum(probs, axis=1, keepdims=True)
     N = x.shape[0]
     dx = probs.copy()
     dx[np.arange(N), y] -= 1
     dx /= N
     return dx
Example #41
def softmax_cross_entropy(prob, label):
    N = prob.shape[0]
    C = prob.shape[1]
    if len(label.shape) == 1:
        #convert it to one hot encoding
        onehot_label = np.zeros([N, C])
        np.onehot_encode(label, onehot_label)
    else:
        onehot_label = label
    return -np.sum(np.log(prob) * onehot_label) / N
Example #42
 def trainOneEpoch(self, input, target):
     """Train network for given input and target with gradient descent"""
     w = self.weights
     loss = self.loss(input, target)
     dw = self.gradient(self.weights, input, target)
     # gradient descent
     w -= self.lr * dw
     deltaW = np.sum((self.weights - w)**2)  # change in weights
     self.weights = w  # update weights
     return loss, deltaW
Example #43
def quick_grad_check(fun,
                     arg0,
                     extra_args=(),
                     kwargs={},
                     verbose=True,
                     eps=EPS,
                     rtol=RTOL,
                     atol=ATOL,
                     rs=None):
    """Checks the gradient of a function (w.r.t. to its first arg) in a random direction"""

    if verbose:
        print("Checking gradient of {0} at {1}".format(fun, arg0))

    if rs is None:
        rs = nnp.random.RandomState()

    random_dir = rs.standard_normal(nnp.shape(arg0))
    random_dir = random_dir / nnp.sqrt(nnp.sum(random_dir * random_dir))

    if not extra_args == ():
        unary_fun = lambda x: fun(arg0 + x * random_dir, extra_args)
        numeric_grad = (unary_fun(eps / 2) - unary_fun(-eps / 2)) / eps
        analytic_grad = np.sum(grad(fun)(arg0, extra_args) * random_dir)
    else:
        unary_fun = lambda x: fun(arg0 + x * random_dir)
        numeric_grad = (unary_fun(eps / 2) - unary_fun(-eps / 2)) / eps
        analytic_grad = np.sum(grad(fun)(arg0) * random_dir)

    if isinstance(numeric_grad, minpy.array.Number):
        assert abs((analytic_grad - numeric_grad).get_data(None)) < atol and abs((analytic_grad - numeric_grad).get_data(None)) < abs((analytic_grad * rtol).get_data(None)), \
            "Check failed! nd={0}, ad={1}".format(numeric_grad, analytic_grad)
    elif isinstance(numeric_grad, minpy.array.Array):
        assert nnp.prod(nnp.shape(analytic_grad.asnumpy())
                        [:]) == 1, "Currently only support check loss"
        assert abs((analytic_grad - numeric_grad).asnumpy()) < atol and abs((analytic_grad - numeric_grad).asnumpy()) < abs((analytic_grad * rtol).asnumpy()), \
            "Check failed! nd={0}, ad={1}".format(numeric_grad, analytic_grad)
    else:
        assert False
    if verbose:
        print("Gradient projection OK (numeric grad: {0}, analytic grad: {1})".
              format(numeric_grad, analytic_grad))
Example #44
def backpropagation(x, s, y, hidden_layers, wh, bh, w_out, b_out, alpha):
    """
    Performs the backpropagation of a neural network.
    :param x: The input data of form (N x D), where N is the number of observations and D is the dimensionality.
    :param s: The score matrix of form (N x K), where N is the number of observations and K is the number of classes.
    :param y: The ground truth labels for each observation.
    :param hidden_layers: An array containing the values of each hidden layer as a vector.
    :param wh: The weights of each hidden layer connection as array. Each weight is a matrix of (H_i-1 ... H_i),
    where H_i-1 is the size of the previous hidden layer (or the input layer) and H_i is the size of the corresponding
    hidden layer.
    :param bh: The biases of each hidden layer as array. Each bias is a vector of the same length of the corresponding
    hidden layer.
    :param w_out: The weight of the output layer as matrix of form (H x K),
    where H is the size of the last hidden layer and K is the number of classes.
    :param b_out: The bias of the output layer as vector of length K, where K is the number of classes.
    :param alpha: The factor by which negative inputs are scaled in ReLU activations. Set to 0 to avoid leaky ReLU.
    :return: The backpropagation returns relevant gradients as a tuple containing the following values:
    * An array containing the gradients for the connection weights of each hidden layer of the same form as `wh`.
    * An array containing the gradients for the biases of each hidden layer of the same form as `bh`.
    * An array containing the gradients for the connection weights of the output layer of the same form as `w_out`.
    * An array containing the gradients for the biases of the output layer of the same form as `b_out`.
    """
    dscores = cross_entropy_loss_gradient(s, y)
    dw_out2 = hidden_layers[-1].T.dot(dscores)
    db_out2 = np.sum(dscores, axis=0, keepdims=True)
    dhiddens = {}
    dwh2 = [np.full(w_i.shape, .0) for w_i in wh]
    dbh2 = [np.empty(b_i.shape) for b_i in bh]
    for h in range(len(hidden_layers) - 1, -1, -1):
        if h == len(hidden_layers) - 1:
            dhidden = dscores.dot(w_out.T)
        else:
            dhidden = dhiddens[h + 1].dot(wh[h + 1].T)
        dhidden[hidden_layers[h] < 0] *= alpha  # leaky-ReLU backward: scale the upstream gradient
        dhiddens[h] = dhidden
        if h == 0:
            dwh2[h] = x.T.dot(dhidden)
        else:
            dwh2[h] = hidden_layers[h - 1].T.dot(dhidden)
        dbh2[h] = np.sum(dhidden, axis=0, keepdims=True)
    dw_out2 += lambda_ * w_out
    return dwh2, dbh2, dw_out2, db_out2
def softmax_loss(x, y):
  """
  Computes the loss and gradient for softmax classification.

  Inputs:
  - x: Input data, of shape (N, C) where x[i, j] is the score for the jth class
    for the ith input.
  - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
    0 <= y[i] < C

  Returns:
  - loss: Scalar giving the loss
  """
  # Subtract the row-wise max (via expand_dims) for numerical stability.
  probs = np.exp(x - np.expand_dims(np.max(x, axis=1), axis = 1))
  probs = probs / np.expand_dims(np.sum(probs, axis=1), axis = 1)
  N = x.shape[0]
  loss = -np.sum(np.log(probs[np.arange(N), y])) / N

  return loss
Example #46
def l2_loss(x, label):
    """
    The Mean Square Error loss for regression.
    """
    N = x.shape[0]
    C = x.shape[1]
    if len(label.shape) == 1:
        #convert it to one hot encoding
        onehot_label = np.zeros([N, C])
        np.onehot_encode(label, onehot_label)
    else:
        onehot_label = label
    return np.sum((x - onehot_label) ** 2) / N
Example #47
def temporal_softmax_loss(x, y, mask, verbose=False):
    """
    A temporal version of softmax loss for use in RNNs. We assume that we are
    making predictions over a vocabulary of size V for each timestep of a
    timeseries of length T, over a minibatch of size N. The input x gives scores
    for all vocabulary elements at all timesteps, and y gives the indices of the
    ground-truth element at each timestep. We use a cross-entropy loss at each
    timestep, summing the loss over all timesteps and averaging across the
    minibatch.

    As an additional complication, we may want to ignore the model output at some
    timesteps, since sequences of different length may have been combined into a
    minibatch and padded with NULL tokens. The optional mask argument tells us
    which elements should contribute to the loss.

    Inputs:
    - x: Input scores, of shape (N, T, V)
    - y: Ground-truth indices, of shape (N, T) where each element is in the range
       0 <= y[i, t] < V
    - mask: Boolean array of shape (N, T) where mask[i, t] tells whether or not
      the scores at x[i, t] should contribute to the loss.

    Returns:
    - loss: Scalar giving loss
    """
    N, T, V = x.shape

    x_flat = x.reshape(N * T, V)
    y_flat = y.reshape(N * T)
    mask_flat = mask.reshape(N * T)

    probs = np.exp(x_flat - np.max(x_flat, axis=1, keepdims=True))
    probs = probs / np.sum(probs, axis=1, keepdims=True)
    loss = -np.sum(mask_flat * np.log(probs[np.arange(N * T), y_flat])) / N

    return loss
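
A hedged sketch, assuming np is NumPy: two sequences of three steps over a four-word vocabulary, with the mask switching off the last step of the second sequence, so five of the six positions contribute about log(4) each:

x = np.zeros((2, 3, 4))
y = np.zeros((2, 3), dtype=int)
mask = np.array([[1, 1, 1],
                 [1, 1, 0]])
print(temporal_softmax_loss(x, y, mask))   # 5 * log(4) / 2, roughly 3.47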
Example #48
    def check_accuracy(self, dataiter, num_samples=None):
        """
        Check accuracy of the model on the provided data.

        Parameters
        ----------
        dataiter
            data iterator that can produce batches.
        num_samples
            If not None and dataiter has more than num_samples datapoints,
            subsample the data and only test the model on num_samples datapoints.

        Returns
        -------
        acc
            Scalar giving the fraction of instances that were correctly
            classified by the model.
        """
        # Maybe subsample the data
        N = dataiter.num_data
        check_dataiter = dataiter
        if num_samples is not None and N > num_samples:
            # Sample a sub iter
            check_dataiter = dataiter.getsubiter(num_samples)
        else:
            # Use the entire dataiter otherwise.
            check_dataiter.reset()

        if self.task_type == 'classification':
            acc_count = 0
            num_samples = 0
            for each_batch in check_dataiter:
                predict = self.model.forward_batch(each_batch, mode='test').asnumpy()
                # TODO(minjie): multiple labels.
                acc_count += np.sum(np.argmax(predict, axis=1) == each_batch.label[0])
                num_samples += check_dataiter.batch_size
            return float(acc_count.asnumpy()) / num_samples
        elif self.task_type == 'regression':
            loss = 0
            batch_count = 0
            for each_batch in check_dataiter:
                predict = self.model.forward_batch(each_batch, mode='test').asnumpy()
                loss += self.model.loss(predict, each_batch.label[0])
                batch_count += 1
            return float(loss.asnumpy()) / batch_count
        else:
            raise ValueError('Task type is either classification or regression.')
def test_sum_forward():

  np_x = py_np.zeros((2, 10))
  np_w = py_np.zeros((10, 3))
  np_b = py_np.zeros(3)

  x = NumpyVarToMinpy(np_x)
  w = NumpyVarToMinpy(np_w)
  b = NumpyVarToMinpy(np_b)

  x_plain = np.reshape(x, (x.shape[0], -1))
  out0 = np.dot(x_plain, w)
  out = out0 + b

  np_out = MinpyVarToNumpy(out)

  var = py_np.random.randn(2, 3)
  tmp = NumpyVarToMinpy(var)
  sum_tmp = np.sum(tmp, axis = 0)

  sum_py = MinpyVarToNumpy(sum_tmp)
 def loss_theta(layer):
     pred = predict(layer, inputs)
     return -np.sum(np.log(pred) * targets)  # negative log likelihood
Example #51
def loss(w, x):
    prob = predict(w, x)
    return -np.sum(np.log(prob) * t) / 10000  + 0.5 * w * w
Example #52
 def loss(self, predict, y):
     # Add L2 regularization for all the weights.
     reg_loss = 0.0
     for name, weight in self.params.items():
         reg_loss += np.sum(weight ** 2)
     return layers.softmax_cross_entropy(predict, y) + 0.5 * weight_decay * reg_loss
def sigmoid(x):
    return np.multiply(0.5, np.add(np.tanh(x), 1))

x = mx.sym.Variable(name='x')
fc = mx.sym.FullyConnected(name='fc', data=x)
#fc = mx.sym.FullyConnected(name='fc', data=x, num_hidden=inputs.shape[1])
act = mx.sym.Activation(data=fc, act_type='sigmoid')
f = core.function(act)

def predict(weights, inputs):
    return f(x=inputs, fc_weight=weights, ctx=mx.cpu())

def training_loss(weights, inputs):
    preds = predict(weights, inputs)
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    return -np.sum(np.log(label_probabilities))

xshape = (256, 500)
wshape = (500, 250)
tshape = (256, 250)
inputs = np.random.rand(*xshape) - 0.5
targets = np.random.randint(0, 2, size=tshape)
weights = np.random.rand(*wshape) - 0.5

training_gradient_fun = core.grad(training_loss)

print('Initial loss: {}'.format(training_loss(weights, inputs)))
for i in range(100):
    gr = training_gradient_fun(weights, inputs)
    #print('Training gradient: {}'.format(gr))
    weights -= gr * 0.1
Example #54
 def check_fn(x):
     y = x + 1
     print(mp.exp(y))
     return mp.sum(2 * y)
Example #55
 def check_fn(x):
     return mp.sum(x + x)
def training_loss(inputs, targets, fc_weight, fc_bias, conv_weight, conv_bias):
    preds = predict(inputs, fc_weight, fc_bias, conv_weight, conv_bias)
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    return -np.sum(np.log(label_probabilities))
Example #57
def training_loss(weights, inputs):
    preds = predict(weights, inputs)
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    l = -np.sum(np.log(label_probabilities))
    return l
def batchnorm_forward(x, gamma, beta, bn_param):
  """
  Forward pass for batch normalization.
  
  During training the sample mean and (uncorrected) sample variance are
  computed from minibatch statistics and used to normalize the incoming data.
  During training we also keep an exponentially decaying running mean of the mean
  and variance of each feature, and these averages are used to normalize data
  at test-time.

  At each timestep we update the running averages for mean and variance using
  an exponential decay based on the momentum parameter:

  running_mean = momentum * running_mean + (1 - momentum) * sample_mean
  running_var = momentum * running_var + (1 - momentum) * sample_var

  Note that the batch normalization paper suggests a different test-time
  behavior: they compute sample mean and variance for each feature using a
  large number of training images rather than using a running average. For
  this implementation we have chosen to use running averages instead since
  they do not require an additional estimation step; the torch7 implementation
  of batch normalization also uses running averages.

  Input:
  - x: Data of shape (N, D)
  - gamma: Scale parameter of shape (D,)
  - beta: Shift parameter of shape (D,)
  - bn_param: Dictionary with the following keys:
    - mode: 'train' or 'test'; required
    - eps: Constant for numeric stability
    - momentum: Constant for running mean / variance.
    - running_mean: Array of shape (D,) giving running mean of features
    - running_var: Array of shape (D,) giving running variance of features

  Returns:
  - out: of shape (N, D)
  """
  mode = bn_param['mode']
  eps = bn_param.get('eps', 1e-5)
  momentum = bn_param.get('momentum', 0.9)

  N, D = x.shape
  running_mean = bn_param.get('running_mean', np.zeros(D))
  running_var = bn_param.get('running_var', np.zeros(D))

  out = None
  if mode == 'train':
    mean = np.sum(x, axis = 0)/float(N)
    x_mean = (x - mean)

    sqr_x_mean = x_mean ** 2
    var = np.sum(sqr_x_mean, axis = 0)/float(N)
    sqrt_var = np.sqrt(var + eps)

    inv_sqrt_var = 1.0/sqrt_var

    x_hat = x_mean * inv_sqrt_var
    out = gamma * x_hat + beta

    running_mean = momentum*running_mean + (1.0 - momentum) * mean
    running_var = momentum*running_var + (1.0 - momentum) * var
  elif mode == 'test':
    x_hat = (x - running_mean)/np.sqrt(running_var + eps)
    out = gamma * x_hat + beta
  else:
    raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

  # Store the updated running means back into bn_param
  bn_param['running_mean'] = running_mean
  bn_param['running_var'] = running_var
 
  return out
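
A hedged usage sketch, assuming np is NumPy: a training-mode call populates the running statistics stored in bn_param, and a test-mode call then reuses them:

bn_param = {'mode': 'train'}
x = np.random.randn(16, 3) * 2.0 + 5.0
gamma, beta = np.ones(3), np.zeros(3)
out = batchnorm_forward(x, gamma, beta, bn_param)
print(np.sum(out, axis=0) / 16.0)   # roughly zero per feature after normalization

bn_param['mode'] = 'test'
print(batchnorm_forward(x, gamma, beta, bn_param).shape)   # (16, 3)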