Example #1
    def __init__(self,
                 input_dim=3 * 32 * 32,
                 hidden_dim=100,
                 num_classes=10,
                 weight_scale=1e-3,
                 reg=0.0,
                 conv_mode='lazy',
                 dtype=py_np.float64):
        """
        Initialize a new network.

        Inputs:
        - input_dim: An integer giving the size of the input
        - hidden_dim: An integer giving the size of the hidden layer
        - num_classes: An integer giving the number of classes to classify
        - weight_scale: Scalar giving the standard deviation for random
          initialization of the weights.
        - reg: Scalar giving L2 regularization strength.
        """
        super(TwoLayerNet, self).__init__(conv_mode)
        self.params = {}
        self.reg = reg

        self.params['W1'] = random.randn(input_dim, hidden_dim) * weight_scale
        self.params['b1'] = np.zeros((hidden_dim))
        self.params['W2'] = random.randn(hidden_dim, num_classes) * weight_scale
        self.params['b2'] = np.zeros((num_classes))
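A minimal usage sketch (hypothetical, not part of the snippet above; it assumes TwoLayerNet and its minpy-backed np / random imports are available from the surrounding module):

model = TwoLayerNet(input_dim=3 * 32 * 32, hidden_dim=100, num_classes=10)
print(model.params['W1'].shape)  # (3072, 100)
print(model.params['b2'].shape)  # (10,)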
Example #2
    def __init__(self,
                 input_dim=3 * 32 * 32,
                 hidden_dim=100,
                 num_classes=10,
                 weight_scale=1e-3,
                 reg=0.0,
                 conv_mode='lazy',
                 dtype=py_np.float64):
        """
    Initialize a new network.

    Inputs:
    - input_dim: An integer giving the size of the input
    - hidden_dim: An integer giving the size of the hidden layer
    - num_classes: An integer giving the number of classes to classify
    - dropout: Scalar between 0 and 1 giving dropout strength.
    - weight_scale: Scalar giving the standard deviation for random
      initialization of the weights.
    - reg: Scalar giving L2 regularization strength.
    """
        super(TwoLayerNet, self).__init__(conv_mode)
        self.params = {}
        self.reg = reg

        self.params['W1'] = random.randn(input_dim, hidden_dim) * weight_scale
        self.params['b1'] = np.zeros((hidden_dim))
        self.params['W2'] = random.randn(hidden_dim,
                                         num_classes) * weight_scale
        self.params['b2'] = np.zeros((num_classes))
Example #3
def xavier(shape, _):
    """Initialize weights with xavier initializer.

    Xavier initializer init matrix based on fan_in and fan_out

    Parameters
    ----------
    shape : tuple
        Shape of the array to be initialized.
    _ : placeholder

    Returns
    -------
    Array
        Initialized array of size `shape`.

    """

    fan_out = shape[0]
    if len(shape) > 1:
        fan_in = numpy.prod(shape[1:])
    else:
        fan_in = 0
    # Xavier scale sqrt(6 / (fan_in + fan_out)), used here as the std of the normal draw.
    var = numpy.sqrt(6.0 / (fan_out + fan_in))
    ret = npr.randn(*shape) * var
    return ret
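A worked check of the scale (hedged; it assumes numpy and numpy.random as npr are imported as in the snippet): for shape (100, 50), fan_out = 100 and fan_in = 50, so the multiplier is sqrt(6.0 / 150) = 0.2.

w = xavier((100, 50), None)
print(w.shape)  # (100, 50)
print(w.std())  # roughly 0.2, since randn draws have unit standard deviation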
Example #4
def xavier(shape, config):
    fan_out = shape[0]
    if len(shape) > 1:
        fan_in = numpy.prod(shape[1:])
    else:
        fan_in = 0
    var = numpy.sqrt(6.0 / (fan_out + fan_in))
    ret = npr.randn(*shape) * var
    return ret
Example #5
    def set_param(self):
        self.params = {}

        c_cnt, height, width = self.input_dim
        f_cnt = self.num_filters
        f_h, f_w = self.filter_size, self.filter_size

        self.params['conv1_weight'] = random.randn(f_cnt, c_cnt, f_h,
                                                   f_w) * self.weight_scale
        self.params['conv1_bias'] = np.zeros(f_cnt)

        # TODO(Haoran): all of these dimension calculations should be
        # replaced by querying symbol.arg_list
        conv_stride = 1
        conv_pad = (f_h - 1) // 2

        Hc = 1 + (height + 2 * conv_pad - f_h) // conv_stride
        Wc = 1 + (width + 2 * conv_pad - f_w) // conv_stride

        pool_height, pool_width = 2, 2
        pool_stride = 2

        Hp = (Hc - pool_height) // pool_stride + 1
        Wp = (Wc - pool_width) // pool_stride + 1

        # weight has to be transposed to fit MXNet's symbol
        self.params['fc1_weight'] = np.transpose(
            random.randn(5408, self.hidden_dim) * self.weight_scale)
        self.params['fc1_bias'] = np.zeros((self.hidden_dim))

        # weight has to be transposed to fit MXNet's symbol
        self.params['fc2_weight'] = np.transpose(
            random.randn(self.hidden_dim, self.num_classes) *
            self.weight_scale)
        self.params['fc2_bias'] = np.zeros((self.num_classes))

        #TODO(Haoran): move following into parent structured model class
        self.param_keys = self.params.keys()

        # Build key's index in loss func's arglist
        self.key_args_index = {}
        for i, key in enumerate(self.param_keys):
            # data, targets would be the first two elements in arglist
            self.key_args_index[key] = self.data_target_cnt + i
Example #6
    def set_param(self):
        self.params = {}

        c_cnt, height, width = self.input_dim
        f_cnt = self.num_filters
        f_h, f_w = self.filter_size, self.filter_size

        self.params['conv1_weight'] = random.randn(f_cnt, c_cnt, f_h,
                                                   f_w) * self.weight_scale
        self.params['conv1_bias'] = np.zeros(f_cnt)

        # TODO(Haoran): all of these dimension calculations should be
        # replaced by querying symbol.arg_list
        conv_stride = 1
        conv_pad = (f_h - 1) // 2

        Hc = 1 + (height + 2 * conv_pad - f_h) // conv_stride
        Wc = 1 + (width + 2 * conv_pad - f_w) // conv_stride

        pool_height, pool_width = 2, 2
        pool_stride = 2

        Hp = (Hc - pool_height) // pool_stride + 1
        Wp = (Wc - pool_width) // pool_stride + 1

        # weight has to be transposed to fit MXNet's symbol
        self.params['fc1_weight'] = np.transpose(random.randn(
            5408, self.hidden_dim) * self.weight_scale)
        self.params['fc1_bias'] = np.zeros((self.hidden_dim))

        # weight has to be transposed to fit MXNet's symbol
        self.params['fc2_weight'] = np.transpose(random.randn(
            self.hidden_dim, self.num_classes) * self.weight_scale)
        self.params['fc2_bias'] = np.zeros((self.num_classes))

        #TODO(Haoran): move following into parent structured model class
        self.param_keys = self.params.keys()

        # Build key's index in loss func's arglist
        self.key_args_index = {}
        for i, key in enumerate(self.param_keys):
            # data, targets would be the first two elements in arglist
            self.key_args_index[key] = self.data_target_cnt + i
Example #7
def test_policy():
    @minpy.wrap_policy(minpy.OnlyNumPyPolicy())
    def gaussian_cluster_generator(num_samples=10000,
                                   num_features=500,
                                   num_classes=5):
        # with minpy.OnlyNumPyPolicy():
        mu = np.random.rand(num_classes, num_features)
        sigma = np.ones((num_classes, num_features)) * 0.1
        num_cls_samples = num_samples // num_classes
        x = np.zeros((num_samples, num_features))
        y = np.zeros((num_samples, num_classes))
        for i in range(num_classes):
            cls_samples = np.random.normal(mu[i, :], sigma[i, :],
                                           (num_cls_samples, num_features))
            x[i * num_cls_samples:(i + 1) * num_cls_samples] = cls_samples
            y[i * num_cls_samples:(i + 1) * num_cls_samples, i] = 1
        return x, y

    def predict(w, x):
        a = np.exp(np.dot(x, w))
        a_sum = np.sum(a, axis=1, keepdims=True)
        prob = a / a_sum
        return prob

    def train_loss(w, x):
        prob = predict(w, x)
        loss = -np.sum(label * np.log(prob)) / num_samples
        return loss

    """Use Minpy's auto-grad to derive a gradient function off loss"""
    grad_function = grad_and_loss(train_loss)

    # Using gradient descent to fit the correct classes.
    def train(w, x, loops):
        for i in range(loops):
            dw, loss = grad_function(w, x)
            if i % 10 == 0:
                print('Iter {}, training loss {}'.format(i, loss))
            # gradient descent
            w -= 0.1 * dw

    # Initialize training data.
    num_samples = 10000
    num_features = 500
    num_classes = 5
    data, label = gaussian_cluster_generator(num_samples, num_features,
                                             num_classes)

    # Initialize training weight and train
    weight = random.randn(num_features, num_classes)
    train(weight, data, 100)
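The loss above is plain softmax regression, so the gradient that grad_and_loss returns can be cross-checked against the closed form x.T.dot(prob - label) / num_samples. A NumPy-only sketch of that check (illustrative names, not part of the test above):

import numpy as onp

def softmax_loss_and_grad(w, x, y):
    # Forward pass: row-wise softmax probabilities.
    a = onp.exp(x.dot(w))
    prob = a / a.sum(axis=1, keepdims=True)
    n = x.shape[0]
    # Cross-entropy loss and its analytic gradient with respect to w.
    loss = -onp.sum(y * onp.log(prob)) / n
    grad = x.T.dot(prob - y) / n
    return loss, grad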
Example #8
def test_policy():
    @minpy.wrap_policy(minpy.OnlyNumPyPolicy())
    def gaussian_cluster_generator(num_samples=10000, num_features=500, num_classes=5):
        # with minpy.OnlyNumPyPolicy():
        mu = np.random.rand(num_classes, num_features)
        sigma = np.ones((num_classes, num_features)) * 0.1
        num_cls_samples = num_samples // num_classes
        x = np.zeros((num_samples, num_features))
        y = np.zeros((num_samples, num_classes))
        for i in range(num_classes):
            cls_samples = np.random.normal(mu[i,:], sigma[i,:], (num_cls_samples, num_features))
            x[i*num_cls_samples:(i+1)*num_cls_samples] = cls_samples
            y[i*num_cls_samples:(i+1)*num_cls_samples,i] = 1
        return x, y
    
    def predict(w, x):
        a = np.exp(np.dot(x, w))
        a_sum = np.sum(a, axis=1, keepdims=True)
        prob = a / a_sum
        return prob
    
    def train_loss(w, x):
        prob = predict(w, x)
        loss = -np.sum(label * np.log(prob)) / num_samples
        return loss
    
    """Use Minpy's auto-grad to derive a gradient function off loss"""
    grad_function = grad_and_loss(train_loss)
    
    # Using gradient descent to fit the correct classes.
    def train(w, x, loops):
        for i in range(loops):
            dw, loss = grad_function(w, x)
            if i % 10 == 0:
                print('Iter {}, training loss {}'.format(i, loss))
            # gradient descent
            w -= 0.1 * dw
    
    # Initialize training data.
    num_samples = 10000
    num_features = 500
    num_classes = 5
    data, label = gaussian_cluster_generator(num_samples, num_features, num_classes)
    
    # Initialize training weight and train
    weight = random.randn(num_features, num_classes)
    train(weight, data, 100)
Example #9
File: init.py Project: wddabc/minpy
def gaussian(shape, config):
    """Initialize weights with gaussian distribution.

    Parameters
    ----------
    shape : tuple
        Shape of the array to be initialized.
    config : dict
        Mean and standard deviation of the distribution.

    Returns
    -------
    Array
        Initialized array of size `shape`

    """
    config.setdefault('mu', 0.0)
    config.setdefault('stdvar', 0.001)
    stdvar = config['stdvar']
    meanvar = config['mu']
    return npr.randn(*shape) * stdvar + meanvar
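A hypothetical usage sketch (it assumes numpy.random is imported as npr, as in the function above); note that config is filled in place via setdefault, so a partially specified dict works:

cfg = {'stdvar': 0.01}
w = gaussian((64, 32), cfg)
print(w.shape)  # (64, 32); cfg now also carries the default 'mu': 0.0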
Example #10
def gaussian(shape, config):
    """Initialize weights with gaussian distribution.

    Parameters
    ----------
    shape : tuple
        Shape of the array to be initialized.
    config : dict
        Mean and standard deviation of the distribution.

    Returns
    -------
    Array
        Initialized array of size `shape`

    """
    config.setdefault('mu', 0.0)
    config.setdefault('stdvar', 0.001)
    stdvar = config['stdvar']
    meanvar = config['mu']
    return npr.randn(*shape) * stdvar + meanvar
Example #11
def gaussian(shape, config):
    config.setdefault('mu', 0.0)
    config.setdefault('stdvar', 0.001)
    stdvar = config['stdvar']
    mu = config['mu']
    return npr.randn(*shape) * stdvar + mu
Example #12
    train_inputs = build_dataset(__file__, seq_length, input_size, max_lines=60)

    pred_fun, loglike_fun, num_weights = build_lstm(input_size, state_size, output_size)

    def print_training_prediction(weights):
        print("Training text                         Predicted text")
        logprobs = np.asarray(pred_fun(weights, train_inputs))
        for t in range(logprobs.shape[1]):
            training_text  = one_hot_to_string(train_inputs[:,t,:])
            predicted_text = one_hot_to_string(logprobs[:,t,:])
            print(training_text.replace('\n', ' ') + "|" + predicted_text.replace('\n', ' '))

    # Wrap function to only have one argument, for scipy.minimize.
    def training_loss(weights):
        return -loglike_fun(weights, train_inputs, train_inputs)

    def callback(weights):
        print("Train loss:", training_loss(weights))
        print_training_prediction(weights)

    # Build gradient of loss function using autograd.
    training_loss_and_grad = value_and_grad(training_loss)

    init_weights = npr.randn(num_weights) * param_scale

    print("Training LSTM...")
    result = minimize(training_loss_and_grad, init_weights, jac=True, method='CG',
                      options={'maxiter':train_iters}, callback=callback)
    trained_weights = result.x
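A minimal sketch of the contract relied on above (toy objective; it assumes autograd and scipy are installed, and the names are illustrative): minimize(..., jac=True) expects a callable returning a (loss, gradient) pair, which is exactly what value_and_grad produces.

import autograd.numpy as anp
from autograd import value_and_grad
from scipy.optimize import minimize

def objective(w):
    # Simple convex objective with minimum at w = [1, 1, 1].
    return anp.sum((w - 1.0) ** 2)

result = minimize(value_and_grad(objective), anp.zeros(3),
                  jac=True, method='CG')
print(result.x)  # close to [1., 1., 1.]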
Example #13
  def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
               dropout=0, use_batchnorm=False, reg=0.0,
               weight_scale=1e-2, dtype=np.float32, seed=None):
    """
    Initialize a new FullyConnectedNet.
    
    Inputs:
    - hidden_dims: A list of integers giving the size of each hidden layer.
    - input_dim: An integer giving the size of the input.
    - num_classes: An integer giving the number of classes to classify.
    - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=0 then
      the network should not use dropout at all.
    - use_batchnorm: Whether or not the network should use batch normalization.
    - reg: Scalar giving L2 regularization strength.
    - weight_scale: Scalar giving the standard deviation for random
      initialization of the weights.
    - dtype: A numpy datatype object; all computations will be performed using
      this datatype. float32 is faster but less accurate, so you should use
      float64 for numeric gradient checking.
    - seed: If not None, then pass this random seed to the dropout layers. This
      will make the dropout layers deterministic so we can gradient check the
      model.
    """
    self.use_batchnorm = use_batchnorm
    self.use_dropout = dropout > 0
    self.reg = reg
    self.num_layers = 1 + len(hidden_dims)
    self.dtype = dtype
    self.params = {}

    ############################################################################
    # TODO: Initialize the parameters of the network, storing all values in    #
    # the self.params dictionary. Store weights and biases for the first layer #
    # in W1 and b1; for the second layer use W2 and b2, etc. Weights should be #
    # initialized from a normal distribution with standard deviation equal to  #
    # weight_scale and biases should be initialized to zero.                   #
    #                                                                          #
    # When using batch normalization, store scale and shift parameters for the #
    # first layer in gamma1 and beta1; for the second layer use gamma2 and     #
    # beta2, etc. Scale parameters should be initialized to one and shift      #
    # parameters should be initialized to zero.                                #
    ############################################################################
    for l in range(self.num_layers):
      if l == 0:
        input_d = input_dim
      else:
        input_d = hidden_dims[l-1]

      if l < self.num_layers - 1:
        out_d = hidden_dims[l]
      else:
        out_d = num_classes

      self.params[self.GetWeightName(l)] = random.randn(input_d, out_d) * weight_scale
      self.params[self.GetBiasName(l)] = np.zeros((out_d))
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # When using dropout we need to pass a dropout_param dictionary to each
    # dropout layer so that the layer knows the dropout probability and the mode
    # (train / test). You can pass the same dropout_param to each dropout layer.
    self.dropout_param = {}
    if self.use_dropout:
      self.dropout_param = {'mode': 'train', 'p': dropout}
      if seed is not None:
        self.dropout_param['seed'] = seed
    
    # With batch normalization we need to keep track of running means and
    # variances, so we need to pass a special bn_param object to each batch
    # normalization layer. You should pass self.bn_params[0] to the forward pass
    # of the first batch normalization layer, self.bn_params[1] to the forward
    # pass of the second batch normalization layer, etc.
    self.bn_params = []
    if self.use_batchnorm:
      self.bn_params = [{'mode': 'train'} for i in range(self.num_layers - 1)]
    
    # Cast all parameters to the correct datatype
    for k, v in self.params.items():
      self.params[k] = v.astype(dtype)
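A hypothetical construction sketch (it assumes FullyConnectedNet and its np import are available from the surrounding module): a three-layer net with dropout, using float64 as the docstring recommends for gradient checking.

model = FullyConnectedNet([100, 50], input_dim=3 * 32 * 32, num_classes=10,
                          dropout=0.5, reg=0.01, dtype=np.float64, seed=0)
print(model.num_layers)      # 3
print(sorted(model.params))  # per-layer weight and bias parameter names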
Example #14
def test_stack():
    arr = [rnd.randn(3, 4) for _ in range(10)]
    res = np.stack(arr)
    assert res.shape == (10, 3, 4)
Example #15
def test_concatenate():
    arr = [rnd.randn(3, 4) for _ in range(10)]
    res = np.concatenate(arr, axis=1)
    assert res.shape == (3, 40)
Example #16
def xavier(shape, config=None):
    # Note: len(shape) / sum(shape) equals 2 / (fan_in + fan_out) for 2-D shapes,
    # used here directly as the scale (unlike the sqrt-based form in Example #3).
    var = len(shape) / sum(shape)
    return npr.randn(*shape) * var
Example #17
def _randn(l, c):
    return random.randn(l, c)
Example #18
        logprobs = np.asarray(pred_fun(weights, train_inputs))
        for t in range(logprobs.shape[1]):
            training_text = one_hot_to_string(train_inputs[:, t, :])
            predicted_text = one_hot_to_string(logprobs[:, t, :])
            print(
                training_text.replace('\n', ' ') + "|" +
                predicted_text.replace('\n', ' '))

    # Wrap function to only have one argument, for scipy.minimize.
    def training_loss(weights):
        return -loglike_fun(weights, train_inputs, train_inputs)

    def callback(weights):
        print("Train loss:", training_loss(weights))
        print_training_prediction(weights)

    # Build gradient of loss function using autograd.
    training_loss_and_grad = value_and_grad(training_loss)

    init_weights = npr.randn(num_weights) * param_scale

    print("Training LSTM...")
    result = minimize(training_loss_and_grad,
                      init_weights,
                      jac=True,
                      method='CG',
                      options={'maxiter': train_iters},
                      callback=callback)
    trained_weights = result.x
Example #19
    def __init__(self,
                 hidden_dims,
                 input_dim=3 * 32 * 32,
                 num_classes=10,
                 dropout=0,
                 use_batchnorm=False,
                 reg=0.0,
                 weight_scale=1e-2,
                 seed=None,
                 dtype=py_np.float64,
                 conv_mode='lazy'):
        """
    Initialize a new FullyConnectedNet.
    
    Inputs:
    - hidden_dims: A list of integers giving the size of each hidden layer.
    - input_dim: An integer giving the size of the input.
    - num_classes: An integer giving the number of classes to classify.
    - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=0 then
      the network should not use dropout at all.
    - use_batchnorm: Whether or not the network should use batch normalization.
    - reg: Scalar giving L2 regularization strength.
    - weight_scale: Scalar giving the standard deviation for random
      initialization of the weights.
    - seed: If not None, then pass this random seed to the dropout layers. This
      will make the dropout layers deteriminstic so we can gradient check the
      model.
    """
        super(FullyConnectedNet, self).__init__(conv_mode)
        self.use_batchnorm = use_batchnorm
        self.use_dropout = dropout > 0
        self.reg = reg
        self.num_layers = 1 + len(hidden_dims)
        self.params = {}

        # Define parameter names for a given layer index
        self.w_name = lambda l: 'W' + str(l)
        self.b_name = lambda l: 'b' + str(l)
        self.bn_ga_name = lambda l: 'bn_ga' + str(l)
        self.bn_bt_name = lambda l: 'bn_bt' + str(l)

        for l in range(self.num_layers):
            if l == 0:
                input_d = input_dim
            else:
                input_d = hidden_dims[l - 1]

            if l < self.num_layers - 1:
                out_d = hidden_dims[l]
            else:
                out_d = num_classes

            self.params[self.w_name(l)] = random.randn(input_d,
                                                       out_d) * weight_scale
            self.params[self.b_name(l)] = np.zeros((out_d))
            if l < self.num_layers and self.use_batchnorm:
                self.params[self.bn_ga_name(l)] = np.ones((out_d))
                self.params[self.bn_bt_name(l)] = np.zeros((out_d))

        self.param_keys = self.params.keys()

        # When using dropout we need to pass a dropout_param dictionary to each
        # dropout layer so that the layer knows the dropout probability and the mode
        # (train / test). You can pass the same dropout_param to each dropout layer.
        self.dropout_param = {}
        if self.use_dropout:
            self.dropout_param = {'mode': 'train', 'p': dropout}
            if seed is not None:
                self.dropout_param['seed'] = seed

        # With batch normalization we need to keep track of running means and
        # variances, so we need to pass a special bn_param object to each batch
        # normalization layer.
        self.bn_params = []
        if self.use_batchnorm:
            self.bn_params = [{'mode': 'train'}
                              for i in range(self.num_layers - 1)]

        # Build key's index in loss func's arglist
        self.key_args_index = {}
        for i, key in enumerate(self.param_keys):
            # data, targets would be the first two elements in arglist
            self.key_args_index[key] = self.data_target_cnt + i

        # Init Key to index in loss_function args
        self.w_idx = self.wrap_param_idx(self.w_name)
        self.b_idx = self.wrap_param_idx(self.b_name)
        self.bn_ga_idx = self.wrap_param_idx(self.bn_ga_name)
        self.bn_bt_idx = self.wrap_param_idx(self.bn_bt_name)
Example #20
def train_loss(w, x):
    prob = predict(w, x)
    loss = -np.sum(label * np.log(prob)) / num_samples
    return loss


"""Use Minpy's auto-grad to derive a gradient function off loss"""
grad_function = grad_and_loss(train_loss)


# Using gradient descent to fit the correct classes.
def train(w, x, loops):
    for i in range(loops):
        dw, loss = grad_function(w, x)
        if i % 10 == 0:
            print('Iter {}, training loss {}'.format(i, loss))
        # gradient descent
        w -= 0.1 * dw


# Initialize training data.
num_samples = 10000
num_features = 500
num_classes = 5
data, label = make_data(num_samples, num_features, num_classes)

# Initialize training weight and train
weight = random.randn(num_features, num_classes)
train(weight, data, 100)
Example #21
  def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
               dropout=0, use_batchnorm=False, reg=0.0,
               weight_scale=1e-2, seed=None, dtype=py_np.float64, conv_mode='lazy'):

    """
    Initialize a new FullyConnectedNet.
    
    Inputs:
    - hidden_dims: A list of integers giving the size of each hidden layer.
    - input_dim: An integer giving the size of the input.
    - num_classes: An integer giving the number of classes to classify.
    - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=0 then
      the network should not use dropout at all.
    - use_batchnorm: Whether or not the network should use batch normalization.
    - reg: Scalar giving L2 regularization strength.
    - weight_scale: Scalar giving the standard deviation for random
      initialization of the weights.
    - seed: If not None, then pass this random seed to the dropout layers. This
      will make the dropout layers deterministic so we can gradient check the
      model.
    """
    super(FullyConnectedNet, self).__init__(conv_mode)
    self.use_batchnorm = use_batchnorm
    self.use_dropout = dropout > 0
    self.reg = reg
    self.num_layers = 1 + len(hidden_dims)
    self.params = {}

    # Define parameter names for a given layer index
    self.w_name = lambda l: 'W' + str(l)
    self.b_name = lambda l: 'b' + str(l)
    self.bn_ga_name = lambda l: 'bn_ga' + str(l)
    self.bn_bt_name = lambda l: 'bn_bt' + str(l)

    for l in range(self.num_layers):
      if l == 0:
        input_d = input_dim
      else:
        input_d = hidden_dims[l-1]

      if l < self.num_layers - 1:
        out_d = hidden_dims[l]
      else:
        out_d = num_classes

      self.params[self.w_name(l)] = random.randn(input_d, out_d) * weight_scale
      self.params[self.b_name(l)] = np.zeros((out_d))
      if l < self.num_layers and self.use_batchnorm:
        self.params[self.bn_ga_name(l)] = np.ones((out_d))
        self.params[self.bn_bt_name(l)] = np.zeros((out_d))

    self.param_keys = self.params.keys()

    # When using dropout we need to pass a dropout_param dictionary to each
    # dropout layer so that the layer knows the dropout probability and the mode
    # (train / test). You can pass the same dropout_param to each dropout layer.
    self.dropout_param = {}
    if self.use_dropout:
      self.dropout_param = {'mode': 'train', 'p': dropout}
      if seed is not None:
        self.dropout_param['seed'] = seed
    
    # With batch normalization we need to keep track of running means and
    # variances, so we need to pass a special bn_param object to each batch
    # normalization layer.
    self.bn_params = []
    if self.use_batchnorm:
      self.bn_params = [{'mode': 'train'} for i in range(self.num_layers - 1)]
    
    # Build key's index in loss func's arglist
    self.key_args_index = {}
    for i, key in enumerate(self.param_keys):
      # data, targets would be the first two elements in arglist
      self.key_args_index[key] = self.data_target_cnt + i

    # Init Key to index in loss_function args
    self.w_idx = self.wrap_param_idx(self.w_name)
    self.b_idx = self.wrap_param_idx(self.b_name)
    self.bn_ga_idx = self.wrap_param_idx(self.bn_ga_name)
    self.bn_bt_idx = self.wrap_param_idx(self.bn_bt_name)

def predict(w, x):
    a = np.exp(np.dot(x, w))
    a_sum = np.sum(a, axis=1, keepdims=True)
    prob = a / a_sum
    return prob

def train_loss(w, x):
    prob = predict(w, x)
    loss = -np.sum(label * np.log(prob)) / num_samples
    return loss

"""Use Minpy's auto-grad to derive a gradient function off loss"""
grad_function = grad_and_loss(train_loss)

# Using gradient descent to fit the correct classes.
def train(w, x, loops):
    for i in range(loops):
        dw, loss = grad_function(w, x)
        if i % 10 == 0:
            print('Iter {}, training loss {}'.format(i, loss))
        # gradient descent
        w -= 0.1 * dw

# Initialize training data.
num_samples = 10000
num_features = 500
num_classes = 5
data, label = make_data(num_samples, num_features, num_classes)

# Initialize training weight and train
weight = random.randn(num_features, num_classes)
train(weight, data, 100)