def __init__(self,
             input_dim=3 * 32 * 32,
             hidden_dim=100,
             num_classes=10,
             weight_scale=1e-3,
             reg=0.0,
             conv_mode='lazy',
             dtype=py_np.float64):
    """
    Set up the parameters of a two-layer network.

    Inputs:
    - input_dim: An integer giving the size of the input (rows of W1).
    - hidden_dim: An integer giving the size of the hidden layer
      (columns of W1, rows of W2, length of b1).
    - num_classes: An integer giving the number of classes to classify
      (columns of W2, length of b2).
    - weight_scale: Scalar multiplied onto the random.randn draws used for
      W1 and W2.
    - reg: Scalar giving L2 regularization strength; stored as self.reg.
    - conv_mode: Passed unchanged to the parent class constructor.
    - dtype: Accepted for interface compatibility; not used in this
      constructor.
    """
    super(TwoLayerNet, self).__init__(conv_mode)
    self.reg = reg
    # Entries are evaluated top to bottom, so W1 is drawn before W2.
    self.params = {
        'W1': random.randn(input_dim, hidden_dim) * weight_scale,
        'b1': np.zeros(hidden_dim),
        'W2': random.randn(hidden_dim, num_classes) * weight_scale,
        'b2': np.zeros(num_classes),
    }
def xavier(shape, _):
    """Initialize weights with a fan-based random scale.

    The scale is ``sqrt(6 / (shape[0] + prod(shape[1:])))``; for a
    one-dimensional shape the second term is taken as 0.

    Parameters
    ----------
    shape : tuple
        Shape of the array to be initialized.
    _ : placeholder
        Ignored.

    Returns
    -------
    Array
        ``npr.randn`` samples of size `shape` multiplied by the scale.
    """
    leading = shape[0]
    remaining = numpy.prod(shape[1:]) if len(shape) > 1 else 0
    scale = numpy.sqrt(6.0 / (leading + remaining))
    return npr.randn(*shape) * scale
def xavier(shape, config):
    """Return a random array of size `shape` scaled by its fan sizes.

    `config` is accepted but not read. The draw comes from ``npr.randn``
    and is multiplied by ``sqrt(6 / (shape[0] + prod(shape[1:])))``, where
    the product term is 0 for a one-dimensional shape.
    """
    first_dim, other_dims = shape[0], shape[1:]
    if len(other_dims) > 0:
        other_size = numpy.prod(other_dims)
    else:
        other_size = 0
    factor = numpy.sqrt(6.0 / (first_dim + other_size))
    samples = npr.randn(*shape)
    return samples * factor
def set_param(self):
    """Create the conv/fc parameters and their argument-index map.

    Populates ``self.params`` with ``conv1_*``, ``fc1_*`` and ``fc2_*``
    entries, stores the key collection in ``self.param_keys`` and fills
    ``self.key_args_index`` with ``self.data_target_cnt + position``.
    """
    params = self.params = {}
    scale = self.weight_scale
    channels, in_h, in_w = self.input_dim
    n_filters = self.num_filters
    k_h = self.filter_size
    k_w = self.filter_size

    params['conv1_weight'] = random.randn(n_filters, channels, k_h, k_w) * scale
    params['conv1_bias'] = np.zeros(n_filters)

    #TODO(Haoran): whole stuff about all dimension calculations
    #should be substituted by quering symbol.arg_list
    # NOTE: the values below are computed but never used; fc1's input size
    # is the literal 5408.
    stride = 1
    pad = (k_h - 1) / 2
    conv_h = 1 + (in_h + 2 * pad - k_h) / stride
    conv_w = 1 + (in_w + 2 * pad - k_w) / stride
    pool_h, pool_w = 2, 2
    pool_step = 2
    out_h = (conv_h - pool_h) / pool_step + 1
    out_w = (conv_w - pool_w) / pool_step + 1

    # weight has to be transposed to fit mxnet's symbol
    fc1 = random.randn(5408, self.hidden_dim) * scale
    params['fc1_weight'] = np.transpose(fc1)
    params['fc1_bias'] = np.zeros(self.hidden_dim)

    # weight has to be transposed to fit mxnet's symbol
    fc2 = random.randn(self.hidden_dim, self.num_classes) * scale
    params['fc2_weight'] = np.transpose(fc2)
    params['fc2_bias'] = np.zeros(self.num_classes)

    #TODO(Haoran): move following into parent structured model class
    self.param_keys = self.params.keys()

    # Build key's index in loss func's arglist; data and targets occupy
    # the leading slots, hence the data_target_cnt offset.
    index_of = self.key_args_index = {}
    for position, name in enumerate(self.param_keys):
        index_of[name] = self.data_target_cnt + position
def set_param(self):
    """Initialise ``self.params`` and the key-to-argument-index table.

    Weights are ``random.randn`` draws times ``self.weight_scale``; biases
    are zeros. The two fully connected weights are stored transposed.
    ``self.key_args_index[key]`` is ``self.data_target_cnt`` plus the key's
    position in ``self.param_keys``.
    """
    self.params = {}

    def scaled_randn(*dims):
        # One random draw of the given dimensions, scaled for initialisation.
        return random.randn(*dims) * self.weight_scale

    c_cnt, height, width = self.input_dim
    f_cnt = self.num_filters
    f_h = f_w = self.filter_size
    self.params['conv1_weight'] = scaled_randn(f_cnt, c_cnt, f_h, f_w)
    self.params['conv1_bias'] = np.zeros(f_cnt)

    #TODO(Haoran): whole stuff about all dimension calculations
    #should be substituted by quering symbol.arg_list
    # NOTE: these spatial sizes are not consumed below (fc1 uses 5408).
    conv_stride, pool_stride = 1, 2
    pool_height, pool_width = 2, 2
    conv_pad = (f_h - 1) / 2
    Hc = 1 + (height + 2 * conv_pad - f_h) / conv_stride
    Wc = 1 + (width + 2 * conv_pad - f_w) / conv_stride
    Hp = (Hc - pool_height) / pool_stride + 1
    Wp = (Wc - pool_width) / pool_stride + 1

    # weight has to be transposed to fit mxnet's symbol
    self.params['fc1_weight'] = np.transpose(scaled_randn(5408, self.hidden_dim))
    self.params['fc1_bias'] = np.zeros(self.hidden_dim)
    # weight has to be transposed to fit mxnet's symbol
    self.params['fc2_weight'] = np.transpose(
        scaled_randn(self.hidden_dim, self.num_classes))
    self.params['fc2_bias'] = np.zeros(self.num_classes)

    #TODO(Haoran): move following into parent structured model class
    self.param_keys = self.params.keys()
    # data, targets would be the first elements in the loss func's arglist,
    # so parameter indices start after them.
    self.key_args_index = {}
    for offset, key in enumerate(self.param_keys):
        self.key_args_index[key] = self.data_target_cnt + offset
def test_policy():
    """Fit a softmax classifier to synthetic Gaussian clusters.

    The data generator is wrapped with
    ``minpy.wrap_policy(minpy.OnlyNumPyPolicy())`` and the gradient of the
    loss is obtained from ``grad_and_loss``. The loss is printed every 10
    iterations; the function returns nothing and makes no assertions.
    """

    @minpy.wrap_policy(minpy.OnlyNumPyPolicy())
    def gaussian_cluster_generator(num_samples=10000,
                                   num_features=500,
                                   num_classes=5):
        """Return ``(x, y)``: samples and one-hot labels, one cluster per class."""
        mu = np.random.rand(num_classes, num_features)
        sigma = np.ones((num_classes, num_features)) * 0.1
        # Floor division keeps the per-class count an int on Python 3 as
        # well; it is used as an array shape and as slice bounds below.
        # If num_samples is not a multiple of num_classes, the trailing
        # rows of x and y stay zero.
        num_cls_samples = num_samples // num_classes
        x = np.zeros((num_samples, num_features))
        y = np.zeros((num_samples, num_classes))
        for i in range(num_classes):
            cls_samples = np.random.normal(mu[i, :], sigma[i, :],
                                           (num_cls_samples, num_features))
            x[i * num_cls_samples:(i + 1) * num_cls_samples] = cls_samples
            y[i * num_cls_samples:(i + 1) * num_cls_samples, i] = 1
        return x, y

    def predict(w, x):
        """Row-wise softmax of ``x . w``."""
        a = np.exp(np.dot(x, w))
        a_sum = np.sum(a, axis=1, keepdims=True)
        prob = a / a_sum
        return prob

    def train_loss(w, x):
        """Cross-entropy against the enclosing ``label``, averaged over samples."""
        prob = predict(w, x)
        loss = -np.sum(label * np.log(prob)) / num_samples
        return loss

    # Use Minpy's auto-grad to derive a gradient function off loss.
    grad_function = grad_and_loss(train_loss)

    # Using gradient descent to fit the correct classes.
    def train(w, x, loops):
        for i in range(loops):
            dw, loss = grad_function(w, x)
            if i % 10 == 0:
                print('Iter {}, training loss {}'.format(i, loss))
            # gradient descent; updates the caller's w in place
            w -= 0.1 * dw

    # Initialize training data.
    num_samples = 10000
    num_features = 500
    num_classes = 5
    data, label = gaussian_cluster_generator(num_samples, num_features,
                                             num_classes)

    # Initialize training weight and train
    weight = random.randn(num_features, num_classes)
    train(weight, data, 100)
def test_policy():
    """Run 100 gradient-descent steps of softmax regression on generated data.

    Samples are produced by a generator decorated with
    ``minpy.wrap_policy(minpy.OnlyNumPyPolicy())``; gradients come from
    ``grad_and_loss(train_loss)``. Every tenth iteration the loss is
    printed. Nothing is returned.
    """

    @minpy.wrap_policy(minpy.OnlyNumPyPolicy())
    def gaussian_cluster_generator(num_samples=10000, num_features=500,
                                   num_classes=5):
        """Draw one Gaussian cluster per class; return samples and one-hot labels."""
        mu = np.random.rand(num_classes, num_features)
        sigma = np.ones((num_classes, num_features)) * 0.1
        # Integer (floor) division: a float here would be rejected as a
        # shape / slice index on Python 3. Rows beyond
        # num_classes * num_cls_samples are left as zeros.
        num_cls_samples = num_samples // num_classes
        x = np.zeros((num_samples, num_features))
        y = np.zeros((num_samples, num_classes))
        for i in range(num_classes):
            start = i * num_cls_samples
            stop = (i + 1) * num_cls_samples
            cls_samples = np.random.normal(
                mu[i, :], sigma[i, :], (num_cls_samples, num_features))
            x[start:stop] = cls_samples
            y[start:stop, i] = 1
        return x, y

    def predict(w, x):
        """Softmax over axis 1 of ``np.dot(x, w)``."""
        a = np.exp(np.dot(x, w))
        a_sum = np.sum(a, axis=1, keepdims=True)
        prob = a / a_sum
        return prob

    def train_loss(w, x):
        """Negative log-likelihood of ``label`` divided by ``num_samples``."""
        prob = predict(w, x)
        loss = -np.sum(label * np.log(prob)) / num_samples
        return loss

    # Use Minpy's auto-grad to derive a gradient function off loss.
    grad_function = grad_and_loss(train_loss)

    # Using gradient descent to fit the correct classes.
    def train(w, x, loops):
        for i in range(loops):
            dw, loss = grad_function(w, x)
            if i % 10 == 0:
                print('Iter {}, training loss {}'.format(i, loss))
            # gradient descent (in-place update of w)
            w -= 0.1 * dw

    # Initialize training data.
    num_samples = 10000
    num_features = 500
    num_classes = 5
    data, label = gaussian_cluster_generator(num_samples, num_features,
                                             num_classes)

    # Initialize training weight and train
    weight = random.randn(num_features, num_classes)
    train(weight, data, 100)
def gaussian(shape, config):
    """Initialize weights with a gaussian distribution.

    Missing ``'mu'`` / ``'stdvar'`` entries are written into `config` with
    the defaults 0.0 and 0.001, so the caller's dict is modified.

    Parameters
    ----------
    shape : tuple
        Shape of the array to be initialized.
    config : dict
        Mean (``'mu'``) and standard deviation (``'stdvar'``) of the
        distribution.

    Returns
    -------
    Array
        ``npr.randn(*shape) * stdvar + mu``, of size `shape`.
    """
    # setdefault both records the default in config and returns the
    # effective value.
    mean = config.setdefault('mu', 0.0)
    spread = config.setdefault('stdvar', 0.001)
    return npr.randn(*shape) * spread + mean
def gaussian(shape, config):
    """Return ``npr.randn(*shape) * config['stdvar'] + config['mu']``.

    `config` is updated in place: ``'mu'`` defaults to 0.0 and ``'stdvar'``
    to 0.001 when absent.
    """
    for key, fallback in (('mu', 0.0), ('stdvar', 0.001)):
        config.setdefault(key, fallback)
    noise = npr.randn(*shape)
    return noise * config['stdvar'] + config['mu']
# Build the training inputs from this script's own file (at most 60 lines)
# and construct the LSTM's prediction / log-likelihood functions.
# NOTE(review): seq_length, input_size, state_size, output_size, param_scale
# and train_iters are defined outside this excerpt.
train_inputs = build_dataset(__file__, seq_length, input_size, max_lines=60)

pred_fun, loglike_fun, num_weights = build_lstm(input_size, state_size, output_size)

def print_training_prediction(weights):
    # Print each training sequence next to the model's prediction for it,
    # with newlines replaced by spaces so every pair stays on one row.
    print("Training text Predicted text")
    logprobs = np.asarray(pred_fun(weights, train_inputs))
    # Axis 1 is iterated one slice at a time; each slice is decoded to text.
    for t in range(logprobs.shape[1]):
        training_text = one_hot_to_string(train_inputs[:,t,:])
        predicted_text = one_hot_to_string(logprobs[:,t,:])
        print(training_text.replace('\n', ' ') + "|" + predicted_text.replace('\n', ' '))

# Wrap function to only have one argument, for scipy.minimize.
def training_loss(weights):
    # The inputs are passed as both inputs and targets; the sign is flipped
    # so that minimizing this maximizes the log-likelihood.
    return -loglike_fun(weights, train_inputs, train_inputs)

def callback(weights):
    # Called by the optimizer with the current weights: report the loss and
    # show the current predictions.
    print("Train loss:", training_loss(weights))
    print_training_prediction(weights)

# Build gradient of loss function using autograd.
training_loss_and_grad = value_and_grad(training_loss)

# Random initial weights, scaled by param_scale.
init_weights = npr.randn(num_weights) * param_scale

print("Training LSTM...")
# jac=True: the objective returns (value, gradient) together.
result = minimize(training_loss_and_grad, init_weights, jac=True, method='CG',
                  options={'maxiter':train_iters}, callback=callback)
trained_weights = result.x
def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
             dropout=0, use_batchnorm=False, reg=0.0,
             weight_scale=1e-2, dtype=np.float32, seed=None):
    """
    Initialize a new FullyConnectedNet.

    Inputs:
    - hidden_dims: A list of integers giving the size of each hidden layer.
    - input_dim: An integer giving the size of the input.
    - num_classes: An integer giving the number of classes to classify.
    - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=0 then
      the network should not use dropout at all.
    - use_batchnorm: Whether or not the network should use batch normalization.
    - reg: Scalar giving L2 regularization strength.
    - weight_scale: Scalar giving the standard deviation for random
      initialization of the weights.
    - dtype: A numpy datatype object; every entry of self.params is cast to
      this datatype. float32 is faster but less accurate, so you should use
      float64 for numeric gradient checking.
    - seed: If not None, then pass this random seed to the dropout layers. This
      will make the dropout layers deterministic so we can gradient check the
      model.
    """
    self.use_batchnorm = use_batchnorm
    self.use_dropout = dropout > 0
    self.reg = reg
    self.num_layers = 1 + len(hidden_dims)
    self.dtype = dtype
    self.params = {}

    # Layer l maps layer_dims[l] -> layer_dims[l + 1]. Weights are random
    # draws times weight_scale; biases are zeros. Parameter names come
    # from self.GetWeightName / self.GetBiasName.
    # NOTE(review): no batch-normalization scale/shift parameters are
    # created here even when use_batchnorm is True.
    layer_dims = [input_dim] + list(hidden_dims) + [num_classes]
    for l in range(self.num_layers):
        input_d, out_d = layer_dims[l], layer_dims[l + 1]
        self.params[self.GetWeightName(l)] = random.randn(input_d, out_d) * weight_scale
        self.params[self.GetBiasName(l)] = np.zeros(out_d)

    # When using dropout we need to pass a dropout_param dictionary to each
    # dropout layer so that the layer knows the dropout probability and the mode
    # (train / test). You can pass the same dropout_param to each dropout layer.
    self.dropout_param = {}
    if self.use_dropout:
        self.dropout_param = {'mode': 'train', 'p': dropout}
        if seed is not None:
            self.dropout_param['seed'] = seed

    # With batch normalization we need to keep track of running means and
    # variances, so we need to pass a special bn_param object to each batch
    # normalization layer. You should pass self.bn_params[0] to the forward pass
    # of the first batch normalization layer, self.bn_params[1] to the forward
    # pass of the second batch normalization layer, etc.
    self.bn_params = []
    if self.use_batchnorm:
        # range (not xrange) so this also runs on Python 3.
        self.bn_params = [{'mode': 'train'} for _ in range(self.num_layers - 1)]

    # Cast all parameters to the correct datatype. items() works on both
    # Python 2 and 3; the snapshot avoids iterating a dict while
    # reassigning its values.
    for k, v in list(self.params.items()):
        self.params[k] = v.astype(dtype)
def test_stack():
    """Stacking ten (3, 4) arrays must give a (10, 3, 4) result."""
    pieces = []
    for _ in range(10):
        pieces.append(rnd.randn(3, 4))
    stacked = np.stack(pieces)
    assert stacked.shape == (10, 3, 4)
def test_concatenate():
    """Concatenating ten (3, 4) arrays along axis 1 must give shape (3, 40)."""
    pieces = []
    for _ in range(10):
        pieces.append(rnd.randn(3, 4))
    joined = np.concatenate(pieces, axis=1)
    assert joined.shape == (3, 40)
def xavier(shape, config=None):
    """Return a random array of size `shape` scaled by ``len(shape) / sum(shape)``.

    Parameters
    ----------
    shape : tuple
        Shape of the array to be initialized. Its entries must not sum to
        zero (an empty shape raises ZeroDivisionError).
    config : optional
        Accepted for interface compatibility; not used.

    Returns
    -------
    Array
        ``npr.randn(*shape)`` multiplied by the scale, i.e. by the
        reciprocal of the mean dimension size.
    """
    # float() forces true division. With integer operands Python 2 would
    # floor the quotient to 0 for typical shapes and zero out every weight.
    scale = float(len(shape)) / sum(shape)
    return npr.randn(*shape) * scale
def _randn(l, c):
    """Return the result of ``random.randn`` for an ``l`` by ``c`` request."""
    dims = (l, c)
    return random.randn(*dims)
logprobs = np.asarray(pred_fun(weights, train_inputs)) for t in range(logprobs.shape[1]): training_text = one_hot_to_string(train_inputs[:, t, :]) predicted_text = one_hot_to_string(logprobs[:, t, :]) print( training_text.replace('\n', ' ') + "|" + predicted_text.replace('\n', ' ')) # Wrap function to only have one argument, for scipy.minimize. def training_loss(weights): return -loglike_fun(weights, train_inputs, train_inputs) def callback(weights): print("Train loss:", training_loss(weights)) print_training_prediction(weights) # Build gradient of loss function using autograd. training_loss_and_grad = value_and_grad(training_loss) init_weights = npr.randn(num_weights) * param_scale print("Training LSTM...") result = minimize(training_loss_and_grad, init_weights, jac=True, method='CG', options={'maxiter': train_iters}, callback=callback) trained_weights = result.x
def __init__(self,
             hidden_dims,
             input_dim=3 * 32 * 32,
             num_classes=10,
             dropout=0,
             use_batchnorm=False,
             reg=0.0,
             weight_scale=1e-2,
             seed=None,
             dtype=py_np.float64,
             conv_mode='lazy'):
    """
    Initialize a new FullyConnectedNet.

    Inputs:
    - hidden_dims: A list of integers giving the size of each hidden layer.
    - input_dim: An integer giving the size of the input.
    - num_classes: An integer giving the number of classes to classify.
    - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=0 then
      the network should not use dropout at all.
    - use_batchnorm: Whether or not the network should use batch normalization.
    - reg: Scalar giving L2 regularization strength.
    - weight_scale: Scalar giving the standard deviation for random
      initialization of the weights.
    - seed: If not None, then pass this random seed to the dropout layers. This
      will make the dropout layers deterministic so we can gradient check the
      model.
    - dtype: Accepted for interface compatibility; not used in this
      constructor.
    - conv_mode: Passed unchanged to the parent class constructor.
    """
    super(FullyConnectedNet, self).__init__(conv_mode)
    self.use_batchnorm = use_batchnorm
    self.use_dropout = dropout > 0
    self.reg = reg
    self.num_layers = 1 + len(hidden_dims)
    self.params = {}

    # Define parameter name given # layer
    self.w_name = lambda l: 'W' + str(l)
    self.b_name = lambda l: 'b' + str(l)
    self.bn_ga_name = lambda l: 'bn_ga' + str(l)
    self.bn_bt_name = lambda l: 'bn_bt' + str(l)

    # Layer l maps layer_dims[l] -> layer_dims[l + 1].
    layer_dims = [input_dim] + list(hidden_dims) + [num_classes]
    for l in range(self.num_layers):
        input_d, out_d = layer_dims[l], layer_dims[l + 1]
        self.params[self.w_name(l)] = random.randn(input_d, out_d) * weight_scale
        self.params[self.b_name(l)] = np.zeros(out_d)
        # NOTE(review): the original guard also tested
        # `l < self.num_layers`, which is always true inside this loop, so
        # scale/shift parameters are created for the output layer too,
        # while bn_params below has only num_layers - 1 entries. Confirm
        # whether `l < self.num_layers - 1` was intended.
        if self.use_batchnorm:
            self.params[self.bn_ga_name(l)] = np.ones(out_d)
            self.params[self.bn_bt_name(l)] = np.zeros(out_d)

    self.param_keys = self.params.keys()

    # When using dropout we need to pass a dropout_param dictionary to each
    # dropout layer so that the layer knows the dropout probability and the mode
    # (train / test). You can pass the same dropout_param to each dropout layer.
    self.dropout_param = {}
    if self.use_dropout:
        self.dropout_param = {'mode': 'train', 'p': dropout}
        if seed is not None:
            self.dropout_param['seed'] = seed

    # With batch normalization we need to keep track of running means and
    # variances, so we need to pass a special bn_param object to each batch
    # normalization layer.
    self.bn_params = []
    if self.use_batchnorm:
        # range (as in the layer loop above) instead of xrange, which does
        # not exist on Python 3.
        self.bn_params = [{'mode': 'train'}
                          for _ in range(self.num_layers - 1)]

    # Build key's index in loss func's arglist
    self.key_args_index = {}
    for i, key in enumerate(self.param_keys):
        # data, targets would be the first elements in arglist
        self.key_args_index[key] = self.data_target_cnt + i

    # Init Key to index in loss_function args
    self.w_idx = self.wrap_param_idx(self.w_name)
    self.b_idx = self.wrap_param_idx(self.b_name)
    self.bn_ga_idx = self.wrap_param_idx(self.bn_ga_name)
    self.bn_bt_idx = self.wrap_param_idx(self.bn_bt_name)
def train_loss(w, x):
    # Cross-entropy of predict(w, x) against `label`, divided by
    # `num_samples`. Both names are assigned further down in this excerpt;
    # `predict` is defined outside it.
    prob = predict(w, x)
    loss = -np.sum(label * np.log(prob)) / num_samples
    return loss

"""Use Minpy's auto-grad to derive a gradient function off loss"""
# grad_function(w, x) is unpacked below as (gradient, loss).
grad_function = grad_and_loss(train_loss)

# Using gradient descent to fit the correct classes.
def train(w, x, loops):
    # Runs `loops` update steps and prints the loss every 10th iteration.
    for i in range(loops):
        dw, loss = grad_function(w, x)
        if i % 10 == 0:
            print('Iter {}, training loss {}'.format(i, loss))
        # gradient descent; `-=` updates the caller's w in place
        w -= 0.1 * dw

# Initialize training data.
num_samples = 10000
num_features = 500
num_classes = 5
# NOTE(review): make_data is defined outside this excerpt; it is used here as
# returning a (data, label) pair.
data, label = make_data(num_samples, num_features, num_classes)

# Initialize training weight and train
weight = random.randn(num_features, num_classes)
train(weight, data, 100)
def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
             dropout=0, use_batchnorm=False, reg=0.0,
             weight_scale=1e-2, seed=None, dtype=py_np.float64,
             conv_mode='lazy'):
    """
    Initialize a new FullyConnectedNet.

    Inputs:
    - hidden_dims: A list of integers giving the size of each hidden layer.
    - input_dim: An integer giving the size of the input.
    - num_classes: An integer giving the number of classes to classify.
    - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=0 then
      the network should not use dropout at all.
    - use_batchnorm: Whether or not the network should use batch normalization.
    - reg: Scalar giving L2 regularization strength.
    - weight_scale: Scalar giving the standard deviation for random
      initialization of the weights.
    - seed: If not None, then pass this random seed to the dropout layers. This
      will make the dropout layers deterministic so we can gradient check the
      model.
    - dtype: Not read by this constructor; kept in the signature for callers.
    - conv_mode: Forwarded as-is to the parent constructor.
    """
    super(FullyConnectedNet, self).__init__(conv_mode)
    self.use_batchnorm = use_batchnorm
    self.use_dropout = dropout > 0
    self.reg = reg
    self.num_layers = 1 + len(hidden_dims)
    self.params = {}

    # Define parameter name given # layer
    self.w_name = lambda l: 'W' + str(l)
    self.b_name = lambda l: 'b' + str(l)
    self.bn_ga_name = lambda l: 'bn_ga' + str(l)
    self.bn_bt_name = lambda l: 'bn_bt' + str(l)

    # Consecutive pairs of sizes give each layer's (input, output) width.
    sizes = [input_dim] + list(hidden_dims) + [num_classes]
    for l in range(self.num_layers):
        input_d = sizes[l]
        out_d = sizes[l + 1]
        self.params[self.w_name(l)] = random.randn(input_d, out_d) * weight_scale
        self.params[self.b_name(l)] = np.zeros(out_d)
        # NOTE(review): the former `l < self.num_layers` check could never
        # be false inside this loop, so batchnorm scale/shift are also
        # created for the last layer although bn_params has only
        # num_layers - 1 entries; verify whether that is intended.
        if self.use_batchnorm:
            self.params[self.bn_ga_name(l)] = np.ones(out_d)
            self.params[self.bn_bt_name(l)] = np.zeros(out_d)

    self.param_keys = self.params.keys()

    # When using dropout we need to pass a dropout_param dictionary to each
    # dropout layer so that the layer knows the dropout probability and the mode
    # (train / test). You can pass the same dropout_param to each dropout layer.
    self.dropout_param = {}
    if self.use_dropout:
        self.dropout_param = {'mode': 'train', 'p': dropout}
        if seed is not None:
            self.dropout_param['seed'] = seed

    # With batch normalization we need to keep track of running means and
    # variances, so we need to pass a special bn_param object to each batch
    # normalization layer.
    self.bn_params = []
    if self.use_batchnorm:
        # xrange is unavailable on Python 3; range matches the layer loop
        # above and works on both major versions.
        self.bn_params = [{'mode': 'train'} for _ in range(self.num_layers - 1)]

    # Build key's index in loss func's arglist
    self.key_args_index = {}
    for i, key in enumerate(self.param_keys):
        # data, targets would be the first elements in arglist
        self.key_args_index[key] = self.data_target_cnt + i

    # Init Key to index in loss_function args
    self.w_idx = self.wrap_param_idx(self.w_name)
    self.b_idx = self.wrap_param_idx(self.b_name)
    self.bn_ga_idx = self.wrap_param_idx(self.bn_ga_name)
    self.bn_bt_idx = self.wrap_param_idx(self.bn_bt_name)
a_sum = np.sum(a, axis=1, keepdims=True) prob = a / a_sum return prob def train_loss(w, x): prob = predict(w, x) loss = -np.sum(label * np.log(prob)) / num_samples return loss """Use Minpy's auto-grad to derive a gradient function off loss""" grad_function = grad_and_loss(train_loss) # Using gradient descent to fit the correct classes. def train(w, x, loops): for i in range(loops): dw, loss = grad_function(w, x) if i % 10 == 0: print('Iter {}, training loss {}'.format(i, loss)) # gradient descent w -= 0.1 * dw # Initialize training data. num_samples = 10000 num_features = 500 num_classes = 5 data, label = make_data(num_samples, num_features, num_classes) # Initialize training weight and train weight = random.randn(num_features, num_classes) train(weight, data, 100)