def test_graph_diamond():
    """Gradient-check a graph that splits into two branches and rejoins.

    Both branches start from the same first hidden layer and both reuse the
    same output weights, so the first-layer and output-layer parameters each
    receive gradient contributions along two paths.
    """
    npr.seed(2)

    n_rows, n_in, n_hid1, n_hid2 = 10, 5, 6, 7

    # The draw order below fixes which random values each array receives.
    inputs = kayak.Inputs(npr.randn(n_rows, n_in))
    w_in = kayak.Parameter(npr.randn(n_in, n_hid1))
    w_left = kayak.Parameter(npr.randn(n_hid1, n_hid2))
    w_right = kayak.Parameter(npr.randn(n_hid1, n_hid2))
    w_out = kayak.Parameter(npr.randn(n_hid2, 1))

    def layer(below, weights):
        # One hidden layer: matrix product followed by SoftReLU.
        return kayak.SoftReLU(kayak.MatMult(below, weights))

    shared = layer(inputs, w_in)
    mid_left = layer(shared, w_left)
    mid_right = layer(shared, w_right)
    top_left = layer(mid_left, w_out)
    top_right = layer(mid_right, w_out)

    out = kayak.MatSum(kayak.MatAdd(top_left, top_right))
    out.value

    params = (w_in, w_left, w_right, w_out)

    # Report every gradient discrepancy first, then assert on each of them.
    for param in params:
        print(kayak.util.checkgrad(param, out))
    for param in params:
        assert kayak.util.checkgrad(param, out) < MAX_GRAD_DIFF
def __init__(self, maxnum, reduced_dims):
    """Build a two-matrix linear reconstruction graph with an L2 loss.

    maxnum is the length of the input column vector; reduced_dims is the
    number of rows of the intermediate product W1 * input.
    """
    self.threshold = 1e-2

    # The input starts as an all-zero column; both weight matrices are
    # standard-normal draws scaled by 0.1 (W1 is drawn before W2).
    placeholder = np.zeros((maxnum, 1))
    down_init = 0.1 * np.random.randn(reduced_dims, maxnum)
    up_init = 0.1 * np.random.randn(maxnum, reduced_dims)

    self.input = ky.Parameter(placeholder)
    self.W1 = ky.Parameter(down_init)
    self.W2 = ky.Parameter(up_init)

    # input -> W1 * input -> W2 * (W1 * input), compared against the input.
    self.output = ky.MatMult(self.W1, self.input)
    self.recons = ky.MatMult(self.W2, self.output)
    self.loss = ky.MatSum(ky.L2Loss(self.recons, self.input))

    # A variant that added L2Norm penalties (weight 1e-2) on W1 and W2 is
    # disabled; the total loss is the reconstruction loss alone.
    self.totloss = self.loss
def test_cache_utility():
    """Gradient-check a 17-layer graph in which every layer reads its input twice."""
    npr.seed(3)

    depth = 17
    width = 3

    data = kayak.Inputs(npr.randn(10, width))
    first_w = kayak.Parameter(npr.randn(width, width))
    shared_w = kayak.Parameter(npr.randn(width, width))

    act = kayak.MatMult(data, first_w)
    for _ in xrange(depth):
        # Two separate products of the same operands are added together.
        left = kayak.MatMult(act, shared_w)
        right = kayak.MatMult(act, shared_w)
        act = kayak.SoftReLU(kayak.MatAdd(left, right))

    total = kayak.MatSum(act)

    assert kayak.util.checkgrad(first_w, total) < 1e-4
def test_matmult_values_2():
    """MatMult of a 5x5 parameter with itself agrees with np.dot."""
    npr.seed(2)

    for _ in xrange(NUM_TRIALS):
        raw = npr.randn(5, 5)
        node = kayak.Parameter(raw)
        squared = kayak.MatMult(node, node)

        assert squared.value.shape == (5, 5)

        expected = np.dot(raw, raw)
        assert np.all(close_float(squared.value, expected))
def test_reshape_2():
    """Gradients pass back through a (5, 10) -> (2, 25) Reshape."""
    npr.seed(2)

    raw = npr.randn(5, 10)
    source = kayak.Parameter(raw)
    reshaped = kayak.Reshape(source, (2, 25))
    weights = kayak.Parameter(npr.randn(25, 5))
    total = kayak.MatSum(kayak.MatMult(reshaped, weights))

    total.value

    # The gradient must come back in the parameter's own shape.
    grad_shape = total.grad(source).shape
    assert grad_shape == raw.shape
    assert kayak.util.checkgrad(source, total) < MAX_GRAD_DIFF
def test_matmult_grad_2():
    """Gradient of sum(A * A) with respect to a 5x5 parameter A."""
    npr.seed(4)

    for _ in xrange(NUM_TRIALS):
        raw = npr.randn(5, 5)
        node = kayak.Parameter(raw)

        # The same parameter feeds both operands of the product.
        total = kayak.MatSum(kayak.MatMult(node, node))
        total.value

        grad_shape = total.grad(node).shape
        assert grad_shape == (5, 5)
        assert kayak.util.checkgrad(node, total) < MAX_GRAD_DIFF
def test_transpose_3():
    """Gradients pass back through Transpose when it feeds a MatMult."""
    npr.seed(3)

    raw = npr.randn(5, 10)
    source = kayak.Parameter(raw)
    flipped = kayak.Transpose(source)
    weights = kayak.Parameter(npr.randn(5, 5))
    total = kayak.MatSum(kayak.MatMult(flipped, weights))

    total.value

    # The gradient is reported in the untransposed parameter's shape.
    grad_shape = total.grad(source).shape
    assert grad_shape == raw.shape
    assert kayak.util.checkgrad(source, total) < MAX_GRAD_DIFF
def test_matmult_values_1():
    """MatMult of a (5, 6) and a (6, 7) parameter agrees with np.dot."""
    npr.seed(1)

    for _ in xrange(NUM_TRIALS):
        left_raw = npr.randn(5, 6)
        right_raw = npr.randn(6, 7)

        left = kayak.Parameter(left_raw)
        right = kayak.Parameter(right_raw)
        product = kayak.MatMult(left, right)

        assert product.value.shape == (5, 7)

        expected = np.dot(left_raw, right_raw)
        assert np.all(close_float(product.value, expected))
def test_graph_chain():
    """Gradient-check every weight matrix of a three-layer SoftReLU chain."""
    npr.seed(1)

    n_rows, n_in, n_hid1, n_hid2 = 10, 5, 6, 7

    inputs = kayak.Inputs(npr.randn(n_rows, n_in))

    # Weights are drawn in layer order, right after the inputs.
    weights = [
        kayak.Parameter(npr.randn(n_in, n_hid1)),
        kayak.Parameter(npr.randn(n_hid1, n_hid2)),
        kayak.Parameter(npr.randn(n_hid2, 1)),
    ]

    act = inputs
    for w in weights:
        act = kayak.SoftReLU(kayak.MatMult(act, w))

    out = kayak.MatSum(act)
    out.value

    for w in weights:
        assert kayak.util.checkgrad(w, out) < MAX_GRAD_DIFF
def test_graph_dag():
    """Gradient-check randomly wired layered graphs.

    Each layer starts from a zero constant and, with a per-trial probability,
    adds a SoftReLU projection of the inputs and of each earlier layer.
    """
    npr.seed(3)

    depth = 7
    width = 5

    for _ in xrange(NUM_TRIALS):
        link_prob = npr.rand()
        data = kayak.Inputs(npr.randn(25, width))

        params = []
        built = []

        def connect(acc, source):
            # Add SoftReLU(source * W) to acc, with a fresh small random W
            # that is also recorded for the gradient checks.
            w = kayak.Parameter(0.1 * npr.randn(width, width))
            params.append(w)
            return kayak.MatAdd(acc, kayak.SoftReLU(kayak.MatMult(source, w)))

        for _level in xrange(depth):
            acc = kayak.Constant(np.zeros((25, width)))

            if npr.rand() < link_prob:
                acc = connect(acc, data)

            # `built` holds exactly the layers created before this one.
            for earlier in built:
                if npr.rand() < link_prob:
                    acc = connect(acc, earlier)

            built.append(acc)

        out = kayak.MatSum(built[-1])
        out.value

        for w in params:
            diff = kayak.util.checkgrad(w, out, 1e-4)
            print(diff)
            assert diff < 1e-4
def test_matmult_values_3():
    """Three-operand MatMult agrees with nested np.dot calls."""
    npr.seed(3)

    for _ in xrange(NUM_TRIALS):
        first_raw = npr.randn(5, 6)
        second_raw = npr.randn(6, 7)
        third_raw = npr.randn(7, 8)

        first = kayak.Parameter(first_raw)
        second = kayak.Parameter(second_raw)
        third = kayak.Parameter(third_raw)
        product = kayak.MatMult(first, second, third)

        assert product.value.shape == (5, 8)

        # Reference value, associating to the right: A (B C).
        expected = np.dot(first_raw, np.dot(second_raw, third_raw))
        assert np.all(close_float(product.value, expected))
def test_graph_simple():
    """Gradient-check the smallest possible graph: a 1x1 by 1x1 product."""
    npr.seed(1)

    n_rows = 1
    n_in = 1
    n_hid = 1

    inputs = kayak.Inputs(npr.randn(n_rows, n_in))
    weight = kayak.Parameter(npr.randn(n_in, n_hid))

    out = kayak.MatMult(weight, inputs)

    # Each line is "<label> <value>", i.e. label and value joined by a space.
    print("%s %s" % ("Value: ", out.value))
    print("%s %s" % ("Gradient: ", out.grad(weight)))
    print("%s %s" % ("Grad error: ", kayak.util.checkgrad(weight, out)))

    assert kayak.util.checkgrad(weight, out) < MAX_GRAD_DIFF
def test_matmult_grad_1():
    """Gradient of sum(A * B) with respect to both operands."""
    npr.seed(3)

    for _ in xrange(NUM_TRIALS):
        left_raw = npr.randn(5, 6)
        right_raw = npr.randn(6, 7)

        left = kayak.Parameter(left_raw)
        right = kayak.Parameter(right_raw)
        total = kayak.MatSum(kayak.MatMult(left, right))

        total.value

        operands = ((left, (5, 6)), (right, (6, 7)))

        # Shapes first, then finite-difference agreement, left before right.
        for node, shape in operands:
            assert total.grad(node).shape == shape
        for node, _shape in operands:
            assert kayak.util.checkgrad(node, total) < MAX_GRAD_DIFF
def test_matmult_grad_vect_mat():
    """Gradient of a vector-times-matrix MatMult, weighted elementwise."""
    npr.seed(5)

    for _ in xrange(NUM_TRIALS):
        vec_raw = npr.randn(6, )
        mat_raw = npr.randn(6, 7)
        scale_raw = npr.randn(7, )

        vec = kayak.Parameter(vec_raw)
        mat = kayak.Parameter(mat_raw)
        scale = kayak.Parameter(scale_raw)

        product = kayak.MatMult(vec, mat)
        total = kayak.MatSum(kayak.ElemMult(scale, product))

        operands = ((vec, (6, )), (mat, (6, 7)))

        for node, shape in operands:
            assert total.grad(node).shape == shape
        for node, _shape in operands:
            assert kayak.util.checkgrad(node, total) < MAX_GRAD_DIFF
def test_batchnorm_values_1():
    """BatchNormalize output matches a NumPy reference and its gradient checks."""
    npr.seed(1)

    for _ in xrange(NUM_TRIALS):
        data_raw = npr.randn(5, 4)
        weight_raw = npr.randn(4, 2)

        weight = kayak.Parameter(weight_raw)
        data = kayak.Parameter(data_raw)

        normed = kayak.BatchNormalize(data)
        squashed = kayak.TanH(kayak.MatMult(normed, weight))
        total = kayak.MatSum(squashed)

        # Reference: per-column mean and biased variance (plus 1e-6).
        col_mean = np.mean(data_raw, axis=0, keepdims=True)
        col_var = np.mean((data_raw - col_mean)**2, axis=0, keepdims=True) + 1e-6
        expected = (data_raw - col_mean) / np.sqrt(col_var)

        assert np.all(close_float(normed.value, expected))
        assert kayak.util.checkgrad(data, total, verbose=True) < MAX_GRAD_DIFF
def test_matmult_grad_3():
    """Gradient of a three-operand MatMult with respect to every operand."""
    npr.seed(5)

    for _ in xrange(NUM_TRIALS):
        first_raw = npr.randn(5, 6)
        second_raw = npr.randn(6, 7)
        third_raw = npr.randn(7, 8)

        first = kayak.Parameter(first_raw)
        second = kayak.Parameter(second_raw)
        third = kayak.Parameter(third_raw)

        product = kayak.MatMult(first, second, third)
        total = kayak.MatSum(kayak.SoftReLU(product))

        operands = ((first, (5, 6)), (second, (6, 7)), (third, (7, 8)))

        # All shape checks run before any finite-difference check.
        for node, shape in operands:
            assert total.grad(node).shape == shape
        for node, _shape in operands:
            assert kayak.util.checkgrad(node, total) < MAX_GRAD_DIFF
def train(inputs, targets):
    """Train a two-hidden-layer ReLU/dropout classifier with 10 outputs.

    Reads batch_size, layer1_sz, layer2_sz, layer1_dropout, layer2_dropout,
    momentum and learn_rate from the enclosing module. Runs 10 epochs and
    prints "<epoch> <total loss>" after each one.

    Returns a function that takes a new input array, puts the batcher in
    test mode and returns the value of the log-softmax output node.
    """
    # Inputs and targets share one batcher so they stay aligned.
    batcher = kayak.Batcher(batch_size, inputs.shape[0])
    X = kayak.Inputs(inputs, batcher)
    T = kayak.Targets(targets, batcher)

    def affine(below, weights, bias):
        # below * weights + bias
        return kayak.ElemAdd(kayak.MatMult(below, weights), bias)

    # Layer 1: small random weights/biases, then ReLU and dropout.
    W1 = kayak.Parameter(0.1 * npr.randn(inputs.shape[1], layer1_sz))
    B1 = kayak.Parameter(0.1 * npr.randn(1, layer1_sz))
    H1 = kayak.Dropout(kayak.HardReLU(affine(X, W1, B1)),
                       layer1_dropout, batcher=batcher)

    # Layer 2: same recipe on top of layer 1.
    W2 = kayak.Parameter(0.1 * npr.randn(layer1_sz, layer2_sz))
    B2 = kayak.Parameter(0.1 * npr.randn(1, layer2_sz))
    H2 = kayak.Dropout(kayak.HardReLU(affine(H1, W2, B2)),
                       layer2_dropout, batcher=batcher)

    # Output layer: 10 log-softmax units.
    W3 = kayak.Parameter(0.1 * npr.randn(layer2_sz, 10))
    B3 = kayak.Parameter(0.1 * npr.randn(1, 10))
    Y = kayak.LogSoftMax(affine(H2, W3, B3))

    # Summed LogMultinomialLoss between the output and the targets.
    loss = kayak.MatSum(kayak.LogMultinomialLoss(Y, T))

    layers = [(W1, B1), (W2, B2), (W3, B3)]

    # Momentum is kept for the weight matrices only; biases use the raw gradient.
    velocity = [np.zeros(w.shape) for w, _ in layers]

    for epoch in xrange(10):
        total_loss = 0.0

        # Iterating the batcher advances it to the next minibatch.
        for _batch in batcher:
            total_loss += loss.value

            # Every gradient is taken before any parameter is moved.
            grads = [(loss.grad(w), loss.grad(b)) for w, b in layers]

            for idx, (grad_w, _grad_b) in enumerate(grads):
                velocity[idx] = (momentum * velocity[idx]
                                 + (1.0 - momentum) * grad_w)

            for idx, (w, b) in enumerate(layers):
                w.value -= learn_rate * velocity[idx]
                b.value -= learn_rate * grads[idx][1]

        print("%s %s" % (epoch, total_loss))

    def compute_predictions(x):
        # Swap in the new data, switch the batcher to test mode and
        # evaluate the output node (not the loss).
        X.data = x
        batcher.test_mode()
        return Y.value

    return compute_predictions
batcher = kayak.Batcher(batch_size, N) # Build network. kyk_inputs = kayak.Inputs(X, batcher) # Labels. kyk_targets = kayak.Targets(Y, batcher) # First layer weights and biases. kyk_W1 = kayak.Parameter(npr.randn(D, H1)) kyk_B1 = kayak.Parameter(npr.randn(1, H1)) # First layer weight mult plus biases, then nonlinearity. kyk_H1 = kayak.Dropout(kayak.HardReLU( kayak.ElemAdd(kayak.MatMult(kyk_inputs, kyk_W1), kyk_B1)), drop_prob=0.5, batcher=batcher) # Second layer weights and bias. kyk_W2 = kayak.Parameter(npr.randn(H1, P)) kyk_B2 = kayak.Parameter(npr.randn(1, P)) # Second layer multiplication. kyk_out = kayak.Dropout(kayak.HardReLU( kayak.ElemAdd(kayak.MatMult(kyk_H1, kyk_W2), kyk_B2)), drop_prob=0.5, batcher=batcher) # Elementwise Loss. kyk_el_loss = kayak.L2Loss(kyk_out, kyk_targets)
# Script fragment: Poisson regression with an exponential inverse link.
# lam, batch_size, N, X, D, P and true_W are defined outside this fragment.
Y = npr.poisson(lam)

kyk_batcher = kayak.Batcher(batch_size, N)

# Build network.
kyk_inputs = kayak.Inputs(X, kyk_batcher)

# Labels.
kyk_targets = kayak.Targets(Y, kyk_batcher)

# Weights.
W = 0.01 * npr.randn(D, P)
kyk_W = kayak.Parameter(W)

# Linear layer.
kyk_activation = kayak.MatMult(kyk_inputs, kyk_W)

# Exponential inverse-link function.
kyk_lam = kayak.ElemExp(kyk_activation)

# Poisson negative log likelihood.
# Written as rate - log(rate) * count, elementwise.
kyk_nll = kyk_lam - kayak.ElemLog(kyk_lam) * kyk_targets

# Sum the losses.
kyk_loss = kayak.MatSum(kyk_nll)

for ii in xrange(100):

    for batch in kyk_batcher:
        loss = kyk_loss.value
        # Prints the minibatch loss and the squared distance to true_W.
        # NOTE(review): no gradient step on kyk_W is visible in this loop;
        # the fragment may continue beyond what is shown here.
        print loss, np.sum((kyk_W.value - true_W)**2)
def train(inputs, targets, batch_size, learn_rate, momentum, l1_weight, l2_weight, dropout):
    """Train a 10-output softmax regression with input dropout.

    The loss is the summed LogMultinomialLoss plus an L2Norm and an L1Norm
    term on the weight matrix (weighted by l2_weight and l1_weight). Training
    runs for 10 epochs, with momentum on the weights and plain gradient steps
    on the biases.

    Returns a function that takes a new input array, puts the batcher in
    test mode and returns the value of the log-softmax output node.
    """
    # Inputs and targets share one batcher so they stay aligned.
    batcher = kayak.Batcher(batch_size, inputs.shape[0])
    X = kayak.Inputs(inputs, batcher)
    T = kayak.Targets(targets, batcher)

    # Small random initial weights and biases.
    n_features = inputs.shape[1]
    W = kayak.Parameter(0.1 * npr.randn(n_features, 10))
    B = kayak.Parameter(0.1 * npr.randn(1, 10))

    # Dropout on the raw inputs, then an affine map and a log-softmax.
    noisy_inputs = kayak.Dropout(X, dropout, batcher=batcher)
    scores = kayak.ElemAdd(kayak.MatMult(noisy_inputs, W), B)
    Y = kayak.LogSoftMax(scores)

    data_term = kayak.MatSum(kayak.LogMultinomialLoss(Y, T))
    loss = kayak.MatAdd(data_term,
                        kayak.L2Norm(W, l2_weight),
                        kayak.L1Norm(W, l1_weight))

    velocity = np.zeros(W.shape)

    for epoch in xrange(10):
        # Per-epoch totals; they are only accumulated (the progress print
        # that reported them is disabled).
        total_loss = 0.0
        total_grad_W = np.zeros(W.shape)

        # Iterating the batcher advances it to the next minibatch.
        for _batch in batcher:
            total_loss += loss.value

            # Both gradients are taken before either parameter is moved.
            grad_W = loss.grad(W)
            grad_B = loss.grad(B)

            velocity = momentum * velocity + (1.0 - momentum) * grad_W

            W.value -= learn_rate * velocity
            B.value -= learn_rate * grad_B

            total_grad_W += grad_W

    def compute_predictions(x):
        # Swap in the new data, switch the batcher to test mode and
        # evaluate the output node (not the loss).
        X.data = x
        batcher.test_mode()
        return Y.value

    return compute_predictions
# Script fragment: linear regression fitted by minibatch gradient descent.
# X, true_W, N, P, D, batch_size and learn are defined outside this fragment.
# Targets are X * true_W plus Gaussian noise scaled by 0.1.
Y = np.dot(X, true_W) + 0.1 * npr.randn(N, P)

kyk_batcher = kayak.Batcher(batch_size, N)

# Build network.
kyk_inputs = kayak.Inputs(X, kyk_batcher)

# Labels.
kyk_targets = kayak.Targets(Y, kyk_batcher)

# Weights.
W = 0.01 * npr.randn(D, P)
kyk_W = kayak.Parameter(W)

# Linear layer.
kyk_out = kayak.MatMult(kyk_inputs, kyk_W)

# Elementwise Loss.
kyk_el_loss = kayak.L2Loss(kyk_out, kyk_targets)

# Sum the losses.
kyk_loss = kayak.MatSum(kyk_el_loss)

for ii in xrange(100):

    for batch in kyk_batcher:
        loss = kyk_loss.value
        # Prints the minibatch loss and the squared distance to true_W.
        print loss, np.sum((kyk_W.value - true_W)**2)
        # Plain gradient step with step size `learn`.
        grad = kyk_loss.grad(kyk_W)
        kyk_W.value -= learn * grad
def initial_latent_trace(body, inpt, voltage, t):
    """Fit a latent trace by gradient descent and return its logistic transform.

    Builds a kayak graph that predicts a current from per-channel terms plus
    an input current, then minimises squared prediction error plus two
    penalty terms with respect to the latent trace only.

    Parameters, as used by the code:
      body    -- object providing `C` and `children`; each child provides
                 `g`, `D` and `E`.
      inpt    -- array; `inpt.T[:, :T]` is added to the predicted current.
      voltage -- 1-D sequence; its first difference times `body.C` is the
                 target current.
      t       -- not used in this function.

    Returns `sigmoid.value`, the logistic of the fitted latent trace.

    NOTE(review): the hard-coded 3x3 selector matrices, the (9, ) reshape and
    the [[1, 1, 1]] summing row only line up when D == 3 and there are three
    rows of channels -- confirm against the callers.
    """
    # Target current from the discrete derivative of the voltage.
    I_true = np.diff(voltage) * body.C
    T = I_true.shape[0]
    # Diagonal matrix of the children's `g` values.
    gs = np.diag([c.g for c in body.children])
    D = int(sum([c.D for c in body.children]))

    # One copy of the voltage per child, truncated to T samples.
    driving_voltage = np.dot(np.ones((len(body.children), 1)), np.array([voltage]))[:, :T]

    # NOTE(review): child_i is never advanced, so every row is offset by the
    # first child's `E`; the loop also runs over D rows, while the array has
    # len(body.children) rows -- confirm this is intended.
    child_i = 0
    for i in range(D):
        driving_voltage[i, :] = voltage[:T] - body.children[child_i].E

    # K[i, j] = exp(-(i - j)**2 / 2), built from the index distance |i - j|.
    # NOTE(review): K is an integer array at the division, so under Python 2
    # without `from __future__ import division` `K / 2` floors -- confirm.
    K = np.array([[max(i - j, 0) for i in range(T)] for j in range(T)])
    K = K.T + K
    K = -1 * (K**2)
    K = np.exp(K / 2)
    # Cholesky factor of K with a 1e-7 diagonal jitter; Linv solves against
    # the transposed factor, i.e. Linv = inv(L.T).
    L = np.linalg.cholesky(K + (1e-7) * np.eye(K.shape[0]))
    Linv = scipy.linalg.solve_triangular(L.transpose(), np.identity(K.shape[0]))

    # Optimisation settings.
    N = 1
    batch_size = 5000
    learn = .0000001
    runs = 10000

    batcher = kayak.Batcher(batch_size, N)

    inputs = kayak.Parameter(driving_voltage)
    targets = kayak.Targets(np.array([I_true]), batcher)

    g_params = kayak.Parameter(gs)
    I_input = kayak.Parameter(inpt.T[:, :T])
    # NOTE(review): with Linv = inv(L.T), this product is inv(L.T L); the
    # inverse of L L.T would be np.dot(Linv, Linv.transpose()) -- confirm.
    Kinv = kayak.Parameter(np.dot(Linv.transpose(), Linv))

    # Random starting point for the latent trace; only this is optimised.
    initial_latent = np.random.randn(D, T)
    latent_trace = kayak.Parameter(initial_latent)
    sigmoid = kayak.Logistic(latent_trace)

    # Row 0 of `quadratic` is sigmoid row 0 times sigmoid row 1; other rows are 0.
    quadratic = kayak.ElemMult(
        sigmoid,
        kayak.MatMult(
            kayak.Parameter(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 0]])),
            sigmoid))
    # Moves that product from row 0 to row 1.
    three_quadratic = kayak.MatMult(
        kayak.Parameter(np.array([[0, 0, 0], [1, 0, 0], [0, 0, 0]])),
        quadratic)
    # Keeps only row 2 of the sigmoid.
    linear = kayak.MatMult(
        kayak.Parameter(np.array([[0, 0, 0], [0, 0, 0], [0, 0, 1]])),
        sigmoid)

    # A (3, T) array of ones added to every open fraction.
    leak_open = kayak.Parameter(np.vstack((np.ones((1, T)), np.ones((2, T)))))
    open_fractions = kayak.ElemAdd(leak_open, kayak.ElemAdd(three_quadratic, linear))

    # Per-row current: (g * driving voltage) times the open fraction.
    I_channels = kayak.ElemMult(kayak.MatMult(g_params, inputs), open_fractions)
    # Sum the three rows into a single current.
    I_ionic = kayak.MatMult(kayak.Parameter(np.array([[1, 1, 1]])), I_channels)

    predicted = kayak.MatAdd(I_ionic, I_input)

    # Squared prediction error, elementwise.
    nll = kayak.ElemPower(predicted - targets, 2)

    # hack_vec selects entries 0, 4 and 8 of the flattened 3x3 product, i.e.
    # its diagonal, so the middle term is trace(latent * Kinv * latent.T).
    hack_vec = kayak.Parameter(np.array([1, 0, 0, 0, 1, 0, 0, 0, 1]))
    kyk_loss = kayak.MatSum(nll) + kayak.MatMult(
        kayak.Reshape(
            kayak.MatMult(kayak.MatMult(latent_trace, Kinv), kayak.Transpose(latent_trace)),
            (9, )),
        hack_vec) + kayak.MatSum(kayak.ElemPower(I_channels, 2))

    # Seed the running gradient, then descend; each step adds half of the
    # previous step's direction to the fresh gradient.
    grad = kyk_loss.grad(latent_trace)
    for ii in xrange(runs):
        for batch in batcher:
            loss = kyk_loss.value
            # Every 100th run: iteration, loss and mean squared prediction error.
            if ii % 100 == 0:
                print ii, loss, np.sum(np.power(predicted.value - I_true, 2)) / T
            grad = kyk_loss.grad(latent_trace) + .5 * grad
            latent_trace.value -= learn * grad

    return sigmoid.value
def train(inputs, targets, batch_size, learn_rate, momentum, l1_weight,
          l2_weight, dropout, improvement_thresh):
    """Train a 10-output softmax regression with early stopping.

    Args:
        inputs: 2-D array; rows are examples, columns are features.
        targets: array handed to kayak.Targets alongside the inputs.
        batch_size: minibatch size given to the batcher.
        learn_rate: step size for both the weight and the bias updates.
        momentum: mixing factor for the running weight gradient.
        l1_weight: weight of the L1Norm penalty on the weight matrix.
        l2_weight: weight of the L2Norm penalty on the weight matrix.
        dropout: dropout setting passed to kayak.Dropout on the inputs.
        improvement_thresh: training aborts once more than this many epochs
            have passed since the epoch with the lowest total loss.

    Training runs for at most 100 epochs and stops early if the epoch loss
    becomes non-finite or stops improving.

    Returns:
        A function that takes a new input array, reinstates all dropped
        units and returns the value of the log-softmax output node.
    """
    # Inputs and targets share one batcher so they stay aligned.
    batcher = kayak.Batcher(batch_size, inputs.shape[0])
    X = kayak.Inputs(inputs, batcher)
    T = kayak.Targets(targets, batcher)

    # Dropout regularization on the inputs.
    H = kayak.Dropout(X, dropout)

    # Small random initial weights and biases.
    W = kayak.Parameter(0.1 * npr.randn(inputs.shape[1], 10))
    B = kayak.Parameter(0.1 * npr.randn(1, 10))

    # Inputs times weights, plus bias, then log-softmax.
    Y = kayak.LogSoftMax(kayak.ElemAdd(kayak.MatMult(H, W), B))

    # Summed LogMultinomialLoss plus the L2 and L1 penalties on W.
    loss = kayak.MatAdd(kayak.MatSum(kayak.LogMultinomialLoss(Y, T)),
                        kayak.L2Norm(W, l2_weight),
                        kayak.L1Norm(W, l1_weight))

    # Running (momentum) gradient for the weights, updated in place.
    mom_grad_W = np.zeros(W.shape)

    best_loss = np.inf
    best_epoch = -1

    for epoch in range(100):
        total_loss = 0.0

        # Iterating the batcher advances it to the next minibatch.
        for batch in batcher:
            # Fresh dropout mask for every minibatch.
            H.draw_new_mask()

            total_loss += loss.value

            # Both gradients are taken before either parameter is moved.
            grad_W = loss.grad(W)
            grad_B = loss.grad(B)

            mom_grad_W *= momentum
            mom_grad_W += (1.0 - momentum) * grad_W

            W.value -= learn_rate * mom_grad_W
            B.value -= learn_rate * grad_B

        print("Epoch: %d, total loss: %f" % (epoch, total_loss))

        if not np.isfinite(total_loss):
            print("Training diverged. Returning constraint violation.")
            break

        if total_loss < best_loss:
            # Record the new best loss as well as its epoch. Without this the
            # comparison is always against infinity, every finite epoch looks
            # like an improvement, and the abort below can never trigger.
            best_loss = total_loss
            best_epoch = epoch
        elif (epoch - best_epoch) > improvement_thresh:
            print("Has been %d epochs without improvement. Aborting."
                  % (epoch - best_epoch))
            break

    def predict(x):
        # Replace the graph's inputs, switch dropout off and run the graph
        # forward to the output node.
        X.value = x
        H.reinstate_units()
        return Y.value

    return predict
def kayak_mlp(X, y):
    """Kayak implementation of an MLP with one ReLU hidden layer and dropout.

    Reads batch_size, layer1_size, layer1_dropout, layer1_l2, iterations,
    momentum and learn_rate from the enclosing module.

    Args:
        X: 2-D array of inputs; rows are examples, columns are features.
        y: targets handed to kayak.Targets; the output layer has 9 units.

    The loss is the summed L2Loss between the softmax output and the
    targets, plus an L2Norm penalty on the first-layer weights. After each
    epoch "<epoch> <total loss>" is printed.

    Returns:
        A function that takes a new input array, puts the batcher in test
        mode and returns the value of the softmax output node.
    """
    # Create a batcher object.
    batcher = kayak.Batcher(batch_size, X.shape[0])

    # Unpacking also checks that X is two-dimensional. (A previously computed
    # `input_range` was never used, and its `1 / 2` exponent is 0 under
    # Python 2 integer division, so it has been dropped.)
    _, num_features = np.shape(X)

    # From here on X is the kayak input node rather than the raw array.
    X = kayak.Inputs(X, batcher)
    T = kayak.Targets(y, batcher)

    # ----------------------------- first hidden layer -------------------------------

    # Weights and biases start as standard-normal draws scaled by 0.1.
    weights_1 = kayak.Parameter(0.1 * np.random.randn(num_features, layer1_size))
    bias_1 = kayak.Parameter(0.1 * np.random.randn(1, layer1_size))

    # linear combination of weights and inputs
    hidden_1_input = kayak.ElemAdd(kayak.MatMult(X, weights_1), bias_1)

    # apply activation function to hidden layer
    hidden_1_activation = kayak.HardReLU(hidden_1_input)

    # apply a dropout for regularization
    hidden_1_out = kayak.Dropout(hidden_1_activation, layer1_dropout, batcher=batcher)

    # ----------------------------- output layer -----------------------------------

    weights_out = kayak.Parameter(0.1 * np.random.randn(layer1_size, 9))
    bias_out = kayak.Parameter(0.1 * np.random.randn(1, 9))

    # linear combination of the hidden layer output and the output weights
    out = kayak.ElemAdd(kayak.MatMult(hidden_1_out, weights_out), bias_out)

    # apply activation function to output
    yhat = kayak.SoftMax(out)

    # ----------------------------- loss function -----------------------------------

    loss = kayak.MatAdd(kayak.MatSum(kayak.L2Loss(yhat, T)),
                        kayak.L2Norm(weights_1, layer1_l2))

    # Momentum is kept for the weight matrices only; biases use the raw gradient.
    mom_grad_W1 = np.zeros(weights_1.shape)
    mom_grad_W2 = np.zeros(weights_out.shape)

    # One row per epoch: (epoch index, total loss).
    plot_loss = np.ones((iterations, 2))

    for epoch in xrange(iterations):
        # Track the total loss.
        total_loss = 0.0

        # Iterating the batcher advances it to the next minibatch.
        for batch in batcher:
            total_loss += loss.value

            # Every gradient is taken before any parameter is moved.
            grad_W1 = loss.grad(weights_1)
            grad_B1 = loss.grad(bias_1)
            grad_W2 = loss.grad(weights_out)
            grad_B2 = loss.grad(bias_out)

            # Use momentum on the weight gradients.
            mom_grad_W1 = momentum * mom_grad_W1 + (1.0 - momentum) * grad_W1
            mom_grad_W2 = momentum * mom_grad_W2 + (1.0 - momentum) * grad_W2

            # Now make the actual parameter updates.
            weights_1.value -= learn_rate * mom_grad_W1
            bias_1.value -= learn_rate * grad_B1
            weights_out.value -= learn_rate * mom_grad_W2
            bias_out.value -= learn_rate * grad_B2

        # save values into table to print learning curve at the end of training
        plot_loss[epoch, 0] = epoch
        plot_loss[epoch, 1] = total_loss
        print("%s %s" % (epoch, total_loss))

    #pyplot.plot(plot_loss[:,0], plot_loss[:,1], linewidth=2.0)
    #pyplot.show()

    def compute_predictions(x):
        # Swap in the new data, switch the batcher to test mode and
        # evaluate the output node (not the loss).
        X.data = x
        batcher.test_mode()
        return yhat.value

    return compute_predictions