def add_last_layer(self, ini=Xavier(), acti=softmax()):
    n_in = self.dims[-1]
    n_out = self.classes
    layer = hidden_layer(n_in, n_out, ini, last_layer=True)
    # The activation function is softmax for the last layer
    layer.setActivation(softmax())
    # The last layer does not need Dropout
    layer.setDropout(drop=0)
    if self.optimizer is not None:
        layer.setOptimizer(self.optimizer.clone())
    self.layers.append(layer)
    print('LAST LAYER with initialization: {}, '.format(ini.name),
          'activation: {}'.format(acti.name))
def estimate_total(self, trainX, trainY, val_X, val_Y, l2, lambd):
    numData = trainY.shape[1]
    AL, _ = self.forward_propagation(trainX)
    # For softmax
    SAL = softmax(AL)
    cost = self.compute_cost(SAL, trainY, 'cross_entropy', l2, lambd)
    prediction = np.argmax(SAL, axis=0)
    solution = np.argmax(trainY, axis=0)
    right = np.sum(prediction == solution)
    train_accuracy = right / numData

    val_accuracy = None
    if val_X is not None and val_Y is not None:
        val_AL, _ = self.forward_propagation(val_X)
        # cost = self.compute_cost(val_AL, val_Y, 'cross_entropy')
        val_pred = np.argmax(val_AL, axis=0)
        val_sol = np.argmax(val_Y, axis=0)
        val_right = np.sum(val_pred == val_sol)
        val_accuracy = val_right / val_Y.shape[1]

    return train_accuracy, val_accuracy, cost
def test_softmax(self):
    self.assertEqual(
        list(a.softmax(list(range(1, 6)))),
        [0.011656230956039607, 0.03168492079612427, 0.0861285444362687,
         0.23412165725273662, 0.6364086465588308])
def test_softmax():
    """Test softmax activation function"""
    x = np.array([[0, 1, 3],
                  [-1, 0, -5],
                  [1, 0, 3],
                  [10, -9, -7]])
    y = np.array([[0.04201, 0.11420, 0.84379],
                  [0.26762, 0.72747, 0.00490],
                  [0.11420, 0.04201, 0.84379],
                  [1, 0, 0]])
    assert np.allclose(softmax(x), y, atol=0.00001)
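The expected values above correspond to a row-wise softmax with the usual max-subtraction trick for numerical stability (note the row [10, -9, -7] mapping to [1, 0, 0]). A minimal sketch of an implementation that would satisfy this test; the name `softmax` and the `axis=-1` convention are assumptions, not taken from the tested module:

import numpy as np

def softmax(x, axis=-1):
    # Subtract the per-row maximum before exponentiating so that
    # large logits do not overflow.
    x = np.asarray(x, dtype=float)
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)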
def predict(self, x):
    W1, W2 = self.params['W1'], self.params['W2']
    b1, b2 = self.params['b1'], self.params['b2']

    a1 = np.dot(x, W1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    y = softmax(a2)

    return y
def feedforward(self, inputs):
    '''
    Passes inputs forward through the neural network and returns an array
    of probabilities.
    Input Layer => Hidden Layer => Softmax Layer
    '''
    # feedforward inputs through hidden layer
    hidden_outputs = [node.feedforward(inputs) for node in self.hidden]
    # feedforward hidden layer outputs through softmax layer
    return softmax([node.linearsum(hidden_outputs) for node in self.soft])
def train(train_x, train_y, learning_rate=0.2):
    # Flatten input (batch_size, 28, 28) -> (batch_size, 784)
    x = train_x.reshape(train_x.shape[0], -1)
    # Turn labels into their one-hot representations
    y = one_hot_encoder(train_y)

    # Initialize weights
    w1, b1 = initialize_weight((784, 256), bias=True)
    w2, b2 = initialize_weight((256, 10), bias=True)

    num_epochs = 50
    loss_history = []
    for epoch in range(1, num_epochs + 1):
        print("Epoch {}/{}\n===============".format(epoch, num_epochs))

        # Forward Prop
        h1 = np.dot(x, w1) + b1
        a1 = sigmoid(h1)
        h2 = np.dot(a1, w2) + b2
        a2 = softmax(h2)
        out = a2

        # Cross Entropy Loss
        loss = cross_entropy_loss(out, train_y)
        loss_history.append(loss)
        print("Loss: {:.6f}".format(loss))

        # Compute and print accuracy
        pred = np.argmax(out, axis=1)
        pred = pred.reshape(pred.shape[0], 1)
        acc = np.mean(pred == train_y)
        print("Accuracy: {:.2f}%\n".format(acc * 100))

        # Backward Prop
        m = out.shape[0]
        dh2 = a2 - y
        dw2 = (1 / m) * np.dot(a1.T, dh2)
        db2 = (1 / m) * np.sum(dh2, axis=0, keepdims=True)
        dh1 = np.dot(dh2, w2.T) * sigmoid_prime(a1)
        dw1 = (1 / m) * np.dot(x.T, dh1)
        db1 = (1 / m) * np.sum(dh1, axis=0, keepdims=True)

        # Weight (and bias) update
        w1 -= learning_rate * dw1
        b1 -= learning_rate * db1
        w2 -= learning_rate * dw2
        b2 -= learning_rate * db2

    return w1, b1, w2, b2, loss_history
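The training loop above relies on two helpers that are not shown. A minimal sketch of what `one_hot_encoder` and `initialize_weight` might look like, consistent with the shapes used above; these are assumptions, not the project's actual implementations:

import numpy as np

def one_hot_encoder(labels, num_classes=10):
    # labels: (batch_size, 1) integer class ids -> (batch_size, num_classes)
    labels = labels.reshape(-1).astype(int)
    one_hot = np.zeros((labels.shape[0], num_classes))
    one_hot[np.arange(labels.shape[0]), labels] = 1
    return one_hot

def initialize_weight(shape, bias=False):
    # Small random weights; the bias row is returned separately.
    w = np.random.randn(*shape) * 0.01
    if bias:
        b = np.zeros((1, shape[1]))
        return w, b
    return w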
def hessian(self, x, t):
    k = t.shape[1]
    n = t.shape[0]
    d = x.shape[1]
    w = np.reshape(self.w, (x.shape[1], -1), 'F')
    y = softmax(np.dot(x, w))
    h = np.zeros([d * k, d * k])
    for i in range(k):
        for j in range(k):
            h[i * d:(i + 1) * d, j * d:(j + 1) * d] = np.dot(
                np.transpose(x) * (y[:, i] * ((i == j) - y[:, j])), x)
    return h
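For reference, the (i, j) block assembled in the double loop above matches the standard Hessian of the multinomial logistic (softmax) negative log-likelihood. Writing \(y_{ni}\) for the softmax output of sample \(n\) and class \(i\), and \(\delta_{ij}\) for the Kronecker delta:

\[
H_{ij} \;=\; \sum_{n=1}^{N} y_{ni}\,(\delta_{ij} - y_{nj})\, x_n x_n^{\top}
\;=\; X^{\top}\,\mathrm{diag}\!\big(y_{:,i} \odot (\delta_{ij} - y_{:,j})\big)\, X ,
\]

and the full Hessian is the \(k \times k\) block matrix of these \(d \times d\) blocks, which is what the code fills into `h`.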
def __init__(self, input, n_in, n_out, W=None, b=None):
    if W is None:
        W = theano.shared(
            np.random.randn(n_out, n_in).astype(dtype=theano.config.floatX)
            / np.sqrt(n_in))
    if b is None:
        b = theano.shared(
            np.random.randn(n_out).astype(dtype=theano.config.floatX))
    self.W = W
    self.b = b
    self.v_W = theano.shared(
        np.zeros((n_out, n_in)).astype(dtype=theano.config.floatX))
    self.v_b = theano.shared(
        np.zeros(n_out).astype(dtype=theano.config.floatX))

    self.y = a.softmax(T.dot(W, input) + b.dimshuffle(0, 'x'))
    self.y_pred = T.argmax(self.y, axis=0)

    self.params = [self.W, self.b]
    self.velo = [self.v_W, self.v_b]
    self.input = input
def feedForward(self, inputs):
    if len(inputs) != self.input - 1:
        raise ValueError('Wrong number of inputs')

    # input activations
    self.ai = np.append(inputs, [1])  # add bias node

    # hidden activations
    self.ah = sigmoid(self.ai.dot(self.wi))
    # self.ah = relu(self.ai.dot(self.wi))

    # output activations
    self.ao = sigmoid(self.ah.dot(self.wo))
    # self.ao = relu(self.ah.dot(self.wo))

    return softmax(self.ao)
def __init__(self, input_list, n_in, n_out, n_total, mask, batch,
             W=None, b=None, M=None):
    w = np.zeros((n_in, n_out))
    np.fill_diagonal(w, 1)
    if W is None:
        # W = theano.shared(np.random.randn(n_in, n_out).astype(dtype=theano.config.floatX)/np.sqrt(n_in))
        W = theano.shared(w.astype(dtype=theano.config.floatX) / np.sqrt(n_in))
    if b is None:
        b = theano.shared(np.zeros(n_out).astype(dtype=theano.config.floatX))
    if M is None:
        M = theano.shared(
            0.5 * np.ones((n_total, 2)).astype(dtype=theano.config.floatX))
    self.W = W
    self.b = b
    self.M = M
    self.v_W = theano.shared(
        np.zeros((n_in, n_out)).astype(dtype=theano.config.floatX))
    self.v_b = theano.shared(
        np.zeros(n_out).astype(dtype=theano.config.floatX))
    self.v_M = theano.shared(
        np.zeros((n_total, 2)).astype(dtype=theano.config.floatX))

    self.input_list = input_list
    self.input_list[0] = self.input_list[0]
    self.input_list[1] = (self.input_list[1])[::-1]

    '''
    def Merge(input_seq1, input_seq2, merger):
        return T.dot((input_seq1 * merger[0] + input_seq2 * merger[1]), self.W) + self.b

    self.temp_y = a.softmax((theano.scan(Merge,
                                         sequences=[self.input_list[0], self.input_list[1], self.M],
                                         outputs_info=None))[0])
    '''

    def Merge(input_seq1, input_seq2):
        return T.dot((input_seq1 * 1 + input_seq2 * 0), self.W) + self.b

    self.temp_y = a.softmax(
        (theano.scan(Merge,
                     sequences=[self.input_list[0], self.input_list[1]],
                     outputs_info=None))[0])
    self.temp_y = self.temp_y.dimshuffle(1, 0, 2)
    self.mask = mask
    self.batch = batch

    y_pred_list = []
    for i in range(self.batch):
        y_pred_list.append(
            T.set_subtensor(T.argmax(self.temp_y[i], axis=1)[self.mask[i]:], 0))
    self.y_pred = T.stacklists(y_pred_list)

    self.params = [self.W, self.b, self.M]
    self.velo = [self.v_W, self.v_b, self.v_M]
def visualization(test_x, test_y, w1, b1, w2, b2):
    x = test_x[:20]
    x = x.reshape(x.shape[0], -1)
    y = test_y[:20]

    # Forward Pass
    h1 = np.dot(x, w1) + b1
    a1 = sigmoid(h1)
    h2 = np.dot(a1, w2) + b2
    a2 = softmax(h2)
    out = a2

    pred = np.argmax(out, axis=1)

    fig = plt.figure(figsize=(25, 4))
    for index in np.arange(20):
        # use integer division so add_subplot receives ints
        ax = fig.add_subplot(2, 20 // 2, index + 1, xticks=[], yticks=[])
        ax.imshow(test_x[index], cmap='gray')
        ax.set_title("{} ({})".format(str(pred[index]), str(y[index][0])),
                     color=("green" if pred[index] == y[index] else "red"))
def batch_train(self, data, label):
    """
    Batch Training

    X[i, j, k]: input for layer i
        size of X: nLayer x nNeuron_i x nSample
    D[i, :]: delta for layer i (except input layer)
        size of D: nLayer-1 x nNeuron_i x nSample
    Fd[i, :]: derivative of activation of layer i (except input layer and output layer)
        size of Fd: nLayer-2 x nNeuron_i x nSample
    """
    n_samples = data.shape[1]

    # Add bias unit to input layer
    bias = np.ones((1, n_samples))
    X = [np.concatenate((data, bias), axis=0)]
    Fd = []

    # Forward
    for i in range(self.nLayer - 2):
        si = np.dot(self.W[i].T, X[i])
        xi = self.activation(si)
        xi_deriv = self.activation(si, 1)
        xi = np.concatenate((xi, bias), axis=0)
        X.append(xi)
        Fd.append(xi_deriv)
    so = np.dot(self.W[-1].T, X[-1])
    xo = act.softmax(so)

    # Backpropagation
    o_delta = xo
    o_delta[label, np.arange(n_samples)] -= 1
    D = [o_delta]
    for i in range(self.nLayer - 3, -1, -1):
        delta = self.W[i + 1][0:-1, :].dot(D[-1])
        delta = np.multiply(delta, Fd[i])
        D.append(delta)
    D.reverse()

    # Update weight
    for i in range(self.nLayer - 1):
        self.W[i] += (-self.lr * X[i].dot(D[i].T))

    # Release memory
    D.clear()
    Fd.clear()
    X.clear()
def single_layer_fp(X, W, b, activation="sigmoid"):
    l = []
    for i in range(0, X.shape[1]):
        l.append(1)
    A = np.dot(W, X) + np.outer(b, np.array(l))

    if activation == "linear":
        S = act_fun.linear(A)
    elif activation == "sigmoid":
        S = act_fun.sigmoid(beta, A)
    elif activation == "tanh":
        S = act_fun.tanh(beta, A)
    elif activation == "relu":
        S = act_fun.relu(A)
    elif activation == "softplus":
        S = act_fun.softplus(A)
    elif activation == "elu":
        S = act_fun.elu(delta, A)
    elif activation == "softmax":
        S = act_fun.softmax(A)
    else:
        # fail loudly instead of returning an undefined S
        raise ValueError("Activation function isn't supported")

    return (A, S)
def test_softmax(self):
    def _ref_softmax(values):
        """
        Taken from Keras' testing code:
        https://github.com/keras-team/keras/blob/ce5728bbd36004c7a17b86e69a8e59b21d6ee6d4/keras/activations_test.py
        """
        m = np.max(values)
        e = np.exp(values - m)
        return e / np.sum(e)

    rtol = 1e-3
    size = 10
    for _ in range(1000):
        x = np.random.uniform(low=-1., high=1., size=size).flatten()
        y_numpy = _ref_softmax(x)

        test_buffer = list_2_swig_float_pointer(x, size)
        y_nn4mc = activation.softmax(test_buffer.cast(), size)
        y_nn4mc = swig_py_object_2_list(y_nn4mc, size)

        y_nn4mc = np.round(y_nn4mc, decimals=5)
        y_numpy = np.round(y_numpy, decimals=5)
        assert np.allclose(y_nn4mc, y_numpy, rtol=rtol)

    print("softmax passed")
def test(test_x, test_y, w1, b1, w2, b2):
    # Flatten input (batch_size, 28, 28) -> (batch_size, 784)
    x = test_x.reshape(test_x.shape[0], -1)
    # Turn labels into their one-hot representations
    y = one_hot_encoder(test_y)

    # Forward Pass
    h1 = np.dot(x, w1) + b1
    a1 = sigmoid(h1)
    h2 = np.dot(a1, w2) + b2
    a2 = softmax(h2)
    out = a2

    # Cross Entropy Loss
    loss = cross_entropy_loss(out, test_y)
    print("Loss: {:.6f}".format(loss))

    # Compute and print accuracy
    pred = np.argmax(out, axis=1)
    pred = pred.reshape(pred.shape[0], 1)
    acc = np.mean(pred == test_y)
    print("Accuracy: {:.2f}%\n".format(acc * 100))
def train(self, images, labels):
    """
    Train method

    This method takes a set of images and labels, feeds them to the neural
    network, and backpropagates with its outputs. It breaks when it reaches
    its minimum error and returns the weights used to achieve that error.

    @param images | list | an array of images
    @param labels | list | an array of labels
    """
    labels = list(labels)
    if not isinstance(labels[0], list):
        for i, x in enumerate(labels):
            a = [0 for _ in range(10)]
            a[x] = 1
            labels[i] = a

    print("minimising images 2")
    # apply softmax to each image (a plain loop would discard the result)
    images = [softmax(image) for image in images]
    print("done")

    # n = 20
    while True:
        # if n > 0:
        #     n -= 1
        for i in range(len(images)):
            image, label = images[i], labels[i]
            # error = self.error
            outputs = self.execute(image, label)
            # if self.error > error and n == 0:
            #     print(f"Min error: {self.error}")
            #     return self.save_weights()
            self.backpropogate(outputs, label)
def predict(self, X, y=None):
    """Predict labels for X"""
    p_label = np.empty((0, 0))
    n_samples = X.shape[1]
    n_batch = np.ceil(n_samples / self.batchSize)
    loss = 0
    for i in range(np.uint16(n_batch)):
        end_batch = (n_samples if (i + 1) * self.batchSize >= n_samples
                     else (i + 1) * self.batchSize)
        cur_batch = end_batch - i * self.batchSize
        Xb = X[:, i * self.batchSize:end_batch]

        # Add bias
        bias = np.ones((1, cur_batch))
        Xb = np.concatenate((Xb, bias), axis=0)

        # Forward
        for k in range(self.nLayer - 2):
            sk = np.dot(self.W[k].T, Xb)
            Xb = self.activation(sk)
            Xb = np.concatenate((Xb, bias), axis=0)
        so = np.dot(self.W[-1].T, Xb)
        Xb = act.softmax(so)

        if y is not None:
            loss += self.loss_function(Xb, y[i * self.batchSize:end_batch])
        p_label = np.append(p_label, np.argmax(Xb, axis=0))

    loss = loss / n_samples
    if y is not None:
        cnt = np.sum(y == p_label)
        correct = (cnt * 10000 // len(y)) / 100
        print("Loss, Correct: ", loss, correct)
        return loss, correct
    else:
        print("Label: ", p_label)
        return None, None
def forward(self, x, t):
    self.t = t
    self.y = softmax(x)
    self.loss = cross_entropy_error(self.y, self.t)
    return self.loss
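This forward pass stores `self.y` and `self.t`, which is exactly what a combined softmax-plus-cross-entropy layer needs for its backward pass: for one-hot targets the gradient with respect to the logits simplifies to (y - t) averaged over the batch. A minimal sketch of such a `backward` method, an assumption about this class rather than code taken from it:

def backward(self, dout=1):
    # Gradient of the cross-entropy loss w.r.t. the softmax inputs.
    # For one-hot targets this is simply (y - t), averaged over the batch.
    batch_size = self.t.shape[0]
    dx = (self.y - self.t) * dout / batch_size
    return dx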
def train_model(self):
    epoch = config['TRAIN']['epoch']
    batch_size = config['TRAIN']['batch_size']
    train_data_ratio = config['TRAIN']['train_data_ratio']
    validation_data_ratio = config['TRAIN']['validation_data_ratio']
    learning_rate = config['TRAIN']['learning_rate']
    optimizer = config['TRAIN']['optimizer']
    l2 = config['TRAIN']['L2']
    lambd = config['TRAIN']['lambd']

    train_acc_list = []
    val_acc_list = []
    cost_list = []
    val_cost_list = []

    numTrain = int(self.trainX.shape[1] * train_data_ratio)
    numVal = int(self.trainX.shape[1] - numTrain)
    trainX = self.trainX[:, 0:numTrain]
    trainY = self.trainY[:, 0:numTrain]
    val_X = self.trainX[:, numTrain:]
    val_Y = self.trainY[:, numTrain:]
    numBatch = numTrain // batch_size

    print("Number of Training Data: " + str(trainX.shape[1]))
    print("Number of Validation Data: " + str(val_X.shape[1]))

    if l2 == "true":
        l2 = True
    else:
        l2 = False

    for i in range(epoch):
        for j in range(numBatch):
            batch_X = trainX[:, j * batch_size:(j + 1) * batch_size]
            batch_Y = trainY[:, j * batch_size:(j + 1) * batch_size]

            AL, caches = self.forward_propagation(batch_X)
            # For softmax
            SAL = softmax(AL)
            cost = self.compute_cost(AL, batch_Y, 'cross_entropy', l2, lambd)
            print('Epoch - ' + str(i) + ' Mini-batch - ' + str(j) + ' cost ' + str(cost))

            grads = self.backward_propagation(SAL, batch_Y, caches, l2, lambd)
            self.update_parameters(self.parameters, grads, learning_rate, optimizer)

            train_acc, val_acc, val_cost = self.estimate(
                AL, batch_Y, val_X, val_Y, l2, lambd)
            train_acc_list.append(train_acc)
            val_acc_list.append(val_acc)
            cost_list.append(cost)
            val_cost_list.append(val_cost)
            print('train_accuracy: ' + str(train_acc))
            if val_acc is not None:
                print('val_accuracy: ' + str(val_acc))

        # Last batch
        if numTrain % batch_size != 0:
            batch_X = trainX[:, numBatch * batch_size:]
            batch_Y = trainY[:, numBatch * batch_size:]
            # print (batch_X.shape)

            AL, caches = self.forward_propagation(batch_X)
            # For softmax
            SAL = softmax(AL)
            cost = self.compute_cost(AL, batch_Y, 'cross_entropy', l2, lambd)
            print('Epoch - ' + str(i) + ' Mini-batch - ' + str(j) + ' cost ' + str(cost))

            grads = self.backward_propagation(SAL, batch_Y, caches, l2, lambd)
            self.update_parameters(self.parameters, grads, learning_rate, optimizer)

            train_acc, val_acc, val_cost = self.estimate(
                AL, batch_Y, val_X, val_Y, l2, lambd)
            train_acc_list.append(train_acc)
            val_acc_list.append(val_acc)
            cost_list.append(cost)
            val_cost_list.append(val_cost)
            print('train_accuracy: ' + str(train_acc))
            if val_acc is not None:
                print('val_accuracy: ' + str(val_acc))

        if i % 1 == 0:
            """
            print (AL[:, 0])
            print (SAL[:, 0])
            print (batch_Y[:, 0])
            """
            # print (self.parameters["W2"])
            # train_acc, val_acc, cost = self.estimate_total(trainX, trainY, val_X, val_Y, l2, lambd)
            # train_acc_list.append(train_acc)
            # val_acc_list.append(val_acc)
            # cost_list.append(cost)

        if i % 10 == 0:
            pass
            """
            print ('Epoch: ' + str(i) + '-' + ' cost ' + str(cost))
            print ('train_accuracy: ' + str(train_acc))
            if val_acc is not None:
                print ('val_accuracy: ' + str(val_acc))
            """

        # train_acc, val_acc = self.estimate(AL, batch_Y, val_X, val_Y)
        # print ('train_accuracy: ' + str(train_acc))
        # if val_acc is not None:
        #     print ('val_accuracy: ' + str(val_acc))
        # break

    # W1 = self.parameters["W1"]
    # print (W1.shape)
    # print (W1[0].shape)
    return train_acc_list, val_acc_list, cost_list, val_cost_list
def forward_softmax(self, tree):
    Z = np.dot(tree.p, self.Ws) + self.bs
    tree.softmax = act.softmax(Z)
def predict(nn, x):
    return softmax(forward(nn, x))
def gradient(self, node, grad):
    grad_A = (activation.softmax(node.parents[0]) + -1 * node.parents[1]) * grad
    grad_B = op.zeros_like(node.parents[1])
    return [grad_A, grad_B]
def forward_prop(self, X):
    # compare strings with ==, not identity
    if self.layer_type == "output":
        return softmax(np.dot(self.W.T, X))
    else:
        return relu(np.dot(self.W.T, X))
def main(argv):
    # load and pre-process the data
    X, Predict_data, Y = preprocessed.data_preprocess(parameter.input_data_path)
    print('| Total train data | structure: {}'.format(X.shape))
    print('| Train Data label | structure: {}'.format(Y.shape))
    print('| Total test Data  | structure: {}'.format(Predict_data.shape))

    # split data into train, validation and test
    train_x, train_y, vali_x, vali_y, test_x, test_y = preprocessed.train_vali_test_split(
        X, Y, parameter.train_rate, parameter.vali_rate, parameter.test_rate)
    print("_______________________________________")
    print('after split\ntrain data shape:\t{}'.format(train_x.shape))
    print('train data label:\t{}'.format(train_y.shape))
    if vali_x is None:
        print(" after data pre-process, validation is none")
    else:
        print('validation data shape:\t{}'.format(vali_x.shape))
    if test_x is None:
        print(" after data pre-process, test data is none")
    else:
        print('test data shape:\t{}'.format(test_x.shape))
    print("_______________________________________")

    # create the learning model
    # the model takes batch size, batch normalization, dropout rate,
    # weight decay and the choice of optimizer into account
    learn_model = model(train_x, train_y,
                        batch_size=get_batch_size(),
                        drop=get_dropout_rate(),
                        learning_rate=get_lr(),
                        regularizer=get_regularizer(),
                        norm=get_norm(),
                        optimizer=get_opt())
    # set validation data for the model
    learn_model.validation(vali_x, vali_y)

    # create neural layer 1
    learn_model.add_layer(parameter.num_hide1, ini=He(), acti=relu())
    # layer 2
    learn_model.add_layer(parameter.num_hide2, ini=He(), acti=relu())
    # layer 3
    learn_model.add_layer(parameter.num_hide3, ini=He(), acti=relu())
    # layer 4 (output)
    learn_model.add_last_layer(ini=Xavier(), acti=softmax())

    # start training
    x_rem = learn_model.fit(epoch=parameter.epoch,
                            learning_rate=parameter.learning_rate)
    # start testing
    learn_model.test(test_x, test_y)
    # plot result
    learn_model.plot(x_rem, True, True)

    # start prediction
    print("---------- finish predict, save to Predicted_labels.h5 ----------")
    predict = learn_model.predict(x=Predict_data).T
    predict = np.argmax(predict, axis=1)
    # print(predict)
    f = h5py.File(parameter.ouput_data_path + "/Predicted_labels.h5", 'a')
    f.create_dataset('/predict', data=predict, dtype=np.float32)
    f.close()
def forward_softmax(self, root):
    Z = np.dot(root.p, self.Ws) + self.bs
    A = act.softmax(Z)
    return A
import matplotlib.pyplot as plt
import numpy as np

import activation  # local module providing the activation functions


def drawLinePlot(start, end, activationFunction):
    inputs = []
    outputs = []
    for x in np.arange(start, end, 0.5):
        inputs.append(x)
        outputs.append(activationFunction(x))
    return inputs, outputs


x, y = drawLinePlot(-3, 3, activation.linear)
plt.plot(x, y, label='Linear')
x, y = drawLinePlot(-3, 3, activation.sigmoid)
plt.plot(x, y, label='Sigmoid')
x, y = drawLinePlot(-3, 3, activation.leakyRelu)
plt.plot(x, y, label='Leaky ReLU')
x, y = drawLinePlot(-3, 3, activation.tanh)
plt.plot(x, y, label='Tanh')
x, y = drawLinePlot(-3, 3, activation.relu)
plt.plot(x, y, label='ReLU')
x, y = drawLinePlot(-3, 3, activation.swish)
plt.plot(x, y, label='Swish')
plt.plot([-3, -2, -1, 0, 1, 2, 3],
         activation.softmax([-3, -2, -1, 0, 1, 2, 3]),
         label='Softmax')

plt.xlabel("Inputs")
plt.ylabel("Outputs")
plt.title("Activation Functions")
plt.legend()
plt.show()
def construct_RNN(n_input, n_output, n_hid_layers=2, archi=128, lr=1e-3,
                  acti_func='ReLU', update_by='RMSProp', dropout_rate=0.2,
                  batchsize=1, scale=0.033, scale_b=0.001, clip_thres=10.0,
                  seed=42):
    """
    Initialize and construct the bidirectional deep RNN with dropout.
    Update the RNN using minibatch and RMSProp.
    archi: number of neurons of each hidden layer
    """
    x_seq = T.fmatrix()
    y_hat = T.fmatrix()
    minibatch = T.scalar()
    stop_dropout = T.scalar()

    # choose the optimization function
    optimiz_func = {
        'sgd': sgd,
        'momentum': momentum,
        'NAG': NAG,
        'RMSProp': RMSProp,
    }
    update_func = optimiz_func[update_by]

    # initialize the RNN
    print('Start initializing RNN...')
    init = initialize_RNN(n_input, n_output, archi, n_hid_layers,
                          scale, scale_b, clip_thres)
    param_Ws, param_bs, auxis, caches, a_0, parameters = init

    # ############ bidirectional recurrent neural network ###############
    srng = RandomStreams(seed=seed)

    # #### Hidden layers ######
    for l in range(n_hid_layers):
        if l == 0:
            a_seq = x_seq
            z_seq = T.dot(a_seq, param_Ws[0][l])
            z_seq += param_bs[0][l].dimshuffle('x', 0)
            zf_seq = z_seq
            zb_seq = z_seq
        else:
            zf_seq = T.dot(a_seq, param_Ws[1][l - 1])
            zf_seq += param_bs[1][l - 1].dimshuffle('x', 0)
            zb_seq = T.dot(a_seq, param_Ws[2][l - 1])
            zb_seq += param_bs[2][l - 1].dimshuffle('x', 0)

        step = set_step(param_Ws[3], param_bs[3], l, acti_func)
        [af_seq, ab_seq], _ = th.scan(step, sequences=[zf_seq, zb_seq[::-1]],
                                      outputs_info=[a_0, a_0],
                                      truncate_gradient=-1)

        a_out = T.concatenate([af_seq, ab_seq[::-1]], axis=1)
        dropping = srng.binomial(size=T.shape(a_out), p=(1 - dropout_rate))
        a_seq = ifelse(T.lt(stop_dropout, 1.05),
                       (a_out * dropping).astype('float32'),
                       a_out)
        a_seq /= stop_dropout

    # #### End of Hidden layers ######
    y_pre = T.dot(a_seq, param_Ws[0][1]) + param_bs[0][1].dimshuffle('x', 0)
    y_seq = softmax(y_pre)
    forward = th.function(inputs=[x_seq, stop_dropout], outputs=y_seq)

    cost = T.sum((y_seq - y_hat)**2) + minibatch * 0
    valid = th.function(inputs=[x_seq, y_hat, minibatch, stop_dropout],
                        outputs=cost)
    grads = T.grad(cost, parameters, disconnected_inputs='ignore')

    # ############ end of construction ###############
    updates = update_func(parameters, grads, lr, minibatch, batchsize,
                          auxis, caches)
    rnn_train = th.function(inputs=[x_seq, y_hat, minibatch, stop_dropout],
                            outputs=cost, updates=updates)

    return forward, valid, rnn_train
def test_softmax(self):
    self.assertEqual(activation.softmax(softmax_inp_x).any(),
                     softmax_out_x.any())
def train(self, data, yTrues, learnRate=0.1, epochs=1000, checkRate=50):
    '''
    Uses backpropagation to calculate the partial derivatives (gradients) of
    the Loss with respect to each weight and bias, then uses Stochastic
    Gradient Descent (SGD) to adjust each weight and bias such that:
        w <= w - lr * dLdw
    where:
        w is a weight or bias,
        lr is the learn rate (typically 0.1),
        dLdw is the partial derivative of the Loss with respect to that
        weight or bias.

    Each epoch represents one run through the entire dataset, and after
    every 50 epochs (or however many is set by the check rate) the program
    will run a feedforward and print the epoch number, the Cross Entropy
    Loss and the Accuracy (percent of correct guesses).
    '''
    # a runthrough of the entire data set
    for epoch in range(epochs):
        # a runthrough of one row of data
        for x, yTrue in zip(data, yTrues):
            # execute a feedforward, storing linear sums
            # (results of linear function in neuron)
            hidden_outputs = []
            hidden_totals = []
            for node in self.hidden:
                hidden_totals.append(node.linearsum(x))
                hidden_outputs.append(node.feedforward(x))
            soft_totals = [node.linearsum(hidden_outputs) for node in self.soft]
            soft_outputs = softmax(soft_totals)

            # partial derivatives
            # partial L / partial softout for case c (yTrue)
            dL_dsoc = soft_outputs[yTrue] - 1

            # Update softmax layer
            j = 0  # counter for index of current neuron in self.soft
            for node in self.soft:
                # partial softout for case c / partial total
                dsoc_dt = d_softmax(yTrue, j, soft_totals)

                # Update weights
                for w in range(len(node.weights)):
                    # partial total / partial weight
                    dt_dw = hidden_outputs[w]
                    # partial loss / partial weight
                    partial_derivative = dL_dsoc * dsoc_dt * dt_dw
                    # SGD
                    newweight = node.weights[w] - learnRate * partial_derivative
                    # update weight
                    node.changeweight(newweight, w)

                # Update biases
                # partial loss / partial bias
                partial_derivative = dL_dsoc * dsoc_dt
                # SGD
                newbias = node.bias - learnRate * partial_derivative
                # update bias
                node.changebias(newbias)

                # increment counter
                j += 1

                # counter for index of current neuron in self.hidden
                h = 0
                for hnode in self.hidden:
                    # partial total (softmax layer) / partial h
                    dt_dh = node.weights[h]

                    for w in range(len(hnode.weights)):
                        # partial h / partial w
                        dh_dw = x[w] * activation_derivative(
                            self.hiddenActivation, hidden_totals[h])
                        # partial loss / partial w
                        partial_derivative = dL_dsoc * dsoc_dt * dt_dh * dh_dw
                        # SGD
                        newweight = hnode.weights[w] - (learnRate * partial_derivative)
                        # update weight
                        hnode.changeweight(newweight, w)

                    # partial h / partial bias
                    dh_db = activation_derivative(self.hiddenActivation,
                                                  hidden_totals[h])
                    # partial loss / partial bias
                    partial_derivative = dL_dsoc * dsoc_dt * dt_dh * dh_db
                    # SGD
                    newbias = hnode.bias - learnRate * partial_derivative
                    # update bias
                    hnode.changebias(newbias)

                    # increment counter
                    h += 1

        # Run a feedforward on the data and print an update to the console
        # with the epoch, avg. loss, accuracy
        if epoch % checkRate == 0:
            self.test(data, yTrues, True, epoch)
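The update above relies on a helper `d_softmax(c, j, totals)` for the derivative of the softmax output of the true class c with respect to the j-th linear total. A minimal sketch consistent with that call signature; this is an assumption, not the project's actual helper:

import math

def d_softmax(c, j, totals):
    # softmax outputs for the given linear totals (max-shifted for stability)
    m = max(totals)
    exps = [math.exp(t - m) for t in totals]
    s = sum(exps)
    probs = [e / s for e in exps]
    # d p_c / d t_j = p_c * (delta_cj - p_j)
    return probs[c] * ((1 if c == j else 0) - probs[j])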
def construct_LSTM(n_input, n_output, n_hid_layers=2, archi=36, lr=1e-3,
                   update_by='NAG', batchsize=1, scale=0.01, scale_b=0.001,
                   clip_thres=1.0):
    """
    Initialize and construct the bidirectional Long Short-term Memory (LSTM).
    Update the LSTM using minibatch and RMSProp.
    archi: number of neurons of each hidden layer
    """
    x_seq = T.fmatrix()
    y_hat = T.fmatrix()
    minibatch = T.scalar()

    # choose the optimization function
    optimiz_func = {
        'sgd': sgd,
        'momentum': momentum,
        'NAG': NAG,
        'RMSProp': RMSProp,
    }
    update_func = optimiz_func[update_by]

    # initialize the LSTM
    print('Start initializing LSTM...')
    init = initialize_LSTM(n_input, n_output, archi, n_hid_layers,
                           scale, scale_b, clip_thres)
    param_Ws, param_bs, auxis, caches, a_0, h_0, parameters = init

    # ############ bidirectional Long Short-term Memory ###############

    # #### Hidden layers ######
    for l in range(n_hid_layers):
        # computing gates
        if l == 0:
            a_seq = x_seq
            W, Wi, Wf, Wo = param_Ws[0][l][:-1]
            b, bi, bf, bo = param_bs[0][l]

            z_seq = T.dot(a_seq, W) + b.dimshuffle('x', 0)
            zi_seq = T.dot(a_seq, Wi) + bi.dimshuffle('x', 0)
            zf_seq = T.dot(a_seq, Wf) + bf.dimshuffle('x', 0)
            zo_seq = T.dot(a_seq, Wo) + bo.dimshuffle('x', 0)

            zf_seq, zif_seq, zff_seq, zof_seq = z_seq, zi_seq, zf_seq, zo_seq
            zb_seq, zib_seq, zfb_seq, zob_seq = z_seq, zi_seq, zf_seq, zo_seq
        else:
            # forward gates
            W_f, Wi_f, Wf_f, Wo_f = param_Ws[1][l - 1]
            b_f, bi_f, bf_f, bo_f = param_bs[1][l - 1]

            zf_seq = T.dot(a_seq, W_f) + b_f.dimshuffle('x', 0)
            zif_seq = T.dot(a_seq, Wi_f) + bi_f.dimshuffle('x', 0)
            zff_seq = T.dot(a_seq, Wf_f) + bf_f.dimshuffle('x', 0)
            zof_seq = T.dot(a_seq, Wo_f) + bo_f.dimshuffle('x', 0)

            # backward gates
            W_b, Wi_b, Wf_b, Wo_b = param_Ws[2][l - 1]
            b_b, bi_b, bf_b, bo_b = param_bs[2][l - 1]

            zb_seq = T.dot(a_seq, W_b) + b_b.dimshuffle('x', 0)
            zib_seq = T.dot(a_seq, Wi_b) + bi_b.dimshuffle('x', 0)
            zfb_seq = T.dot(a_seq, Wf_b) + bf_b.dimshuffle('x', 0)
            zob_seq = T.dot(a_seq, Wo_b) + bo_b.dimshuffle('x', 0)

        # computing cells
        step = set_step(param_Ws[3][l], param_Ws[4][l])

        # Forward direction
        seqs = [zf_seq, zif_seq, zff_seq, zof_seq]
        [cf_seq, hf_seq], _ = th.scan(step, sequences=seqs,
                                      outputs_info=[a_0, h_0],
                                      truncate_gradient=-1)

        # Backward direction
        seqs = [zb_seq[::-1], zib_seq[::-1], zfb_seq[::-1], zob_seq[::-1]]
        [cb_seq, hb_seq], _ = th.scan(step, sequences=seqs,
                                      outputs_info=[a_0, h_0],
                                      truncate_gradient=-1)

        a_seq = T.concatenate([hf_seq, hb_seq[::-1]], axis=1)

    # #### End of Hidden layers ######
    y_seq = softmax(T.dot(a_seq, param_Ws[0][0][-1]))
    forward = th.function(inputs=[x_seq], outputs=y_seq)

    cost = T.sum((y_seq - y_hat)**2) + minibatch * 0
    valid = th.function(inputs=[x_seq, y_hat, minibatch], outputs=cost)
    grads = T.grad(cost, parameters, disconnected_inputs='ignore')
    forward_grad = th.function([x_seq, y_hat, minibatch], grads)

    # ############ end of construction ###############
    updates = update_func(parameters, grads, lr, minibatch, batchsize,
                          auxis, caches)
    lstm_train = th.function(inputs=[x_seq, y_hat, minibatch],
                             outputs=cost, updates=updates)

    return forward, valid, lstm_train, forward_grad
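The per-timestep recurrence is hidden inside `set_step`, which is not shown here. The four pre-activation sequences computed above (block input z, input gate zi, forget gate zf, output gate zo) suggest the standard LSTM cell update, roughly:

\[
c_t = \sigma(z^{f}_t) \odot c_{t-1} + \sigma(z^{i}_t) \odot \tanh(z_t), \qquad
h_t = \sigma(z^{o}_t) \odot \tanh(c_t),
\]

where the recurrent contributions from `param_Ws[3][l]` and `param_Ws[4][l]` are presumably added to the pre-activations inside `set_step`; this is an inference from the surrounding code, not the project's documented equations.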
X_test = X_test.astype(np.float32) / 255

r = np.random.permutation(len(y_train))
X_train = X_train[r]
y_train = y_train[r]
X_dev = X_train[:12000]
y_dev = y_train[:12000]
X_train = X_train[10000:]
y_train = y_train[10000:]
LOG.info("finish data preprocessing.")

FCs = [
    FullyConnected(784, 256, opts.batch_size, relu()),
    FullyConnected(256, 128, opts.batch_size, relu()),
    FullyConnected(128, 64, opts.batch_size, relu()),
    FullyConnected(64, 10, opts.batch_size, softmax())
]
LOG.info("finish initialization.")

n_samples = len(y_train)
order = np.arange(n_samples)
best_precision, test_precision = 0, 0
for epochs in range(0, opts.epochs):
    np.random.shuffle(order)
    cost = 0.
    for batch_start in range(0, n_samples, opts.batch_size):
        batch_end = (batch_start + opts.batch_size
                     if batch_start + opts.batch_size < n_samples
                     else n_samples)
        batch_id = order[batch_start:batch_end]
        xs, ys = X_train[batch_id], y_train[batch_id]