def fit(self, X, Y, learning_rate=10e-6, reg=10e-7, epochs=10001, show_fig=False):
    N, D = X.shape
    K = len(set(Y))
    T = y_hot_encoding(Y)
    W1, b1 = init_weight_and_bias(D, self.M_1)
    W2, b2 = init_weight_and_bias(self.M_1, self.M_2)
    W3, b3 = init_weight_and_bias(self.M_2, self.M_3)
    W4, b4 = init_weight_and_bias(self.M_3, self.M_4)
    W5, b5 = init_weight_and_bias(self.M_4, K)
    self.weights = [W1, W2, W3, W4, W5]
    self.biases = [b1, b2, b3, b4, b5]

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation and cost calculation
        pY, Z_4, Z_3, Z_2, Z_1 = self.forward(X)
        Z_4_deriv = self.nonlinear(self.layers[-1], Z=Z_4)[1]
        Z_3_deriv = self.nonlinear(self.layers[-1], Z=Z_3)[1]
        Z_2_deriv = self.nonlinear(self.layers[-2], Z=Z_2)[1]
        Z_1_deriv = self.nonlinear(self.layers[-3], Z=Z_1)[1]

        # gradient descent step, backpropagating layer by layer
        pY_T = pY - T
        self.weights[-1] -= learning_rate * (Z_4.T.dot(pY_T) + reg * self.weights[-1])
        self.biases[-1] -= learning_rate * (pY_T.sum(axis=0) + reg * self.biases[-1])

        dZ_4 = pY_T.dot(self.weights[-1].T) * Z_4_deriv
        self.weights[-2] -= learning_rate * (Z_3.T.dot(dZ_4) + reg * self.weights[-2])
        self.biases[-2] -= learning_rate * (dZ_4.sum(axis=0) + reg * self.biases[-2])

        dZ_3 = dZ_4.dot(self.weights[-2].T) * Z_3_deriv
        self.weights[-3] -= learning_rate * (Z_2.T.dot(dZ_3) + reg * self.weights[-3])
        self.biases[-3] -= learning_rate * (dZ_3.sum(axis=0) + reg * self.biases[-3])

        dZ_2 = dZ_3.dot(self.weights[-3].T) * Z_2_deriv
        self.weights[-4] -= learning_rate * (Z_1.T.dot(dZ_2) + reg * self.weights[-4])
        self.biases[-4] -= learning_rate * (dZ_2.sum(axis=0) + reg * self.biases[-4])

        dZ_1 = dZ_2.dot(self.weights[-4].T) * Z_1_deriv
        self.weights[-5] -= learning_rate * (X.T.dot(dZ_1) + reg * self.weights[-5])
        self.biases[-5] -= learning_rate * (dZ_1.sum(axis=0) + reg * self.biases[-5])

        if i % 4000 == 0:
            pYvalid, _, __, ___, ____ = self.forward(X)
            c = cost(T, pYvalid)
            costs.append(c)
            # error_rate expects flat integer labels, so compare against Y rather than the one-hot T
            e = error_rate(Y, np.argmax(pYvalid, axis=1))
            print("i:", i, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best_validation_error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
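# The fit() methods in this section rely on a few helpers that are not shown here
# (init_weight_and_bias, y_hot_encoding, cost, error_rate). The definitions below are
# a minimal sketch of what they are assumed to look like, not the project's actual
# implementations.
import numpy as np


def init_weight_and_bias(M1, M2):
    # small random weights scaled by fan-in + fan-out, zero biases
    W = np.random.randn(M1, M2) / np.sqrt(M1 + M2)
    b = np.zeros(M2)
    return W.astype(np.float32), b.astype(np.float32)


def y_hot_encoding(Y):
    # indicator (one-hot) matrix of shape (N, K) built from integer labels
    N, K = len(Y), len(set(Y))
    T = np.zeros((N, K))
    T[np.arange(N), Y] = 1
    return T


def cost(T, pY):
    # multiclass cross-entropy between one-hot targets T and predicted probabilities pY
    return -(T * np.log(pY)).sum()


def error_rate(targets, predictions):
    # fraction of misclassified samples; expects flat integer labels on both sides
    return np.mean(targets != predictions)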
def __init__(self, M1, M2, an_id, nonlin_func):
    self.id = an_id
    self.M1 = M1
    self.M2 = M2
    # self.params contains W and b for this particular layer
    self.params = list(map(tf.Variable, init_weight_and_bias(M1, M2)))
    self.nonlin_func = nonlin_func
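# HiddenLayer only defines its parameters in __init__ above; a forward() method along
# these lines is assumed to accompany it (the actual method is not shown in this
# section). nonlin_func would be something like tf.nn.relu or tf.tanh.
def forward(self, X):
    # affine transform followed by this layer's nonlinearity
    W, b = self.params
    return self.nonlin_func(tf.matmul(X, W) + b)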
def fit(self, X, Y, learning_rate=5 * 10e-5, reg=10e-2, epochs=51, show_fig=False):
    N, D = X.shape
    K = len(set(Y))
    T = y_hot_encoding(Y)
    W1, b1 = init_weight_and_bias(D, self.M_1)
    W2, b2 = init_weight_and_bias(self.M_1, self.M_2)
    W3, b3 = init_weight_and_bias(self.M_2, self.M_3)
    W4, b4 = init_weight_and_bias(self.M_3, self.M_4)
    W5, b5 = init_weight_and_bias(self.M_4, K)
    self.weights = [W1, W2, W3, W4, W5]
    self.biases = [b1, b2, b3, b4, b5]

    batch_sz = 100
    n_batches = int(N / batch_sz)

    # RMSprop cache and momentum terms
    decay_rate = 0.999
    eps = 10e-10
    cache_W = [1, 1, 1, 1, 1]
    cache_b = [1, 1, 1, 1, 1]
    mu = 0.9
    dW = [0, 0, 0, 0, 0]
    db = [0, 0, 0, 0, 0]

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        for j in range(n_batches):
            Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
            Tbatch = T[j * batch_sz:(j * batch_sz + batch_sz)]

            # forward propagation and cost calculation
            pY, Z_4, Z_3, Z_2, Z_1 = self.forward(Xbatch)
            Z_4_deriv = self.nonlinear(self.layers[-1], Z=Z_4)[1]
            Z_3_deriv = self.nonlinear(self.layers[-1], Z=Z_3)[1]
            Z_2_deriv = self.nonlinear(self.layers[-2], Z=Z_2)[1]
            Z_1_deriv = self.nonlinear(self.layers[-3], Z=Z_1)[1]

            # gradient step (RMSprop scaling + momentum), layer by layer
            # learning_rate=5*10e-5, reg=10e-2, epochs=51
            pY_T = pY - Tbatch

            gW5 = Z_4.T.dot(pY_T) + reg * self.weights[-1]
            gb5 = pY_T.sum(axis=0) + reg * self.biases[-1]
            cache_W[-1] = decay_rate * cache_W[-1] + (1 - decay_rate) * gW5 * gW5
            cache_b[-1] = decay_rate * cache_b[-1] + (1 - decay_rate) * gb5 * gb5
            dW[-1] = mu * dW[-1] - (1 - mu) * learning_rate * gW5 / (np.sqrt(cache_W[-1]) + eps)
            db[-1] = mu * db[-1] - (1 - mu) * learning_rate * gb5 / (np.sqrt(cache_b[-1]) + eps)
            self.weights[-1] += dW[-1]
            self.biases[-1] += db[-1]

            dZ_4 = pY_T.dot(self.weights[-1].T) * Z_4_deriv
            gW4 = Z_3.T.dot(dZ_4) + reg * self.weights[-2]
            gb4 = dZ_4.sum(axis=0) + reg * self.biases[-2]
            cache_W[-2] = decay_rate * cache_W[-2] + (1 - decay_rate) * gW4 * gW4
            cache_b[-2] = decay_rate * cache_b[-2] + (1 - decay_rate) * gb4 * gb4
            dW[-2] = mu * dW[-2] - (1 - mu) * learning_rate * gW4 / (np.sqrt(cache_W[-2]) + eps)
            db[-2] = mu * db[-2] - (1 - mu) * learning_rate * gb4 / (np.sqrt(cache_b[-2]) + eps)
            self.weights[-2] += dW[-2]
            self.biases[-2] += db[-2]

            dZ_3 = dZ_4.dot(self.weights[-2].T) * Z_3_deriv
            gW3 = Z_2.T.dot(dZ_3) + reg * self.weights[-3]
            gb3 = dZ_3.sum(axis=0) + reg * self.biases[-3]
            cache_W[-3] = decay_rate * cache_W[-3] + (1 - decay_rate) * gW3 * gW3
            cache_b[-3] = decay_rate * cache_b[-3] + (1 - decay_rate) * gb3 * gb3
            dW[-3] = mu * dW[-3] - (1 - mu) * learning_rate * gW3 / (np.sqrt(cache_W[-3]) + eps)
            db[-3] = mu * db[-3] - (1 - mu) * learning_rate * gb3 / (np.sqrt(cache_b[-3]) + eps)
            self.weights[-3] += dW[-3]
            self.biases[-3] += db[-3]

            dZ_2 = dZ_3.dot(self.weights[-3].T) * Z_2_deriv
            gW2 = Z_1.T.dot(dZ_2) + reg * self.weights[-4]
            gb2 = dZ_2.sum(axis=0) + reg * self.biases[-4]
            cache_W[-4] = decay_rate * cache_W[-4] + (1 - decay_rate) * gW2 * gW2
            cache_b[-4] = decay_rate * cache_b[-4] + (1 - decay_rate) * gb2 * gb2
            dW[-4] = mu * dW[-4] - (1 - mu) * learning_rate * gW2 / (np.sqrt(cache_W[-4]) + eps)
            db[-4] = mu * db[-4] - (1 - mu) * learning_rate * gb2 / (np.sqrt(cache_b[-4]) + eps)
            self.weights[-4] += dW[-4]
            self.biases[-4] += db[-4]

            dZ_1 = dZ_2.dot(self.weights[-4].T) * Z_1_deriv
            gW1 = Xbatch.T.dot(dZ_1) + reg * self.weights[-5]
            gb1 = dZ_1.sum(axis=0) + reg * self.biases[-5]
            cache_W[-5] = decay_rate * cache_W[-5] + (1 - decay_rate) * gW1 * gW1
            cache_b[-5] = decay_rate * cache_b[-5] + (1 - decay_rate) * gb1 * gb1
            dW[-5] = mu * dW[-5] - (1 - mu) * learning_rate * gW1 / (np.sqrt(cache_W[-5]) + eps)
            db[-5] = mu * db[-5] - (1 - mu) * learning_rate * gb1 / (np.sqrt(cache_b[-5]) + eps)
            self.weights[-5] += dW[-5]
            self.biases[-5] += db[-5]

            # if j % 10 == 0:
            #     pYvalid, _, __, ___, ____ = self.forward(X)
            #     c = cost(T, pYvalid)
            #     costs.append(c)
            #     e = error_rate(Y, np.argmax(pYvalid, axis=1))
            #     print("i:", i, "cost:", c, "error:", e)
            #     if e < best_validation_error:
            #         best_validation_error = e
            #         print("best_validation_error:", best_validation_error)

        if i % 10 == 0:
            pYvalid, _, __, ___, ____ = self.forward(X)
            c = cost(T, pYvalid)
            costs.append(c)
            print("i:", i, "cost:", c)

    if show_fig:
        plt.plot(costs)
        plt.show()
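# Minimal single-parameter sketch of the update rule used in the fit() above:
# an RMSprop-style cache scales each step, and a momentum term smooths it.
# The function name and signature are illustrative only.
def rmsprop_momentum_step(w, g, cache, velocity,
                          learning_rate=5 * 10e-5, decay_rate=0.999, mu=0.9, eps=10e-10):
    # running average of squared gradients gives a per-parameter step size
    cache = decay_rate * cache + (1 - decay_rate) * g * g
    # velocity accumulates an exponentially weighted step direction
    velocity = mu * velocity - (1 - mu) * learning_rate * g / (np.sqrt(cache) + eps)
    w = w + velocity
    return w, cache, velocity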
def fit(self, X, Y, learning_rate=10e-7, mu=0.99, decay=0.999, reg=10e-3,
        epochs=400, batch_size=100, split=True, show_fig=False, print_every=20):
    self.epochs = epochs
    K = len(set(Y))
    X, Y = X.astype(np.float32).toarray(), y_hot_encoding(Y).astype(np.float32)
    X, Y = shuffle(X, Y)
    if split:
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        X, Y = X[:-1000], Y[:-1000]
    else:
        Xvalid, Yvalid = X, Y
    Yvalid_flat = np.argmax(Yvalid, axis=1)

    self.training = True

    # Clears the default graph stack and resets the global default graph.
    tf.reset_default_graph()

    # initialize hidden layers
    N, D = X.shape
    M1 = D
    self.hidden_layers = []
    for id in range(len(self.hidden_layer_sizes)):
        self.hidden_layers.append(
            HiddenLayer(M1, self.hidden_layer_sizes[id], id, self.nonlin_functions[id]))
        M1 = self.hidden_layer_sizes[id]
    self.params = list(map(tf.Variable, init_weight_and_bias(M1, K)))
    for h in self.hidden_layers:
        self.params += h.params

    tfX = tf.placeholder(tf.float32, shape=(None, D), name="tfX")
    tfT = tf.placeholder(tf.float32, shape=(None, K), name="tfT")
    logits = self.forward(tfX)

    rcost = reg * sum([tf.nn.l2_loss(p) for p in self.params])
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tfT)) + rcost
    # cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=tfT))  # + rcost
    prediction = self.predict(tfX)

    # other optimizers that were tried:
    # train_op = tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=mu).minimize(cost)
    # train_op = tf.train.MomentumOptimizer(learning_rate, momentum=mu, use_nesterov=False).minimize(cost)
    # train_op = tf.train.ProximalGradientDescentOptimizer(learning_rate, l2_regularization_strength=0.0, use_locking=False).minimize(cost)
    train_op = tf.train.AdamOptimizer(learning_rate, beta1=0.99, beta2=0.999).minimize(cost)

    n_batches = int(N / batch_size)
    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_size:(j * batch_size + batch_size)]
                Ybatch = Y[j * batch_size:(j * batch_size + batch_size)]
                session.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch})
                if j % print_every == 0:
                    costs.append(session.run(cost, feed_dict={tfX: Xvalid, tfT: Yvalid}))
                    p = session.run(prediction, feed_dict={tfX: Xvalid, tfT: Yvalid})
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", costs[-1],
                          "error_rate:", error_rate(Yvalid_flat, p))

        saver = tf.train.Saver()
        # Now, save the graph
        saver.save(session, './my_model-' + str(self.counter), global_step=self.epochs)
        print("Done!")

    if show_fig:
        plt.plot(costs)
        plt.show()
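# The fit() above saves a checkpoint with tf.train.Saver. A later session can be
# restored roughly like this; the checkpoint prefix passed in would follow the
# '<prefix>-<global_step>' pattern produced by saver.save above (e.g. './my_model-0-400'),
# and the helper name is illustrative only.
import tensorflow as tf


def restore_checkpoint(checkpoint_prefix):
    # rebuild the graph from the saved meta file, then restore variable values
    tf.reset_default_graph()
    saver = tf.train.import_meta_graph(checkpoint_prefix + '.meta')
    session = tf.Session()
    saver.restore(session, checkpoint_prefix)
    return session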
def fit(self, X, Y, batch_size, learning_rate=10e-6, reg=10e-7, epochs=10001, show_fig=False):
    N, D = X.shape
    K = len(set(Y))
    T = y_hot_encoding(Y)
    W1, b1 = init_weight_and_bias(D, self.M_1)
    W2, b2 = init_weight_and_bias(self.M_1, self.M_2)
    W3, b3 = init_weight_and_bias(self.M_2, self.M_3)
    W4, b4 = init_weight_and_bias(self.M_3, self.M_4)
    W5, b5 = init_weight_and_bias(self.M_4, K)
    self.weights = [W1, W2, W3, W4, W5]
    self.biases = [b1, b2, b3, b4, b5]

    batch_sz = batch_size
    n_batches = int(N / batch_sz)

    # momentum
    mu = 0.9
    dW = [0, 0, 0, 0, 0]
    db = [0, 0, 0, 0, 0]

    costs = []
    best_validation_error = 1
    self.training = True
    for i in range(epochs):
        for j in range(n_batches):
            Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
            Tbatch = T[j * batch_sz:(j * batch_sz + batch_sz)]

            # forward propagation and cost calculation;
            # U holds the dropout masks applied to the hidden layers
            pY, Z_4, Z_3, Z_2, Z_1, U = self.forward(Xbatch)
            Z_4_deriv = self.nonlinear(self.layers[-1], Z=Z_4)[1]
            Z_3_deriv = self.nonlinear(self.layers[-1], Z=Z_3)[1]
            Z_2_deriv = self.nonlinear(self.layers[-2], Z=Z_2)[1]
            Z_1_deriv = self.nonlinear(self.layers[-3], Z=Z_1)[1]

            # gradient descent step with momentum; each backpropagated delta is
            # multiplied by the corresponding layer's dropout mask
            pY_T = pY - Tbatch
            dW[-1] = mu * dW[-1] - (1 - mu) * learning_rate * (Z_4.T.dot(pY_T) + reg * self.weights[-1])
            db[-1] = mu * db[-1] - (1 - mu) * learning_rate * (pY_T.sum(axis=0) + reg * self.biases[-1])
            self.weights[-1] += dW[-1]
            self.biases[-1] += db[-1]

            dZ_4 = (pY_T.dot(self.weights[-1].T) * Z_4_deriv) * U[-1]
            dW[-2] = mu * dW[-2] - (1 - mu) * learning_rate * (Z_3.T.dot(dZ_4) + reg * self.weights[-2])
            db[-2] = mu * db[-2] - (1 - mu) * learning_rate * (dZ_4.sum(axis=0) + reg * self.biases[-2])
            self.weights[-2] += dW[-2]
            self.biases[-2] += db[-2]

            dZ_3 = ((pY_T.dot(self.weights[-1].T) * Z_4_deriv).dot(
                self.weights[-2].T) * Z_3_deriv) * U[-2]
            dW[-3] = mu * dW[-3] - (1 - mu) * learning_rate * (Z_2.T.dot(dZ_3) + reg * self.weights[-3])
            db[-3] = mu * db[-3] - (1 - mu) * learning_rate * (dZ_3.sum(axis=0) + reg * self.biases[-3])
            self.weights[-3] += dW[-3]
            self.biases[-3] += db[-3]

            dZ_2 = ((((pY_T.dot(self.weights[-1].T) * Z_4_deriv).dot(
                self.weights[-2].T) * Z_3_deriv).dot(
                self.weights[-3].T)) * Z_2_deriv) * U[-3]
            dW[-4] = mu * dW[-4] - (1 - mu) * learning_rate * (Z_1.T.dot(dZ_2) + reg * self.weights[-4])
            db[-4] = mu * db[-4] - (1 - mu) * learning_rate * (dZ_2.sum(axis=0) + reg * self.biases[-4])
            self.weights[-4] += dW[-4]
            self.biases[-4] += db[-4]

            dZ_1 = ((((((pY_T.dot(self.weights[-1].T) * Z_4_deriv).dot(
                self.weights[-2].T) * Z_3_deriv).dot(
                self.weights[-3].T)) * Z_2_deriv).dot(
                self.weights[-4].T)) * Z_1_deriv) * U[-4]
            dW[-5] = mu * dW[-5] - (1 - mu) * learning_rate * (Xbatch.T.dot(dZ_1) + reg * self.weights[-5])
            db[-5] = mu * db[-5] - (1 - mu) * learning_rate * (dZ_1.sum(axis=0) + reg * self.biases[-5])
            self.weights[-5] += dW[-5]
            self.biases[-5] += db[-5]

            # if j % 10 == 0:
            #     pYvalid, _, __, ___, ____ = self.forward(X)
            #     c = cost(T, pYvalid)
            #     costs.append(c)
            #     e = error_rate(Y, np.argmax(pYvalid, axis=1))
            #     print("i:", i, "cost:", c, "error:", e)
            #     if e < best_validation_error:
            #         best_validation_error = e
            #         print("best_validation_error:", best_validation_error)

        if i % 50 == 0:
            pYvalid, _, __, ___, ____, _____ = self.forward(X)
            c = cost(T, pYvalid)
            costs.append(c)
            print("i:", i, "cost:", c)

    if show_fig:
        plt.plot(costs)
        plt.show()
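# The backprop in the fit() above multiplies each delta by a dropout mask from U,
# but the forward pass is not shown in this section. The sketch below illustrates one
# way such a forward pass could produce those masks; the tanh nonlinearity, the
# keep-probability p_keep, and the exact mask placement are assumptions, not the
# project's actual implementation.
def dropout_forward_sketch(self, X, p_keep=0.8):
    # returns the same tuple shape the fit() above unpacks from self.forward()
    U = []
    activations = []
    Z = X
    for W, b in zip(self.weights[:-1], self.biases[:-1]):
        mask = np.random.rand(X.shape[0], W.shape[1]) < p_keep  # binary mask per hidden unit
        Z = np.tanh(Z.dot(W) + b) * mask                        # drop units during training
        U.append(mask)
        activations.append(Z)
    # output layer: softmax over logits
    A = Z.dot(self.weights[-1]) + self.biases[-1]
    expA = np.exp(A - A.max(axis=1, keepdims=True))
    pY = expA / expA.sum(axis=1, keepdims=True)
    Z_1, Z_2, Z_3, Z_4 = activations
    return pY, Z_4, Z_3, Z_2, Z_1, U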
def fit(self, X, Y, learning_rate=5 * 10e-5, reg=10e-2, epochs=51, show_fig=False):
    N, D = X.shape
    K = len(set(Y))
    T = y_hot_encoding(Y)
    W1, b1 = init_weight_and_bias(D, self.M_1)
    W2, b2 = init_weight_and_bias(self.M_1, self.M_2)
    W3, b3 = init_weight_and_bias(self.M_2, self.M_3)
    W4, b4 = init_weight_and_bias(self.M_3, self.M_4)
    W5, b5 = init_weight_and_bias(self.M_4, K)
    self.weights = [W1, W2, W3, W4, W5]
    self.biases = [b1, b2, b3, b4, b5]

    batch_sz = 100
    n_batches = int(N / batch_sz)

    decay_rate = 0.999
    eps = 10e-10
    beta_1 = 0.9
    beta_2 = 0.999
    # first moment
    m_W = [0, 0, 0, 0, 0]
    m_b = [0, 0, 0, 0, 0]
    # second moment
    v_W = [0, 0, 0, 0, 0]
    v_b = [0, 0, 0, 0, 0]

    def updater(idx, gW, gb):
        # Adam-style update: running moment estimates with the bias correction
        # folded into the stored values, then a per-parameter scaled step
        m_W[idx] = (beta_1 * m_W[idx] + (1 - beta_1) * gW) / (1 - beta_1 ** t)
        m_b[idx] = (beta_1 * m_b[idx] + (1 - beta_1) * gb) / (1 - beta_1 ** t)
        v_W[idx] = (beta_2 * v_W[idx] + (1 - beta_2) * gW * gW) / (1 - beta_2 ** t)
        v_b[idx] = (beta_2 * v_b[idx] + (1 - beta_2) * gb * gb) / (1 - beta_2 ** t)
        self.weights[idx] -= learning_rate * m_W[idx] / np.sqrt(v_W[idx] + eps)
        self.biases[idx] -= learning_rate * m_b[idx] / np.sqrt(v_b[idx] + eps)

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        for j in range(n_batches):
            # number of the current iteration (used for bias correction)
            t = 1 + i * n_batches + j
            Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
            Tbatch = T[j * batch_sz:(j * batch_sz + batch_sz)]

            # forward propagation and cost calculation
            pY, Z_4, Z_3, Z_2, Z_1 = self.forward(Xbatch)
            Z_4_deriv = self.nonlinear(self.layers[-1], Z=Z_4)[1]
            Z_3_deriv = self.nonlinear(self.layers[-1], Z=Z_3)[1]
            Z_2_deriv = self.nonlinear(self.layers[-2], Z=Z_2)[1]
            Z_1_deriv = self.nonlinear(self.layers[-3], Z=Z_1)[1]

            # gradient step, layer by layer
            # learning_rate=5*10e-5, reg=10e-2, epochs=51
            pY_T = pY - Tbatch

            gW5 = Z_4.T.dot(pY_T) + reg * self.weights[-1]
            gb5 = pY_T.sum(axis=0) + reg * self.biases[-1]
            updater(-1, gW5, gb5)

            dZ_4 = pY_T.dot(self.weights[-1].T) * Z_4_deriv
            gW4 = Z_3.T.dot(dZ_4) + reg * self.weights[-2]
            gb4 = dZ_4.sum(axis=0) + reg * self.biases[-2]
            updater(-2, gW4, gb4)

            dZ_3 = dZ_4.dot(self.weights[-2].T) * Z_3_deriv
            gW3 = Z_2.T.dot(dZ_3) + reg * self.weights[-3]
            gb3 = dZ_3.sum(axis=0) + reg * self.biases[-3]
            updater(-3, gW3, gb3)

            dZ_2 = dZ_3.dot(self.weights[-3].T) * Z_2_deriv
            gW2 = Z_1.T.dot(dZ_2) + reg * self.weights[-4]
            gb2 = dZ_2.sum(axis=0) + reg * self.biases[-4]
            updater(-4, gW2, gb2)

            dZ_1 = dZ_2.dot(self.weights[-4].T) * Z_1_deriv
            gW1 = Xbatch.T.dot(dZ_1) + reg * self.weights[-5]
            gb1 = dZ_1.sum(axis=0) + reg * self.biases[-5]
            updater(-5, gW1, gb1)

            # if j % 10 == 0:
            #     pYvalid, _, __, ___, ____ = self.forward(X)
            #     c = cost(T, pYvalid)
            #     costs.append(c)
            #     e = error_rate(Y, np.argmax(pYvalid, axis=1))
            #     print("i:", i, "cost:", c, "error:", e)
            #     if e < best_validation_error:
            #         best_validation_error = e
            #         print("best_validation_error:", best_validation_error)

        if i % 10 == 0:
            pYvalid, _, __, ___, ____ = self.forward(X)
            c = cost(T, pYvalid)
            costs.append(c)
            print("i:", i, "cost:", c)

    if show_fig:
        plt.plot(costs)
        plt.show()
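# For reference, a textbook single-parameter Adam step. The updater() above folds the
# bias correction into the stored moments rather than keeping separate m_hat / v_hat
# values, but the ingredients are the same. Names here are illustrative only.
def adam_step(w, g, m, v, t, learning_rate=5 * 10e-5, beta_1=0.9, beta_2=0.999, eps=10e-10):
    m = beta_1 * m + (1 - beta_1) * g          # first-moment estimate
    v = beta_2 * v + (1 - beta_2) * g * g      # second-moment estimate
    m_hat = m / (1 - beta_1 ** t)              # bias-corrected first moment
    v_hat = v / (1 - beta_2 ** t)              # bias-corrected second moment
    w = w - learning_rate * m_hat / (np.sqrt(v_hat) + eps)
    return w, m, v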