def train_step(self, X_train, y_train, h):
    ys, caches = [], []
    total_loss = 0
    grads = {k: np.zeros_like(v) for k, v in self.model.items()}

    # forward pass and store values for bptt
    for x, y in zip(X_train, y_train):
        y_pred, h, cache = self._forward(x, h)
        p = softmax(y_pred)
        log_likelihood = -np.log(p[range(y_pred.shape[0]), y])
        total_loss += np.sum(log_likelihood) / y_pred.shape[0]
        ys.append(y_pred)
        caches.append(cache)
    total_loss /= X_train.shape[0]

    # backprop through time
    dh_next = np.zeros((1, self.h_size))
    for t in reversed(range(len(X_train))):
        grad, dh_next = self._backward(ys[t], y_train[t], dh_next, caches[t])
        # sum up the gradients over all time steps
        for k in grads.keys():
            grads[k] += grad[k]

    # clip exploding gradients
    for k, v in grads.items():
        grads[k] = np.clip(v, -5.0, 5.0)

    return total_loss, grads, h
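A minimal sketch of how train_step could be driven from an outer loop with plain SGD, assuming X and y are NumPy arrays of token indices; the names fit, seq_length, learning_rate and n_epochs are illustrative and not part of the class above.

import numpy as np

def fit(rnn, X, y, seq_length=25, learning_rate=1e-3, n_epochs=10):
    for epoch in range(n_epochs):
        h = np.zeros((1, rnn.h_size))  # reset the hidden state each epoch
        epoch_loss = 0.0
        for start in range(0, len(X) - seq_length, seq_length):
            loss, grads, h = rnn.train_step(X[start:start + seq_length],
                                            y[start:start + seq_length],
                                            h)
            epoch_loss += loss
            # vanilla SGD update on every parameter
            for k in rnn.model:
                rnn.model[k] -= learning_rate * grads[k]
        print(f"epoch {epoch}: loss {epoch_loss:.4f}")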
def SoftmaxLoss(X, y):
    m = y.shape[0]
    p = softmax(X)
    log_likelihood = -np.log(p[range(m), y])
    loss = np.sum(log_likelihood) / m
    dx = p.copy()
    dx[range(m), y] -= 1
    dx /= m
    return loss, dx
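The softmax helper used throughout these snippets is not shown in this section; a numerically stable version, which shifts the scores by their row-wise maximum before exponentiating, could look like the following sketch.

import numpy as np

def softmax(X):
    # shifting by the row maximum does not change the result
    # (softmax is shift-invariant) but avoids overflow in exp
    e = np.exp(X - np.max(X, axis=1, keepdims=True))
    return e / np.sum(e, axis=1, keepdims=True)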
def _backward(self, out, y, dh_next, cache):
    # the forward step is assumed to cache the one-hot input, the
    # previous hidden state and the new hidden state for this time step
    X_onehot, h_prev, h = cache

    # gradient of the softmax output from the forward step
    dout = softmax(out)
    dout[range(len(y)), y] -= 1

    # fully connected (hidden -> output) backward step
    dWhy = h.T @ dout
    dby = np.sum(dout, axis=0).reshape(1, -1)

    # backprop into the hidden state and add the gradient
    # flowing in from the next time step
    dh = dout @ self.model["Why"].T + dh_next

    # gradient through tanh
    dh_raw = dh * (1 - h ** 2)

    # hidden state parameters
    dbh = dh_raw
    dWhh = h_prev.T @ dh_raw
    dWxh = X_onehot.T @ dh_raw

    # gradient passed on to the previous time step
    dh_next = dh_raw @ self.model["Whh"].T

    grads = dict(Wxh=dWxh, Whh=dWhh, Why=dWhy, bh=dbh, by=dby)
    return grads, dh_next
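The backward pass above relies on the forward step caching the one-hot input, the previous hidden state and the newly computed hidden state. That forward step is not part of this section; under those assumptions (and an assumed self.vocab_size attribute) a vanilla RNN cell could look roughly like this.

def _forward(self, x, h_prev):
    # x is assumed to hold one token index per sequence in the (mini-)batch
    x = np.atleast_1d(x)
    X_onehot = np.zeros((x.shape[0], self.vocab_size))
    X_onehot[np.arange(x.shape[0]), x] = 1.0

    # vanilla RNN cell: new hidden state and output scores
    h = np.tanh(X_onehot @ self.model["Wxh"]
                + h_prev @ self.model["Whh"]
                + self.model["bh"])
    out = h @ self.model["Why"] + self.model["by"]

    # cache everything _backward needs for this time step
    cache = (X_onehot, h_prev, h)
    return out, h, cache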
def predict(self, X):
    X = self.forward(X)
    return np.argmax(softmax(X), axis=1)
def evaluate(self, X, y):
    out = self.forward(X)
    loss, dout = self.loss_func(out, y)
    return np.argmax(softmax(out), axis=1), loss
def predict(self, X):
    X = self.forward(X)
    return softmax(X)
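Because this variant of predict returns the full probability distribution instead of the argmax, a typical use is to sample the next token from it, for example when generating text one character at a time (a sketch; rnn is an assumed instance name).

import numpy as np

def sample_next(rnn, X):
    # probability distribution over classes for the last position
    p = rnn.predict(X)[-1]
    # draw the next index instead of always taking the argmax,
    # which keeps generated sequences from collapsing into loops
    return np.random.choice(len(p), p=p)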