def predict(self, X):
    """Return the index of the highest-probability class for the input X.

    X is pushed through a sigmoid hidden layer (self.Wh, self.bh) and a
    softmax output layer (self.Ws, self.bs); the result is the argmax over
    axis 0 of the softmax output, i.e. class indices rather than the
    probabilities themselves.
    """
    hidden_activations = sigmoid(self.Wh @ X + self.bh)
    class_probs = softmax_vectorized(self.Ws @ hidden_activations + self.bs)
    # Pick the winning class along axis 0
    return class_probs.argmax(axis=0)
def forward(self, scores, ys):
    """Compute the softmax negative log-likelihood for the given scores.

    Stores ys and the softmax probabilities on self (self.ys, self.probs).

    Returns a tuple (total loss, individual losses, probabilities), where
    the individual losses are -log of the probability picked out at
    [ys[i], i] for each i in range(len(ys)).
    """
    self.ys = ys
    probabilities = softmax_vectorized(np.array(scores))
    self.probs = probabilities

    # Probability assigned to the labelled class, one entry per label
    positions = range(len(ys))
    picked = probabilities[ys, positions]

    # Loss
    per_item = -np.log(picked)
    return per_item.sum(), per_item, probabilities
def forward_backward_prop(self, X=None, ys=None, rollout=None, Whh=None, bhh=None, Wxh=None, bxh=None, Ws=None, bs=None, hidden=None, predict=False):
    """Perform forward and backward prop over `rollout` time steps.

    Any weight, bias or initial hidden state that is not passed as a NumPy
    array falls back to the attribute of the same name on self.

    Parameters
    ----------
    X : array indexed as X[:, [t]] for t in range(rollout); only read when
        predict is True. In training mode inputs come from self.input.
    ys : ignored; labels come from self.input in training mode and are
        dummy values (1) in predict mode.
    rollout : number of time steps; a falsy value means self.rollout.
    Whh, bhh, Wxh, bxh, Ws, bs, hidden : optional overrides for the
        corresponding attributes on self.
    predict : when True, skip backprop and return early.

    Returns
    -------
    predict=True  -> (final hidden state, dict of scores keyed by t)
    predict=False -> State(loss, Gradients(dWhh, dbhh, dWxh, dbxh, dWs, dbs),
                           final hidden state)
    """
    # Hidden and input weights
    Whh = Whh if isinstance(Whh, np.ndarray) else self.Whh
    bhh = bhh if isinstance(bhh, np.ndarray) else self.bhh
    Wxh = Wxh if isinstance(Wxh, np.ndarray) else self.Wxh
    bxh = bxh if isinstance(bxh, np.ndarray) else self.bxh

    # Softmax weights
    Ws = Ws if isinstance(Ws, np.ndarray) else self.Ws
    bs = bs if isinstance(bs, np.ndarray) else self.bs

    # Initial hidden state
    hidden = hidden if isinstance(hidden, np.ndarray) else self.hidden

    # How far to go in the sequence
    rollout = rollout if rollout else self.rollout

    # Convert X and ys to dictionaries keyed by time step (1-based)
    if predict:
        X = {t + 1: X[:, [t]] for t in range(rollout)}
        ys = {t + 1: 1 for t in range(rollout)}  # dummy labels, loss is discarded
    else:
        X, ys = {}, {}

    # Forward pass!
    dWhh, dbhh = np.zeros_like(Whh), np.zeros_like(bhh)
    dWxh, dbxh = np.zeros_like(Wxh), np.zeros_like(bxh)
    dWs, dbs = np.zeros_like(Ws), np.zeros_like(bs)

    loss = 0.
    hiddens = {0: hidden}
    dhiddens, dhiddens_local = {}, {}
    # No gradient flows into the last step from the future
    dhiddens_downstream = {rollout: np.zeros((self.H, 1))}
    scores, probs = {}, {}

    for t in range(1, rollout + 1):
        # Get the next input in the sequence. In predict mode the caller's
        # X must be used as given, so the training stream is left untouched.
        if not predict:
            X[t], ys[t] = next(self.input)

        # Previous hidden layer and input at time t
        Z = (Whh @ hiddens[t - 1] + bhh) + (Wxh @ X[t] + bxh)
        hiddens[t] = np.tanh(Z)

        # Softmax
        scores[t] = Ws @ hiddens[t] + bs
        probs[t] = softmax_vectorized(scores[t])
        y_hat = probs[t][ys[t]]

        # Loss
        loss += -np.log(y_hat).sum()

    # Add regularization
    loss += self.regularizer * 0.5 * (
        np.sum(Whh**2) + np.sum(bhh**2)
        + np.sum(Wxh**2) + np.sum(bxh**2)
        + np.sum(Ws**2) + np.sum(bs**2)
    )

    if predict:
        return hiddens[rollout], scores

    # Backpropagate!
    for t in range(rollout, 0, -1):
        # Scores: copy so the stored probabilities are not overwritten by
        # the in-place subtraction below
        dscores = probs[t].copy()
        dscores[ys[t], 0] -= 1

        # Softmax weights
        dbs += dscores
        dWs += dscores @ hiddens[t].T

        # Gradient on the hidden state: from this step's output plus
        # whatever flowed back from step t+1
        dhiddens_local[t] = Ws.T @ dscores
        dhiddens[t] = dhiddens_local[t] + dhiddens_downstream[t]

        # Karpathy optimization
        dZ = tanh_grad(hiddens[t]) * dhiddens[t]

        # Input and hidden weights
        dbxh += dZ
        dWxh += dZ @ X[t].T
        dbhh += dZ
        dWhh += dZ @ hiddens[t - 1].T

        # Set up incoming hidden weight gradient for previous time step
        dhiddens_downstream[t - 1] = Whh.T @ dZ

    # Regularization
    #
    # Hidden and input weights
    dWhh += (self.regularizer * Whh)
    dbhh += (self.regularizer * bhh)
    dWxh += (self.regularizer * Wxh)
    dbxh += (self.regularizer * bxh)

    # Softmax weights
    dWs += (self.regularizer * Ws)
    dbs += (self.regularizer * bs)

    # Log additional info?
    if self.inspect:
        self.xs, self.ys = str(X), str(ys)
        self.scores, self.probs = scores, probs
        self.loss = loss
        self.dWhh, self.dbhh, self.dWxh, self.dbxh = dWhh, dbhh, dWxh, dbxh
        self.dWs, self.dbs = dWs, dbs
        self.hiddens = hiddens
        self.dhiddens = dhiddens
        self.dhiddens_local, self.dhiddens_downstream = dhiddens_local, dhiddens_downstream

    return State(loss, Gradients(dWhh, dbhh, dWxh, dbxh, dWs, dbs), hiddens[rollout])
def forward_backward_prop(self, X=None, ys=None, rollout=None, train_index=None, Whh=None, bhh=None, Wxh=None, bxh=None, Ws=None, bs=None, hidden=None, predict=False):
    """Perform forward and backward prop over `rollout` time steps.

    Any weight, bias or initial hidden state that is not passed as a NumPy
    array falls back to the attribute of the same name on self.

    Parameters
    ----------
    X, ys : optional inputs and labels. When X is not a NumPy array, a
        window of `rollout` columns starting at `train_index` is taken from
        self.X_train / self.ys_train, wrapping to the start once if the
        window runs past self.T. A leading zero column/label is prepended
        in either case so that time steps are indexed from 1.
    rollout : number of time steps; a falsy value means self.rollout.
    train_index : start of the training window; None means
        self.train_index (an explicit 0 is honoured).
    Whh, bhh, Wxh, bxh, Ws, bs, hidden : optional overrides for the
        corresponding attributes on self.
    predict : when True, skip backprop and return early.

    Returns
    -------
    predict=True  -> (final hidden state, dict of scores keyed by t)
    predict=False -> State(loss, Gradients(dWhh, dbhh, dWxh, dbxh, dWs, dbs),
                           final hidden state)
    """
    # Hidden and input weights
    Whh = Whh if isinstance(Whh, np.ndarray) else self.Whh
    bhh = bhh if isinstance(bhh, np.ndarray) else self.bhh
    Wxh = Wxh if isinstance(Wxh, np.ndarray) else self.Wxh
    bxh = bxh if isinstance(bxh, np.ndarray) else self.bxh

    # Softmax weights
    Ws = Ws if isinstance(Ws, np.ndarray) else self.Ws
    bs = bs if isinstance(bs, np.ndarray) else self.bs

    # Initial hidden state
    hidden = hidden if isinstance(hidden, np.ndarray) else self.hidden

    # Where to start in the sequence and how far to go
    rollout = rollout if rollout else self.rollout
    # 0 is a legitimate start position, so only None triggers the fallback
    train_index = self.train_index if train_index is None else train_index

    # Get next portion of sequence to train on
    if not isinstance(X, np.ndarray):
        window_end = train_index + rollout
        X = self.X_train[:, train_index:window_end]
        ys = self.ys_train[train_index:window_end]

        # Got to the end and need to wrap around?
        if window_end > self.T:
            rollover_index = window_end % self.T
            X = np.hstack([X, self.X_train[:, :rollover_index]])
            ys = np.hstack([ys, self.ys_train[:rollover_index]])

    # Append column of zeros to align X and Y with natural time.
    # Builtin int replaces the np.int alias, which NumPy has removed.
    X = np.hstack([np.zeros((self.N, 1)), X])
    ys = np.hstack([np.zeros(1, dtype=int), ys])

    # Forward pass!
    dWhh, dbhh = np.zeros_like(Whh), np.zeros_like(bhh)
    dWxh, dbxh = np.zeros_like(Wxh), np.zeros_like(bxh)
    dWs, dbs = np.zeros_like(Ws), np.zeros_like(bs)

    loss = 0.
    hiddens = {0: hidden}
    dhiddens, dhiddens_local = {}, {}
    # No gradient flows into the last step from the future
    dhiddens_downstream = {rollout: np.zeros((self.H, 1))}
    scores, probs = {}, {}

    for t in range(1, rollout + 1):
        # Previous hidden layer and input at time t
        Z = (Whh @ hiddens[t - 1] + bhh) + (Wxh @ X[:, [t]] + bxh)
        hiddens[t] = np.tanh(Z)

        # Softmax
        scores[t] = Ws @ hiddens[t] + bs
        probs[t] = softmax_vectorized(scores[t])
        y_hat = probs[t][ys[t]]

        # Loss
        loss += -np.log(y_hat).sum()

    # Add regularization
    loss += self.regularizer * 0.5 * (
        np.sum(Whh**2) + np.sum(bhh**2)
        + np.sum(Wxh**2) + np.sum(bxh**2)
        + np.sum(Ws**2) + np.sum(bs**2)
    )

    if predict:
        return hiddens[rollout], scores

    # Backpropagate!
    for t in range(rollout, 0, -1):
        # Scores: copy so the stored probabilities are not overwritten by
        # the in-place subtraction below
        dscores = probs[t].copy()
        dscores[ys[t], 0] -= 1

        # Softmax weights
        dbs += dscores
        dWs += dscores @ hiddens[t].T

        # Gradient on the hidden state: from this step's output plus
        # whatever flowed back from step t+1
        dhiddens_local[t] = Ws.T @ dscores
        dhiddens[t] = dhiddens_local[t] + dhiddens_downstream[t]

        # Karpathy optimization
        dZ = tanh_grad(hiddens[t]) * dhiddens[t]

        # Input and hidden weights
        dbxh += dZ
        dWxh += dZ @ X[:, [t]].T
        dbhh += dZ
        dWhh += dZ @ hiddens[t - 1].T

        # Set up incoming hidden weight gradient for previous time step
        dhiddens_downstream[t - 1] = Whh.T @ dZ

    # Regularization
    #
    # Hidden and input weights
    dWhh += (self.regularizer * Whh)
    dbhh += (self.regularizer * bhh)
    dWxh += (self.regularizer * Wxh)
    dbxh += (self.regularizer * bxh)

    # Softmax weights
    dWs += (self.regularizer * Ws)
    dbs += (self.regularizer * bs)

    # Log additional info?
    if self.inspect:
        # Drop the alignment column/label that was prepended above
        self.xs, self.ys = str(X[:, 1:]), str(ys[1:])
        self.scores, self.probs = scores, probs
        self.loss = loss
        self.dWhh, self.dbhh, self.dWxh, self.dbxh = dWhh, dbhh, dWxh, dbxh
        self.dWs, self.dbs = dWs, dbs
        self.hiddens = hiddens
        self.dhiddens = dhiddens
        self.dhiddens_local, self.dhiddens_downstream = dhiddens_local, dhiddens_downstream

    return State(loss, Gradients(dWhh, dbhh, dWxh, dbxh, dWs, dbs), hiddens[rollout])