import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm


def main():
    lr = 0.001
    input_size = 50
    output_size = 10
    n_iterations = 100

    # Both input and "ground truth" are random vectors
    x = np.random.random(input_size)
    y = np.random.random(output_size)

    # Randomly initialize neural network weights
    # weights = to_value(np.random.random((input_size, output_size)))
    nn = MLP(input_size, output_size, [5, 10, 20])
    print(nn.layers[0])

    losses = []
    for i in tqdm(range(n_iterations)):
        # Forward pass and squared-error loss against the random target
        y_pred = nn(x)
        loss = np.sum((y - y_pred) * (y - y_pred))
        losses.append(loss.data)

        # Backward pass, then a plain gradient-descent step on every parameter
        loss.backward()
        for p in nn.parameters():
            p.data -= lr * p.grad
        nn.zero_grad()

    plt.plot(losses)
    plt.ylabel('Loss')
    plt.xlabel('Iteration')
    plt.title('Multilayer perceptron fitting random noise')
    plt.show()
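The training loop only touches a tiny interface: every parameter exposes .data and .grad, and calling backward() on the loss fills in the gradients. As a reminder of what that interface looks like, here is a minimal, hypothetical sketch of a scalar Value node; it is an illustration of the pattern, and the actual Value/MLP classes used above may differ in detail.

class Value:
    """Scalar node in a computation graph: holds a value and its gradient."""
    def __init__(self, data, _parents=()):
        self.data = data
        self.grad = 0.0
        self._parents = _parents
        self._backward = lambda: None

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other))
        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other))
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def backward(self):
        # Topologically sort the graph, then push gradients from the output
        # back to every input (reverse-mode autodiff).
        topo, visited = [], set()
        def build(v):
            if v not in visited:
                visited.add(v)
                for p in v._parents:
                    build(p)
                topo.append(v)
        build(self)
        self.grad = 1.0
        for v in reversed(topo):
            v._backward()


# Tiny check: for loss = a*b + b, d(loss)/da = b and d(loss)/db = a + 1.
a, b = Value(2.0), Value(3.0)
loss = a * b + b
loss.backward()
print(a.grad, b.grad)  # 3.0, 3.0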
import numpy as np
import torch as T
import torch.optim as optim


class Policy(object):
    def __init__(self, input_dim, n_actions, gamma=0.9):
        self.input_dim = input_dim
        self.n_actions = n_actions
        self.gamma = gamma
        self.model = MLP(input_dim, [32, 32], n_actions)
        self.optim = optim.Adam(self.model.parameters(), lr=1e-2)
        self.action_reward = []

    def get_action(self, observation, stochastic=True):
        pred = self.model(observation)
        if stochastic:
            # Sample an action from the predicted distribution; the returned
            # stochastic node is what we later call .reinforce() on.
            return pred.multinomial()
        return pred[0].argmax()

    def update(self):
        # Compute discounted returns, walking the episode backwards so that
        # each action is credited with the rewards that followed it.
        R = 0
        rewards = []
        for _, reward in reversed(self.action_reward):
            R = reward + self.gamma * R
            rewards.insert(0, R)

        # Normalize returns to reduce the variance of the gradient estimate.
        rewards = T.Tensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)

        actions = []
        for (action, _), reward in zip(self.action_reward, rewards):
            action.reinforce(reward)
            actions.append(action)

        self.optim.zero_grad()
        T.autograd.backward(actions, [None for _ in actions])
        self.optim.step()
        self.action_reward = []

    def record(self, action, reward):
        self.action_reward.append((action, reward))
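For context, here is a hedged sketch of how Policy might be driven end to end. It assumes the classic Gym step/reset API and the pre-0.4 PyTorch interface (Variable, stochastic .multinomial()/.reinforce() nodes) that the class above already relies on; the environment name, dimensions, and episode budget are illustrative, not from the original.

import gym
import torch as T
from torch.autograd import Variable

env = gym.make('CartPole-v1')            # 4-dimensional observation, 2 actions (assumed)
policy = Policy(input_dim=4, n_actions=2)

for episode in range(500):
    obs = env.reset()
    done = False
    while not done:
        state = Variable(T.from_numpy(obs).float().unsqueeze(0))
        action = policy.get_action(state)        # stochastic node
        obs, reward, done, _ = env.step(action.data[0, 0])
        policy.record(action, reward)
    policy.update()    # one REINFORCE update per finished episode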