        # Append a ReLU nonlinearity before each subsequent Linear layer.
        for i in range(1, len(layer_sizes)):
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Linear(layer_sizes[i - 1], layer_sizes[i]))
        self.nn = nn.Sequential(*self.layers)

    def forward(self, x):
        # Flattens all dimensions, so this assumes a single (unbatched) observation.
        x = torch.flatten(x)
        return self.nn(x)


gamma = .95
alpha = .002
all_mean_diffs = []

all_states = env.get_all_states()
ql_a = Qlearning(env, n_states=len(all_states), n_actions=env.action_space.n,
                 plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)
ql_b = Qlearning(env, n_states=len(all_states), n_actions=env.action_space.n,
                 plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)

for ne in range(0, 10):
    np.random.seed(10)
    num_episodes = 100  # * ne
    convergence_durations = []
    ql_agents = []
    for i in range(2):
        # print('Simulation {}/{}'.format(i, 50))
        ql = Qlearning(env, n_states=len(all_states), n_actions=env.action_space.n,
                       plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)
        # dqn = DQN(env, qnet=net, plotter=plot, render=True, memory_length=2000,
        #           gamma=gamma, alpha=alpha, epsilon_start=0.3, caching_interval=3000)
        for e in range(num_episodes):
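# The Qlearning class instantiated above is not defined in this snippet; below
# is a minimal sketch of the tabular update it is assumed to perform on each
# step. Names here, including q_table, are illustrative, not the actual API.
import numpy as np


def q_update(q_table, s, a, r, s_next, alpha=.002, gamma=.95):
    # One-step Q-learning: move Q(s, a) toward the bootstrapped TD target
    # r + gamma * max_a' Q(s', a').
    td_target = r + gamma * np.max(q_table[s_next])
    q_table[s, a] += alpha * (td_target - q_table[s, a])

# e.g.: q = np.zeros((n_states, n_actions)); q_update(q, s, a, r, s_next)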
    # Overlay a moving average over the last 50 episode durations once enough
    # episodes have been recorded.
    his = 50
    if len(durations_t) >= his:
        means = durations_t.unfold(0, his, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(his - 1), means))
        plt.plot(means.numpy(), c='green')
    plt.pause(0.001)


gamma = .95
alpha = .002
all_mean_diffs = []

# Note: both agents are sized from env_a's state list, which assumes env_a and
# env_b share the same state space.
all_states = env_a.get_all_states()
ql_a = Qlearning(env_a, n_states=len(all_states), n_actions=env_a.action_space.n,
                 plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)
ql_b = Qlearning(env_b, n_states=len(all_states), n_actions=env_b.action_space.n,
                 plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)

for ne in range(0, 20):
    # np.random.seed(10)
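# all_mean_diffs above suggests the two agents' value estimates are compared as
# training progresses. A minimal sketch of such a comparison, assuming Qlearning
# exposes its table as an array attribute named q_table (the class definition is
# not shown here, so that attribute name is an illustrative assumption):
import numpy as np


def mean_q_diff(agent_a, agent_b):
    # Mean absolute elementwise difference between two equally shaped Q-tables.
    return np.abs(np.asarray(agent_a.q_table) - np.asarray(agent_b.q_table)).mean()

# e.g., inside the loop: all_mean_diffs.append(mean_q_diff(ql_a, ql_b))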
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(64, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 4)
        self.linreg = nn.Linear(64, 4)

    def forward(self, x):
        x = torch.flatten(x)
        # Only the single linear head is active; the fc1-fc3 MLP path was
        # unreachable dead code after the return, kept here commented out.
        return self.linreg(x)
        # x = F.relu(self.fc1(x))
        # x = F.relu(self.fc2(x))
        # x = self.fc3(x)
        # return x


all_states = env_a.get_all_states() + env_b.get_all_states()
# ql_a = Qlearning(env_a, n_states=len(all_states), n_actions=env_a.action_space.n,
#                  plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)
# ql_b = Qlearning(env_b, n_states=len(all_states), n_actions=env_b.action_space.n,
#                  plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)
dqn_a = DQN(env_a, qnet=PolicyNet().double(), plotter=None, render=False,
            memory_length=2000, gamma=.99, alpha=.001, epsilon_start=0.1)
dqn_b = DQN(env_b, qnet=PolicyNet().double(), plotter=None, render=False,
            memory_length=2000, gamma=.99, alpha=.001, epsilon_start=0.1)
# dqn.train(2000, 4, plot=True, verbose=True)

for ne in range(0, 30):
    # np.random.seed(10)
    num_episodes = 100  # * ne
    convergence_durations = []
    ql_agents = []
    dqn_a.train(num_episodes, 4, plot=False)
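# The DQN class used above is likewise not shown; below is a minimal sketch of
# the standard one-step replay update its train() call is assumed to perform,
# reusing the constructor's gamma. Function and argument names are illustrative,
# not the actual API; memory is assumed to hold (s, a, r, s_next, done) tuples.
import random

import torch


def dqn_update(qnet, optimizer, memory, batch_size=4, gamma=.99):
    # Sample a minibatch of transitions from the replay memory.
    for s, a, r, s_next, done in random.sample(memory, batch_size):
        # Bootstrapped one-step TD target; terminal transitions use the bare reward.
        with torch.no_grad():
            target = r if done else r + gamma * qnet(s_next).max()
        # Move Q(s, a) toward the target with a squared-error loss.
        loss = (qnet(s)[a] - target) ** 2
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# e.g.: optimizer = torch.optim.Adam(qnet.parameters(), lr=.001), matching the
# alpha passed to the DQN constructor above (an assumed correspondence).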