def generate_episode(env, model, device, steps_done, episode_rewards):
    state = env.reset().transpose((2, 0, 1))
    i_rewards, i_states, i_actions = [], [], []
    total_reward = 0
    for t in count():
        # Select and perform an action
        action_idx = select_action(state, model, device, steps_done)
        action = index_to_action(action_idx)
        new_state, reward, done, _ = env.step(action)
        env.render()
        new_state = new_state.transpose((2, 0, 1))
        steps_done += 1

        # Save reward, action, state
        i_rewards.append(reward)
        i_actions.append(action_idx)
        i_states.append(state)
        total_reward += reward

        # Move state forward
        state = new_state

        # Break if done (or after 5000 steps)
        if done or t == 5000:
            print(total_reward)
            episode_rewards.append(total_reward)
            plot_rewards(episode_rewards)
            break

    return i_rewards, i_states, i_actions, steps_done
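generate_episode relies on a select_action helper that is not shown in this section. A minimal sketch of what it might look like, assuming an epsilon-greedy policy whose exploration rate decays with steps_done (the EPS_* constants and the model.n_outputs attribute are illustrative assumptions, not taken from the original code):

import math
import random

import torch

# Illustrative exploration schedule; the real code presumably defines its own constants.
EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 10000

def select_action(state, model, device, steps_done):
    # Exploration rate decays exponentially as more environment steps are taken.
    eps = EPS_END + (EPS_START - EPS_END) * math.exp(-steps_done / EPS_DECAY)
    if random.random() > eps:
        # Exploit: return the index of the action with the highest predicted Q-value.
        with torch.no_grad():
            state_t = torch.tensor(state, dtype=torch.float, device=device).unsqueeze(0)
            return model(state_t).argmax(dim=1).item()
    # Explore: return a random action index (n_outputs assumed to be stored on the model).
    return random.randrange(model.n_outputs)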
def train():
    policy_net = DQN(n_inputs=2*LARGEST_CARD, n_outputs=HAND_SIZE).to(device)
    target_net = DQN(n_inputs=2*LARGEST_CARD, n_outputs=HAND_SIZE).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = RMSprop(policy_net.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    memory = ReplayMemory(MEMORY_SIZE)
    env = Game(N_PLAYERS, LARGEST_CARD, HAND_SIZE, N_ROUNDS)
    select_action = generate_action_selector()

    rewards = []
    for episode in trange(N_EPISODES):
        total_reward = 0
        observation = env.reset()
        done = False
        while not done:
            # Encode the observation and pick an action from the policy network
            state = torch.tensor([create_state(observation)], dtype=torch.float, device=device)
            action = select_action(policy_net, state, observation.hand)
            observation, reward, done, info = env.step(action.item())
            total_reward += reward

            # Terminal transitions are stored with next_state = None
            if not done:
                next_state = torch.tensor([create_state(observation)], dtype=torch.float, device=device)
            else:
                next_state = None

            reward = torch.tensor([reward], device=device)
            memory.push(state, action, next_state, reward)
            state = next_state

            # One gradient step on a minibatch sampled from replay memory
            optimize_model(policy_net, target_net, optimizer, memory)

            if done:
                rewards.append(total_reward)
                break

        # Periodically sync the target network, save a checkpoint, and refresh the plot
        if episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
        if episode % SAVE_INTERVAL == 0:
            torch.save(target_net.state_dict(), f'models/model_{episode}.pth')
        if episode % 100 == 0:
            plot_rewards(np.cumsum(rewards), baseline=np.zeros(len(rewards)))

    return rewards
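Both ReplayMemory and optimize_model are defined elsewhere. As a rough sketch of the standard DQN update this loop assumes, sampling a minibatch of stored transitions and regressing Q(s, a) toward r + GAMMA * max_a' Q_target(s', a') (BATCH_SIZE, GAMMA, and the Transition namedtuple are assumptions, not the project's actual definitions):

import random
from collections import deque, namedtuple

import torch
import torch.nn.functional as F

# Assumed hyperparameters and transition container; the original defines its own.
BATCH_SIZE, GAMMA = 128, 0.99
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    """Fixed-size buffer of transitions, sampled uniformly at random."""
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)

def optimize_model(policy_net, target_net, optimizer, memory):
    if len(memory) < BATCH_SIZE:
        return
    batch = Transition(*zip(*memory.sample(BATCH_SIZE)))

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    device = state_batch.device

    # Terminal transitions were stored with next_state = None; mask them out.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  dtype=torch.bool, device=device)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])

    # Q(s, a) for the actions that were actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # max_a' Q_target(s', a') for non-terminal next states; zero for terminal ones.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1).values
    expected_values = reward_batch + GAMMA * next_state_values

    loss = F.smooth_l1_loss(state_action_values, expected_values.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()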
def train():
    env = Game(N_PLAYERS, LARGEST_CARD, HAND_SIZE, N_ROUNDS)
    net = DQN(n_inputs=2*LARGEST_CARD, n_outputs=HAND_SIZE).to(device)
    optimizer = RMSprop(net.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    episodic_rewards = []
    for episode in trange(N_EPISODES):
        # Roll out a full episode, then update the network on it
        states, rewards, actions = generate_episode(env, net)
        optimize_model(net, optimizer, states, rewards, actions)
        # Track the total reward collected by the first player (the learning agent)
        episodic_rewards.append(sum(rewards[:, 0]))

        # Periodically save a checkpoint and refresh the plot
        if episode % SAVE_INTERVAL == 0:
            torch.save(net.state_dict(), f'models/model_{episode}.pth')
        if episode % 100 == 0:
            plot_rewards(np.cumsum(episodic_rewards), baseline=np.zeros_like(episodic_rewards))

    return episodic_rewards
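In this version optimize_model performs a full-episode policy-gradient update rather than a Q-learning step. A minimal REINFORCE-style sketch under that assumption, treating the network's outputs as action scores and player 0's rewards as the learning signal (GAMMA, the tensor shapes, and the absence of a baseline are all guesses, not the original implementation):

import torch
import torch.nn.functional as F

GAMMA = 0.99  # assumed discount factor

def optimize_model(net, optimizer, states, rewards, actions):
    # Discounted return G_t for the agent (player 0), computed backwards over the episode.
    agent_rewards = [r[0] for r in rewards]
    returns, g = [], 0.0
    for r in reversed(agent_rewards):
        g = r + GAMMA * g
        returns.insert(0, g)

    # Assumes states is a list of (1, n_inputs) tensors and actions a list of (1, 1) long tensors.
    states = torch.cat(states)
    actions = torch.cat(actions)
    returns = torch.tensor(returns, dtype=torch.float, device=states.device)

    # Log-probability of each action taken, treating the network outputs as unnormalised scores.
    log_probs = F.log_softmax(net(states), dim=1).gather(1, actions).squeeze(1)

    # REINFORCE: minimise -G_t * log pi(a_t | s_t), i.e. maximise expected return.
    loss = -(returns * log_probs).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()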