def main():
    """Evaluate a trained DQN on CartPole-v0 and print the mean episode reward.

    Reads the module-level settings ``model_path``, ``epsilon``,
    ``batch_size`` and ``episode``.

    Raises:
        FileNotFoundError: if nothing exists at ``model_path``.
    """
    # Validate before allocating the environment so nothing leaks on this path.
    if not os.path.exists(model_path):
        raise FileNotFoundError("You should train the DQN first!")

    # Create cartpole environment and network
    env = gym.make('CartPole-v0').unwrapped
    try:
        net = DQN(
            n_state=env.observation_space.shape[0],
            n_action=env.action_space.n,
            epsilon=epsilon,
            batch_size=batch_size,
            model_path=model_path,
        )
        net.load()
        net.cuda()

        reward_list = []
        for i in range(episode):
            s = env.reset()
            total_reward = 0
            finish = False
            while not finish:
                # Select action and obtain the reward
                a = net.chooseAction(s)
                s, r, finish, _ = env.step(a)
                total_reward += r
            print("Episode: %d \t Total reward: %d \t Eps: %f"
                  % (i, total_reward, net.epsilon))
            reward_list.append(total_reward)
    finally:
        # Release the environment even if loading or stepping fails.
        env.close()
    print("Testing average reward: ", np.mean(reward_list))
def load_and_test(opt):
    """Load two DQN players and print the results of a player-vs-player test.

    The second checkpoint path is derived from ``opt.load_path`` by replacing
    the character at index -11 with ``'2'``. ``opt.player`` selects which
    network is passed first to ``test_ep_pvp``; any value other than 1 or 2
    loads both networks and prints nothing further.
    """
    first_net = DQN(NUM_STATES, NUM_ACTIONS, opt.eps, opt)
    first_net.load(opt.load_path)

    path_chars = list(opt.load_path)
    print(opt.load_path)
    path_chars[-11] = '2'
    second_path = "".join(path_chars)
    print(second_path)

    second_net = DQN(NUM_STATES, NUM_ACTIONS, opt.eps, opt)
    second_net.load(second_path)

    if opt.player == 1:
        reward_p1, reward_p2, win, draw = test_ep_pvp(
            first_net, second_net, opt.num_test, opt.eps, render=opt.render)
        win_p1 = win
        win_p2 = 1 - win - draw
    elif opt.player == 2:
        # The win rate reported by test_ep_pvp belongs to the first network
        # passed in, which is player 2 here.
        reward_p2, reward_p1, win, draw = test_ep_pvp(
            second_net, first_net, opt.num_test, opt.eps, render=opt.render)
        win_p1 = 1 - win - draw
        win_p2 = win
    else:
        return

    print('p1 average reward:', reward_p1)
    print('p2 average reward:', reward_p2)
    print('p1 win rate:', win_p1)
    print('p2 win rate:', win_p2)
    print('draw rate:', draw)
def load_and_play(opt):
    """Load a trained DQN and report its results against a scripted opponent.

    ``opt.player`` selects the evaluation routine: 1 uses ``test_ep`` and
    2 uses ``test_ep_p2``. The average reward and win rate are printed.

    Raises:
        ValueError: if ``opt.player`` is neither 1 nor 2.
    """
    # Fail fast: previously an unexpected player value only surfaced as an
    # UnboundLocalError at the print statements, after the model was loaded.
    if opt.player not in (1, 2):
        raise ValueError(
            "opt.player must be 1 or 2, got {!r}".format(opt.player))

    net = DQN(NUM_STATES, NUM_ACTIONS, opt.eps, opt)
    net.load(opt.load_path)

    if opt.player == 1:
        r, w = test_ep(net, opt.opp_policy, opt.num_test, opt.eps,
                       render=opt.render)
    else:
        r, w = test_ep_p2(net, opt.opp_policy, opt.num_test, opt.eps,
                          render=opt.render)

    print('average reward: {:.3f}'.format(r))
    print('win rate: {:.1f}%'.format(100 * w))
def main():
    """Run the loaded DQN in the module-level ``env`` until interrupted.

    Keeps a sliding window of the last four preprocessed frames. Until the
    window holds four frames the model is asked for a random action;
    afterwards it is asked for a non-random one. The accumulated reward is
    printed at the end of every episode.
    """
    model = DQN(env.observation_space.shape, env.action_space.n)
    model.load(MODEL_FILENAME)

    try:
        while True:
            # Transition: window of the most recent frames, oldest first.
            transition = [rgb2dataset(env.reset())]

            model.episode += 1
            accum_reward = 0
            done = False

            while not done:
                # Only a full four-frame window is fed to the policy.
                action = model.get_action(
                    transition, is_random=len(transition) != 4)

                state_, reward, done, info = env.step(action)
                accum_reward += reward

                transition.append(rgb2dataset(state_))
                if len(transition) > 4:
                    transition.pop(0)

                if RENDER:
                    env.render()

            print("accum_reward : %7d" % (accum_reward))
    finally:
        # The episode loop never terminates on its own, so this is the only
        # place the environment gets released (Ctrl-C, errors).
        env.close()
class Agent:
    """
    The intelligent agent of the simulation. Set the model of the neural
    network used and general parameters. It is responsible to select the
    actions, optimize the neural network and manage the models.

    This variant stores its experience in a prioritized replay ``Memory``.
    """

    def __init__(self, action_set, train=True, load_path=None):
        """
        Build the policy/target networks, the optimizer and the replay memory

        Parameters
        ----------
        action_set: sequence
            The actions available to the agent
        train: bool
            When False, the policy network is loaded from ``load_path`` and
            put in eval mode
        load_path: string
            Checkpoint passed to ``policy_net.load`` when ``train`` is False
        """
        # 1. Initialize agent params
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.action_set = action_set
        self.action_number = len(action_set)
        self.steps_done = 0
        self.epsilon = Config.EPS_START
        self.episode_durations = []

        # 2. Build networks
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)

        # Evaluation builds the optimizer with lr=0 and hands it to the
        # checkpoint loader together with the network.
        learning_rate = Config.LEARNING_RATE if train else 0
        self.optimizer = optim.RMSprop(
            self.policy_net.parameters(), lr=learning_rate)

        if not train:
            self.policy_net.load(load_path, optimizer=self.optimizer)
            self.policy_net.eval()

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        # 3. Create Prioritized Experience Replay Memory
        self.memory = Memory(Config.MEMORY_SIZE)

    def append_sample(self, state, action, next_state, reward):
        """
        Save sample (error,<s,a,s',r>) to the replay memory

        The stored error is |Q(s, a) - target|, where the target is the
        reward alone for a terminal transition and the double-DQN target
        otherwise.

        Parameters
        ----------
        state: tensor
            State the action was taken in
        action: tensor
            Index of the action taken
        next_state: tensor or None
            Resulting state; None marks the end of the simulation
        reward: tensor
            Reward received for the transition
        """
        # Define if is the end of the simulation
        done = next_state is None

        # Only the error magnitude is stored, so no autograd graph is needed.
        with torch.no_grad():
            # Compute Q(s_t, a) - the model computes Q(s_t), then we select
            # the columns of actions taken
            state_action_values = self.policy_net(state).gather(
                1, action.view(-1, 1))

            if done:
                expected_state_action_values = reward
            else:
                # Compute argmax Q(s', a; θ)
                next_state_actions = self.policy_net(
                    next_state).max(1)[1].unsqueeze(1)
                # Compute Q(s', argmax Q(s', a; θ), θ-)
                next_state_values = self.target_net(next_state).gather(
                    1, next_state_actions).squeeze(1)
                # Compute the expected Q values
                expected_state_action_values = (
                    next_state_values * Config.GAMMA) + reward

            error = abs(
                state_action_values - expected_state_action_values
            ).cpu().numpy()

        self.memory.add(error, state, action, next_state, reward)

    def select_action(self, state, train=True):
        """
        Select an action with an epsilon-greedy policy

        Epsilon is recomputed from ``self.steps_done`` on every call, and the
        step counter is incremented, regardless of ``train``.

        Parameters
        ----------
        state: tensor
            The current state on the simulation
        train: bool
            Define if we are evaluating or training the model; when False
            the network is always used

        Returns
        -------
        action: tensor of shape (1, 1)
            The action with the highest Q-value, or a random action
        q: result of ``a.max(0)`` or None
            torch's (values, indices) pair of the network output reduced
            over dim 0; None when a random action was drawn
        """
        sample = random.random()

        # 1. Perform a epsilon-greedy algorithm
        # a. set the value for epsilon
        self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \
            math.exp(-1. * self.steps_done / Config.EPS_DECAY)
        self.steps_done += 1

        # b. make the decision for selecting a random action or selecting an
        #    action from the neural network
        explore = train and sample <= self.epsilon
        if explore:
            # select a random action
            # NOTE(review): the range is hard-coded to 2 actions rather than
            # self.action_number - confirm this matches the network output.
            print('random action')
            return torch.tensor([[random.randrange(2)]],
                                device=self.device, dtype=torch.long), None

        # select an action from the neural network
        with torch.no_grad():
            # a <- argmax Q(s, theta)
            a = self.policy_net(state)
            return a.max(1)[1].view(1, 1), a.max(0)

    def optimize_model(self):
        """
        Perform one step of optimization on the neural network

        Returns
        -------
        loss: float or None
            The Huber loss of the sampled batch, or None when the memory
            holds fewer than ``Config.BATCH_SIZE`` entries
        """
        if self.memory.tree.n_entries < Config.BATCH_SIZE:
            return None

        # NOTE(review): the importance-sampling weights returned by the
        # memory are not applied to the loss - confirm this is intended.
        transitions, idxs, is_weights = self.memory.sample(Config.BATCH_SIZE)

        # Transpose the batch (see http://stackoverflow.com/a/19343/3343043
        # for detailed explanation).
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch
        # elements. A bool mask is used; uint8 masks are deprecated for
        # indexing.
        non_final_mask = torch.tensor(
            [s is not None for s in batch.next_state],
            device=self.device, dtype=torch.bool)
        non_final_list = [s for s in batch.next_state if s is not None]

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = torch.zeros(Config.BATCH_SIZE, device=self.device)
        # torch.cat rejects an empty list, which happens when every sampled
        # transition is terminal; their next-state value simply stays 0.
        if non_final_list:
            non_final_next_states = torch.cat(non_final_list)
            with torch.no_grad():
                # Compute argmax Q(s', a; θ)
                next_state_actions = self.policy_net(
                    non_final_next_states).max(1)[1].unsqueeze(1)
                # Compute Q(s', argmax Q(s', a; θ), θ-)
                next_state_values[non_final_mask] = self.target_net(
                    non_final_next_states
                ).gather(1, next_state_actions).squeeze(1)

        # Compute the expected Q values
        expected_state_action_values = (
            next_state_values * Config.GAMMA) + reward_batch

        # Update priorities with the new absolute TD errors
        errors = torch.abs(
            state_action_values.squeeze(1) - expected_state_action_values
        ).detach().cpu().numpy()
        for idx, error in zip(idxs, errors):
            self.memory.update(idx, error)

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))
        loss_return = loss.item()

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            # Parameters that took no part in the forward pass have no grad.
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        return loss_return

    def save(self, step, logs_path, label):
        """
        Save the model on hard disc

        Parameters
        ----------
        step: int
            current step on the simulation
        logs_path: string
            path to where we will store the model
        label: string
            label that will be used to store the model
        """
        # Create exactly the directory the checkpoint is written into; plain
        # string concatenation only matched it when logs_path ended with a
        # path separator.
        target_dir = os.path.join(logs_path, label)
        os.makedirs(target_dir, exist_ok=True)

        full_label = label + str(step) + '.pth'
        logs_path = os.path.join(target_dir, full_label)

        self.policy_net.save(logs_path, step=step, optimizer=self.optimizer)

    def restore(self, logs_path):
        """
        Load the model from hard disc into both the policy and target nets

        Parameters
        ----------
        logs_path: string
            path to the stored model
        """
        self.policy_net.load(logs_path)
        self.target_net.load(logs_path)
if done: episode_score[-1] = score.get() score.reset() break mean_score = np.mean(episode_score) mean_reward = np.mean(episode_reward) n_episodes = len(episode_reward) return mean_score, mean_reward, n_episodes score = LunarLanderScore() reward = CustomReward() if use_custom_reward else None env = arlie.make("LunarLander", port=4000, seed=seed, render_mode=False, reward=reward) model = DQN.load("wave-trained-model") print("Evaluating...") mean_score, mean_reward, n_episodes = evaluate(env, model, score, num_episodes=eval_episodes) print("Mean score: {}, reward: {}, in {} episodes".format( mean_score, mean_reward, n_episodes)) env.close()
class Agent:
    """
    The intelligent agent of the simulation. Set the model of the neural
    network used and general parameters. It is responsible to select the
    actions, optimize the neural network and manage the models.

    This variant stores its experience in a ``ReplayMemory`` of 1000 entries.
    """

    def __init__(self, action_set, train=True, load_path=None):
        """
        Build the policy/target networks, the optimizer and the replay memory

        Parameters
        ----------
        action_set: sequence
            The actions available to the agent
        train: bool
            When False, the policy network is loaded from ``load_path`` and
            put in eval mode
        load_path: string
            Checkpoint passed to ``policy_net.load`` when ``train`` is False
        """
        # 1. Initialize agent params
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.action_set = action_set
        self.action_number = len(action_set)
        self.steps_done = 0
        self.epsilon = Config.EPS_START
        self.episode_durations = []

        # NOTE(review): debug output plus a two-second pause on every
        # construction - looks like leftover tracing; confirm before removal.
        print('LOAD PATH -- agent.init:', load_path)
        time.sleep(2)

        # 2. Build networks
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)

        # Evaluation builds the optimizer with lr=0 and hands it to the
        # checkpoint loader together with the network.
        learning_rate = Config.LEARNING_RATE if train else 0
        self.optimizer = optim.RMSprop(
            self.policy_net.parameters(), lr=learning_rate)

        if not train:
            print('entrou no not train')
            self.policy_net.load(load_path, optimizer=self.optimizer)
            self.policy_net.eval()

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.memory = ReplayMemory(1000)

    def select_action(self, state, train=True):
        """
        Select an action with an epsilon-greedy policy

        Epsilon is recomputed from ``self.steps_done`` on every call, and the
        step counter is incremented, regardless of ``train``.

        Parameters
        ----------
        state: tensor
            The current state on the simulation
        train: bool
            Define if we are evaluating or training the model; when False
            the network is always used

        Returns
        -------
        action: tensor of shape (1, 1)
            The action with the highest Q-value, or a random action
        q: result of ``a.max(0)`` or None
            torch's (values, indices) pair of the network output reduced
            over dim 0; None when a random action was drawn
        """
        sample = random.random()

        # 1. Perform a epsilon-greedy algorithm
        # a. set the value for epsilon
        self.epsilon = Config.EPS_END + (Config.EPS_START - Config.EPS_END) * \
            math.exp(-1. * self.steps_done / Config.EPS_DECAY)
        self.steps_done += 1

        # b. make the decision for selecting a random action or selecting an
        #    action from the neural network
        explore = train and sample <= self.epsilon
        if explore:
            # select a random action
            # NOTE(review): the range is hard-coded to 2 actions rather than
            # self.action_number - confirm this matches the network output.
            print('random action')
            return torch.tensor([[random.randrange(2)]],
                                device=self.device, dtype=torch.long), None

        # select an action from the neural network
        with torch.no_grad():
            # a <- argmax Q(s, theta)
            a = self.policy_net(state)
            return a.max(1)[1].view(1, 1), a.max(0)

    def optimize_model(self):
        """
        Perform one step of optimization on the neural network

        Does nothing while the memory holds fewer than ``Config.BATCH_SIZE``
        transitions.
        """
        if len(self.memory) < Config.BATCH_SIZE:
            return

        transitions = self.memory.sample(Config.BATCH_SIZE)

        # Transpose the batch (see http://stackoverflow.com/a/19343/3343043
        # for detailed explanation).
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch
        # elements. A bool mask is used; uint8 masks are deprecated for
        # indexing.
        non_final_mask = torch.tensor(
            [s is not None for s in batch.next_state],
            device=self.device, dtype=torch.bool)
        non_final_list = [s for s in batch.next_state if s is not None]

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = torch.zeros(Config.BATCH_SIZE, device=self.device)
        # torch.cat rejects an empty list, which happens when every sampled
        # transition is terminal; their next-state value simply stays 0.
        if non_final_list:
            non_final_next_states = torch.cat(non_final_list)
            with torch.no_grad():
                # Compute argmax Q(s', a; θ)
                next_state_actions = self.policy_net(
                    non_final_next_states).max(1)[1].unsqueeze(1)
                # Compute Q(s', argmax Q(s', a; θ), θ-)
                next_state_values[non_final_mask] = self.target_net(
                    non_final_next_states
                ).gather(1, next_state_actions).squeeze(1)

        # Compute the expected Q values
        expected_state_action_values = (
            next_state_values * Config.GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            # Parameters that took no part in the forward pass have no grad.
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def save(self, step, logs_path, label):
        """
        Save the model on hard disc

        Parameters
        ----------
        step: int
            current step on the simulation
        logs_path: string
            path to where we will store the model
        label: string
            label that will be used to store the model
        """
        # Create exactly the directory the checkpoint is written into; plain
        # string concatenation only matched it when logs_path ended with a
        # path separator.
        target_dir = os.path.join(logs_path, label)
        os.makedirs(target_dir, exist_ok=True)

        full_label = label + str(step) + '.pth'
        logs_path = os.path.join(target_dir, full_label)

        self.policy_net.save(logs_path, step=step, optimizer=self.optimizer)

    def restore(self, logs_path):
        """
        Load the model from hard disc into both the policy and target nets

        Parameters
        ----------
        logs_path: string
            path to the stored model
        """
        self.policy_net.load(logs_path)
        self.target_net.load(logs_path)
class Agent:
    """
    Class representing a learning agent acting in an environment.
    """

    def __init__(self, buffer_size, batch_size, alpha, gamma, epsilon,
                 epsilon_min, epsilon_decay, lr, game="CartPole-v1",
                 mean_bound=5, reward_bound=495.0, sync_model=1000,
                 save_model=10):
        """
        Constructor of the agent class.
        - game="CartPole-v1" : Name of the game environment
        - mean_bound=5 : Number of last acquired rewards considered for mean reward
        - reward_bound=495.0 : Mean reward above which training stops early
        - sync_model=1000 : Interval (in steps) for synchronizing model and target model
        - save_model=10 : Interval (in episodes) for saving model

        - buffer_size : Replay buffer size of the DQN model
        - batch_size : Batch size of the DQN model
        - alpha : Learning rate for Q-Learning
        - gamma : Discount factor for Q-Learning
        - epsilon : Threshold for taking a random action
        - epsilon_min : Minimal value allowed for epsilon
        - epsilon_decay : Decay rate for epsilon
        - lr : Learning rate for the DQN model
        """
        # Environment variables
        self.game = game
        self.env = gym.make(self.game)
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.n

        # Agent variables
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.buffer = ReplayBuffer(self.buffer_size, self.batch_size)
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.mean_bound = mean_bound
        self.reward_bound = reward_bound

        # DQN variables
        self.lr = lr
        self.model = DQN(self.num_states, self.num_actions, self.lr)
        self.target_model = DQN(self.num_states, self.num_actions, self.lr)
        self.target_model.update(self.model)
        self.sync_model = sync_model
        self.save_model = save_model

        # File paths
        dirname = os.path.dirname(__file__)
        self.path_model = os.path.join(dirname, "../models/dqn.h5")
        self.path_plot = os.path.join(dirname, "../plots/dqn.png")

        # Load model, if it already exists. Catch Exception rather than
        # everything so that Ctrl-C / SystemExit are not swallowed here.
        try:
            self.model.load(self.path_model)
            self.target_model.update(self.model)
        except Exception:
            print("Model does not exist! Create new model...")

    def reduce_epsilon(self):
        """
        Reduces the parameter epsilon up to a given minimal value where the
        speed of decay is controlled by some given parameter.
        """
        self.epsilon = max(self.epsilon * self.epsilon_decay,
                           self.epsilon_min)

    def get_action(self, state):
        """
        Returns an action for a given state, based on the current policy.
        - state : Current state of the agent
        """
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state))

    def train(self, num_episodes, report_interval):
        """
        Trains the DQN model for a given number of episodes. Outputting
        report information is controlled by a given time interval. Training
        stops early once the mean of the last ``mean_bound`` episode rewards
        exceeds ``reward_bound``.
        - num_episodes : Number of episodes to train
        - report_interval : Interval for outputting report information of training
        """
        step = 0
        total_rewards = []

        for episode in range(1, num_episodes + 1):
            if episode % self.save_model == 0:
                self.model.save(self.path_model)

            state = self.env.reset()
            state = state.reshape((1, self.num_states))
            total_reward = 0.0

            while True:
                step += 1
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = next_state.reshape((1, self.num_states))

                # Penalize agent if pole could not be balanced until end of episode
                if done and reward < 499.0:
                    reward = -100.0

                self.buffer.remember(state, action, reward, next_state, done)
                self.replay()
                self.reduce_epsilon()

                state = next_state
                total_reward += reward

                if step % self.sync_model == 0:
                    self.target_model.update(self.model)

                if not done:
                    continue

                # Undo the terminal penalty in the reported episode total.
                total_reward += 100.0
                total_rewards.append(total_reward)
                mean_reward = np.mean(total_rewards[-self.mean_bound:])

                if episode % report_interval == 0:
                    # The replay buffer lives in self.buffer (there is no
                    # self.memory attribute).
                    # NOTE(review): assumes ReplayBuffer implements __len__.
                    print(f"Episode: {episode}/{num_episodes}"
                          f"\tStep: {step}"
                          f"\tMemory Size: {len(self.buffer)}"
                          f"\tEpsilon: {self.epsilon : .3f}"
                          f"\tReward: {total_reward}"
                          f"\tLast {self.mean_bound} Mean: {mean_reward : .2f}")
                    self.plot_rewards(total_rewards)

                if mean_reward > self.reward_bound:
                    self.model.save(self.path_model)
                    return
                break

        self.model.save(self.path_model)

    def replay(self):
        """
        Samples training data from the replay buffer and fits the DQN model.

        For every sampled transition the Q-value of the taken action is moved
        towards the target (the reward alone for terminal transitions, the
        reward plus the discounted best target-network value otherwise) with
        step size ``alpha``.
        """
        # Sample from self.buffer - the attribute created in __init__.
        (sample_size, states, actions, rewards,
         next_states, dones) = self.buffer.sample()

        q_values = self.model.predict(states)
        next_q_values = self.target_model.predict(next_states)

        for i in range(sample_size):
            action = actions[i]
            if dones[i]:
                q_target = rewards[i]
            else:
                q_target = rewards[i] + self.gamma * np.max(next_q_values[i])
            q_values[i][action] = ((1 - self.alpha) * q_values[i][action]
                                   + self.alpha * q_target)

        self.model.fit(states, q_values)

    def play(self, num_episodes):
        """
        Renders the trained agent for a given number of episodes. Epsilon is
        set to its minimal value first.
        - num_episodes : Number of episodes to render
        """
        self.epsilon = self.epsilon_min

        for episode in range(1, num_episodes + 1):
            state = self.env.reset()
            state = state.reshape((1, self.num_states))
            total_reward = 0.0
            done = False

            while not done:
                self.env.render()
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                state = next_state.reshape((1, self.num_states))
                total_reward += reward

            print(f"Episode: {episode}/{num_episodes}"
                  f"\tTotal Reward: {total_reward : .2f}")

    def plot_rewards(self, total_rewards):
        """
        Plots the rewards the agent has acquired during training and saves
        the figure to ``self.path_plot``. A linear trend line is added once
        at least two episodes are available.
        - total_rewards : Rewards the agent has gained per episode
        """
        x = np.arange(len(total_rewards))
        y = total_rewards

        plt.plot(x, y, linewidth=0.8)
        # A regression needs at least two distinct x values.
        if len(total_rewards) >= 2:
            slope, intercept, _, _, _ = linregress(x, y)
            plt.plot(x, slope * x + intercept, color="red", linestyle="-.")
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        plt.title("DQN-Learning")
        plt.savefig(self.path_plot)
        # Start from a clean figure next time instead of stacking the curves
        # of every report on top of each other.
        plt.close()
"""Render a trained DQN agent playing LunarLander for a few episodes."""
import numpy as np

from model import DQN
from rewards import CustomReward

# Selects the environment backend and the matching trained-model file.
wave = True
render_episodes = 7

if wave:
    import arlie
    env = arlie.make("LunarLander", reward=CustomReward())
else:
    import gym
    env = gym.make("LunarLander-v2")

model = DQN.load("{}-trained-model".format("wave" if wave else "gym"))

episode = render_episodes
reward_sum = 0

try:
    # Observations are reshaped to a single-row batch before prediction.
    obs = np.reshape(env.reset(), (1, model.obs_size))
    while episode > 0:
        action, _states = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        obs = np.reshape(obs, (1, model.obs_size))
        reward_sum += reward
        env.render()
        if done:
            print("Points: {}".format(reward_sum))
            episode -= 1
            reward_sum = 0
            obs = np.reshape(env.reset(), (1, model.obs_size))
finally:
    # Release the environment once rendering is finished or interrupted.
    env.close()
if __name__ == '__main__': # Create carpole environment and network env = gym.make('CartPole-v0').unwrapped net = DQN(n_state=env.observation_space.shape[0], n_action=env.action_space.n, memory_size=memory_size, lr=lr, epsilon=epsilon, epsilon_decay=epsilon_decay, update_iter=update_iter, batch_size=batch_size, gamma=gamma, model_path=model_path) net.cuda() net.load() reward_list = [] for i in range(episode): s = env.reset() total_reward = 0 while True: # env.render() # Select action and obtain the reward a = net.chooseAction(s) s_, r, finish, info = env.step(a) # Record the total reward total_reward += r # Revised the reward if finish:
parser = argparse.ArgumentParser() parser.add_argument('path', type=str, help='path of input test weight') parser.add_argument('--rounds', type=int, default=3, help='play x rounds') parser.add_argument('--render', action='store_true') parser.add_argument('--test_epsilon', default=0, type=float) args = parser.parse_args() device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') env = gym.make('BreakoutNoFrameskip-v4') # frame stack and preprocessing env = AtariPreprocessing(env, noop_max=30, frame_skip=4) env = FrameStack(env, 4) model = DQN(env.observation_space.shape, env.action_space.n).to(device).eval() model.load(args.path, test=True) # play three rounds for i in range(args.rounds): done = False total_reward = 0 state = env.reset() while not done: if args.render: env.render() action = model.select_action(state, args.test_epsilon, action_space) next_state, reward, done, _ = env.step(action) state = next_state total_reward += reward if done: