def __push_state(self, state):
    # Push the current state to the history.
    # Oldest state in history is discarded.
    sg = state.astype(np.float32)
    sg = np.expand_dims(sg, 0)
    sg = utils.preprocess_state(sg)
    # Shift existing frames back one channel, then write the new frame at index 0.
    self.state_hist[0, :, :, 1:] = self.state_hist[0, :, :, :-1]
    self.state_hist[0, :, :, 0] = sg[0]
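# A minimal, self-contained sketch of the rolling-history trick used above
# (illustrative only; it assumes a (1, H, W, hist_len) buffer like state_hist).
# NumPy handles the overlapping slice assignment safely, so the newest frame
# lands in channel 0 and the oldest falls off the end.
import numpy as np

hist = np.zeros((1, 2, 2, 3), dtype=np.float32)  # hist_len = 3
for t in range(1, 4):
    frame = np.full((2, 2), t, dtype=np.float32)
    hist[0, :, :, 1:] = hist[0, :, :, :-1]       # shift old frames back
    hist[0, :, :, 0] = frame                      # newest frame at index 0
print(hist[0, 0, 0, :])  # -> [3. 2. 1.]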
def preprocess_data(X, y, hist_len, shuffle):
    """Preprocess states and actions from the expert dataset before feeding them to the agent."""
    print('Preprocessing states. Shape:', X.shape)
    utils.check_invalid_actions(y)
    y_pp = utils.transl_action_env2agent(y)
    X_pp = utils.preprocess_state(X)
    X_pp, y_pp = utils.stack_history(X_pp, y_pp, hist_len, shuffle=shuffle)
    return X_pp, y_pp
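# Hypothetical sketch of what a history-stacking helper like
# utils.stack_history might compute (an assumption, not the project's real
# implementation): each training sample becomes the current frame plus the
# hist_len - 1 frames before it, and labels before a full history are dropped.
import numpy as np

def stack_history_sketch(X, y, hist_len):
    stacked = np.stack(
        [X[i - hist_len + 1:i + 1] for i in range(hist_len - 1, len(X))])
    return stacked, y[hist_len - 1:]

X_demo = np.arange(6, dtype=np.float32).reshape(6, 1, 1)  # 6 dummy frames
Xh, yh = stack_history_sketch(X_demo, np.arange(6), hist_len=3)
print(Xh.shape, yh)  # (4, 3, 1, 1) [2 3 4 5]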
def agent(obs_dict, config_dict):
    global prev_direction
    env = make('hungry_geese')
    # agent = QAgent(rows=11, columns=11, num_actions=3)
    agent = PPOAgent(rows=11, columns=11, num_actions=3)
    model_name = ''
    agent.load_model_weights('models/' + model_name + '.h5')
    state = preprocess_state(obs_dict, prev_direction)
    action = agent.select_action(state)
    direction = get_direction(prev_direction, action)
    prev_direction = direction
    return env.specification.action.enum[direction]
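# Hedged usage sketch (not part of the submission): how an agent file like the
# one above can be exercised locally with kaggle-environments. The file path
# 'agents/my_agent.py' is a placeholder.
from kaggle_environments import make

env = make('hungry_geese', debug=True)
steps = env.run(['agents/my_agent.py', 'greedy', 'greedy', 'greedy'])
print(steps[-1][0]['reward'])  # final reward of the first agent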
def select_action(self, state):
    # Epsilon-greedy: spread eps uniformly over all actions, then add the
    # remaining 1 - eps probability mass to the greedy (max-Q) action.
    action_prob = np.zeros(self.n_action, np.float32)
    action_prob.fill(self.eps / self.n_action)
    max_q, max_q_index = self.qNetwork(
        Variable(state.to(self.args.device))).data.cpu().max(1)
    action_prob[max_q_index[0]] += 1 - self.eps
    action = np.random.choice(self.arr_actions, p=action_prob)
    next_state, reward, done, _ = self.env.step(action)
    # Drop the oldest of the 4 stacked frames and append the new one.
    next_state = torch.cat(
        [state.narrow(1, 1, 3), preprocess_state(next_state, self.env)], 1)
    self.memory.push(
        (state, torch.LongTensor([int(action)]), torch.Tensor([reward]),
         next_state, torch.Tensor([done])))
    return next_state, reward, done, max_q[0]
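# Sketch of the epsilon-greedy distribution built in select_action (eps and
# the greedy index are assumed values, not tied to the agent above). Each
# action gets eps / n_action probability mass and the greedy action receives
# the remaining 1 - eps, so the vector sums to exactly 1.
import numpy as np

eps, n_action = 0.1, 4
action_prob = np.full(n_action, eps / n_action, dtype=np.float32)
greedy_index = 2  # pretend Q-values peak at action 2
action_prob[greedy_index] += 1.0 - eps
print(action_prob, action_prob.sum())  # [0.025 0.025 0.925 0.025] 1.0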
def ppo_train(model_name, load_model=False, actor_filename=None,
              critic_filename=None, optimizer_filename=None):
    print("PPO -- Training")
    env = make('hungry_geese')
    trainer = env.train(['greedy', None, 'agents/boilergoose.py',
                         'agents/handy_rl.py'])
    agent = PPOAgent(rows=11, columns=11, num_actions=3)
    memory = Memory()
    if load_model:
        agent.load_model_weights(actor_filename, critic_filename)
        agent.load_optimizer_weights(optimizer_filename)
    episode = 0
    start_episode = 0
    end_episode = 50000
    reward_threshold = None
    threshold_reached = False
    epochs = 4
    batch_size = 128
    current_frame = 0
    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []
    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0
        while not done:
            current_frame += 1
            ep_steps += 1
            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_action(state, training=True)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(
                env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            memory.add(state, action, reward, next_state, float(done))
            obs_dict = next_obs_dict
            prev_direction = direction
            ep_reward += reward
            # Every batch_size frames, run several PPO epochs over the
            # collected samples, then clear the memory and sync networks.
            if current_frame % batch_size == 0:
                for _ in range(epochs):
                    states, actions, rewards, next_states, dones = \
                        memory.get_all_samples()
                    agent.fit(states, actions, rewards, next_states, dones)
                memory.clear()
                agent.update_networks()
        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) +
              " - STEPS: " + str(ep_steps))
        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)
        if reward_threshold:
            if len(last_1000_ep_reward) == 1000:
                if np.mean(last_1000_ep_reward) >= reward_threshold:
                    print("You solved the task after " + str(episode) +
                          " episodes")
                    agent.save_model_weights(
                        'models/ppo_actor_' + model_name + '_' +
                        str(episode) + '.h5',
                        'models/ppo_critic_' + model_name + '_' +
                        str(episode) + '.h5')
                    threshold_reached = True
                    break
        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))
            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' +
                  str(last_1000_ep_reward_mean))
            print()
        if episode % 1000 == 0:
            # Greedy evaluation over 100 episodes.
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_action(state)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(
                        env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()
        if episode % 5000 == 0:
            agent.save_model_weights(
                'models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
            agent.save_optimizer_weights(
                'models/ppo_' + model_name + '_' + str(episode) +
                '_optimizer.npy')
    agent.save_model_weights(
        'models/ppo_actor_' + model_name + '_' + str(end_episode) + '.h5',
        'models/ppo_critic_' + model_name + '_' + str(end_episode) + '.h5')
    agent.save_optimizer_weights(
        'models/ppo_' + model_name + '_' + str(end_episode) +
        '_optimizer.npy')
    if threshold_reached:
        plt.plot([i for i in range(start_episode + 1000, episode, 1000)],
                 training_rewards)
    else:
        plt.plot([i for i in range(start_episode + 1000, end_episode + 1,
                                   1000)], training_rewards)
    plt.title("Reward")
    plt.show()
    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
def reset(self):
    # Stack the initial frame 4 times along the channel dimension.
    return torch.cat([preprocess_state(self.env.reset(), self.env)] * 4, 1)
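# Minimal sketch of the 4-frame reset above: concatenating one preprocessed
# frame four times along the channel dimension gives the (1, 4, H, W) input
# the network expects on the very first step (shapes here are assumptions).
import torch

frame = torch.zeros(1, 1, 84, 84)    # one preprocessed frame
stacked = torch.cat([frame] * 4, 1)  # repeat it across 4 channels
print(stacked.shape)                 # torch.Size([1, 4, 84, 84])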
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-g', action='store', dest='game')
    parser.add_argument('-w', action='store_true', dest='warm_start',
                        default=False)
    args = parser.parse_args()
    game = args.game
    warm_start = args.warm_start

    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    max_episodes = 1000000
    max_frames = 10000
    gamma = 0.95
    lr = 1e-4  # LSTM Update: works well in 1st iteration
    target_score = 21.0  # Temperature Update: specific to Pong

    # Cold start
    if not warm_start:
        # Initialize model
        model = Policy(input_channels=num_frames, num_actions=num_actions)
        optimizer = optim.RMSprop(model.parameters(), lr=lr,
                                  weight_decay=0.1)  # LSTM Change: lr = 1e-4
        # Initialize statistics
        running_reward = None
        running_rewards = []
        prior_eps = 0

    # Warm start
    if warm_start:
        data_file = 'results/{}.p'.format(game)
        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
                running_reward = running_rewards[-1]
                prior_eps = len(running_rewards)
            model_file = 'saved_models/actor_critic_{}_ep_{}.p'.format(
                game, prior_eps)
            with open(model_file, 'rb') as f:
                # Model Save and Load Update: Include both model and optim
                # parameters
                saved_model = pickle.load(f)
                model, optimizer = saved_model
        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = Policy(input_channels=num_frames,
                           num_actions=num_actions)
            optimizer = optim.RMSprop(model.parameters(), lr=lr,
                                      weight_decay=0.1)
            running_reward = None
            running_rewards = []
            prior_eps = 0

    cuda = torch.cuda.is_available()
    if cuda:
        model = model.cuda()

    for ep in range(max_episodes):
        # Temperature Update: specific to Pong.
        # Anneal temperature from 2.0 down to 0.8 based on how far the
        # running reward is from the target score.
        if running_reward is None:
            model.temperature = 2.0  # Start with temp = 2.0 (explore)
        else:
            # Specific to Pong - the running reward starts at -21, so we
            # encourage the agent to explore:
            #   at -21:  temp = 0.8 + 1.2 * (21 - (-21)) / 42 = 2.0
            #   near 0:  temp = 0.8 + 1.2 * (21 - 0) / 42    = 1.4
            #   at 14:   temp = 0.8 + 1.2 * (21 - 14) / 42   = 1.0
            model.temperature = max(
                0.8, 0.8 + (target_score - running_reward) / 42 * 1.2)

        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state] * num_frames)

        # LSTM change - reset LSTM hidden units when episode begins
        cx = Variable(torch.zeros(1, 256))
        hx = Variable(torch.zeros(1, 256))
        if cuda:
            cx = cx.cuda()
            hx = hx.cuda()

        reward_sum = 0.0
        for frame in range(max_frames):
            # Select action
            # LSTM Change: Need to cycle hx and cx thru select_action
            action, log_prob, state_value, (hx, cx) = select_action(
                model, state, (hx, cx), cuda)
            model.saved_actions.append((log_prob, state_value))

            # Perform step
            next_state, reward, done, info = env.step(action)

            # Add reward to reward buffer
            model.rewards.append(reward)
            reward_sum += reward

            # Compute latest state
            next_state = preprocess_state(next_state)

            # Evict oldest diff, add new diff to state
            next_state = np.stack([next_state] * num_frames)
            next_state[1:, :, :] = state[:-1, :, :]
            state = next_state
            if done:
                break

        # Compute/display statistics
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = running_reward * 0.99 + reward_sum * 0.01
        running_rewards.append(running_reward)
        verbose_str = 'Episode {} complete'.format(ep + prior_eps + 1)
        verbose_str += '\tReward total:{}'.format(reward_sum)
        verbose_str += '\tRunning mean: {:.4}'.format(running_reward)
        # Temperature Update: Track temp
        if (ep + prior_eps + 1) % 5 == 0:
            verbose_str += '\tTemp = {:.4}'.format(model.temperature)
        sys.stdout.write('\r' + verbose_str)
        sys.stdout.flush()

        # Update model
        finish_episode(model, optimizer, gamma, cuda)

        if (ep + prior_eps + 1) % 500 == 0:
            model_file = 'saved_models/actor_critic_{}_ep_{}.p'.format(
                game, ep + prior_eps + 1)
            data_file = 'results/{}.p'.format(game)
            with open(model_file, 'wb') as f:
                # Model Save and Load Update: Include both model and optim
                # parameters
                pickle.dump((model.cpu(), optimizer), f)
            if cuda:
                model = model.cuda()
            with open(data_file, 'wb') as f:
                pickle.dump(running_rewards, f)
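# Quick check of the temperature schedule above for a few running rewards
# (Pong-specific constants copied from the code: target 21, range 42).
target_score = 21.0
for running_reward in (-21.0, 0.0, 14.0, 21.0):
    temp = max(0.8, 0.8 + (target_score - running_reward) / 42 * 1.2)
    print(running_reward, round(temp, 2))  # -21 -> 2.0, 0 -> 1.4, 14 -> 1.0, 21 -> 0.8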
env = Env()
model = Model()
tt = Time()
steps = 0
# arg.MAX_EPISODE
# i_episode = 0
for i_episode in range(MAX_EPISODE):
    try:
        if break_flag:
            break
    except NameError:
        break_flag = 0  # first iteration: flag not defined yet
    s, position = env.reset(return_s_pos=1)
    s = preprocess_state(s, position, env)
    # actions = [0, 0, 0, 1, 1, 1, -1]
    t1 = Time()
    ep_r = 0
    for i in range(MAX_STEP):
        if tt.stop_alt('s'):
            print('----- break! -----')
            break_flag = 1
            break
        steps += 1
        # a = actions[i]  # len(actions)
        a = model.choose_action(s)
        s_, r, done, info = env.step(a)
def main():
    model_name = 'dqn'

    # Parse arguments
    game, warm_start, render = parse_arguments()

    # Initialize environment/model
    data = initialize(game, model_name, warm_start)
    (env, model, optimizer, criterion, memory_buffer, cuda,
     running_reward, running_rewards) = data

    # Initialize constants
    max_episodes = 500000
    batch_size = 10
    gamma = 0.95
    num_frames = 4

    for ep in range(max_episodes):
        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state] * num_frames)
        reward_sum = 0.0
        while True:
            # Render frame if render argument was passed
            if render:
                env.render()

            # Select action
            action = select_epilson_greedy_action(model, state, ep, cuda)

            # Perform step
            next_state, reward, done, info = env.step(action)
            next_state = preprocess_state(next_state)
            next_state = np.stack([next_state] * num_frames)
            next_state[1:, :, :] = state[:-1, :, :]
            reward_sum += reward

            # Add transition to replay memory
            transition = Transition(state, action, next_state, reward, done)
            memory_buffer.push(transition)

            # Update state
            state = next_state

            # Sample mini-batch from replay memory_buffer
            batch = memory_buffer.sample(batch_size, replace=True)

            # Compute targets: r + gamma * max_a Q(s', a) for non-terminal
            # transitions, r otherwise.
            targets = np.zeros((batch_size, ), dtype=float)
            for i, transition in enumerate(batch):
                targets[i] = transition.reward
                if not transition.done:
                    next_state = transition.next_state
                    num_frames, height, width = next_state.shape
                    next_state = next_state.reshape(-1, num_frames, height,
                                                    width)
                    next_state = torch.FloatTensor(next_state)
                    if cuda:
                        next_state = next_state.cuda()
                    next_state = Variable(next_state)
                    targets[i] += gamma * model(next_state).data.max(1)[0]
            targets = torch.FloatTensor(targets)
            if cuda:
                targets = targets.cuda()
            targets = Variable(targets)

            # Compute predictions
            model.zero_grad()
            states = [transition.state for transition in batch]
            states = torch.FloatTensor(states)
            if cuda:
                states = states.cuda()
            states = Variable(states)
            actions = [int(transition.action) for transition in batch]
            actions = torch.LongTensor(actions)
            if cuda:
                actions = actions.cuda()
            actions = Variable(actions)
            outputs = model(states).gather(1, actions.unsqueeze(1))

            # Perform gradient descent step
            loss = criterion(outputs.view(batch_size), targets)
            loss.backward()
            # Clip gradient at 20,000
            # torch.nn.utils.clip_grad_norm(model.parameters(), 20000)
            optimizer.step()

            if done:
                break

        # Compute/display statistics
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = running_reward * 0.99 + reward_sum * 0.01
        running_rewards.append(running_reward)
        verbose_str = 'Episode {} complete'.format(ep + 1)
        verbose_str += '\tReward total:{}'.format(reward_sum)
        verbose_str += '\tRunning mean: {:.4}'.format(running_reward)
        sys.stdout.write('\r' + verbose_str)
        sys.stdout.flush()

        # Save model every 1000 episodes
        if (ep + 1) % 1000 == 0:
            model_file = 'saved_models/{}_{}_ep_{}.p'.format(
                game, model_name, ep + 1)
            with open(model_file, 'wb') as f:
                pickle.dump((model.cpu(), optimizer, memory_buffer), f)
            if cuda:
                model = model.cuda()
            data_file = 'results/{}_{}.p'.format(game, model_name)
            with open(data_file, 'wb') as f:
                pickle.dump(running_rewards, f)
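# A vectorized sketch of the per-transition target loop above: the Q-learning
# target is r + gamma * max_a Q(s', a), with the bootstrap term zeroed on
# terminal transitions. Tensor shapes are assumptions; q_next stands in for
# model(next_states).
import torch

gamma = 0.95
rewards = torch.tensor([1.0, 0.0, -1.0])
dones = torch.tensor([0.0, 0.0, 1.0])  # 1.0 marks a terminal step
q_next = torch.tensor([[0.2, 0.5], [0.1, 0.3], [0.9, 0.4]])
targets = rewards + gamma * q_next.max(1)[0] * (1.0 - dones)
print(targets)  # tensor([ 1.4750,  0.2850, -1.0000])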
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-g', action='store', dest='game')
    parser.add_argument('-w', action='store_true', dest='warm_start',
                        default=False)
    args = parser.parse_args()
    game = args.game
    warm_start = args.warm_start

    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    max_episodes = 1000000
    max_frames = 6000  # limit episode to 6000 game steps
    gamma = 0.95
    lr = 1e-4  # LSTM Update: works well in 1st iteration
    target_score = 21.0  # Temperature Update: specific to Pong

    # Truncated Backprop (TBP) Update:
    # Slides 41-44, CS231N 2017 Lecture 10.
    # Run forward and backward through chunks of the sequence instead of the
    # whole sequence, while the hidden values hx and cx are carried forward
    # in time indefinitely.
    chunk_size = 768

    # Cold start
    if not warm_start:
        # Initialize model
        model = Policy(input_channels=num_frames, num_actions=num_actions)
        optimizer = optim.RMSprop(model.parameters(), lr=lr,
                                  weight_decay=0.1)  # LSTM Change: lr = 1e-4
        # Initialize statistics
        running_reward = None
        running_rewards = []
        prior_eps = 0

    # Warm start
    if warm_start:
        data_file = 'results/acl-batch_{}_cs_{}.p'.format(game, chunk_size)
        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
                running_reward = running_rewards[-1]
                prior_eps = len(running_rewards)
            model_file = 'saved_models/acl-batch_{}_cs_{}_ep_{}.p'.format(
                game, chunk_size, prior_eps)
            with open(model_file, 'rb') as f:
                # Model Save and Load Update: Include both model and optim
                # parameters
                saved_model = pickle.load(f)
                model, optimizer = saved_model
        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = Policy(input_channels=num_frames,
                           num_actions=num_actions)
            optimizer = optim.RMSprop(model.parameters(), lr=lr,
                                      weight_decay=0.1)
            running_reward = None
            running_rewards = []
            prior_eps = 0

    cuda = torch.cuda.is_available()
    if cuda:
        model = model.cuda()

    for ep in range(max_episodes):
        # TBP Update: for every episode, anneal temperature from 1.8 down to
        # 1.0 over 100000 episodes (floored at 0.8).
        model.temperature = max(0.8, 1.8 - 0.8 * ((ep + prior_eps) / 1.0e5))

        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state] * num_frames)
        done = False  # TBP Update: init done

        # LSTM change - reset LSTM hidden units when episode begins
        cx = Variable(torch.zeros(1, 256))
        hx = Variable(torch.zeros(1, 256))
        if cuda:
            cx = cx.cuda()
            hx = hx.cuda()

        reward_sum = 0.0
        grad_norm = 0.0  # Track grad norm for the episode
        while not done:  # TBP Update: while episode is not done
            # TBP Update: forward a fixed number of game steps thru CNN-LSTM
            for frame in range(chunk_size):
                # env.render()  # For initial debugging
                # Select action
                # LSTM Change: Need to cycle hx and cx thru select_action
                action, log_prob, state_value, (hx, cx) = select_action(
                    model, state, (hx, cx), cuda)
                model.saved_actions.append((log_prob, state_value))

                # Perform step
                next_state, reward, done, info = env.step(action)

                # Add reward to reward buffer
                model.rewards.append(reward)
                reward_sum += reward

                # Compute latest state
                next_state = preprocess_state(next_state)

                # Evict oldest diff, add new diff to state
                next_state = np.stack([next_state] * num_frames)
                next_state[1:, :, :] = state[:-1, :, :]
                state = next_state
                if done:
                    break

            # Update model
            # TBP Update: backprop the fixed number of game steps back thru
            # the CNN-LSTM, and perform an update on the parameters of the
            # Actor-Critic.
            if frame > chunk_size / 4:
                grad_norm = finish_chunk(model, optimizer, gamma, cuda)
                # print(grad_norm, frame)  # for debugging nan problem

            # TBP Update: hidden values are carried forward
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        # TBP Update: at this point the episode is done; do some bookkeeping.
        # Compute/display statistics
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = running_reward * 0.99 + reward_sum * 0.01
        running_rewards.append(running_reward)
        verbose_str = 'Episode {} complete'.format(ep + prior_eps + 1)
        verbose_str += '\tReward total:{}'.format(reward_sum)
        verbose_str += '\tRunning mean: {:.4}'.format(running_reward)
        # Temperature Update: Track temp
        if (ep + prior_eps + 1) % 5 == 0:
            verbose_str += '\tTemp = {:.4}'.format(model.temperature)
        verbose_str += '\tGrad norm:{}'.format(grad_norm)
        sys.stdout.write('\r' + verbose_str)
        sys.stdout.flush()

        # Periodically save model and optimizer parameters, and statistics
        if (ep + prior_eps + 1) % 100 == 0:
            model_file = 'saved_models/acl-batch_{}_cs_{}_ep_{}.p'.format(
                game, chunk_size, ep + prior_eps + 1)
            data_file = 'results/acl-batch_{}_cs_{}.p'.format(game,
                                                              chunk_size)
            with open(model_file, 'wb') as f:
                # Model Save and Load Update: Include both model and optim
                # parameters
                pickle.dump((model, optimizer), f)
            if cuda:
                model = model.cuda()
            with open(data_file, 'wb') as f:
                pickle.dump(running_rewards, f)
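# Sketch of the truncated-backprop detach above: rewrapping hx/cx with
# Variable(h.data) (equivalently h.detach() in current PyTorch) carries the
# hidden values forward while cutting the graph at the chunk boundary, so
# gradients never flow back past chunk_size steps. Sizes are assumptions.
import torch
import torch.nn as nn

cell = nn.LSTMCell(8, 16)
hx, cx = torch.zeros(1, 16), torch.zeros(1, 16)
for step in range(4):                  # one "chunk" of 4 steps
    hx, cx = cell(torch.randn(1, 8), (hx, cx))
hx, cx = hx.detach(), cx.detach()      # keep values, drop gradient history
print(hx.requires_grad)                # False: the next chunk starts fresh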
def ddqn_train(model_name, load_model=False, model_filename=None,
               optimizer_filename=None):
    print("DDQN -- Training")
    env = make('hungry_geese')
    trainer = env.train(
        ['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])
    agent = DDQNAgent(rows=11, columns=11, num_actions=3)
    buffer = ReplayBuffer()
    strategy = EpsilonGreedyStrategy(start=0.5, end=0.0, decay=0.00001)
    if load_model:
        agent.load_model_weights(model_filename)
        agent.load_optimizer_weights(optimizer_filename)
    start_episode = 0
    end_episode = 50000
    epochs = 32
    batch_size = 128
    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []
    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        epsilon = strategy.get_epsilon(episode - start_episode)
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0
        while not done:
            ep_steps += 1
            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_epsilon_greedy_action(state, epsilon)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(
                env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            buffer.add(state, action, reward, next_state, done)
            obs_dict = next_obs_dict
            prev_direction = direction
            ep_reward += reward
            if len(buffer) >= batch_size:
                for _ in range(epochs):
                    states, actions, rewards, next_states, dones = \
                        buffer.get_samples(batch_size)
                    agent.fit(states, actions, rewards, next_states, dones)
        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) +
              " - STEPS: " + str(ep_steps))
        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)
        # Sync the target network with the online network every 10 episodes.
        if episode % 10 == 0:
            agent.update_target_network()
        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))
            print('Epsilon: ' + str(round(epsilon, 3)))
            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' +
                  str(last_1000_ep_reward_mean))
            print()
        if episode % 1000 == 0:
            # Greedy evaluation (epsilon = 0) over 100 episodes.
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                epsilon = 0
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_epsilon_greedy_action(state,
                                                                epsilon)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(
                        env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()
        if episode % 5000 == 0:
            agent.save_model_weights(
                'models/ddqn_' + model_name + '_' + str(episode) + '.h5')
            agent.save_optimizer_weights(
                'models/ddqn_' + model_name + '_' + str(episode) +
                '_optimizer.npy')
    agent.save_model_weights(
        'models/ddqn_' + model_name + '_' + str(end_episode) + '.h5')
    agent.save_optimizer_weights(
        'models/ddqn_' + model_name + '_' + str(end_episode) +
        '_optimizer.npy')
    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             training_rewards)
    plt.title('Reward')
    plt.show()
    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
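# Hedged sketch of what an EpsilonGreedyStrategy(start, end, decay) schedule
# like the one above might compute; exponential decay is an assumption here,
# not the project's confirmed implementation.
import math

def get_epsilon_sketch(step, start=0.5, end=0.0, decay=0.00001):
    # Interpolate from start toward end as the step count grows.
    return end + (start - end) * math.exp(-decay * step)

for step in (0, 10000, 50000):
    print(step, round(get_epsilon_sketch(step), 3))  # 0.5, 0.452, 0.303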
if r_p != 0:
    d_p = r_d / r_p
else:
    d_p = -1
# ------------- r_d and r_p
print('r: {:5.3f}, r_d: {:5.3f}, r_p: {:5.3f}, d/p: {:-8.3f}'.format(
    r, r_d, r_p, d_p))
# ------------------- pre_process_image
if not done:
    # plot cv_img
    position_ = info[0]
    # print('NOT ------------------------ done', position_)
    # if (arg.preprocess_state):
    s_ = preprocess_state(s_, position_, env, resize=arg.resize)
    # s_ = preprocess_state(s_)
    # add points
    # if (arg.resize):
    #     for img in s_:
    #         add_rects(img=img, point=position_, env=env)
    #         img = cv2.resize(img, arg.wind_conv_wh,
    #                          interpolation=cv2.INTER_AREA)
    #     # print(len(s_), env.num_frames)
    # else:
    #     img = s_[-1]
    #     add_rects(img=img, point=position_, env=env)
    # plot
    if arg.show_pre_image and cv_img(s_[-1]):
        break_flag = 1
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-g', action='store', dest='game')
    parser.add_argument('-f', action='store', dest='filename', default=None)
    parser.add_argument('-d', action='store', dest='foldername',
                        default=None)
    args = parser.parse_args()
    game = args.game
    model_file = args.filename
    foldername = args.foldername

    # Initialize environment
    render = True
    env = gym.make(game)
    env = gym.wrappers.Monitor(env, foldername,
                               video_callable=lambda episode_id: True,
                               force=True)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    max_episodes = 1  # Just render 1 episode
    max_frames = 10000

    # Initialize model
    try:
        with open(model_file, 'rb') as f:
            # Model Save and Load Update: Include both model and optim
            # parameters
            # saved_model = torch.load(model_file,
            #                          map_location=lambda storage, loc: storage)
            saved_model = pickle.load(f)
            if hasattr(saved_model, '__iter__'):
                model, _ = saved_model
            else:
                model = saved_model
    except OSError:
        print('Model file not found.')
        return

    model.temperature = 1.0  # When we play, we sample as usual.

    for ep in range(max_episodes):
        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state] * num_frames)

        # LSTM change - reset LSTM hidden units when episode begins
        cx = Variable(torch.zeros(1, 256))
        hx = Variable(torch.zeros(1, 256))

        for frame in range(max_frames):
            env.render()

            # Select action
            # LSTM Change: Need to cycle hx and cx thru select_action
            action, log_prob, state_value, (hx, cx) = select_action(
                model, state, (hx, cx))

            # Perform step
            next_state, reward, done, info = env.step(action)

            # Compute latest state
            next_state = preprocess_state(next_state)

            # Evict oldest diff, add new diff to state
            next_state = np.stack([next_state] * num_frames)
            next_state[1:, :, :] = state[:-1, :, :]
            state = next_state
            if done:
                break

    env.env.close()
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-g', action='store', dest='game')
    parser.add_argument('-w', action='store_true', dest='warm_start',
                        default=False)
    args = parser.parse_args()
    game = args.game
    warm_start = args.warm_start

    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    max_episodes = 1000000
    max_frames = 10000
    gamma = 0.95

    # Cold start
    if not warm_start:
        # Initialize model
        model = Policy(input_channels=num_frames, num_actions=num_actions)
        # Initialize statistics
        running_reward = None
        running_rewards = []
        prior_eps = 0

    # Warm start
    if warm_start:
        data_file = 'results/{}.p'.format(game)
        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
                running_reward = running_rewards[-1]
                prior_eps = len(running_rewards)
            model_file = 'saved_models/actor_critic_{}_ep_{}.p'.format(
                game, prior_eps)
            with open(model_file, 'rb') as f:
                model = pickle.load(f)
        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = Policy(input_channels=num_frames,
                           num_actions=num_actions)
            running_reward = None
            running_rewards = []
            prior_eps = 0

    cuda = torch.cuda.is_available()
    if cuda:
        model = model.cuda()
    optimizer = optim.RMSprop(model.parameters(), lr=1e-4,
                              weight_decay=0.1)  # LSTM Change: lr = 1e-4

    for ep in range(max_episodes):
        # Anneal temperature from 2.0 down to 0.5 over 10000 episodes
        model.temperature = max(0.5, 2.0 - 1.5 * ((ep + prior_eps) / 1.0e4))

        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state] * num_frames)

        # LSTM change - reset LSTM hidden units when episode begins
        cx = Variable(torch.zeros(1, 256))
        hx = Variable(torch.zeros(1, 256))
        if cuda:
            cx = cx.cuda()
            hx = hx.cuda()

        reward_sum = 0.0
        for frame in range(max_frames):
            # Select action
            # LSTM Change: Need to cycle hx and cx thru select_action
            action, log_prob, state_value, (hx, cx) = select_action(
                model, state, (hx, cx), cuda)
            model.saved_actions.append((log_prob, state_value))

            # Perform step
            next_state, reward, done, info = env.step(action)

            # Add reward to reward buffer
            model.rewards.append(reward)
            reward_sum += reward

            # Compute latest state
            next_state = preprocess_state(next_state)

            # Evict oldest diff, add new diff to state
            next_state = np.stack([next_state] * num_frames)
            next_state[1:, :, :] = state[:-1, :, :]
            state = next_state
            if done:
                break

        # Compute/display statistics
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = running_reward * 0.99 + reward_sum * 0.01
        running_rewards.append(running_reward)
        verbose_str = 'Episode {} complete'.format(ep + prior_eps + 1)
        verbose_str += '\tReward total:{}'.format(reward_sum)
        verbose_str += '\tRunning mean: {:.4}'.format(running_reward)
        sys.stdout.write('\r' + verbose_str)
        sys.stdout.flush()

        # Update model
        finish_episode(model, optimizer, gamma, cuda)

        if (ep + prior_eps + 1) % 500 == 0:
            model_file = 'saved_models/actor_critic_{}_ep_{}.p'.format(
                game, ep + prior_eps + 1)
            data_file = 'results/{}.p'.format(game)
            with open(model_file, 'wb') as f:
                pickle.dump(model.cpu(), f)
            if cuda:
                model = model.cuda()
            with open(data_file, 'wb') as f:
                pickle.dump(running_rewards, f)
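# Tiny check of the running-reward update used throughout these scripts: an
# exponential moving average with factor 0.99, seeded with the first
# episode's reward (example rewards are made up).
running_reward = None
for reward_sum in (-21.0, -19.0, -20.0):
    if running_reward is None:
        running_reward = reward_sum
    else:
        running_reward = running_reward * 0.99 + reward_sum * 0.01
print(round(running_reward, 3))  # -20.97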
def main():
    # Parse arguments
    game, model_name, warm_start, render = parse_arguments()

    # Initialize environment/model
    data = initialize(game, model_name, warm_start)
    env, model, optimizer, cuda, running_reward, running_rewards = data

    # Initialize constants
    max_episodes = 500000
    max_frames = 10000
    gamma = 0.95
    num_frames = 4

    for ep in range(len(running_rewards), max_episodes):
        # Anneal temperature from 1.8 down to 0.8 over 20,000 episodes
        model.temperature = max(0.8, 1.8 - 1.0 * (ep / 2.0e4))

        # Reset LSTM hidden units when episode begins
        if model_name == 'a2c-lstm':
            cx = Variable(torch.zeros(1, 100))
            hx = Variable(torch.zeros(1, 100))
            if cuda:
                cx = cx.cuda()
                hx = hx.cuda()

        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state] * num_frames)
        reward_sum = 0.0
        for frame in range(max_frames):
            # Render frame if render argument was passed
            if render:
                env.render()

            # Select action
            if model_name == 'a2c-lstm':
                result = select_action_lstm(model, state, (hx, cx), cuda)
                action, log_prob, state_value, (hx, cx) = result
            else:
                result = select_action(model, state, cuda)
                action, log_prob, state_value = result
            model.saved_actions.append((log_prob, state_value))

            # Perform step
            next_state, reward, done, info = env.step(action)

            # Add reward to reward buffer
            model.rewards.append(reward)
            reward_sum += reward

            # Compute latest state
            next_state = preprocess_state(next_state)

            # Evict oldest frame, add new frame to state
            next_state = np.stack([next_state] * num_frames)
            next_state[1:, :, :] = state[:-1, :, :]
            state = next_state
            if done:
                break

        # Compute/display episode statistics
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = running_reward * 0.99 + reward_sum * 0.01
        running_rewards.append(running_reward)
        verbose_str = 'Episode {} complete'.format(ep + 1)
        verbose_str += '\tReward total:{}'.format(reward_sum)
        verbose_str += '\tRunning mean: {:.4}'.format(running_reward)
        sys.stdout.write('\r' + verbose_str)
        sys.stdout.flush()

        # Update model
        backpropagate(model, optimizer, gamma, cuda)

        # Save model every 1000 episodes
        if (ep + 1) % 1000 == 0:
            model_file = 'saved_models/{}_{}_ep_{}.p'.format(
                game, model_name, ep + 1)
            with open(model_file, 'wb') as f:
                pickle.dump((model.cpu(), optimizer), f)
            if cuda:
                model = model.cuda()
            data_file = 'results/{}_{}.p'.format(game, model_name)
            with open(data_file, 'wb') as f:
                pickle.dump(running_rewards, f)