        'snake_dis', 'food_dis', 'snake_perc', 'food_perc',
        'snake_50', 'snake_100'
    ]
    for i in range(len(features_index)):
        new_features[features_index[i]] = features[i]
    return new_features


if __name__ == '__main__':
    # Create customized and processed slither env
    # universe.configure_logging(False)
    env = create_slither_env('features')
    env = Unvectorize(env)
    env.configure(fps=5.0,
                  remotes=1,
                  start_timeout=15 * 60,
                  vnc_driver='go',
                  vnc_kwargs={
                      'encoding': 'tight',
                      'compress_level': 0,
                      'fine_quality_level': 50
                  })
    observation_n = env.reset()

    ## init the q learning agent
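    # A minimal sketch of the agent loop that could follow here, assuming an
    # approximate Q-learning agent like the one imported elsewhere in this
    # repo. The method names get_action()/update() are illustrative
    # assumptions, not the project's confirmed interface.
    agent = ApproximateQAgent(env.action_space)
    state = observation_n
    while True:
        action = agent.get_action(state)                   # epsilon-greedy over features
        next_state, reward, done, info = env.step(action)  # advance the slither env
        agent.update(state, action, next_state, reward)    # TD update on feature weights
        state = env.reset() if done else next_state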
    FLAGS.grad_clip = True
    FLAGS.check_every = FLAGS.train_steps / 10
    FLAGS.log_every = 500
    FLAGS.learn_every = 1
    FLAGS.gamma = 0.99
    FLAGS.lr_end = 0.00005
    FLAGS.lr_nsteps = FLAGS.train_steps / 2
    FLAGS.eps_end = 0.1
    FLAGS.eps_nsteps = FLAGS.train_steps / 5
    FLAGS.fps = 5
    FLAGS.state_hist = 4

    env = create_slither_env(FLAGS.state_type)
    FLAGS.state_size = env.state_size
    FLAGS.high_val = env.high_val
    FLAGS.num_actions = env.action_space.n
    env = Unvectorize(env)
    env.configure(fps=FLAGS.fps,
                  remotes=FLAGS.remotes,
                  start_timeout=15 * 60,
                  vnc_driver='go',
                  vnc_kwargs={
                      'encoding': 'tight',
                      'compress_level': 0,
                      'fine_quality_level': 50
                  })

    # Make recording env
    record_env = None
    if FLAGS.record:
        record_env = create_slither_env(FLAGS.state_type)
        record_env = Unvectorize(record_env)
        record_env.configure(fps=30,
                             remotes=1,
                             start_timeout=15 * 60,
                             vnc_driver='go',
                             vnc_kwargs={
                                 'encoding': 'tight',
                                 'compress_level': 0,
                                 'fine_quality_level': 50
                             })
        record_env = gym.wrappers.Monitor(record_env,
                                          FLAGS.record_path,
                                          video_callable=lambda x: True,
                                          resume=True)
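# Hedged sketch (not part of the original file): one plausible implementation
# of the schedules that the eps_* and lr_* flags above parameterize. The
# class names and the eps_begin/lr_begin starting values are assumptions;
# the interface (an .epsilon attribute, update(t), get_action(best_action))
# matches how the training loop consumes exp_schedule and lr_schedule.
import numpy as np


class LinearSchedule(object):
    """Linearly anneal self.epsilon from eps_begin to eps_end over nsteps."""

    def __init__(self, eps_begin, eps_end, nsteps):
        self.epsilon = eps_begin
        self.eps_begin = eps_begin
        self.eps_end = eps_end
        self.nsteps = nsteps

    def update(self, t):
        # Interpolate linearly, then hold at eps_end once nsteps is reached
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)


class LinearExploration(LinearSchedule):
    """Epsilon-greedy action selection on top of a linear schedule."""

    def __init__(self, env, eps_begin, eps_end, nsteps):
        self.env = env
        super(LinearExploration, self).__init__(eps_begin, eps_end, nsteps)

    def get_action(self, best_action):
        # With probability epsilon explore uniformly, otherwise exploit
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return best_action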
import utils.utils as utils
from agent import ApproximateQAgent
from utils.env import create_slither_env
from universe.wrappers import Unvectorize

# center of the frame
center_x = 270
center_y = 235

if __name__ == '__main__':
    # Create customized and processed slither env
    # universe.configure_logging(False)
    env = create_slither_env('shapes')
    env = Unvectorize(env)
    env.configure(fps=20.0,
                  remotes=1,
                  start_timeout=15 * 60,
                  vnc_driver='go',
                  vnc_kwargs={
                      'encoding': 'tight',
                      'compress_level': 0,
                      'fine_quality_level': 50
                  })
    observation_n = env.reset()

    ## init the q learning agent
    # read in stored weights from previous games with pickle
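    # Hedged sketch of that pickle read: 'weights.pkl' is an assumed filename
    # and the dict-of-feature-weights layout is illustrative, not confirmed.
    import pickle
    try:
        with open('weights.pkl', 'rb') as f:
            stored_weights = pickle.load(f)  # feature name -> learned weight
    except (IOError, EOFError):
        stored_weights = {}                  # first run: start from scratch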
    def train(self, exp_schedule, lr_schedule):
        # Initialize replay buffer and variables
        replay_buffer = ReplayBuffer(self.FLAGS.buffer_size,
                                     self.FLAGS.state_hist)
        rewards = deque(maxlen=self.FLAGS.num_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)
        self.init_averages()

        t = 0  # time control of nb of steps
        loss_eval = grad_eval = 0
        scores_eval = []  # list of scores computed at evaluation time
        scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

        self.prog = Progbar(target=self.FLAGS.train_steps)

        # Initialize the crash counter once, outside the loop, so repeated
        # crashes actually accumulate (resetting it every iteration would
        # make the >= 10 check below unreachable)
        continual_crash = 0

        # Train for # of train steps
        while t < self.FLAGS.train_steps:
            try:
                total_reward = 0
                ep_len = 0
                state = self.env.reset()

                # Run for 1 episode and update the buffer
                while True:
                    ep_len += 1

                    # replay memory stuff
                    idx = replay_buffer.store_frame(state)
                    q_input = replay_buffer.encode_recent_observation()

                    # choose action according to current Q and exploration
                    # (named q_vals so the q_values deque is not shadowed)
                    best_action, q_vals = self.network.get_best_action(q_input)
                    action = exp_schedule.get_action(best_action)

                    # store q values
                    max_q_values.append(max(q_vals))
                    q_values += list(q_vals)

                    # perform action in env
                    new_state, reward, done, info = self.env.step(action)

                    # store the transition
                    replay_buffer.store_effect(idx, action, reward, done)
                    state = new_state

                    # Count reward
                    total_reward += reward

                    # Stop at end of episode
                    if done:
                        break

                # Store episodic rewards
                if ep_len > 1:
                    rewards.append(total_reward)

                # Learn using replay
                while True:
                    t += 1
                    ep_len -= 1

                    # Make train step if necessary
                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.learn_every == 0)):
                        loss_eval, grad_eval = self.network.update_step(
                            t, replay_buffer, lr_schedule.epsilon,
                            self.summary)
                        exp_schedule.update(t)
                        lr_schedule.update(t)

                    if t % self.FLAGS.target_every == 0:
                        self.network.update_target_params()

                    # Update logs if necessary
                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.log_every == 0)
                            and (len(rewards) > 0)):
                        self.update_averages(rewards, max_q_values, q_values,
                                             scores_eval)
                        self.update_logs(t, loss_eval, rewards,
                                         exp_schedule.epsilon, grad_eval,
                                         lr_schedule.epsilon)
                    elif ((t < self.FLAGS.learn_start)
                            and (t % self.FLAGS.log_every == 0)):
                        sys.stdout.write(
                            "\rPopulating the memory {}/{}...".format(
                                t, self.FLAGS.learn_start))
                        sys.stdout.flush()

                    if ((t > self.FLAGS.learn_start)
                            and (t % self.FLAGS.check_every == 0)):
                        # Evaluate current model
                        scores_eval += [
                            self.evaluate(self.env, self.FLAGS.num_test)
                        ]

                        # Save current model
                        self.network.save()

                        # Record video of current model
                        if self.FLAGS.record:
                            self.record()

                    if ep_len <= 0 or t >= self.FLAGS.train_steps:
                        break

                # Episode finished cleanly: reset the crash counter
                continual_crash = 0

            except Exception as e:
                continual_crash += 1
                self.logger.info(e)

                if continual_crash >= 10:
                    self.logger.info("Crashed 10 times -- stopping")
                    raise e
                else:
                    t -= 1
                    self.logger.info("Env crash, making new env")
                    time.sleep(60)
                    self.env = create_slither_env(self.FLAGS.state_type)
                    self.env = Unvectorize(self.env)
                    self.env.configure(fps=self.FLAGS.fps,
                                       remotes=self.FLAGS.remotes,
                                       start_timeout=15 * 60,
                                       vnc_driver='go',
                                       vnc_kwargs={
                                           'encoding': 'tight',
                                           'compress_level': 0,
                                           'fine_quality_level': 50
                                       })
                    time.sleep(60)

        # End of training
        self.logger.info("- Training done.")
        self.network.save()
        scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]
        export_plot(scores_eval, "Scores", self.FLAGS.plot_path)
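# Hedged sketch (illustrative, not the project's actual buffer): a minimal
# ReplayBuffer exposing the three calls train() relies on -- store_frame(),
# encode_recent_observation(), and store_effect(). It is simplified: frames
# are stacked on a new trailing axis, and frames from a previous episode can
# leak into an encoding near episode boundaries, which real implementations
# (e.g. the Berkeley dqn_utils buffer) guard against.
import numpy as np


class ReplayBuffer(object):
    def __init__(self, size, frame_history_len):
        self.size = size
        self.hist = frame_history_len
        self.next_idx = 0
        self.count = 0
        self.obs = None  # lazily allocated once the frame shape is known
        self.action = np.empty(size, dtype=np.int32)
        self.reward = np.empty(size, dtype=np.float32)
        self.done = np.empty(size, dtype=np.bool_)

    def store_frame(self, frame):
        """Store one frame; return its slot index for store_effect()."""
        frame = np.asarray(frame)
        if self.obs is None:
            self.obs = np.zeros((self.size,) + frame.shape, dtype=frame.dtype)
        idx = self.next_idx
        self.obs[idx] = frame
        self.next_idx = (self.next_idx + 1) % self.size
        self.count = min(self.count + 1, self.size)
        return idx

    def store_effect(self, idx, action, reward, done):
        """Record the transition outcome for the frame stored at idx."""
        self.action[idx] = action
        self.reward[idx] = reward
        self.done[idx] = done

    def encode_recent_observation(self):
        """Return the last frame_history_len frames, zero-padded when the
        buffer holds fewer, stacked along a new trailing axis."""
        frames = []
        for k in range(self.hist - 1, -1, -1):
            if k >= self.count:
                frames.append(np.zeros_like(self.obs[0]))
            else:
                frames.append(self.obs[(self.next_idx - 1 - k) % self.size])
        return np.stack(frames, axis=-1)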