def __init__(self, env_config, agent_config, use_cuda=True):
    self.env = GymEnvironment(name=env_config["name"])
    self.action_size = self.env.action_size[0]
    self.state_size = self.env.obs_size[0]
    # initialize mimic agent
    self.agent = MimicAgent(action_size=self.action_size,
                            state_size=self.state_size,
                            **agent_config,
                            use_cuda=use_cuda)
    # if train_config.get('load_path'):
    #     self.agent.load_models(train_config.get('load_path'))
    # initialize expert
    # self.expert = LunarLanderExpert()
    self.expert = SmallReactivePolicy(self.env.observation_space, self.env.action_space)

def __init__(self, env_config, agent_config):
    self.n = 10
    self.noise_dim = 2
    self.env = GymEnvironment(name=env_config["name"])
    self.critic = SimpleCritic(self.n, self.env.obs_size, self.env.action_size)
    self.hallucinator = SimpleHallucinator(self.n, self.env.obs_size, self.noise_dim)
    self.policy_buffer = PolicyBuffer()
    self.policy_c = Policy
    self.trainer = SimpleTrainer(self.env, self.critic, self.hallucinator,
                                 self.policy_buffer, self.policy_c, self.noise_dim)

def test_game_data():
    g = GameBatchData(get_timestamp(True))
    env = GymEnvironment('Atlantis-v0')
    env.reset()
    for _ in range(5):
        i = g.new_game(get_timestamp(True))
        print("Game: %s" % i)
        for _ in range(5):
            observation, reward, done, info = env.step(0)
            d = g.add_step(
                timestamp=get_timestamp(True),
                observation=observation,
                concatenated_observation=observation,
                reward="rew%s" % (random.randint(1, 10)),
                action_value=["av%s" % (random.randint(1, 10)),
                              "av%s" % (random.randint(1, 10))],
                action="a%s" % (random.randint(1, 10)),
            )
            print("Step %s" % d)
    g.save_progress()

def test_graph():
    env = GymEnvironment('MsPacman-v0')
    graph = Graph(actions=10)
    env.reset()
    screenshots = []
    g = graph.get_graph()
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        for x in range(1, 7):
            env.render()
            observation, reward, done, info = env.step(0)
            screenshots.append(observation)
            if x % 4 == 0:
                concat_image = np.concatenate((screenshots[0], screenshots[1],
                                               screenshots[2], screenshots[3]), axis=1)
                im = Image.fromarray(concat_image).convert('LA')
                # im.show()
                grayscale_im = np.array(im)
                graph.run_graph(sess, grayscale_im.reshape([-1, 25600]))

def main(args):
    env = GymEnvironment(args, gamma)
    env.env = env.env.unwrapped

    actor_critic = Policy(obs_shape, env.action_size, base_kwargs={'recurrent': False})
    actor_critic.load_state_dict(torch.load('log/model.pt'))
    actor_critic.to(device)

    agent = PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch,
                value_loss_coef, entropy_coef, lr, eps, max_grad_norm)

    rollouts = RolloutStorage(num_steps, num_processes, obs_shape,
                              env.action_space, actor_critic.recurrent_hidden_state_size)

    current_obs = torch.zeros(num_processes, *obs_shape)
    obs, _, _, _ = env.new_expt()
    obs = obs[np.newaxis, ...]
    current_obs[:, -1] = torch.from_numpy(obs)
    rollouts.obs[0].copy_(current_obs)

    current_obs = current_obs.to(device)
    rollouts.to(device)

    num_updates = math.ceil(args.max_timesteps / (num_processes * num_steps))
    n_goal_reached = 0
    n_episodes = 0

    for j in range(num_updates):
        for step in range(num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            (obs, reward, done), goal_reached = env.act(action)

            reward = torch.from_numpy(np.expand_dims(np.stack([reward]), 1)).float()
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in [done]])
            masks = masks.to(device)

            current_obs[:, :-1] = current_obs[:, 1:]
            if done:
                current_obs[:] = 0
            current_obs[:, -1] = torch.from_numpy(obs)

            rollouts.insert(current_obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

            if done:
                n_episodes += 1
                env.new_expt()
                if goal_reached:
                    n_goal_reached += 1

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[step],
                rollouts.recurrent_hidden_states[step],
                rollouts.masks[step]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, tau, step)
        value_loss, action_loss, dist_entropy = agent.update(rollouts, step)
        rollouts.after_update()

        if j % log_interval == 0:
            total_num_steps = (j + 1) * num_processes * num_steps
            try:
                success = float(n_goal_reached) / n_episodes
            except ZeroDivisionError:
                success = 0.
            print("Timesteps: {}, Goal reached : {} / {}, Success %: {}".format(
                total_num_steps, n_goal_reached, n_episodes, success))

    if args.lang_coeff > 0:
        av_list = np.array(env.action_vectors_list)
        for k in range(len(spearman_corr_coeff_actions)):
            sr, _ = spearmanr(env.rewards_list, av_list[:, k])
            print(k, sr)

antarg.add_argument("--random_starts", type=int, default=30, help="Perform max this number of dummy actions after game restart, to produce more random game dynamics.") mainarg = parser.add_argument_group('Main loop') mainarg.add_argument("--load_weights", help="Load network from file.") mainarg.add_argument("--save_weights_prefix", help="Save network to given file. Epoch and extension will be appended.") comarg = parser.add_argument_group('Common') comarg.add_argument("output_folder", help="Where to write results to.") comarg.add_argument("--num_episodes", type=int, default=10, help="Number of episodes to test.") comarg.add_argument("--random_seed", type=int, help="Random seed for repeatable experiments.") args = parser.parse_args() if args.random_seed: random.seed(args.random_seed) env = GymEnvironment(args.env_id, args) net = DeepQNetwork(env.numActions(), args) buf = MemoryBuffer(args) if args.load_weights: print "Loading weights from %s" % args.load_weights net.load_weights(args.load_weights) env.gym.monitor.start(args.output_folder, force=True) avg_reward = 0 num_episodes = args.num_episodes for i_episode in xrange(num_episodes): env.restart() observation = env.getScreen() buf.reset() i_total_reward = 0
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Log level.") args = parser.parse_args() logger = logging.getLogger() logger.setLevel(args.log_level) # bug with double logging if args.environment == 'gym': logger.handlers.pop() if args.random_seed: random.seed(args.random_seed) # instantiate classes env = GymEnvironment(args.rom_file, args) if args.environment == 'gym' else ALEEnvironment( args.rom_file, args) mem = ReplayMemory(args.replay_size, args) net = DeepQNetwork(env.numActions(), args) agent = Agent(env, mem, net, args) stats = Statistics(agent, net, mem, env, args) if args.load_weights: logger.info("Loading weights from %s" % args.load_weights) net.load_weights(args.load_weights) if args.play_games: logger.info("Playing for %d game(s)" % args.play_games) stats.reset() agent.play(args.play_games) stats.write(0, "play")
def main():
    # Process arguments
    args = utils.parse_args()

    # Use random seed from argument
    if args.random_seed:
        random.seed(args.random_seed)

    # Instantiate environment class
    if args.environment == "ale":
        env = ALEEnvironment(args.game, args)
    elif args.environment == "gym":
        env = GymEnvironment(args.game, args)
    elif args.environment == "robot":
        env = RobotEnvironment(args.game, args)
    else:
        assert False, "Unknown environment: " + args.environment

    # Instantiate DQN
    action_dim = env.action_dim()
    state_dim = env.state_dim()
    net = DQN(state_dim, action_dim, args)

    # Load weights before starting training
    if args.load_weights:
        filepath = args.load_weights
        net.load(filepath)

    # Instantiate agent
    agent = Agent(env, net, args)

    # Start statistics
    stats = Statistics(agent, agent.net, agent.net.memory, env, args)

    # Play game with two players (user and agent)
    if args.two_player:
        player_b = PlayerTwo(args)
        env.set_mode('test')
        stats.reset()
        agent.play_two_players(player_b)
        stats.write(0, "2player")
        sys.exit()

    # Play agent
    if args.play_games > 0:
        env.set_mode('test')
        stats.reset()
        for _ in range(args.play_games):
            agent.play()
        stats.write(0, "play")
        sys.exit()

    # Populate replay memory with random steps
    if args.random_steps:
        env.set_mode('test')
        stats.reset()
        agent.play_random(args.random_steps)
        stats.write(0, "random")

    for epoch in range(args.start_epoch, args.epochs):
        # Train agent
        if args.train_steps:
            env.set_mode('train')
            stats.reset()
            agent.train(args.train_steps)
            stats.write(epoch + 1, "train")

            # Save weights after every epoch
            if args.save_weights_prefix:
                filepath = args.save_weights_prefix + "_%d.h5" % (epoch + 1)
                net.save(filepath)

        # Test agent
        if args.test_steps:
            env.set_mode('test')
            stats.reset()
            agent.test(args.test_steps)
            stats.write(epoch + 1, "test")

    # Stop statistics
    stats.close()

def __init__(self, env_config, subroutine_configs):
    self.env = GymEnvironment(name=env_config["name"])
    self.controllers = []
    for config in subroutine_configs:
        c = DQNController(config, self.env)
        self.controllers.append(c)

import agent
from environment import GymEnvironment
import tensorflow as tf

env_agent = GymEnvironment()
agent = agent.DQNAgent(environment=env_agent)

with tf.Session() as sess:
    agent.build_dqn(sess)
    sess.run(tf.global_variables_initializer())
    agent.train(episodes=50000)

def __init__(self, env_config, agent_config, use_cuda=True):
    self.env = GymEnvironment(name=env_config["name"])
    self.agent = DDPGAgent(action_size=self.env.action_size[0],
                           state_size=self.env.obs_size[0],
                           **agent_config,
                           use_cuda=use_cuda)

class Runner:
    def __init__(self, env_config, agent_config, use_cuda=True):
        self.env = GymEnvironment(name=env_config["name"])
        self.agent = DDPGAgent(action_size=self.env.action_size[0],
                               state_size=self.env.obs_size[0],
                               **agent_config,
                               use_cuda=use_cuda)

    def train(self, train_config):
        # Load model
        if train_config.get('load_path'):
            self.agent.load_models(train_config.get('load_path'))

        # Fill experience replay
        self.env.new_episode()
        ma_reward = 0
        prefill = train_config['prefill']
        if prefill > 0:
            temp_reward = 0
            temp_done = False
            for step in range(prefill):
                cur_obs = self.env.cur_obs
                _ = self.agent.get_next_action(cur_obs)
                cur_action = np.asarray([random.random() * 2.0 - 1.0] * self.env.action_size[0])
                next_state, reward, done = self.env.next_obs(cur_action, render=(step % 8 == 0))
                temp_reward = reward
                temp_done = done
                self.agent.log_reward(temp_reward, temp_done)
                ma_reward = ma_reward * 0.99 + reward * 0.01

        # Start training
        train_steps = train_config['steps']
        temp_reward = 0
        temp_done = True
        for step in range(train_steps):
            cur_obs = self.env.cur_obs
            # TODO: This step probably belongs somewhere else
            cur_action = np.squeeze(self.agent.get_next_action(cur_obs), axis=0)
            if any(np.isnan(cur_obs)):
                pdb.set_trace()
            next_state, reward, done = self.env.next_obs(cur_action, render=(step % 8 == 0))
            temp_reward = reward
            temp_done = done
            self.agent.log_reward(temp_reward, temp_done)
            self.agent.train()
            ma_reward = ma_reward * 0.995 + reward * 0.005
            if step % 500 == 0:
                print(cur_obs, ' ', cur_action, 'Reward:', ma_reward)
                print('Eps', self.agent.epsilon)
            if step % 5000 == 0:
                print('Saving weights')
                self.agent.save_models(train_config['save_path'])

    def test(self, test_config):
        if test_config.get('load_path'):
            self.agent.load_models(test_config.get('load_path'))
        else:
            print('Warning: did not parse load path. Running random init model')
        test_steps = test_config['steps']
        self.env.new_episode()
        temp_reward = 0
        temp_done = False
        for step in range(test_steps):
            cur_obs = self.env.cur_obs
            cur_action = np.squeeze(self.agent.get_next_action(cur_obs, is_test=True), axis=0)
            cur_action = np.clip(cur_action, -1, 1)
            next_state, reward, done = self.env.next_obs(cur_action, render=True)

class Runner:
    def __init__(self, env_config, agent_config, fd_config, use_cuda=True):
        self.env = GymEnvironment(name=env_config["name"])
        self.action_size = self.env.action_size[0]
        self.state_size = self.env.obs_size[0]
        # initialize mimic agent
        self.agent = MimicAgent(action_size=self.action_size,
                                state_size=self.state_size,
                                **agent_config,
                                use_cuda=use_cuda)
        self.fd = FDModel(action_size=self.action_size,
                          state_size=self.state_size,
                          **fd_config,
                          use_cuda=use_cuda)
        # if train_config.get('load_path'):
        #     self.agent.load_models(train_config.get('load_path'))
        # initialize expert
        # self.expert = LunarLanderExpert()
        self.expert = SmallReactivePolicy(self.env.observation_space, self.env.action_space)

    def reset(self):
        self.agent.reset()
        self.env.new_episode()

    def sample_expert(self, num_tuples, do_render=False):
        '''
        Accumulates experience tuples from the expert for num_tuples steps.
        Returns states, actions, next states and rewards as np arrays.
        '''
        state_size = self.state_size
        action_size = self.action_size
        capacity = num_tuples
        actions = np.empty((capacity, action_size), dtype=np.float16)
        states = np.empty((capacity, state_size), dtype=np.float16)
        next_states = np.empty((capacity, state_size), dtype=np.float16)
        rewards = np.empty(capacity, dtype=np.float16)
        self.env.new_episode()
        transition = 0
        while transition < num_tuples:
            print('{} / {}'.format(transition + 1, num_tuples))
            cur_obs = self.env.cur_obs
            cur_action = self.expert.get_next_action(cur_obs)
            next_state, reward, done = self.env.next_obs(
                cur_action, render=((transition % 8 == 0) and do_render))
            # don't confuse the fd model with terminal states
            if not done:
                actions[transition] = cur_action
                states[transition] = cur_obs
                next_states[transition] = next_state
                rewards[transition] = reward
                transition += 1
        print('Ave expert reward: ', np.mean(rewards))
        return states, actions, next_states, rewards

    def train_mimic(self, states, actions, train_config, num_epochs=4000):
        # Load model
        # train_config.get('num_epochs')
        self.agent.train_epochs(states, actions, num_epochs, states.shape[0])

    def train_mimic_fd(self, states, actions, train_config, num_epochs, do_render=False):
        state_size = self.state_size
        action_size = self.action_size
        capacity = num_tuples
        actions = np.empty((capacity, action_size), dtype=np.float16)
        states = np.empty((capacity, state_size), dtype=np.float16)
        rewards = np.empty(capacity, dtype=np.float16)
        dones = np.empty(capacity, dtype=np.bool)
        self.env.new_episode()
        beta = 1.0
        tuples_per_epoch = int(num_tuples / num_epochs)
        epochs = 0
        for i in range(num_tuples):
            print('{} / {}'.format(i + 1, num_tuples))
            cur_obs = self.env.cur_obs
            cur_action = None
            expert_action = None
            if beta > np.random.rand():
                cur_action = self.expert.get_next_action(cur_obs)
                expert_action = cur_action
            else:
                expert_action = self.expert.get_next_action(cur_obs)
                cur_action = np.squeeze(self.agent.get_next_action(cur_obs), axis=0)
                # cur_action = np.clip(cur_action, -1, 1)
            next_state, reward, done = self.env.next_obs(
                cur_action, render=((i % 8 == 0) and do_render))
            actions[i] = expert_action
            states[i] = cur_obs
            rewards[i] = reward
            dones[i] = done
            beta = 1.0 - float(i) / num_tuples
            if ((i + 1) % tuples_per_epoch) == 0 and i != 0:
                self.agent.train_epochs(states[:i], actions[:i], 1, 500)
                epochs += 1

    def train_fd(self, states, actions, next_states, train_config, num_epochs):
        self.fd.train_epochs(states, actions, next_states, num_epochs, states.shape[0])

    def train_dagger(self, train_config, num_tuples, num_epochs, do_render=False):
        # Load model
        # train_config.get('num_epochs')
        state_size = self.state_size
        action_size = self.action_size
        capacity = num_tuples
        actions = np.empty((capacity, action_size), dtype=np.float16)
        states = np.empty((capacity, state_size), dtype=np.float16)
        rewards = np.empty(capacity, dtype=np.float16)
        dones = np.empty(capacity, dtype=np.bool)
        self.env.new_episode()
        beta = 1.0
        tuples_per_epoch = int(num_tuples / num_epochs)
        epochs = 0
        for i in range(num_tuples):
            print('{} / {}'.format(i + 1, num_tuples))
            cur_obs = self.env.cur_obs
            cur_action = None
            expert_action = None
            if beta > np.random.rand():
                cur_action = self.expert.get_next_action(cur_obs)
                expert_action = cur_action
            else:
                expert_action = self.expert.get_next_action(cur_obs)
                cur_action = np.squeeze(self.agent.get_next_action(cur_obs), axis=0)
                # cur_action = np.clip(cur_action, -1, 1)
            next_state, reward, done = self.env.next_obs(
                cur_action, render=((i % 8 == 0) and do_render))
            actions[i] = expert_action
            states[i] = cur_obs
            rewards[i] = reward
            dones[i] = done
            beta = 1.0 - float(i) / num_tuples
            if ((i + 1) % tuples_per_epoch) == 0 and i != 0:
                self.agent.train_epochs(states[:i], actions[:i], 1, 500)
                epochs += 1
        # print('Ave expert reward: ', np.mean(rewards))

    def test_mimic(self, test_config, do_render=False):
        test_steps = test_config['steps']
        self.env.new_episode()
        tot_reward = 0
        for step in range(test_steps):
            cur_obs = self.env.cur_obs
            cur_action = np.squeeze(self.agent.get_next_action(cur_obs), axis=0)
            # cur_action = np.clip(cur_action, -1, 1)
            _, reward, _ = self.env.next_obs(cur_action, render=(step % 1 == 0 and do_render))
            tot_reward += reward
        print('Ave test reward: {}'.format(tot_reward / test_steps))
        return tot_reward / test_steps

def train(params):
    # Load Atari rom and prepare ALE environment
    atari = GymEnvironment(params.random_start_wait, params.show_game)

    # Initialize two Q-Value Networks: one for training and one for target prediction
    dqn_train = DeepQNetwork(
        params=params,
        num_actions=atari.num_actions,
        network_name="qnetwork-train",
        trainable=True
    )

    # Q-Network for predicting target Q-values
    dqn_target = DeepQNetwork(
        params=params,
        num_actions=atari.num_actions,
        network_name="qnetwork-target",
        trainable=False
    )

    # Initialize replay memory for storing experience to sample batches from
    replay_mem = ReplayMemory(params.replay_capacity, params.batch_size)

    # Small structure for storing the last four screens
    history = ScreenHistory(params)

    # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
    replay_mem_dump = os.path.abspath(os.path.join(params.output_dir, "replay_memory.hdf5"))
    checkpoint_dir = os.path.abspath(os.path.join(params.output_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    train_step = 0
    count_actions = np.zeros(atari.num_actions)  # Count per action (only greedy)
    count_act_random = 0  # Count of random actions
    count_act_greedy = 0  # Count of greedy actions

    # Histories of qvalues and loss for running average
    qvalues_hist = collections.deque([0] * params.interval_summary, maxlen=params.interval_summary)
    loss_hist = collections.deque([10] * params.interval_summary, maxlen=params.interval_summary)

    # Time measurements
    dt_batch_gen = collections.deque([0] * 10, maxlen=10)
    dt_optimization = collections.deque([0] * 10, maxlen=10)
    dt_train_total = collections.deque([0] * 10, maxlen=10)

    # Optionally load pre-initialized replay memory from disk
    if params.replay_mem_dump is not None and params.is_train:
        print("Loading pre-initialized replay memory from HDF5 file.")
        replay_mem.load(params.replay_mem_dump)

    # Initialize a new game and store the screens in the history
    reward, screen, is_terminal = atari.new_random_game()
    for _ in xrange(params.history_length):
        history.add(screen)

    # Initialize the TensorFlow session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        # Initialize the TensorFlow session
        init = tf.initialize_all_variables()
        sess.run(init)

        # Only save trainable variables and the global step to disk
        tf_vars_to_save = tf.trainable_variables() + [dqn_train.global_step]
        saver = tf.train.Saver(tf_vars_to_save, max_to_keep=40)

        if params.model_file is not None:
            # Load pre-trained model from disk
            saver.restore(sess, params.model_file)
            train_step, learning_rate = sess.run([dqn_train.global_step, dqn_train.learning_rate])
            print("Restarted training from model file. Step = %06i, Learning Rate = %.5f" %
                  (train_step, learning_rate))

        # Initialize summary writer
        dqn_train.build_summary_writer(sess)

        # Initialize the target Q-Network fixed with the same weights
        update_target_network(sess, "qnetwork-train", "qnetwork-target")

        for step in xrange(params.num_steps):
            replay_mem_size = replay_mem.num_examples()
            if params.is_train and replay_mem_size < params.train_start and step % 1000 == 0:
                print("Initializing replay memory %i/%i" % (step, params.train_start))

            # Epsilon Greedy Exploration: with the probability of epsilon
            # choose a random action, otherwise go greedy with the action
            # having the maximal Q-value.
            # Note the minimum epsilon of 0.1
            if params.is_train:
                epsilon = max(0.1, 1.0 - float(train_step * params.train_freq) / float(params.epsilon_step))
            else:
                epsilon = 0.05

            ################################################################
            ####################### SELECT A MOVE ##########################
            ################################################################

            # Either choose a random action or predict the action using the Q-network
            do_random_action = (random.random() < epsilon)
            if do_random_action or (replay_mem_size < params.train_start and params.is_train):
                action_id = random.randrange(atari.num_actions)
                count_act_random += 1
            else:
                # Get the last screens from the history and perform
                # feed-forward through the network to compute Q-values
                feed_dict = {dqn_train.pl_screens: history.get()}
                qvalues = sess.run(dqn_train.qvalues, feed_dict=feed_dict)

                # Choose the best action based on the approximated Q-values
                qvalue_max = np.max(qvalues[0])
                action_id = np.argmax(qvalues[0])

                count_act_greedy += 1
                count_actions[action_id] += 1
                qvalues_hist.append(qvalue_max)

            ################################################################
            ####################### PLAY THE MOVE ##########################
            ################################################################

            # Play the selected action (either random or predicted) on the Atari game
            # Note that the action is performed for k = 4 frames (frame skipping)
            cumulative_reward, screen, is_terminal = atari.act(action_id)

            # Perform reward clipping and add the example to the replay memory
            cumulative_reward = min(+1.0, max(-1.0, cumulative_reward))

            # Add the screen to short term history and replay memory
            history.add(screen)

            # Add experience to replay memory
            if params.is_train:
                replay_mem.add(action_id, cumulative_reward, screen, is_terminal)

            # Check if we are game over, and if yes, initialize a new game
            if is_terminal:
                reward, screen, is_terminal = atari.new_random_game()
                replay_mem.add(0, reward, screen, is_terminal)
                history.add(screen)

            ################################################################
            ###################### TRAINING MODEL ##########################
            ################################################################

            if params.is_train and step > params.train_start and step % params.train_freq == 0:
                t1 = time.time()

                # Prepare batch and train the network
                # TODO: set actions with terminal == 1 to reward = -1 ??
                screens_in, actions, rewards, screens_out, terminals = replay_mem.sample_batch()
                dt_batch_gen.append(time.time() - t1)

                t2 = time.time()

                # Compute the target rewards from the previously fixed network
                # Note that the forward run is performed on the output screens.
                qvalues_target = sess.run(
                    dqn_target.qvalues,
                    feed_dict={dqn_target.pl_screens: screens_out}
                )

                # Inputs for trainable Q-network
                feed_dict = {
                    dqn_train.pl_screens: screens_in,
                    dqn_train.pl_actions: actions,
                    dqn_train.pl_rewards: rewards,
                    dqn_train.pl_terminals: terminals,
                    dqn_train.pl_qtargets: np.max(qvalues_target, axis=1),
                }

                # Actual training operation
                _, loss, train_step = sess.run(
                    [dqn_train.train_op, dqn_train.loss, dqn_train.global_step],
                    feed_dict=feed_dict)

                t3 = time.time()
                dt_optimization.append(t3 - t2)
                dt_train_total.append(t3 - t1)

                # Running average of the loss
                loss_hist.append(loss)

                # Check if the returned loss is not NaN
                if np.isnan(loss):
                    print("[%s] Training failed with loss = NaN."
                          % datetime.now().strftime("%Y-%m-%d %H:%M"))

                # Once every n = 10000 frames update the Q-network for predicting targets
                if train_step % params.network_update_rate == 0:
                    print("[%s] Updating target network." % datetime.now().strftime("%Y-%m-%d %H:%M"))
                    update_target_network(sess, "qnetwork-train", "qnetwork-target")

            ################################################################
            ####################### MODEL EVALUATION #######################
            ################################################################

            if params.is_train and train_step % params.eval_frequency == 0:
                eval_total_reward = 0
                eval_num_episodes = 0
                eval_num_rewards = 0
                eval_episode_max_reward = 0
                eval_episode_reward = 0
                eval_actions = np.zeros(atari.num_actions)

                # Initialize new game without random start moves
                reward, screen, terminal = atari.new_game()
                for _ in range(4):
                    history.add(screen)

                for eval_step in range(params.eval_steps):
                    if random.random() < params.eval_epsilon:
                        # Random action
                        action_id = random.randrange(atari.num_actions)
                    else:
                        # Greedy action
                        # Get the last screens from the history and perform
                        # feed-forward through the network to compute Q-values
                        feed_dict_eval = {dqn_train.pl_screens: history.get()}
                        qvalues = sess.run(dqn_train.qvalues, feed_dict=feed_dict_eval)

                        # Choose the best action based on the approximated Q-values
                        qvalue_max = np.max(qvalues[0])
                        action_id = np.argmax(qvalues[0])

                    # Keep track of how many of each action is performed
                    eval_actions[action_id] += 1

                    # Perform the action
                    reward, screen, terminal = atari.act(action_id)
                    history.add(screen)
                    eval_episode_reward += reward
                    if reward > 0:
                        eval_num_rewards += 1

                    if terminal:
                        eval_total_reward += eval_episode_reward
                        eval_episode_max_reward = max(eval_episode_reward, eval_episode_max_reward)
                        eval_episode_reward = 0
                        eval_num_episodes += 1
                        reward, screen, terminal = atari.new_game()
                        for _ in range(4):
                            history.add(screen)

                # Send statistics about the environment to TensorBoard
                eval_update_ops = [
                    dqn_train.eval_rewards.assign(eval_total_reward),
                    dqn_train.eval_num_rewards.assign(eval_num_rewards),
                    dqn_train.eval_max_reward.assign(eval_episode_max_reward),
                    dqn_train.eval_num_episodes.assign(eval_num_episodes),
                    dqn_train.eval_actions.assign(eval_actions / np.sum(eval_actions))
                ]
                sess.run(eval_update_ops)

                summaries = sess.run(dqn_train.eval_summary_op, feed_dict=feed_dict)
                dqn_train.train_summary_writer.add_summary(summaries, train_step)

                print("[%s] Evaluation Summary" % datetime.now().strftime("%Y-%m-%d %H:%M"))
                print(" Total Reward: %i" % eval_total_reward)
                print(" Max Reward per Episode: %i" % eval_episode_max_reward)
                print(" Num Episodes: %i" % eval_num_episodes)
                print(" Num Rewards: %i" % eval_num_rewards)

            ################################################################
            ###################### PRINTING / SAVING #######################
            ################################################################

            # Write a training summary to disk
            if params.is_train and train_step % params.interval_summary == 0:
                avg_dt_batch_gen = sum(dt_batch_gen) / float(len(dt_batch_gen))
                avg_dt_optimization = sum(dt_optimization) / float(len(dt_optimization))
                avg_dt_total = sum(dt_train_total) / float(len(dt_train_total))
                # print("Avg. Time Batch Preparation: %.3f seconds" % avg_dt_batch_gen)
                # print("Avg. Time Train Operation: %.3f seconds" % avg_dt_train_op)
                # print("Avg. Time Total per Batch: %.3f seconds (%.2f samples/second)" %
                #       (avg_dt_total, (1.0/avg_dt_total)*params.batch_size))

                # Send statistics about the environment to TensorBoard
                update_game_stats_ops = [
                    dqn_train.avg_reward_per_game.assign(atari.avg_reward_per_episode()),
                    dqn_train.max_reward_per_game.assign(atari.max_reward_per_episode),
                    dqn_train.avg_moves_per_game.assign(atari.avg_steps_per_episode()),
                    dqn_train.total_reward_replay.assign(replay_mem.total_reward()),
                    dqn_train.num_games_played.assign(atari.episode_number),
                    dqn_train.actions_random.assign(count_act_random),
                    dqn_train.actions_greedy.assign(count_act_greedy),
                    dqn_train.runtime_batch.assign(avg_dt_batch_gen),
                    dqn_train.runtime_train.assign(avg_dt_optimization),
                    dqn_train.runtime_total.assign(avg_dt_total),
                    dqn_train.samples_per_second.assign((1.0 / avg_dt_total) * params.batch_size)
                ]
                sess.run(update_game_stats_ops)

                # Build and save summaries
                summaries = sess.run(dqn_train.train_summary_op, feed_dict=feed_dict)
                dqn_train.train_summary_writer.add_summary(summaries, train_step)

                avg_qvalue = avg_loss = 0
                for i in xrange(len(qvalues_hist)):
                    avg_qvalue += qvalues_hist[i]
                    avg_loss += loss_hist[i]
                avg_qvalue /= float(len(qvalues_hist))
                avg_loss /= float(len(loss_hist))

                format_str = "[%s] Step %06i, ReplayMemory = %i, Epsilon = %.4f, "\
                             "Episodes = %i, Avg.Reward = %.2f, Max.Reward = %.2f, Avg.QValue = %.4f, Avg.Loss = %.6f"
                print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"),
                                    train_step, replay_mem.num_examples(), epsilon,
                                    atari.episode_number, atari.avg_reward_per_episode(),
                                    atari.max_reward_per_episode, avg_qvalue, avg_loss))

                # For debugging purposes, dump the batch to disk
                # print("[%s] Writing batch images to file (debugging)" %
                #       datetime.now().strftime("%Y-%m-%d %H:%M"))
                # batch_output_dir = os.path.join(params.output_dir, "batches/%06i/" % train_step)
                # replay_mem.write_batch_to_disk(batch_output_dir, screens_in, actions, rewards, screens_out)

            # Write model checkpoint to disk
            if params.is_train and train_step % params.interval_checkpoint == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=train_step)
                print("[%s] Saving TensorFlow model checkpoint to disk." %
                      datetime.now().strftime("%Y-%m-%d %H:%M"))

                # Dump the replay memory to disk
                # TODO: fix this!
                # print("[%s] Saving replay memory to disk." %
                #       datetime.now().strftime("%Y-%m-%d %H:%M"))
                # replay_mem.save(replay_mem_dump)

                sum_actions = float(reduce(lambda x, y: x + y, count_actions))
                action_str = ""
                for action_id, action_count in enumerate(count_actions):
                    action_perc = action_count / sum_actions if not sum_actions == 0 else 0
                    action_str += "<%i, %s, %i, %.2f> " % \
                        (action_id, atari.action_to_string(action_id), action_count, action_perc)

                format_str = "[%s] Q-Network Actions Summary: NumRandom: %i, NumGreedy: %i, %s"
                print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"),
                                    count_act_random, count_act_greedy, action_str))

    print("Finished training Q-network.")

def create_emulator(args):
    if args.environment == "ale":
        return AtariEnvironment(args)
    else:
        return GymEnvironment(args)

args = parser.parse_args()

logger = logging.getLogger()
logger.setLevel(args.log_level)

if args.random_seed:
    random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
    env = ALEEnvironment(args.game, args)
    logger.info("Using ALE Environment")
elif args.environment == 'gym':
    # logger does not work with this line
    # logger.handlers.pop()
    env = GymEnvironment(args.game, args)
    logger.info("Using Gym Environment")
else:
    assert False, "Unknown environment: " + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DeepQNetwork(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
    logger.info("Loading weights from %s" % args.load_weights)
    net.load_weights(args.load_weights)

if args.play_games:
    logger.info("Playing for %d game(s)" % args.play_games)

antarg.add_argument("--exploration_decay_steps", type=float, default=1000000, help="How many steps to decay the exploration rate.") antarg.add_argument("--exploration_rate_test", type=float, default=0.05, help="Exploration rate used during testing.") antarg.add_argument("--train_frequency", type=int, default=4, help="Perform training after this many game steps.") antarg.add_argument("--train_repeat", type=int, default=1, help="Number of times to sample minibatch during training.") antarg.add_argument("--random_starts", type=int, default=30, help="Perform max this number of dummy actions after game restart, to produce more random game dynamics.") mainarg = parser.add_argument_group('Main loop') mainarg.add_argument("--load_weights", help="Load network from file.") mainarg.add_argument("--save_weights_prefix", help="Save network to given file. Epoch and extension will be appended.") comarg = parser.add_argument_group('Common') comarg.add_argument("output_folder", help="Where to write results to.") comarg.add_argument("--num_episodes", type=int, default=100, help="Number of episodes to test.") comarg.add_argument("--random_seed", type=int, help="Random seed for repeatable experiments.") args = parser.parse_args() if args.random_seed: random.seed(args.random_seed) env = GymEnvironment(args.env_id, args) net = DeepQNetwork(env.numActions(), args) mem = None agent = Agent(env, mem, net, args) if args.load_weights: print "Loading weights from %s" % args.load_weights net.load_weights(args.load_weights) env.gym.monitor.start(args.output_folder, force=True) agent.play(args.num_episodes) env.gym.monitor.close()
import agent
import tensorflow as tf
import argparse
from environment import GymEnvironment

env_agent = GymEnvironment(display=True)
agent = agent.DQNAgent(environment=env_agent, display=True)

with tf.Session() as sess:
    agent.build_dqn(sess)
    sess.run(tf.global_variables_initializer())
    agent.load_model()
    agent.play(10)