def main(_):
    pp.pprint(flags.FLAGS.__flags)
    game = Game()
    with tf.Session() as sess:
        with tf.device('/cpu:0'):
            dqn = DQN(sess, game, flags.FLAGS)
            dqn.train()
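# For context, a minimal sketch of the flags/entry-point boilerplate main()
# assumes; the specific flag names here are hypothetical, and pp/flags are
# assumed module-level objects rather than confirmed by this snippet.
import pprint
import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_integer('batch_size', 32, 'Minibatch size (hypothetical flag).')
flags.DEFINE_float('learning_rate', 0.001, 'Learning rate (hypothetical flag).')
pp = pprint.PrettyPrinter()

if __name__ == '__main__':
    # tf.app.run() parses the flags and then calls main(argv).
    tf.app.run()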
def eval_game(game: Game, dqn: DQN, action, q_vals, queue, root_index, root=True):
    """
    Called by the look_ahead function. Used to evaluate a state, update its
    Q value, and enumerate and enqueue possible child actions. By default,
    the root actions are treated first.

    Args:
        game, A Game object to be evaluated
        dqn, A deep Q-learning network object used to evaluate the state
        action, A tuple representing an action and its optional target
        q_vals, A shared-memory array for the global Q values
        queue, A Queue to store child actions
        root_index, The index of the root action in q_vals
        root(=True), Whether or not these are the root actions

    Returns:
        None
    """
    # (local) Copy the game object, perform the action, and get the state
    # feature vector to evaluate.
    perform_action(action, game.current_player, game)
    state = get_state(game)
    # Pass to TensorFlow here to evaluate.
    s_val = dqn.get_q_value(state, "dqn")
    print("Action:", action)
    print("Q value: %f" % s_val)
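# A minimal sketch of how eval_game might be dispatched over the root actions
# by the look_ahead function described in the docstring; get_legal_actions is
# a hypothetical accessor, not part of the source.
import multiprocessing as mp
from copy import deepcopy

def look_ahead_sketch(game, dqn):
    actions = get_legal_actions(game)     # hypothetical helper
    q_vals = mp.Array('d', len(actions))  # shared slots for the global Q values
    queue = mp.Queue()                    # receives enqueued child actions
    for i, action in enumerate(actions):
        eval_game(deepcopy(game), dqn, action, q_vals, queue, i, root=True)
    return list(q_vals)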
def tf_worker(dqn: DQN, s_queue):
    """
    A single process that removes evaluation tasks from a queue. It constructs
    the NN in TensorFlow and uses it to evaluate board states sent to it.

    Args:
        dqn, An uninitialized TensorFlow object representing the DQN
        s_queue, The queue of board states

    Returns:
        None
    """
    # Perform TensorFlow initialization.
    with tf.Graph().as_default() as dqn.tf_graph:
        dqn.build_model()
        with tf.Session() as dqn.tf_session:
            dqn._init_tf()
            try:
                index, state = s_queue.get(True, 5)
                while index != -1:
                    # Reshape to align with the network input.
                    state = state.reshape(1, 263)
                    # Pass to TensorFlow here to evaluate.
                    s_val = dqn.get_q_value(state, "dqn")
                    # (global) Update the root action in the shared q_vals
                    # array (assumed to be module-level and inherited by this
                    # process); child values are averaged into the estimate.
                    if not isclose(q_vals[index], 0.0, rel_tol=1e-6):
                        q_vals[index] = (q_vals[index] + s_val) / 2
                    else:
                        q_vals[index] = s_val
                    index, state = s_queue.get(True, 5)
            except Empty:
                # The queue stayed empty past the 5-second timeout.
                raise GameTreeFailure
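# A minimal sketch of driving tf_worker from a parent process: states are
# enqueued as (index, state) pairs, (-1, None) is the shutdown sentinel the
# worker loop checks for, and the shared q_vals array is assumed to be
# inherited by the child process (e.g. via fork).
import multiprocessing as mp

def evaluate_states(dqn, states):
    s_queue = mp.Queue()
    worker = mp.Process(target=tf_worker, args=(dqn, s_queue))
    worker.start()
    for i, state in enumerate(states):
        s_queue.put((i, state))
    s_queue.put((-1, None))  # tell the worker to exit its loop
    worker.join()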
def run_dqn():
    # Get command line arguments; defaults are set in utils.py.
    agent_params, dqn_params, cnn_params, prog_params = parse_args()

    env = gym.make(agent_params['environment'])
    episodes = agent_params['episodes']
    steps = agent_params['steps']
    steps_to_update = agent_params['steps_to_update']
    skipping = agent_params['skipping']

    num_actions = env.action_space.n
    observation_shape = env.observation_space.shape

    display = prog_params['display']
    monitor = prog_params['monitor']
    verbose = prog_params['verbose']

    if verbose > 0:
        print("num actions: ", num_actions)
        print("observation_shape: ", observation_shape)

    # Initialize DQN learning.
    dqn = DQN(num_actions, observation_shape, dqn_params, cnn_params, prog_params)

    if monitor:
        env.monitor.start('./outputs/experiment-' + agent_params['run_id'])

    last_100 = deque(maxlen=100)
    total_steps = 0
    for i_episode in range(episodes):
        observation = env.reset()
        reward_sum = 0
        for t in range(steps):
            if display:
                env.render()

            # Select a new action only on non-skipping frames; otherwise
            # reuse the previous action.
            if total_steps % skipping == 0:
                action = dqn.select_action(observation)

            # Execute the action in the emulator.
            new_observation, reward, done, _ = env.step(action)
            new_observation = new_observation.ravel()

            # Only update the network if not in a skipping frame.
            if total_steps % skipping == 0:
                # Update the state.
                dqn.update_state(action, new_observation, reward, done)
                # Train the model.
                dqn.train_step()

            observation = new_observation
            reward_sum += reward

            if done:
                if verbose > 0:
                    print("Episode ", i_episode)
                if verbose > 1:
                    print("Finished after {} timesteps".format(t + 1))
                    print("Reward for this episode: ", reward_sum)
                if verbose > 0:
                    last_100.append(reward_sum)
                    print("Average reward for last 100 episodes: ", np.mean(last_100))
                break

            if total_steps % steps_to_update == 0:
                if verbose > 0:
                    print("Total steps : ", total_steps)
                    print("Updating target network...")
                dqn.update_target()

            total_steps += 1

    if monitor:
        env.monitor.close()
class Agent(RLGlueAgent):
    def __init__(self):
        self.last_action = Action()
        self.time_step = 0
        self.total_time_step = 0
        self.episode_step = 0
        self.populating_phase = False
        self.model_save_interval = 30

        # Switch between the learning phase and the evaluation phase
        self.policy_frozen = False

        self.dqn = DQN()
        self.state = np.zeros((config.rl_agent_history_length, config.ale_screen_channels, config.ale_scaled_screen_size[1], config.ale_scaled_screen_size[0]), dtype=np.float32)
        self.exploration_rate = self.dqn.exploration_rate
        self.exploration_rate_for_evaluation = 0.05
        self.last_observed_screen = None

    def preprocess_screen(self, observation):
        screen_width = config.ale_screen_size[0]
        screen_height = config.ale_screen_size[1]
        new_width = config.ale_scaled_screen_size[0]
        new_height = config.ale_scaled_screen_size[1]
        if len(observation.intArray) == 100928:
            # RGB: 128 RAM bytes followed by a 160x210x3 frame
            observation = np.asarray(observation.intArray[128:], dtype=np.uint8).reshape((screen_width, screen_height, 3))
            observation = spm.imresize(observation, (new_height, new_width))
            if config.ale_screen_channels == 1:
                # Convert RGB to luminance
                observation = np.dot(observation[:,:,:], [0.299, 0.587, 0.114])
                observation = observation.reshape((new_height, new_width, 1))
            # Scale the pixel values into [0, 1]
            observation = observation.transpose(2, 0, 1) / 255.0
            observation /= (np.max(observation) + 1e-5)
        else:
            # Greyscale
            if config.ale_screen_channels == 3:
                raise Exception("You forgot to add the --send_rgb option when running ALE.")
            observation = np.asarray(observation.intArray[128:]).reshape((screen_width, screen_height))
            observation = spm.imresize(observation, (new_height, new_width))
            # Scale the pixel values into [0, 1]
            observation = observation.reshape((1, new_height, new_width)) / 255.0
            observation /= (np.max(observation) + 1e-5)

        # Pixel-wise max over the current and previous frame to remove
        # flickering sprites
        observed_screen = observation
        if self.last_observed_screen is not None:
            observed_screen = np.maximum(observation, self.last_observed_screen)
        self.last_observed_screen = observation
        return observed_screen

    def agent_init(self, taskSpecString):
        pass

    def reshape_state_to_conv_input(self, state):
        return state.reshape((1, config.rl_agent_history_length * config.ale_screen_channels, config.ale_scaled_screen_size[1], config.ale_scaled_screen_size[0]))

    def dump_result(self, reward, q_max=None, q_min=None):
        if self.time_step % 50 == 0:
            if self.policy_frozen is False:
                print "time_step:", self.time_step,
                print "reward:", reward,
                print "eps:", self.exploration_rate,
                if q_min is None:
                    print ""
                else:
                    print "Q ::",
                    print "max:", q_max,
                    print "min:", q_min

    def dump_state(self, state=None, prefix=""):
        if state is None:
            state = self.state
        state = self.reshape_state_to_conv_input(state)
        for h in xrange(config.rl_agent_history_length):
            start = h * config.ale_screen_channels
            end = start + config.ale_screen_channels
            image = state[0,start:end,:,:]
            if config.ale_screen_channels == 1:
                image = image.reshape((image.shape[1], image.shape[2]))
            elif config.ale_screen_channels == 3:
                image = image.transpose(1, 2, 0)
            image = np.uint8(image * 255.0)
            image = Image.fromarray(image)
            image.save(("%sstate-%d.png" % (prefix, h)))

    def learn(self, reward, episode_ends=False):
        if self.policy_frozen is False:
            self.dqn.store_transition_in_replay_memory(self.reshape_state_to_conv_input(self.last_state), self.last_action.intArray[0], reward, self.reshape_state_to_conv_input(self.state), episode_ends)
            if self.total_time_step <= config.rl_replay_start_size:
                # A uniform random policy is run for 'replay_start_size' frames
                # before learning starts, so the agent wanders randomly to
                # accumulate experience.
                print "Initial exploration before learning starts:", "%d/%d" % (self.total_time_step, config.rl_replay_start_size)
                self.populating_phase = True
                self.exploration_rate = config.rl_initial_exploration
            else:
                self.populating_phase = False
                self.dqn.decrease_exploration_rate()
                self.exploration_rate = self.dqn.exploration_rate

                if self.total_time_step % (config.rl_action_repeat * config.rl_update_frequency) == 0 and self.total_time_step != 0:
                    self.dqn.replay_experience()

                if self.total_time_step % config.rl_target_network_update_frequency == 0 and self.total_time_step != 0:
                    print "The target network has been updated."
                    self.dqn.update_target()

    def agent_start(self, observation):
        print "Episode", self.episode_step, "::", "total_time_step:",
        if self.total_time_step > 1000:
            print int(self.total_time_step / 1000), "K"
        else:
            print self.total_time_step

        observed_screen = self.preprocess_screen(observation)
        self.state[0] = observed_screen

        return_action = Action()
        action, q_max, q_min = self.dqn.eps_greedy(self.reshape_state_to_conv_input(self.state), self.exploration_rate)
        return_action.intArray = [action]

        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state

        return return_action

    def agent_step(self, reward, observation):
        observed_screen = self.preprocess_screen(observation)
        self.state = np.roll(self.state, 1, axis=0)
        self.state[0] = observed_screen

        ########################### DEBUG ###############################
        # if self.total_time_step % 500 == 0 and self.total_time_step != 0:
        #     self.dump_state()

        self.learn(reward)

        return_action = Action()
        q_max = None
        q_min = None
        if self.time_step % config.rl_action_repeat == 0:
            action, q_max, q_min = self.dqn.eps_greedy(self.reshape_state_to_conv_input(self.state), self.exploration_rate)
        else:
            # Repeat the previous action between decision points
            action = self.last_action.intArray[0]
        return_action.intArray = [action]

        self.dump_result(reward, q_max, q_min)

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(return_action)
            self.last_state = self.state
            self.time_step += 1
            self.total_time_step += 1

        return return_action

    def agent_end(self, reward):
        self.learn(reward, episode_ends=True)

        # [Optional] Visualize the results
        self.dump_result(reward)

        if self.policy_frozen is False:
            self.time_step = 0
            self.total_time_step += 1
        self.episode_step += 1

    def agent_cleanup(self):
        pass

    def agent_message(self, inMessage):
        if inMessage.startswith("freeze_policy"):
            self.policy_frozen = True
            self.exploration_rate = self.exploration_rate_for_evaluation
            return "The policy was frozen."
        if inMessage.startswith("unfreeze_policy"):
            self.policy_frozen = False
            self.exploration_rate = self.dqn.exploration_rate
            return "The policy was unfrozen."
        if inMessage.startswith("save_model"):
            if self.populating_phase is False:
                self.dqn.save()
            return "The model was saved."
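# A minimal sketch of how an RL-Glue agent like this is typically launched,
# assuming the standard rlglue Python codec is installed and an rl_glue
# server plus ALE are already running.
from rlglue.agent import AgentLoader

if __name__ == "__main__":
    AgentLoader.loadAgent(Agent())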
testarg = parser.add_argument_group('Test')
testarg.add_argument("--display", dest="display", action="store_true", help="Display the screen during testing.")
testarg.set_defaults(display=False)
testarg.add_argument("--random_starts", type=int, default=30, help="Perform at most this many no-op actions at the start of an episode.")
testarg.add_argument("--ckpt_dir", default='model', help="TensorFlow checkpoint directory.")
testarg.add_argument("--out", help="Output directory for gym.")
testarg.add_argument("--episodes", type=int, default=100, help="Number of episodes.")
testarg.add_argument("--seed", type=int, help="Random seed.")
args = parser.parse_args()

if args.seed:
    rand.seed(args.seed)

if not os.path.exists(args.ckpt_dir):
    os.makedirs(args.ckpt_dir)

# Initialize the gym environment and the DQN agent
env = Environment(args)
agent = DQN(env, args)

# Train the agent
Trainer(agent).run()

# Play the game under the gym monitor
env.gym.monitor.start(args.out, force=True)
agent.play()
env.gym.monitor.close()
# Script to test evaluating the DQN from multiple processes.
import multiprocessing as mp

import numpy as np
import tensorflow as tf

from dqn import DQN

def q_par(dqn, g1):
    print(dqn.get_q_value(g1, "dqn"))

def par(a):
    print(a)

features = 20
h1 = 10
h2 = 5
g1 = np.arange(40).reshape(2, 20)

dqn = DQN(features, h1, h2, "models/tf_multi_1")
with dqn.tf_graph.as_default():
    dqn.build_model()
    with tf.Session() as dqn.tf_session:
        dqn._init_tf()
        print(dqn.tf_session.run(dqn.model, feed_dict={dqn.s_: g1}))
        processes = []
        for p in range(4):
            processes.append(mp.Process(target=q_par, args=(dqn, g1)))
            processes[p].start()
        for p in range(4):
            processes[p].join()
# Basic script to test the dqn.py object and functions
import tensorflow as tf
import numpy as np
from dqn import DQN

features = 20
h1 = 10
h2 = 5
g1 = np.arange(40).reshape(2, 20)
g2 = np.random.randint(-5, 5, 40).reshape(2, 20)

dqn = DQN(features, h1, h2, "models/dqn_1")
with tf.Graph().as_default():
    dqn.build_model()
    with tf.Session() as dqn.tf_session:
        dqn._init_tf()
        print(dqn.tf_session.run(dqn.model, feed_dict={dqn.s_: g1}))
        print(dqn.get_q_value(g1, "dqn"))
        print(dqn.get_q_value(g1, "dqn"))
        print(dqn.get_q_value(g1, "dqn"))
#!/usr/bin/env python
from dqn import DQN

actl = DQN()
actl.train()
class Agent():
    def __init__(self, env, gamma, lr, n_actions, input_dim, ann_layer,
                 mem_size, batch_size, epsilon, eps_min=0.01, eps_dec=5e-7,
                 replace=500, path='tmp'):
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dim = input_dim
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.path = path
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0
        self.memory = ReplayMemory(mem_size)

        self.q_eval = DQN(input_dim, ann_layer, n_actions, self.batch_size)
        self.q_next = DQN(input_dim, ann_layer, n_actions, self.batch_size)

        self.optimizer = torch.optim.RMSprop(self.q_eval.parameters(), lr=self.lr)
        self.loss = nn.SmoothL1Loss()
        self.last_loss = 0

    def choose_action(self, state):
        # Epsilon-greedy action selection.
        if np.random.random() > self.epsilon:
            state = self.env.transform_state(state)
            self.q_eval.eval()
            with torch.no_grad():
                actions = self.q_eval(state.reshape(1, -1))
            self.q_eval.train()
            return actions.argmax().item()
        else:
            return np.random.choice(self.action_space)

    def store_transition(self, state, action, reward, done, next_state):
        self.memory.push(state.reshape(1, -1), action, reward, done,
                         next_state.reshape(1, -1))

    def sample_memory(self):
        state, action, reward, done, next_state = self.memory.sample(self.batch_size)
        device = self.q_eval.device
        state, action, reward, done, next_state = \
            state.to(device), action.to(device), reward.to(device), \
            done.to(device), next_state.to(device)
        return state, action, reward, done, next_state

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon - self.eps_dec > self.eps_min \
            else self.eps_min

    def learn(self):
        if len(self.memory) < self.batch_size:
            return

        self.optimizer.zero_grad()
        self.replace_target_network()

        state, action, reward, done, next_state = self.sample_memory()
        state = self.env.transform_state(state)
        reward = self.env.transform_reward(reward)
        next_state = self.env.transform_state(next_state)

        q_pred = self.q_eval(state)
        q_pred = q_pred[torch.arange(self.batch_size), action.long()]
        q_next = self.q_next(next_state).max(1)[0]
        q_next[done] = 0.0

        q_target = reward + self.gamma * q_next
        loss = self.loss(q_pred, q_target.detach()).to(self.q_eval.device)
        self.last_loss = loss.item()
        loss.backward()

        self.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()

    def save(self, path=None):
        # Fall back to the path given at construction time.
        self.q_eval.save_model(path or self.path)

    def load(self, path=None):
        path = path or self.path
        self.q_eval.load_model(path)
        self.q_next.load_model(path)
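# A minimal sketch of a training loop for this Agent, assuming a gym-style
# env whose reset()/step() return numpy states; the episode and step counts
# are illustrative, not taken from the source.
def train(agent, env, episodes=500, max_steps=1000):
    for episode in range(episodes):
        state = env.reset()
        for _ in range(max_steps):
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.store_transition(state, action, reward, done, next_state)
            agent.learn()
            state = next_state
            if done:
                break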