def evaluate(self, env=None, num_episodes=None):
    if env is None:
        env = self.env
    if num_episodes is None:
        self.logger.info("Evaluating...")
        num_episodes = self.config.num_episodes_test

    # Replay memory used only to build frame-stacked inputs for the Q network.
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = []

    for i in range(num_episodes):
        sum_reward = 0
        state = env.reset()
        while True:
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            # Soft epsilon-greedy: act randomly with probability soft_epsilon,
            # otherwise act greedily w.r.t. the current Q network.
            action = env.action_space.sample()
            if self.config.soft_epsilon < np.random.random():
                action = np.argmax(
                    self.sess.run(self.q, feed_dict={self.s: [q_input]})[0])

            new_state, reward, done, info = env.step(action)
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            sum_reward += reward
            if done:
                break

        rewards.append(sum_reward)

    avg_reward = np.mean(rewards)
    if num_episodes > 1:
        self.logger.info("Average reward: {:04.2f}".format(avg_reward))
    return avg_reward
def train(self, exp_schedule, lr_schedule):
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)

    t = last_eval = last_record = 0
    scores_eval = []  # scores for plot
    scores_eval += [self.evaluate()]

    while t < self.config.nsteps_train:
        sum_reward = 0
        state = self.env.reset()
        while True:
            t += 1
            last_eval += 1
            last_record += 1

            # replay memory stuff
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            # choose action according to current Q and exploration schedule
            action_values = self.sess.run(self.q, feed_dict={self.s: [q_input]})[0]
            best_action = np.argmax(action_values)
            action = exp_schedule.get_action(best_action)

            # store q values for logging
            max_q_values.append(max(action_values))
            q_values += list(action_values)

            new_state, reward, done, info = self.env.step(action)

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            loss_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)
            self.get_log(exp_schedule, lr_schedule, t, loss_eval, max_q_values, rewards)

            sum_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        rewards.append(sum_reward)

        if t > self.config.learning_start:
            if last_eval > self.config.eval_freq:
                last_eval = 0
                scores_eval += [self.evaluate()]
            elif self.config.record and (last_record > self.config.record_freq):
                self.logger.info("Recording...")
                last_record = 0
                self.record()

    self.logger.info("*** Training is done.")
    self.saver.save(self.sess, self.config.model_output2, global_step=t)
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)
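# The train() loop above only assumes that exp_schedule exposes get_action()
# and that lr_schedule exposes a current .epsilon value; the project's actual
# schedule classes are not shown here. The sketch below is a minimal, assumed
# implementation of that interface (the class names are placeholders, and the
# project presumably calls update(t) somewhere, e.g. inside train_step or
# get_log, to anneal the values).

import numpy as np


class LinearEpsilonSchedule(object):
    """Linearly anneals .epsilon from eps_begin to eps_end over nsteps."""

    def __init__(self, eps_begin, eps_end, nsteps):
        self.epsilon = eps_begin
        self.eps_begin = eps_begin
        self.eps_end = eps_end
        self.nsteps = nsteps

    def update(self, t):
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)


class LinearExploration(LinearEpsilonSchedule):
    """Epsilon-greedy wrapper: with probability epsilon, replace the greedy action."""

    def __init__(self, env, eps_begin, eps_end, nsteps):
        super().__init__(eps_begin, eps_end, nsteps)
        self.env = env

    def get_action(self, best_action):
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return best_action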
def evaluate(net, env=None, num_episodes=50):
    """
    Evaluation with same procedure as the training
    """
    print("Evaluating...")

    # replay memory used to build frame-stacked inputs
    replay_buffer = ReplayBuffer(1000000, 4)
    rewards = []

    for i in range(num_episodes):
        total_reward = 0
        state = env.reset()
        while True:
            # store last state in buffer
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()
            action = net.get_action(q_input)

            # perform action in env
            new_state, reward, done, info = env.step(action)

            # store in replay memory
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            # count reward
            total_reward += reward
            if done:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

    avg_reward = np.mean(rewards)
    sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

    if num_episodes > 1:
        msg = "Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward, sigma_reward)
        print(msg)
    return avg_reward
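# All of the snippets in this file rely on a replay buffer exposing
# store_frame / store_effect / encode_recent_observation / can_sample / sample.
# The projects appear to use the memory-optimized ReplayBuffer from the
# Berkeley deep-RL assignments; the class below is only a minimal, unoptimized
# sketch of that interface for reference. The name, the zero-padding behaviour
# at episode boundaries, and the sampling details are assumptions.

import numpy as np


class SimpleReplayBuffer(object):
    """Minimal stand-in for the ReplayBuffer interface used above.
    Assumes frames have a trailing channel axis (H, W, C)."""

    def __init__(self, size, frame_history_len):
        self.size = size
        self.frame_history_len = frame_history_len
        self.frames, self.actions, self.rewards, self.dones = [], [], [], []

    def store_frame(self, frame):
        # Evict the oldest transition once the buffer is full.
        if len(self.frames) == self.size:
            for buf in (self.frames, self.actions, self.rewards, self.dones):
                buf.pop(0)
        self.frames.append(np.asarray(frame))
        self.actions.append(None)
        self.rewards.append(0.0)
        self.dones.append(False)
        return len(self.frames) - 1

    def store_effect(self, idx, action, reward, done):
        self.actions[idx] = action
        self.rewards[idx] = reward
        self.dones[idx] = done

    def _encode(self, idx):
        # Stack the frame_history_len frames ending at idx along the channel axis,
        # zero-padding before the start of the buffer or across episode boundaries.
        start = idx - self.frame_history_len + 1
        for k in range(max(start, 0), idx):
            if self.dones[k]:
                start = k + 1
        start = max(start, 0)
        missing = self.frame_history_len - (idx - start + 1)
        frames = [np.zeros_like(self.frames[idx])] * missing + self.frames[start:idx + 1]
        return np.concatenate(frames, axis=-1)

    def encode_recent_observation(self):
        return self._encode(len(self.frames) - 1)

    def can_sample(self, batch_size):
        # Every sampled transition also needs a successor frame.
        return len(self.frames) - 1 >= batch_size

    def sample(self, batch_size):
        idxs = np.random.randint(0, len(self.frames) - 1, size=batch_size)
        obs = np.stack([self._encode(i) for i in idxs])
        next_obs = np.stack([self._encode(i + 1) for i in idxs])
        acts = np.array([self.actions[i] for i in idxs])
        rews = np.array([self.rewards[i] for i in idxs], dtype=np.float32)
        done_mask = np.array([float(self.dones[i]) for i in idxs], dtype=np.float32)
        return obs, acts, rews, next_obs, done_mask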
class DataManager:
    def __init__(self, verbose=False):
        self.is_online = False
        self.verbose = verbose

    def init_online(self, foxnet, session, batch_size, replay_buffer_size, frames_per_state,
                    ip, image_height, image_width, epsilon, user_overwrite=False):
        self.is_online = True
        self.foxnet = foxnet
        self.session = session
        self.batch_size = batch_size
        self.epsilon = epsilon

        # Allow the player to overwrite actions for faster learning.
        self.user_overwrite = user_overwrite

        # Initialize ReplayBuffer.
        self.replay_buffer = ReplayBuffer(replay_buffer_size, frames_per_state)

        # Initialize emulator transfers.
        self.frame_reader = FrameReader(ip, image_height, image_width)
        self.health_extractor = HealthExtractor()
        self.reward_extractor = RewardExtractor()
        self.menu_navigator = MenuNavigator()

        # Keep the full image for reward extraction.
        frame, full_image = self.frame_reader.read_frame()
        self.prev_frame = frame
        self.prev_full_image = full_image

        # Remember the health from the previous frame.
        self.prev_health = None

    def init_offline(self, use_test_set, data_params, batch_size):
        self.is_online = False
        self.user_overwrite = False
        self.epsilon = 0  # Not used.

        # Load the two pertinent datasets into train_dataset and eval_dataset.
        if use_test_set:
            train_dataset, eval_dataset = load_datasets('test', data_params)
        else:
            train_dataset, eval_dataset = load_datasets('dev', data_params)

        self.s_train, self.a_train, scores_train, h_train = train_dataset
        self.s_eval, self.a_eval, scores_test, h_test = eval_dataset

        # Compute the reward given scores and health. Currently, this just adds
        # the two, weighting each one equally.
        self.r_train = np.add(scores_train, h_train)
        self.r_test = np.add(scores_test, h_test)

        self.batch_size = batch_size

    def init_epoch(self, for_eval=False):
        self.batch_iteration = -1
        if self.is_online:
            pass
        else:
            if for_eval:
                # An "epoch" is the entire validation set.
                self.epoch_indices = np.arange(self.s_eval.shape[0])
            else:
                self.epoch_indices = np.arange(self.s_train.shape[0])
                np.random.shuffle(self.epoch_indices)

    def has_next_batch(self, for_eval=False):
        if self.is_online:
            return True
        else:
            if for_eval:
                num_batch_iterations = int(math.ceil(self.s_eval.shape[0] / self.batch_size))
            else:
                num_batch_iterations = int(math.ceil(self.s_train.shape[0] / self.batch_size))
            return self.batch_iteration < num_batch_iterations

    def get_next_batch(self, for_eval=False):
        s_batch = []
        a_batch = []
        r_batch = []
        max_score_batch = 0

        self.batch_iteration += 1
        frame_skip = 5

        if self.is_online:
            frame = self.prev_frame
            full_image = self.prev_full_image

            # Play the game for batch_size frames.
            i = 0
            last_action_str = 'n'
            last_frame_was_a_menu = False
            while i < self.batch_size or not self.replay_buffer.can_sample(self.batch_size):
                i += 1
                for j in np.arange(frame_skip):
                    self.frame_reader.send_action(last_action_str)
                    frame, full_image = self.frame_reader.read_frame()

                # As soon as the frame is the main menu, select the first option.
                while self.menu_navigator.is_image_menu(full_image):
                    # Alternate actions between l and j because j selects the option,
                    # but holding j does nothing.
                    action_str = np.random.choice(['l', 'j'])
                    if self.verbose:
                        print('MENU DETECTED: Pressing l or j. '
                              'Taking action: %s' % action_str)
                    self.frame_reader.send_action(action_str)
                    frame, full_image = self.frame_reader.read_frame()

                # Store the most recent frame and get the past frames_per_state
                # frames that define the current state.
                replay_buffer_index = self.replay_buffer.store_frame(np.squeeze(frame))
                state = self.replay_buffer.encode_recent_observation()
                state = np.expand_dims(state, 0)

                # Get the best action to take in the current state.
                if last_frame_was_a_menu:
                    # We are not actually playing a level, so press 'l' or 'j' to get
                    # through the current menu/video.
                    action_str = np.random.choice(['l', 'j'])
                    if self.verbose:
                        print('NO SCORE DETECTED: Pressing l or j. '
                              'Taking action: %s' % action_str)
                else:
                    feed_dict = {self.foxnet.X: state, self.foxnet.is_training: False}
                    q_values_it = self.session.run(self.foxnet.probs, feed_dict=feed_dict)

                    action_str = 'n'
                    if self.user_overwrite:
                        action_str = self.frame_reader.get_keys()

                    # If in user-overwrite mode and the player does not input, fall back to e-greedy.
                    if action_str == 'n':
                        # e-greedy exploration.
                        if np.random.uniform() >= self.epsilon:
                            action_str = self.foxnet.available_actions[np.argmax(q_values_it)]
                        else:
                            action_str = np.random.choice(self.foxnet.available_actions)

                # Send action to emulator.
                self.frame_reader.send_action(action_str)

                # Remember this action for the next iteration.
                last_action_str = action_str

                # Determine the action (index into available_actions) we will send to the replay buffer.
                if last_frame_was_a_menu:
                    # If the last frame was a menu/video, pretend we just did a noop.
                    replay_buffer_str = self.foxnet.available_actions.index('n')
                else:
                    replay_buffer_str = self.foxnet.available_actions.index(action_str)

                # Get the next frame.
                new_frame, full_image = self.frame_reader.read_frame()

                # Get the reward (score + health).
                score_reward, score_is_not_digits = self.reward_extractor.get_reward(full_image)
                last_frame_was_a_menu = score_is_not_digits
                health_reward = self.health_extractor(full_image, offline=False)
                if self.verbose and not last_frame_was_a_menu:
                    print('Online reward extracted: score=%d\thealth=%f' % (score_reward, health_reward))

                # Check if we just died.
                if self.prev_health and self.prev_health > 0 and health_reward == 0:
                    # Agent just died.
                    if self.verbose:
                        print('Agent just died. Setting health reward to -10.')
                    health_reward = -10
                self.prev_health = health_reward

                reward = score_reward + health_reward
                max_score_batch = max(score_reward, max_score_batch)

                # Store the <s,a,r,s'> transition.
                self.replay_buffer.store_effect(replay_buffer_index, replay_buffer_str, reward, False)

                frame = new_frame

            self.prev_frame = frame
            self.prev_full_image = full_image

            s_batch, a_batch, r_batch, _, _ = self.replay_buffer.sample(self.batch_size)
        else:
            # Choose which data to batch.
            if for_eval:
                s_to_batch = self.s_eval
                a_to_batch = self.a_eval
                r_to_batch = None
            else:
                s_to_batch = self.s_train
                a_to_batch = self.a_train
                r_to_batch = self.r_train

            # Generate indices for the batch.
            start_idx = (self.batch_iteration * self.batch_size) % s_to_batch.shape[0]
            idx = self.epoch_indices[start_idx: start_idx + self.batch_size]

            s_batch = s_to_batch[idx, :]
            a_batch = a_to_batch[idx]
            if not for_eval:
                r_batch = r_to_batch[idx]

        # print('Max score for current batch: %d' % max_score_batch)
        return s_batch, a_batch, r_batch, max_score_batch
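# A hedged sketch of how the offline path of DataManager above might be
# driven. load_datasets() must be importable for init_offline to work; the
# data_params contents and the "train step" below are placeholders, not the
# project's actual training script.

def run_offline_epochs(data_params, num_epochs=10, batch_size=32):
    dm = DataManager(verbose=True)
    dm.init_offline(use_test_set=False, data_params=data_params, batch_size=batch_size)
    for epoch in range(num_epochs):
        dm.init_epoch(for_eval=False)
        while dm.has_next_batch(for_eval=False):
            s_batch, a_batch, r_batch, _ = dm.get_next_batch(for_eval=False)
            # feed (s_batch, a_batch, r_batch) to the model's training step here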
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run the Deep Q-learning algorithm.

    You can specify your own convnet using q_func. All schedules are w.r.t. the
    total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for the probability of choosing a random action.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.MultiDiscrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    # num_actions = env.action_space.shape
    num_actions = 13

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile=True if the variable is only used in inference mode,
            # i.e. don't save the history
            return model(Variable(obs, volatile=True)).data.max(1)[1].view(-1, 1).cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Map a discrete action index to a Mario button vector, e.g. [0, 0, 0, 1, 1, 0]
    def to_mario_act(action, num_actions):
        """
        action = action % num_actions
        if action == 0:
            # Move right while jumping
            action_onehot = np.array([0, 0, 0, 1, 1, 0])
        else:
            action_onehot = np.zeros(num_actions, dtype=int)
            action_onehot[action] = 1
        """
        action_list = [[0, 0, 0, 1, 1, 0],
                       [1, 0, 0, 0, 0, 0],
                       [0, 1, 0, 0, 0, 0],
                       [0, 0, 1, 0, 0, 0],
                       [0, 0, 0, 1, 0, 0],
                       [0, 0, 0, 0, 1, 0],
                       [0, 0, 0, 0, 0, 1],
                       [0, 1, 0, 0, 1, 0],
                       [0, 1, 0, 0, 0, 1],
                       [0, 0, 0, 1, 0, 1],
                       [0, 1, 0, 0, 1, 1],
                       [0, 0, 0, 1, 1, 1],
                       [0, 0, 1, 0, 0, 1]]
        return action_list[action]

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    #   RUN ENV   #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can be used to
        # store action, reward, done
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action until learning starts
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(to_mario_act(action, num_actions))
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Reset the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q values; q_func takes only the state and outputs a value
            # for every state-action pair. We select the Q value of the action taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.view(-1, 1))
            """
            # DQN
            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for next Q to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0].view(-1, 1)
            next_Q_values = not_done_mask.view(-1, 1) * next_max_q
            """
            # Double DQN: select the next action with the online network,
            # evaluate it with the target network.
            next_argmax_action = Q(next_obs_batch).max(1)[1].view(-1, 1)
            next_q = target_Q(next_obs_batch).detach().gather(1, next_argmax_action)
            next_Q_values = not_done_mask.view(-1, 1) * next_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch.view(-1, 1) + (gamma * next_Q_values)
            """
            # Compute Bellman error
            bellman_error = target_Q_values - current_Q_values
            # clip the bellman error between [-1, 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_delta * -1 will be the right gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            current_Q_values.backward(d_error.data)
            """
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values)
            optimizer.zero_grad()
            loss.backward()
            for param in Q.parameters():
                # clamp_ is the in-place version; clamp alone would discard the result
                param.grad.data.clamp_(-1, 1)

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights to the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_rewards = env.get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t,))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
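# dqn_learing() above expects optimizer_spec to expose .constructor and
# .kwargs and exploration to expose .value(t). In Berkeley-style DQN
# assignments these are typically a namedtuple and a linear schedule; the
# sketch below assumes that shape. make_mario_env and MarioConvNet in the
# commented wiring are placeholders, not names from the original project.

from collections import namedtuple

import torch.optim as optim

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])


class LinearSchedule(object):
    """value(t): linearly anneal from initial_p to final_p over schedule_timesteps."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)


# Hypothetical wiring:
# env = make_mario_env()
# dqn_learing(env,
#             q_func=MarioConvNet,
#             optimizer_spec=OptimizerSpec(constructor=optim.RMSprop,
#                                          kwargs=dict(lr=0.00025, alpha=0.95, eps=0.01)),
#             exploration=LinearSchedule(1000000, 0.1))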
class DoubleDQN(object):
    def __init__(self,
                 image_shape,
                 num_actions,
                 frame_history_len=4,
                 replay_buffer_size=1000000,
                 training_freq=4,
                 training_starts=5000,
                 training_batch_size=32,
                 target_update_freq=1000,
                 reward_decay=0.99,
                 exploration=LinearSchedule(5000, 0.1),
                 log_dir="logs/"):
        """
        Double Deep Q Network

        params:
            image_shape: (height, width, n_values)
            num_actions: how many different actions we can choose
            frame_history_len: feed this number of frames as input to the deep-q network
            replay_buffer_size: size limit of the replay buffer
            training_freq: train the base q network once per training_freq steps
            training_starts: only train the q network after this number of steps
            training_batch_size: batch size for training the base q network with gradient descent
            target_update_freq: update the target network once per target_update_freq steps
            reward_decay: decay factor (called gamma in the paper) of rewards that happen in the future
            exploration: used to generate an exploration factor (see 'epsilon-greedy' in the paper).
                When rand(0,1) < epsilon, take a random action; otherwise take the greedy action.
            log_dir: path to write tensorboard logs
        """
        super().__init__()
        self.num_actions = num_actions
        self.training_freq = training_freq
        self.training_starts = training_starts
        self.training_batch_size = training_batch_size
        self.target_update_freq = target_update_freq
        self.reward_decay = reward_decay
        self.exploration = exploration

        # use multiple frames as input to the q network
        input_shape = image_shape[:-1] + (image_shape[-1] * frame_history_len,)

        # used to choose actions
        self.base_model = q_model(input_shape, num_actions)
        self.base_model.compile(optimizer=optimizers.adam(clipnorm=10, lr=1e-4, decay=1e-6, epsilon=1e-4),
                                loss='mse')
        # used to estimate q values
        self.target_model = q_model(input_shape, num_actions)

        self.replay_buffer = ReplayBuffer(size=replay_buffer_size, frame_history_len=frame_history_len)
        # current replay buffer offset
        self.replay_buffer_idx = 0

        self.tensorboard_callback = TensorBoard(log_dir=log_dir)
        self.latest_losses = deque(maxlen=100)

    def get_replay_buffer_idx(self, obs):
        return self.replay_buffer.store_frame(obs)

    def train_have_started(self, step):
        # True while we are still before training_starts, i.e. training has NOT started yet.
        return step < self.training_starts

    def is_new_exploration_decision(self, step):
        return np.random.rand() < self.exploration.value(step)

    def get_randint_actions(self):
        return np.random.randint(self.num_actions)

    def encodeRecentObservationsReplayBuffer(self):
        return self.replay_buffer.encode_recent_observation()

    def _settle_replay_buffer_id(self, obs):
        self.replay_buffer_idx = self.get_replay_buffer_idx(obs)
        return self

    def choose_action(self, step, obs):
        # self.replay_buffer_idx = self.get_replay_buffer_idx(obs)
        self._settle_replay_buffer_id(obs)

        train_have_started = self.train_have_started
        is_new_exploration_decision = self.is_new_exploration_decision
        get_randint_actions = self.get_randint_actions
        encodeRecentObservationsReplayBuffer = self.encodeRecentObservationsReplayBuffer

        continuous_decision = lambda step_cached: (train_have_started(step_cached)
                                                   or is_new_exploration_decision(step_cached))

        if continuous_decision(step):
            # take a random action
            action = get_randint_actions()
        else:
            # take the action that results in the maximum q value
            recent_obs = encodeRecentObservationsReplayBuffer()
            base_model = self.base_model
            arr_recent_obs = np.array([recent_obs])
            base_model_predicted = base_model.predict_on_batch(arr_recent_obs)
            q_vals = base_model_predicted.flatten()
            action = np.argmax(q_vals)
        return action

    def learn(self, step, action, reward, done, info=None):
        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward, done)
        if step > self.training_starts and step % self.training_freq == 0:
            self._train()
        if step > self.training_starts and step % self.target_update_freq == 0:
            self._update_target()

    def eval_iters(self):
        optimizer = self.base_model.optimizer
        iterations_optimizer = optimizer.iterations
        eval_iterations = K.eval(iterations_optimizer)
        return eval_iterations

    def mul_decay_iters(self):
        optimizer = self.base_model.optimizer
        evaluated_iters = self.eval_iters()
        evaluated_mul_decay_iters = K.eval(optimizer.decay * evaluated_iters)
        return evaluated_mul_decay_iters

    def normalize_params(self):
        mul_decay_evaluated_iters = self.mul_decay_iters()
        normalization = 1. / (1. + mul_decay_evaluated_iters)
        return normalization

    def get_learning_rate(self):
        optimizer = self.base_model.optimizer
        # lr = K.eval(optimizer.lr * (1. / (1. + optimizer.decay * optimizer.iterations)))
        params_norm = self.normalize_params()
        lr = K.eval(optimizer.lr * params_norm)
        return lr

    def get_avg_loss(self):
        latest_losses = self.latest_losses
        if len(latest_losses) > 0:
            latest_losses = np.array(latest_losses, dtype=np.float32)
            return np.mean(latest_losses)
        else:
            return None

    def _train(self):
        obs_t, action, reward, obs_t1, done_mask = self.replay_buffer.sample(self.training_batch_size)
        q = self.base_model.predict(obs_t)
        q_t1 = self.target_model.predict(obs_t1)
        q_t1_max = np.max(q_t1, axis=1)
        # Bellman update on the actions actually taken:
        # q[i][action[i]] = reward[i] + gamma * max_a' Q_target(s', a') * (1 - done[i])
        # Note: despite the class name, this is the standard DQN target (max over the
        # target network); a double-DQN target would pick the argmax action with the base network.
        q[range(len(action)), action] = reward + q_t1_max * self.reward_decay * (1 - done_mask)
        loss = self.base_model.train_on_batch(obs_t, q)
        self.latest_losses.append(loss)

    def _update_target(self):
        weights = self.base_model.get_weights()
        self.target_model.set_weights(weights)
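# A minimal sketch of the interaction loop the DoubleDQN class above implies:
# choose_action() stores the current frame and returns an action, learn()
# stores the effect and periodically trains the base model and syncs the
# target model. The gym-style environment and the loop itself are assumptions;
# q_model() must be defined elsewhere for DoubleDQN to construct its networks.

def run_double_dqn(env, total_steps=1000000):
    agent = DoubleDQN(image_shape=env.observation_space.shape,
                      num_actions=env.action_space.n)
    obs = env.reset()
    for step in range(total_steps):
        action = agent.choose_action(step, obs)        # stores the frame internally
        obs, reward, done, info = env.step(action)
        agent.learn(step, action, reward, done, info)  # stores the effect; may train / sync
        if done:
            obs = env.reset()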
num_param_updates = 0
mean_episode_reward = -float('nan')
best_mean_episode_reward = -float('inf')
last_obs = env.reset()
episodes_rewards = []

for t in count():
    ### Step the env and store the transition
    # Store the latest observation in replay memory; last_idx can be used to store action, reward, done
    last_idx = replay_buffer.store_frame(last_obs)
    print(last_idx, last_obs.shape)
    # encode_recent_observation will take the latest observation
    # that you pushed into the buffer and compute the corresponding
    # input that should be given to a Q network by appending some
    # previous frames.
    recent_observations = replay_buffer.encode_recent_observation()
    print(recent_observations.shape)

    guard_action, invader_action = env.act()
    # Choose a random action until learning starts
    if t > LEARNING_STARTS:
        action = select_epilson_greedy_action(Q, recent_observations, t).item()
        print(action)
    else:
        action = random.randrange(NUM_ACTIONS)
        print(action)
    print(guard_action, invader_action)
    # Advance one step
    obs, reward, done, _ = env.step(guard_action, invader_action)
    print(reward)