import math

import numpy as np

# ReplayBuffer, FrameReader, HealthExtractor, RewardExtractor, MenuNavigator,
# and load_datasets are provided by the surrounding project.


class DataManager:
    def __init__(self, verbose=False):
        self.is_online = False
        self.verbose = verbose

    def init_online(self, foxnet, session, batch_size, replay_buffer_size, frames_per_state,
                    ip, image_height, image_width, epsilon, user_overwrite=False):
        self.is_online = True
        self.foxnet = foxnet
        self.session = session
        self.batch_size = batch_size
        self.epsilon = epsilon

        # Allow the player to overwrite actions for faster learning.
        self.user_overwrite = user_overwrite

        # Initialize the ReplayBuffer.
        self.replay_buffer = ReplayBuffer(replay_buffer_size, frames_per_state)

        # Initialize emulator transfers.
        self.frame_reader = FrameReader(ip, image_height, image_width)
        self.health_extractor = HealthExtractor()
        self.reward_extractor = RewardExtractor()
        self.menu_navigator = MenuNavigator()

        # Keep the full image for reward extraction.
        frame, full_image = self.frame_reader.read_frame()
        self.prev_frame = frame
        self.prev_full_image = full_image

        # Remember the health from the previous frame.
        self.prev_health = None

    def init_offline(self, use_test_set, data_params, batch_size):
        self.is_online = False
        self.user_overwrite = False
        self.epsilon = 0  # Not used.

        # Load the two pertinent datasets into train_dataset and eval_dataset.
        if use_test_set:
            train_dataset, eval_dataset = load_datasets('test', data_params)
        else:
            train_dataset, eval_dataset = load_datasets('dev', data_params)

        self.s_train, self.a_train, scores_train, h_train = train_dataset
        self.s_eval, self.a_eval, scores_test, h_test = eval_dataset

        # Compute the reward given scores and health. Currently, this just adds
        # the two, weighting each one equally.
        self.r_train = np.add(scores_train, h_train)
        self.r_test = np.add(scores_test, h_test)

        self.batch_size = batch_size

    def init_epoch(self, for_eval=False):
        self.batch_iteration = -1
        if not self.is_online:
            if for_eval:
                # An "epoch" is the entire validation set.
                self.epoch_indices = np.arange(self.s_eval.shape[0])
            else:
                self.epoch_indices = np.arange(self.s_train.shape[0])
                np.random.shuffle(self.epoch_indices)

    def has_next_batch(self, for_eval=False):
        if self.is_online:
            return True
        if for_eval:
            num_batch_iterations = int(math.ceil(self.s_eval.shape[0] / self.batch_size))
        else:
            num_batch_iterations = int(math.ceil(self.s_train.shape[0] / self.batch_size))
        return self.batch_iteration < num_batch_iterations

    def get_next_batch(self, for_eval=False):
        s_batch = []
        a_batch = []
        r_batch = []
        max_score_batch = 0

        self.batch_iteration += 1
        frame_skip = 5

        if self.is_online:
            frame = self.prev_frame
            full_image = self.prev_full_image

            # Play the game for batch_size frames.
            i = 0
            last_action_str = 'n'
            last_frame_was_a_menu = False
            while i < self.batch_size or not self.replay_buffer.can_sample(self.batch_size):
                i += 1

                for j in np.arange(frame_skip):
                    self.frame_reader.send_action(last_action_str)
                    frame, full_image = self.frame_reader.read_frame()

                # As soon as the frame is the main menu, select the first option.
                while self.menu_navigator.is_image_menu(full_image):
                    # Alternate actions between l and j because j selects the option,
                    # but holding j does nothing.
                    action_str = np.random.choice(['l', 'j'])
                    if self.verbose:
                        print('MENU DETECTED: Pressing l or j. Taking action: %s' % action_str)
                    self.frame_reader.send_action(action_str)
                    frame, full_image = self.frame_reader.read_frame()

                # Store the most recent frame and get the past frames_per_state frames
                # that define the current state.
                replay_buffer_index = self.replay_buffer.store_frame(np.squeeze(frame))
                state = self.replay_buffer.encode_recent_observation()
                state = np.expand_dims(state, 0)

                # Get the best action to take in the current state.
                if last_frame_was_a_menu:
                    # We are not actually playing a level, so press 'l' or 'j' to get
                    # through the current menu/video.
                    action_str = np.random.choice(['l', 'j'])
                    if self.verbose:
                        print('NO SCORE DETECTED: Pressing l or j. Taking action: %s' % action_str)
                else:
                    feed_dict = {self.foxnet.X: state, self.foxnet.is_training: False}
                    q_values_it = self.session.run(self.foxnet.probs, feed_dict=feed_dict)

                    action_str = 'n'
                    if self.user_overwrite:
                        action_str = self.frame_reader.get_keys()

                    # If in user-overwrite mode and the player does not input anything,
                    # fall back to epsilon-greedy.
                    if action_str == 'n':
                        # Epsilon-greedy exploration.
                        if np.random.uniform() >= self.epsilon:
                            action_str = self.foxnet.available_actions[np.argmax(q_values_it)]
                        else:
                            action_str = np.random.choice(self.foxnet.available_actions)

                # Send the action to the emulator.
                self.frame_reader.send_action(action_str)

                # Remember this action for the next iteration.
                last_action_str = action_str

                # Determine the action we will send to the replay buffer.
                if last_frame_was_a_menu:
                    # If the last frame was a menu/video, pretend we just did a noop.
                    replay_buffer_action = self.foxnet.available_actions.index('n')
                else:
                    replay_buffer_action = self.foxnet.available_actions.index(action_str)

                # Get the next frame.
                new_frame, full_image = self.frame_reader.read_frame()

                # Get the reward (score + health).
                score_reward, score_is_not_digits = self.reward_extractor.get_reward(full_image)
                last_frame_was_a_menu = score_is_not_digits
                health_reward = self.health_extractor(full_image, offline=False)
                if self.verbose and not last_frame_was_a_menu:
                    print('Online reward extracted: score=%d\thealth=%f' % (score_reward, health_reward))

                # Check if we just died.
                if self.prev_health and self.prev_health > 0 and health_reward == 0:
                    # Agent just died.
                    if self.verbose:
                        print('Agent just died. Setting health reward to -10.')
                    health_reward = -10
                self.prev_health = health_reward

                reward = score_reward + health_reward
                max_score_batch = max(score_reward, max_score_batch)

                # Store the <s, a, r, s'> transition.
                self.replay_buffer.store_effect(replay_buffer_index, replay_buffer_action, reward, False)
                frame = new_frame

            self.prev_frame = frame
            self.prev_full_image = full_image

            s_batch, a_batch, r_batch, _, _ = self.replay_buffer.sample(self.batch_size)
        else:
            # Choose which data to batch.
            if for_eval:
                s_to_batch = self.s_eval
                a_to_batch = self.a_eval
                r_to_batch = None
            else:
                s_to_batch = self.s_train
                a_to_batch = self.a_train
                r_to_batch = self.r_train

            # Generate indices for the batch.
            start_idx = (self.batch_iteration * self.batch_size) % s_to_batch.shape[0]
            idx = self.epoch_indices[start_idx: start_idx + self.batch_size]

            s_batch = s_to_batch[idx, :]
            a_batch = a_to_batch[idx]
            if not for_eval:
                r_batch = r_to_batch[idx]

        # print('Max score for current batch: %d' % max_score_batch)
        return s_batch, a_batch, r_batch, max_score_batch
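# Usage sketch (not part of the original project): one way the offline batching
# interface above might be driven during supervised training. `data_params` and
# `train_step` are hypothetical placeholders for the project's real dataset
# options and training call; only the DataManager calls are taken from the code
# above.
def _example_offline_training(data_params, train_step, num_epochs=10):
    data_manager = DataManager(verbose=True)
    data_manager.init_offline(use_test_set=False, data_params=data_params, batch_size=32)
    for _ in range(num_epochs):
        data_manager.init_epoch()
        while data_manager.has_next_batch():
            s_batch, a_batch, r_batch, max_score = data_manager.get_next_batch()
            train_step(s_batch, a_batch, r_batch)  # hypothetical training call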
# Relies on module-level names defined elsewhere in this script: the mss, cv2,
# numpy (np), torch, and time imports; the Model, ExplorationScheduler, and
# ReplayBuffer classes; the grab_screen, get_reward, choose_action,
# execute_actions, optimize_model, and pause_game helpers; and the globals and
# constants it references (active, monitor, DEBUG, and the various *_SIZE,
# *_FREQ, and frame/training settings).
def main_loop(handle, possible_actions: list, model: Model, target_model: Model):
    exp_schedule = ExplorationScheduler()
    target_model.load_state_dict(model.state_dict())
    optimizer = torch.optim.RMSprop(model.parameters())

    with mss() as sct:
        counter = 0
        frame_counter = 0
        frame_skip_counter = 0
        score = 0
        lives = 3
        frame_times = [0, 0, 0, 0]
        replay_buffer = ReplayBuffer(
            REPLAY_BUFFER_SIZE,
            (3 * FRAMES_FEED, RESIZE_HEIGHT, RESIZE_WIDTH),
            FRAMES_FEED,
            baseline_priority=1,
            gamma=GAMMA,
            reward_steps=N_STEP_REWARD)
        t = 0
        action = 0

        while True:
            if not active:
                # Wait some time and check if recording should be resumed.
                time.sleep(0.5)
                continue

            start_millis = time.time()  # Time

            # Grab frames.
            frame, frame_cv2 = grab_screen(monitor, sct)

            # Show the frame.
            if DEBUG:
                cv2.imshow('window1', frame_cv2)

            # Check whether this frame is skipped; it is processed only when the
            # skip counter is 0.
            if frame_skip_counter == 0:
                reward, score, lives = get_reward(handle, lives, score)
                # print(action, reward)
                if replay_buffer.waiting_for_effect:
                    replay_buffer.add_effects(action, reward)
                replay_buffer.push_frame(frame)

                if replay_buffer.buffer_init() and np.random.random() > exp_schedule.value(t):
                    action = choose_action(replay_buffer.encode_last_frame(), model)
                else:
                    action = np.random.randint(0, len(possible_actions))
                execute_actions([possible_actions[int(action)]])  # dk.SCANCODES["z"]

                # Logic to deal with a ready datapoint.
                if replay_buffer.can_sample(BATCH_SIZE) and t % TRAIN_FREQ == 0:
                    if PAUSE_ON_TRAIN:
                        pause_game()
                    for _ in range(BATCHES_PER_TRAIN):
                        optimize_model(model, target_model, replay_buffer, optimizer,
                                       num_actions=len(possible_actions))
                    if PAUSE_ON_TRAIN:
                        pause_game()

                # Copy model weights to the target model.
                if t % TARGET_MODEL_UPDATE_FREQ == 0:
                    print("Saving model")
                    state_dict = model.state_dict()
                    torch.save(state_dict, MODEL_PATH)
                    print("done pickling")
                    target_model.load_state_dict(state_dict)
                    target_model.eval()

            frame_skip_counter += 1
            frame_skip_counter = frame_skip_counter % FRAMES_SKIP

            # Frame timings and other utility.
            end_millis = time.time()
            frame_time = end_millis - start_millis
            frame_times[counter % 4] = frame_time
            t += 1
            # if counter % 4 == 0:
            #     print("frame time: %s" % (np.mean(frame_times)))
            counter += 1

            if cv2.waitKey(25) & 0xFF == ord('q'):
                cv2.destroyAllWindows()
                break
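# Usage sketch (not part of the original script): a hypothetical launcher for
# main_loop above. The window handle, the action names, and Model's constructor
# signature are placeholders for whatever the surrounding project actually
# defines.
def _example_launch(handle, possible_actions):
    model = Model(len(possible_actions))         # assumption: Model takes the action count
    target_model = Model(len(possible_actions))
    main_loop(handle, possible_actions, model, target_model)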
# Relies on module-level names defined elsewhere in this script: dtype,
# USE_CUDA, Statistic, and the usual imports (gym, random, sys, pickle,
# numpy as np, torch, torch.nn.functional as F, torch.autograd.Variable,
# itertools.count, ReplayBuffer).
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run the Deep Q-learning algorithm.

    You can specify your own convnet using q_func. All schedules are w.r.t. the
    total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the Q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of the input.
            num_actions: int
                number of actions.
    optimizer_spec: OptimizerSpec
        Specifies the constructor and kwargs, as well as the learning rate
        schedule, for the optimizer.
    exploration: Schedule (defined in utils.schedule)
        Schedule for the probability of choosing a random action.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount factor.
    learning_starts: int
        After how many environment steps to start replaying experiences.
    learning_freq: int
        How many steps of the environment to take between every experience replay.
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between each
        update to the target Q network.
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.MultiDiscrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM).
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    # num_actions = env.action_space.shape
    num_actions = 13

    # Construct an epsilon-greedy policy with the given exploration schedule.
    def select_epsilon_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile=True if the variable is only used in inference mode,
            # i.e. don't save the history.
            return model(Variable(obs, volatile=True)).data.max(1)[1].view(-1, 1).cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Convert an action index to a Mario button array, e.g. [0, 0, 0, 1, 1, 0].
    def to_mario_act(action, num_actions):
        """
        action = action % num_actions
        if action == 0:
            # Move right while jumping
            action_onehot = np.array([0, 0, 0, 1, 1, 0])
        else:
            action_onehot = np.zeros(num_actions, dtype=int)
            action_onehot[action] = 1
        """
        action_list = [[0, 0, 0, 1, 1, 0],
                       [1, 0, 0, 0, 0, 0],
                       [0, 1, 0, 0, 0, 0],
                       [0, 0, 1, 0, 0, 0],
                       [0, 0, 0, 1, 0, 0],
                       [0, 0, 0, 0, 1, 0],
                       [0, 0, 0, 0, 0, 1],
                       [0, 1, 0, 0, 1, 0],
                       [0, 1, 0, 0, 0, 1],
                       [0, 0, 0, 1, 0, 1],
                       [0, 1, 0, 0, 1, 1],
                       [0, 0, 0, 1, 1, 1],
                       [0, 0, 1, 0, 0, 1]]
        return action_list[action]

    # Initialize the target Q function and the Q function.
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct the Q network optimizer.
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer.
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can be used to
        # store the action, reward, and done flag.
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation that you
        # pushed into the buffer and compute the corresponding input that should
        # be given to a Q network by appending some previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action if learning has not yet started.
        if t > learning_starts:
            action = select_epsilon_greedy_action(Q, recent_observations, t)[0, 0]
        else:
            action = random.randrange(num_actions)
        # Advance one step.
        obs, reward, done, _ = env.step(to_mario_act(action, num_actions))
        # Clip rewards between -1 and 1.
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory.
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Reset the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken.
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions.
            # Note: done_mask[i] is 1 if the next state corresponds to the end of
            # an episode, in which case there is no Q-value at the next state; at
            # the end of an episode, only the current state reward contributes to
            # the target.
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            # Convert numpy ndarrays to torch Variables for calculation.
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)
            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q values. q_func takes only the state and outputs a
            # value for every state-action pair; we select the Q value of the
            # action that was actually taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.view(-1, 1))
            """
            # DQN
            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for next Q to propagated
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0].view(-1, 1)
            next_Q_values = not_done_mask.view(-1, 1) * next_max_q
            """
            # Double DQN: select the next action with the online network and
            # evaluate it with the target network.
            next_argmax_action = Q(next_obs_batch).max(1)[1].view(-1, 1)
            next_q = target_Q(next_obs_batch).detach().gather(1, next_argmax_action)
            next_Q_values = not_done_mask.view(-1, 1) * next_q
            # Compute the target of the current Q values.
            target_Q_values = rew_batch.view(-1, 1) + (gamma * next_Q_values)
            """
            # Compute Bellman error
            bellman_error = target_Q_values - current_Q_values
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_delta * -1 will be right gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            current_Q_values.backward(d_error.data)
            """
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values)
            optimizer.zero_grad()
            loss.backward()
            for param in Q.parameters():
                # clamp_ (in-place) is required here; the original non-in-place
                # clamp() had no effect on the gradients.
                param.grad.data.clamp_(-1, 1)

            # Perform the update.
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights to the target Q network.
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_rewards = env.get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t,))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle.
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
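# Usage sketch (not part of the original file): constructing the pieces that
# dqn_learing expects. The OptimizerSpec fields mirror how the function uses
# them (constructor + kwargs); the constant-epsilon schedule, the hyperparameter
# values, and _example_train are illustrative assumptions, not the project's
# actual settings. If the surrounding module already defines OptimizerSpec and
# an exploration Schedule, those should be used instead.
from collections import namedtuple

import torch.optim as optim

OptimizerSpec = namedtuple('OptimizerSpec', ['constructor', 'kwargs'])


class ConstantSchedule:
    """Stand-in exploration schedule with a fixed epsilon; the real code uses a
    Schedule from utils.schedule (typically linearly annealed)."""

    def __init__(self, eps):
        self.eps = eps

    def value(self, t):
        return self.eps


def _example_train(env, q_func):
    # env is assumed to be a gym-super-mario style environment with a
    # MultiDiscrete action space; q_func is the convnet constructor.
    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=0.00025, alpha=0.95, eps=0.01),
    )
    dqn_learing(env, q_func, optimizer_spec, ConstantSchedule(0.1))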
print(np.mean(episodes_rewards), t)
episodes_rewards = []
torch.save(Q, '../weights/Q_dqn_invader.pt')
torch.save(target_Q, '../weights/target_Q_dqn_invader.pt')
obs = env.reset()
last_obs = obs
exit()

### Perform experience replay and train the network.
# Note that this is only done if the replay buffer contains enough samples
# for us to learn something useful -- until then, the model will not be
# initialized and random actions should be taken
if (t > LEARNING_STARTS and
        t % LEARNING_FREQ == 0 and
        replay_buffer.can_sample(BATCH_SIZE)):
    # Use the replay buffer to sample a batch of transitions
    # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
    # in which case there is no Q-value at the next state; at the end of an
    # episode, only the current state reward contributes to the target
    obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(BATCH_SIZE)
    # Convert numpy nd_array to torch variables for calculation
    obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
    act_batch = Variable(torch.from_numpy(act_batch).long())
    rew_batch = Variable(torch.from_numpy(rew_batch))
    next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
    not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)
    if USE_CUDA: