Example #1
class DataManager:
    def __init__(self, verbose=False):
        self.is_online = False
        self.verbose = verbose

    def init_online(self, foxnet, session, batch_size, replay_buffer_size, frames_per_state, ip, image_height,
                    image_width, epsilon, user_overwrite=False):
        self.is_online = True
        self.foxnet = foxnet
        self.session = session
        self.batch_size = batch_size
        self.epsilon = epsilon

        # Allow the player to overwrite actions for faster learning.
        self.user_overwrite = user_overwrite

        # Initialize ReplayBuffer.
        self.replay_buffer = ReplayBuffer(replay_buffer_size, frames_per_state)

        # Initialize emulator transfers
        self.frame_reader = FrameReader(ip, image_height, image_width)
        self.health_extractor = HealthExtractor()
        self.reward_extractor = RewardExtractor()
        self.menu_navigator = MenuNavigator()

        # Keep full image for reward extraction.
        frame, full_image = self.frame_reader.read_frame()
        self.prev_frame = frame
        self.prev_full_image = full_image

        # Remember the health from the previous frame.
        self.prev_health = None

    def init_offline(self, use_test_set, data_params, batch_size):
        self.is_online = False
        self.user_overwrite = False
        self.epsilon = 0  # Not used in offline mode.

        # Load the two pertinent datasets into train_dataset and eval_dataset
        if use_test_set:
            train_dataset, eval_dataset = load_datasets('test', data_params)
        else:
            train_dataset, eval_dataset = load_datasets('dev', data_params)

        self.s_train, self.a_train, scores_train, h_train = train_dataset
        self.s_eval, self.a_eval, scores_test, h_test = eval_dataset

        # Compute the reward given scores and health. Currently, this just adds the two, weighting each one equally.
        self.r_train = np.add(scores_train, h_train)
        self.r_test = np.add(scores_test, h_test)

        self.batch_size = batch_size

    def init_epoch(self, for_eval=False):
        self.batch_iteration = -1

        if self.is_online:
            pass
        else:
            if for_eval:  # For evaluation, an "epoch" is the entire validation set.
                self.epoch_indices = np.arange(self.s_eval.shape[0])
            else:
                self.epoch_indices = np.arange(self.s_train.shape[0])
            np.random.shuffle(self.epoch_indices)

    def has_next_batch(self, for_eval=False):
        if self.is_online:
            return True
        else:
            if for_eval:
                num_batch_iterations = int(math.ceil(self.s_eval.shape[0] / self.batch_size))
            else:
                num_batch_iterations = int(math.ceil(self.s_train.shape[0] / self.batch_size))
            # +1 because batch_iteration is incremented at the start of get_next_batch().
            return self.batch_iteration + 1 < num_batch_iterations

    def get_next_batch(self, for_eval=False):
        s_batch = []
        a_batch = []
        r_batch = []
        max_score_batch = 0

        self.batch_iteration += 1
        frame_skip = 5

        if self.is_online:
            frame = self.prev_frame
            full_image = self.prev_full_image

            # Play the game for batch_size frames (or until the replay buffer can be sampled).
            i = 0
            last_action_str = 'n'
            last_frame_was_a_menu = False
            while i < self.batch_size or not self.replay_buffer.can_sample(self.batch_size):
                i += 1
                for j in np.arange(frame_skip):
                    self.frame_reader.send_action(last_action_str)
                    frame, full_image = self.frame_reader.read_frame()

                # As soon as the frame is the main menu, select the first option.
                while self.menu_navigator.is_image_menu(full_image):
                    # Alternate actions between l and j because j selects the option, but holding j does nothing.
                    action_str = np.random.choice(['l', 'j'])
                    if self.verbose:
                        print('MENU DETECTED: Pressing l or j. '
                              'Taking action: %s' % action_str)
                    self.frame_reader.send_action(action_str)
                    frame, full_image = self.frame_reader.read_frame()

                # Store the most recent frame and get the past frames_per_state frames that define the current state.
                replay_buffer_index = self.replay_buffer.store_frame(np.squeeze(frame))
                state = self.replay_buffer.encode_recent_observation()
                state = np.expand_dims(state, 0)

                # Get the best action to take in the current state.
                if last_frame_was_a_menu:
                    # We are not actually playing a level, so press 'l' or 'j' to get through the current menu/video.
                    action_str = np.random.choice(['l', 'j'])
                    if self.verbose:
                        print('NO SCORE DETECTED: Pressing l or j. '
                              'Taking action: %s' % action_str)
                else:
                    feed_dict = {self.foxnet.X: state, self.foxnet.is_training: False}
                    q_values_it = self.session.run(self.foxnet.probs, feed_dict=feed_dict)

                    action_str = 'n'

                    if self.user_overwrite:
                        action_str = self.frame_reader.get_keys()

                    # If user-overwrite is on but the player gave no input, fall back to e-greedy.
                    if action_str == 'n':
                        # e-greedy exploration.
                        if np.random.uniform() >= self.epsilon:
                            action_str = self.foxnet.available_actions[np.argmax(q_values_it)]
                        else:
                            action_str = np.random.choice(self.foxnet.available_actions)

                # Send action to emulator.
                self.frame_reader.send_action(action_str)

                # Remember this action for the next iteration.
                last_action_str = action_str

                # Determine the action index to store in the replay buffer.
                if last_frame_was_a_menu:
                    # If the last frame was a menu/video, pretend we just did a noop.
                    replay_buffer_str = self.foxnet.available_actions.index('n')
                else:
                    replay_buffer_str = self.foxnet.available_actions.index(action_str)

                # Get the next frame.
                new_frame, full_image = self.frame_reader.read_frame()

                # Get the reward (score + health).
                score_reward, score_is_not_digits = self.reward_extractor.get_reward(full_image)
                last_frame_was_a_menu = score_is_not_digits
                health_reward = self.health_extractor(full_image, offline=False)

                if self.verbose and not last_frame_was_a_menu:
                    print('Online reward extracted: score=%d\thealth=%f' % (score_reward, health_reward))

                # Check if we just died.
                if self.prev_health and self.prev_health > 0 and health_reward == 0:
                    # Agent just died.
                    if self.verbose:
                        print('Agent just died. Setting health reward to -10.')
                    health_reward = -10
                self.prev_health = health_reward

                reward = score_reward + health_reward
                max_score_batch = max(score_reward, max_score_batch)

                # Store the <s,a,r,s'> transition.
                self.replay_buffer.store_effect(replay_buffer_index, replay_buffer_str, reward, False)
                frame = new_frame

            self.prev_frame = frame
            self.prev_full_image = full_image

            s_batch, a_batch, r_batch, _, _ = self.replay_buffer.sample(self.batch_size)
        else:
            # Choose which data to batch
            if for_eval:
                s_to_batch = self.s_eval
                a_to_batch = self.a_eval
                r_to_batch = None
            else:
                s_to_batch = self.s_train
                a_to_batch = self.a_train
                r_to_batch = self.r_train

            # Generate indices for the batch.
            start_idx = (self.batch_iteration * self.batch_size) % s_to_batch.shape[0]
            idx = self.epoch_indices[start_idx: start_idx + self.batch_size]

            s_batch = s_to_batch[idx, :]
            a_batch = a_to_batch[idx]
            if not for_eval:
                r_batch = r_to_batch[idx]

        # print('Max score for current batch: %d' % max_score_batch)
        return s_batch, a_batch, r_batch, max_score_batch
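
The DataManager above (and the training loops below) assume a ReplayBuffer exposing store_frame, encode_recent_observation, store_effect, can_sample and sample, but the class itself is not shown in these examples. The following is a minimal sketch of that interface for orientation only; it is an assumption about the API (naive frame stacking, episode boundaries ignored when sampling), not the buffer these projects actually ship.

import numpy as np


class MinimalReplayBuffer:
    """Illustrative stand-in for the ReplayBuffer API used in these examples."""

    def __init__(self, size, frames_per_state):
        self.size = size
        self.frames_per_state = frames_per_state
        self.frames, self.actions, self.rewards, self.dones = [], [], [], []

    def store_frame(self, frame):
        # Append the newest frame (evicting the oldest when full) and return its index.
        if len(self.frames) == self.size:
            for buf in (self.frames, self.actions, self.rewards, self.dones):
                buf.pop(0)
        self.frames.append(frame)
        self.actions.append(0)
        self.rewards.append(0.0)
        self.dones.append(False)
        return len(self.frames) - 1

    def store_effect(self, idx, action, reward, done):
        # Record the action taken after frame idx, the reward observed, and whether the episode ended.
        self.actions[idx], self.rewards[idx], self.dones[idx] = action, reward, done

    def encode_recent_observation(self):
        # Stack the most recent frames_per_state frames into a single state.
        return np.stack(self.frames[-self.frames_per_state:], axis=-1)

    def can_sample(self, batch_size):
        return len(self.frames) - 1 >= batch_size

    def sample(self, batch_size):
        # Uniformly sample <s, a, r, s'> transitions (the newest frame has no successor yet).
        idx = np.random.randint(0, len(self.frames) - 1, size=batch_size)
        s = np.stack([self.frames[i] for i in idx])
        s_next = np.stack([self.frames[i + 1] for i in idx])
        a = np.array([self.actions[i] for i in idx])
        r = np.array([self.rewards[i] for i in idx])
        done = np.array([self.dones[i] for i in idx], dtype=np.float32)
        return s, a, r, s_next, done
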
Example #2
def main_loop(handle, possible_actions: list, model: Model,
              target_model: Model):
    exp_schedule = ExplorationScheduler()
    target_model.load_state_dict(model.state_dict())
    optimizer = torch.optim.RMSprop(model.parameters())
    with mss() as sct:
        counter = 0
        frame_counter = 0
        frame_skip_counter = 0
        score = 0
        lives = 3
        frame_times = [0, 0, 0, 0]
        replay_buffer = ReplayBuffer(
            REPLAY_BUFFER_SIZE, (3 * FRAMES_FEED, RESIZE_HEIGHT, RESIZE_WIDTH),
            FRAMES_FEED,
            baseline_priority=1,
            gamma=GAMMA,
            reward_steps=N_STEP_REWARD)
        t = 0
        action = 0
        while True:
            if not active:
                time.sleep(0.5)  # Wait a bit, then check whether recording should resume.
                continue

            start_time = time.time()  # Start of per-frame timing.

            # Grab frames
            frame, frame_cv2 = grab_screen(monitor, sct)

            # Show frame
            if DEBUG:
                cv2.imshow('window1', frame_cv2)
            # Process a frame only when frame_skip_counter is 0; otherwise skip it.
            if frame_skip_counter == 0:
                reward, score, lives = get_reward(handle, lives, score)

                # print(action, reward)
                if replay_buffer.waiting_for_effect:
                    replay_buffer.add_effects(action, reward)
                replay_buffer.push_frame(frame)
                if (replay_buffer.buffer_init()
                        and np.random.random() > exp_schedule.value(t)):
                    action = choose_action(replay_buffer.encode_last_frame(),
                                           model)
                else:
                    action = np.random.randint(0, len(possible_actions))

                execute_actions([possible_actions[int(action)]])  # dk.SCANCODES["z"]

                # Logic to deal with a ready datapoint
                if replay_buffer.can_sample(BATCH_SIZE) and t % TRAIN_FREQ == 0:
                    if PAUSE_ON_TRAIN:
                        pause_game()
                    for _ in range(BATCHES_PER_TRAIN):
                        optimize_model(model,
                                       target_model,
                                       replay_buffer,
                                       optimizer,
                                       num_actions=len(possible_actions))
                    if PAUSE_ON_TRAIN:
                        pause_game()

                # Periodically save the model and copy its weights to the target network.
                if t % TARGET_MODEL_UPDATE_FREQ == 0:
                    print("Saving model")
                    state_dict = model.state_dict()
                    torch.save(state_dict, MODEL_PATH)
                    print("done pickling")
                    target_model.load_state_dict(state_dict)
                    target_model.eval()

            frame_skip_counter += 1
            frame_skip_counter = frame_skip_counter % FRAMES_SKIP

            # Frame timings and other utility
            end_time = time.time()
            frame_time = end_time - start_time
            frame_times[counter % 4] = frame_time
            t += 1
            # if counter % 4 == 0:
            #    print("frame time: %s" % (np.mean(frame_times)))
            counter += 1
            if cv2.waitKey(25) & 0xFF == ord('q'):
                cv2.destroyAllWindows()
                break
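
main_loop above calls exp_schedule.value(t) to get the exploration probability at step t, but the ExplorationScheduler class is not shown. A linearly annealed schedule is a common choice and a plausible stand-in; the class below is an assumed sketch, not the project's actual scheduler, and its default parameters are arbitrary.

class LinearExplorationScheduler:
    """Assumed sketch: linearly anneal epsilon from eps_start down to eps_end."""

    def __init__(self, eps_start=1.0, eps_end=0.05, anneal_steps=100000):
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.anneal_steps = anneal_steps

    def value(self, t):
        # Fraction of the annealing period already elapsed, clipped to [0, 1].
        frac = min(max(t / self.anneal_steps, 0.0), 1.0)
        return self.eps_start + frac * (self.eps_end - self.eps_start)

With a schedule like this, the check np.random.random() > exp_schedule.value(t) in the loop above picks the greedy action with probability 1 - epsilon(t), which grows as training proceeds.
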
Example #3
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.
    You can specify your own convnet using q_func.
    All schedules are w.r.t. total number of steps taken in the environment.
    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.MultiDiscrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    # num_actions = env.action_space.shape
    num_actions = 13  # Hard-coded to match the 13 button combinations in to_mario_act below.

    # Construct an epsilon-greedy policy with the given exploration schedule.
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return model(Variable(obs,
                                  volatile=True)).data.max(1)[1].view(-1,
                                                                      1).cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Map a discrete action index to a Mario button vector, e.g. [0, 0, 0, 1, 1, 0].
    def to_mario_act(action, num_actions):
        """
        action = action % num_actions
        if action == 0:
            # Move right while jumping
            action_onehot = np.array([0, 0, 0, 1, 1, 0])
        else:
            action_onehot = np.zeros(num_actions, dtype=int)
            action_onehot[action] = 1
	"""
        action_list = [[0, 0, 0, 1, 1, 0], [1, 0, 0, 0, 0, 0],
                       [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0],
                       [0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 1, 0],
                       [0, 0, 0, 0, 0, 1], [0, 1, 0, 0, 1, 0],
                       [0, 1, 0, 0, 0, 1], [0, 0, 0, 1, 0, 1],
                       [0, 1, 0, 0, 1, 1], [0, 0, 0, 1, 1, 1],
                       [0, 0, 1, 0, 0, 1]]
        return action_list[action]

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can be used to store action, reward, done
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Take random actions until learning starts; afterwards use epsilon-greedy.
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(to_mario_act(action, num_actions))
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 -
                                                      done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q value, q_func takes only state and output value for every state-action pair
            # We choose Q based on action taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.view(-1, 1))
            """
            # DQN
            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for next Q to propagated
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0].view(-1, 1)
            next_Q_values = not_done_mask.view(-1, 1) * next_max_q
            """
            next_argmax_action = Q(next_obs_batch).max(1)[1].view(-1, 1)
            next_q = target_Q(next_obs_batch).detach().gather(
                1, next_argmax_action)
            next_Q_values = not_done_mask.view(-1, 1) * next_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch.view(-1, 1) + (gamma * next_Q_values)
            """
            # Compute Bellman error
            bellman_error = target_Q_values - current_Q_values
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_delta * -1 will be right gradient
            d_error = clipped_bellman_error * -1.0
         
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            current_Q_values.backward(d_error.data)
            """
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values)
            optimizer.zero_grad()
            loss.backward()
            # Clip gradients in place (clamp_ modifies the gradient; clamp alone would have no effect).
            for param in Q.parameters():
                param.grad.data.clamp_(-1, 1)
            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights to the target Q network.
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### Log progress and keep track of statistics
        episode_rewards = env.get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
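
The unlabeled block above (next_argmax_action / next_q) implements the Double DQN target, r + gamma * Q_target(s', argmax_a Q(s', a)), trained with a Huber loss. Below is a minimal, self-contained sketch of the same computation in current PyTorch, using plain tensors and torch.no_grad() instead of the deprecated Variable/volatile API; the function and tensor names are illustrative, not taken from the code above.

import torch
import torch.nn.functional as F


def double_dqn_loss(q_net, target_net, obs, act, rew, next_obs, done, gamma=0.99):
    """Double DQN Huber loss for one batch of transitions.

    obs/next_obs: float tensors (B, ...); act: long tensor (B,);
    rew: float tensor (B,); done: float tensor (B,), 1.0 at episode ends.
    """
    # Q(s, a) for the actions actually taken.
    current_q = q_net(obs).gather(1, act.view(-1, 1))

    with torch.no_grad():
        # The online network picks the argmax action; the target network evaluates it.
        next_actions = q_net(next_obs).argmax(dim=1, keepdim=True)
        next_q = target_net(next_obs).gather(1, next_actions)
        # Zero out the bootstrap term at episode boundaries.
        target_q = rew.view(-1, 1) + gamma * (1.0 - done.view(-1, 1)) * next_q

    return F.smooth_l1_loss(current_q, target_q)

Gradient clipping and the optimizer step then proceed exactly as in the loop above.
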
Example #4
            print(np.mean(episodes_rewards), t)
            episodes_rewards = []

            torch.save(Q, '../weights/Q_dqn_invader.pt')
            torch.save(target_Q, '../weights/target_Q_dqn_invader.pt')

        obs = env.reset()

    last_obs = obs
    exit()
    ### Perform experience replay and train the network.
    # Note that this is only done if the replay buffer contains enough samples
    # for us to learn something useful -- until then, the model will not be
    # initialized and random actions should be taken
    if (t > LEARNING_STARTS and t % LEARNING_FREQ == 0
            and replay_buffer.can_sample(BATCH_SIZE)):
        # Use the replay buffer to sample a batch of transitions
        # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
        # in which case there is no Q-value at the next state; at the end of an
        # episode, only the current state reward contributes to the target
        obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
            BATCH_SIZE)
        # Convert numpy nd_array to torch variables for calculation
        obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
        act_batch = Variable(torch.from_numpy(act_batch).long())
        rew_batch = Variable(torch.from_numpy(rew_batch))
        next_obs_batch = Variable(
            torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
        not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

        if USE_CUDA: