Example 1
    def evaluate(self, env=None, num_episodes=None):
        if env is None: env = self.env
        if num_episodes is None:
            self.logger.info("Evaluating...")
            num_episodes = self.config.num_episodes_test

        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = []
        for i in range(num_episodes):
            sum_reward = 0
            state = env.reset()
            while True:
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()
                action = env.action_space.sample()
                if self.config.soft_epsilon < np.random.random():
                    action = np.argmax(
                        self.sess.run(self.q, feed_dict={self.s:
                                                         [q_input]})[0])
                new_state, reward, done, info = env.step(action)
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state
                sum_reward += reward
                if done: break
            rewards.append(sum_reward)

        avg_reward = np.mean(rewards)
        if num_episodes > 1:
            self.logger.info("Average reward: {:04.2f}".format(avg_reward))
        return avg_reward
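
A note on the shared dependency: every example on this page drives the same small ReplayBuffer interface (store_frame, encode_recent_observation, store_effect, can_sample, sample). The class below is a deliberately simplified sketch of that contract, reconstructed from the calls above rather than taken from any of these projects; it omits frame stacking, eviction, and the compact uint8 storage a real buffer would use.

import random
import numpy as np

class MinimalReplayBuffer(object):
    """Illustrative stand-in with the same method names as the buffer used above."""

    def __init__(self, size, state_history):
        self.size = size                    # capacity (eviction omitted in this sketch)
        self.state_history = state_history  # frames per state (stacking omitted in this sketch)
        self.obs, self.actions, self.rewards, self.dones = [], [], [], []

    def store_frame(self, frame):
        # Store the newest observation and return its index for store_effect().
        self.obs.append(np.asarray(frame))
        self.actions.append(None)
        self.rewards.append(0.0)
        self.dones.append(False)
        return len(self.obs) - 1

    def encode_recent_observation(self):
        # A real buffer concatenates the last `state_history` frames along the
        # channel axis; this sketch simply returns the most recent frame.
        return self.obs[-1]

    def store_effect(self, idx, action, reward, done):
        self.actions[idx], self.rewards[idx], self.dones[idx] = action, reward, done

    def can_sample(self, batch_size):
        return len(self.obs) - 1 >= batch_size

    def sample(self, batch_size):
        # Uniformly sample (s, a, r, s', done) tuples; the newest frame is
        # excluded because its successor has not been stored yet.
        idxs = random.sample(range(len(self.obs) - 1), batch_size)
        obs = np.stack([self.obs[i] for i in idxs])
        act = np.array([self.actions[i] for i in idxs])
        rew = np.array([self.rewards[i] for i in idxs], dtype=np.float32)
        next_obs = np.stack([self.obs[i + 1] for i in idxs])
        done_mask = np.array([1.0 if self.dones[i] else 0.0 for i in idxs],
                             dtype=np.float32)
        return obs, act, rew, next_obs, done_mask
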
Example 2
    def train(self, exp_schedule, lr_schedule):
        replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
        rewards = deque(maxlen=self.config.num_episodes_test)
        max_q_values = deque(maxlen=1000)
        q_values = deque(maxlen=1000)

        t = last_eval = last_record = 0
        scores_eval = [] # scores for plot
        scores_eval += [self.evaluate()]
        
        while t < self.config.nsteps_train:
            sum_reward = 0
            state = self.env.reset()
            while True:
                t += 1
                last_eval += 1
                last_record += 1

                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # compute q values for the current state and pick an epsilon-greedy action
                action_values = self.sess.run(self.q, feed_dict={self.s: [q_input]})[0]
                best_action = np.argmax(action_values)
                action = exp_schedule.get_action(best_action)

                # track q values for logging (do not overwrite the q_values deque)
                max_q_values.append(max(action_values))
                q_values += list(action_values)
                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                loss_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)
                self.get_log(exp_schedule, lr_schedule, t, loss_eval, max_q_values, rewards)
                sum_reward += reward
                if done or t >= self.config.nsteps_train: break

            rewards.append(sum_reward)          

            if t > self.config.learning_start:
                if last_eval > self.config.eval_freq:
                    last_eval = 0
                    scores_eval += [self.evaluate()]

                elif self.config.record and (last_record > self.config.record_freq):
                    self.logger.info("Recording...")
                    last_record = 0
                    self.record()

        self.logger.info("*** Training is done.")
        self.saver.save(self.sess, self.config.model_output2, global_step=t)
        scores_eval += [self.evaluate()]
        export_plot(scores_eval, "Scores", self.config.plot_output)
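
The train() loop above leans on two schedule objects that are not shown here: exp_schedule, whose get_action() turns the greedy action into an epsilon-greedy one, and lr_schedule, whose current value is read off its epsilon attribute. A minimal sketch of what such linearly annealed schedules could look like (an assumption based on how they are used, not these projects' own classes):

import numpy as np

class LinearSchedule(object):
    """Linearly anneal a value (an epsilon or a learning rate) from eps_begin to eps_end."""

    def __init__(self, eps_begin, eps_end, nsteps):
        self.epsilon = eps_begin   # current value, read as lr_schedule.epsilon above
        self.eps_begin = eps_begin
        self.eps_end = eps_end
        self.nsteps = nsteps

    def update(self, t):
        # Interpolate linearly for the first nsteps steps, then hold eps_end.
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)

class LinearExploration(LinearSchedule):
    """Adds epsilon-greedy action selection on top of the annealed epsilon."""

    def __init__(self, env, eps_begin, eps_end, nsteps):
        super(LinearExploration, self).__init__(eps_begin, eps_end, nsteps)
        self.env = env

    def get_action(self, best_action):
        # With probability epsilon act randomly, otherwise keep the greedy action.
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return best_action
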
Example 3
def evaluate(net, env=None, num_episodes=50):
    """
    Evaluation with the same procedure as the training
    """
    print("Evaluating...")
    # arguments defaults
    assert env is not None, "an environment must be provided"

    # replay memory to play
    replay_buffer = ReplayBuffer(1000000, 4)
    rewards = []

    for i in range(num_episodes):
        total_reward = 0
        state = env.reset()
        while True:

            # store last state in buffer
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            action = net.get_action(q_input)

            # perform action in env
            new_state, reward, done, info = env.step(action)

            # store in replay memory
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            # count reward
            total_reward += reward
            if done:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

    avg_reward = np.mean(rewards)
    sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

    if num_episodes > 1:
        msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
            avg_reward, sigma_reward)
        print(msg)
    return avg_reward
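
A hypothetical call to evaluate() above. The environment name and the RandomAgent stand-in (anything exposing get_action(state) would do) are placeholders for a real trained network; the env is assumed to use the old gym API where step() returns four values, as in the code above.

import gym

class RandomAgent(object):
    # Stand-in for a trained net: evaluate() only needs get_action(q_input).
    def __init__(self, action_space):
        self.action_space = action_space

    def get_action(self, q_input):
        return self.action_space.sample()

env = gym.make("Pong-v0")  # illustrative choice of environment
avg = evaluate(RandomAgent(env.action_space), env=env, num_episodes=5)
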
Example 4
class DataManager:
    def __init__(self, verbose=False):
        self.is_online = False
        self.verbose = verbose

    def init_online(self, foxnet, session, batch_size, replay_buffer_size, frames_per_state, ip, image_height,
                    image_width, epsilon, user_overwrite=False):
        self.is_online = True
        self.foxnet = foxnet
        self.session = session
        self.batch_size = batch_size
        self.epsilon = epsilon

        # Allow player to overwrite for faster learning
        self.user_overwrite = user_overwrite

        # Initialize ReplayBuffer.
        self.replay_buffer = ReplayBuffer(replay_buffer_size, frames_per_state)

        # Initialize emulator transfers
        self.frame_reader = FrameReader(ip, image_height, image_width)
        self.health_extractor = HealthExtractor()
        self.reward_extractor = RewardExtractor()
        self.menu_navigator = MenuNavigator()

        # Keep full image for reward extraction.
        frame, full_image = self.frame_reader.read_frame()
        self.prev_frame = frame
        self.prev_full_image = full_image

        # Remember the health from the previous frame.
        self.prev_health = None

    def init_offline(self, use_test_set, data_params, batch_size):
        self.is_online = False
        self.user_overwrite = False
        self.epsilon = 0 # Not used.

        # Load the two pertinent datasets into train_dataset and eval_dataset
        if use_test_set:
            train_dataset, eval_dataset = load_datasets('test', data_params)
        else:
            train_dataset, eval_dataset = load_datasets('dev', data_params)

        self.s_train, self.a_train, scores_train, h_train = train_dataset
        self.s_eval, self.a_eval, scores_test, h_test = eval_dataset

        # Compute the reward given scores and health. Currently, this just adds the two, weighting each one equally.
        self.r_train = np.add(scores_train, h_train)
        self.r_test = np.add(scores_test, h_test)

        self.batch_size = batch_size

    def init_epoch(self, for_eval=False):
        self.batch_iteration = -1

        if self.is_online:
            pass
        else:
            if for_eval: # "epoch" is entire validation set
                self.epoch_indices = np.arange(self.s_eval.shape[0])
            else:
                self.epoch_indices = np.arange(self.s_train.shape[0])
            np.random.shuffle(self.epoch_indices)

    def has_next_batch(self, for_eval=False):
        if self.is_online:
            return True
        else:
            if for_eval:
                num_batch_iterations = int(math.ceil(self.s_eval.shape[0] / self.batch_size))
            else:
                num_batch_iterations = int(math.ceil(self.s_train.shape[0] / self.batch_size))
            return self.batch_iteration < num_batch_iterations

    def get_next_batch(self, for_eval=False):
        s_batch = []
        a_batch = []
        r_batch = []
        max_score_batch = 0

        self.batch_iteration += 1
        frame_skip = 5

        if self.is_online:
            frame = self.prev_frame
            full_image = self.prev_full_image

            # Play the game until batch_size new frames have been collected and the buffer can be sampled.
            i = 0
            last_action_str = 'n'
            last_frame_was_a_menu = False
            while i < self.batch_size or not self.replay_buffer.can_sample(self.batch_size):
                i += 1
                for j in np.arange(frame_skip):
                    self.frame_reader.send_action(last_action_str)
                    frame, full_image = self.frame_reader.read_frame()

                # As soon as the frame is the main menu, select the first option.
                while self.menu_navigator.is_image_menu(full_image):
                    # Alternate actions between l and j because j selects the option, but holding j does nothing.
                    action_str = np.random.choice(['l', 'j'])
                    if self.verbose:
                        print('MENU DETECTED: Pressing l or j. '
                              'Taking action: %s' % action_str)
                    self.frame_reader.send_action(action_str)
                    frame, full_image = self.frame_reader.read_frame()

                # Store the most recent frame and get the past frames_per_state frames that define the current state.
                replay_buffer_index = self.replay_buffer.store_frame(np.squeeze(frame))
                state = self.replay_buffer.encode_recent_observation()
                state = np.expand_dims(state, 0)

                # Get the best action to take in the current state.
                if last_frame_was_a_menu:
                    # We are not actually playing a level, so press 'l' or 'j' to get through the current menu/video.
                    action_str = np.random.choice(['l', 'j'])
                    if self.verbose:
                        print('NO SCORE DETECTED: Pressing l or j. '
                              'Taking action: %s' % action_str)
                else:
                    feed_dict = {self.foxnet.X: state, self.foxnet.is_training: False}
                    q_values_it = self.session.run(self.foxnet.probs, feed_dict=feed_dict)

                    action_str = 'n'

                    if self.user_overwrite:
                        action_str = self.frame_reader.get_keys()

                    # If in user-overwrite and player does not input, do e-greedy
                    if action_str == 'n':
                        # e-greedy exploration.
                        if np.random.uniform() >= self.epsilon:
                            action_str = self.foxnet.available_actions[np.argmax(q_values_it)]
                        else:
                            action_str = np.random.choice(self.foxnet.available_actions)

                # Send action to emulator.
                self.frame_reader.send_action(action_str)

                # Remember this action for the next iteration.
                last_action_str = action_str

                # Determine the action we will send to the replay buffer.
                if last_frame_was_a_menu:
                    # If the last frame was a menu/video, pretend we just did a noop.
                    replay_buffer_str = self.foxnet.available_actions.index('n')
                else:
                    replay_buffer_str = self.foxnet.available_actions.index(action_str)

                # Get the next frame.
                new_frame, full_image = self.frame_reader.read_frame()

                # Get the reward (score + health).
                score_reward, score_is_not_digits = self.reward_extractor.get_reward(full_image)
                last_frame_was_a_menu = score_is_not_digits
                health_reward = self.health_extractor(full_image, offline=False)

                if self.verbose and not last_frame_was_a_menu:
                    print('Online reward extracted: score=%d\thealth=%f' % (score_reward, health_reward))

                # Check if we just died.
                if self.prev_health and self.prev_health > 0 and health_reward == 0:
                    # Agent just died.
                    if self.verbose:
                        print('Agent just died. Setting health reward to -10.')
                    health_reward = -10
                self.prev_health = health_reward

                reward = score_reward + health_reward
                max_score_batch = max(score_reward, max_score_batch)

                # Store the <s,a,r,s'> transition.
                self.replay_buffer.store_effect(replay_buffer_index, replay_buffer_str, reward, False)
                frame = new_frame

            self.prev_frame = frame
            self.prev_full_image = full_image

            s_batch, a_batch, r_batch, _, _ = self.replay_buffer.sample(self.batch_size)
        else:
            # Choose which data to batch
            if for_eval:
                s_to_batch = self.s_eval
                a_to_batch = self.a_eval
                r_to_batch = None
            else:
                s_to_batch = self.s_train
                a_to_batch = self.a_train
                r_to_batch = self.r_train

            # Generate indices for the batch.
            start_idx = (self.batch_iteration * self.batch_size) % s_to_batch.shape[0]
            idx = self.epoch_indices[start_idx: start_idx + self.batch_size]

            s_batch = s_to_batch[idx, :]
            a_batch = a_to_batch[idx]
            if not for_eval:
                r_batch = r_to_batch[idx]

        # print('Max score for current batch: %d' % max_score_batch)
        return s_batch, a_batch, r_batch, max_score_batch
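
A hypothetical offline training loop driven by the DataManager above. The data_params value is whatever load_datasets() expects (it is not shown in this example), and the actual gradient step is left as a comment.

dm = DataManager(verbose=True)
data_params = None  # placeholder: whatever load_datasets('dev'/'test', ...) expects
dm.init_offline(use_test_set=False, data_params=data_params, batch_size=32)

dm.init_epoch()
while dm.has_next_batch():
    s_batch, a_batch, r_batch, max_score_batch = dm.get_next_batch()
    # ... run one training step on (s_batch, a_batch, r_batch) here ...
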
Example 5
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.
    You can specify your own convnet using q_func.
    All schedules are w.r.t. total number of steps taken in the environment.
    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of input channels.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.MultiDiscrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    #num_actions = env.action_space.shape
    num_actions = 13

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return model(Variable(obs,
                                  volatile=True)).data.max(1)[1].view(-1,
                                                                      1).cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Map a discrete action index to a Mario button vector, e.g. [0, 0, 0, 1, 1, 0]
    def to_mario_act(action, num_actions):
        """
        action = action % num_actions
        if action == 0:
            # Move right while jumping
            action_onehot = np.array([0, 0, 0, 1, 1, 0])
        else:
            action_onehot = np.zeros(num_actions, dtype=int)
            action_onehot[action] = 1
	"""
        action_list = [[0, 0, 0, 1, 1, 0], [1, 0, 0, 0, 0, 0],
                       [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0],
                       [0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 1, 0],
                       [0, 0, 0, 0, 0, 1], [0, 1, 0, 0, 1, 0],
                       [0, 1, 0, 0, 0, 1], [0, 0, 0, 1, 0, 1],
                       [0, 1, 0, 0, 1, 1], [0, 0, 0, 1, 1, 1],
                       [0, 0, 1, 0, 0, 1]]
        return action_list[action]

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx is later used to store action, reward, done
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action until learning starts
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0,
                                                                             0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(to_mario_act(action, num_actions))
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 -
                                                      done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q value, q_func takes only state and output value for every state-action pair
            # We choose Q based on action taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.view(-1, 1))
            """
            # DQN
            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for next Q to propagated
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0].view(-1, 1)
            next_Q_values = not_done_mask.view(-1, 1) * next_max_q
            """
            next_argmax_action = Q(next_obs_batch).max(1)[1].view(-1, 1)
            next_q = target_Q(next_obs_batch).detach().gather(
                1, next_argmax_action)
            next_Q_values = not_done_mask.view(-1, 1) * next_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch.view(-1, 1) + (gamma * next_Q_values)
            """
            # Compute Bellman error
            bellman_error = target_Q_values - current_Q_values
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_delta * -1 will be right gradient
            d_error = clipped_bellman_error * -1.0
         
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            current_Q_values.backward(d_error.data)
            """
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values)
            optimizer.zero_grad()
            loss.backward()
            for param in Q.parameters():
                param.grad.data.clamp_(-1, 1)  # in-place clamp; clamp() alone has no effect
            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### Log progress and keep track of statistics
        episode_rewards = env.get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
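
dqn_learing() above only touches optimizer_spec.constructor, optimizer_spec.kwargs and exploration.value(t), so the objects it expects can be as small as the following sketch. The names and hyperparameter values here are illustrative assumptions, not the project's own definitions.

from collections import namedtuple
import torch.optim as optim

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

class LinearExplorationValue(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Linearly anneal from initial_p to final_p over schedule_timesteps.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

optimizer_spec = OptimizerSpec(constructor=optim.RMSprop,
                               kwargs=dict(lr=0.00025, alpha=0.95, eps=0.01))
exploration = LinearExplorationValue(1000000, final_p=0.1)
# dqn_learing(env, q_func=SomeConvNet, optimizer_spec=optimizer_spec,
#             exploration=exploration)  # env and SomeConvNet are defined elsewhere
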
Example 6
class DoubleDQN(object):
    def __init__(self,
                 image_shape,
                 num_actions,
                 frame_history_len=4,
                 replay_buffer_size=1000000,
                 training_freq=4,
                 training_starts=5000,
                 training_batch_size=32,
                 target_update_freq=1000,
                 reward_decay=0.99,
                 exploration=LinearSchedule(5000, 0.1),
                 log_dir="logs/"):
        """
            Double Deep Q Network
            params:
            image_shape: (height, width, n_values)
            num_actions: how many different actions we can choose
            frame_history_len: number of past frames fed as input to the deep Q network
            replay_buffer_size: size limit of the replay buffer
            training_freq: train the base q network once per training_freq steps
            training_starts: only train the q network after this number of steps
            training_batch_size: batch size for training the base q network with gradient descent
            target_update_freq: sync the target network with the base network once per this many steps
            reward_decay: decay factor (called gamma in the paper) applied to future rewards
            exploration: schedule used to generate the exploration rate (see 'epsilon-greedy' in the paper);
                         when rand(0, 1) < epsilon, take a random action, otherwise take the greedy action.
            log_dir: path to write tensorboard logs
        """
        super().__init__()
        self.num_actions = num_actions
        self.training_freq = training_freq
        self.training_starts = training_starts
        self.training_batch_size = training_batch_size
        self.target_update_freq = target_update_freq
        self.reward_decay = reward_decay
        self.exploration = exploration

        # use multiple frames as input to q network
        input_shape = image_shape[:-1] + (image_shape[-1] *
                                          frame_history_len, )
        # used to choose action
        self.base_model = q_model(input_shape, num_actions)
        self.base_model.compile(optimizer=optimizers.adam(clipnorm=10,
                                                          lr=1e-4,
                                                          decay=1e-6,
                                                          epsilon=1e-4),
                                loss='mse')
        # used to estimate q values
        self.target_model = q_model(input_shape, num_actions)

        self.replay_buffer = ReplayBuffer(size=replay_buffer_size,
                                          frame_history_len=frame_history_len)
        # current replay buffer offset
        self.replay_buffer_idx = 0

        self.tensorboard_callback = TensorBoard(log_dir=log_dir)
        self.latest_losses = deque(maxlen=100)

    def get_replay_buffer_idx(self, obs):
        return self.replay_buffer.store_frame(obs)

    def train_have_started(self, step):
        # Note: despite the name, this returns True while still in the warm-up
        # phase, i.e. before training starts (used to force random actions).
        return step < self.training_starts

    def is_new_exploration_decision(self, step):
        return (np.random.rand() < self.exploration.value(step))

    def get_randint_actions(self):
        return np.random.randint(self.num_actions)

    def encodeRecentObservationsReplayBuffer(self):
        return self.replay_buffer.encode_recent_observation()

    def _settle_replay_buffer_id(self, obs):
        self.replay_buffer_idx = self.get_replay_buffer_idx(obs)
        return self

    def choose_action(self, step, obs):
        #self.replay_buffer_idx = self.get_replay_buffer_idx(obs)
        self._settle_replay_buffer_id(obs)
        train_have_started = self.train_have_started
        is_new_exploration_decision = self.is_new_exploration_decision
        get_randint_actions = self.get_randint_actions
        encodeRecentObservationsReplayBuffer = self.encodeRecentObservationsReplayBuffer
        continuous_decision = lambda step_cached: train_have_started(
            step_cached) or is_new_exploration_decision(step_cached)
        if continuous_decision(step):
            # take random action
            action = get_randint_actions()
        else:
            # take action that results in maximum q value
            recent_obs = encodeRecentObservationsReplayBuffer()
            base_model = self.base_model
            arr_recent_obs = np.array([recent_obs])
            base_model_predicted = base_model.predict_on_batch(arr_recent_obs)
            q_vals = base_model_predicted.flatten()
            action = np.argmax(q_vals)
        return action

    def learn(self, step, action, reward, done, info=None):
        self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward,
                                        done)
        if step > self.training_starts and step % self.training_freq == 0:
            self._train()

        if step > self.training_starts and step % self.target_update_freq == 0:
            self._update_target()

    def eval_iters(self):
        optimizer = self.base_model.optimizer
        iterations_optimizer = optimizer.iterations
        eval_iterations = K.eval(iterations_optimizer)
        return eval_iterations

    def mul_decay_iters(self):
        optimizer = self.base_model.optimizer
        evaluated_iters = self.eval_iters()
        evaluated_mul_decay_iters = K.eval(optimizer.decay * evaluated_iters)
        return evaluated_mul_decay_iters

    def normalize_params(self):
        mul_decay_evaluated_iters = self.mul_decay_iters()
        normalization = (1. / (1. + mul_decay_evaluated_iters))
        return normalization

    def get_learning_rate(self):
        optimizer = self.base_model.optimizer
        #import pdb; pdb.set_trace()
        #lr = K.eval(optimizer.lr * (1. / (1. + optimizer.decay * optimizer.iterations)))
        evaluated_iters = self.eval_iters()
        params_norm = self.normalize_params()
        lr = K.eval(optimizer.lr * params_norm)
        return lr

    def get_avg_loss(self):
        latest_losses = self.latest_losses
        is_gt_zero_latest_losses = len(latest_losses) > 0
        if is_gt_zero_latest_losses:
            latest_losses = np.array(latest_losses, dtype=np.float32)
            mean_latest_losses = np.mean(latest_losses)
            return mean_latest_losses
        else:
            return None

    def _train(self):
        obs_t, action, reward, obs_t1, done_mask = self.replay_buffer.sample(
            self.training_batch_size)
        q = self.base_model.predict(obs_t)
        q_t1 = self.target_model.predict(obs_t1)
        q_t1_max = np.max(q_t1, axis=1)
        # print('q:\n', q)
        # print('q_t1:\n', q_t1)
        # print('q_t1_max:\n', q_t1_max)
        # print('action:\n', action)

        # for idx in range(len(q)):
        #     q[idx][action[idx]] = reward[idx] + q_t1_max[idx] * self.reward_decay * (1-done_mask[idx])
        q[range(len(action)),
          action] = reward + q_t1_max * self.reward_decay * (1 - done_mask)
        # print('reward:\n', reward)
        # print('qt1_max:\n', q_t1_max)
        # print('done mask:\n', done_mask)
        # print("q': \n", q)
        # self.base_model.fit(obs_t, q, batch_size=self.training_batch_size, epochs=1)
        loss = self.base_model.train_on_batch(obs_t, q)
        self.latest_losses.append(loss)

    def _update_target(self):
        weights = self.base_model.get_weights()
        # print('update target', weights)
        self.target_model.set_weights(weights)
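
A hypothetical interaction loop around the DoubleDQN agent above, assuming an old-style gym environment whose step() returns (obs, reward, done, info). The environment name and the step budget are illustrative only.

import gym

env = gym.make("Breakout-v0")
agent = DoubleDQN(image_shape=env.observation_space.shape,
                  num_actions=env.action_space.n)

obs = env.reset()
for step in range(100000):
    action = agent.choose_action(step, obs)    # also stores the frame in the replay buffer
    obs, reward, done, info = env.step(action)
    agent.learn(step, action, reward, done, info)
    if done:
        obs = env.reset()
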
Example 7
num_param_updates = 0
mean_episode_reward = -float('nan')
best_mean_episode_reward = -float('inf')
last_obs = env.reset()
episodes_rewards = []

for t in count():
    ### Step the env and store the transition
    # Store the latest observation in replay memory; last_idx is later used to store action, reward, done
    last_idx = replay_buffer.store_frame(last_obs)
    print(last_idx, last_obs.shape)
    # encode_recent_observation will take the latest observation
    # that you pushed into the buffer and compute the corresponding
    # input that should be given to a Q network by appending some
    # previous frames.
    recent_observations = replay_buffer.encode_recent_observation()
    print(recent_observations.shape)

    guard_action, invader_action = env.act()
    # Choose a random action until learning starts
    if t > LEARNING_STARTS:
        action = select_epilson_greedy_action(Q, recent_observations, t).item()
        print(action)
    else:
        action = random.randrange(NUM_ACTIONS)
        print(action)

    print(guard_action, invader_action)
    # Advance one step
    obs, reward, done, _ = env.step(guard_action, invader_action)
    print(reward)