Example #1
def evaluate_policy(policy, env, eval_episodes=10):
    reward_arr = np.zeros(eval_episodes)

    for i in range(eval_episodes):
        obs = env.reset()
        done = False
        total_reward = 0.
        while not done:
            feasible_actions = AllocationEnv.get_feasible_actions(
                obs["board_config"])
            action_mask = AllocationEnv.get_action_mask(
                feasible_actions, env.action_space.n)

            action, _states = policy.predict(obs, mask=action_mask)
            action = AllocationEnv.check_action(obs['board_config'], action)
            obs, reward, done, _ = env.step(action)
            total_reward += reward

        reward_arr[i] = total_reward

    avg_reward = reward_arr.mean()
    std_reward = reward_arr.std()

    print("---------------------------------------")
    print("Evaluation over {} episodes: {:.1f} ({:.2f})".format(
        eval_episodes, avg_reward, std_reward))
    print("---------------------------------------")
    return avg_reward, std_reward
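A minimal usage sketch for the helper above; `model` (a trained policy exposing predict(obs, mask=...)) and `env` (an AllocationEnv instance) are assumed to be created elsewhere and are not part of this example:

# Hypothetical usage: `model` and `env` are assumed to exist as described above.
avg_reward, std_reward = evaluate_policy(model, env, eval_episodes=20)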
Example #2
    def evaluate(self):
        gamma = .8
        rewards = []
        for i in range(self.n_episodes):

            self.queue = self.build_queue(self.buffer)

            r_i = 0
            state, _, _, _, _ = self.buffer.sample(batch_size=1)
            state = state[0]

            n_iter = 0
            cntr = 0

            while True:
                board_cfg = State.get_board_config_from_vec(
                    state,
                    n_regions=self.n_regions,
                    n_products=self.n_products)
                feasible_actions = AllocationEnv.get_feasible_actions(
                    board_cfg)
                action_mask = AllocationEnv.get_action_mask(
                    feasible_actions, self.n_actions)

                # M scales the acceptance threshold (1 / M) * pi(a|s) / mu(a|s);
                # originally computed via self.get_m(state, action_mask), fixed to 1 here
                M = 1
                try:
                    _, a, r, s_prime = self.queue[state].pop()
                except IndexError:
                    # no more stored transitions for this state
                    break

                alpha = random.random()

                prob_policy = self.policy.proba_step(state.reshape(1, -1),
                                                     mask=action_mask)[0][a]
                prob_env = self.env_policy.predict_proba(state)[a]

                rejection_tol = (1 / M) * prob_policy / prob_env

                n_iter += 1
                print(f"eps: {i+1} - iter: {n_iter} - success: {cntr}")

                if alpha > rejection_tol:
                    # reject this transition and keep sampling
                    continue
                else:
                    # accept: accumulate the discounted reward and advance the state
                    r_i += gamma ** cntr * r
                    state = s_prime
                    cntr += 1
            if r_i > 0:
                rewards.append(r_i)

        return rewards
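The accept/reject test in this example is standard rejection sampling over logged transitions: a transition with action a is kept when a uniform draw falls below (1 / M) * pi(a|s) / mu(a|s), where pi is the evaluation policy and mu the behavior (environment) policy. A self-contained sketch of just that test, with illustrative probabilities rather than values from the example:

import random

def accept_transition(prob_policy, prob_env, M=1.0):
    # Keep the sample when a uniform draw is below (1 / M) * pi(a|s) / mu(a|s)
    return random.random() <= (1.0 / M) * prob_policy / prob_env

# Example: the target policy takes the logged action twice as often as the behavior policy
print(accept_transition(prob_policy=0.4, prob_env=0.2, M=2.0))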
Example #3
    def learn(self):
        for i in range(self.epochs):
            print(f"Epoch {i}/{self.epochs}")
            pbar = tqdm(range(self.rollout_batch_size))
            for b in pbar:
                # start each rollout from a fresh state of the learned environment model
                state = self.env_model.reset()
                state = State.get_vec_observation(state)

                for h in range(self.rollout):
                    pbar.set_description(f"batch: {b} rollout: {h}")
                    board_cfg = State.get_board_config_from_vec(state,
                                                                n_regions=self.n_regions,
                                                                n_products=self.n_products
                                                                )

                    feasible_actions = AllocationEnv.get_feasible_actions(board_cfg)
                    action_mask = AllocationEnv.get_action_mask(feasible_actions, self.n_actions)

                    # sample a_j ~ pi(s_j), or a random action with probability eps (epsilon-greedy)
                    alpha = random.random()

                    if alpha < self.eps:
                        action = self.env_model.action_space.sample()
                    else:
                        action, _states = self.policy.predict(state.reshape(1, -1), mask=action_mask)

                    # compute dynamics from env model
                    new_state, r_hat, dones, info = self.env_model.step(action)
                    new_state = State.get_vec_observation(new_state)

                    reward = self.get_penalized_reward(r_hat, self.lmbda)


                    # add (s, a, r, s') to buffer
                    self.buffer_model.add(obs_t=state,
                                          action=action,
                                          reward=reward,
                                          obs_tp1=new_state,
                                          done=float(dones))

                    state = new_state

                # update policy with samples from D_env and D_model
                self.policy.update_weights(self.buffer_model)
        self.save_buffer()
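get_penalized_reward is not shown in these examples. In model-based RL with a learned dynamics model, one common choice (MOPO-style) is to subtract a lambda-scaled uncertainty penalty from the model's predicted reward; the sketch below is purely illustrative of that idea and is not the repository's actual implementation:

def get_penalized_reward(r_hat, lmbda, uncertainty=0.0):
    # Illustrative only: penalize the model-predicted reward by a
    # lambda-scaled uncertainty estimate (the real method may differ).
    return r_hat - lmbda * uncertainty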
Example #4
def map_optimal_rewards():
    state = env.reset()
    total_reward = 0
    results = {'rewards': [0.0]}
    optimal_actions = []

    curr_action = 0

    for day in range(TEST_T):

        curr_state = copy.deepcopy(env.state)
        feasible_actions = AllocationEnv.get_feasible_actions(
            curr_state.board_config)
        proposed_action = np.random.choice(list(feasible_actions))

        curr_state_step, curr_reward, b, i = env.step(curr_action)
        env.set_state(curr_state)  # roll back before evaluating the proposed action

        proposed_state, proposed_reward, b, i = env.step(proposed_action)

        curr_f = get_f(ae=-curr_reward, lmbda=LMBDA, log=True, T=T)
        proposed_f = get_f(ae=-proposed_reward, lmbda=LMBDA, log=True, T=T)

        gamma = get_gamma(f_current=curr_f, f_proposed=proposed_f, log=True)
        # Draw a uniform random number on the log scale for the acceptance test
        sr = np.log(random.random())

        if sr < gamma:  # accept the proposed action
            optimal_actions.append(proposed_action)
            curr_best_action = proposed_action
            final_reward = proposed_reward

        else:
            optimal_actions.append(curr_action)
            state, final_reward, _, _ = env.step(curr_action)
            curr_best_action = curr_action

        total_reward += final_reward
        results['rewards'].append(total_reward)
        print("best action: {} - reward: {}".format(curr_best_action,
                                                    final_reward))
        print("total reward: {}".format(total_reward))

    return state, optimal_actions, results
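The acceptance step above is Metropolis-style and works on the log scale: a log-uniform draw is compared against the log acceptance ratio returned by get_gamma. A standalone sketch of that pattern, with log_ratio standing in for that value (illustrative only):

import random
import numpy as np

def accept_log_space(log_ratio):
    # Accept when log(u) < log acceptance ratio, i.e. u < ratio for u ~ Uniform(0, 1)
    return np.log(random.random()) < log_ratio

print(accept_log_space(np.log(0.5)))  # accepts roughly half the time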
Example #5
def map_optimal_rewards(tabu_len, k):
    state = env.reset()
    total_reward = 0
    results = {'rewards': [0.0]}
    optimal_actions = []

    for day in range(TEST_T):
        curr_best_val = 0.0
        curr_best_action = 0

        curr_state = copy.deepcopy(env.state)
        feasible_actions = AllocationEnv.get_feasible_actions(curr_state.board_config)


        for action in feasible_actions:
            print("Iteration: {}, Action: {}".format(day, action), end='\r')
            action = AllocationEnv.check_action(curr_state.board_config, action)
            proposed_state, reward, b, i = env.step(action)
            env.set_state(curr_state)  # roll back before evaluating the next action

            if reward > curr_best_val:
                curr_best_val = reward
                curr_best_action = action

        optimal_actions.append(curr_best_action)
        curr_best_action = AllocationEnv.check_action(curr_state.board_config, curr_best_action)

        state, final_reward, _ , _ = env.step(curr_best_action)  # update the state after each day based on the optimal action taken

        total_reward += final_reward
        curr_best_val = final_reward
        results['rewards'].append(total_reward)
        print("best action: {} - reward: {}".format(curr_best_action, final_reward))
        print("total reward: {}".format(total_reward))


    return state, optimal_actions, results
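A usage sketch for the greedy variant above (argument values are hypothetical; note that tabu_len and k are accepted but unused in the body shown, and env, TEST_T, etc. are module-level globals):

# Hypothetical call; relies on the module globals set up elsewhere in the project.
final_state, optimal_actions, results = map_optimal_rewards(tabu_len=10, k=5)
print(results['rewards'][-1])  # cumulative reward after TEST_T days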
Example #6
            exploration_fraction=0.35,
            exploration_final_eps=0.2)
model.learn(total_timesteps=TIME_STEPS, learning_curve=False, test_t=TEST_T)

with open(f"../data/{store_id}-buffer-d-test.p", 'wb') as f:
    pickle.dump(model.replay_buffer, f)

results = {'rewards': [0.0]}
buffer = ReplayBuffer(size=50000)

for j in range(100):

    obs = env.reset()

    for i in range(TEST_T):
        feasible_actions = AllocationEnv.get_feasible_actions(
            obs["board_config"])
        action_mask = AllocationEnv.get_action_mask(feasible_actions,
                                                    n_actions)
        action, _states = model.predict(obs, mask=action_mask)

        action = AllocationEnv.check_action(obs['board_config'], action)
        new_obs, r, dones, info = env.step([action])

        results['rewards'].append(r[0] + results['rewards'][-1])

        # add (s, a, r, s') to buffer
        buffer.add(obs_t=State.get_vec_observation(obs),
                   action=action,
                   reward=r[0],
                   obs_tp1=State.get_vec_observation(new_obs),
                   done=float(dones))

        obs = new_obs  # advance to the next observation
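The rollout buffer filled above can be persisted the same way the model's replay buffer is pickled earlier in this example; the file name below is illustrative, not taken from the source:

# Illustrative: save the collected transitions for later offline use.
with open(f"../data/{store_id}-rollout-buffer.p", 'wb') as f:
    pickle.dump(buffer, f)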
Example #7
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None,
              learning_curve=False,
              test_t=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            self.cumul_reward = [0.0]
            episode_successes = []
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1, ))

            # variables for test-period evaluation
            test_step = test_t * 3 if test_t is not None else None
            test_results = {'sum': []}
            test_ts = []

            for _ in range(total_timesteps):

                ## Test eval period ##
                if learning_curve and _ % test_step == 0 and _ > 0:
                    print("--> Simulating test period")
                    self.env.reset()
                    test_r = 0.0
                    for i in range(test_t):
                        feasible_actions = AllocationEnv.get_feasible_actions(
                            obs["board_config"])
                        action_mask = AllocationEnv.get_action_mask(
                            feasible_actions, self.env.action_space.n)
                        action, _states = self.predict(obs, mask=action_mask)
                        action = AllocationEnv.check_action(
                            obs['board_config'], action)
                        obs, rewards, dones, info = self.env.step(action)
                        test_r += rewards

                    test_results["sum"].append(test_r)
                    test_ts.append(_)
                    self.env.reset()

                    # plot test eval progress
                    plt.plot(test_ts, test_results["sum"])
                    # plt.errorbar(iteration_cuts, results["mean"], yerr=results["std"], fmt='.k')
                    plt.xlabel("Iteration count")
                    plt.ylabel("Total (sum) test reward")
                    plt.savefig("figs/rl-learning-curve-{}.pdf".format(
                        cfg.vals['prj_name']))
                    plt.clf()
                    plt.close()

                    # write test eval progress
                    write_results = {}
                    for k, v in test_results.items():
                        write_results[k] = serialize_floats(v)

                    with open(
                            "output/rl-learning-curve-{}.json".format(
                                cfg.vals['prj_name']), 'w') as f:
                        json.dump(write_results, f)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True

                feasible_actions = AllocationEnv.get_feasible_actions(
                    obs["board_config"])
                action_mask = AllocationEnv.get_action_mask(
                    feasible_actions, self.action_space.n)
                with self.sess.as_default():
                    action = self.act(State.get_vec_observation(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs,
                                      mask=action_mask)[0]
                reset = False
                # CHECK IF ACTIONS IS FEASIBLE
                action = AllocationEnv.check_action(obs['board_config'],
                                                    action)
                env_action = action
                new_obs, rew, done, info = self.env.step(env_action)
                print("action: {} - reward: {} - eps: {:.4}".format(
                    action, rew, update_eps))
                print(new_obs['day_vec'])
                print(new_obs['board_config'])
                # Store transition in the replay buffer.
                self.replay_buffer.add(State.get_vec_observation(obs), action,
                                       rew, State.get_vec_observation(new_obs),
                                       float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += rew
                self.cumul_reward.append(self.cumul_reward[-1] + rew)
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()
                print('timestep: {}'.format(self.num_timesteps), end='\r\n')
                self.num_timesteps += 1

        return self
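When learning_curve=True and test_t is set, the loop above pauses every test_t * 3 timesteps, simulates a test_t-step test period with the masked policy, and writes the resulting curve to figs/ and output/. A usage sketch mirroring the call in Example #6 (constant names are assumed from that example):

# Illustrative call; TIME_STEPS and TEST_T as in Example #6.
model.learn(total_timesteps=TIME_STEPS, learning_curve=True, test_t=TEST_T)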