Example #1
def main_cem(args, env, agent, sampling_agent):
    best_n_samples = int(np.ceil(args.best_frac * args.n_samples))

    test_returns = []
    print("TRAINING")
    for i in range(args.n_iters):
        sample_returns = []
        sample_states = []
        sample_actions = []
        for j in range(args.n_samples):
            states, actions, rewards = episode_rollout(env, agent.action_set,
                                                       sampling_agent,
                                                       args.max_timesteps)
            total_return, returns = compute_returns(rewards)

            sample_states.append(states)
            sample_actions.append(actions)
            sample_returns.append(total_return)

        sample_returns = np.array(sample_returns)
        best_indices = np.argsort(sample_returns)[::-1][:best_n_samples]

        data = np.vstack([sample_states[k] for k in best_indices])
        targets = np.vstack(
            [np.reshape(sample_actions[k], (-1, 1)) for k in best_indices])

        # Learn policy
        agent.fit(data, targets, verbose=1, epochs=3)

        # Test actor
        if i % 5 == 0:
            _, actions, rewards = episode_rollout(env,
                                                  agent.action_set,
                                                  agent,
                                                  args.max_timesteps,
                                                  render=False)
            print(f"unique: R {np.unique(targets)} CEM {np.unique(actions)}")
            print_trainable_variables(agent)

            total_return, _ = compute_returns(rewards)
            print(f"{i}: {sample_returns.mean()} {total_return}")
            test_returns.append(total_return)
        else:
            print(f"{i} {sample_returns.mean()}")

    return test_returns
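
Every example on this page unpacks the result of a compute_returns helper whose definition is not shown here. Its exact implementation is an assumption, but a minimal sketch consistent with the call sites (it yields the total episode return plus the per-step returns, with an optional discount factor as used in Example #4) could look like this:

import numpy as np

def compute_returns(rewards, gamma=1.0):
    # Assumed helper, reconstructed from the call sites on this page:
    # returns (total_return, returns) where returns[t] is the discounted
    # return-to-go from step t, so callers can unpack both values or
    # index [0] for the episode return.
    rewards = np.asarray(rewards, dtype=np.float64)
    returns = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    total_return = returns[0] if len(returns) else 0.0
    return total_return, returns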
Example #2
                if "obj" in info:
                    objectives.append(info["obj"])
                if "obj_dn" in info:
                    objectives_dn.append(info["obj_dn"])

                if "mu_max" in info:
                    mu_max.append(info["mu_max"])
                if "mu_max_dn" in info:
                    mu_max_dn.append(info["mu_max_dn"])

                if "mu_gen" in info:
                    mu_gen.append(info["mu_gen"])
                if "mu_gen_dn" in info:
                    mu_gen_dn.append(info["mu_gen_dn"])

            total_return = compute_returns(rewards)[0]
            chronic_data.append({
                "chronic_idx": chronic_idx,
                "chronic_org_idx": chronic_org_idx,
                "chronic_name": chronic_name,
                "actions": actions,
                "time_steps": time_steps,
                "rewards": rewards,
                "return": total_return,
                "chronic_length": chronic_len,
                "duration": t,
                "distances": distances,
                "distances_status": distances_status,
                "distances_sub": distances_sub,
                "objectives": objectives,
                "objectives_dn": objectives_dn,
Example #3
print("TRAINING")
for i in range(args.n_iters):
    sample_returns = []
    sample_states = []
    sample_actions = []
    for j in range(args.n_samples):
        # Off-policy or on-policy? Works better off-policy. Why?
        # Note: the "or True" below forces the random actor on every sample.
        if j % 20 != 0 or True:
            sample_actor = random_actor
        else:
            sample_actor = actor

        states, actions, rewards = episode_rollout(env, sample_actor,
                                                   args.max_timesteps)
        total_return, returns = compute_returns(rewards)

        sample_states.append(states)
        sample_actions.append(actions)
        sample_returns.append(total_return)

    sample_returns = np.array(sample_returns)
    best_indices = np.argsort(sample_returns)[::-1][:best_n_samples]

    data = np.vstack([sample_states[k] for k in best_indices])
    targets = np.vstack(
        [np.reshape(sample_actions[k], (-1, 1)) for k in best_indices])

    # Learn policy
    actor.fit(data, targets, verbose=0, epochs=1)
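
Examples #1 and #3 follow the same cross-entropy-method style loop: roll out n_samples episodes, keep the best_frac fraction with the highest returns, and fit the policy on the state/action pairs of those elites. The elite selection itself is just an argsort over episode returns; a small standalone check of that pattern, with made-up numbers (all values here are purely illustrative):

import numpy as np

sample_returns = np.array([12.0, 3.5, 20.0, 7.0, 15.0])  # hypothetical episode returns
best_frac = 0.4                                           # hypothetical elite fraction
best_n_samples = int(np.ceil(best_frac * len(sample_returns)))

# Indices of the highest-return episodes, best first.
best_indices = np.argsort(sample_returns)[::-1][:best_n_samples]
print(best_indices)  # [2 4]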
Example #4
                                   alpha_decay))  # Update actor weights

            # Act
            next_action, next_grads = actor_critic.act(
                next_state)  # a_t+1, grads_t+1
            next_action = next_action.numpy()

            state, action, grads = next_state, next_action, next_grads

            if render:
                time.sleep(1.0 / FPS)
                env.render()

        e_length = t

        total_return, returns = compute_returns(np.array(rewards), GAMMA)
        print("e {:<20} return {:<20} length {:<20}".format(
            e, np.round(total_return, decimals=3), e_length))
        total_returns.append(total_return)
        episodes.append(e)

        alpha_decay = np.exp(-e / DECAY_PERIOD)

    env.close()
    episodes = np.array(episodes)
    total_returns = np.array(total_returns)
    average_returns = pd.Series(total_returns).rolling(
        100, min_periods=1).mean().values

    fig, ax = plt.subplots(1, 2, figsize=(16, 5))
    ax[0].plot(episodes, total_returns, label=f"{MODE}")
    ax[1].plot(episodes, average_returns, label=f"avg_{MODE}")
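
The right-hand panel in Example #4 plots a 100-episode rolling mean of the returns. The same pandas call in isolation, on dummy data (the array contents below are illustrative only):

import numpy as np
import pandas as pd

total_returns = np.arange(10, dtype=float)  # dummy per-episode returns
average_returns = pd.Series(total_returns).rolling(100, min_periods=1).mean().values
print(average_returns[:3])  # [0.  0.5 1. ]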
Example #5
    def _runner(self,
                env,
                agent,
                do_chronics=(),
                n_chronics=-1,
                n_steps=-1,
                verbose=False):
        self.print_experiment("Performance")
        agent.print_agent(default=verbose)

        agent_name = agent.name.replace(" ", "-").lower()
        self.collector._load_chronics(agent_name=agent_name)

        chronics_dir, chronics, chronics_sorted = get_sorted_chronics(env=env)
        pprint("Chronics:", chronics_dir)

        if len(self.collector.chronic_ids):
            pprint(
                "    - Done chronics:",
                ", ".join(map(str, sorted(self.collector.chronic_ids))),
            )

        if len(do_chronics):
            pprint(
                "    - To do chronics:",
                ", ".join(map(str, sorted(do_chronics))),
            )

        done_chronic_ids = []
        for chronic_idx, chronic_name in enumerate(chronics_sorted):
            if len(done_chronic_ids) >= n_chronics >= 0:
                break

            # If chronic already done
            if chronic_idx in self.collector.chronic_ids:
                continue

            # Environment-specific filtering
            if any(env_name in env.name
                   for env_name in ("rte_case5", "l2rpn_2019", "l2rpn_wcci_2020")):
                if chronic_idx not in do_chronics:
                    continue

            chronic_org_idx = chronics.index(chronic_name)
            env.chronics_handler.tell_id(chronic_org_idx - 1)  # Set chronic id

            obs = env.reset()
            agent.reset(obs=obs)

            chronic_len = env.chronics_handler.real_data.data.max_iter
            chronic_path_name = "/".join(
                os.path.normpath(env.chronics_handler.get_id()).split(
                    os.sep)[-3:])

            augmentation_info = os.path.join(env.chronics_handler.get_id(),
                                             "augmentation.json")
            ps = None
            if os.path.isfile(augmentation_info):
                with open(augmentation_info, "r") as f:
                    ps = json.load(f)

            pprint("    - Chronic:", chronic_path_name)
            # if ps:
            #     p = ps["p"]
            #     min_p = ps["min_p"]
            #     max_p = ps["max_p"]
            #     targets = ps["targets"]
            #
            #     pprint("        - Augmentation:", ps["augmentation"])
            #     pprint(
            #         "            - Rate:",
            #         "p = {:.2f} ~ [{:.2f}, {:.2f}]".format(p, min_p, max_p),
            #     )
            #     if targets:
            #         pprint("            - Targets:", str(targets))

            t = 0
            done = False
            reward = np.nan
            """
                Collect data.
            """
            while not done and not (t >= n_steps > 0):
                action = agent.act(obs, reward=reward, done=done)
                obs_next, reward, done, info = env.step(action)
                self.collector._add(obs, action, reward, done)

                semi_action = False
                if agent.semi_agent is not None:
                    semi_action = agent.semi_agent.semi_action

                dist, dist_status, dist_sub = agent.distance_to_ref_topology(
                    obs_next.topo_vect, obs_next.line_status)
                self.collector._add_plus(dist, dist_status, dist_sub,
                                         semi_action)

                t = env.chronics_handler.real_data.data.current_index

                if t % 200 == 0:
                    semi_sum = np.sum(self.collector.semi_actions)
                    pprint("        - Step:", t, semi_sum)

                if done:
                    semi_sum = np.sum(self.collector.semi_actions)
                    pprint("        - Length:", f"{t}/{chronic_len}", semi_sum)

                obs = obs_next

            self.collector.obses.append(obs.to_vect())
            self.collector.total_return = compute_returns(
                self.collector.rewards)[0]
            self.collector.duration = t
            self.collector.chronic_len = chronic_len
            self.collector.chronic_name = chronic_name

            done_chronic_ids.append(chronic_idx)

            self.collector._save_chronic(agent_name, chronic_idx, verbose)
            self.collector.reset()
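
Example #5 relies on the Grid2Op pattern of calling tell_id before reset to pin the chronic that the next episode will replay (hence the idx - 1, since reset advances the chronics handler by one). A minimal standalone sketch of that selection step, with an assumed environment name:

import grid2op

env = grid2op.make("rte_case5_example")  # assumed dataset name, for illustration only

chronic_idx = 3
env.chronics_handler.tell_id(chronic_idx - 1)  # next reset loads chronic 3, as in the runner above
obs = env.reset()
print(env.chronics_handler.get_id())  # path of the chronic now being replayed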