def main_cem(args, env, agent, sampling_agent):
    best_n_samples = int(np.ceil(args.best_frac * args.n_samples))
    test_returns = []

    print("TRAINING")
    for i in range(args.n_iters):
        sample_returns = []
        sample_states = []
        sample_actions = []
        for j in range(args.n_samples):
            states, actions, rewards = episode_rollout(
                env, agent.action_set, sampling_agent, args.max_timesteps)
            total_return, returns = compute_returns(rewards)

            sample_states.append(states)
            sample_actions.append(actions)
            sample_returns.append(total_return)

        sample_returns = np.array(sample_returns)

        # Keep only the elite fraction of rollouts (highest total returns).
        best_indices = np.argsort(sample_returns)[::-1][:best_n_samples]
        data = np.vstack([sample_states[k] for k in best_indices])
        targets = np.vstack(
            [np.reshape(sample_actions[k], (-1, 1)) for k in best_indices])

        # Learn policy
        agent.fit(data, targets, verbose=1, epochs=3)

        # Test actor
        if i % 5 == 0:
            _, actions, rewards = episode_rollout(
                env, agent.action_set, agent, args.max_timesteps, render=False)

            print(f"unique: R {np.unique(targets)} CEM {np.unique(actions)}")
            print_trainable_variables(agent)

            total_return, _ = compute_returns(rewards)
            print(f"{i}: {sample_returns.mean()} {total_return}")
            test_returns.append(total_return)
        else:
            print(f"{i} {sample_returns.mean()}")

    return test_returns
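# main_cem() relies on helpers defined elsewhere in the repository, notably
# episode_rollout() and compute_returns(). The sketch below is an assumption
# about compute_returns() only, inferred from how it is called here (and with
# an optional discount factor in the actor-critic code); it is not the
# project's actual implementation. It assumes numpy is imported as np, as in
# the rest of the file.
def compute_returns(rewards, gamma=1.0):
    """Return (total_return, returns) for a reward sequence.

    returns[t] is the (optionally discounted) return from step t onward;
    total_return is the return of the whole episode, i.e. returns[0].
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    returns = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    total_return = returns[0] if len(returns) else 0.0
    return total_return, returns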
if "obj" in info: objectives.append(info["obj"]) if "obj_dn" in info: objectives_dn.append(info["obj_dn"]) if "mu_max" in info: mu_max.append(info["mu_max"]) if "mu_max_dn" in info: mu_max_dn.append(info["mu_max_dn"]) if "mu_gen" in info: mu_gen.append(info["mu_gen"]) if "mu_gen_dn" in info: mu_gen_dn.append(info["mu_gen_dn"]) total_return = compute_returns(rewards)[0] chronic_data.append({ "chronic_idx": chronic_idx, "chronic_org_idx": chronic_org_idx, "chronic_name": chronic_name, "actions": actions, "time_steps": time_steps, "rewards": rewards, "return": total_return, "chronic_length": chronic_len, "duration": t, "distances": distances, "distances_status": distances_status, "distances_sub": distances_sub, "objectives": objectives, "objectives_dn": objectives_dn,
print("TRAINING") for i in range(args.n_iters): sample_returns = [] sample_states = [] sample_actions = [] for j in range(args.n_samples): # Off- or on-policy? Better works in off-policy mode. Why? if j % 20 != 0 or True: sample_actor = random_actor else: sample_actor = actor states, actions, rewards = episode_rollout(env, sample_actor, args.max_timesteps) total_return, returns = compute_returns(rewards) sample_states.append(states) sample_actions.append(actions) sample_returns.append(total_return) sample_returns = np.array(sample_returns) best_indices = np.argsort(sample_returns)[::-1][:best_n_samples] data = np.vstack([sample_states[k] for k in best_indices]) targets = np.vstack( [np.reshape(sample_actions[k], (-1, 1)) for k in best_indices]) # Learn policy actor.fit(data, targets, verbose=0, epochs=1)
                alpha_decay))  # Update actor weights

            # Act
            next_action, next_grads = actor_critic.act(
                next_state)  # a_t+1, grads_t+1
            next_action = next_action.numpy()

            state, action, grads = next_state, next_action, next_grads

            if render:
                time.sleep(1.0 / FPS)
                env.render()

        e_length = t
        total_return, returns = compute_returns(np.array(rewards), GAMMA)
        print("e {:<20} return {:<20} length {:<20}".format(
            e, np.round(total_return, decimals=3), e_length))

        total_returns.append(total_return)
        episodes.append(e)

        alpha_decay = np.exp(-e / DECAY_PERIOD)

    env.close()

    episodes = np.array(episodes)
    total_returns = np.array(total_returns)
    average_returns = pd.Series(total_returns).rolling(
        100, min_periods=1).mean().values

    fig, ax = plt.subplots(1, 2, figsize=(16, 5))
    ax[0].plot(episodes, total_returns, label=f"{MODE}")
    ax[1].plot(episodes, average_returns, label=f"avg_{MODE}")
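# The fragment above is the tail of an actor-critic training loop and assumes
# roughly the imports and module-level constants listed below. The constant
# values here are placeholders for illustration only; they are not taken from
# the source.
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

FPS = 50                # render frame rate used for the sleep between frames
GAMMA = 0.99            # discount factor passed to compute_returns()
DECAY_PERIOD = 100      # episodes for alpha_decay to fall by a factor of e
MODE = "actor-critic"   # label used in the plot legends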
def _runner(self,
            env,
            agent,
            do_chronics=(),
            n_chronics=-1,
            n_steps=-1,
            verbose=False):
    self.print_experiment("Performance")
    agent.print_agent(default=verbose)

    agent_name = agent.name.replace(" ", "-").lower()
    self.collector._load_chronics(agent_name=agent_name)

    chronics_dir, chronics, chronics_sorted = get_sorted_chronics(env=env)
    pprint("Chronics:", chronics_dir)

    if len(self.collector.chronic_ids):
        pprint(
            " - Done chronics:",
            ", ".join(
                map(lambda x: str(x), sorted(self.collector.chronic_ids))),
        )

    if len(do_chronics):
        pprint(
            " - To do chronics:",
            ", ".join(map(lambda x: str(x), sorted(do_chronics))),
        )

    done_chronic_ids = []
    for chronic_idx, chronic_name in enumerate(chronics_sorted):
        if len(done_chronic_ids) >= n_chronics >= 0:
            break

        # If chronic already done
        if chronic_idx in self.collector.chronic_ids:
            continue

        # Environment specific filtering
        if "rte_case5" in env.name:
            if chronic_idx not in do_chronics:
                continue
        elif "l2rpn_2019" in env.name:
            if chronic_idx not in do_chronics:
                continue
        elif "l2rpn_wcci_2020" in env.name:
            if chronic_idx not in do_chronics:
                continue

        chronic_org_idx = chronics.index(chronic_name)
        env.chronics_handler.tell_id(chronic_org_idx - 1)  # Set chronic id

        obs = env.reset()
        agent.reset(obs=obs)

        chronic_len = env.chronics_handler.real_data.data.max_iter
        chronic_path_name = "/".join(
            os.path.normpath(env.chronics_handler.get_id()).split(
                os.sep)[-3:])

        augmentation_info = os.path.join(env.chronics_handler.get_id(),
                                         "augmentation.json")
        ps = None
        if os.path.isfile(augmentation_info):
            with open(augmentation_info, "r") as f:
                ps = json.load(f)

        pprint(" - Chronic:", chronic_path_name)
        # if ps:
        #     p = ps["p"]
        #     min_p = ps["min_p"]
        #     max_p = ps["max_p"]
        #     targets = ps["targets"]
        #
        #     pprint(" - Augmentation:", ps["augmentation"])
        #     pprint(
        #         " - Rate:",
        #         "p = {:.2f} ~ [{:.2f}, {:.2f}]".format(p, min_p, max_p),
        #     )
        #     if targets:
        #         pprint(" - Targets:", str(targets))

        t = 0
        done = False
        reward = np.nan

        """
            Collect data.
        """
        while not done and not (t >= n_steps > 0):
            action = agent.act(obs, reward=reward, done=done)
            obs_next, reward, done, info = env.step(action)

            self.collector._add(obs, action, reward, done)

            semi_action = False
            if agent.semi_agent is not None:
                semi_action = agent.semi_agent.semi_action

            dist, dist_status, dist_sub = agent.distance_to_ref_topology(
                obs_next.topo_vect, obs_next.line_status)
            self.collector._add_plus(dist, dist_status, dist_sub,
                                     semi_action)

            t = env.chronics_handler.real_data.data.current_index

            if t % 200 == 0:
                semi_sum = np.sum(self.collector.semi_actions)
                pprint(" - Step:", t, semi_sum)

            if done:
                semi_sum = np.sum(self.collector.semi_actions)
                pprint(" - Length:", f"{t}/{chronic_len}", semi_sum)

            obs = obs_next

        self.collector.obses.append(obs.to_vect())
        self.collector.total_return = compute_returns(
            self.collector.rewards)[0]
        self.collector.duration = t
        self.collector.chronic_len = chronic_len
        self.collector.chronic_name = chronic_name

        done_chronic_ids.append(chronic_idx)

        self.collector._save_chronic(agent_name, chronic_idx, verbose)
        self.collector.reset()
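# pprint() used throughout _runner() is a project-local logging helper, not
# the standard library pprint module. The minimal stand-in below is an
# assumption inferred from the call sites (a label followed by one or more
# values); the real helper may format its output differently.
def pprint(*args):
    label, *values = [str(arg) for arg in args]
    print("{:<30}".format(label), *values)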