def train_once(graph: nx.Graph, clusters: list, pos: dict, env_name: str = 'Controller-Select-v0',
               compute_optimal: bool = True, trained_model: DQN = None, steps: int = 200000,
               logdir: str = 'train_log_compare', env_kwargs: dict = None) -> (DQN, float, float):
    """
    Main training loop. Initializes the RL environment, performs training, and outputs results.

    Args:
        graph (nx.Graph): NetworkX graph to train on
        clusters (list): List of lists of nodes in each cluster
        pos (dict): Graph rendering positions
        env_name (str): Name of Gym environment
        compute_optimal (bool): Whether to compute the optimal set of controllers by brute force
        trained_model (DQN): Starting model to continue training from
        steps (int): Total number of training timesteps
        logdir (str): Tensorboard log directory
        env_kwargs (dict): Extra keyword arguments forwarded to gym.make
    Return:
        Trained model, best RL reward found, and the heuristic reward on the averaged graph
    """
    if env_kwargs is None:
        env_kwargs = {}

    # Selecting controllers one-at-a-time environment
    env = gym.make(env_name, graph=graph, clusters=clusters, pos=pos, **env_kwargs)
    heuristic_controllers, heuristic_distance = env.compute_greedy_heuristic()
    print("WMSCP Greedy Heuristic: {}, {}".format(heuristic_controllers, heuristic_distance))
    # for i in range(1000):
    #     env.reset()
    #     print(env.graph.size(weight='weight'))
    orig_graph = env.original_graph
    optimal_controllers = None
    if compute_optimal:
        print("Computing optimal!")
        optimal_controllers = env.calculateOptimal()

    # Generate a custom replay buffer full of valid experiences to speed up exploration during training
    def add_wrapper(replay_buffer):
        # Replay buffer maxsize is by default 50000. Should this be lowered?
        # valid_controllers_set = [env._random_valid_controllers() for i in range(int(replay_buffer._maxsize * 0.5 / len(clusters)))]
        # Uses heuristic controller set as initial 'random' controllers
        valid_controllers_set = env.graphCentroidAction()

        for valid_controllers in valid_controllers_set:
            # Really strange issue - obs_current follows the change in env.state, making it equal to obs!
            obs_current = env.reset()
            for controller in valid_controllers:
                (obs, rew, done, _) = env.step(controller)
                replay_buffer.add(obs_current, controller, rew, obs, done)
                # For some reason, obs is a pointer which ends up being the very last obs before reset, so need to copy
                obs_current = obs.copy()
        return replay_buffer

    # Agent
    model = None
    if trained_model is None:
        print("Creating new training model!")
        model = DQN(LnMlpPolicy, env, tensorboard_log=logdir, verbose=0,
                    full_tensorboard_log=True, exploration_initial_eps=0.5,
                    exploration_fraction=0.2, learning_starts=0,
                    target_network_update_freq=100, batch_size=32,
                    learning_rate=0.00025)
    else:
        print("Using provided training model!")
        model = trained_model
        model.set_env(env)
        model.tensorboard_log = logdir

    # Train the agent
    print("Training!")
    model.learn(total_timesteps=int(steps))  # , callback=callback, replay_wrapper=add_wrapper)

    # Run a single episode to evaluate the DQN
    obs = env.reset()
    reward = 0  # We want the last reward to be minimal (perhaps instead do cumulative?)
    reward_final = 0
    done = False
    action = None
    final_rl_actions = []
    while not done:
        action, states = model.predict(obs)
        (obs, rew, done, _) = env.step(action)
        final_rl_actions.append(action)
        reward += rew
        reward_final = rew

    # Show controllers chosen by the model
    env.render(mode='graph_end.png')
    print(env.controllers, reward_final)
    print("BEST EVER:")
    print(env.best_controllers, env.best_reward)
    best_reward = env.optimal_neighbors(graph, env.best_controllers)
    print(best_reward)
    average_graph = env.average_graph.copy()
    rl_controllers = env.controllers
    rl_best_controllers = env.best_controllers
    if env_name == 'Controller-Cluster-v0':
        rl_controllers.sort()
        rl_best_controllers.sort()
        cluster_len = len(clusters[0])
        for i in range(len(clusters)):
            rl_controllers[i] -= i * cluster_len
            rl_best_controllers[i] -= i * cluster_len
    env.reset(adjust=False, full=True)
    nx.write_gpickle(average_graph, 'average_graph.gpickle')
    env.graph = average_graph.copy()
    for cont in rl_controllers:
        (_, reward_final, _, _) = env.step(cont)
    print("RL Controllers on average change graph {} - {}".format(env.controllers, reward_final))
    env.reset(adjust=False, full=True)
    env.graph = average_graph.copy()
    for cont in rl_best_controllers:
        (_, reward_final, _, _) = env.step(cont)
    print("RL Best Controllers on average change graph {} - {}".format(env.best_controllers, reward_final))

    # Show controllers chosen using the heuristic
    centroid_controllers, heuristic_distance = env.graphCentroidAction()
    # centroid_controllers, heuristic_distance = env.compute_greedy_heuristic()
    # Convert heuristic controllers to actual node indices
    if env_name == 'Controller-Cluster-v0' or env_name == 'Controller-Cluster-Options-v0':
        # Assume all clusters have the same length
        centroid_controllers.sort()
        cluster_len = len(clusters[0])
        for i in range(len(clusters)):
            centroid_controllers[i] -= i * cluster_len
    env.reset(adjust=False, full=True)
    env.graph = average_graph.copy()
    for cont in centroid_controllers:
        (_, reward_final, _, _) = env.step(cont)
    env.render(mode='graph_heuristic.png')
    best_heuristic = reward_final
    print("Heuristic on average change graph {} - {}".format(env.controllers, reward_final))
    # print("Heuristic optimal {} - {}".format(*env.optimal_neighbors(graph, env.controllers)))
    heuristic_controllers = env.controllers

    # Compare RL, best-ever RL, and heuristic placements across freshly sampled graphs
    rl_rewards = []
    heuristic_rewards = []
    rl_best_rewards = []
    NUM_GRAPHS = 100
    for i in range(NUM_GRAPHS):
        rl_reward = None
        heuristic_reward = None
        rl_best_reward = None
        env.reset()
        nx.write_gpickle(env.graph, '100Graphs/graph_{}.gpickle'.format(i))
        for cont in final_rl_actions:
            (_, rl_reward, _, _) = env.step(cont)
        env.reset(adjust=False, full=False)
        for cont in centroid_controllers:
            (_, heuristic_reward, _, _) = env.step(cont)
        env.reset(adjust=False, full=False)
        for cont in rl_best_controllers:
            (_, rl_best_reward, _, _) = env.step(cont)
        print("RL REWARD, RL BEST REWARD, HEURISTIC: {}\t{}\t{}".format(rl_reward, rl_best_reward, heuristic_reward))
        rl_rewards.append(rl_reward)
        heuristic_rewards.append(heuristic_reward)
        rl_best_rewards.append(rl_best_reward)

    def create_hist(ax, data, title=None, color=None):
        bins = np.arange(min(data) - 100, max(data) + 100, 100)
        plt.xlim([min(data) - 100, max(data) + 100])
        ax.hist(data, bins=bins, alpha=0.5, color=color)
        if title:
            ax.set_title(title)
        plt.xlabel('Controller Distances')
        plt.ylabel('Count')

    fig = plt.figure()
    ax1 = fig.add_subplot(2, 1, 1)
    create_hist(ax1, rl_rewards, color='blue')
    create_hist(ax1, heuristic_rewards, color='red')
    create_hist(ax1, rl_best_rewards, color='green')
    ax2 = fig.add_subplot(2, 1, 2)
    ax2.plot(np.arange(0, NUM_GRAPHS, 1), rl_rewards, c='blue')
    ax2.plot(np.arange(0, NUM_GRAPHS, 1), heuristic_rewards, c='red')
    ax2.plot(np.arange(0, NUM_GRAPHS, 1), rl_best_rewards, c='green')
    plt.show()

    # Show optimal placement, if it was computed
    if optimal_controllers is not None:
        env.reset()
        for cont in optimal_controllers[0]:
            (_, reward_final, _, _) = env.step(cont)
        env.render(mode='graph_optimal.png')
        print(env.controllers, reward_final)
        print(optimal_controllers)

    return model, best_reward, best_heuristic
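# --- Usage sketch (illustrative; not part of the original module) ------------
# A minimal example of how train_once might be invoked, assuming the
# 'Controller-Cluster-v0' environment is registered with Gym elsewhere in this
# project. The graph construction, unit edge weights, cluster sizes, and step
# count below are assumptions made for demonstration only. Note that train_once
# writes gpickle files into a '100Graphs/' directory, which must already exist.
if __name__ == '__main__':
    demo_graph = nx.connected_watts_strogatz_graph(n=40, k=4, p=0.1, seed=0)
    nx.set_edge_attributes(demo_graph, 1, 'weight')  # assumed unit edge weights
    demo_nodes = list(demo_graph.nodes())
    demo_clusters = [demo_nodes[i:i + 10] for i in range(0, len(demo_nodes), 10)]
    demo_pos = nx.spring_layout(demo_graph, seed=0)

    demo_model, demo_best_rl, demo_best_heuristic = train_once(
        demo_graph,
        demo_clusters,
        demo_pos,
        env_name='Controller-Cluster-v0',
        compute_optimal=False,  # brute-force optimum is expensive on larger graphs
        steps=50000,
    )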
def __init__(self, env: Env, params: dict, model_path: str, log_path: str):
    """Initialize.

    :param env: gym environment. Assuming observation space is a tuple,
        where the first component is from the original env, and the second
        is the temporal goal state.
    :param params: dict of parameters, like `default_parameters`.
    :param model_path: directory where to save models.
    :param log_path: directory where to save tensorboard logs.
    """
    # Check
    if params["initialize_file"]:
        raise ValueError(
            "Initialization not supported; use resuming option")
    if params["action_bias"]:
        raise ValueError("Action bias is not maintained here")

    # Alias
    original_env = env

    # Load a saved agent for the action bias
    self.biased_agent: Optional[DQN] = None
    if params["action_bias"]:
        loading_params = dict(params)
        loading_params["resume_file"] = params["action_bias"]
        loading_params["action_bias"] = None
        self.biased_agent = TrainStableBaselines(
            env=env,
            params=loading_params,
            model_path=model_path,
            log_path=log_path,
        ).model

    # Collect statistics
    #  (assuming future wrappers do not modify episodes)
    env = MyStatsRecorder(env=env, gamma=params["gamma"])

    # Callbacks
    checkpoint_callback = CustomCheckpointCallback(
        save_path=model_path,
        save_freq=params["save_freq"],
        extra=None,
    )
    stats_logger_callback = StatsLoggerCallback(stats_recorder=env, scope="env0")
    callbacks_list = [checkpoint_callback, stats_logger_callback]

    if params["render"]:
        renderer_callback = RendererCallback()
        callbacks_list.append(renderer_callback)

    # If training a passive agent, log this too
    if params["active_passive_agents"]:
        # Find the reward shaping env
        reward_shaping_env = find_wrapper(env, RewardShapingWrapper)

        passive_stats_env = MyStatsRecorder(
            env=UnshapedEnv(reward_shaping_env),
            gamma=params["gamma"],
        )
        passive_stats_callback = StatsLoggerCallback(
            stats_recorder=passive_stats_env,
            scope="env1",
        )
        callbacks_list.append(passive_stats_callback)

        # Make it move with the original env
        env = UnshapedEnvWrapper(
            shaped_env=env,
            unshaped_env=passive_stats_env,
        )
        original_reward_getter = env.get_reward  # alias
    else:
        original_reward_getter = None

    # Combine callbacks
    all_callbacks = CallbackList(callbacks_list)

    # Define or load
    resuming = bool(params["resume_file"])
    if not resuming:
        # Normalizer
        normalized_env = NormalizeEnvWrapper(
            env=env,
            training=True,
            entry=0,  # Only env features, not temporal goal state
        )
        flat_env = BoxAutomataStates(normalized_env)
        # Saving normalizer too
        checkpoint_callback.saver.extra_model = normalized_env

        # Agent
        model = DQN(
            env=flat_env,
            policy=ModularPolicy,
            policy_kwargs={
                "layer_norm": params["layer_norm"],
                "layers": params["layers"],
                "shared_layers": params["shared_layers"],
                "dueling": params["dueling"],
            },
            gamma=params["gamma"],
            learning_rate=params["learning_rate"],
            train_freq=params["train_freq"],
            double_q=True,
            batch_size=params["batch_size"],
            buffer_size=params["buffer_size"],
            learning_starts=params["learning_starts"],
            prioritized_replay=True,
            target_network_update_freq=params["target_network_update_freq"],
            exploration_fraction=params["exploration_fraction"],
            exploration_final_eps=params["exploration_final_eps"],
            exploration_initial_eps=params["exploration_initial_eps"],
            active_passive_agents=params["active_passive_agents"],
            passive_reward_getter=original_reward_getter,
            tensorboard_log=log_path,
            full_tensorboard_log=False,
            verbose=1,
        )
    else:
        # Reload model
        model, extra_model, counters = checkpoint_callback.load(
            path=params["resume_file"],
        )

        # Restore normalizer and env
        normalized_env = extra_model
        normalized_env.set_env(env)
        flat_env = BoxAutomataStates(normalized_env)

        # Restore properties
        model.tensorboard_log = log_path
        model.num_timesteps = counters["step"]
        model.learning_starts = params["learning_starts"] + counters["step"]
        model.set_env(flat_env)
        model.passive_reward_getter = original_reward_getter

    # Store
    self.params = params
    self.resuming = resuming
    self.saver = checkpoint_callback
    self.logger = stats_logger_callback
    self.callbacks = all_callbacks
    self.model: DQN = model
    self.normalized_env = normalized_env
    self.testing_agent = model if not params["test_passive"] else model.passive_agent
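# --- Usage sketch (illustrative; not part of the original class) -------------
# How this initializer might be driven, assuming the surrounding class is
# TrainStableBaselines (as referenced above when loading the biased agent) and
# that a `default_parameters`-style dict supplies every key __init__ reads.
# All concrete values below are assumptions for demonstration only.
#
#     example_params = {
#         "initialize_file": None,        # must be falsy or __init__ raises
#         "action_bias": None,            # must be falsy or __init__ raises
#         "resume_file": None,            # falsy -> build a fresh DQN agent
#         "render": False,
#         "active_passive_agents": False,
#         "test_passive": False,
#         "gamma": 0.99,
#         "save_freq": 10000,
#         "layer_norm": True,
#         "layers": [64, 64],
#         "shared_layers": 1,
#         "dueling": True,
#         "learning_rate": 5e-4,
#         "train_freq": 1,
#         "batch_size": 32,
#         "buffer_size": 50000,
#         "learning_starts": 1000,
#         "target_network_update_freq": 500,
#         "exploration_fraction": 0.1,
#         "exploration_final_eps": 0.02,
#         "exploration_initial_eps": 1.0,
#     }
#     trainer = TrainStableBaselines(
#         env=my_env,                     # hypothetical env whose observations are
#                                         # (env features, temporal goal state)
#         params=example_params,
#         model_path="models/run0",       # assumed output directories
#         log_path="logs/run0",
#     )
#     trainer.model.learn(total_timesteps=100000, callback=trainer.callbacks)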