Example #1
def launchAgent():
    from stable_baselines import DQN
    import Reinforcement_AI.env.c_seperate_env as sep_env
    from queue import Queue
    from threading import Thread

    minimap_env = sep_env.MinimapEnv()
    allenv = sep_env.AllEnv()

    minimap_model = DQN(
        "CnnPolicy",  # policy
        minimap_env,  # environment
        double_q=True,  # enable Double Q-learning
        prioritized_replay=True,  # enable prioritized experience replay
        verbose=0  # 0 = no training log output
    )

    allenv_model = DQN(
        "MlpPolicy",
        allenv,
        double_q=True,
        prioritized_replay=True,
        verbose=0
    )

    for i in range(100):
        if i != 0:
            minimap_model = DQN.load("KR_minimap_" + str(i))
            allenv_model = DQN.load("KR_allenv_" + str(i))

        que = Queue()

        minimap_model.set_env(minimap_env)
        allenv_model.set_env(allenv)

        # minimap_thread = Thread(target=minimap_model.learn, args=[50000])
        # allenv_thread = Thread(target=allenv_model.learn, args=[50000])
        allenv_thread = Thread(target=lambda q, arg1: q.put(allenv_model.learn(arg1)), args=(que, 50000))
        # test = Pool(processes=1)

        # minimap_thread.start()
        allenv_thread.start()
        # test_result = test.apply_async(allenv_model.learn, (50000, None, 100, "DQN", True, None))
        minimap_model.learn(total_timesteps=50000)

        # allenv_model.learn(total_timesteps=50000)

        # minimap_thread.join()
        allenv_thread.join()

        allenv_model = que.get()
        # return_val = test_result.get()

        minimap_model.save("KR_minimap_" + str(i + 1))
        allenv_model.save("KR_allenv_" + str(i + 1))
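The loop above trains two DQN agents in lock-step by running one learn() call on a background thread and collecting the trained model back through a Queue. Below is a minimal sketch of that pattern in isolation; the model and environment names in the usage comments are placeholders, not part of the original code.

# Sketch (assumed setup): run one DQN's learn() on a worker thread while the
# main thread trains another model, then retrieve the background result.
from queue import Queue
from threading import Thread

def learn_in_background(model, timesteps):
    result_queue = Queue()
    worker = Thread(
        target=lambda q, steps: q.put(model.learn(total_timesteps=steps)),
        args=(result_queue, timesteps),
    )
    worker.start()
    return worker, result_queue

# Usage (placeholder models/envs):
# worker, result_queue = learn_in_background(allenv_model, 50000)
# minimap_model.learn(total_timesteps=50000)   # train the other model meanwhile
# worker.join()
# allenv_model = result_queue.get()            # trained model returned by learn()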
Example #2
    def leveltrain(self, from_level, to_level, env, timesteps, level_modelpath,
                   tensorboard_logs_path):
        model = DQN('MlpPolicy',
                    env,
                    verbose=1,
                    policy_kwargs=self.policy_kwargs,
                    prioritized_replay=True,
                    buffer_size=100000,
                    learning_rate=0.0003,
                    exploration_final_eps=0,
                    tensorboard_log=tensorboard_logs_path)
        model.save(level_modelpath)

        for current_level in range(
                from_level,
                to_level + 1):  # Train model for increasingly difficult levels
            env = gym.make('DeepWellEnvSpherlevel' + str(current_level) +
                           '-v0')

            model = self.load(level_modelpath,
                              tensorboard_logs_path)  # Load previous model
            env_str = self.get_env_str(env)
            model.set_env(make_vec_env(env_str, n_envs=1))
            model.learn(total_timesteps=timesteps,
                        reset_num_timesteps=False,
                        tb_log_name="TB_" +
                        datetime.now().strftime('%d%m%y-%H%M')
                        )  # Continue training previous model

            level_modelpath = level_modelpath[0:-1] + str(
                current_level)  # Generate new name of newly trained model
            model.save(level_modelpath)  # Save newly trained model

            print("====================== Level " + str(current_level) +
                  " finished with " + str(timesteps) +
                  " timesteps ==========================")

        return model
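The method above relies on the usual stable-baselines pattern of reloading a saved model, swapping in a harder environment, and continuing training with reset_num_timesteps=False so the timestep counter and TensorBoard curves carry over between levels. A standalone sketch of that single step, assuming the custom DeepWellEnvSpherlevel* environments are registered with Gym and using placeholder model paths:

# Sketch: resume a saved DQN on the next, harder level and keep the global
# timestep counter so logging continues instead of restarting at zero.
from stable_baselines import DQN
from stable_baselines.common import make_vec_env

model = DQN.load('models/level_model_1')   # placeholder path to the previous level's model
model.set_env(make_vec_env('DeepWellEnvSpherlevel2-v0', n_envs=1))
model.learn(total_timesteps=100000, reset_num_timesteps=False)
model.save('models/level_model_2')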
Example #3
def train_once(graph: nx.Graph, clusters: list, pos: dict, env_name: str='Controller-Select-v0', compute_optimal: bool=True, trained_model: DQN=None, steps: int=2e5, logdir: str='train_log_compare', env_kwargs: dict={}) -> (DQN, float, float):
	"""
	Main training loop. Initializes RL environment, performs training, and outputs results
	Args:
		graph (nx.Graph): NetworkX graph to train on
		clusters (list): List of lists of nodes in each cluster
		pos (dict): Graph rendering positions
		env_name (str): Name of Gym environment
		compute_optimal (bool): Whether to compute optimal set of controllers by brute-force
		trained_model (DQN): Provide starting model to train on
	Return:
		Trained model
	"""
	# Selecting controllers one-at-a-time environment
	env = gym.make(env_name, graph=graph, clusters=clusters, pos=pos, **env_kwargs)
	heuristic_controllers, heuristic_distance = env.compute_greedy_heuristic()
	print("WMSCP Greedy Heuristic: {}, {}".format(heuristic_controllers, heuristic_distance))
	#for i in range(1000):
	#	env.reset()
	#	print(env.graph.size(weight='weight'))
	orig_graph = env.original_graph
	optimal_controllers = None
	if compute_optimal:
		print("Computing optimal!")
		optimal_controllers = env.calculateOptimal()


	# Generate custom replay buffer full of valid experiences to speed up exploration of training
	def add_wrapper(replay_buffer):
		# Replay buffer maxsize is by default 50000. Should this be lowered?
		# valid_controllers_set = [env._random_valid_controllers() for i in range(int(replay_buffer._maxsize * 0.5 / len(clusters)))]
		# Uses the heuristic controller set as the initial 'random' controllers
		valid_controllers_set = env.graphCentroidAction()
	
		for valid_controllers in valid_controllers_set:
			obs_current = env.reset()  # Really strange issue - obs_current follows the change in env.state, making it equal to obs!
			for controller in valid_controllers:
				(obs, rew, done, _) = env.step(controller)
				replay_buffer.add(obs_current, controller, rew, obs, done)  # For some reason, obs is a pointer which ends up being the very last obs before reset, so need to copy
				obs_current = obs.copy()
		return replay_buffer

	# Agent
	model = None
	if trained_model is None:
		print("Creating new training model!")
		model = DQN(LnMlpPolicy, env, tensorboard_log=logdir, verbose=0, full_tensorboard_log=True, exploration_initial_eps=0.5, exploration_fraction=0.2, learning_starts=0, target_network_update_freq=100, batch_size=32, learning_rate=0.00025)
	else:
		print("Using provided training model!")
		model = trained_model
		model.set_env(env)
		model.tensorboard_log = logdir

	# Train the agent
	print("Training!")
	model.learn(total_timesteps=int(steps))#, callback=callback)#, replay_wrapper=add_wrapper)

	# Run a single run to evaluate the DQN
	obs = env.reset()
	reward = 0 #We want the last reward to be minimal (perhaps instead do cumulative?)
	reward_final = 0
	done = False
	action = None
	final_rl_actions = []
	while not done:
		action, states = model.predict(obs)
		(obs, rew, done, _) = env.step(action)
		final_rl_actions.append(action)
		reward += rew
		reward_final = rew

	# Show controllers chosen by the model
	env.render(mode='graph_end.png')
	print(env.controllers, reward_final)
	print("BEST EVER:")
	print(env.best_controllers, env.best_reward)
	best_reward = env.optimal_neighbors(graph, env.best_controllers)
	print(best_reward)

	average_graph = env.average_graph.copy()
	rl_controllers = env.controllers
	rl_best_controllers = env.best_controllers
	if env_name == 'Controller-Cluster-v0':
		rl_controllers.sort()
		rl_best_controllers.sort()
		cluster_len = len(clusters[0])
		for i in range(len(clusters)):
			rl_controllers[i] -= i * cluster_len
			rl_best_controllers[i] -= i * cluster_len
	env.reset(adjust=False, full=True)
	nx.write_gpickle(average_graph, 'average_graph.gpickle')
	env.graph = average_graph.copy()
	for cont in rl_controllers:
		(_, reward_final, _, _) = env.step(cont)
	print("RL Controllers on average change graph {} - {}".format(env.controllers, reward_final))
	env.reset(adjust=False, full=True)
	env.graph = average_graph.copy()
	for cont in rl_best_controllers:
		(_, reward_final, _, _) = env.step(cont)
	print("RL Best Controllers on average change graph {} - {}".format(env.best_controllers, reward_final))
	# Show controllers chosen using heuristic
	centroid_controllers, heuristic_distance = env.graphCentroidAction()
	#centroid_controllers, heuristic_distance = env.compute_greedy_heuristic()
	# Convert heuristic controllers to actual
	if env_name == 'Controller-Cluster-v0' or env_name == 'Controller-Cluster-Options-v0':
		# Assume all clusters same length
		centroid_controllers.sort()
		cluster_len = len(clusters[0])
		for i in range(len(clusters)):
			centroid_controllers[i] -= i * cluster_len
	env.reset(adjust=False, full=True)
	env.graph = average_graph.copy()
	for cont in centroid_controllers:
		(_, reward_final, _, _) = env.step(cont)
	env.render(mode='graph_heuristic.png')
	best_heuristic = reward_final
	print("Heuristic on average change graph {} - {}".format(env.controllers, reward_final))
	#print("Heuristic optimal {} - {}".format(*env.optimal_neighbors(graph,  env.controllers)))
	heuristic_controllers = env.controllers

	rl_rewards = []
	heuristic_rewards = []
	rl_best_rewards = []
	NUM_GRAPHS = 100
	for i in range(NUM_GRAPHS):
		rl_reward = None
		heuristic_reward = None
		rl_best_reward = None
		env.reset()
		nx.write_gpickle(env.graph, '100Graphs/graph_{}.gpickle'.format(i))
		for cont in final_rl_actions:
			(_, rl_reward, _, _) = env.step(cont)
		env.reset(adjust=False, full=False)
		for cont in centroid_controllers:
			(_, heuristic_reward, _, _) = env.step(cont)
		env.reset(adjust=False, full=False)
		for cont in rl_best_controllers:
			(_, rl_best_reward, _, _) = env.step(cont)
		print("RL REWARD, RL BEST REWARD, HEURISTIC: {}\t{}\t{}".format(rl_reward, rl_best_reward, heuristic_reward))
		rl_rewards.append(rl_reward)
		heuristic_rewards.append(heuristic_reward)
		rl_best_rewards.append(rl_best_reward)

	def create_hist(fig, data, title=None, color=None):
		bins = np.arange(min(data) - 100, max(data) + 100, 100)
		plt.xlim([min(data) - 100, max(data) + 100])
		fig.hist(data, bins=bins, alpha=0.5, color=color)
		if title:
			fig.set_title(title)  # 'fig' here is actually a matplotlib Axes, which provides set_title()
		plt.xlabel('Controller Distances')
		plt.ylabel('Count')
	fig = plt.figure()
	ax1 = fig.add_subplot(2, 1, 1)
	create_hist(ax1, rl_rewards, color='blue')
	create_hist(ax1, heuristic_rewards, color='red')
	create_hist(ax1, rl_best_rewards, color='green')
	ax2 = fig.add_subplot(2, 1, 2)
	ax2.plot(np.arange(0, NUM_GRAPHS, 1), rl_rewards, c='blue')
	ax2.plot(np.arange(0, NUM_GRAPHS, 1), heuristic_rewards, c='red')
	ax2.plot(np.arange(0, NUM_GRAPHS, 1), rl_best_rewards, c='green')
	plt.show()
	# Show optimal
	if optimal_controllers is not None:
		env.reset()
		for cont in optimal_controllers[0]:
			(_, reward_final, _, _) = env.step(cont)
		env.render(mode='graph_optimal.png')
		print(env.controllers, reward_final)
		print(optimal_controllers)
	return model, best_reward, best_heuristic
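A hedged driver for train_once() on a toy topology follows; the graph construction, cluster split, and layout are illustrative assumptions, and the Controller-Select-v0 environment must already be registered by the project for gym.make to find it. Only the call signature comes from the function above.

# Hypothetical driver for train_once(); the graph/cluster setup is invented
# purely for illustration.
import networkx as nx

graph = nx.connected_watts_strogatz_graph(12, k=4, p=0.3)   # small toy topology
nx.set_edge_attributes(graph, 1, 'weight')                  # the env reads edge weights
clusters = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]     # equal-size clusters
pos = nx.spring_layout(graph)                               # rendering positions

model, best_rl_reward, best_heuristic = train_once(
    graph, clusters, pos,
    env_name='Controller-Select-v0',
    compute_optimal=False,      # brute-force search is expensive on larger graphs
    steps=2e5,
)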
Example #4
programing_type = int(sys.argv[1])

# Start to train the agent
if programing_type == 0:
	env = gym.make(environment_name)
	model = DQN("MlpPolicy", env, learning_rate=0.0001, gamma=0.7, batch_size=1024, prioritized_replay=False, verbose=1, tensorboard_log="./log/dqn_crossy_road_tensorboard_without_prioritized/")
	model.learn(total_timesteps=30000)
	model.save("../model/DQN_without_prioritized")
	env.close()

# Continue to train
elif programing_type == 1:
	myenv = gym.make(environment_name)
	env = DummyVecEnv([lambda: myenv])
	model = DQN.load('../model/DQN_without_prioritized', env=env)
	model.set_env(env)
	model.learn(total_timesteps=20000, callback=None, reset_num_timesteps=False)
	model.save("../model/DQN_without_prioritized")
	env.close()

# Test the agent
else:
	myenv = gym.make(environment_name)
	env = DummyVecEnv([lambda: myenv])
	model = DQN.load('../model/DQN_without_prioritized', env=env)
	result = {}

	mean_reward = []
	scores = []

	episodes = 1000
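The test branch above is cut off after episodes = 1000; the original evaluation loop is not shown. A minimal evaluation loop consistent with the variables it sets up might look like the sketch below (an assumption, not the original continuation); note that DummyVecEnv returns batched observations, rewards, and done flags.

# Hypothetical evaluation loop (the original snippet is truncated above).
for episode in range(episodes):
    obs = env.reset()
    done = [False]
    total_reward = 0.0
    while not done[0]:                          # DummyVecEnv returns batched values
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        total_reward += rewards[0]
    scores.append(total_reward)

mean_reward.append(sum(scores) / len(scores))
print("Mean reward over {} episodes: {}".format(episodes, mean_reward[-1]))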
Example #5
    def __init__(self, env: Env, params: dict, model_path: str, log_path: str):
        """Initialize.

        :param env: gym environment. Assuming observation space is a tuple,
            where first component is from original env, and the second is
            temporal goal state.
        :param params: dict of parameters, like `default_parameters`.
        :param model_path: directory where to save models.
        :param log_path: directory where to save tensorboard logs.
        """
        # Check
        if params["initialize_file"]:
            raise ValueError(
                "Initialization not supported; use resuming option")
        if params["action_bias"]:
            raise ValueError("Action bias is not maintained here")

        # Alias
        original_env = env

        # Load a saved agent for the action bias
        self.biased_agent: Optional[DQN] = None
        if params["action_bias"]:
            loading_params = dict(params)
            loading_params["resume_file"] = params["action_bias"]
            loading_params["action_bias"] = None

            self.biased_agent = TrainStableBaselines(
                env=env,
                params=loading_params,
                model_path=model_path,
                log_path=log_path,
            ).model

        # Collect statistics
        #    (assuming future wrappers do not modify episodes)
        env = MyStatsRecorder(env=env, gamma=params["gamma"])

        # Callbacks
        checkpoint_callback = CustomCheckpointCallback(
            save_path=model_path,
            save_freq=params["save_freq"],
            extra=None,
        )
        stats_logger_callback = StatsLoggerCallback(stats_recorder=env,
                                                    scope="env0")

        callbacks_list = [checkpoint_callback, stats_logger_callback]
        if params["render"]:
            renderer_callback = RendererCallback()
            callbacks_list.append(renderer_callback)

        # If training a passive agent log this too
        if params["active_passive_agents"]:

            # Find the reward shaping env
            reward_shaping_env = find_wrapper(env, RewardShapingWrapper)

            passive_stats_env = MyStatsRecorder(
                env=UnshapedEnv(reward_shaping_env),
                gamma=params["gamma"],
            )

            passive_stats_callback = StatsLoggerCallback(
                stats_recorder=passive_stats_env,
                scope="env1",
            )
            callbacks_list.append(passive_stats_callback)

            # Make it move with the original env
            env = UnshapedEnvWrapper(
                shaped_env=env,
                unshaped_env=passive_stats_env,
            )
            original_reward_getter = env.get_reward  # alias
        else:
            original_reward_getter = None

        # Combine callbacks
        all_callbacks = CallbackList(callbacks_list)

        # Define or load
        resuming = bool(params["resume_file"])
        if not resuming:
            # Normalizer
            normalized_env = NormalizeEnvWrapper(
                env=env,
                training=True,
                entry=0,  # Only env features, not temporal goal state
            )
            flat_env = BoxAutomataStates(normalized_env)
            # Saving normalizer too
            checkpoint_callback.saver.extra_model = normalized_env

            # Agent
            model = DQN(
                env=flat_env,
                policy=ModularPolicy,
                policy_kwargs={
                    "layer_norm": params["layer_norm"],
                    "layers": params["layers"],
                    "shared_layers": params["shared_layers"],
                    "dueling": params["dueling"],
                },
                gamma=params["gamma"],
                learning_rate=params["learning_rate"],
                train_freq=params["train_freq"],
                double_q=True,
                batch_size=params["batch_size"],
                buffer_size=params["buffer_size"],
                learning_starts=params["learning_starts"],
                prioritized_replay=True,
                target_network_update_freq=params[
                    "target_network_update_freq"],
                exploration_fraction=params["exploration_fraction"],
                exploration_final_eps=params["exploration_final_eps"],
                exploration_initial_eps=params["exploration_initial_eps"],
                active_passive_agents=params["active_passive_agents"],
                passive_reward_getter=original_reward_getter,
                tensorboard_log=log_path,
                full_tensorboard_log=False,
                verbose=1,
            )
        else:
            # Reload model
            model, extra_model, counters = checkpoint_callback.load(
                path=params["resume_file"], )

            # Restore normalizer and env
            normalized_env = extra_model
            normalized_env.set_env(env)
            flat_env = BoxAutomataStates(normalized_env)

            # Restore properties
            model.tensorboard_log = log_path
            model.num_timesteps = counters["step"]
            model.learning_starts = params["learning_starts"] + counters["step"]
            model.set_env(flat_env)
            model.passive_reward_getter = original_reward_getter

        # Store
        self.params = params
        self.resuming = resuming
        self.saver = checkpoint_callback
        self.logger = stats_logger_callback
        self.callbacks = all_callbacks
        self.model: DQN = model
        self.normalized_env = normalized_env
        self.testing_agent = model if not params[
            "test_passive"] else model.passive_agent