def main():
    ray.shutdown()
    ray.init(include_webui=False, ignore_reinit_error=True,
             redis_max_memory=500000000, object_store_memory=5000000000)

    env = CartPoleEnv()
    env.reset()

    # Number of collecting workers and evaluation workers.
    cw_num = 4
    ew_num = 4
    training_episodes, test_interval, trials = 10000, 50, 30

    agent = distributed_DQN_agent(env, hyperparams_CartPole, cw_num, ew_num,
                                  training_episodes, test_interval, trials)

    start_time = time.time()
    result = agent.learn_and_evaluate()
    run_time = {}
    run_time['distributed DQN agent'] = time.time() - start_time
    print("running time: ", run_time['distributed DQN agent'])

    plot_result(result, test_interval, ["batch_update with target_model"])
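# If this module is meant to be run directly (an assumption; the project's actual
# entry point is not shown here), the standard guard below would invoke main():
if __name__ == "__main__":
    main()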
def __init__(self, hyper_params, memory_server, nb_agents, nb_evaluators,
             action_space=len(ACTION_DICT)):
    self.beta = hyper_params['beta']
    self.initial_epsilon = 1
    self.final_epsilon = hyper_params['final_epsilon']
    self.epsilon_decay_steps = hyper_params['epsilon_decay_steps']
    self.hyper_params = hyper_params
    self.update_steps = hyper_params['update_steps']
    self.model_replace_freq = hyper_params['model_replace_freq']
    self.action_space = action_space
    self.batch_size = hyper_params['batch_size']
    self.memory_server = memory_server
    self.nb_agents = nb_agents
    self.nb_evaluators = nb_evaluators

    # Use a throwaway environment to infer the network's input/output sizes.
    env = CartPoleEnv()
    state = env.reset()
    input_len = len(state)
    output_len = action_space

    self.eval_model = DQNModel(input_len, output_len,
                               learning_rate=hyper_params['learning_rate'])
    self.target_model = DQNModel(input_len, output_len)

    # One remote collector agent and one remote evaluator per worker.
    self.agents = [
        DQN_agent_remote.remote(CartPoleEnv(), memory_server, hyper_params,
                                action_space, i)
        for i in range(nb_agents)
    ]
    self.evaluators = [
        EvalWorker.remote(self.eval_model, CartPoleEnv(),
                          hyper_params['max_episode_steps'],
                          hyper_params['eval_trials'], i)
        for i in range(nb_evaluators)
    ]
def learn_and_evaluate(self):
    worker_id = []
    # evaluators_id = []

    for i in range(self.cw_num):
        simulator = CartPoleEnv()
        worker_id.append(collecting_server.remote(simulator, self.model_server,
                                                  self.memory, action_space=2))
    # ray.wait(collectors_id, len(collectors_id))
    #
    for j in range(self.ew_num):
        simulator = self.env
        worker_id.append(evaluation_server.remote(simulator, self.model_server))

    # Block until every collector and evaluator task has finished.
    ray.wait(worker_id, num_returns=len(worker_id))
    return ray.get(self.model_server.get_reuslts.remote())
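# The shared replay memory handed to each collector (self.memory) is a Ray actor
# created from ReplayBuffer_remote in memory_remote.py (imported further below but
# not defined in these snippets). The class below is only a minimal, self-contained
# sketch of such an actor; its name and the add()/sample()/size() methods are
# assumptions for illustration, not the project's actual API.
import random
from collections import deque

import ray

@ray.remote
class ReplayBufferSketch(object):
    def __init__(self, capacity):
        # Oldest transitions are evicted automatically once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a mini-batch; return an empty list if not enough data yet.
        if len(self.buffer) < batch_size:
            return []
        return random.sample(list(self.buffer), batch_size)

    def size(self):
        return len(self.buffer)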
plt.xlabel("Episodes")
plt.ylabel("Total Rewards")
plt.savefig("Distributed_DQN_4_Collectors_4_Workers.png")
plt.show()

ray.shutdown()
ray.init(include_webui=False, ignore_reinit_error=True,
         redis_max_memory=500000000, object_store_memory=5000000000)

from memory_remote import ReplayBuffer_remote
from dqn_model import _DQNModel
import torch
from custom_cartpole import CartPoleEnv

from collections import deque

simulator = CartPoleEnv()

result_folder = ENV_NAME
result_file = ENV_NAME + "/results4.txt"
if not os.path.isdir(result_folder):
    os.mkdir(result_folder)
torch.set_num_threads(12)

# Shared replay memory actor with a capacity of 2000 transitions.
Memory_Server = ReplayBuffer_remote.remote(2000)

@ray.remote
class DQN_Model_Server():
    def __init__(self, env, hyper_params, batch_size, update_steps, memory_size,
                 beta, model_replace_freq, learning_rate, use_target_model=True,
                 memory=Memory_Server, action_space=2, training_episodes=7000,
                 test_interval=50):
from custom_cartpole import CartPoleEnv

# Set the Env name and action space for CartPole
ENV_NAME = 'CartPole_distributed'
# Set the result saving folder
result_folder = ENV_NAME + "_distributed"
result_file = result_folder + "/results.txt"
if not os.path.isdir(result_folder):
    os.mkdir(result_folder)
torch.set_num_threads(12)

# Move left, Move right
ACTION_DICT = {"LEFT": 0, "RIGHT": 1}

# Register the environment
env = CartPoleEnv()
memory = ReplayBuffer_remote.remote(2000)

@ray.remote
class DQN_server(object):
    def __init__(self, learning_rate, training_episodes, memory, env,
                 test_interval=50, batch_size=32, action_space=len(ACTION_DICT),
                 beta=0.99):
# Set the Env name and action space for CartPole
ENV_NAME = 'CartPole_distributed'
# Set the result saving folder
result_folder = ENV_NAME
result_file = result_folder + "/results.txt"
if not os.path.isdir(result_folder):
    os.mkdir(result_folder)
torch.set_num_threads(12)

# Move left, Move right
ACTION_DICT = {"LEFT": 0, "RIGHT": 1}

# make environment
env_CartPole = CartPoleEnv()

def plot_result(total_rewards, learning_num, legend):
    print("\nLearning Performance:\n")
    episodes = []
    for i in range(len(total_rewards)):
        episodes.append(i * learning_num + 1)

    plt.figure(num=1)
    fig, ax = plt.subplots()
    plt.plot(episodes, total_rewards)
    plt.title('performance')
    plt.legend(legend)
    plt.xlabel("Episodes")
    plt.ylabel("total rewards")
    'batch_size': 32,
    'update_steps': 10,
    'memory_size': 2000,
    'beta': 0.99,
    'model_replace_freq': 2000,
    'learning_rate': 0.0003,
    'use_target_model': True
}

# =================== Initialize Environment ===================
# Set the Env name and action space for CartPole
ENV_NAME = 'CartPole_distributed'
# Move left, Move right
ACTION_DICT = {"LEFT": 0, "RIGHT": 1}
# Register the environment
env_CartPole = CartPoleEnv()

# =================== Ray Init ===================
ray.shutdown()
# ray.init(include_webui=False, ignore_reinit_error=True, redis_max_memory=500000000, object_store_memory=5000000000)
ray.init()

# =================== DQN ===================
class DQN_agent(object):
    def __init__(self, env, hyper_params, action_space=len(ACTION_DICT)):
        self.env = env
        self.max_episode_steps = env._max_episode_steps

        """
        beta: The discount factor of the Q-value function.
        epsilon: The epsilon used by the explore-or-exploit policy.
####################

training_episodes, test_interval = 7000, 50
ENV_NAME = 'CartPole_distributed'

cartpole = CartPoleEnv()
# env_CartPole = gym.make(CartPoleEnv)
# print(cartpole.reset())

agent = distributed_DQN_agent(cartpole, collectors_num=8, evaluators_num=4)
result = agent.learn_and_evaluate()
plot_result(result, test_interval, ["batch_update with target_model"])
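# learn_and_evaluate() above fans out collecting_server and evaluation_server tasks
# and then blocks on ray.wait until every task has finished. The self-contained
# snippet below only illustrates that fan-out/fan-in pattern with dummy tasks; the
# function names here are placeholders, not project code.
import ray

@ray.remote
def dummy_worker(index):
    # Stand-in for a collecting_server / evaluation_server task.
    return index

def run_all_workers(num_workers=12):
    ray.init(ignore_reinit_error=True)
    task_ids = [dummy_worker.remote(i) for i in range(num_workers)]
    # Block until all tasks are done, as learn_and_evaluate() does.
    ray.wait(task_ids, num_returns=len(task_ids))
    return ray.get(task_ids)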
hyperparams = {
    'epsilon_decay_steps': 7000,
    'final_epsilon': 0.1,
    'batch_size': 10,
    'update_steps': 5,
    'memory_size': 2000,
    'beta': 0.99,
    'model_replace_freq': 2000,
    'learning_rate': 0.0003,
    'use_target_model': True,
    'workers': (12, 4),
    'do_test': True,
    'initial_epsilon': 1,
    'steps': 0,
    'training_episodes': 7000,
    'test_interval': 50
}

start_time = time.time()

env = CartPoleEnv()
env.reset()
agent = distributed_RL_agent(env, hyperparams)
result = agent.learn_and_evaluate()
print(result)
print(time.time() - start_time)
# plot_result(result, test_interval, ["batch_update with target_model"])
print("Done!!")
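# The hyperparameters above describe a linearly decaying exploration rate: epsilon
# starts at 'initial_epsilon', reaches 'final_epsilon' after 'epsilon_decay_steps'
# steps, and stays there. The agent's own decay code is not shown in these snippets;
# the helper below is just a sketch of that standard linear schedule.
def linear_decay_epsilon(step, initial_epsilon=1.0, final_epsilon=0.1,
                         epsilon_decay_steps=7000):
    # Fraction of the decay window already consumed, clipped to [0, 1].
    fraction = min(float(step) / epsilon_decay_steps, 1.0)
    return initial_epsilon + fraction * (final_epsilon - initial_epsilon)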
from custom_cartpole import CartPoleEnv
import gym

env = CartPoleEnv()
for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        env.render()
        print(observation)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t + 1))
            break