import gym
import numpy as np
import torch
import torch.multiprocessing as mp
import matplotlib.pyplot as plt

# NormalizedEnv, Action, Actor, DDPGAgent, ReplayMemory, Config and to_tensor
# are project-local classes and helpers defined elsewhere in this repo.


def collect_process(agent_index, queue_mem, actor_param):
    """Worker process: roll out episodes and push transitions into the shared queue."""
    env = NormalizedEnv(gym.make('Pendulum-v0'))
    agent = Action(state_dim=env.observation_space.shape[0],
                   action_dim=env.action_space.shape[0])
    try:
        while True:
            done = False
            state = env.reset()
            # Min-max normalize the observation into [0, 1].
            state = (state - env.observation_space.low) / (
                env.observation_space.high - env.observation_space.low)
            # Refresh the local policy from the shared target actor.
            agent.load_param(actor_param)
            print("agent {} load param".format(agent_index))
            while not done:
                action = agent.chose_action(state, explort=True)
                next_state, reward, done, _ = env.step(action)
                # env.render()
                next_state = (next_state - env.observation_space.low) / (
                    env.observation_space.high - env.observation_space.low)
                # 1 while the episode is running, 0 at the terminal step,
                # so this can be used directly as a continuation mask.
                is_done = 0 if done else 1
                queue_mem.put((state, action, next_state, reward, is_done))
                state = next_state
    except Exception as e:
        print(e)
        print("agent {} exit".format(agent_index))
        env.close()
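# The min-max normalization above is repeated in every loop here and in the
# functions below; a small helper (hypothetical name, not part of the original
# code) would keep it in one place:
def normalize_state(env, state):
    # Scale an observation into [0, 1] using the observation space's bounds.
    return (state - env.observation_space.low) / (
        env.observation_space.high - env.observation_space.low)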
def main():
    mp.set_start_method('spawn')
    config = Config()
    # 1. Initialize the environment.
    env = NormalizedEnv(gym.make('Pendulum-v0'))
    # 2. Initialize the agent.
    agent = DDPGAgent(env=env,
                      seed=config.seed,
                      batch_size=config.batch_size,
                      learning_rate_actor=config.learning_rate_actor,
                      learning_rate_critic=config.learning_rate_critic,
                      weight_decay=config.weight_decay)
    # Put the target actor's parameters in shared memory so the collector
    # and test processes can read the latest weights.
    agent.target_actor.share_memory()
    # 3. Initialize the replay memory and the transition queue.
    memory = ReplayMemory(config.capacity)
    q = mp.Queue(10)
    process_collect_list = []
    for i in range(config.agent_num):
        process_name = "collect_process_" + str(i)
        process = mp.Process(name=process_name,
                             target=collect_process,
                             args=(i, q, agent.target_actor))
        process.start()
        process_collect_list.append(process)
    steps = mp.Value('d', 0)
    test_p = mp.Process(name="test_process",
                        target=test_process,
                        args=(config, steps, agent.target_actor))
    test_p.start()
    process_collect_list.append(test_p)
    try:
        while True:
            # Drain whatever the collectors have queued so far.
            # (Avoid shadowing the built-in len here.)
            pending = q.qsize()
            while pending:
                mem = q.get()
                memory.push(mem[0], mem[1], mem[2], mem[3], mem[4])
                pending -= 1
            # 4.4 Learn once enough transitions have been gathered.
            if memory.len > config.batch_size:
                agent.learning(memory)
            # Save the model periodically.
            if steps.value > 1 and steps.value % config.save_steps == 0:
                agent.save_models(int(steps.value // config.save_steps))
            steps.value += 1
    except Exception as e:
        print(e)
    finally:
        # Join the workers on any exit path, including Ctrl-C.
        for process in process_collect_list:
            process.join()
            print(process.name + " stop")
        env.close()
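# With the 'spawn' start method, the training script needs an entry-point
# guard so child processes do not re-run main() when they import the module
# (a standard multiprocessing requirement; this assumes the function above is
# the training script's entry point):
if __name__ == '__main__':
    main()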
def test_process(config, steps, target_actor):
    """Evaluation process: periodically load the shared weights and run test episodes."""
    env = NormalizedEnv(gym.make('Pendulum-v0'))
    agent = Action(state_dim=env.observation_space.shape[0],
                   action_dim=env.action_space.shape[0])
    reward_list = []
    try:
        while True:
            # Run a test round every config.test_every_eposide training steps.
            if steps.value != 0 and steps.value % config.test_every_eposide == 0:
                agent.load_param(target_actor)
                print("test agent load param")
                et_reward = 0
                for index in range(config.num_eposide_test):
                    episode_reward = 0
                    state = env.reset()
                    state = (state - env.observation_space.low) / (
                        env.observation_space.high - env.observation_space.low)
                    while True:
                        # Act greedily during evaluation (no exploration noise).
                        action = agent.chose_action(state, explort=False)
                        next_state, reward, done, _ = env.step(action)
                        env.render()
                        next_state = (next_state - env.observation_space.low) / (
                            env.observation_space.high - env.observation_space.low)
                        episode_reward += reward
                        state = next_state
                        if done:
                            break
                    et_reward += episode_reward
                print("\033[93m [ test ] episode average reward : {}\033[00m".format(
                    et_reward / config.num_eposide_test))
                reward_list.append(et_reward / config.num_eposide_test)
                x = np.arange(len(reward_list))
                y = np.array(reward_list)
                plt.clf()  # Start from a clean figure so old curves do not pile up.
                plt.plot(x, y)
                plt.savefig("./eposide_reward.png")
    except Exception as e:
        print(e)
        print("test process exit")
        env.close()
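# Action.load_param is defined elsewhere in the repo; presumably it copies the
# shared target network's weights into the process-local policy. A minimal
# sketch, assuming the local network is stored as self.actor:
#
#     def load_param(self, target_actor):
#         self.actor.load_state_dict(target_actor.state_dict())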
def main():
    env = NormalizedEnv(gym.make('Pendulum-v0'))
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    agent = Actor(state_dim, action_dim).to('cuda')
    agent.load_state_dict(torch.load('./Models/78.0_actor.pt'))
    agent.eval()
    episode = 0
    episode_list = []
    while episode < 100:
        episode_reward = 0
        done = False
        state = env.reset()
        state = (state - env.observation_space.low) / (
            env.observation_space.high - env.observation_space.low)
        state = to_tensor(state)
        while not done:
            # Pure inference: no gradients are needed here.
            with torch.no_grad():
                action = agent(state).cpu().numpy()
            state_, reward, done, _ = env.step(action)
            state_ = (state_ - env.observation_space.low) / (
                env.observation_space.high - env.observation_space.low)
            env.render()
            state = to_tensor(state_)
            episode_reward += reward
        episode_list.append(episode_reward)
        episode += 1
        print("{} : {}".format(episode, episode_reward))
    x = np.arange(100)
    y = np.array(episode_list)
    plt.plot(x, y)
    plt.savefig("./test_eposide_reward.png")
    env.close()
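# to_tensor is a repo helper defined elsewhere; since the model lives on the
# GPU, it presumably returns a float tensor on the same device. A minimal
# sketch (an assumption, not the repo's actual implementation):
#
#     def to_tensor(x, device='cuda'):
#         return torch.from_numpy(np.asarray(x, dtype=np.float32)).to(device)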