            var=var)
net.load()

# Train
reward_list = []
for i in range(episode):
    s = env.reset()
    total_reward = 0
    for j in range(max_iter):
        # env.render()
        a = net.chooseAction(s)
        s_, r, finish, info = env.step(a)
        # Store the transition in the replay memory and update the networks
        net.store_path(s, a, r / 10, s_)
        net.update()
        # Update the total reward and move on to the next state
        s = s_
        total_reward += r
        # Each episode runs for a fixed number of steps; log at the last step
        if j == max_iter - 1:
            print("Episode: %d \tReward: %i \t Explore: %.2f \t Pointer: %d"
                  % (i, total_reward, net.var, net.memory_counter))
            reward_list.append(total_reward)
            break

net.save()
env.close()

# Plot the reward curve over all training episodes
plt.plot(range(len(reward_list)), reward_list, '-o')
plt.title('The reward curve of DDPG')
plt.show()
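A single-run DDPG reward curve is usually quite noisy, so the raw plot above can be hard to read. If a smoother trend line is wanted, a simple moving average can be overlaid on the raw rewards. The helper below is an optional sketch, not part of the original code; it only assumes the `reward_list` built by the training loop above and NumPy/Matplotlib being available.

```python
import numpy as np
import matplotlib.pyplot as plt

def moving_average(values, window=10):
    """Smooth a sequence of episode rewards with a simple moving average."""
    if len(values) < window:
        return np.asarray(values)
    kernel = np.ones(window) / window
    return np.convolve(values, kernel, mode='valid')

# Usage sketch: overlay the smoothed curve on the raw episode rewards.
# smoothed = moving_average(reward_list, window=10)
# plt.plot(range(len(reward_list)), reward_list, alpha=0.3, label='raw')
# plt.plot(range(len(smoothed)), smoothed, label='moving average (10)')
# plt.legend()
# plt.show()
```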
def master_loop(env):
    # Set up a file logger for training progress
    logger = logging.getLogger()
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fileHandler = logging.FileHandler('./log/test.log')
    fileHandler.setFormatter(formatter)
    logger.addHandler(fileHandler)
    logger.setLevel(logging.INFO)

    # Environment dimensions and action bounds
    s_dim = env.get_s_dim()
    a_dim = env.get_a_dim()
    a_high = env.get_a_high()
    a_low = env.get_a_low()
    print("s_dim: {}, a_dim: {}, a_high: {}, a_low: {}".format(s_dim, a_dim, a_high, a_low))

    ddpg = DDPG(a_dim, s_dim, a_high, a_low,
                lr_a=LR_A, lr_c=LR_C, gamma=GAMMA, tau=TAU,
                rpm_size=MEMORY_CAPACITY, batch_size=BATCH_SIZE)

    status = MPI.Status()
    start_time = time.time()
    reset_time = time.time()
    total_eps = 0
    total_step = 0
    n_step = 0
    n_eps = 0
    max_reward = -9999
    max_reward_rank = 0

    ddpg.load()

    while total_eps < MAX_EPISODES:
        # Wait for a message from any worker
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
        source = status.Get_source()
        tag = status.Get_tag()

        if tag == REQ_ACTION:
            # A worker asks for an action given its current observation
            # action = env.action_space.sample()
            action = ddpg.choose_action(data)
            comm.send((action, total_eps, total_step), dest=source, tag=RSP_ACTION)
        elif tag == OBS_DATA:
            # A worker reports a transition it has just experienced
            n_step += 1
            total_step += 1
            (s, a, r, s_, done, ep_reward, ep_step) = data
            is_done = 1.0 if done else 0.0
            ddpg.store_transition(s, a, r, s_, is_done)
            if ddpg.pointer > LEARN_START and total_step % 3 == 0:
                ddpg.learn()

            if done:
                total_eps += 1
                n_eps += 1  # count episodes since the last statistics reset
                if ep_reward > max_reward:
                    max_reward = ep_reward
                    max_reward_rank = source
                msg = "eps: {:>8}, worker: {:>3}, ep_reward:{:7.4f}, max:{:7.4f}/{:>3}, step:{:4}".format(
                    total_eps, source, ep_reward, max_reward, max_reward_rank, ep_step)
                logging.info(msg)

                if total_eps % 500 == 0:
                    ddpg.save(total_eps)
                    interval = time.time() - reset_time
                    msg = "# total_step: {:>8}, total_eps: {:>6}, eps/min: {:>6}, frame/sec: {:>6}".format(
                        total_step, total_eps, n_eps / interval * 60, n_step / interval)
                    logging.info(msg)
                    n_step = 0
                    n_eps = 0
                    reset_time = time.time()
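master_loop only implements the learner side of the protocol: it expects workers to send their current observation with tag REQ_ACTION, receive an action back with tag RSP_ACTION, and report every transition with tag OBS_DATA. The sketch below shows what a matching worker could look like. It is an assumption, not part of the original source: it reuses the `comm`, tag constants, and env accessors seen in master_loop, while `worker_loop` itself, the exploration noise, the assumed `env.step()` return signature, and shutdown handling are illustrative choices.

```python
import numpy as np

def worker_loop(env):
    """Minimal worker sketch: query the master (rank 0) for actions and
    stream transitions back. Assumes the same comm object and
    REQ_ACTION / RSP_ACTION / OBS_DATA tags as master_loop; graceful
    shutdown when the master stops is omitted."""
    while True:
        s = env.reset()
        ep_reward = 0.0
        ep_step = 0
        done = False
        while not done:
            # Ask the master for an action for the current state
            comm.send(s, dest=0, tag=REQ_ACTION)
            action, total_eps, total_step = comm.recv(source=0, tag=RSP_ACTION)

            # Optional exploration noise, clipped to the action bounds
            action = np.clip(action + np.random.normal(0, 0.1, size=np.shape(action)),
                             env.get_a_low(), env.get_a_high())

            # env.step() signature assumed to match the Gym-style one used earlier
            s_, r, done, info = env.step(action)
            ep_reward += r
            ep_step += 1

            # Report the transition so the master can store it and learn
            comm.send((s, action, r, s_, done, ep_reward, ep_step), dest=0, tag=OBS_DATA)
            s = s_
```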