class RLBenchmarkDispatcher(DispatcherBase):
    '''
    An RL benchmark dispatcher for the elevator system
    '''

    def load_settings(self):
        self._obs_dim = obs_dim(self._mansion)
        self._act_dim = act_dim(self._mansion)
        self._ele_num = self._mansion._elevator_number
        self._max_floor = self._mansion._floor_number
        self._global_step = 0
        # A single replay memory, model and agent are shared by all elevators.
        self._rpm = ReplayMemory(MEMORY_SIZE, self._obs_dim, 1)
        self._model = RLDispatcherModel(self._act_dim)
        hyperparas = {
            'action_dim': self._act_dim,
            'lr': 5.0e-4,
            'gamma': 0.998
        }
        self._algorithm = DQN(self._model, hyperparas)
        self._agent = ElevatorAgent(
            self._algorithm, self._obs_dim, self._act_dim)
        self._warm_up_size = 2000
        self._statistic_freq = 1000
        self._loss_queue = deque()

    def feedback(self, state, action, r):
        self._global_step += 1
        observation_array = mansion_state_preprocessing(state)
        new_actions = list()
        for ele_act in action:
            new_actions.append(action_to_action_idx(ele_act, self._act_dim))

        # Store one transition per elevator once the warm-up phase is over.
        if self._global_step > self._warm_up_size:
            for i in range(self._ele_num):
                self._rpm.append(
                    self._last_observation_array[i], self._last_action[i],
                    self._last_reward, deepcopy(observation_array[i]), False)
        self._last_observation_array = deepcopy(observation_array)
        self._last_action = deepcopy(new_actions)
        self._last_reward = r

        if self._rpm.size() > self._warm_up_size:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                batch_terminal = self._rpm.sample_batch(BATCH_SIZE)
            cost = self._agent.learn(batch_obs, batch_action, batch_reward,
                                     batch_next_obs, batch_terminal)
            self._loss_queue.appendleft(cost)
            if len(self._loss_queue) > self._statistic_freq:
                self._loss_queue.pop()
            if self._global_step % self._statistic_freq == 0:
                self._mansion._config.log_notice(
                    "Temporal Difference Error (Average) %f",
                    sum(self._loss_queue) / float(len(self._loss_queue)))

    def policy(self, state):
        # Epsilon-greedy exploration: the exploration ratio decays from ~1.0
        # towards 0.02 as the global step grows.
        self._exploration_ratio = 500000.0 / (500000.0 + self._global_step) + 0.02
        observation_array = mansion_state_preprocessing(state)
        q_values = self._agent.predict(observation_array)
        ret_actions = list()
        for i in range(self._ele_num):
            if random.random() < self._exploration_ratio:
                action = random.randint(1, self._max_floor)
            else:
                action = np.argmax(q_values[i])
            ret_actions.append(action_idx_to_action(int(action), self._act_dim))
        return ret_actions
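# Hedged usage sketch (not part of the original source): a minimal driver loop
# showing how a dispatcher like the one above could interact with the elevator
# simulator. The `mansion` argument and its `state` / `run_mansion()` members
# are assumptions for illustration only; the real simulator API may differ.
def _example_dispatch_loop(mansion, max_steps=10000):
    dispatcher = RLBenchmarkDispatcher(mansion)      # assumes the base class takes the simulator
    dispatcher.load_settings()
    state = mansion.state                            # hypothetical accessor
    for _ in range(max_steps):
        actions = dispatcher.policy(state)           # per-elevator actions from the RL policy
        reward = mansion.run_mansion(actions)        # hypothetical: advance the simulation one tick
        state = mansion.state
        dispatcher.feedback(state, actions, reward)  # store the transition and learn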
def run_episode(env: Env,
                agent: parl.Agent,
                rpm: ReplayMemory,
                return_time: bool = False):
    if return_time:
        start_tp = time()
        total_sample_time = 0.
        total_learn_time = 0.
    total_reward, steps = 0., 0
    obs = env.reset()
    while True:
        steps += 1
        ls_tp = time()
        # Epsilon-greedy over the continuous action space: with probability
        # EPSILON take a uniformly random action, otherwise query the policy.
        if np.random.random() < param_dict["EPSILON"]:
            action = np.random.uniform(-1., 1., size=(2, ))
        else:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype("float32"))
            action = np.squeeze(action)

        # Add Gaussian noise, clip to [-1, 1], then map to the env's action range.
        action = np.clip(np.random.normal(action, 1.0), -1., 1.)
        if return_time:
            total_sample_time += time() - ls_tp
        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])

        next_obs, reward, done, info = env.step(action)
        rpm.append(obs, action, param_dict["REWARD_SCALE"] * reward, next_obs,
                   done)

        # Learn only after the replay memory holds MEMORY_WARMUP_SIZE samples (warm-up).
        if rpm.size() > param_dict["MEMORY_WARMUP_SIZE"]:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                batch_terminal = rpm.sample_batch(param_dict["BATCH_SIZE"])
            ls_tp = time()
            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
                                      batch_next_obs, batch_terminal)
            if return_time:
                total_learn_time += time() - ls_tp

        obs = next_obs
        total_reward += reward

        if done:
            break

    if return_time:
        run_time = time() - start_tp
        time_info = {
            "run time": run_time,
            "total sample time": total_sample_time,
            "total learn time": total_learn_time
        }
        return total_reward, steps, time_info
    return total_reward, steps
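# Hedged sketch (not part of the original source): one way to drive
# run_episode() from an outer training loop. `ReplayMemory(max_size, obs_dim,
# act_dim)` follows the signature used elsewhere in this code; the gym-style
# `env.observation_space` accessor and the print cadence are assumptions for
# illustration. The module-level `param_dict` must define the keys read inside
# run_episode(): "EPSILON", "REWARD_SCALE", "MEMORY_WARMUP_SIZE", "BATCH_SIZE".
def _example_training_loop(env, agent, num_episodes=1000):
    obs_dim = env.observation_space.shape[0]   # assumes a gym-style env
    rpm = ReplayMemory(int(1e6), obs_dim, 2)   # act_dim=2 matches the (2,) actions above
    for episode in range(num_episodes):
        total_reward, steps = run_episode(env, agent, rpm)
        if episode % 50 == 0:
            print("episode {}: reward={:.2f}, steps={}".format(
                episode, total_reward, steps))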
def main():
    logger.info("-----------------Carla_SAC-------------------")
    logger.set_dir('./{}_train'.format(args.env))

    # Parallel environments for training
    train_envs_params = EnvConfig['train_envs_params']
    env_num = EnvConfig['env_num']
    env_list = ParallelEnv(args.env, args.xparl_addr, train_envs_params)

    # Environment for evaluation
    eval_env_params = EnvConfig['eval_env_params']
    eval_env = LocalEnv(args.env, eval_env_params)

    obs_dim = eval_env.obs_dim
    action_dim = eval_env.action_dim

    # Initialize model, algorithm, agent, replay_memory
    if args.framework == 'torch':
        CarlaModel, SAC, CarlaAgent = TorchModel, TorchSAC, TorchAgent
    elif args.framework == 'paddle':
        CarlaModel, SAC, CarlaAgent = PaddleModel, PaddleSAC, PaddleAgent
    model = CarlaModel(obs_dim, action_dim)
    algorithm = SAC(
        model,
        gamma=GAMMA,
        tau=TAU,
        alpha=ALPHA,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = CarlaAgent(algorithm)
    rpm = ReplayMemory(
        max_size=MEMORY_SIZE, obs_dim=obs_dim, act_dim=action_dim)

    total_steps = 0
    last_save_steps = 0
    test_flag = 0

    obs_list = env_list.reset()

    while total_steps < args.train_total_steps:
        # Train episode
        if rpm.size() < WARMUP_STEPS:
            action_list = [
                np.random.uniform(-1, 1, size=action_dim)
                for _ in range(env_num)
            ]
        else:
            action_list = [agent.sample(obs) for obs in obs_list]
        next_obs_list, reward_list, done_list, info_list = env_list.step(
            action_list)

        # Store data in replay memory
        for i in range(env_num):
            rpm.append(obs_list[i], action_list[i], reward_list[i],
                       next_obs_list[i], done_list[i])

        obs_list = env_list.get_obs()
        total_steps = env_list.total_steps

        # Train agent after collecting sufficient data
        if rpm.size() >= WARMUP_STEPS:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                batch_terminal = rpm.sample_batch(BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_terminal)

        # Save agent
        if total_steps > int(1e5) and total_steps > last_save_steps + int(1e4):
            agent.save('./{}_model/step_{}_model.ckpt'.format(
                args.framework, total_steps))
            last_save_steps = total_steps

        # Evaluate episode
        if (total_steps + 1) // args.test_every_steps >= test_flag:
            while (total_steps + 1) // args.test_every_steps >= test_flag:
                test_flag += 1
            avg_reward = run_evaluate_episodes(agent, eval_env, EVAL_EPISODES)
            tensorboard.add_scalar('eval/episode_reward', avg_reward,
                                   total_steps)
            logger.info(
                'Total steps {}, Evaluation over {} episodes, Average reward: {}'
                .format(total_steps, EVAL_EPISODES, avg_reward))
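# Hedged sketch (not part of the original source): a plausible implementation
# of the run_evaluate_episodes() helper called from main(). The actual version
# in the example repository may differ, e.g. in how it resets or logs the
# evaluation environment.
def run_evaluate_episodes(agent, eval_env, eval_episodes):
    avg_reward = 0.
    for _ in range(eval_episodes):
        obs = eval_env.reset()
        done = False
        while not done:
            action = agent.predict(obs)              # deterministic action for evaluation
            obs, reward, done, _ = eval_env.step(action)
            avg_reward += reward
    return avg_reward / eval_episodes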