Code Example #1
class RLBenchmarkDispatcher(DispatcherBase):
  '''
  A DQN-based RL benchmark dispatcher for the elevator simulation.
  '''

  def load_settings(self):
    self._obs_dim = obs_dim(self._mansion)
    self._act_dim = act_dim(self._mansion)
    self._ele_num = self._mansion._elevator_number
    self._max_floor = self._mansion._floor_number
    self._global_step = 0
    # A single shared replay memory and Q-network are used for all elevators
    self._rpm = ReplayMemory(MEMORY_SIZE, self._obs_dim, 1)
    self._model = RLDispatcherModel(self._act_dim)
    hyperparas = {
        'action_dim': self._act_dim,
        'lr': 5.0e-4,
        'gamma': 0.998
    }
    self._algorithm = DQN(self._model, hyperparas)
    self._agent = ElevatorAgent(self._algorithm, self._obs_dim, self._act_dim)
    self._warm_up_size = 2000
    self._statistic_freq = 1000
    self._loss_queue = deque()

  def feedback(self, state, action, r):
    # Record the previous transition for every elevator and, once enough data
    # has been collected, sample a batch and run a DQN update.
    self._global_step += 1
    observation_array = mansion_state_preprocessing(state)
    new_actions = list()
    for ele_act in action:
      new_actions.append(action_to_action_idx(ele_act, self._act_dim))
    if self._global_step > self._warm_up_size:
      for i in range(self._ele_num):
        self._rpm.append(
            self._last_observation_array[i], 
            self._last_action[i], 
            self._last_reward, 
            deepcopy(observation_array[i]), False)
    self._last_observation_array = deepcopy(observation_array)
    self._last_action = deepcopy(new_actions)
    self._last_reward = r

    if self._rpm.size() > self._warm_up_size:
      batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = \
          self._rpm.sample_batch(BATCH_SIZE)
      cost = self._agent.learn(batch_obs, batch_action,
          batch_reward, batch_next_obs, batch_terminal)
      self._loss_queue.appendleft(cost)
      if len(self._loss_queue) > self._statistic_freq:
        self._loss_queue.pop()
      if self._global_step % self._statistic_freq == 0:
        self._mansion._config.log_notice(
            "Temporal Difference Error (Average) %f",
            sum(self._loss_queue) / float(len(self._loss_queue)))

  def policy(self, state):
    # Epsilon-greedy action selection; the exploration ratio decays from
    # roughly 1.0 towards 0.02 as the global step count grows.
    self._exploration_ratio = 500000.0 / (500000.0 + self._global_step) + 0.02
    observation_array = mansion_state_preprocessing(state)
    q_values = self._agent.predict(observation_array)
    ret_actions = list()
    for i in range(self._ele_num):
      if random.random() < self._exploration_ratio:
        action = random.randint(1, self._max_floor)
      else:
        action = np.argmax(q_values[i])
      ret_actions.append(action_idx_to_action(int(action), self._act_dim))
    return ret_actions
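The dispatcher above wires a DQN algorithm, an agent, and a replay memory into the elevator simulation. A minimal sketch of how it might be driven is shown below; `simulator`, its `reset()`/`step()` methods, and the constructor argument are assumed placeholders for the real simulator API, which is not part of the snippet.

# Hypothetical driving loop -- `simulator` and its reset()/step() methods are
# assumed stand-ins; the real simulator interface is not shown above.
dispatcher = RLBenchmarkDispatcher(simulator)    # assumed constructor argument
dispatcher.load_settings()

state = simulator.reset()                        # assumed: returns the initial mansion state
for _ in range(100000):
    actions = dispatcher.policy(state)           # one epsilon-greedy action per elevator
    state, reward = dispatcher_step = simulator.step(actions)  # assumed: advances the simulation one tick
    dispatcher.feedback(state, actions, reward)  # stores transitions and runs a DQN update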
Code Example #2
def run_episode(env: Env,
                agent: parl.Agent,
                rpm: ReplayMemory,
                return_time: bool = False):
    if return_time:
        start_tp = time()
        total_sample_time = 0.
        total_learn_time = 0.

    total_reward, steps = 0., 0
    obs = env.reset()
    while True:
        steps += 1
        ls_tp = time()
        if np.random.random() < param_dict["EPSILON"]:
            action = np.random.uniform(-1., 1., size=(2, ))
        else:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype("float32"))
            action = np.squeeze(action)
            # add Gaussian noise, clip to [-1, 1]; mapped to the env's action interval below
            action = np.clip(np.random.normal(action, 1.0), -1., 1.)
        if return_time:
            total_sample_time += time() - ls_tp

        action = action_mapping(action, env.action_space.low[0],
                                env.action_space.high[0])

        next_obs, reward, done, info = env.step(action)

        rpm.append(obs, action, param_dict["REWARD_SCALE"] * reward, next_obs,
                   done)

        # only start learning once the replay memory holds more than MEMORY_WARMUP_SIZE transitions
        if rpm.size() > param_dict["MEMORY_WARMUP_SIZE"]:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                batch_terminal = rpm.sample_batch(param_dict["BATCH_SIZE"])
            ls_tp = time()
            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
                                      batch_next_obs, batch_terminal)
            if return_time:
                total_learn_time += time() - ls_tp

        # advance the episode regardless of whether a learning step happened
        obs = next_obs
        total_reward += reward

        if done:
            break

    if return_time:
        run_time = time() - start_tp
        time_info = {
            "run time": run_time,
            "total sample time": total_sample_time,
            "total learn time": total_learn_time
        }
        return total_reward, steps, time_info
    else:
        return total_reward, steps
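For context, a minimal outer loop that could call run_episode is sketched below. The factory functions make_env and build_agent are assumed placeholders, not the project's actual entry point, since neither they nor env/agent/rpm construction appear in the snippet above.

# Hypothetical training entry point around run_episode; make_env() and
# build_agent() are assumed placeholders.
def train(max_episodes: int = 1000):
    env = make_env()                   # assumed: constructs the Gym-style environment
    agent, rpm = build_agent(env)      # assumed: builds the PARL agent and replay memory
    for episode in range(max_episodes):
        reward, steps, time_info = run_episode(env, agent, rpm, return_time=True)
        print("episode {}: reward={:.2f}, steps={}, learn_time={:.2f}s".format(
            episode, reward, steps, time_info["total learn time"]))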
Code Example #3
def main():
    logger.info("-----------------Carla_SAC-------------------")
    logger.set_dir('./{}_train'.format(args.env))

    # Parallel environments for training
    train_envs_params = EnvConfig['train_envs_params']
    env_num = EnvConfig['env_num']
    env_list = ParallelEnv(args.env, args.xparl_addr, train_envs_params)

    # env for eval
    eval_env_params = EnvConfig['eval_env_params']
    eval_env = LocalEnv(args.env, eval_env_params)

    obs_dim = eval_env.obs_dim
    action_dim = eval_env.action_dim

    # Initialize model, algorithm, agent, replay_memory
    if args.framework == 'torch':
        CarlaModel, SAC, CarlaAgent = TorchModel, TorchSAC, TorchAgent
    elif args.framework == 'paddle':
        CarlaModel, SAC, CarlaAgent = PaddleModel, PaddleSAC, PaddleAgent
    else:
        raise ValueError('Unsupported framework: {}'.format(args.framework))
    model = CarlaModel(obs_dim, action_dim)
    algorithm = SAC(
        model,
        gamma=GAMMA,
        tau=TAU,
        alpha=ALPHA,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = CarlaAgent(algorithm)
    rpm = ReplayMemory(
        max_size=MEMORY_SIZE, obs_dim=obs_dim, act_dim=action_dim)

    total_steps = 0
    last_save_steps = 0
    test_flag = 0

    obs_list = env_list.reset()

    while total_steps < args.train_total_steps:
        # Train episode
        if rpm.size() < WARMUP_STEPS:
            action_list = [
                np.random.uniform(-1, 1, size=action_dim)
                for _ in range(env_num)
            ]
        else:
            action_list = [agent.sample(obs) for obs in obs_list]
        next_obs_list, reward_list, done_list, info_list = env_list.step(
            action_list)

        # Store data in replay memory
        for i in range(env_num):
            rpm.append(obs_list[i], action_list[i], reward_list[i],
                       next_obs_list[i], done_list[i])

        obs_list = env_list.get_obs()
        total_steps = env_list.total_steps
        # Train agent after collecting sufficient data
        if rpm.size() >= WARMUP_STEPS:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch(
                BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_terminal)

        # Save agent
        if total_steps > int(1e5) and total_steps > last_save_steps + int(1e4):
            agent.save('./{}_model/step_{}_model.ckpt'.format(
                args.framework, total_steps))
            last_save_steps = total_steps

        # Evaluate episode
        if (total_steps + 1) // args.test_every_steps >= test_flag:
            while (total_steps + 1) // args.test_every_steps >= test_flag:
                test_flag += 1
            avg_reward = run_evaluate_episodes(agent, eval_env, EVAL_EPISODES)
            tensorboard.add_scalar('eval/episode_reward', avg_reward,
                                   total_steps)
            logger.info(
                'Total steps {}, Evaluation over {} episodes, Average reward: {}'
                .format(total_steps, EVAL_EPISODES, avg_reward))
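run_evaluate_episodes is called above but its body is not shown. A minimal sketch of such a helper follows, assuming eval_env exposes a Gym-style reset()/step() interface and that agent.predict() returns a deterministic action; the project's actual evaluation routine may differ.

# Hypothetical evaluation helper; the real run_evaluate_episodes may differ.
def run_evaluate_episodes(agent, eval_env, eval_episodes):
    avg_reward = 0.
    for _ in range(eval_episodes):
        obs = eval_env.reset()
        done = False
        while not done:
            action = agent.predict(obs)              # deterministic policy for evaluation
            obs, reward, done, _ = eval_env.step(action)
            avg_reward += reward
    return avg_reward / eval_episodes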