Code example #1
def sampler_worker(config,
                   replay_queue,
                   batch_queue,
                   training_on,
                   global_episode,
                   update_step,
                   log_dir=''):
    """
    Function that transfers replay to the buffer and batches from buffer to the queue.

    Args:
        config:
        replay_queue:
        batch_queue:
        training_on:
        global_episode:
        log_dir:
    """
    batch_size = config['batch_size']
    logger = Logger(f"{log_dir}/data_struct")

    # Create replay buffer
    replay_buffer = ReplayBuffer(state_dim=config["state_dim"],
                                 action_dim=config["action_dim"],
                                 max_size=config["replay_mem_size"],
                                 save_dir=config["results_path"])

    while training_on.value:
        # (1) Transfer replays to global buffer
        n = replay_queue.qsize()
        for _ in range(n):
            replay = replay_queue.get()
            replay_buffer.add(*replay)

        # (2) Transfer batch of replay from buffer to the batch_queue
        if len(replay_buffer) < batch_size:
            continue

        try:
            batch = replay_buffer.sample(batch_size)
            batch_queue.put_nowait(batch)
        except queue.Full:
            # The learner has not consumed earlier batches yet; back off briefly.
            sleep(0.1)
            continue

        # Log data structures sizes
        step = update_step.value
        logger.scalar_summary("data_struct/global_episode",
                              global_episode.value, step)
        logger.scalar_summary("data_struct/replay_queue", replay_queue.qsize(),
                              step)
        logger.scalar_summary("data_struct/batch_queue", batch_queue.qsize(),
                              step)
        logger.scalar_summary("data_struct/replay_buffer", len(replay_buffer),
                              step)

    empty_torch_queue(batch_queue)
    print("Stop sampler worker.")
Code example #2
    def run(self, training_on, batch_queue, update_step):
        while update_step.value < self.num_train_steps:
            try:
                batch = batch_queue.get_nowait()
            except queue.Empty:
                continue
            self._update_step(batch, update_step)

            update_step.value += 1
            if update_step.value % 1000 == 0:
                print("Training step ", update_step.value)

        training_on.value = 0
        empty_torch_queue(self.learner_w_queue)
        print("Exit learner.")
Code example #3
File: agent.py  Project: yj-Tang/d4pg-pytorch
    def run(self, training_on, replay_queue, learner_w_queue, update_step):
        # Initialise deque buffer to store experiences for N-step returns
        self.exp_buffer = deque()

        best_reward = -float("inf")
        rewards = []
        while training_on.value:
            episode_reward = 0
            num_steps = 0
            self.local_episode += 1
            self.global_episode.value += 1
            self.exp_buffer.clear()

            if self.local_episode % 100 == 0:
                print(f"Agent: {self.n_agent}  episode {self.local_episode}")

            ep_start_time = time.time()
            state = self.env_wrapper.reset()
            self.ou_noise.reset()
            done = False
            while not done:
                action = self.actor.get_action(state)
                if self.agent_type == "exploration":
                    action = self.ou_noise.get_action(action, num_steps)
                    action = action.squeeze(0)
                else:
                    action = action.detach().cpu().numpy().flatten()
                next_state, reward, done = self.env_wrapper.step(action)

                episode_reward += reward

                state = self.env_wrapper.normalise_state(state)
                reward = self.env_wrapper.normalise_reward(reward)

                self.exp_buffer.append((state, action, reward))

                # We need at least N steps in the experience buffer before we can compute Bellman
                # rewards and add an N-step experience to replay memory
                if len(self.exp_buffer) >= self.config['n_step_returns']:
                    state_0, action_0, reward_0 = self.exp_buffer.popleft()
                    discounted_reward = reward_0
                    gamma = self.config['discount_rate']
                    for (_, _, r_i) in self.exp_buffer:
                        discounted_reward += r_i * gamma
                        gamma *= self.config['discount_rate']
                    # We only want to fill the replay buffer with experiences from exploration agents
                    if self.agent_type == "exploration":
                        try:
                            replay_queue.put_nowait([
                                state_0, action_0, discounted_reward,
                                next_state, done, gamma
                            ])
                        except queue.Full:
                            pass  # drop this experience if the replay queue is full

                state = next_state

                if done or num_steps == self.max_steps:
                    # add rest of experiences remaining in buffer
                    while len(self.exp_buffer) != 0:
                        state_0, action_0, reward_0 = self.exp_buffer.popleft()
                        discounted_reward = reward_0
                        gamma = self.config['discount_rate']
                        for (_, _, r_i) in self.exp_buffer:
                            discounted_reward += r_i * gamma
                            gamma *= self.config['discount_rate']
                        if self.agent_type == "exploration":
                            try:
                                replay_queue.put_nowait([
                                    state_0, action_0, discounted_reward,
                                    next_state, done, gamma
                                ])
                            except queue.Full:
                                pass  # drop this experience if the replay queue is full
                    break

                num_steps += 1

            # Log metrics
            step = update_step.value
            self.logger.scalar_summary("agent/reward", episode_reward, step)
            self.logger.scalar_summary("agent/episode_timing",
                                       time.time() - ep_start_time, step)

            # Saving agent
            reward_outperformed = episode_reward - best_reward > self.config[
                "save_reward_threshold"]
            time_to_save = self.local_episode % self.num_episode_save == 0
            if self.n_agent == 0 and (time_to_save or reward_outperformed):
                if episode_reward > best_reward:
                    best_reward = episode_reward
                self.save(
                    f"local_episode_{self.local_episode}_reward_{best_reward:4f}"
                )

            rewards.append(episode_reward)
            if self.agent_type == "exploration" and self.local_episode % self.config[
                    'update_agent_ep'] == 0:
                self.update_actor_learner(learner_w_queue, training_on)

        empty_torch_queue(replay_queue)
        print(f"Agent {self.n_agent} done.")
Code example #4
File: agent.py  Project: yusukeurakami/d4pg-pytorch
    def run(self, training_on, replay_queue, learner_w_queue, update_step):
        # Initialise deque buffer to store experiences for N-step returns
        self.exp_buffer = deque()

        best_reward = -float("inf")
        best_succeeded = 0.0  # best evaluation success rate seen so far
        rewards = []
        while training_on.value:
            episode_reward = 0
            num_steps = 0
            self.local_episode += 1
            self.global_episode.value += 1
            self.exp_buffer.clear()

            if self.local_episode % 100 == 0:
                print(f"Agent: {self.n_agent}  episode {self.local_episode}")

            succeeded = 0.0

            ep_start_time = time.time()
            state = self.env_wrapper.reset()
            current_pos = state[:self.config["action_dim"]]
            self.ou_noise.reset()
            done = False
            while not done:
                action = self.actor.get_action(state)
                # print("action mean: ", action)
                if self.agent_type == "exploration":
                    action = self.ou_noise.get_action(action, num_steps)
                    action = action.squeeze(0)
                    # print("action with noise :", action)
                else:
                    action = action.detach().cpu().numpy().flatten()

                next_a = action
                # if self.config["pos_control"]:
                #     # print("current pos: ",current_pos)
                #     next_a += current_pos
                for _ in range(self.config["action_repeat"]):
                    next_state, reward, done = self.env_wrapper.step(
                        next_a)  # Step
                    if done:
                        break
                current_pos = next_state[:self.config["action_dim"]]

                # next_state, reward, done = self.env_wrapper.step(action)

                episode_reward += reward

                state = self.env_wrapper.normalise_state(state)
                reward = self.env_wrapper.normalise_reward(reward)

                self.exp_buffer.append((state, action, reward))

                # We need at least N steps in the experience buffer before we can compute Bellman
                # rewards and add an N-step experience to replay memory
                if len(self.exp_buffer) >= self.config['n_step_returns']:
                    state_0, action_0, reward_0 = self.exp_buffer.popleft()
                    discounted_reward = reward_0
                    gamma = self.config['discount_rate']
                    for (_, _, r_i) in self.exp_buffer:
                        discounted_reward += r_i * gamma
                        gamma *= self.config['discount_rate']
                    # We only want to fill the replay buffer with experiences from exploration agents
                    if self.agent_type == "exploration":
                        try:
                            replay_queue.put_nowait([
                                state_0, action_0, discounted_reward,
                                next_state, done, gamma
                            ])
                        except queue.Full:
                            pass  # drop this experience if the replay queue is full

                state = next_state

                if done or num_steps >= self.max_steps:
                    # if self.n_agent:
                    #     print("episode done. Step was ",num_steps)
                    # add rest of experiences remaining in buffer
                    while len(self.exp_buffer) != 0:
                        state_0, action_0, reward_0 = self.exp_buffer.popleft()
                        discounted_reward = reward_0
                        gamma = self.config['discount_rate']
                        for (_, _, r_i) in self.exp_buffer:
                            discounted_reward += r_i * gamma
                            gamma *= self.config['discount_rate']
                        if self.agent_type == "exploration":
                            try:
                                replay_queue.put_nowait([
                                    state_0, action_0, discounted_reward,
                                    next_state, done, gamma
                                ])
                            except queue.Full:
                                pass  # drop this experience if the replay queue is full
                    break

                num_steps += 1

            #
            # Log metrics
            step = update_step.value
            if self.n_agent == 0:
                self.logger.scalar_summary("agent/reward", episode_reward,
                                           step)
                self.logger.scalar_summary("agent/episode_timing",
                                           time.time() - ep_start_time, step)

            rewards.append(episode_reward)
            if self.agent_type == "exploration" and self.local_episode % self.config[
                    'update_agent_ep'] == 0:
                self.update_actor_learner(learner_w_queue, training_on)

            ###########################
            if self.local_episode % 100 == 0:
                print("evaluate")
                avg_reward = 0.
                episodes = 20
                succeeded = 0
                rewards = []
                for _ in range(episodes):
                    episode_reward = 0
                    num_steps = 0

                    state = self.env_wrapper.reset()
                    current_pos = state[:self.config["action_dim"]]
                    self.ou_noise.reset()
                    done = False
                    while not done:
                        action = self.actor.get_action(state)
                        action = action.detach().cpu().numpy().flatten()

                        next_a = action
                        # if self.config["pos_control"]:
                        #     # print("current pos: ",current_pos)
                        #     next_a += current_pos
                        for _ in range(self.config["action_repeat"]):
                            next_state, reward, done = self.env_wrapper.step(
                                next_a)  # Step
                            if done:
                                break
                        current_pos = next_state[:self.config["action_dim"]]

                        # next_state, reward, done = self.env_wrapper.step(action)

                        episode_reward += reward

                        state = self.env_wrapper.normalise_state(state)
                        reward = self.env_wrapper.normalise_reward(reward)

                        state = next_state

                        if done or num_steps >= self.max_steps:
                            if abs(self.env_wrapper.env.env.get_doorangle()
                                   ) >= 0.2:
                                succeeded += 1
                            # else:
                            #     print("not opened")

                        num_steps += 1

                    avg_reward += episode_reward  # accumulate this evaluation episode's reward

                avg_reward /= episodes
                succeeded /= episodes
                if self.n_agent == 0:
                    self.logger.scalar_summary("agent/test", avg_reward, step)
                    self.logger.scalar_summary("agent/success_rate", succeeded,
                                               step)
                print("----------------------------------------")
                print(
                    "Test Episodes: {}, Avg. Reward: {}, Success rate {}% per {} trials"
                    .format(episodes, round(avg_reward, 2),
                            round(succeeded, 2), episodes))
                print("----------------------------------------")

            # Saving agent
            reward_outperformed = succeeded - best_succeeded > self.config[
                "save_success_rate_threshold"]
            # reward_outperformed = episode_reward - best_reward > self.config["save_reward_threshold"]
            time_to_save = self.local_episode % self.num_episode_save == 0
            if self.n_agent == 0 and (time_to_save or reward_outperformed):
                # print(time_to_save, reward_outperformed)
                # if episode_reward > best_reward:
                #     best_reward = episode_reward
                if succeeded > best_succeeded:
                    best_succeeded = succeeded
                self.save(
                    f"agent-{self.n_agent}_local-episode-{self.local_episode}_step-{step}_success-{best_succeeded:4f}"
                )
            ###########################

        empty_torch_queue(replay_queue)
        print(f"Agent {self.n_agent} done.")
Code example #5
def sampler_worker(config,
                   replay_queue,
                   batch_queue,
                   replay_priorities_queue,
                   training_on,
                   global_episode,
                   update_step,
                   log_dir=''):
    """
    Function that transfers replay to the buffer and batches from buffer to the queue.

    Args:
        config:
        replay_queue:
        batch_queue:
        training_on:
        global_episode:
        log_dir:
    """
    batch_size = config['batch_size']
    # logger = Logger(f"{log_dir}/data_struct")

    # Create replay buffer
    replay_buffer = create_replay_buffer(config)

    while training_on.value:
        # (1) Transfer replays to global buffer
        n = replay_queue.qsize()
        for _ in range(n):
            replay = replay_queue.get()
            replay_buffer.add(*replay)

        # (2) Transfer batch of replay from buffer to the batch_queue
        if len(replay_buffer) < batch_size:
            continue

        try:
            inds, weights = replay_priorities_queue.get_nowait()
            replay_buffer.update_priorities(inds, weights)
        except queue.Empty:
            pass

        try:
            batch = replay_buffer.sample(batch_size)
            batch_queue.put_nowait(batch)
        except queue.Full:
            # The learner has not consumed earlier batches yet; back off briefly.
            sleep(0.1)
            continue

        # Log data structures sizes
        step = update_step.value
        # logger.scalar_summary("data_stuct/global_episode", global_episode.value, step)
        # logger.scalar_summary("data_struct/replay_queue", replay_queue.qsize(), step)
        # logger.scalar_summary("data_struct/batch_queue", batch_queue.qsize(), step)
        # logger.scalar_summary("data_struct/replay_buffer", len(replay_buffer), step)

    if config['save_buffer_on_disk']:
        replay_buffer.dump(config["results_path"])

    empty_torch_queue(batch_queue)
    print("Stop sampler worker.")