Code Example #1
class DuelingDQN:
    def __init__(self,
                 num_states,
                 num_actions,
                 learning_rate=0.01,
                 gamma=0.90,
                 batch_size=128,
                 epsilon=0.90,
                 update_target_gap=50,
                 enable_gpu=False):
        if enable_gpu:
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device("cpu")

        self.gamma = gamma
        self.batch_size = batch_size
        self.update_target_gap = update_target_gap
        self.epsilon = epsilon

        self.num_learn_step = 0

        self.memory = Memory()
        self.eval_net, self.target_net = DuelingMLPPolicy(
            num_states, num_actions).to(self.device), DuelingMLPPolicy(
                num_states, num_actions).to(self.device)
        self.optimizer = optim.Adam(self.eval_net.parameters(),
                                    lr=learning_rate)
        self.loss_func = nn.MSELoss()
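The constructor above builds two DuelingMLPPolicy networks (the online eval_net and the target_net), an Adam optimizer, and an MSE loss. DuelingMLPPolicy itself is not shown in these examples; the following is only a sketch of what such a dueling MLP presumably looks like (shared trunk, separate value and advantage heads, combined as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)). The hidden size and layer layout are illustrative assumptions, not the repo's definition.

import torch
import torch.nn as nn


class DuelingMLPPolicy(nn.Module):
    """Assumed sketch of a dueling Q-network, not the repo's actual code."""

    def __init__(self, num_states, num_actions, hidden_size=128):
        super().__init__()
        # Shared feature trunk
        self.trunk = nn.Sequential(nn.Linear(num_states, hidden_size),
                                   nn.ReLU())
        self.value_head = nn.Linear(hidden_size, 1)                 # V(s)
        self.advantage_head = nn.Linear(hidden_size, num_actions)   # A(s, a)

    def forward(self, x):
        h = self.trunk(x)
        value = self.value_head(h)
        advantage = self.advantage_head(h)
        # Subtracting the mean advantage keeps the decomposition identifiable
        return value + advantage - advantage.mean(dim=1, keepdim=True)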
Code Example #2
def collect_samples(pid, queue, env, policy, render, running_state,
                    min_batch_size):
    log = dict()
    memory = Memory()
    num_steps = 0
    num_episodes = 0

    min_episode_reward = float('inf')
    max_episode_reward = float('-inf')
    total_reward = 0

    while num_steps < min_batch_size:
        state = env.reset()
        episode_reward = 0
        if running_state:
            state = running_state(state)

        for t in range(10000):
            if render:
                env.render()

            state_tensor = FLOAT(state).unsqueeze(0)
            with torch.no_grad():
                action, log_prob = policy.get_action_log_prob(state_tensor)
            action = action.cpu().numpy()[0]
            log_prob = log_prob.cpu().numpy()[0]
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward

            if running_state:
                next_state = running_state(next_state)

            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
            memory.push(state, action, reward, next_state, mask, log_prob)
            if done:
                break

            state = next_state

        num_steps += (t + 1)
        num_episodes += 1
        total_reward += episode_reward
        min_episode_reward = min(episode_reward, min_episode_reward)
        max_episode_reward = max(episode_reward, max_episode_reward)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_episode_reward'] = max_episode_reward
    log['min_episode_reward'] = min_episode_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
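collect_samples can run inline (queue is None) or inside worker processes that report back through a shared queue. Below is a minimal driver sketch, assuming the environment, policy, Memory, and running_state objects pickle cleanly across processes; the environment name and worker count are illustrative, not from the repo.

import multiprocessing

import gym


def parallel_collect(policy, num_workers=4, min_batch_size=2048):
    queue = multiprocessing.Queue()
    workers = []
    for pid in range(num_workers):
        env = gym.make("Pendulum-v0")  # assumed environment
        p = multiprocessing.Process(
            target=collect_samples,
            args=(pid, queue, env, policy, False, None, min_batch_size))
        p.start()
        workers.append(p)

    # Each worker puts one [pid, memory, log] triple on the queue
    results = [queue.get() for _ in workers]
    for p in workers:
        p.join()

    results.sort(key=lambda r: r[0])  # deterministic order by pid
    memories = [r[1] for r in results]
    logs = [r[2] for r in results]
    return memories, logs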
Code Example #3
    def __init__(self, pid, env, policy, render, running_state, min_batch_size):
        self.pid = pid
        self.env = env
        self.policy = policy
        self.render = render
        self.running_state = running_state
        self.min_batch_size = min_batch_size
        self.log = dict()
        self.memory = Memory()
Code Example #4
    def collect_samples(self, min_batch_size):
        self.policy.to(torch.device('cpu'))
        t_start = time.time()
        process_batch_size = int(math.floor(min_batch_size / self.num_process))

        workers = [RemoteCollector.remote(i, self.env, self.policy, self.render, self.running_state, process_batch_size)
                   for i in range(self.num_process)]

        # Ray actor tasks execute in submission order, so each worker's
        # get_log_memory() call below only returns after its collect()
        # call has finished.
        task_ids = [worker.collect.remote() for worker in workers]
        results = ray.get([worker.get_log_memory.remote() for worker in workers])

        worker_logs = []
        memory = Memory()

        for result in results:
            worker_logs.append(result[0])
            memory.append(result[1])

        log = merge_log(worker_logs)
        log['sample_time'] = time.time() - t_start

        self.policy.to(device)
        return memory, log
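merge_log is referenced above but not shown; judging from the per-worker log keys, it presumably aggregates them roughly as follows (a sketch, not the repo's implementation):

def merge_log(log_list):
    # Aggregate the per-worker logs into a single dict with the same keys
    log = dict()
    log['num_steps'] = sum(x['num_steps'] for x in log_list)
    log['num_episodes'] = sum(x['num_episodes'] for x in log_list)
    log['total_reward'] = sum(x['total_reward'] for x in log_list)
    log['avg_reward'] = log['total_reward'] / log['num_episodes']
    log['max_episode_reward'] = max(x['max_episode_reward'] for x in log_list)
    log['min_episode_reward'] = min(x['min_episode_reward'] for x in log_list)
    return log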
Code Example #5
@ray.remote
class RemoteCollector:
    def __init__(self, pid, env, policy, render, running_state, min_batch_size):
        self.pid = pid
        self.env = env
        self.policy = policy
        self.render = render
        self.running_state = running_state
        self.min_batch_size = min_batch_size
        self.log = dict()
        self.memory = Memory()

    def collect(self):
        num_steps = 0
        num_episodes = 0

        min_episode_reward = float('inf')
        max_episode_reward = float('-inf')
        total_reward = 0

        while num_steps < self.min_batch_size:
            state = self.env.reset()
            episode_reward = 0
            if self.running_state:
                state = self.running_state(state)

            for t in range(10000):
                if self.render:
                    self.env.render()
                state_tensor = DOUBLE(state).unsqueeze(0)
                with torch.no_grad():
                    action, log_prob = self.policy.get_action_log_prob(state_tensor)
                action = action.cpu().numpy()[0]
                log_prob = log_prob.cpu().numpy()[0]
                next_state, reward, done, _ = self.env.step(action)
                episode_reward += reward

                if self.running_state:
                    next_state = self.running_state(next_state)

                mask = 0 if done else 1
                # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
                self.memory.push(state, action, reward, next_state, mask, log_prob)
                num_steps += 1

                if done or num_steps >= self.min_batch_size:
                    break

                state = next_state

            num_episodes += 1
            total_reward += episode_reward
            min_episode_reward = min(episode_reward, min_episode_reward)
            max_episode_reward = max(episode_reward, max_episode_reward)

        self.log['num_steps'] = num_steps
        self.log['num_episodes'] = num_episodes
        self.log['total_reward'] = total_reward
        self.log['avg_reward'] = total_reward / num_episodes
        self.log['max_episode_reward'] = max_episode_reward
        self.log['min_episode_reward'] = min_episode_reward

    def get_log_memory(self):
        return self.log, self.memory
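The actors can also be driven directly, outside the collect_samples wrapper of Code Example #4. A minimal usage sketch, assuming Ray is initialized and that make_env and policy exist in the caller (both names are illustrative):

import ray

ray.init(num_cpus=2)

collectors = [RemoteCollector.remote(i, make_env(), policy, False, None, 1024)
              for i in range(2)]

# Method calls on the same actor execute in submission order, so
# get_log_memory() only returns after collect() has finished.
ray.get([c.collect.remote() for c in collectors])
results = ray.get([c.get_log_memory.remote() for c in collectors])
for log, memory in results:
    print(log['num_episodes'], log['avg_reward'])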
Code Example #6
class DuelingDQN:
    def __init__(self,
                 num_states,
                 num_actions,
                 learning_rate=0.01,
                 gamma=0.90,
                 batch_size=128,
                 epsilon=0.90,
                 update_target_gap=50,
                 enable_gpu=False):
        if enable_gpu:
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device("cpu")

        self.gamma = gamma
        self.batch_size = batch_size
        self.update_target_gap = update_target_gap
        self.epsilon = epsilon

        self.num_learn_step = 0

        self.memory = Memory()
        self.eval_net, self.target_net = DuelingMLPPolicy(
            num_states, num_actions).to(self.device), DuelingMLPPolicy(
                num_states, num_actions).to(self.device)
        self.optimizer = optim.Adam(self.eval_net.parameters(),
                                    lr=learning_rate)
        self.loss_func = nn.MSELoss()

    # epsilon-greedy action selection
    def choose_action(self, state, num_actions):
        state = torch.unsqueeze(torch.tensor(state), 0).to(self.device)
        if np.random.uniform() <= self.epsilon:  # greedy policy
            action_val = self.eval_net(state.float())
            action = action_val.max(1)[1].cpu().numpy()
            return action[0]
        else:
            action = np.random.randint(0, num_actions)
            return action

    def learn(self):
        # periodically sync target_net with eval_net
        if self.num_learn_step % self.update_target_gap == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.num_learn_step += 1

        # sample a batch from Memory
        batch = self.memory.sample(self.batch_size)
        batch_state = torch.cat(batch.state).to(self.device)
        batch_action = torch.stack(batch.action, 0).to(self.device)
        batch_reward = torch.stack(batch.reward, 0).to(self.device)
        batch_next_state = torch.cat(batch.next_state).to(self.device)

        # train eval_net
        q_eval = self.eval_net(batch_state.float()).gather(1, batch_action)
        # detach so no gradients flow into target_net
        q_next = self.target_net(batch_next_state.float()).detach()
        q_target = batch_reward + self.gamma * q_next.max(1)[0].view(
            self.batch_size, 1)

        # compute the loss
        loss = self.loss_func(q_eval, q_target)

        # backpropagate and step the optimizer
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
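A hypothetical training loop for DuelingDQN on CartPole-v0. The DQN-side Memory is not shown in these examples, so the push call below assumes it stores exactly the four tensor fields that learn() reads back (state, action, reward, next_state), shaped the way torch.cat / torch.stack expect there; the episode count and environment are illustrative.

import gym
import torch

env = gym.make("CartPole-v0")  # assumed environment
agent = DuelingDQN(num_states=env.observation_space.shape[0],
                   num_actions=env.action_space.n)

total_steps = 0
for episode in range(400):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state, env.action_space.n)
        next_state, reward, done, _ = env.step(action)
        # Stored as tensors shaped for torch.cat / torch.stack in learn()
        # (assumed Memory layout, mirroring the fields read there)
        agent.memory.push(
            torch.as_tensor(state, dtype=torch.float).unsqueeze(0),
            torch.tensor([action], dtype=torch.long),
            torch.tensor([reward], dtype=torch.float),
            torch.as_tensor(next_state, dtype=torch.float).unsqueeze(0))
        state = next_state
        total_steps += 1
        if total_steps >= agent.batch_size:
            agent.learn()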