def collect_samples(pid, queue, env, policy, render, running_state, min_batch_size):
    log = dict()
    memory = Memory()
    num_steps = 0
    num_episodes = 0
    min_episode_reward = float('inf')
    max_episode_reward = float('-inf')
    total_reward = 0

    while num_steps < min_batch_size:
        state = env.reset()
        episode_reward = 0
        if running_state:
            state = running_state(state)

        for t in range(10000):
            if render:
                env.render()
            state_tensor = FLOAT(state).unsqueeze(0)
            with torch.no_grad():
                action, log_prob = policy.get_action_log_prob(state_tensor)
            action = action.cpu().numpy()[0]
            log_prob = log_prob.cpu().numpy()[0]
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward

            if running_state:
                next_state = running_state(next_state)

            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
            memory.push(state, action, reward, next_state, mask, log_prob)
            if done:
                break

            state = next_state

        num_steps += (t + 1)
        num_episodes += 1
        total_reward += episode_reward
        min_episode_reward = min(episode_reward, min_episode_reward)
        max_episode_reward = max(episode_reward, max_episode_reward)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_episode_reward'] = max_episode_reward
    log['min_episode_reward'] = min_episode_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
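# `Memory` is used throughout this section but not defined here. Below is a minimal
# sketch of a replay buffer that is compatible with the calls made in this section:
# push(...) with the transition fields listed in the comment above, append(...) as
# used by the Ray-based collector, and sample(batch_size) as used by DuelingDQN.learn().
# The field names come from the surrounding code; the internals are an assumption,
# not the original implementation.
import random
from collections import namedtuple

Transition = namedtuple(
    'Transition',
    ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob'))


class Memory:
    def __init__(self):
        self.memory = []

    def push(self, *args):
        # Store one transition.
        self.memory.append(Transition(*args))

    def append(self, other):
        # Merge another Memory (used when gathering per-worker buffers).
        self.memory += other.memory

    def sample(self, batch_size=None):
        # Return a Transition of tuples, e.g. batch.state, batch.action, ...
        if batch_size is None:
            return Transition(*zip(*self.memory))
        batch = random.sample(self.memory, batch_size)
        return Transition(*zip(*batch))

    def __len__(self):
        return len(self.memory)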
def collect_samples(self, min_batch_size):
    # Sampling runs on CPU workers; the policy is moved back to `device` at the end.
    self.policy.to(torch.device('cpu'))
    t_start = time.time()
    process_batch_size = int(math.floor(min_batch_size / self.num_process))
    workers = [RemoteCollector.remote(i, self.env, self.policy, self.render,
                                      self.running_state, process_batch_size)
               for i in range(self.num_process)]

    # Kick off collection on every worker. Actor method calls execute in submission
    # order, so get_log_memory() runs only after collect() has finished on each actor.
    task_ids = [worker.collect.remote() for worker in workers]
    results = ray.get([worker.get_log_memory.remote() for worker in workers])

    worker_logs = []
    memory = Memory()
    for result in results:
        worker_logs.append(result[0])
        memory.append(result[1])

    log = merge_log(worker_logs)
    log['sample_time'] = time.time() - t_start

    self.policy.to(device)
    return memory, log
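# `merge_log` is called above but not shown in this section. The following is a
# minimal sketch that is consistent with the log keys written by the collectors
# (num_steps, num_episodes, total_reward, avg_reward, max_episode_reward,
# min_episode_reward); the original implementation may differ.
def merge_log(log_list):
    log = dict()
    log['num_steps'] = sum(x['num_steps'] for x in log_list)
    log['num_episodes'] = sum(x['num_episodes'] for x in log_list)
    log['total_reward'] = sum(x['total_reward'] for x in log_list)
    log['avg_reward'] = log['total_reward'] / log['num_episodes']
    log['max_episode_reward'] = max(x['max_episode_reward'] for x in log_list)
    log['min_episode_reward'] = min(x['min_episode_reward'] for x in log_list)
    return log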
class RemoteCollector:
    def __init__(self, pid, env, policy, render, running_state, min_batch_size):
        self.pid = pid
        self.env = env
        self.policy = policy
        self.render = render
        self.running_state = running_state
        self.min_batch_size = min_batch_size
        self.log = dict()
        self.memory = Memory()

    def collect(self):
        num_steps = 0
        num_episodes = 0
        min_episode_reward = float('inf')
        max_episode_reward = float('-inf')
        total_reward = 0

        while num_steps < self.min_batch_size:
            state = self.env.reset()
            episode_reward = 0
            if self.running_state:
                state = self.running_state(state)

            for t in range(10000):
                if self.render:
                    self.env.render()
                state_tensor = DOUBLE(state).unsqueeze(0)
                with torch.no_grad():
                    action, log_prob = self.policy.get_action_log_prob(state_tensor)
                action = action.cpu().numpy()[0]
                log_prob = log_prob.cpu().numpy()[0]
                next_state, reward, done, _ = self.env.step(action)
                episode_reward += reward

                if self.running_state:
                    next_state = self.running_state(next_state)

                mask = 0 if done else 1
                # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
                self.memory.push(state, action, reward, next_state, mask, log_prob)
                num_steps += 1
                if done or num_steps >= self.min_batch_size:
                    break

                state = next_state

            # num_steps += (t + 1)
            num_episodes += 1
            total_reward += episode_reward
            min_episode_reward = min(episode_reward, min_episode_reward)
            max_episode_reward = max(episode_reward, max_episode_reward)

        self.log['num_steps'] = num_steps
        self.log['num_episodes'] = num_episodes
        self.log['total_reward'] = total_reward
        self.log['avg_reward'] = total_reward / num_episodes
        self.log['max_episode_reward'] = max_episode_reward
        self.log['min_episode_reward'] = min_episode_reward

    def get_log_memory(self):
        return self.log, self.memory
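# The parallel collector above calls RemoteCollector.remote(...), which requires the
# class to be registered as a Ray actor. Where the original code applies the decorator
# is not shown in this section; a minimal sketch of the registration follows.
import ray

# Equivalent to writing @ray.remote above the class definition.
RemoteCollector = ray.remote(RemoteCollector)

# With this in place (and ray.init() called once at startup), collect_samples above can
# instantiate workers via RemoteCollector.remote(...) and fetch their results with ray.get(...).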
class DuelingDQN:
    def __init__(self, num_states, num_actions, learning_rate=0.01, gamma=0.90,
                 batch_size=128, epsilon=0.90, update_target_gap=50, enable_gpu=False):
        if enable_gpu:
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device("cpu")

        self.gamma = gamma
        self.batch_size = batch_size
        self.update_target_gap = update_target_gap
        self.epsilon = epsilon
        self.num_learn_step = 0
        self.memory = Memory()

        self.eval_net, self.target_net = DuelingMLPPolicy(
            num_states, num_actions).to(self.device), DuelingMLPPolicy(
            num_states, num_actions).to(self.device)
        self.optimizer = optim.Adam(self.eval_net.parameters(), lr=learning_rate)
        self.loss_func = nn.MSELoss()

    # epsilon-greedy action selection
    def choose_action(self, state, num_actions):
        state = torch.unsqueeze(torch.tensor(state), 0).to(self.device)
        if np.random.uniform() <= self.epsilon:
            # greedy policy: pick the action with the highest Q-value
            action_val = self.eval_net(state.float())
            action = action_val.max(1)[1].cpu().numpy()
            return action[0]
        else:
            # explore: pick a random action
            action = np.random.randint(0, num_actions)
            return action

    def learn(self):
        # Periodically sync target_net with eval_net
        if self.num_learn_step % self.update_target_gap == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.num_learn_step += 1

        # Sample a batch of transitions from memory
        batch = self.memory.sample(self.batch_size)
        batch_state = torch.cat(batch.state).to(self.device)
        batch_action = torch.stack(batch.action, 0).to(self.device)
        batch_reward = torch.stack(batch.reward, 0).to(self.device)
        batch_next_state = torch.cat(batch.next_state).to(self.device)

        # Train eval_net: Q-values of the actions actually taken
        q_eval = self.eval_net(batch_state.float()).gather(1, batch_action)
        # target_net parameters are not updated by backprop, hence detach()
        q_next = self.target_net(batch_next_state.float()).detach()
        # Note: the stored mask is not applied here, so terminal transitions still
        # bootstrap from the next state.
        q_target = batch_reward + self.gamma * q_next.max(1)[0].view(
            self.batch_size, 1)

        # Compute the TD loss and take a gradient step
        loss = self.loss_func(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
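# `DuelingMLPPolicy` is instantiated by DuelingDQN above but not defined in this
# section. Below is a minimal sketch of a dueling MLP that combines a state-value
# stream V(s) and an advantage stream A(s, a) as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).
# The hidden size and layer count are assumptions, not the original architecture.
import torch
import torch.nn as nn


class DuelingMLPPolicy(nn.Module):
    def __init__(self, num_states, num_actions, hidden_size=128):
        super().__init__()
        self.feature = nn.Sequential(
            nn.Linear(num_states, hidden_size),
            nn.ReLU())
        self.value_head = nn.Linear(hidden_size, 1)                 # V(s)
        self.advantage_head = nn.Linear(hidden_size, num_actions)   # A(s, a)

    def forward(self, x):
        feature = self.feature(x)
        value = self.value_head(feature)
        advantage = self.advantage_head(feature)
        # Subtract the mean advantage so V and A are identifiable.
        return value + advantage - advantage.mean(dim=1, keepdim=True)


# Quick shape check (illustrative): a batch of 1 state with 4 features and 2 actions.
# q = DuelingMLPPolicy(4, 2)(torch.zeros(1, 4))  -> shape (1, 2)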