def train():
    memory = ReplayMemory(memory_size)
    fill_memory(memory, env, batch_size)
    explorer = ExpExplorer(explore_start, explore_stop, decay_rate)
    dqn = DQN(state_shape=env.observation_space.shape,
              n_actions=env.action_space.n, lr=lr)
    dqn.model.summary()
    rewards_list = []
    loss = 1
    for episode in range(5000):
        episode_rewards = 0
        state = env.reset()
        done = False
        while not done:
            action, explore_probability = predict_action(dqn.model, explorer,
                                                         state, env.action_space.n)
            next_state, reward, done, _ = env.step(action)
            # env.render()
            episode_rewards += reward
            memory.push(state, action, reward, next_state, done)
            state = next_state
            loss = learn(dqn.model, memory).history['loss']
            if done:
                rewards_list.append(episode_rewards)
                moving_average = np.mean(rewards_list[-100:])
                if episode % 10 == 0:
                    print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(episode_rewards),
                          'Explore P: {:.4f}'.format(explore_probability),
                          'Training Loss {}'.format(loss),
                          'Moving average {}'.format(moving_average))
        if episode % 10 == 0:
            dqn.model.save(PATH)
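# `ExpExplorer` and `predict_action` are used above but not defined in this file.
# The sketch below is one plausible implementation, not the original code: an
# exponentially decaying epsilon-greedy schedule, assuming the Keras model's
# `predict` returns one row of Q-values per input state.
import numpy as np


class ExpExplorer:
    def __init__(self, explore_start, explore_stop, decay_rate):
        self.explore_start = explore_start
        self.explore_stop = explore_stop
        self.decay_rate = decay_rate
        self.decay_step = 0

    def explore_probability(self):
        # Epsilon decays exponentially from explore_start towards explore_stop.
        self.decay_step += 1
        return self.explore_stop + (self.explore_start - self.explore_stop) * \
            np.exp(-self.decay_rate * self.decay_step)


def predict_action(model, explorer, state, n_actions):
    explore_probability = explorer.explore_probability()
    if np.random.rand() < explore_probability:
        # Explore: pick a random action.
        action = np.random.randint(n_actions)
    else:
        # Exploit: pick the action with the highest predicted Q-value.
        q_values = model.predict(state[np.newaxis, ...], verbose=0)
        action = int(np.argmax(q_values[0]))
    return action, explore_probability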
def train():
    memory = ReplayMemory(memory_size)
    fill_memory(memory, env, batch_size)
    explorer = ExpExplorer(explore_start, explore_stop, decay_rate)
    dqn = DQN(state_shape=env.observation_space.shape[0],
              n_actions=env.action_space.n).to(device)
    criterion = torch.nn.MSELoss().to(device)
    optimizer = torch.optim.Adam(dqn.parameters(), lr=lr)
    rewards_list = []
    for episode in range(5000):
        episode_rewards = 0
        state = env.reset()
        done = False
        while not done:
            action, explore_probability = predict_action(
                dqn, explorer, state, env.action_space.n)
            next_state, reward, done, _ = env.step(action)
            # env.render()
            episode_rewards += reward
            memory.push(state, action, reward, next_state, done)
            state = next_state
            loss = learn(dqn, memory, criterion, optimizer)
            if done:
                rewards_list.append(episode_rewards)
                moving_average = np.mean(rewards_list[-100:])
                if episode % 50 == 0:
                    print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(episode_rewards),
                          'Explore P: {:.4f}'.format(explore_probability),
                          'Training Loss {}'.format(loss),
                          'Moving average {}'.format(moving_average))
        if episode % 100 == 0:
            torch.save(dqn.state_dict(), MODEL_PATH)
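# `learn` is called above but not defined in this file. The sketch below is one
# plausible implementation under these assumptions: `memory.sample(batch_size)`
# returns numpy arrays (states, actions, rewards, next_states, dones), this
# variant bootstraps from the same online network (no separate target network),
# and `batch_size`, `gamma`, and `device` are module-level globals like the other
# hyper-parameters used above.
import numpy as np
import torch


def learn(dqn, memory, criterion, optimizer):
    states, actions, rewards, next_states, dones = memory.sample(batch_size)
    states = torch.as_tensor(np.asarray(states), dtype=torch.float32, device=device)
    actions = torch.as_tensor(actions, dtype=torch.int64, device=device)
    rewards = torch.as_tensor(rewards, dtype=torch.float32, device=device)
    next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32, device=device)
    dones = torch.as_tensor(dones, dtype=torch.float32, device=device)

    # Q(s, a) for the actions actually taken.
    q_values = dqn(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # Bootstrapped TD target: r + gamma * max_a' Q(s', a'), zeroed on terminal steps.
    with torch.no_grad():
        next_q = dqn(next_states).max(1)[0]
        targets = rewards + gamma * next_q * (1.0 - dones)

    loss = criterion(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()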
class DQN_Agent(Agent):
    def __init__(self, env, model, policy,
                 ## hyper-parameters
                 gamma=0.90, lr=1e-3, batch_size=32, buffer_size=50000,
                 learning_starts=1000, target_network_update_freq=1000,
                 ## decay
                 decay=False, decay_rate=0.9,
                 ## DDQN && Dueling DQN
                 double_dqn=True, dueling_dqn=False, dueling_way="native",
                 ## prioritized replay
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta0=0.4,
                 prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6,
                 param_noise=False,
                 ##
                 path=None):
        """
        :param env: the Gym environment
        :param model: the Torch NN model
        :param policy: the policy used when choosing an action
        :param ep: the maximum number of episodes
        :param step: the maximum number of steps
        .........................hyper-parameters..................................
        :param gamma: discount factor
        :param lr: learning rate
        :param batch_size: minibatch size for each update
        :param buffer_size: replay buffer capacity
        :param target_network_update_freq: steps between target network updates
        .........................further improvements..............................
        :param double_dqn: whether to enable Double DQN
        :param dueling_dqn: whether to enable Dueling DQN
        :param dueling_way: how the dueling streams are combined; one of
            `avg`:   Q(s,a;theta) = V(s;theta) + (A(s,a;theta) - Avg_a(A(s,a;theta)))
            `max`:   Q(s,a;theta) = V(s;theta) + (A(s,a;theta) - max_a(A(s,a;theta)))
            `naive`: Q(s,a;theta) = V(s;theta) + A(s,a;theta)
        .........................prioritized replay................................
        :param prioritized_replay: (bool) if True a prioritized replay buffer will be used
        :param prioritized_replay_alpha: (float) alpha parameter for the prioritized replay
            buffer. It determines how much prioritization is used, with alpha=0
            corresponding to the uniform case.
        :param prioritized_replay_beta0: (float) initial value of beta for the prioritized
            replay buffer
        :param prioritized_replay_beta_iters: (int) number of iterations over which beta is
            annealed from its initial value to 1.0. If None, it equals max_timesteps.
        :param prioritized_replay_eps: (float) epsilon added to the TD errors when
            updating priorities
        .........................imitation learning................................
        :param imitation_learning_policy: policy used to initialize the network in a
            supervised (imitation learning) way
        :param IL_time: number of supervised training steps
        :param network_kwargs:
        """
        self.env = env
        self.policy = policy
        self.gamma = gamma
        self.batch_size = batch_size
        self.learning_starts = learning_starts
        self.target_network_update_freq = target_network_update_freq
        self.double_dqn = double_dqn

        if dueling_dqn:
            self.Q_net = Dueling_dqn(model, dueling_way)
        else:
            self.Q_net = model
        self.target_Q_net = deepcopy(self.Q_net)

        # The optimizer is always stepped directly in backward(); when decay is
        # enabled, an exponential learning-rate scheduler is stepped alongside it.
        self.optim = Adam(self.Q_net.parameters(), lr=lr)
        if decay:
            self.lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(
                self.optim, decay_rate, last_epoch=-1)
        else:
            self.lr_scheduler = None

        self.replay_buffer = ReplayMemory(buffer_size)
        self.learning = False
        super(DQN_Agent, self).__init__(path)

    def forward(self, observation):
        observation = observation.astype(np.float32)
        observation = torch.from_numpy(observation)
        Q_value = self.Q_net.forward(observation)
        Q_value = Q_value.detach().numpy()
        # Delegate action selection to the policy when one is given,
        # otherwise act greedily with respect to the Q-values.
        if self.policy is not None:
            action = self.policy.select_action(Q_value)
        else:
            action = np.argmax(Q_value)
        return action, Q_value

    def backward(self, sample_):
        self.replay_buffer.push(sample_)
        if self.step > self.learning_starts and self.learning:
            sample = self.replay_buffer.sample(self.batch_size)
            assert len(sample["s"]) == self.batch_size
            a = sample["a"].long().unsqueeze(1)
            Q = self.Q_net(sample["s"]).gather(1, a)
            if self.double_dqn:
                # Double DQN: the online network selects the next action,
                # the target network evaluates it.
                _, next_actions = self.Q_net(sample["s_"]).max(1, keepdim=True)
                targetQ = self.target_Q_net(sample["s_"]).gather(1, next_actions)
            else:
                # Vanilla DQN: max over the target network's Q-values.
                targetQ, _ = self.target_Q_net(sample["s_"]).max(1, keepdim=True)
            targetQ = targetQ.squeeze(1)
            Q = Q.squeeze(1)
            expected_q_values = sample["r"] + self.gamma * targetQ * (1.0 - sample["tr"])
            loss = torch.mean(huber_loss(expected_q_values - Q))
            self.Q_net.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.Q_net.parameters(), 1, norm_type=2)
            self.optim.step()
            if self.lr_scheduler is not None:
                self.lr_scheduler.step()
            if self.step % self.target_network_update_freq == 0:
                self.target_net_update()
            loss = loss.data.numpy()
            return loss
        return 0

    def target_net_update(self):
        self.target_Q_net.load_state_dict(self.Q_net.state_dict())

    def load_weights(self, filepath):
        model = torch.load(filepath)
        self.Q_net.load_state_dict(model["Q_net"])
        self.target_Q_net.load_state_dict(model["target_Q_net"])
        self.optim.load_state_dict(model["optim"])

    def save_weights(self, filepath, overwrite=False):
        # Save state_dicts so that load_weights can restore them with load_state_dict.
        torch.save({"Q_net": self.Q_net.state_dict(),
                    "target_Q_net": self.target_Q_net.state_dict(),
                    "optim": self.optim.state_dict()},
                   filepath + "DQN.pkl")
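# `huber_loss` and `Dueling_dqn` are referenced by DQN_Agent above but are not
# defined in this file. The sketches below are plausible stand-ins, not the
# original implementations: an element-wise Huber loss, and the stream
# combination rules (`avg`, `max`, `naive`) that a Dueling_dqn wrapper would
# apply to its value and advantage outputs, matching the docstring's formulas.
import torch


def huber_loss(td_errors, delta=1.0):
    # Quadratic for |error| <= delta, linear beyond it (computed element-wise).
    abs_err = torch.abs(td_errors)
    quadratic = torch.clamp(abs_err, max=delta)
    linear = abs_err - quadratic
    return 0.5 * quadratic ** 2 + delta * linear


def dueling_combine(value, advantage, dueling_way="naive"):
    # value: (batch, 1), advantage: (batch, n_actions).
    if dueling_way == "avg":
        return value + advantage - advantage.mean(dim=1, keepdim=True)
    if dueling_way == "max":
        return value + advantage - advantage.max(dim=1, keepdim=True)[0]
    # "naive" (the default "native" spelling above is treated the same way).
    return value + advantage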
def train():
    memory = ReplayMemory(memory_size)
    fill_memory(memory)
    print('finished filling memory')
    explorer = LinearExplorer(1, 0.1, 1000000, 0.01, 24000000)
    dqn = DQN(state_shape=PROCESSED_FRAME_SIZE,
              n_actions=env.action_space.n).to(device)
    target_dqn = DQN(state_shape=PROCESSED_FRAME_SIZE,
                     n_actions=env.action_space.n).to(device)
    criterion = torch.nn.SmoothL1Loss().to(device)
    optimizer = torch.optim.Adam(dqn.parameters(), lr=lr)
    frame_stack = FrameStack(4, PROCESSED_FRAME_SIZE)
    rewards_list = []
    total_steps = 0
    ts_frame = 0
    ts = time.time()
    for episode in range(episodes_train):
        episode_rewards = 0
        losses = []
        state = env.reset()
        state = frame_stack.push_get(process_frame(state), True)
        done = False
        while not done:
            action, explore_probability = predict_action(dqn, explorer, state,
                                                         env.action_space.n, total_steps)
            next_state, reward, done, _ = env.step(action)
            next_state = frame_stack.push_get(process_frame(next_state))
            # env.render()
            episode_rewards += reward
            memory.push(state, action, reward, next_state, done)
            state = next_state
            if total_steps % update_frequency == 0:
                loss = learn(dqn, target_dqn, memory, criterion, optimizer)
                losses.append(loss.item())
            if done:
                speed = (total_steps - ts_frame) / (time.time() - ts)
                ts_frame = total_steps
                ts = time.time()
                rewards_list.append(episode_rewards)
                print('Episode: {}'.format(episode),
                      'Total reward: {}'.format(episode_rewards),
                      'Explore P: {:.4f}'.format(explore_probability),
                      'Training Loss {}'.format(np.mean(losses)),
                      'total steps {}'.format(total_steps),
                      'speed {} frames/sec'.format(speed))
            if total_steps % target_net_update_freq == 0:
                target_dqn.load_state_dict(dqn.state_dict())
            total_steps += 1
        if episode % 100 == 0:
            torch.save(dqn.state_dict(), MODEL_PATH)
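# `process_frame` and `FrameStack` are used above but not defined in this file.
# The sketches below are assumptions about what they do, not the original code:
# each raw RGB frame is converted to grayscale, resized to a square
# PROCESSED_FRAME_SIZE such as (84, 84) and scaled to [0, 1], and the last k
# processed frames are stacked as the channels-first network input.
import cv2
import numpy as np
from collections import deque


def process_frame(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, PROCESSED_FRAME_SIZE, interpolation=cv2.INTER_AREA)
    return resized.astype(np.float32) / 255.0


class FrameStack:
    def __init__(self, k, frame_size):
        self.k = k
        self.frame_size = frame_size
        self.frames = deque(maxlen=k)

    def push_get(self, frame, reset=False):
        if reset:
            # Start of an episode: fill the stack with copies of the first frame.
            for _ in range(self.k):
                self.frames.append(frame)
        else:
            self.frames.append(frame)
        # Shape (k, H, W), matching a channels-first convolutional input.
        return np.stack(self.frames, axis=0)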
def train():
    memory = ReplayMemory(memory_size)
    fill_memory(memory)
    print('finished filling memory')
    explorer = LinearExplorer(1, 0.1, 100000, 0.01, 1000000)
    dqn = DQN(state_shape=env.observation_space.shape[0],
              n_actions=env.action_space.n).to(device)
    target_dqn = DQN(state_shape=env.observation_space.shape[0],
                     n_actions=env.action_space.n).to(device)
    criterion = torch.nn.SmoothL1Loss().to(device)
    optimizer = torch.optim.Adam(dqn.parameters(), lr=lr)
    latest_rewards = deque([], maxlen=100)
    total_steps = 0
    ts_frame = 0
    ts = time.time()
    for episode in range(episodes_train):
        episode_rewards = 0
        losses = []
        state = env.reset()
        done = False
        while not done:
            action, explore_probability = predict_action(
                dqn, explorer, state, env.action_space.n, total_steps)
            next_state, reward, done, _ = env.step(action)
            # env.render()
            episode_rewards += reward
            memory.push(state, action, reward, next_state, done)
            state = next_state
            if total_steps % update_frequency == 0:
                loss = learn(dqn, target_dqn, memory, criterion, optimizer)
                losses.append(loss.item())
            if done:
                speed = (total_steps - ts_frame) / (time.time() - ts)
                ts_frame = total_steps
                ts = time.time()
                latest_rewards.append(episode_rewards)
                print('Episode: {}'.format(episode),
                      'reward: {}'.format(episode_rewards),
                      'explore P: {:.4f}'.format(explore_probability),
                      'loss: {:.4f}'.format(np.mean(losses)),
                      'steps: {}'.format(total_steps),
                      'speed: {:.1f} frames/sec'.format(speed),
                      'average 100: {:.2f}'.format(np.mean(latest_rewards)))
            if total_steps % target_net_update_freq == 0:
                target_dqn.load_state_dict(dqn.state_dict())
            total_steps += 1
        if episode % 10 == 0:
            torch.save(dqn.state_dict(), MODEL_PATH)
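# `LinearExplorer` and the step-based `predict_action` used in the two train()
# functions above are not defined in this file. The sketch below assumes the
# constructor arguments mean (eps_start, eps_mid, mid_steps, eps_final,
# final_steps): epsilon is annealed linearly from eps_start to eps_mid over the
# first mid_steps environment steps, then from eps_mid to eps_final until
# final_steps, and stays constant afterwards. `device` is assumed to be a
# module-level global, as in the snippets above.
import numpy as np
import torch


class LinearExplorer:
    def __init__(self, eps_start, eps_mid, mid_steps, eps_final, final_steps):
        self.eps_start = eps_start
        self.eps_mid = eps_mid
        self.mid_steps = mid_steps
        self.eps_final = eps_final
        self.final_steps = final_steps

    def explore_probability(self, step):
        if step < self.mid_steps:
            frac = step / self.mid_steps
            return self.eps_start + frac * (self.eps_mid - self.eps_start)
        if step < self.final_steps:
            frac = (step - self.mid_steps) / (self.final_steps - self.mid_steps)
            return self.eps_mid + frac * (self.eps_final - self.eps_mid)
        return self.eps_final


def predict_action(dqn, explorer, state, n_actions, total_steps):
    explore_probability = explorer.explore_probability(total_steps)
    if np.random.rand() < explore_probability:
        # Explore: random action.
        action = np.random.randint(n_actions)
    else:
        # Exploit: greedy action from the online network.
        with torch.no_grad():
            state_t = torch.as_tensor(np.asarray(state), dtype=torch.float32,
                                      device=device).unsqueeze(0)
            action = int(dqn(state_t).argmax(dim=1).item())
    return action, explore_probability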