def init_dqn(args):
    """Initialises and returns the necessary objects for Deep Q-learning:
    Q-network, target network, replay buffer and optimizer.
    """
    logging.info("Initialising DQN with architecture {} and optimizer {}".format(
        args.dqn_archi, args.optimizer_agent))
    if args.dqn_archi == 'mlp':
        q_net = DQN(args.obs_shape, args.n_actions, args)
        q_target = DQN(args.obs_shape, args.n_actions, args)
    elif args.dqn_archi == 'cnn':
        q_net = CnnDQN(args.obs_shape, args.n_actions, args)
        q_target = CnnDQN(args.obs_shape, args.n_actions, args)
    if args.optimizer_agent == 'RMSProp':
        optimizer_agent = optim.RMSprop(q_net.parameters(),
                                        lr=args.lr_agent,
                                        weight_decay=args.lambda_agent)
    else:
        assert args.optimizer_agent == 'Adam'
        optimizer_agent = optim.Adam(q_net.parameters(),
                                     lr=args.lr_agent,
                                     weight_decay=args.lambda_agent)
    q_target.load_state_dict(q_net.state_dict())  # set params of q_target to be the same as q_net
    replay_buffer = ReplayBuffer(args.replay_buffer_size)
    if args.epsilon_annealing_scheme == 'linear':
        epsilon_schedule = LinearSchedule(
            schedule_timesteps=int(args.exploration_fraction * args.n_agent_steps),
            initial_p=args.epsilon_start,
            final_p=args.epsilon_stop)
    else:
        assert args.epsilon_annealing_scheme == 'exp'
        epsilon_schedule = ExpSchedule(decay_rate=args.epsilon_decay,
                                       final_p=args.epsilon_stop,
                                       initial_p=args.epsilon_start)
    return q_net, q_target, replay_buffer, optimizer_agent, epsilon_schedule
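# A minimal usage sketch for init_dqn. The argument names below are taken
# from the fields the function reads; the concrete values are illustrative
# assumptions, not values from the original project.
from argparse import Namespace

args = Namespace(
    dqn_archi='mlp', optimizer_agent='Adam',
    obs_shape=(4,), n_actions=2,
    lr_agent=1e-3, lambda_agent=0.0,
    replay_buffer_size=100_000,
    epsilon_annealing_scheme='linear',
    exploration_fraction=0.1, n_agent_steps=1_000_000,
    epsilon_start=1.0, epsilon_stop=0.05,
    epsilon_decay=None)

q_net, q_target, replay_buffer, optimizer_agent, epsilon_schedule = init_dqn(args)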
class Agent():
    def __init__(self, learn_rate, state_shape, num_actions, action_shape,
                 batch_size, slice_size):
        self.gamma = 0.999
        self.tau = 0.01
        self.clip_grad_norm = 0.1
        self.has_target_net = True
        self.state_shape = state_shape
        self.num_actions = num_actions  # how many actions there are to choose from
        self.action_shape = action_shape  # how many actions the env accepts at each step
        self.buffer_size = 1_000_000
        self.batch_size = batch_size  # *times slice_size, because recurrency/rollouts
        self.slice_size = slice_size
        self.slice_replay_buffer = MemorySliceReplayBuffer(
            size=self.buffer_size,
            slice_size=self.slice_size,
            state_shape=self.state_shape,
            action_shape=self.action_shape)
        self.epsilon = LinearSchedule(start=1.0, end=0.01, num_steps=300)
        # self.epsilon = LinearSchedule(start=1.0, end=0.1, num_steps=30)
        # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")
        self.net = DQN(state_shape, num_actions).to(self.device)
        if self.has_target_net:
            self.target_net = copy.deepcopy(self.net).to(self.device)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learn_rate)

    def update_target_net_params(self):
        # Polyak averaging: the target net slowly tracks the online net
        for param, target_param in zip(self.net.parameters(),
                                       self.target_net.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def choose_action(self, observation, hidden_state):
        state = torch.tensor(observation).float().unsqueeze(0)
        state = state.detach().to(self.device)
        q_values, hidden_state_ = self.net(state, hidden_state)
        action = torch.argmax(q_values[0]).item()
        if random.random() <= self.epsilon.value():
            # randint is inclusive at both ends, so sample in [0, num_actions - 1]
            action = random.randint(0, self.num_actions - 1)
        return action, hidden_state_

    def learn(self, stats):
        if self.slice_replay_buffer.count < self.batch_size:
            return
        self.net.train()
        (states_slices, actions_slices, rewards_slices,
         next_states_slices, dones_slices) = self.slice_replay_buffer.sample(
             self.batch_size, self.device)
        batch_losses = []
        hidden_states = self.net.get_batch_hidden_state(self.batch_size).to(self.device)
        for slice_index in range(self.slice_size):
            states = states_slices[:, slice_index]
            actions = actions_slices[:, slice_index]
            rewards = rewards_slices[:, slice_index]
            states_ = next_states_slices[:, slice_index]
            dones = dones_slices[:, slice_index]
            batch_indices = np.arange(self.batch_size, dtype=np.int64)
            qs, hidden_states_ = self.net(states, hidden_states)
            chosen_q = qs[batch_indices, actions.T[0]]
            if self.has_target_net:
                # Double DQN: the online net picks the action, the target net evaluates it
                qs_, hidden_state_3 = self.target_net(states_, hidden_states_)
                action_qs_, hidden_state_3 = self.net(states_, hidden_states_)
                actions_ = torch.argmax(action_qs_, dim=1)
                chosen_q_ = qs_[batch_indices, actions_]
            else:
                action_qs_, hidden_state_3 = self.net(states_, hidden_states_)
                chosen_q_ = torch.max(action_qs_, dim=1)[0]
            rewards = rewards.T[0]
            # Mask the bootstrap term at episode boundaries and detach the target
            q_target = (rewards +
                        self.gamma * chosen_q_ * (1.0 - dones.T[0].float())).detach()
            loss = torch.mean((q_target - chosen_q) ** 2)
            batch_losses.append(loss)  # minimise the TD error (not its negation)
            hidden_states = hidden_states_
            # if an episode ends mid slice then zero the hidden states
            # this could be a problem if backprop stops here
            hidden_states[dones.T[0]] = 0.0
        batch_losses = torch.stack(batch_losses)
        batch_loss = torch.mean(batch_losses)
        stats.last_loss = batch_loss.item()
        self.optimizer.zero_grad()
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.clip_grad_norm)
        self.optimizer.step()
        self.epsilon.step()
        if self.has_target_net:
            self.update_target_net_params()
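# A minimal interaction-loop sketch for the recurrent Agent above. The gym
# environment, the single-step hidden state via get_batch_hidden_state(1),
# and the buffer's store(...) call are illustrative assumptions; the
# MemorySliceReplayBuffer API is not shown here.
import types
import gym

env = gym.make("CartPole-v1")
stats = types.SimpleNamespace(last_loss=None)
agent = Agent(learn_rate=1e-3, state_shape=env.observation_space.shape,
              num_actions=env.action_space.n, action_shape=(1,),
              batch_size=32, slice_size=8)
for episode in range(500):
    obs = env.reset()
    hidden = agent.net.get_batch_hidden_state(1).to(agent.device)
    done = False
    while not done:
        action, hidden = agent.choose_action(obs, hidden)
        obs_, reward, done, _ = env.step(action)
        agent.slice_replay_buffer.store(obs, action, reward, obs_, done)  # assumed method
        obs = obs_
        agent.learn(stats)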
from collections import deque
import random

import torch
from torch import optim
from tqdm import tqdm

from env import Env
from hyperparams import ACTION_DISCRETISATION, OFF_POLICY_BATCH_SIZE as BATCH_SIZE, DISCOUNT, EPSILON, HIDDEN_SIZE, LEARNING_RATE, MAX_STEPS, REPLAY_SIZE, TARGET_UPDATE_INTERVAL, TEST_INTERVAL, UPDATE_INTERVAL, UPDATE_START
from models import DQN, create_target_network
from utils import plot

env = Env()
agent = DQN(HIDDEN_SIZE, ACTION_DISCRETISATION)
target_agent = create_target_network(agent)
optimiser = optim.Adam(agent.parameters(), lr=LEARNING_RATE)
D = deque(maxlen=REPLAY_SIZE)


def convert_discrete_to_continuous_action(action):
    return action.to(dtype=torch.float32) - ACTION_DISCRETISATION // 2


def test(agent):
    with torch.no_grad():
        env = Env()
        state, done, total_reward = env.reset(), False, 0
        while not done:
            action = agent(state).argmax(dim=1, keepdim=True)  # Use purely exploitative policy at test time
            state, reward, done = env.step(convert_discrete_to_continuous_action(action))
            total_reward += reward
        return total_reward
def process_frame(screen):
    # Crop, downsample and convert the raw RGB screen to 8-bit greyscale
    return screen[32:-16:2, ::2].mean(axis=2).astype(np.uint8)


### SETUP ###
env = gym.make(game)
win_streak = []
frame_shape = process_frame(env.reset()).shape
state_shape = (frames_number, *frame_shape)
if not torch.cuda.is_available():
    print('cuda not available')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = DQN(state_shape, env.action_space.n).to(device)
target_net = copy.deepcopy(net).to(device)  # deep copy so the target's parameters are independent
loss = torch.nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)
training_queue = Queue()
memory = Memory(int(1e+4), training_queue, device)


### UTILITY FUNCTIONS ###
def step(action, reset=False):
    if reset:
        state = [process_frame(env.reset())]
        loops = frames_number - 1
    else:
        state = []
        loops = frames_number
    reward = 0.
class QAgent:
    def __init__(self, epsilon_start, epsilon_end, epsilon_anneal, nb_actions,
                 learning_rate, gamma, batch_size, replay_memory_size,
                 hidden_size, model_input_size, use_PER, use_ICM):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_anneal_over_steps = epsilon_anneal
        self.num_actions = nb_actions
        self.gamma = gamma
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.step_no = 0
        self.policy = DQN(hidden_size=hidden_size, inputs=model_input_size,
                          outputs=nb_actions).to(self.device)
        self.target = DQN(hidden_size=hidden_size, inputs=model_input_size,
                          outputs=nb_actions).to(self.device)
        self.target.load_state_dict(self.policy.state_dict())
        self.target.eval()
        self.hidden_size = hidden_size
        self.optimizer = torch.optim.AdamW(self.policy.parameters(), lr=self.learning_rate)
        self.use_PER = use_PER
        if use_PER:
            self.replay = Prioritized_Replay_Memory(replay_memory_size)
        else:
            self.replay = Replay_Memory(replay_memory_size)
        self.loss_function = torch.nn.MSELoss()
        self.use_ICM = use_ICM
        if use_ICM:
            self.icm = ICM(model_input_size, nb_actions)

    # Get the current epsilon value according to the start/end and annealing values
    def get_epsilon(self):
        eps = self.epsilon_end
        if self.step_no < self.epsilon_anneal_over_steps:
            eps = self.epsilon_start - self.step_no * \
                ((self.epsilon_start - self.epsilon_end) / self.epsilon_anneal_over_steps)
        return eps

    # select an action with epsilon greedy
    def select_action(self, state):
        self.step_no += 1
        if np.random.uniform() > self.get_epsilon():
            with torch.no_grad():
                return torch.argmax(self.policy(state)).view(1)
        else:
            return torch.tensor([random.randrange(self.num_actions)],
                                device=self.device, dtype=torch.long)

    # update the model according to one step td targets
    def update_model(self):
        if self.use_PER:
            batch_index, batch, ImportanceSamplingWeights = self.replay.sample(self.batch_size)
        else:
            batch = self.replay.sample(self.batch_size)
        batch_tuple = Transition(*zip(*batch))
        state = torch.stack(batch_tuple.state)
        action = torch.stack(batch_tuple.action)
        reward = torch.stack(batch_tuple.reward)
        next_state = torch.stack(batch_tuple.next_state)
        done = torch.stack(batch_tuple.done)
        self.optimizer.zero_grad()
        if self.use_ICM:
            self.icm.optimizer.zero_grad()
            forward_loss = self.icm.get_forward_loss(state, action, next_state)
            inverse_loss = self.icm.get_inverse_loss(state, action, next_state)
            icm_loss = (1 - self.icm.beta) * inverse_loss.mean() + \
                self.icm.beta * forward_loss.mean()
        td_estimates = self.policy(state).gather(1, action).squeeze()
        td_targets = reward + (1 - done.float()) * self.gamma * \
            self.target(next_state).max(1)[0].detach_()
        if self.use_PER:
            # Weight each sample's squared TD error by its importance-sampling weight
            weights = torch.tensor(ImportanceSamplingWeights, device=self.device)
            elementwise_loss = (td_estimates - td_targets) ** 2
            loss = (weights * elementwise_loss).mean()
            errors = (td_estimates - td_targets).detach()
            self.replay.batch_update(batch_index, errors.cpu().numpy())
        else:
            loss = self.loss_function(td_estimates, td_targets)
        if self.use_ICM:
            loss = self.icm.lambda_weight * loss + icm_loss
        loss.backward()
        for param in self.policy.parameters():
            param.grad.data.clamp_(-1, 1)
        if self.use_ICM:
            self.icm.optimizer.step()
        self.optimizer.step()
        return loss.item()

    # set target net parameters to policy net parameters
    def update_target(self):
        self.target.load_state_dict(self.policy.state_dict())

    # save model
    def save(self, path, name):
        dirname = os.path.dirname(__file__)
        filename = os.path.join(dirname, os.path.join(path, name + ".pt"))
        torch.save(self.policy.state_dict(), filename)

    # load a model
    def load(self, path):
        dirname = os.path.dirname(__file__)
        filename = os.path.join(dirname, path)
        self.policy.load_state_dict(torch.load(filename))

    # store experience in replay memory
    def cache(self, state, action, reward, next_state, done):
        self.replay.push(state, action, reward, next_state, done)
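# A minimal training-loop sketch for QAgent. The environment handling, the
# tensor conversions, the len(agent.replay) check and the hyperparameter
# values are illustrative assumptions; only select_action / cache /
# update_model / update_target come from the class above.
agent = QAgent(epsilon_start=1.0, epsilon_end=0.05, epsilon_anneal=10_000,
               nb_actions=env.action_space.n, learning_rate=1e-3, gamma=0.99,
               batch_size=64, replay_memory_size=100_000, hidden_size=128,
               model_input_size=env.observation_space.shape[0],
               use_PER=False, use_ICM=False)

for episode in range(500):
    state = torch.tensor(env.reset(), dtype=torch.float32, device=agent.device)
    done = False
    while not done:
        action = agent.select_action(state)
        next_obs, reward, done, _ = env.step(action.item())
        next_state = torch.tensor(next_obs, dtype=torch.float32, device=agent.device)
        agent.cache(state, action,
                    torch.tensor(reward, device=agent.device),
                    next_state, torch.tensor(done, device=agent.device))
        state = next_state
        if len(agent.replay) >= agent.batch_size:  # assumed __len__ on the replay memory
            agent.update_model()
    if episode % 10 == 0:
        agent.update_target()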
class Trainer(object):
    def __init__(self, args, n_agents, n_cities, device, data_loader):
        self.n_agents = n_agents
        self.n_cities = n_cities
        self.device = device
        self.args = args
        self.Encoder = Encoder(K=args.steps, M=self.n_cities,
                               L=args.len_encoder).to(self.device)
        self.DQN = DQN(N=self.n_agents, K=args.steps, L=args.len_encoder,
                       M=n_cities).to(self.device)
        self.data_loader = data_loader
        self.iter_data = iter(data_loader)
        self.n_envs = len(data_loader)
        self.idx_env = -1
        self.env = None
        self.EPS_START = self.args.eps_start
        self.EPS_END = self.args.eps_end
        self.EPS_DECAY = self.args.eps_decay
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.RMSprop(self.DQN.parameters(), lr=args.lr)

    def calc_loss(self, samples):
        self.DQN.train()
        states = []
        next_states = []
        for sample in samples:
            states.append(sample.state.reshape(1, -1))
            next_states.append(sample.next_state.reshape(1, -1))
        states = torch.cat(states)
        next_states = torch.cat(next_states)
        # add one dim at 1 (batch_size, 1, state)
        states = states.unsqueeze(1)
        next_states = next_states.unsqueeze(1)
        with torch.enable_grad():
            Q = self.DQN(states)
            Q_next = self.DQN(next_states).detach()  # no gradient through the bootstrap target
            temp_Q = []
            temp_Q_next = []
            for i in range(len(samples)):
                action = samples[i].action
                reward = samples[i].reward.to(self.device)
                action_idx = action[0] * self.n_cities + action[1]
                temp_Q.append(Q[i][action_idx].reshape(1))
                temp_Q_next.append((Q_next[i].max() * self.args.gamma + reward).reshape(1).to(self.device))
            Q = torch.cat(temp_Q).float().to(self.device)
            Q_next = torch.cat(temp_Q_next).float().to(self.device)
            loss = self.criterion(Q, Q_next)
        return loss

    def gen_env(self):
        data = next(self.iter_data)
        self.idx_env += 1
        self.env = Env(n_agents=self.n_agents, n_cities=self.n_cities,
                       steps=self.args.steps, conn=data["conn"],
                       tasks=data["tasks"], cities=data["cities"],
                       rewards=data["rewards"],
                       destinations=data["destinations"],
                       budget=self.args.budget)

    def select_action(self, state):
        eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * \
            math.exp(-1. * self.env.steps_done / self.EPS_DECAY)
        actions = []
        for i in range(self.n_agents):
            p = random.random()
            if p > eps_threshold:
                with torch.no_grad():
                    q = self.DQN(state[i].reshape(1, 1, -1)).reshape(2, -1).max(1)
                    if q[0][0] > q[0][1]:
                        action = torch.tensor([0], device=self.device, requires_grad=False)
                        action = torch.cat((action, q[1][0].reshape(1, ).long()))
                    else:
                        action = torch.tensor([1], device=self.device, requires_grad=False)
                        action = torch.cat((action, q[1][1].reshape(1, ).long()))
                    actions.append(action)
            else:
                action = [random.choice([0, 1]),
                          random.randint(0, self.n_cities - 1)]
                actions.append(torch.tensor(action, device=self.device, requires_grad=False))
        return actions

    def step(self):
        self.env.steps_done += 1
        x = self.env.input().reshape(self.n_agents, -1).to(self.device)
        phi = []
        for i in range(self.n_agents):
            with torch.no_grad():
                phi.append(self.Encoder(x[i]))
        # after encoding
        n = torch.cat(phi, dim=0).reshape(self.n_agents, -1)
        # state: each agent sees its own input plus the other agents' encodings
        s = []
        for i in range(self.n_agents):
            ni = torch.cat((n[0:i], n[i + 1:])).reshape(-1)
            s.append(torch.cat((x[i], ni)))
        s = torch.cat(s).reshape(self.n_agents, -1)
        # epsilon-greedy
        actions = self.select_action(s)
        # collect rewards
        rewards = self.env.step(actions)
        if rewards == -1:
            return "done"
        # state_{t+1}
        x_tp1 = self.env.input().reshape(self.n_agents, -1).to(self.device)
        phi_tp1 = []
        for i in range(self.n_agents):
            with torch.no_grad():
                phi_tp1.append(self.Encoder(x_tp1[i]))
        n_tp1 = torch.cat(phi_tp1, dim=0).reshape(self.n_agents, -1)
        s_tp1 = []
        for i in range(self.n_agents):
            ni = torch.cat((n_tp1[0:i], n_tp1[i + 1:])).reshape(-1)
            s_tp1.append(torch.cat((x_tp1[i], ni)))
        s_tp1 = torch.cat(s_tp1).reshape(self.n_agents, -1)
        # build one Transition tuple per agent
        res = []
        for i in range(self.n_agents):
            res.append(Transition(state=s[i], action=actions[i],
                                  next_state=s_tp1[i], reward=rewards[i]))
        return res
def train_DQN(env: WrapIt, Q: DQN, Q_target: DQN, optimizer: namedtuple,
              replay_buffer: ReplayBuffer, exploration: Schedule):
    """
    @parameters
        env: wrapped gym environment
        Q: online Q-network
        Q_target: target Q-network, periodically synced with Q
        optimizer: namedtuple holding a torch.optim.Optimizer constructor and its kwargs
        replay_buffer: stores the frames
        exploration: epsilon schedule
    @return
        None
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete
    optimizer = optimizer.constructor(Q.parameters(), **optimizer.kwargs)
    num_actions = env.action_space.n
    num_param_updates = 0
    mean_episode_reward = float('nan')
    best_mean_episode_reward = -float('inf')
    LOG_EVERY_N_STEPS = 10000
    last_obs = env.reset(passit=True)
    # Q.getSummary()
    out_count = 0
    bar = tqdm(range(ARGS.timesteps))
    for t in bar:
        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()
        if t > ARGS.startepoch:
            value = select_epsilon_greedy_action(Q, recent_observations, exploration, t, num_actions)
            action = value[0, 0]
        else:
            action = random.randrange(num_actions)
        obs, reward, done, _ = env.step(action)
        reward = max(-1.0, min(reward, 1.0))  # clip reward to [-1, 1]
        replay_buffer.store_effect(last_idx, action, reward, done)
        if done:
            obs = env.reset()
        last_obs = obs
        # bar.set_description(f"{obs.shape} {obs.dtype}")
        if (t > ARGS.startepoch and t % ARGS.dqn_freq == 0
                and replay_buffer.can_sample(ARGS.batchsize)):
            bar.set_description("backward")
            (obs_batch, act_batch, rew_batch,
             next_obs_batch, done_mask) = replay_buffer.sample(ARGS.batchsize)
            (obs_batch, act_batch, rew_batch, next_obs_batch, not_done_mask) = TENSOR(
                obs_batch, act_batch, rew_batch, next_obs_batch, 1 - done_mask)
            (obs_batch, act_batch, rew_batch, next_obs_batch, not_done_mask) = TO(
                obs_batch, act_batch, rew_batch, next_obs_batch, not_done_mask)
            values = Q(obs_batch)
            current_Q_values = values.gather(1, act_batch.unsqueeze(1).long()).squeeze()
            # Compute next Q value based on which action gives max Q values
            # Detach from the current graph since we don't want gradients for next Q to propagate
            next_max_q = Q_target(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            Q_target_values = rew_batch + (ARGS.gamma * next_Q_values)
            # Compute Bellman error
            bellman_error = Q_target_values - current_Q_values
            # clip the bellman error to [-1, 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_error * -1 is the gradient of the loss w.r.t. current_Q_values
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass, feeding the hand-computed gradient
            # current_Q_values.backward(d_error.data.unsqueeze(1))
            current_Q_values.backward(d_error.data)
            # Perform the update
            optimizer.step()
            num_param_updates += 1
            if num_param_updates % ARGS.dqn_updatefreq == 0:
                bar.set_description("update")
                Q_target.load_state_dict(Q.state_dict())
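# The hand-computed gradient above (the clipped TD error fed into backward())
# matches the gradient of the Huber loss with delta = 1, up to the 1/batch
# factor from mean reduction. A sketch of the equivalent, more conventional
# formulation, assuming the same batch tensors as in train_DQN:
import torch.nn.functional as F

loss = F.smooth_l1_loss(current_Q_values, Q_target_values)
optimizer.zero_grad()
loss.backward()
optimizer.step()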
class DQNAgent():
    """Interacts with and learns from the environment."""

    def __init__(self, name, state_size, action_size, use_double_dqn=False,
                 use_dueling=False, seed=0, lr_decay=0.9999,
                 use_prioritized_replay=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.name = name
        self.state_size = state_size
        self.action_size = action_size
        self.use_double_dqn = use_double_dqn
        self.use_dueling = use_dueling
        self.seed = random.seed(seed)
        self.use_prioritized_replay = use_prioritized_replay

        # Q-Network
        if use_dueling:
            self.qnetwork_local = DuelingDQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DuelingDQN(state_size, action_size, seed).to(device)
        else:
            self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target.eval()
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, lr_decay)

        # Replay memory
        if self.use_prioritized_replay:
            self.memory = PrioritizedReplayBuffer(BUFFER_SIZE, seed, alpha=0.2,
                                                  beta=0.8, beta_scheduler=1.0)
        else:
            self.memory = ReplayBuffer(BUFFER_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Epsilon-greedy action selection
        if random.random() > eps:
            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local(state)
            self.qnetwork_local.train()
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        if self.use_prioritized_replay:
            states, actions, rewards, next_states, dones, indices, weights = experiences
        else:
            states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            # Get max predicted Q values (for next states) from target model
            if self.use_double_dqn:
                # Double DQN: the local net selects the next action (on next_states),
                # the target net evaluates it
                best_local_actions = self.qnetwork_local(next_states).max(1)[1].unsqueeze(1)
                Q_targets_next = self.qnetwork_target(next_states).gather(1, best_local_actions)
            else:
                Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)
            # Compute Q targets for current states
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.use_prioritized_replay:
            # Importance-sampling-weighted squared TD error
            td_sq = (Q_targets - Q_expected).squeeze().pow(2)
            with torch.no_grad():
                self.memory.update_priorities(indices, td_sq.detach() * weights)
            loss = (td_sq * weights).mean()
        else:
            # Compute loss
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
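# soft_update above is Polyak averaging. A tiny numeric sketch of the same
# rule on scalar "parameters" (the values are purely illustrative):
tau = 0.01
theta_local, theta_target = 1.0, 0.0
for _ in range(3):
    theta_target = tau * theta_local + (1 - tau) * theta_target
# theta_target is now ~0.0297, creeping slowly toward theta_local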
class DoubleDQNAgent:
    def __init__(self, env, use_conv=True, learning_rate=3e-4, gamma=0.99,
                 tau=0.01, buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.use_conv = use_conv
        if self.use_conv:
            self.model1 = ConvDQN(env.observation_space.shape, env.action_space.n).to(self.device)
            self.model2 = ConvDQN(env.observation_space.shape, env.action_space.n).to(self.device)
        else:
            self.model1 = DQN(env.observation_space.shape, len(env.action_space)).to(self.device)
            self.model2 = DQN(env.observation_space.shape, len(env.action_space)).to(self.device)

        self.optimizer1 = torch.optim.Adam(self.model1.parameters())
        self.optimizer2 = torch.optim.Adam(self.model2.parameters())

    def get_action(self, state, eps=0.20):
        # rand() is uniform in [0, 1); randn() would be a standard normal draw
        if np.random.rand() < eps:
            return np.random.choice(self.env.action_space)
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        qvals = self.model1.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())
        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        # resize tensors to column vectors so everything broadcasts elementwise
        actions = actions.view(actions.size(0), 1)
        rewards = rewards.view(rewards.size(0), 1)
        dones = dones.view(dones.size(0), 1)

        # compute loss
        curr_Q1 = self.model1.forward(states).gather(1, actions)
        curr_Q2 = self.model2.forward(states).gather(1, actions)

        next_Q1 = self.model1.forward(next_states)
        next_Q2 = self.model2.forward(next_states)
        # clipped double-Q target: take the smaller of the two next-state estimates
        next_Q = torch.min(torch.max(next_Q1, 1)[0], torch.max(next_Q2, 1)[0])
        next_Q = next_Q.view(next_Q.size(0), 1)
        expected_Q = rewards + (1 - dones) * self.gamma * next_Q

        loss1 = F.mse_loss(curr_Q1, expected_Q.detach())
        loss2 = F.mse_loss(curr_Q2, expected_Q.detach())
        return loss1, loss2

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss1, loss2 = self.compute_loss(batch)

        self.optimizer1.zero_grad()
        loss1.backward()
        self.optimizer1.step()

        self.optimizer2.zero_grad()
        loss2.backward()
        self.optimizer2.step()
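# A minimal usage sketch for DoubleDQNAgent, assuming an env whose
# action_space behaves like a list of discrete actions (matching the
# len(env.action_space) and np.random.choice calls above). The env variable
# and the replay buffer's push(...) / __len__ calls are assumptions, since
# BasicBuffer's API is not shown here.
agent = DoubleDQNAgent(env, use_conv=False)
for episode in range(300):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)  # assumed method
        state = next_state
        if len(agent.replay_buffer) > 64:  # assumed __len__
            agent.update(batch_size=64)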
memory = per_replay.PrioritizedReplayBuffer(75000, alpha=0.4, beta=0.6, epsilon=0.001)
action_len = 13
demos = parse_demo(args.env_name, memory, args.demo_file)

TARGET_UPDATE = 10

# instantiating model and optimizer
policy_net = DQN(64, 64, 512, action_len).to(device)
target_net = DQN(64, 64, 512, action_len).to(device)
# if args.load_name is not None:
#     model.load_state_dict(pickle.load(open(args.load_name, 'rb')))
if not args.no_train:
    optimizer = optim.Adam(policy_net.parameters(), lr=args.lr)

# instantiating policy object
if args.no_train:
    args.eps_start = 0.0
    args.eps_end = 0.0
    args.eps_steps = 1
policy = EpsGreedyPolicy(args.eps_start, args.eps_end, args.eps_steps)
opt_step = 0

# pre-training
if not args.no_train:
    print('Pre-training')
    for i in range(1000):
class DQNAgent:
    def __init__(self, env, use_conv=True, learning_rate=3e-4, gamma=0.99,
                 buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)
        self.use_conv = use_conv
        if self.use_conv:
            self.model = ConvDQN(env.observation_space.shape, len(env.action_space)).to(self.device)
        else:
            self.model = DQN(env.observation_space.shape, len(env.action_space)).to(self.device)

        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.MSE_loss = nn.MSELoss()

    def get_action(self, state, eps=0.20):
        # check for exploration first, so the forward pass is skipped on random actions;
        # rand() is uniform in [0, 1), unlike randn()
        if np.random.rand() < eps:
            return np.random.choice(self.env.action_space)
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        qvals = self.model.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())
        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        curr_Q = self.model.forward(states).gather(1, actions.unsqueeze(1))
        curr_Q = curr_Q.squeeze(1)
        next_Q = self.model.forward(next_states)
        max_next_Q = torch.max(next_Q, 1)[0].detach()  # no gradient through the bootstrap target
        # mask out the bootstrap term for terminal transitions
        expected_Q = rewards.squeeze(1) + (1 - dones) * self.gamma * max_next_Q

        loss = self.MSE_loss(curr_Q, expected_Q)
        return loss

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss = self.compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
import torch.optim as optim
import copy
import pickle

from utils import *
from models import DQN

initial_Q = AER_initial_Q()
# initial_Q = torch.zeros(n_actions, device=device)
policy_net = DQN(recent_k, n_agents, n_actions, initial_Q).to(device)
target_net = DQN(recent_k, n_agents, n_actions, initial_Q).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.RMSprop(policy_net.parameters())

Q = torch.zeros(n_actions, n_actions, n_actions, device=device)
for i in range(n_actions):
    for j in range(n_actions):
        Q[i, j, :] = initial_Q.view(-1)

memory = ReplayMemory(MEM_SIZE)
heat = torch.zeros(n_agents, n_actions, n_actions, device=device)
heat_unique0 = []
heat_freq0 = []
heat_unique1 = []
heat_freq1 = []
class DQNAgent(BaseAgent):
    """ Agent with a DQN network. """

    def __init__(self, input_dim, output_dim, lr, gamma, max_memory_size,
                 batch_size, eps_start, eps_end, eps_decay, device,
                 linear1_units=64, linear2_units=64, decay_type="linear"):
        super().__init__(max_memory_size, batch_size, eps_start, eps_end,
                         eps_decay, device, decay_type)
        self.model_name = "DQN"
        self.output_dim = output_dim
        self.policy_net = DQN(input_dim, output_dim, linear1_units,
                              linear2_units).to(device)
        # optimizer
        self.optim = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.gamma = gamma

    def choose_action(self, state, testing=False):
        """
        Choose an action to perform. Uses eps-greedy approach.
        :param state: current state of the environment
        :param testing: if True, always choose greedy action
        :return: the action chosen
        """
        self.curr_step += 1
        if not testing and np.random.random() < self.curr_eps:
            return np.random.randint(0, self.output_dim)
        else:
            # we're using the network for inference only, we don't want to track the gradients in this case
            with torch.no_grad():
                return self.policy_net(state).argmax().item()

    def learn(self):
        """
        Update the weights of the network.
        :return: the loss
        """
        states, next_states, actions, rewards, dones = self.memory.sample(self.batch_size)
        curr_q_vals = self.policy_net(states).gather(1, actions)
        next_q_vals = self.policy_net(next_states).max(1, keepdim=True)[0].detach()
        target = (rewards + self.gamma * next_q_vals * (1 - dones)).to(self.device)
        loss = F.smooth_l1_loss(curr_q_vals, target)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        return loss.item()

    def set_test(self):
        """ Sets the network in evaluation mode """
        self.policy_net.eval()

    def set_train(self):
        """ Sets the network in training mode """
        self.policy_net.train()

    def save(self, filename):
        """
        Save the network weights.
        :param filename: path
        """
        self.policy_net.save(filename)

    def load(self, filename):
        """
        Load the network weights.
        :param filename: path of the weight file
        """
        self.policy_net.load(filename, self.device)
def initialize(game, model_name, warm_start):
    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    capacity = int(1e4)

    # Cold start
    if not warm_start:
        # Initialize model
        model = DQN(in_channels=num_frames, num_actions=num_actions)
        optimizer = optim.RMSprop(model.parameters(), lr=1.0e-4, weight_decay=0.01)
        # Initialize replay memory
        memory_buffer = ReplayMemory(capacity)
        # Initialize statistics
        running_reward = None
        running_rewards = []

    # Warm start
    if warm_start:
        data_file = 'results/{}_{}.p'.format(game, model_name)
        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
            running_reward = running_rewards[-1]
            prior_eps = len(running_rewards)
            model_file = 'saved_models/{}_{}_ep_{}.p'.format(game, model_name, prior_eps)
            with open(model_file, 'rb') as f:
                saved_model = pickle.load(f)
            model, optimizer, memory_buffer = saved_model
        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = DQN(in_channels=num_frames, num_actions=num_actions)
            optimizer = optim.RMSprop(model.parameters(), lr=1.0e-4, weight_decay=0.01)
            # Initialize replay memory
            memory_buffer = ReplayMemory(capacity)
            running_reward = None
            running_rewards = []

    cuda = torch.cuda.is_available()
    if cuda:
        model = model.cuda()
    criterion = torch.nn.MSELoss()
    return env, model, optimizer, criterion, memory_buffer, cuda, running_reward, running_rewards
class Agent:
    def __init__(self):
        self.controller, self.target = DQN(), DQN()  # For RL
        self.vision = VAE()
        if USE_CUDA:
            self.controller.cuda()
            self.target.cuda()
            self.vision.cuda()
        # Init weights based on init function
        self.controller.apply(init_weights)
        self.vision.apply(init_weights)
        # Load model params into target
        self.target.load_state_dict(self.controller.state_dict())
        self.action_number = 0  # actions taken (to determine whether or not to update)
        # NOTE: DQN exp buffer should use embeddings generated by vision module
        # The vision module (aka the VAE) has memory consisting of game states
        self.exp_buffer = []  # exp buffer
        self.exp_number = 0  # size of exp buffer so far
        self.opt = torch.optim.Adam(self.controller.parameters(), lr=DQN_LEARNING_RATE)
        self.loss = nn.SmoothL1Loss()

    # Make an action given a state
    def act(self, state, explore=True):
        self.action_number += 1
        # Update target
        if self.action_number % TARGET_INTERVAL == 0:
            self.target.load_state_dict(self.controller.state_dict())
        if explore and np.random.rand() <= EPSILON:
            # Act randomly
            a = np.random.randint(NUM_ACTIONS)
            return a
        # Send state to model
        a_vec = self.controller(self.vision.encode(state))
        a = int(torch.argmax(torch.squeeze(a_vec)))
        return a

    def load_params(self):
        # Looks in current directory for params for model and for VAE
        if LOAD_CHECKPOINT_VAE:
            try:
                self.vision.load_state_dict(torch.load("VAEparams.pt"))
                print("Loaded checkpoint for VAE")
            except:
                print("Could not load VAE checkpoint")
        if LOAD_CHECKPOINT_DQN:
            try:
                self.controller.load_state_dict(torch.load("DQNparams.pt"))
                self.target.load_state_dict(torch.load("DQNparams.pt"))
                print("Loaded checkpoint for DQN")
            except:
                print("Could not load DQN checkpoint")

    def save_params(self):
        torch.save(self.controller.state_dict(), "DQNparams.pt")
        torch.save(self.vision.state_dict(), "VAEparams.pt")

    # clear the buffer
    def clear_exp_buffer(self):
        self.exp_buffer = []
        self.exp_number = 0
        self.vision.memory = []
        self.vision.memory_num = 0

    # Add experience to exp buffer
    def add_exp(self, exp):
        self.vision.remember(exp[0])
        if self.exp_number >= EXP_BUFFER_MAX:
            del self.exp_buffer[0]
        else:
            self.exp_number += 1
        exp[0] = self.vision.encode(exp[0])
        exp[3] = self.vision.encode(exp[3])
        self.exp_buffer.append(exp)

    # Replay gets batch and trains on it
    # Returns [vision loss, controller loss]
    def replay(self, batch_size):
        v_loss, q_loss = 0, 0  # Init to 0 in case we need to return without any training
        # Train vision component first
        if self.action_number % VAE_UPDATE_INTERVAL == 0:
            v_loss = self.vision.replay()
        # If experience buffer isn't right size yet, don't do anything
        if self.exp_number < EXP_BUFFER_MIN or self.action_number % TRAINING_INTERVAL != 0:
            return [v_loss, q_loss]
        # Get batch from experience_buffer
        batch = random.sample(self.exp_buffer, batch_size)
        s, a, r, s_new, _ = zip(*batch)
        s_new = s_new[:-1]  # Remove last (terminal) item
        # First turn batch into something we can run through model
        s = torch.cat(s)
        a = torch.LongTensor(a).unsqueeze(1)
        r = torch.FloatTensor(r)
        s_new = torch.cat(s_new)
        if USE_CUDA:
            a = a.cuda()
            r = r.cuda()
        # Get q vals for s (what model outputted) from a
        # .gather gets us q value for specific action a
        pred_q_vals = self.controller(s).gather(1, a).squeeze()
        # Having chosen a in s,
        # what is the highest possible reward we can get from s_new?
        # We add q of performing a in s then add best q from next state
        # cat 0 to end for the terminal state
        s_new_q_vals = self.target(s_new).max(1)[0]
        zero = torch.zeros(1)
        if USE_CUDA:
            zero = zero.cuda()
        s_new_q_vals = torch.cat((s_new_q_vals, zero))
        exp_q_vals = r + s_new_q_vals * GAMMA
        myloss = self.loss(pred_q_vals, exp_q_vals)
        self.opt.zero_grad()
        myloss.backward()
        if WEIGHT_CLIPPING:
            for param in self.controller.parameters():
                param.grad.data.clamp_(-1, 1)  # Gradient clipping avoids exploding gradients
        self.opt.step()
        global EPSILON
        if EPSILON > EPSILON_MIN:
            EPSILON *= EPSILON_DECAY
        return [v_loss, myloss.item()]
class Agent():
    def __init__(self, learn_rate, input_shape, num_actions, batch_size):
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.gamma = 0.99
        self.tau = 0.05
        self.has_target_net = False
        self.memories = []
        # self.epsilon = LinearSchedule(start=1.0, end=0.01, num_steps=2000)
        self.epsilon = LinearSchedule(start=1.0, end=0.1, num_steps=30)
        # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")
        self.net = DQN().to(self.device)
        if self.has_target_net:
            self.target_net = copy.deepcopy(self.net).to(self.device)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learn_rate)

    def update_target_net_params(self):
        for param, target_param in zip(self.net.parameters(), self.target_net.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def choose_action(self, observation, hidden_state):
        state = torch.tensor(observation).float().detach()
        state = state.to(self.device)
        q_values, hidden_state_ = self.net(state, hidden_state)
        action = torch.argmax(q_values).item()
        if random.random() <= self.epsilon.value():
            action = random.randint(0, self.num_actions - 1)
        return action, hidden_state_

    def fetch_batch(self):
        indices = np.random.choice(len(self.memories), self.batch_size, replace=False)
        for idx in list(indices):
            yield self.memories[idx]

    def store_trajectory(self, trajectory):
        self.memories.append(trajectory)

    def learn(self):
        if len(self.memories) < self.batch_size:
            return
        batch_losses = []
        for memory_idx, memory in enumerate(self.fetch_batch()):
            states, actions, rewards, dones = memory.fetch_on_device(self.device)
            self.net.train()
            episode_losses = []
            hidden_state = self.net.get_new_hidden_state().to(self.device)
            second_to_last_memory_index = len(memory.states) - 1
            for i in range(second_to_last_memory_index):
                state = states[i].detach()
                state_ = states[i + 1].detach()
                action = actions[i].detach()
                reward = rewards[i].detach()
                done = (i == second_to_last_memory_index - 1)
                qs, hidden_state_ = self.net(state, hidden_state)
                chosen_q = qs[action]
                if self.has_target_net:
                    # Double DQN: online net selects the action, target net evaluates it
                    qs_, hidden_state_3 = self.target_net(state_, hidden_state_)
                    action_qs_, hidden_state_3 = self.net(state_, hidden_state_)
                    action_ = torch.argmax(action_qs_)
                    chosen_q_ = qs_[action_]
                else:
                    action_qs_, hidden_state_3 = self.net(state_, hidden_state_)
                    chosen_q_ = torch.max(action_qs_)
                if done:
                    chosen_q_ = torch.tensor(0.0, dtype=torch.float32).to(self.device)
                # detach the bootstrap target so gradients only flow through chosen_q
                q_target = (reward + self.gamma * chosen_q_).detach()
                loss = (q_target - chosen_q) ** 2
                episode_losses.append(loss)
                hidden_state = hidden_state_
            episode_loss = sum(episode_losses) / len(episode_losses)
            batch_losses.append(episode_loss)
        batch_loss = sum(batch_losses) / len(batch_losses)
        self.optimizer.zero_grad()
        batch_loss.backward()
        self.optimizer.step()
        for i in range(self.batch_size):
            self.epsilon.step()
        if self.has_target_net:
            self.update_target_net_params()
class DQNAgent(object):
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, eps_min=0.01, eps_dec=0.9999, replace=1000,
                 algo=None, env_name=None, chkpt_dir='tmp/dqn', device='cuda:0'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.device = device
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        # Create policy and target DQN models
        self.policy = DQN(self.n_actions, input_dims=self.input_dims,
                          name=self.env_name + '_' + 'policy',
                          chkpt_dir=self.chkpt_dir)
        self.target = DQN(self.n_actions, input_dims=self.input_dims,
                          name=self.env_name + '_' + 'target',
                          chkpt_dir=self.chkpt_dir)
        # put on correct device (GPU or CPU)
        self.policy.to(device)
        self.target.to(device)
        # Optimizer
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        # Loss
        self.loss = nn.MSELoss()

    def choose_action(self, observation):
        # Choose an action epsilon-greedily
        if np.random.random() > self.epsilon:
            state = torch.tensor([observation], dtype=torch.float).to(self.device)
            actions = self.policy.forward(state)
            action = torch.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)
        states = torch.tensor(state).to(self.device)
        rewards = torch.tensor(reward).to(self.device)
        dones = torch.tensor(done).to(self.device)
        actions = torch.tensor(action).to(self.device)
        states_ = torch.tensor(new_state).to(self.device)
        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.target.load_state_dict(self.policy.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.eps_min:
            self.epsilon *= self.eps_dec

    def save_models(self):
        self.policy.save_checkpoint()

    def load_models(self):
        self.policy.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        self.optimizer.zero_grad()
        self.replace_target_network()
        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)
        q_pred = self.policy.forward(states)[indices, actions]
        q_next = self.target.forward(states_).max(dim=1)[0].detach()  # no gradient through the target net
        q_next[dones] = 0.0  # terminal states have no bootstrap value
        q_target = rewards + self.gamma * q_next
        loss = self.loss(q_target, q_pred)
        loss.backward()
        self.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()
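# A minimal training-loop sketch for the DQNAgent above, using only the
# methods defined on the class. The gym environment and the hyperparameter
# values are illustrative assumptions.
import gym

env = gym.make('PongNoFrameskip-v4')
agent = DQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4,
                 n_actions=env.action_space.n,
                 input_dims=env.observation_space.shape,
                 mem_size=50_000, batch_size=32,
                 algo='DQNAgent', env_name='PongNoFrameskip-v4',
                 device='cpu')
for episode in range(500):
    obs = env.reset()
    done = False
    while not done:
        action = agent.choose_action(obs)
        obs_, reward, done, _ = env.step(action)
        agent.store_transition(obs, action, reward, obs_, done)
        agent.learn()
        obs = obs_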
class Agent:
    def __init__(self):
        self.model, self.target = DQN(), DQN()
        if USE_CUDA:
            self.model.cuda()
            self.target.cuda()
        self.exp_buffer = Memory()
        self.exp_number = 0  # size of exp buffer so far
        self.param_updates = 0  # track how many times params updated
        self.opt = torch.optim.RMSprop(self.model.parameters(), lr=LEARNING_RATE)
        self.loss = nn.SmoothL1Loss()

    # Make an action given a state
    def act(self, state, explore=True):
        if explore and np.random.rand() <= EPSILON:
            # Act randomly
            a = np.random.randint(NUM_ACTIONS)
        else:
            # Send state to model
            a_vec = self.model(state)
            a = int(torch.argmax(torch.squeeze(a_vec)))
        return a

    # clear the buffer
    def clear_exp_buffer(self):
        self.exp_buffer = Memory()
        self.exp_number = 0

    # Add experience to exp buffer
    def add_exp(self, exp):
        self.exp_buffer.add(exp)
        self.exp_number += 1

    # Replay gets batch and trains on it
    def replay(self, batch_size):
        q_loss = 0
        # If experience buffer isn't right size yet, don't do anything
        if self.exp_number < MIN_BUFFER_SIZE:
            return
        # Get batch from experience_buffer
        batch = self.exp_buffer.get_batch(batch_size)
        s, a, r, s_new, _ = zip(*batch)
        s_new = s_new[:-1]  # Remove last item (it is 'None')
        # First turn batch into something we can run through model
        s = torch.cat(s)
        a = torch.LongTensor(a).unsqueeze(1)
        r = torch.FloatTensor(r).unsqueeze(1)
        s_new = torch.cat(s_new)
        # print(a.shape, r.shape, s.shape, s_new.shape)
        if USE_CUDA:
            a = a.cuda()
            r = r.cuda()
        # Get q vals for s (what model outputted) from a
        # .gather gets us q value for specific action a
        pred_q_vals = self.model(s).gather(1, a)
        # Having chosen a in s,
        # what is the highest possible reward we can get from s_new?
        # We add q of performing a in s then add best q from next state
        # cat 0 to end for the terminal state
        s_new_q_vals = self.target(s_new).max(1)[0]
        zero = torch.zeros(1)  # FloatTensor(0) would create an empty tensor
        if USE_CUDA:
            zero = zero.cuda()
        s_new_q_vals = torch.cat((s_new_q_vals, zero)).unsqueeze(1)  # (batch, 1) to match pred_q_vals
        exp_q_vals = r + s_new_q_vals * GAMMA
        myloss = self.loss(pred_q_vals, exp_q_vals)
        self.opt.zero_grad()
        myloss.backward()
        if WEIGHT_CLIPPING:
            # clamp gradients before the optimizer step, otherwise it has no effect
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)  # Gradient clipping avoids exploding gradients
        self.opt.step()
        if self.param_updates % TARGET_UPDATE_INTERVAL == 0:
            self.target.load_state_dict(self.model.state_dict())
        self.param_updates += 1
        global EPSILON
        if EPSILON > EPSILON_MIN:
            EPSILON *= EPSILON_DECAY
        return myloss.item()
                        j.add(infomation)
                if feat == 2:
                    if receive.edgeCountInfo[edge] < give.edgeCountInfo[edge]:
                        receive.edgeCountInfo[edge] = give.edgeCountInfo[edge]
                        j.add(infomation)
        for i in range(num_agent):
            if i != give.num and i != receive.num:
                receive.featureUpdate[i] = receive.featureUpdate[i].union(j)
        give.featureUpdate[receive.num].clear()
    elif give.num == receive.num:
        give.featureUpdate[receive.num].clear()


model = DQN(nfeat=num_feature)
# model.load_state_dict(torch.load(lists))  # retrain
model_target = DQN(nfeat=num_feature)
model_target.load_state_dict(model.state_dict())
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0002)

replay = namedtuple('replay', ('nextnode', 'state', 'action', 'reward', 'next_state'))


class Replay_buffer():
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        # object dtype so each slot can hold a `replay` namedtuple
        self.buffer = np.zeros([buffer_size], dtype=object)
        self.index = 0
        self.cur_size = 0

    def push(self, experience):
        self.buffer[self.index] = experience
        self.index = (self.index + 1) % self.buffer_size
        if self.cur_size < self.buffer_size:
            self.cur_size += 1

    def sample(self, batch_size):
        sample_index = np.random.choice(np.arange(self.cur_size), size=batch_size, replace=False)
        return self.buffer[sample_index]
class Agent:
    def __init__(self):
        self.model = DQN()
        self.exp_buffer = []  # exp buffer
        self.exp_number = 0  # size of exp buffer so far
        self.opt = torch.optim.Adam(self.model.parameters(), lr=LEARNING_RATE)
        self.loss = nn.MSELoss()

    # Make an action given a state
    def act(self, state, explore=True):
        if explore and np.random.rand() <= EPSILON:
            # Act randomly
            a = np.random.randint(2)
            return a
        # Send state to model
        state = torch.from_numpy(state).float()
        a_vec = self.model(state)
        a = int(torch.argmax(a_vec))
        return a

    # clear the buffer
    def clear_exp_buffer(self):
        self.exp_buffer = []
        self.exp_number = 0

    # Add experience to exp buffer
    def add_exp(self, exp):
        if self.exp_number == MAX_BUFFER_SIZE:
            del self.exp_buffer[0]
        else:
            self.exp_number += 1
        # Convert numpy arrays to tensor
        exp[0] = torch.from_numpy(exp[0]).float()
        if exp[4] == False:
            exp[3] = torch.from_numpy(exp[3]).float()
        self.exp_buffer.append(exp)

    # Replay gets batch and trains on it
    def replay(self, batch_size):
        # If experience buffer isn't right size yet, don't do anything
        if self.exp_number < MIN_BUFFER_SIZE:
            return
        # Get batch from experience_buffer
        batch_ind = list(torch.randint(self.exp_number, (batch_size,)).numpy())
        batch = get_sublist(self.exp_buffer, batch_ind)
        q_loss = 0
        # Go through samples
        for s, a, r, s_new, done in batch:
            if done:
                Q_val = r
            else:
                with torch.no_grad():  # the bootstrap target should not carry gradients
                    Q_val = r + GAMMA * torch.max(self.model(s_new))
            self.opt.zero_grad()
            Q_pred = self.model(s)
            # the target equals the prediction everywhere except the taken action,
            # so only that action's Q-value receives a learning signal
            Q_targ = Q_pred.detach().clone()
            Q_targ[a] = Q_val
            myloss = self.loss(Q_pred, Q_targ)
            myloss.backward()
            q_loss += myloss.item()
            self.opt.step()
        global EPSILON
        if EPSILON > EPSILON_MIN:
            EPSILON *= EPSILON_DECAY
        return q_loss
    if sample > eps_threshold:
        obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
        with torch.no_grad():  # inference only, no need to track the history
            return model(obs).data.max(1)[1].cpu()
    else:
        return torch.IntTensor([[random.randrange(NUM_ACTIONS)]])


# vis = visdom.Visdom(port=8124)

# Initialize target q function and q function
Q = DQN(IMG_C, FRAME_HISTORY_LEN, NUM_ACTIONS).type(dtype)
target_Q = DQN(IMG_C, FRAME_HISTORY_LEN, NUM_ACTIONS).type(dtype)

# Construct Q network optimizer function
optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

# Construct the replay buffer
replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, FRAME_HISTORY_LEN)

###############
#   RUN ENV   #
###############
num_param_updates = 0
mean_episode_reward = float('nan')
best_mean_episode_reward = -float('inf')
last_obs = env.reset()
episodes_rewards = []

for t in count():
    ### Step the env and store the transition