def __init__(self, input_dim, output_dim, lr, gamma, max_memory_size, batch_size,
             eps_start, eps_end, eps_decay, device,
             linear1_units=64, linear2_units=64, decay_type="linear"):
    super().__init__(max_memory_size, batch_size, eps_start, eps_end, eps_decay,
                     device, decay_type)
    self.model_name = "DQN"
    self.output_dim = output_dim
    self.policy_net = DQN(input_dim, output_dim, linear1_units, linear2_units).to(device)
    # optimizer
    self.optim = optim.Adam(self.policy_net.parameters(), lr=lr)
    self.gamma = gamma

def main(): """Run DQN until the environment throws an exception.""" env = AllowBacktracking(make_env(stack=False, scale_rew=False)) env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False) config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config) as sess: dqn = DQN(*rainbow_models(sess, env.action_space.n, gym_space_vectorizer(env.observation_space), min_val=-200, max_val=200)) player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 4) optim, optimize = dqn.optimize(learning_rate=0.0001) sess.run(tf.global_variables_initializer()) dqn.train( num_steps=3000000, # Make sure an exception arrives before we stop. player=player, replay_buffer=PrioritizedReplayBuffer(500000, 0.5, 0.4, epsilon=0.1), optimize_op=optimize, train_interval=1, target_interval=1024, batch_size=16, min_buffer_size=20000)
def __init__(self, input_dim, output_dim, lr, gamma, max_memory_size, batch_size,
             eps_start, eps_end, eps_decay, device, target_update=100,
             linear1_units=64, linear2_units=64, decay_type="linear"):
    super().__init__(input_dim, output_dim, lr, gamma, max_memory_size, batch_size,
                     eps_start, eps_end, eps_decay, device,
                     linear1_units, linear2_units, decay_type)
    self.model_name = "FixedDQN"
    self.target_update_freq = target_update
    # networks
    self.output_dim = output_dim
    self.target_net = DQN(input_dim, output_dim, linear1_units, linear2_units).to(device)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()
    self.updated = 0

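# A hard target-network sync like the one FixedDQN prepares above is usually a
# one-line state_dict copy gated on an update counter. A minimal sketch, assuming
# the attribute names from the constructor above (policy_net, target_net,
# target_update_freq, updated); this is illustrative, not necessarily the repo's method.
def maybe_sync_target(agent):
    agent.updated += 1
    if agent.updated % agent.target_update_freq == 0:
        # Overwrite the frozen target with the current policy weights
        agent.target_net.load_state_dict(agent.policy_net.state_dict())
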
def __init__(self, args, n_agents, n_cities, device, data_loader):
    self.n_agents = n_agents
    self.n_cities = n_cities
    self.device = device
    self.args = args
    self.Encoder = Encoder(K=args.steps, M=self.n_cities, L=args.len_encoder).to(self.device)
    self.DQN = DQN(N=self.n_agents, K=args.steps, L=args.len_encoder, M=n_cities).to(self.device)
    self.data_loader = data_loader
    self.iter_data = iter(data_loader)
    self.n_envs = len(data_loader)
    self.idx_env = -1
    self.env = None
    self.EPS_START = self.args.eps_start
    self.EPS_END = self.args.eps_end
    self.EPS_DECAY = self.args.eps_decay
    self.criterion = nn.MSELoss()
    self.optimizer = torch.optim.RMSprop(self.DQN.parameters(), lr=args.lr)

def __init__(self, player, episode):
    self.EPSILON = EPS_END + (EPS_START - EPS_END) * (1 - (episode / DECAY_LEN))
    self.EPSILON = max(self.EPSILON, EPS_END)
    self.n_states = 9
    self.state = np.zeros(self.n_states, dtype=np.int64)  # np.int is removed in recent NumPy
    self.player = player
    self.reward = 0
    self.prev_state = None
    self.dqn = DQN(self.n_states + 1, self.n_states)

def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size, batch_size,
             eps_min=0.01, eps_dec=0.9999, replace=1000, algo=None, env_name=None,
             chkpt_dir='tmp/dqn', device='cuda:0'):
    self.gamma = gamma
    self.epsilon = epsilon
    self.lr = lr
    self.n_actions = n_actions
    self.input_dims = input_dims
    self.batch_size = batch_size
    self.eps_min = eps_min
    self.eps_dec = eps_dec
    self.replace_target_cnt = replace
    self.algo = algo
    self.env_name = env_name
    self.chkpt_dir = chkpt_dir
    self.action_space = [i for i in range(n_actions)]
    self.learn_step_counter = 0
    self.device = device
    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
    # Create policy and target DQN models
    self.policy = DQN(self.n_actions, input_dims=self.input_dims,
                      name=self.env_name + '_' + 'policy', chkpt_dir=self.chkpt_dir)
    self.target = DQN(self.n_actions, input_dims=self.input_dims,
                      name=self.env_name + '_' + 'target', chkpt_dir=self.chkpt_dir)
    # put on correct device (GPU or CPU)
    self.policy.to(device)
    self.target.to(device)
    # Optimizer
    self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
    # Loss
    self.loss = nn.MSELoss()

def __init__(self, sess, state_dimension, num_actions, tau=0.001):
    self.sess = sess
    DQN.__init__(self, sess, state_dimension, num_actions, scope="model", reuse=True)
    self.tau = tau
    self._counterpart = self._register_counterpart()

    def update():
        for op in self._counterpart:
            self.sess.run(op)

    self.update = update
    tf.global_variables_initializer().run(session=sess)

def __init__(self, sess, state_dimension, num_actions, scope="model", reuse=False):
    # sess = tf.Session()
    # TODO add CPU config information
    # Targets in loss computation
    self.target_in = tf.placeholder(shape=[None], dtype=tf.float32)  # target Q values
    self.action_in = tf.placeholder(shape=[None, 2], dtype=tf.int32)
    train_model = DQN(sess, state_dimension, num_actions, scope, reuse=reuse)
    # target_model = TargetNetwork(sess, state_dimension, num_actions)
    self.loss = tf.losses.mean_squared_error(
        labels=self.target_in,
        predictions=tf.gather_nd(params=train_model.pred_out, indices=self.action_in))
    self.optimizer = tf.train.AdamOptimizer(0.0005)
    self.train_step = self.optimizer.minimize(self.loss)
    # tf.add_to_collection(tf.GraphKeys.TRAIN_OP, self.pred_out)

    def train(obs, actions, targets):
        """Updates the weights of the neural network, based on its targets,
        its predictions, its loss and its optimizer.

        Args:
            obs: [current_observation] or observations of batch
            actions: [current_action] or actions of batch
            targets: [current_target] or targets of batch
        """
        feed_dict = {
            train_model.obs_in: obs,
            self.action_in: actions,
            self.target_in: targets
        }
        # evaluate the TF tensors and operations self.loss and self.train_step
        loss, _ = sess.run([self.loss, self.train_step], feed_dict=feed_dict)
        return loss

    self.train_model = train_model
    # self.target_model = target_model
    self.train = train
    self.predict = train_model.predict
    # self.save = save_params
    # self.load = load_params
    tf.global_variables_initializer().run(session=sess)

def main():
    opt = parse_opt()
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')

    env = gym.make(game)
    seed = 7122
    env.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    agent = DQN(env, opt, device=device)
    agent.network.apply(weights_init)
    agent.sync_weight()

    progress = trange(opt.episode, ascii=True)
    summary = Summary()
    last_rewards = 0

    for episode in progress:
        # Training
        state = env.reset()
        for s in range(opt.max_step):
            # use epsilon-greedy in training
            action = agent.egreedy_action(state)
            next_state, reward, done, _ = env.step(action)
            loss = agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        summary.add(episode, 'loss', loss)

        # Testing
        if opt.test_interval > 0 and (episode + 1) % opt.test_interval == 0:
            rewards = 0
            for t in trange(opt.test, ascii=True, leave=False):
                state = env.reset()
                for s in range(opt.max_step):
                    action = agent.action(state)
                    next_state, reward, done, _ = env.step(action)
                    state = next_state
                    rewards += reward
                    if done:
                        break
            if opt.test > 0:
                rewards /= opt.test
            last_rewards = rewards
            summary.add(episode, 'reward', rewards)

        progress.set_description('Loss: {:.4f} | Reward: {:2}'.format(loss, last_rewards))

    if opt.log:
        summary.write(opt.log)

def __init__(self, epsilon_start, epsilon_end, epsilon_anneal, nb_actions, learning_rate,
             gamma, batch_size, replay_memory_size, hidden_size, model_input_size,
             use_PER, use_ICM):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.epsilon_start = epsilon_start
    self.epsilon_end = epsilon_end
    self.epsilon_anneal_over_steps = epsilon_anneal
    self.num_actions = nb_actions
    self.gamma = gamma
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.step_no = 0
    self.policy = DQN(hidden_size=hidden_size, inputs=model_input_size,
                      outputs=nb_actions).to(self.device)
    self.target = DQN(hidden_size=hidden_size, inputs=model_input_size,
                      outputs=nb_actions).to(self.device)
    self.target.load_state_dict(self.policy.state_dict())
    self.target.eval()
    self.hidden_size = hidden_size
    self.optimizer = torch.optim.AdamW(self.policy.parameters(), lr=self.learning_rate)
    self.use_PER = use_PER
    if use_PER:
        self.replay = Prioritized_Replay_Memory(replay_memory_size)
    else:
        self.replay = Replay_Memory(replay_memory_size)
    self.loss_function = torch.nn.MSELoss()
    self.use_ICM = use_ICM
    if use_ICM:
        self.icm = ICM(model_input_size, nb_actions)

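# The constructor above stores epsilon_start, epsilon_end, epsilon_anneal_over_steps
# and a step_no counter; a common way to derive the current exploration rate from
# them is a simple linear interpolation. Hypothetical helper, shown only as a sketch:
def current_epsilon(step_no, eps_start, eps_end, anneal_steps):
    fraction = min(step_no / anneal_steps, 1.0)  # clamp once annealing is finished
    return eps_start + fraction * (eps_end - eps_start)
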
def run_gym(params):
    if params.CnnDQN:
        env = make_atari(params.env)
        env = wrap_pytorch(wrap_deepmind(env))
        q_network = CnnDQN(env.observation_space.shape, env.action_space.n)
        target_q_network = deepcopy(q_network)
    else:
        env = make_gym_env(params.env)
        q_network = DQN(env.observation_space.shape, env.action_space.n)
        target_q_network = deepcopy(q_network)

    if USE_CUDA:
        q_network = q_network.cuda()
        target_q_network = target_q_network.cuda()

    agent = Agent(env, q_network, target_q_network)
    optimizer = optim.Adam(q_network.parameters(), lr=params.learning_rate)
    replay_buffer = ReplayBuffer(params.replay_size)

    losses, all_rewards = [], []
    episode_reward = 0
    state = env.reset()

    for ts in range(1, params.max_ts + 1):
        epsilon = get_epsilon(params.epsilon_start, params.epsilon_end,
                              params.epsilon_decay, ts)
        action = agent.act(state, epsilon)
        next_state, reward, done, _ = env.step(int(action.cpu()))
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0

        if len(replay_buffer) > params.start_train_ts:
            # Update the q-network & the target network
            loss = compute_td_loss(agent, params.batch_size, replay_buffer,
                                   optimizer, params.gamma)
            losses.append(loss.data)
            if ts % params.target_network_update_f == 0:
                hard_update(agent.q_network, agent.target_q_network)

        if ts % params.log_every == 0:
            out_str = "Timestep {}".format(ts)
            if len(all_rewards) > 0:
                out_str += ", Reward: {}".format(all_rewards[-1])
            if len(losses) > 0:
                out_str += ", TD Loss: {}".format(losses[-1])
            print(out_str)

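# run_gym() calls compute_td_loss(), which is not shown here. A typical one-step
# TD(0) update for this q_network/target_q_network pair looks roughly like the
# sketch below; the argument names mirror the call site, but the buffer's sample()
# signature and the tensor handling are assumptions, not the project's actual code.
import torch
import torch.nn.functional as F

def compute_td_loss(agent, batch_size, replay_buffer, optimizer, gamma):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)
    state = torch.as_tensor(state, dtype=torch.float32)
    next_state = torch.as_tensor(next_state, dtype=torch.float32)
    action = torch.as_tensor(action, dtype=torch.int64)
    reward = torch.as_tensor(reward, dtype=torch.float32)
    done = torch.as_tensor(done, dtype=torch.float32)

    # Q(s, a) from the online network, for the actions actually taken
    q_value = agent.q_network(state).gather(1, action.unsqueeze(1)).squeeze(1)
    # Bootstrapped target from the frozen target network
    with torch.no_grad():
        next_q_value = agent.target_q_network(next_state).max(1)[0]
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = F.mse_loss(q_value, expected_q_value)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss
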
def __init__(self, env, use_conv=True, learning_rate=3e-4, gamma=0.99, tau=0.01,
             buffer_size=10000):
    self.env = env
    self.learning_rate = learning_rate
    self.gamma = gamma
    self.tau = tau
    self.replay_buffer = BasicBuffer(max_size=buffer_size)
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.use_conv = use_conv
    if self.use_conv:
        self.model1 = ConvDQN(env.observation_space.shape, env.action_space.n).to(self.device)
        self.model2 = ConvDQN(env.observation_space.shape, env.action_space.n).to(self.device)
    else:
        self.model1 = DQN(env.observation_space.shape, len(env.action_space)).to(self.device)
        self.model2 = DQN(env.observation_space.shape, len(env.action_space)).to(self.device)

    self.optimizer1 = torch.optim.Adam(self.model1.parameters())
    self.optimizer2 = torch.optim.Adam(self.model2.parameters())

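# The twin networks and optimizers above are typically combined in a clipped
# double-Q style update: each network is regressed toward a target built from the
# minimum of the two next-state value estimates. Illustrative sketch only; the
# batch layout and update rule are assumptions about how model1/model2 are used.
import torch
import torch.nn.functional as F

def clipped_double_q_losses(agent, states, actions, rewards, next_states, dones):
    q1 = agent.model1(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    q2 = agent.model2(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        next_q = torch.min(agent.model1(next_states).max(1)[0],
                           agent.model2(next_states).max(1)[0])
        target = rewards + agent.gamma * next_q * (1 - dones)
    return F.mse_loss(q1, target), F.mse_loss(q2, target)
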
def __init__(self, player, nb_rows, nb_cols, timelimit, episode):
    self.EPSILON = EPS_END + (EPS_START - EPS_END) * (1 - (episode / DECAY_LEN))
    self.EPSILON = max(self.EPSILON, EPS_END)
    self.timelimit = timelimit
    self.nb_rows = nb_rows
    self.nb_cols = nb_cols
    rows = []
    for _ in range(nb_rows + 1):
        columns = []
        for _ in range(nb_cols + 1):
            columns.append({"v": 0, "h": 0})
        rows.append(columns)
    self.cells = rows
    self.len_states = nb_rows * (nb_cols + 1) + nb_cols * (nb_rows + 1)
    self.state = np.zeros(self.len_states)
    self.player = player
    self.score = [0, 0]
    self.reward = 0
    self.prev_state = None
    self.dqn = DQN(self.len_states, self.len_states)

class agent():
    def __init__(self, role, total_episode, epsilon, learning_rate, gamma, batch_size,
                 target_replace_iter, memory_capacity, n_actions, n_states):
        self.role = role
        self.model = DQN(total_episode, epsilon, learning_rate, gamma, batch_size,
                         target_replace_iter, memory_capacity, n_actions, n_states)

    def step(self, board: np.ndarray, episode):
        available = np.array([0 in board[:, col] for col in range(7)])
        action = self.model.get_action(board.flatten(), episode, available)
        location = 5 - (np.fliplr(board.T) == 0).argmax(axis=1)[action]
        while board[location, action] != 0:
            print('Occupied!! Try another move')
            available[action] = False
            action = self.model.get_action(board.flatten(), episode, available)
            location = 5 - (np.fliplr(board.T) == 0).argmax(axis=1)[action]
        board[location, action] = self.role
        return board, action

    def store(self, in_board, action, winner, board):
        s = in_board.flatten()
        a = action
        r = winner * self.role
        s_ = board.flatten()
        self.model.store_transition(s, a, r, s_)

    def random_action(self, board: np.ndarray):
        available = np.array([0 in board[:, col] for col in range(7)])
        action = np.random.choice(np.array(range(7))[available])
        location = 5 - (np.fliplr(board.T) == 0).argmax(axis=1)[action]
        while board[location, action] != 0:
            print('Occupied!! Try another move')
            available[action] = False
            action = np.random.choice(np.array(range(7))[available])  # resample a random legal column
            location = 5 - (np.fliplr(board.T) == 0).argmax(axis=1)[action]
        board[location, action] = self.role
        return board, action

def __init__(self):
    self.controller, self.target = DQN(), DQN()  # For RL
    self.vision = VAE()
    if USE_CUDA:
        self.controller.cuda()
        self.target.cuda()
        self.vision.cuda()
    # Init weights based on init function
    self.controller.apply(init_weights)
    self.vision.apply(init_weights)
    # Load model params into target
    self.target.load_state_dict(self.controller.state_dict())
    self.action_number = 0  # actions taken (to determine whether or not to update)
    # NOTE: DQN exp buffer should use embeddings generated by vision module
    # The vision module (aka the VAE) has memory consisting of game states
    self.exp_buffer = []  # exp buffer
    self.exp_number = 0  # size of exp buffer so far
    self.opt = torch.optim.Adam(self.controller.parameters(), lr=DQN_LEARNING_RATE)
    self.loss = nn.SmoothL1Loss()

def main(env_id, embedding_size):
    env = wrap_deepmind(make_atari(env_id), scale=True)
    embedding_model = DQN(embedding_size)
    agent = NECAgent(env, embedding_model)
    # subprocess.Popen(["tensorboard", "--logdir", "runs"])
    configure("runs/pong-run")
    for t in count():
        if t == 0:
            reward = agent.warmup()
        else:
            reward = agent.episode()
        print("Episode {}\nTotal Reward: {}".format(t, reward))
        log_value('score', reward, t)

def play_dqn(filename, n=10, seed=0):
    env = gym.make("CartPole-v0")
    env.seed(seed)
    env.reset()
    model = DQN(net_structure=(state_size, 64, 64, action_size),
                gamma=gamma,
                optim=optim.Adam,
                optim_param=[alpha],
                loss_function=nn.MSELoss(),
                tau=0.1,
                device=device)
    buffer = ReplayBuffer(memory_size, batch_size, device)
    learning_policy = EpsDecay(eps_start, eps_min, eps_decay, env.action_space.n)
    playing_policy = Greedy()
    agent = Agent(model=model,
                  buffer=buffer,
                  learn_every=4,
                  update_every=4,
                  policy_learning=learning_policy,
                  policy_playing=playing_policy)
    model.predict.load_state_dict(torch.load(filename))
    agent.playing()
    for i in range(n):
        state = env.reset()
        score = 0
        env.render()
        for j in range(99999999999):
            action = agent.act(state)
            env.render()
            state, reward, done, _ = env.step(action)
            score += reward
            if done:
                break
        print(score)
    env.close()

def __init__(self, name, state_size, action_size, use_double_dqn=False, use_dueling=False,
             seed=0, lr_decay=0.9999, use_prioritized_replay=False):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.name = name
    self.state_size = state_size
    self.action_size = action_size
    self.use_double_dqn = use_double_dqn
    self.use_dueling = use_dueling
    self.seed = random.seed(seed)
    self.use_prioritized_replay = use_prioritized_replay

    # Q-Network
    if use_dueling:
        self.qnetwork_local = DuelingDQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingDQN(state_size, action_size, seed).to(device)
    else:
        self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
    self.qnetwork_target.eval()

    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    self.lr_scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, lr_decay)

    # Replay memory
    if self.use_prioritized_replay:
        self.memory = PrioritizedReplayBuffer(BUFFER_SIZE, seed, alpha=0.2, beta=0.8,
                                              beta_scheduler=1.0)
    else:
        self.memory = ReplayBuffer(BUFFER_SIZE, seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0

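# Agents built around a qnetwork_local/qnetwork_target pair like the one above
# usually refresh the target with a Polyak (soft) update every UPDATE_EVERY steps.
# Minimal sketch, assuming a small interpolation constant tau defined alongside
# LR and BUFFER_SIZE; not necessarily this repository's exact helper.
def soft_update(local_model, target_model, tau):
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
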
class Agent:
    def __init__(self):
        self.model, self.target = DQN(), DQN()
        if USE_CUDA:
            self.model.cuda()
            self.target.cuda()
        self.exp_buffer = Memory()
        self.exp_number = 0  # size of exp buffer so far
        self.param_updates = 0  # track how many times params updated
        self.opt = torch.optim.RMSprop(self.model.parameters(), lr=LEARNING_RATE)
        self.loss = nn.SmoothL1Loss()

    # Make an action given a state
    def act(self, state, explore=True):
        if explore and np.random.rand() <= EPSILON:
            # Act randomly
            a = np.random.randint(NUM_ACTIONS)
        else:
            # Send state to model
            a_vec = self.model(state)
            a = int(torch.argmax(torch.squeeze(a_vec)))
        return a

    # clear the buffer
    def clear_exp_buffer(self):
        self.exp_buffer = Memory()
        self.exp_number = 0

    # Add experience to exp buffer
    def add_exp(self, exp):
        self.exp_buffer.add(exp)
        self.exp_number += 1

    # Replay gets batch and trains on it
    def replay(self, batch_size):
        q_loss = 0
        # If experience buffer isn't right size yet, don't do anything
        if self.exp_number < MIN_BUFFER_SIZE:
            return
        # Get batch from experience_buffer
        batch = self.exp_buffer.get_batch(batch_size)
        s, a, r, s_new, _ = zip(*batch)
        s_new = s_new[:-1]  # Remove last item (it is 'None')
        # First turn batch into something we can run through model
        s = torch.cat(s)
        a = torch.LongTensor(a).unsqueeze(1)
        r = torch.FloatTensor(r).unsqueeze(1)
        s_new = torch.cat(s_new)
        # print(a.shape, r.shape, s.shape, s_new.shape)
        if USE_CUDA:
            a = a.cuda()
            r = r.cuda()
        # Get q vals for s (what model outputted) from a
        # .gather gets us q value for specific action a
        pred_q_vals = self.model(s).gather(1, a)
        # Having chosen a in s,
        # what is the highest possible reward we can get from s_new?
        # We add q of performing a in s then add best q from next state
        # cat 0 to end for the terminal state
        s_new_q_vals = self.target(s_new).max(1)[0]
        zero = torch.zeros(1)  # single zero entry appended for the terminal state
        if USE_CUDA:
            zero = zero.cuda()
        s_new_q_vals = torch.cat((s_new_q_vals, zero))
        exp_q_vals = r + s_new_q_vals * GAMMA
        myloss = self.loss(pred_q_vals, exp_q_vals)
        self.opt.zero_grad()
        myloss.backward()
        self.opt.step()
        if WEIGHT_CLIPPING:
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)  # Gradient clipping avoids exploding gradients
        if self.param_updates % TARGET_UPDATE_INTERVAL == 0:
            self.target.load_state_dict(self.model.state_dict())
        self.param_updates += 1
        global EPSILON
        if EPSILON > EPSILON_MIN:
            EPSILON *= EPSILON_DECAY
        return myloss.item()

def init_dqn(args):
    """Initialises and returns the necessary objects for Deep Q-learning:
    Q-network, target network, replay buffer and optimizer.
    """
    logging.info("Initialising DQN with architecture {} and optimizer {}".format(
        args.dqn_archi, args.optimizer_agent))
    if args.dqn_archi == 'mlp':
        q_net = DQN(args.obs_shape, args.n_actions, args)
        q_target = DQN(args.obs_shape, args.n_actions, args)
    elif args.dqn_archi == 'cnn':
        q_net = CnnDQN(args.obs_shape, args.n_actions, args)
        q_target = CnnDQN(args.obs_shape, args.n_actions, args)

    if args.optimizer_agent == 'RMSProp':
        optimizer_agent = optim.RMSprop(q_net.parameters(), lr=args.lr_agent,
                                        weight_decay=args.lambda_agent)
    else:
        assert args.optimizer_agent == 'Adam'
        optimizer_agent = optim.Adam(q_net.parameters(), lr=args.lr_agent,
                                     weight_decay=args.lambda_agent)

    q_target.load_state_dict(q_net.state_dict())  # set params of q_target to be the same
    replay_buffer = ReplayBuffer(args.replay_buffer_size)

    if args.epsilon_annealing_scheme == 'linear':
        epsilon_schedule = LinearSchedule(
            schedule_timesteps=int(args.exploration_fraction * args.n_agent_steps),
            initial_p=args.epsilon_start,
            final_p=args.epsilon_stop)
    else:
        assert args.epsilon_annealing_scheme == 'exp'
        epsilon_schedule = ExpSchedule(decay_rate=args.epsilon_decay,
                                       final_p=args.epsilon_stop,
                                       initial_p=args.epsilon_start)

    return q_net, q_target, replay_buffer, optimizer_agent, epsilon_schedule

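# A hypothetical invocation of init_dqn(); in the project the `args` namespace
# comes from its CLI parser, so every field value below is a placeholder only.
from types import SimpleNamespace

args = SimpleNamespace(
    dqn_archi='mlp', optimizer_agent='Adam', obs_shape=(4,), n_actions=2,
    lr_agent=1e-3, lambda_agent=0.0, replay_buffer_size=100_000,
    epsilon_annealing_scheme='linear', exploration_fraction=0.1,
    n_agent_steps=1_000_000, epsilon_start=1.0, epsilon_stop=0.05,
    epsilon_decay=0.999)

q_net, q_target, replay_buffer, optimizer_agent, epsilon_schedule = init_dqn(args)
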
class Agent():
    def __init__(self, learn_rate, input_shape, num_actions, batch_size):
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.gamma = 0.99
        self.tau = 0.05
        self.has_target_net = False
        self.memories = []
        # self.epsilon = LinearSchedule(start=1.0, end=0.01, num_steps=2000)
        self.epsilon = LinearSchedule(start=1.0, end=0.1, num_steps=30)
        # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")
        self.net = DQN().to(self.device)
        if self.has_target_net:
            self.target_net = copy.deepcopy(self.net).to(self.device)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learn_rate)

    def update_target_net_params(self):
        for param, target_param in zip(self.net.parameters(), self.target_net.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def choose_action(self, observation, hidden_state):
        state = torch.tensor(observation).float().detach()
        state = state.to(self.device)
        q_values, hidden_state_ = self.net(state, hidden_state)
        action = torch.argmax(q_values).item()
        if random.random() <= self.epsilon.value():
            action = random.randint(0, self.num_actions - 1)
        return action, hidden_state_

    def fetch_batch(self):
        indices = np.random.choice(len(self.memories), self.batch_size, replace=False)
        indices = list(indices)
        for idx in indices:
            yield self.memories[idx]

    def store_trajectory(self, trajectory):
        self.memories.append(trajectory)

    def learn(self):
        if len(self.memories) < self.batch_size:
            return
        batch_losses = []
        for memory_idx, memory in enumerate(self.fetch_batch()):
            states, actions, rewards, dones = memory.fetch_on_device(self.device)
            self.net.train()
            episode_losses = []
            hidden_state = self.net.get_new_hidden_state().to(self.device)
            second_to_last_memory_index = len(memory.states) - 1
            for i in range(second_to_last_memory_index):
                state = states[i].detach()
                state_ = states[i + 1].detach()
                action = actions[i].detach()
                reward = rewards[i].detach()
                if i == second_to_last_memory_index - 1:
                    done = True
                else:
                    done = False
                qs, hidden_state_ = self.net(state, hidden_state)
                chosen_q = qs[action]
                if self.has_target_net:
                    qs_, hidden_state_3 = self.target_net(state_, hidden_state_)
                    action_qs_, hidden_state_3 = self.net(state_, hidden_state_)
                    action_ = torch.argmax(action_qs_)
                    chosen_q_ = qs_[action_]
                else:
                    action_qs_, hidden_state_3 = self.net(state_, hidden_state_)
                    chosen_q_ = torch.max(action_qs_)
                if done:
                    chosen_q_ = torch.tensor(0.0, dtype=torch.float32).to(self.device)
                q_target = reward + self.gamma * chosen_q_
                loss = (q_target - chosen_q) ** 2
                episode_losses.append(loss)
                hidden_state = hidden_state_
            episode_loss = sum(episode_losses) / len(episode_losses)
            batch_losses.append(episode_loss)
        batch_loss = sum(batch_losses) / len(batch_losses)
        self.optimizer.zero_grad()
        batch_loss.backward()
        self.optimizer.step()
        for i in range(self.batch_size):
            self.epsilon.step()
        if self.has_target_net:
            self.update_target_net_params()

def initialize(game, model_name, warm_start):
    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    capacity = int(1e4)

    # Cold start
    if not warm_start:
        # Initialize model
        model = DQN(in_channels=num_frames, num_actions=num_actions)
        optimizer = optim.RMSprop(model.parameters(), lr=1.0e-4, weight_decay=0.01)
        # Initialize replay memory
        memory_buffer = ReplayMemory(capacity)
        # Initialize statistics
        running_reward = None
        running_rewards = []

    # Warm start
    if warm_start:
        data_file = 'results/{}_{}.p'.format(game, model_name)
        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
            running_reward = running_rewards[-1]
            prior_eps = len(running_rewards)
            model_file = 'saved_models/{}_{}_ep_{}.p'.format(game, model_name, prior_eps)
            with open(model_file, 'rb') as f:
                saved_model = pickle.load(f)
            model, optimizer, memory_buffer = saved_model
        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = DQN(in_channels=num_frames, num_actions=num_actions)
            optimizer = optim.RMSprop(model.parameters(), lr=1.0e-4, weight_decay=0.01)
            # Initialize replay memory
            memory_buffer = ReplayMemory(capacity)
            running_reward = None
            running_rewards = []

    cuda = torch.cuda.is_available()
    if cuda:
        model = model.cuda()
    criterion = torch.nn.MSELoss()
    return env, model, optimizer, criterion, memory_buffer, cuda, running_reward, running_rewards

class Agent():
    def __init__(self, learn_rate, state_shape, num_actions, action_shape, batch_size, slice_size):
        self.gamma = 0.999
        self.tau = 0.01
        self.clip_grad_norm = 0.1
        self.has_target_net = True
        self.state_shape = state_shape
        self.num_actions = num_actions  # this is how many actions there are to choose from
        self.action_shape = action_shape  # this is how many actions the env accepts at each step
        self.buffer_size = 1_000_000
        self.batch_size = batch_size  # *times slice_size, because recurrency/rollouts
        self.slice_size = slice_size
        self.slice_replay_buffer = MemorySliceReplayBuffer(
            size=self.buffer_size,
            slice_size=self.slice_size,
            state_shape=self.state_shape,
            action_shape=self.action_shape)
        self.epsilon = LinearSchedule(start=1.0, end=0.01, num_steps=300)
        # self.epsilon = LinearSchedule(start=1.0, end=0.1, num_steps=30)
        # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")
        self.net = DQN(state_shape, num_actions).to(self.device)
        if self.has_target_net:
            self.target_net = copy.deepcopy(self.net).to(self.device)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learn_rate)

    def update_target_net_params(self):
        for param, target_param in zip(self.net.parameters(), self.target_net.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def choose_action(self, observation, hidden_state):
        state = torch.tensor(observation).float().unsqueeze(0)
        state = state.detach().to(self.device)
        q_values, hidden_state_ = self.net(state, hidden_state)
        action = torch.argmax(q_values[0]).item()
        if random.random() <= self.epsilon.value():
            action = random.randint(0, self.num_actions - 1)  # sample a valid action index
        return action, hidden_state_

    def learn(self, stats):
        if self.slice_replay_buffer.count < self.batch_size:
            return
        self.net.train()
        states_slices, actions_slices, rewards_slices, next_states_slices, dones_slices = \
            self.slice_replay_buffer.sample(self.batch_size, self.device)
        batch_losses = []
        hidden_states = self.net.get_batch_hidden_state(self.batch_size).to(self.device)
        for slice_index in range(self.slice_size):
            states = states_slices[:, slice_index]
            actions = actions_slices[:, slice_index]
            rewards = rewards_slices[:, slice_index]
            states_ = next_states_slices[:, slice_index]
            dones = dones_slices[:, slice_index]
            batch_indices = np.arange(self.batch_size, dtype=np.int64)
            qs, hidden_states_ = self.net(states, hidden_states)
            chosen_q = qs[batch_indices, actions.T[0]]
            if self.has_target_net:
                qs_, hidden_state_3 = self.target_net(states_, hidden_states_)
                action_qs_, hidden_state_3 = self.net(states_, hidden_states_)
                actions_ = torch.argmax(action_qs_, dim=1)
                chosen_q_ = qs_[batch_indices, actions_]
            else:
                action_qs_, hidden_state_3 = self.net(states_, hidden_states_)
                chosen_q_ = torch.max(action_qs_, dim=1)[0]
            rewards = rewards.T[0]
            q_target = rewards + self.gamma * chosen_q_
            loss = torch.mean((q_target - chosen_q) ** 2)
            batch_losses.append(loss)  # accumulate the squared TD error for this step
            hidden_states = hidden_states_
            # if an episode ends mid slice then zero the hidden_states
            # this could be a problem if backprop stops here
            hidden_states[dones.T[0]] = 0.0
        batch_losses = torch.stack(batch_losses)
        batch_loss = torch.mean(batch_losses)
        stats.last_loss = batch_loss.item()
        self.optimizer.zero_grad()
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.clip_grad_norm)
        self.optimizer.step()
        self.epsilon.step()
        if self.has_target_net:
            self.update_target_net_params()

from collections import deque
import random

import torch
from torch import optim
from tqdm import tqdm

from env import Env
from hyperparams import ACTION_DISCRETISATION, OFF_POLICY_BATCH_SIZE as BATCH_SIZE, DISCOUNT, EPSILON, HIDDEN_SIZE, LEARNING_RATE, MAX_STEPS, REPLAY_SIZE, TARGET_UPDATE_INTERVAL, TEST_INTERVAL, UPDATE_INTERVAL, UPDATE_START
from models import DQN, create_target_network
from utils import plot

env = Env()
agent = DQN(HIDDEN_SIZE, ACTION_DISCRETISATION)
target_agent = create_target_network(agent)
optimiser = optim.Adam(agent.parameters(), lr=LEARNING_RATE)
D = deque(maxlen=REPLAY_SIZE)


def convert_discrete_to_continuous_action(action):
    return action.to(dtype=torch.float32) - ACTION_DISCRETISATION // 2


def test(agent):
    with torch.no_grad():
        env = Env()
        state, done, total_reward = env.reset(), False, 0
        while not done:
            # Use purely exploitative policy at test time
            action = agent(state).argmax(dim=1, keepdim=True)
            state, reward, done = env.step(convert_discrete_to_continuous_action(action))