def __init__(self, Q, Q_target, num_actions, discount_factor=0.99, batch_size=64, epsilon=0.95, epsilon_min=0.05, epsilon_decay=0.995, exploration_type='e-annealing', learning_type='dq', replay_buffer_size=1e5): self.Q = Q self.Q_target = Q_target self.epsilon = epsilon self.epsilon_min = epsilon_min self.epsilon_decay = epsilon_decay self.exploration_type = exploration_type self.learning_type = learning_type self.num_actions = num_actions self.batch_size = batch_size self.discount_factor = discount_factor # initialize replay buffer self.replay_buffer = ReplayBuffer(replay_buffer_size) # start tensorflow session self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver()
def __init__(self, name, Q_current, Q_target, num_actions, discount_factor, batch_size, epsilon, epsilon_decay, boltzmann, double_q, buffer_capacity, random_probs=None): """ Q-Learning agent for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: Q: Action-Value function estimator (Neural Network) Q_target: Slowly updated target network to calculate the targets. num_actions: Number of actions of the environment. discount_factor: gamma, discount factor of future rewards. batch_size: Number of samples per batch. epsilon: Chance to sample a random action. Float betwen 0 and 1. """ # save hyperparameters in folder self.name = name # probably useless self.Q_current = Q_current self.Q_target = Q_target self.epsilon = epsilon self.epsilon_decay = epsilon_decay self.boltzmann = boltzmann self.num_actions = num_actions self.batch_size = batch_size self.discount_factor = discount_factor self.buffer_capacity = buffer_capacity self.double_q = double_q self.random_probs = random_probs # define replay buffer self.replay_buffer = ReplayBuffer(capacity=buffer_capacity) # Start tensorflow session self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver()
def __init__(self, Q, Q_target, num_actions, discount_factor=0.99, batch_size=64, epsilon=0.05, act_probabilities=None, double_q=False, buffer_capacity=100000, prefill_bs_percentage=5): """ Q-Learning agent for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: Q: Action-Value function estimator (Neural Network) Q_target: Slowly updated target network to calculate the targets. num_actions: Number of actions of the environment. discount_factor: gamma, discount factor of future rewards. batch_size: Number of samples per batch. epsilon: Chance to sample a random action. Float betwen 0 and 1. """ self.Q = Q self.Q_target = Q_target self.epsilon = epsilon self.num_actions = num_actions self.batch_size = batch_size self.discount_factor = discount_factor # define replay buffer self.replay_buffer = ReplayBuffer(capacity=buffer_capacity, min_fill=prefill_bs_percentage * batch_size) # Start tensorflow session self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver() # <JAB> if act_probabilities is None: self.act_probabilities = np.ones(num_actions) / num_actions else: self.act_probabilities = act_probabilities self.double_dqn = double_q
def __init__(self, Q, Q_target, num_actions, game="cartpole", explore_type="epsilon_greedy", epsilon_decay=1, epsilon_min=0.05, tau=1, method="CQL", discount_factor=0.99, batch_size=64, epsilon=0.05): """ Q-Learning agent for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: Q: Action-Value function estimator (Neural Network) Q_target: Slowly updated target network to calculate the targets. num_actions: Number of actions of the environment. discount_factor: gamma, discount factor of future rewards. batch_size: Number of samples per batch. epsilon: Chance to sample a random action. Float betwen 0 and 1. """ self.Q = Q self.Q_target = Q_target # now support cartpole or carracing two games self.game = game # self.state_dim = Q. self.epsilon = epsilon self.num_actions = num_actions self.batch_size = batch_size self.discount_factor = discount_factor # now support CQL(classical Q) or DQL(Double Q) self.method = method self.explore_type = explore_type # for epsilon annealing self.epsilon_decay = epsilon_decay self.epsilon_min = epsilon_min # for boltzmann exploration self.tau = tau # define replay buffer self.replay_buffer = ReplayBuffer() # start tensorflow session self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver()
def __init__(self, Q, Q_target, num_actions, discount_factor=0.99, batch_size=64, epsilon=0.05): """ Q-Learning agent for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: Q: Action-Value function estimator (Neural Network) Q_target: Slowly updated target network to calculate the targets. num_actions: Number of actions of the environment. discount_factor: gamma, discount factor of future rewards. batch_size: Number of samples per batch. epsilon: Chance to sample a random action. Float betwen 0 and 1. """ self.Q = Q self.Q_target = Q_target self.epsilon = epsilon self.num_actions = num_actions self.batch_size = batch_size self.discount_factor = discount_factor # define replay buffer self.replay_buffer = ReplayBuffer() # Start tensorflow session self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver()
def __init__(self, Q, Q_target, num_actions, gamma=0.95, batch_size=64, epsilon=0.1, tau=0.01, lr=1e-4, history_length=0): """ Q-Learning agent for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: Q: Action-Value function estimator (Neural Network) Q_target: Slowly updated target network to calculate the targets. num_actions: Number of actions of the environment. gamma: discount factor of future rewards. batch_size: Number of samples per batch. tao: indicates the speed of adjustment of the slowly updated target network. epsilon: Chance to sample a random action. Float betwen 0 and 1. lr: learning rate of the optimizer """ # setup networks self.Q = Q.cuda() self.Q_target = Q_target.cuda() self.Q_target.load_state_dict(self.Q.state_dict()) # define replay buffer self.replay_buffer = ReplayBuffer(history_length) # parameters self.batch_size = batch_size self.gamma = gamma self.tau = tau self.epsilon = epsilon self.loss_function = torch.nn.MSELoss() self.optimizer = optim.Adam(self.Q.parameters(), lr=lr) self.num_actions = num_actions
def __init__(self, id, env, state_size, action_size, n_episodes, lr, gamma, global_network, target_network, q, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995): super(DynaQAgent, self).__init__() self.id = id self.env = env self.state_size = state_size self.action_size = action_size self.n_episodes = n_episodes self.gamma = gamma self.q = q self.local_memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE) self.t_step = 0 self.max_t = max_t self.eps_start = eps_start self.eps_end = eps_end self.eps_decay = eps_decay self.experience = namedtuple( "Experience", field_names=["state", "action", "reward", "next_state", "done"]) self.global_network = global_network self.target_network = target_network self.optimizer = optim.SGD(self.global_network.parameters(), lr=lr, momentum=.5) self.scores_window = deque(maxlen=100) # last 100 scores
def __init__(self, id, env, do_render, state_size, action_size, n_episodes, lr, gamma, update_every, global_network, target_network, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995): super(DQNAgent, self).__init__() self.id = id self.env = env self.do_render = do_render self.state_size = state_size self.action_size = action_size self.n_episodes = n_episodes self.gamma = gamma self.update_every = update_every self.local_memory = ReplayBuffer(env.action_space.n, BUFFER_SIZE, BATCH_SIZE) self.global_network = global_network self.qnetwork_target = target_network self.optimizer = optim.SGD(self.global_network.parameters(), lr=lr, momentum=.5) self.t_step = 0 self.max_t = max_t self.eps_start = eps_start self.eps_end = eps_end self.eps_decay = eps_decay
def __init__(self, observation_space, action_space): """ Changes the frame to be same input as the PyTorchFrame wrapper frames Then creates a replay bugffer and a vanilla DQN which the weights are loaded into """ shape = observation_space.shape self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(shape[-1], shape[0], shape[1]), dtype=np.uint8) self.action_space = action_space self.memory = ReplayBuffer(int(5e3)) self.policy_network = DQN(self.observation_space, self.action_space) self.policy_network.load_state_dict( torch.load("checkpoints/40000.pth", map_location=torch.device(device))) self.policy_network.eval()
class DQNAgent: def __init__(self, Q, Q_target, num_actions, game, exploration, discount_factor=0.99, batch_size=64, epsilon=0.2, epsilon_decay=0.99, epsilon_min=0.03): """ Q-Learning agent for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: Q: Action-Value function estimator (Neural Network) Q_target: Slowly updated target network to calculate the targets. num_actions: Number of actions of the environment. discount_factor: gamma, discount factor of future rewards. batch_size: Number of samples per batch. epsilon: Chance to sample a random action. Float betwen 0 and 1. """ self.Q = Q self.Q_target = Q_target self.epsilon = epsilon self.epsilon_decay = epsilon_decay self.epsilon_min = epsilon_min self.num_actions = num_actions self.batch_size = batch_size self.discount_factor = discount_factor self.exploration = exploration self.game = game # define replay buffer self.replay_buffer = ReplayBuffer() # Start tensorflow session self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver() def train(self, state, action, next_state, reward, done): """ This method stores a transition to the replay buffer and updates the Q networks. """ # TODO: # 1. add current transition to replay buffer # 2. sample next batch and perform batch update: # 2.1 compute td targets: # td_target = reward + discount * max_a Q_target(next_state_batch, a) # 2.2 update the Q network # self.Q.update(...) # 2.3 call soft update for target network # self.Q_target.update(...) self.replay_buffer.add_transition(state, action, next_state, reward, done) batch_state, batch_action, batch_next_state, batch_rewards, batch_done = self.replay_buffer.next_batch( self.batch_size) td_target = batch_rewards #td_target += self.discount_factor * np.amax(self.Q_target.predict(self.sess, batch_next_state)) #use this or think of something better best_action = np.amax( self.Q.predict(self.sess, batch_next_state)[np.logical_not(batch_done)], 1) td_target[np.logical_not( batch_done)] += self.discount_factor * self.Q_target.predict( self.sess, batch_next_state)[np.logical_not(batch_done), best_action] self.Q.update(self.sess, batch_state, batch_action, td_target) self.Q_target.update(self.sess) def act(self, state, deterministic): """ This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action) Args: state: current state input deterministic: if True, the agent should execute the argmax action (False in training, True in evaluation) Returns: action id """ r = np.random.uniform() if deterministic: action_id = np.argmax(self.Q.predict(self.sess, [state])) else: if self.exploration == "greedy": if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay r = np.random.uniform() if r > self.epsilon: # TODO: take greedy action (argmax) action_id = np.argmax(self.Q.predict(self.sess, [state])) else: # TODO: sample random action # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work. # You can sample the agents actions with different probabilities (need to sum up to 1) so that the agent will prefer to accelerate or going straight. # To see how the agent explores, turn the rendering in the training on and look what the agent is doing. # action_id = ... 
if self.game == "cartpole": action_id = np.random.randint( self.num_actions) #define number of actions #else if self.game == "CarRacing" : #action_id = .... else: print('Please enter a valid game.') # if exploration == "boltzmann": # else: return action_id def load(self, file_name): self.saver.restore(self.sess, file_name)
class DQNAgent: def __init__(self, Q, Q_target, num_actions, discount_factor=0.99, batch_size=64, epsilon=0.05,epsilon_decay=1, epsilon_min=0.05,tau=1, game='cartpole',exploration="epsilon_greedy", history_length=0) :#, load_data=False): """ Q-Learning agent for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: Q: Action-Value function estimator (Neural Network) Q_target: Slowly updated target network to calculate the targets. num_actions: Number of actions of the environment. discount_factor: gamma, discount factor of future rewards. batch_size: Number of samples per batch. epsilon: Chance to sample a random action. Float betwen 0 and 1. """ self.Q = Q self.Q_target = Q_target self.game = game self.epsilon = epsilon self.epsilon_decay = epsilon_decay self.tau = tau self.epsilon_min = epsilon_min self.num_actions = num_actions self.batch_size = batch_size self.discount_factor = discount_factor self.exploration = exploration # define replay buffer self.replay_buffer = ReplayBuffer() # Start tensorflow session self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver() def train(self, state, action, next_state, reward, terminal): """ This method stores a transition to the replay buffer and updates the Q networks. """ # TODO: # 1. add current transition to replay buffer # 2. sample next batch and perform batch update: # 2.1 compute td targets: # td_target = reward + discount * max_a Q_target(next_state_batch, a) # 2.2 update the Q network # self.Q.update(...) # 2.3 call soft update for target network # self.Q_target.update(...) ''' self.replay_buffer.add_transition(state, action, next_state, reward, terminal) states, actions, next_states, rewards, dones = self.replay_buffer.next_batch (self.batch_size) target_f = np.zeros((self.batch_size)) for i in range(self.batch_size): if dones[i]: target_f[i] = rewards[i] else: target_f[i] = rewards[i] + self.discount_factor * np.max(self.Q_target.predict(self.sess, [next_states[i]]), 1) loss = self.Q.update(self.sess, states, actions, target_f) self.Q_target.update(self.sess) ''' self.replay_buffer.add_transition(state, action, next_state, reward, terminal) batch_state, batch_action, batch_next_state, batch_rewards, batch_done = self.replay_buffer.next_batch(self.batch_size) td_target = batch_rewards best_action = np.argmax(self.Q.predict(self.sess, batch_next_state)[np.logical_not(batch_done)], 1) td_target[np.logical_not(batch_done)] += self.discount_factor * self.Q_target.predict(self.sess, batch_next_state)[np.logical_not(batch_done), best_action] loss = self.Q.update(self.sess, batch_state, batch_action, td_target) self.Q_target.update(self.sess) return loss def act(self, state, deterministic): """ This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action) Args: state: current state input deterministic: if True, the agent should execute the argmax action (False in training, True in evaluation) Returns: action id """ ''' r = np.random.uniform() if deterministic or r > self.epsilon: # TODO: take greedy action (argmax) # action_id = ... action_id = np.argmax(self.Q.predict(self.sess, [state])) else: if self.game == 'cartpole': action_id = random.randrange(self.num_actions) elif self.game == 'carracing': # TODO: sample random action # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work. 
# You can sample the agents actions with different probabilities (need to sum up to 1) so that the agent will prefer to accelerate or going straight. # To see how the agent explores, turn the rendering in the training on and look what the agent is doing. # action_id = ... probabilities = [0.1, 0.2, 0.2, 0.45, 0.05] action_id = np.random.choice (self.num_actions, p=probabilities) ''' if deterministic: action_id = np.argmax(self.Q.predict(self.sess, [state])) else: if self.exploration == "greedy": if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay r = np.random.uniform() if r > self.epsilon: # TODO: take greedy action (argmax) action_id = np.argmax(self.Q.predict(self.sess, [state])) else: # TODO: sample random action # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work. # You can sample the agents actions with different probabilities (need to sum up to 1) so that the agent will prefer to accelerate or going straight. # To see how the agent explores, turn the rendering inthe training on and look what the agent is doing. if self.game == "cartpole" : action_id = np.random.randint(self.num_actions) elif self.game == "carracing": probabilities = [0.15, 0.15, 0.15, 0.3, 0.05, 0.1, 0.1] action_id = np.random.choice (self.num_actions, p=probabilities) else: print("Invalid game") elif self.exploration == "boltzmann": action_value = self.Q.predict(self.sess, [state])[0] prob = self.softmax(action_value/self.tau) action_id = np.random.choice(self.num_actions, p=prob) else: print("Invalid Exploration Type") return action_id def softmax(self, input): """ Safe Softmax function to avoid overflow Args: input: input vector Returns: prob: softmax of input """ input_max = np.max(input) e = np.exp(input-input_max) prob = e / np.sum(e) return prob def load(self, file_name): self.saver.restore(self.sess, file_name) def check_early_stop(self, reward, totalreward): return self.Q_target.check_early_stop (reward, totalreward)
def main(): config = {'starting-floor': 0, 'total-floors': 9, 'dense-reward': 1, 'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0, 'agent-perspective': 1, 'allowed-rooms': 0, 'allowed-modules': 0, 'allowed-floors': 0, } env = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=1, retro=True, realtime_mode=False, config=config) print(env.observation_space) print(env.action_space) hyper_params = { "seed": 6, # which seed to use "replay-buffer-size": int(5e3), # replay buffer size "learning-rate": 1e-4, # learning rate for Adam optimizer "discount-factor": 0.99, # discount factor "num-steps": int(1e6), # total number of steps to run the environment for "batch-size": 32, # number of transitions to optimize at the same time "learning-starts": 5000, # number of steps before learning starts "learning-freq": 1, # number of iterations between every optimization step "use-double-dqn": True, # use double deep Q-learning "target-update-freq": 1000, # number of iterations between every target network update "eps-start": 1.0, # e-greedy start threshold "eps-end": 0.01, # e-greedy end threshold "eps-fraction": 0.05, # fraction of num-steps "print-freq": 10 } np.random.seed(hyper_params["seed"]) random.seed(hyper_params["seed"]) #assert "NoFrameskip" in hyper_params["env"], "Require environment with no frameskip" #env = gym.make(hyper_params["env"]) env.seed(hyper_params["seed"]) #env = NoopResetEnv(env, noop_max=30) #env = MaxAndSkipEnv(env, skip=4) #env = EpisodicLifeEnv(env) #env = FireResetEnv(env) # env = WarpFrame(env) env = PyTorchFrame(env) # env = ClipRewardEnv(env) # env = FrameStack(env, 4) replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"]) agent = DQNAgent( env.observation_space, env.action_space, replay_buffer, use_double_dqn=hyper_params["use-double-dqn"], lr=hyper_params["learning-rate"], batch_size=hyper_params["batch-size"], gamma=hyper_params["discount-factor"] ) model_num = 500 agent.policy_network.load_state_dict(torch.load('./Models/' + str(model_num) + '_policy.pt',map_location=torch.device(device))) eps_timesteps = hyper_params["eps-fraction"] * float(hyper_params["num-steps"]) episode_rewards = [0.0] ep_nums = model_num state = env.reset() for t in range(hyper_params["num-steps"]): fraction = min(1.0, float(t) / eps_timesteps) eps_threshold = hyper_params["eps-start"] + fraction * (hyper_params["eps-end"] - hyper_params["eps-start"]) sample = random.random() # TODO # select random action if sample is less equal than eps_threshold # take step in env # add state, action, reward, next_state, float(done) to reply memory - cast done to float # add reward to episode_reward if sample > eps_threshold: action = agent.act(np.array(state)) else: action = env.action_space.sample() next_state, reward, done, _ = env.step(action) agent.memory.add(state, action, reward, next_state, float(done)) state = next_state episode_rewards[-1] += reward if done: state = env.reset() episode_rewards.append(0.0) ep_nums += 1 if ep_nums % 50 == 0: agent.save_models(ep_nums) plot(episode_rewards,ep_nums) if t > hyper_params["learning-starts"] and t % hyper_params["learning-freq"] == 0: agent.optimise_td_loss() if t > hyper_params["learning-starts"] and t % hyper_params["target-update-freq"] == 0: agent.update_target_network() num_episodes = len(episode_rewards) if done and hyper_params["print-freq"] is not None and len(episode_rewards) % hyper_params[ "print-freq"] == 0: mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) 
print("********************************************************") print("steps: {}".format(t)) print("episodes: {}".format(num_episodes)) print("mean 100 episode reward: {}".format(mean_100ep_reward)) print("% time spent exploring: {}".format(int(100 * eps_threshold))) print("********************************************************") #if done and ep_nums % 10 == 0: # animate(env,agent,"anim/progress_"+str(ep_nums)) # state = env.reset() animate(env,agent,"anim/final") env.close()
np.random.seed(args.seed) random.seed(args.seed) assert "NoFrameskip" in args.env, "Require environment with no frameskip" env = gym.make(args.env) env.seed(args.seed) env = NoopResetEnv(env, noop_max=30) env = MaxAndSkipEnv(env, skip=4) env = EpisodicLifeEnv(env) env = FireResetEnv(env) env = WarpFrame(env) env = PyTorchFrame(env) env = ClipRewardEnv(env) env = FrameStack(env, 4) replay_buffer = ReplayBuffer(args.replay_buffer_size) agent = DQNAgent( env.observation_space, env.action_space, replay_buffer, use_double_dqn=args.use_double_dqn, lr=args.lr, batch_size=args.batch_size, gamma=args.gamma ) eps_timesteps = args.eps_fraction * float(args.num_steps) episode_rewards = [0.0] loss = [0.0]
class DQNAgent: def __init__(self, Q, Q_target, num_actions, discount_factor=0.995, batch_size=64, epsilon=0.05): """ Q-Learning agent for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: Q: Action-Value function estimator (Neural Network) Q_target: Slowly updated target network to calculate the targets. num_actions: Number of actions of the environment. discount_factor: gamma, discount factor of future rewards. batch_size: Number of samples per batch. epsilon: Chance to sample a random action. Float betwen 0 and 1. """ self.Q = Q self.Q_target = Q_target self.epsilon = epsilon self.epsilon_min = 0.1 self.epsilon_decay = 0.99 self.num_actions = num_actions self.batch_size = batch_size self.discount_factor = discount_factor self.neg_reward_counter = 0 self.max_neg_rewards = 100 # define replay buffer self.replay_buffer = ReplayBuffer() # Start tensorflow session self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver() def train(self, state, action, next_state, reward, terminal): """ This method stores a transition to the replay buffer and updates the Q networks. """ # TODO: # 1. add current transition to replay buffer self.replay_buffer.add_transition(state, action, next_state, reward, terminal) # 2. sample next batch and perform batch update: #self.gas_actions = np.array([a == 3 for a in self.replay_buffer._data.actions]) batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch( self.batch_size) td_target = batch_rewards td_target[np.logical_not( batch_dones)] += self.discount_factor * np.amax( self.Q_target.predict(self.sess, batch_next_states), 1)[np.logical_not(batch_dones)] #print(batch_actions) loss = self.Q.update(self.sess, batch_states, batch_actions, td_target) self.Q_target.update(self.sess) #if self.epsilon > self.epsilon_min: # self.epsilon *= self.epsilon_decay #print(self.epsilon) def act(self, state, deterministic): """ This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action) Args: state: current state input deterministic: if True, the agent should execute the argmax action (False in training, True in evaluation) Returns: action id """ r = np.random.uniform() if deterministic or r > self.epsilon: act_values = self.Q.predict(self.sess, [state]) action_id = np.argmax(self.Q.predict(self.sess, [state])) #print("I PREDICTED") #print("action_id_predicted: ", action_id) return action_id else: action_id = np.random.choice( [0, 1, 2, 3, 4], p=[0.3, 0.1, 0.1, 0.49, 0.01]) #straight, left, right, accelerate, brake # TODO: sample random action # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work. # You can sample the agents actions with different probabilities (need to sum up to 1) so that the agent will prefer to accelerate or going straight. # To see how the agent explores, turn the rendering in the training on and look what the agent is doing. # print("action_id: ", action_id) #print("action_id_random: ", action_id) return action_id def load(self, file_name): self.saver.restore(self.sess, file_name)
env = NoopResetEnv(env, noop_max=30) env = MaxAndSkipEnv(env, skip=4) env = EpisodicLifeEnv(env) env = FireResetEnv(env) env = WarpFrame(env) env = PyTorchFrame(env) env = ClipRewardEnv(env) env = FrameStack(env, 4) env = gym.wrappers.Monitor( env, './video/', video_callable=lambda episode_id: episode_id % 50 == 0, force=True) replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"]) agent = DQNAgent( env.observation_space, env.action_space, replay_buffer, use_double_dqn=hyper_params["use-double-dqn"], lr=hyper_params['learning-rate'], batch_size=hyper_params['batch-size'], gamma=hyper_params['discount-factor'], device=torch.device("cuda" if torch.cuda.is_available() else "cpu"), dqn_type=hyper_params["dqn_type"]) if (args.load_checkpoint_file): print(f"Loading a policy - { args.load_checkpoint_file } ") agent.policy_network.load_state_dict(
class DQNAgent: def __init__(self, Q, Q_target, num_actions, game="cartpole", explore_type="epsilon_greedy", epsilon_decay=1, epsilon_min=0.05, tau=1, method="CQL", discount_factor=0.99, batch_size=64, epsilon=0.05): """ Q-Learning agent for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: Q: Action-Value function estimator (Neural Network) Q_target: Slowly updated target network to calculate the targets. num_actions: Number of actions of the environment. discount_factor: gamma, discount factor of future rewards. batch_size: Number of samples per batch. epsilon: Chance to sample a random action. Float betwen 0 and 1. """ self.Q = Q self.Q_target = Q_target # now support cartpole or carracing two games self.game = game # self.state_dim = Q. self.epsilon = epsilon self.num_actions = num_actions self.batch_size = batch_size self.discount_factor = discount_factor # now support CQL(classical Q) or DQL(Double Q) self.method = method self.explore_type = explore_type # for epsilon annealing self.epsilon_decay = epsilon_decay self.epsilon_min = epsilon_min # for boltzmann exploration self.tau = tau # define replay buffer self.replay_buffer = ReplayBuffer() # start tensorflow session self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver() def train(self, state, action, next_state, reward, terminal): """ This method stores a transition to the replay buffer and updates the Q networks. """ # TODO: # 1. add current transition to replay buffer # 2. sample next batch and perform batch update: # 2.1 compute td targets: # td_target = reward + discount * argmax_a Q_target(next_state_batch, a) # 2.2 update the Q network # self.Q.update(...) # 2.3 call soft update for target network # self.Q_target.update(...) 
self.replay_buffer.add_transition(state, action, next_state, reward, terminal) batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch( self.batch_size) td_target = batch_rewards if self.method == "CQL": td_target[np.logical_not( batch_dones)] += self.discount_factor * np.max( self.Q_target.predict(self.sess, batch_next_states), 1)[np.logical_not(batch_dones)] self.Q.update(self.sess, batch_states, batch_actions, td_target) self.Q_target.update(self.sess) elif self.method == "DQL": best_action = np.argmax( self.Q.predict(self.sess, batch_next_states)[np.logical_not(batch_dones)], 1) td_target[np.logical_not( batch_dones)] += self.discount_factor * self.Q_target.predict( self.sess, batch_next_states)[np.logical_not(batch_dones), best_action] self.Q.update(self.sess, batch_states, batch_actions, td_target) self.Q_target.update(self.sess) def act(self, state, deterministic): """ This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action) Args: state: current state input deterministic: if True, the agent should execute the argmax action (False in training, True in evaluation) Returns: action id """ if deterministic: action_id = np.argmax(self.Q.predict(self.sess, [state])) else: if self.explore_type == "epsilon_greedy": if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay r = np.random.uniform() if r > self.epsilon: # TODO: take greedy action (argmax) action_id = np.argmax(self.Q.predict(self.sess, [state])) else: # TODO: sample random action # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work. # You can sample the agents actions with different probabilities (need to sum up to 1) so that the agent will prefer to accelerate or going straight. # To see how the agent explores, turn the rendering inthe training on and look what the agent is doing. if self.game == "cartpole" or self.game == "mountaincar": action_id = np.random.randint(self.num_actions) elif self.game == "carracing": # action_probability = np.array([1, 2, 2, 10, 1, 1, 1]) action_probability = np.array([2, 5, 5, 10, 1]) action_probability = action_probability / np.sum( action_probability) action_id = np.random.choice(self.num_actions, p=action_probability) else: print("Invalid game") elif self.explore_type == "boltzmann": action_value = self.Q.predict(self.sess, [state])[0] prob = self.softmax(action_value / self.tau) action_id = np.random.choice(self.num_actions, p=prob) else: print("Invalid Exploration Type") return action_id def softmax(self, input): """ Safe Softmax function to avoid overflow Args: input: input vector Returns: prob: softmax of input """ input_max = np.max(input) e = np.exp(input - input_max) prob = e / np.sum(e) return prob def load(self, file_name): self.saver.restore(self.sess, file_name)
class DQNAgent(mp.Process): def __init__(self, id, env, do_render, state_size, action_size, n_episodes, lr, gamma, update_every, global_network, target_network, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995): super(DQNAgent, self).__init__() self.id = id self.env = env self.do_render = do_render self.state_size = state_size self.action_size = action_size self.n_episodes = n_episodes self.gamma = gamma self.update_every = update_every self.local_memory = ReplayBuffer(env.action_space.n, BUFFER_SIZE, BATCH_SIZE) self.global_network = global_network self.qnetwork_target = target_network self.optimizer = optim.SGD(self.global_network.parameters(), lr=lr, momentum=.5) self.t_step = 0 self.max_t = max_t self.eps_start = eps_start self.eps_end = eps_end self.eps_decay = eps_decay def act(self, state, eps=0.): if random.random() > eps: state = torch.from_numpy(state).float().unsqueeze(0).to(device) with torch.no_grad(): action_values = self.global_network(state) return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.local_memory.add(state, action, reward, next_state, done) # Increment local timer self.t_step += 1 # If enough samples are available in memory, get random subset and learn # Learn every UPDATE_EVERY time steps. if self.t_step % self.update_every == 0: if self.t_step > BATCH_SIZE: experiences = self.local_memory.sample(BATCH_SIZE) self.learn(experiences) def compute_loss(self, experiences): states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target.forward( next_states).detach().max(1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model # Q_expected = self.qnetwork_local(states).gather(1, actions) Q_expected = self.global_network.forward(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) return loss def learn(self, experiences): loss = self.compute_loss(experiences) # Update gradients per HogWild! 
algorithm self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.soft_update(self.global_network, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def run(self): scores = [] scores_window = deque(maxlen=100) # last 100 scores eps = self.eps_start # initialize epsilon start_time = time.time() for i_episode in range(1, self.n_episodes + 1): state = self.env.reset() score = 0 for t in range(self.max_t): action = self.act(state, eps) if self.do_render: self.env.render() next_state, reward, done, _ = self.env.step(action) self.step(state, action, reward, next_state, done) state = next_state score += reward if done: break scores_window.append(score) # save most recent score scores.append(score) # save most recent score eps = max(self.eps_end, self.eps_decay * eps) # decrease epsilon elapsed_time = time.time() - start_time if self.id == 0: print( '\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: ' .format(self.id, i_episode, np.mean(scores_window)) + time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) if i_episode % 100 == 0: print( '\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: ' .format(self.id, i_episode, np.mean(scores_window)) + time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) if np.mean(scores_window) >= 200.0: print( '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}' .format(i_episode - 100, np.mean(scores_window))) torch.save(self.global_network.state_dict(), 'checkpoint.pth') break
'allowed-floors': 0, } worker_id = int(np.random.randint(999, size=1)) print(worker_id) env = ObstacleTowerEnv('./ObstacleTower/obstacletower', docker_training=False, worker_id=worker_id, retro=True, realtime_mode=False, config=config) env.seed(random_seed) # Run with specific wrappers # # This is the only Wrapper we used, as the others were didn't add enough value env = PyTorchFrame(env) # env = FrameStack(env, 3) # env = HumanActionEnv(env) # Create Agent to Train replay_buffer = ReplayBuffer(int(5e3)) agent = DQNAgent( env.observation_space, env.action_space, replay_buffer, use_double_dqn=True, lr=args.lr, batch_size=hyper_params["batch-size"], gamma=hyper_params["discount-factor"], ) # If we have pretrained weights, load them if(args.checkpoint): print(f"Loading a policy - { args.checkpoint } ") agent.policy_network.load_state_dict(torch.load(args.checkpoint))
class DQNAgent: def __init__(self, Q, Q_target, num_actions, discount_factor=0.95, batch_size=64, epsilon=1): """ Q-Learning agent for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: Q: Action-Value function estimator (Neural Network) Q_target: Slowly updated target network to calculate the targets. num_actions: Number of actions of the environment. discount_factor: gamma, discount factor of future rewards. batch_size: Number of samples per batch. epsilon: Chance to sample a random action. Float betwen 0 and 1. """ self.Q = Q self.Q_target = Q_target self.epsilon = epsilon self.epsilon_decay = 0.995 self.epsilon_min = 0.01 self.num_actions = num_actions self.batch_size = batch_size self.discount_factor = discount_factor # define replay buffer self.replay_buffer = ReplayBuffer() # Start tensorflow session self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver() def train(self, state, action, next_state, reward, terminal): """ This method stores a transition to the replay buffer and updates the Q networks. """ # TODO: # 1. add current transition to replay buffer self.replay_buffer.add_transition(state, action, next_state, reward, terminal) # 2. sample next batch and perform batch update: batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch( self.batch_size) for i in range(self.batch_size): # print("next state: ", batch_next_states[i]) td_target = batch_rewards[i] if not batch_dones[i]: td_target = batch_rewards[i] + self.discount_factor * np.amax( self.Q_target.predict(self.sess, [batch_next_states[i]])) target_f = self.Q_target.predict(self.sess, [batch_states[i]]) target_f[0][batch_actions[i]] = td_target loss = self.Q.update(self.sess, [batch_states[i]], [batch_actions[i]], target_f[0]) #td_targets) self.Q_target.update(self.sess) #print("loss:", loss) if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay #print("epsilon: ", self.epsilon) def act(self, state, deterministic): """ This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action) Args: state: current state input deterministic: if True, the agent should execute the argmax action (False in training, True in evaluation) Returns: action id """ r = np.random.uniform() if deterministic or r > self.epsilon: # TODO: take greedy action (argmax) #state = np.reshape(state, (1,4)) act_values = self.Q.predict(self.sess, [state]) #it was q target # we should be using act_values[0], i guess # print("act values: ", act_values) # act values: [[0.05641035 0.06138265]] # print("act values[0]: ", act_values[0]) # act values[0]: [0.05641035 0.06138265] action_id = np.argmax(act_values[0]) #print("predicted action. deterministic: {}. epsilon cond: {}. action_id: {}." #.format(deterministic, (r > self.epsilon), action_id)) else: action_id = random.randrange(self.num_actions) #print("random action. deterministic: {}. epsilon cond.: {}. action_id: {}." #.format(deterministic, (r > self.epsilon), action_id)) # TODO: sample random action # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work. # You can sample the agents actions with different probabilities (need to sum up to 1) so that the agent will prefer to accelerate or going straight. 
# To see how the agent explores, turn the rendering in the training on and look what the agent is doing. # print("action_id: ", action_id) return action_id def load(self, file_name): self.saver.restore(self.sess, file_name)
class Agent: def __init__(self, Q, Q_target, num_actions, discount_factor=0.99, batch_size=64, epsilon=0.95, epsilon_min=0.05, epsilon_decay=0.995, exploration_type='e-annealing', learning_type='dq', replay_buffer_size=1e5): self.Q = Q self.Q_target = Q_target self.epsilon = epsilon self.epsilon_min = epsilon_min self.epsilon_decay = epsilon_decay self.exploration_type = exploration_type self.learning_type = learning_type self.num_actions = num_actions self.batch_size = batch_size self.discount_factor = discount_factor # initialize replay buffer self.replay_buffer = ReplayBuffer(replay_buffer_size) # start tensorflow session self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver() # add transition to the replay buffer def add(self, state, action, next_state, reward, terminal): self.replay_buffer.add_transition(state, action, next_state, reward, terminal) # train network def train(self): # sample batch from the replay buffer batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch( self.batch_size) # compute td targets using q- or double q-learning if self.learning_type == 'q': # q learning batch_rewards[np.logical_not( batch_dones)] += self.discount_factor * np.max( self.Q_target.predict(self.sess, batch_next_states), axis=1)[np.logical_not(batch_dones)] else: # double q learning q_actions = np.argmax(self.Q.predict(self.sess, batch_next_states), axis=1) batch_rewards[np.logical_not( batch_dones)] += self.discount_factor * self.Q_target.predict( self.sess, batch_next_states)[np.arange(self.batch_size), q_actions][np.logical_not(batch_dones)] # update network and target network loss = self.Q.update(self.sess, batch_states, batch_actions, batch_rewards) self.Q_target.update(self.sess) return loss # get action for state def act(self, state, deterministic): r = np.random.uniform() if deterministic or (self.exploration_type != 'boltzmann' and r > self.epsilon): # take greedy action (argmax) a_pred = self.Q.predict(self.sess, [state]) action_id = np.argmax(a_pred) else: if self.exploration_type == 'boltzmann': actions = self.Q.predict(self.sess, [state])[0] # softmax calculation, subtracting max for stability actions = np.exp((actions - max(actions)) / self.epsilon) actions /= np.sum(actions) # selecting action following probabilities a_value = np.random.choice(actions, p=actions) action_id = np.argmax(a_value == actions) else: # sample random action action_id = np.random.randint(0, self.num_actions) return action_id # anneal epsilon def anneal(self, e=0): self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay) # linear #self.epsilon = max(self.epsilon_min, self.epsilon * np.exp(-(1 - self.epsilon_decay) * e)) # load trained network def load(self, folder): self.saver.restore(self.sess, tf.train.latest_checkpoint(folder))
class DQNAgent: def __init__(self, Q, Q_target, num_actions, discount_factor=0.99, batch_size=64, epsilon=0.05): """ Q-Learning agent for off-policy TD control using Function Approximation. ######################################################################## TD here for using as new target R + discount_factor * Q(S', A') off-policy -> use old data collected on other policy, too ####################################################################### Finds the optimal greedy policy while following an epsilon-greedy policy. Args: Q: Action-Value function estimator (Neural Network) Q_target: Slowly updated target network to calculate the targets. num_actions: Number of actions of the environment. discount_factor: gamma, discount factor of future rewards. batch_size: Number of samples per batch. epsilon: Chance to sample a random action. Float betwen 0 and 1. """ self.Q = Q self.Q_target = Q_target self.epsilon = epsilon self.num_actions = num_actions self.batch_size = batch_size self.discount_factor = discount_factor # define replay buffer self.replay_buffer = ReplayBuffer(use_manual_data=False) # Start tensorflow session self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver() def train(self, state, action, next_state, reward, terminal, collect_data_first=False): """ This method stores a transition to the replay buffer and updates the Q networks. """ # add current transition to replay buffer self.replay_buffer.add_transition(state, action, next_state, reward, terminal) # if the ReplayBuffer should be filled up first, then the train step is done here if collect_data_first and len( self.replay_buffer._data.states) < self.batch_size: print("No training yet. Filling up replay buffer..") # return 0 for loss and q_values return 0, [0, 0] # If the ReplayBuffer should not be filled up or is full enough, do the following else: # get a random batch from the ReplayBuffer batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = \ self.replay_buffer.next_batch(self.batch_size) batch_targets = np.zeros((self.batch_size)) for i in range(self.batch_size): # if a state is a final state, only use the direct reward if batch_dones[i]: batch_targets[i] = batch_rewards[i] # otherwise comput the td_target else: td_target = batch_rewards[i] + self.discount_factor * \ np.max(self.Q_target.predict(self.sess, [batch_next_states[i]])) batch_targets[i] = td_target # update Q network loss = self.Q.update(self.sess, batch_states, batch_actions, batch_targets) # get predictions to check q-values -> e.g. are they diverging? q_preds = self.Q.predict(self.sess, batch_states) # update target network self.Q_target.update(self.sess) return loss, q_preds def act(self, state, deterministic): """ This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action) Args: state: current state input deterministic: if True, the agent should execute the argmax action (False in training, True in evaluation) Returns: action id """ r = np.random.uniform() if deterministic or r > self.epsilon: # take greedy action (argmax) action_id = np.argmax(self.Q.predict(self.sess, [state])) # print("Deterministic action:", action_id) else: # sample random action # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work. 
# You can sample the agents actions with different probabilities (need to sum up to 1) so that the agent will prefer to accelerate or going straight. # To see how the agent explores, turn the rendering in the training on and look what the agent is doing. # for carracing: if self.num_actions == 5: action_id = np.random.choice(range(5), p=[0.32, 0.09, 0.09, 0.4, 0.1]) # for cartpole action_id = np.random.randint(self.num_actions) # print("Explorative action:", action_id) return action_id def load(self, file_name): self.saver.restore(self.sess, file_name)
class Agent_DQN: def __init__(self, args, env): self.args = args self.env = env self.input_channels = 3 if 'SpaceInvaders' in args.env_id else 4 self.num_actions = self.env.action_space.n # if testing, simply load the model we have trained if args.test_dqn: self.load(args.model) self.online_net.eval() self.target_net.eval() return # DQN variants setting self.prioritized = args.prioritized self.double = args.double self.n_steps = args.n_steps self.noise_linear = args.noise_linear if self.prioritized: self.memory = PrioritizedReplayBuffer(10000, alpha=0.6) self.beta_schedule = LinearSchedule(args.num_timesteps, initial_p=0.4, final_p=1.0) self.criterion = MSELoss else: self.memory = ReplayBuffer(10000) self.criterion = nn.MSELoss() if args.atari: DQN = DQN_Atari input_feature = self.input_channels else: DQN = DQN_Simple input_feature = env.observation_space.shape[0] # build target, online network self.target_net = DQN(input_feature, self.num_actions, dueling=args.dueling, noise_linear=args.noise_linear) self.target_net = self.target_net.cuda( ) if use_cuda else self.target_net self.online_net = DQN(input_feature, self.num_actions, dueling=args.dueling, noise_linear=args.noise_linear) self.online_net = self.online_net.cuda( ) if use_cuda else self.online_net # discounted reward self.GAMMA = 0.99 # exploration setting self.exploration = LinearSchedule(schedule_timesteps=int( 0.1 * args.num_timesteps), initial_p=1.0, final_p=0.05) # training settings self.train_freq = 4 self.learning_start = 10000 self.batch_size = args.batch_size self.num_timesteps = args.num_timesteps self.display_freq = args.display_freq self.save_freq = args.save_freq self.target_update_freq = args.target_update_freq self.optimizer = optim.RMSprop(self.online_net.parameters(), lr=1e-4) # global status self.episodes_done = 0 self.steps = 0 def make_action(self, observation, test=True): return self.act(observation, test) def save(self, save_path): print('save model to', save_path) torch.save(self.online_net, save_path + '_online') torch.save(self.target_net, save_path + '_target') def load(self, load_path): if use_cuda: self.online_net = torch.load(load_path + '_online') self.target_net = torch.load(load_path + '_target') else: self.online_net = torch.load( load_path + '_online', map_location=lambda storage, loc: storage) self.target_net = torch.load( load_path + '_target', map_location=lambda storage, loc: storage) def act(self, state, test=False): sample = random.random() if test: eps_threshold = 0.01 state = torch.from_numpy(state).permute(2, 0, 1).unsqueeze(0) state = state.cuda() if use_cuda else state else: eps_threshold = self.exploration.value(self.steps) if sample > eps_threshold: action = self.online_net( Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view( 1, 1) else: action = LongTensor([[random.randrange(self.num_actions)]]) return action if not test else action[0, 0] def reset_noise(self): assert self.noise_linear == True self.online_net.reset_noise() self.target_net.reset_noise() def update(self): if self.prioritized: batch, weight, batch_idxes = self.memory.sample( self.batch_size, beta=self.beta_schedule.value(self.steps)) weight_batch = Variable(Tensor(weight)).squeeze() else: batch = self.memory.sample(self.batch_size) # Compute a mask of non-final states and concatenate the batch elements non_final_mask = ByteTensor( tuple(map(lambda s: s is not None, batch.next_state))) # We don't want to backprop through the expected action values and volatile # will save us on temporarily changing the 
model parameters' # requires_grad to False! non_final_next_states = Variable(torch.cat( [s for s in batch.next_state if s is not None]), volatile=True) state_batch = Variable(torch.cat(batch.state)) action_batch = Variable(torch.cat(batch.action)) reward_batch = Variable(torch.cat(batch.reward)) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken state_action_values = self.online_net(state_batch).gather( 1, action_batch) # Compute V(s_{t+1}) for all next states. next_state_values = Variable(torch.zeros(self.batch_size).type(Tensor)) q_next = self.target_net(non_final_next_states) if self.double: _, best_actions = self.online_net(non_final_next_states).max(1) next_state_values[non_final_mask] = q_next.gather( 1, best_actions.unsqueeze(1)).squeeze(1) else: next_state_values[non_final_mask] = q_next.max(1)[0] # Now, we don't want to mess up the loss with a volatile flag, so let's # clear it. After this, we'll just end up with a Variable that has # requires_grad=False next_state_values.volatile = False # Compute the expected Q values expected_state_action_values = ( next_state_values * (self.GAMMA**(self.n_steps))) + reward_batch # Compute loss if self.prioritized: loss = self.criterion(state_action_values, expected_state_action_values) loss = torch.mul(loss, weight_batch) new_priorities = np.abs(loss.cpu().data.numpy()) + 1e-6 self.memory.update_priorities(batch_idxes, new_priorities) loss = loss.mean() else: loss = self.criterion(state_action_values, expected_state_action_values) # Optimize the model self.optimizer.zero_grad() loss.backward() self.optimizer.step() return loss.data[0] def process_state(self, state): state = np.array(state) if self.args.atari: # map shape: (84,84,4) --> (1,4,84,84) state = torch.from_numpy(state).permute(2, 0, 1).unsqueeze(0) else: state = torch.Tensor(state).unsqueeze(0) return state.cuda() if use_cuda else state def train(self): total_reward = 0 loss = 0 # set training mode self.online_net.train() while (True): if self.noise_linear: self.reset_noise() state = self.process_state(self.env.reset()) done = False episode_duration = 0 while (not done): # select and perform action action = self.act(state) next_state, reward, done, _ = self.env.step(action[0, 0]) total_reward += reward reward = Tensor([reward]) # process new state next_state = self.process_state(next_state) if done: next_state = None # store the transition in memory self.memory.push(state, action, next_state, reward) # move to the next state state = next_state # Perform one step of the optimization (on the target network) if self.steps > self.learning_start and self.steps % self.train_freq == 0: loss = self.update() if self.noise_linear: self.reset_noise() # update target network if self.steps > self.learning_start and self.steps % self.target_update_freq == 0: self.target_net.load_state_dict( self.online_net.state_dict()) if self.steps % self.save_freq == 0: self.save('dqn.cpt') self.steps += 1 episode_duration += 1 if self.episodes_done % self.display_freq == 0: print( 'Episode: %d | Steps: %d/%d | Exploration: %f | Avg reward: %f | loss: %f | Episode Duration: %d' % (self.episodes_done, self.steps, self.num_timesteps, self.exploration.value(self.steps), total_reward / self.display_freq, loss, episode_duration)) writer.add_scalar('reward', total_reward / self.display_freq, self.steps) total_reward = 0 self.episodes_done += 1 if self.steps > self.num_timesteps: break self.save('dqn_final.model') def nsteps_train(self): ''' Training procedure for 
multi-steps learning ''' total_reward = 0 loss = 0 # set training mode self.online_net.train() while (True): if self.noise_linear: self.reset_noise() state_buffer = deque() # store states for future use action_buffer = deque() # store actions for future use reward_buffer = deque() # store rewards for future use nstep_reward = 0 # calculate n-step discounted reward state = self.process_state(self.env.reset()) state_buffer.append(state) done = False episode_duration = 0 # run n-1 steps for _ in range(1, self.n_steps): action = self.act(state) next_state, reward, done, _ = self.env.step(action[0, 0]) next_state = self.process_state(next_state) if done: next_state = None state_buffer.append(next_state) action_buffer.append(action) nstep_reward = nstep_reward * self.GAMMA + reward reward_buffer.append(reward) state = next_state episode_duration += 1 while (not done): # select and perform action action = self.act(state) next_state, reward, done, _ = self.env.step(action[0, 0]) total_reward += reward # process new state next_state = self.process_state(next_state) if done: next_state = None # save new state, action, reward state_buffer.append(next_state) action_buffer.append(action) reward_buffer.append(reward) nstep_reward = nstep_reward * self.GAMMA + reward # store the transition in memory self.memory.push(state_buffer.popleft(), action_buffer.popleft(), next_state, Tensor([nstep_reward])) # update n-step reward nstep_reward -= (self.GAMMA**(self.n_steps - 1)) * reward_buffer.popleft() # move to the next state state = next_state # Perform one step of the optimization (on the target network) if self.steps > self.learning_start and self.steps % self.train_freq == 0: loss = self.update() if self.noise_linear: self.reset_noise() # update target network if self.steps > self.learning_start and self.steps % self.target_update_freq == 0: self.target_net.load_state_dict( self.online_net.state_dict()) if self.steps % self.save_freq == 0: self.save('dqn.cpt') self.steps += 1 episode_duration += 1 if self.episodes_done % self.display_freq == 0: print( 'Episode: %d | Steps: %d/%d | Exploration: %f | Avg reward: %f | loss: %f | Episode Duration: %d' % (self.episodes_done, self.steps, self.num_timesteps, self.exploration.value(self.steps), total_reward / self.display_freq, loss, episode_duration)) writer.add_scalar('reward', total_reward / self.display_freq, self.steps) total_reward = 0 self.episodes_done += 1 if self.steps > self.num_timesteps: break self.save('dqn_final.model')
class DynaQAgent(mp.Process): def __init__(self, id, env, state_size, action_size, n_episodes, lr, gamma, global_network, target_network, q, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995): super(DynaQAgent, self).__init__() self.id = id self.env = env self.state_size = state_size self.action_size = action_size self.n_episodes = n_episodes self.gamma = gamma self.q = q self.local_memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE) self.t_step = 0 self.max_t = max_t self.eps_start = eps_start self.eps_end = eps_end self.eps_decay = eps_decay self.experience = namedtuple( "Experience", field_names=["state", "action", "reward", "next_state", "done"]) self.global_network = global_network self.target_network = target_network self.optimizer = optim.SGD(self.global_network.parameters(), lr=lr, momentum=.5) self.scores_window = deque(maxlen=100) # last 100 scores def act(self, state, eps=0.): if random.random() > eps: # Turn the state into a tensor state = torch.from_numpy(state).float().unsqueeze(0).to(device) with torch.no_grad(): action_values = self.global_network( state) # Make choice based on local network return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.local_memory.add(state, action, reward, next_state, done) # Increment local timer self.t_step += 1 if self.t_step > BATCH_SIZE: experiences = self.local_memory.sample(BATCH_SIZE) self.learn(experiences) # TODO: Better way to do this?? if self.q[0].empty() and np.mean(self.scores_window) < 180: experiences = self.local_memory.sample(BATCH_SIZE) self.q[0].put(experiences[0].detach().share_memory_()) self.q[1].put(experiences[1].detach().share_memory_()) self.q[2].put(experiences[2].detach().share_memory_()) self.q[3].put(experiences[3].detach().share_memory_()) self.q[4].put(experiences[4].detach().share_memory_()) def learn(self, experiences): states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.target_network(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.global_network(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.soft_update(self.global_network, self.target_network, TAU) def soft_update(self, local_model, target_model, tau): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def get_experience_as_tensor(self, e): states = torch.from_numpy(np.vstack([e.state])).float().to(device) actions = torch.from_numpy(np.vstack([e.action])).long().to(device) rewards = torch.from_numpy(np.vstack([e.reward])).float().to(device) next_states = torch.from_numpy(np.vstack([e.next_state ])).float().to(device) dones = torch.from_numpy(np.vstack([e.done]).astype( np.uint8)).float().to(device) return (states, actions, rewards, next_states, dones) def get_action_values(self, state): state = torch.from_numpy(state).float().unsqueeze(0).to(device) with torch.no_grad(): action_values = self.target_network(state) return action_values.cpu().data.numpy()[0] def get_delta(self, state, action, 
next_state, reward): priority = reward + self.gamma * np.max( self.get_action_values(next_state)) - self.get_action_values( state)[action] return priority def run(self): scores = [] eps = self.eps_start # initialize epsilon start_time = time.time() for i_episode in range(1, self.n_episodes + 1): state = self.env.reset() score = 0 for t in range(self.max_t): action = self.act(state, eps) # if do_render: # self.env.render() next_state, reward, done, _ = self.env.step(action) self.step(state, action, reward, next_state, done) state = next_state score += reward if done: break self.scores_window.append(score) # save most recent score scores.append(score) # save most recent score eps = max(self.eps_end, self.eps_decay * eps) # decrease epsilon elapsed_time = time.time() - start_time if self.id == 0: print( '\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: ' .format(self.id, i_episode, np.mean(self.scores_window)) + time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) if i_episode % 100 == 0: print( '\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: ' .format(self.id, i_episode, np.mean(self.scores_window)) + time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) if np.mean(self.scores_window) >= 200.0: print( '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}' .format(i_episode - 100, np.mean(self.scores_window))) break
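# DynaQAgent subclasses mp.Process, so several workers are meant to run in parallel against a
# shared global network. Below is a hypothetical launcher; the QNetwork class, the
# LunarLander-v2 environment, and all hyperparameter values are assumptions for illustration,
# not taken from the code above (only the constructor signature is).
import torch.multiprocessing as mp
import gym

def launch_workers(num_workers=4):
    global_network = QNetwork(state_size=8, action_size=4)   # assumed network class
    target_network = QNetwork(state_size=8, action_size=4)
    global_network.share_memory()   # expose parameters to all worker processes
    target_network.share_memory()
    q = [mp.Queue() for _ in range(5)]  # one queue per experience field (s, a, r, s', done)

    workers = []
    for worker_id in range(num_workers):
        env = gym.make('LunarLander-v2')  # assumed environment (8-dim state, 4 actions)
        agent = DynaQAgent(worker_id, env, state_size=8, action_size=4,
                           n_episodes=2000, lr=5e-4, gamma=0.99,
                           global_network=global_network,
                           target_network=target_network, q=q)
        agent.start()
        workers.append(agent)
    for agent in workers:
        agent.join()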
def __init__(self, args, env): self.args = args self.env = env self.input_channels = 3 if 'SpaceInvaders' in args.env_id else 4 self.num_actions = self.env.action_space.n # if testing, simply load the model we have trained if args.test_dqn: self.load(args.model) self.online_net.eval() self.target_net.eval() return # DQN variants setting self.prioritized = args.prioritized self.double = args.double self.n_steps = args.n_steps self.noise_linear = args.noise_linear if self.prioritized: self.memory = PrioritizedReplayBuffer(10000, alpha=0.6) self.beta_schedule = LinearSchedule(args.num_timesteps, initial_p=0.4, final_p=1.0) self.criterion = MSELoss else: self.memory = ReplayBuffer(10000) self.criterion = nn.MSELoss() if args.atari: DQN = DQN_Atari input_feature = self.input_channels else: DQN = DQN_Simple input_feature = env.observation_space.shape[0] # build target, online network self.target_net = DQN(input_feature, self.num_actions, dueling=args.dueling, noise_linear=args.noise_linear) self.target_net = self.target_net.cuda( ) if use_cuda else self.target_net self.online_net = DQN(input_feature, self.num_actions, dueling=args.dueling, noise_linear=args.noise_linear) self.online_net = self.online_net.cuda( ) if use_cuda else self.online_net # discounted reward self.GAMMA = 0.99 # exploration setting self.exploration = LinearSchedule(schedule_timesteps=int( 0.1 * args.num_timesteps), initial_p=1.0, final_p=0.05) # training settings self.train_freq = 4 self.learning_start = 10000 self.batch_size = args.batch_size self.num_timesteps = args.num_timesteps self.display_freq = args.display_freq self.save_freq = args.save_freq self.target_update_freq = args.target_update_freq self.optimizer = optim.RMSprop(self.online_net.parameters(), lr=1e-4) # global status self.episodes_done = 0 self.steps = 0
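# The exploration schedule above (and beta_schedule when prioritized replay is enabled) comes
# from a LinearSchedule class that is not shown in this snippet. Below is a minimal sketch
# consistent with how it is called here (linear interpolation from initial_p to final_p over
# schedule_timesteps, then held constant); the actual imported implementation may differ.
class LinearSchedule:
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.05):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # fraction of the schedule completed, clipped to [0, 1]
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# e.g. with schedule_timesteps=100000: value(0) == 1.0, value(50000) == 0.525,
# and value(t) == 0.05 for every t >= 100000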
class DQNAgent: def __init__(self, name, Q_current, Q_target, num_actions, discount_factor, batch_size, epsilon, epsilon_decay, boltzmann, double_q, buffer_capacity, random_probs=None): """ Q-Learning agent for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: Q: Action-Value function estimator (Neural Network) Q_target: Slowly updated target network to calculate the targets. num_actions: Number of actions of the environment. discount_factor: gamma, discount factor of future rewards. batch_size: Number of samples per batch. epsilon: Chance to sample a random action. Float betwen 0 and 1. """ # save hyperparameters in folder self.name = name # probably useless self.Q_current = Q_current self.Q_target = Q_target self.epsilon = epsilon self.epsilon_decay = epsilon_decay self.boltzmann = boltzmann self.num_actions = num_actions self.batch_size = batch_size self.discount_factor = discount_factor self.buffer_capacity = buffer_capacity self.double_q = double_q self.random_probs = random_probs # define replay buffer self.replay_buffer = ReplayBuffer(capacity=buffer_capacity) # Start tensorflow session self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver() def train(self, state, action, next_state, reward, terminal): """ This method stores a transition to the replay buffer and updates the Q networks. """ # TODO: # 1. add current transition to replay buffer self.replay_buffer.add_transition(state, action, next_state, reward, terminal) # 2. sample next batch batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch( self.batch_size) # find optimal actions for the sampled s' states if self.double_q: # double Q learning (select actions using current network, rather than target network) # ...in order to decorrelate noise between selection and evaluation # (Q(state,action) is still evaluated using target network in any case) action_selector = self.Q_current else: action_selector = self.Q_target # as usual, the Q network returns a vector of... 
predicted values for every possible action a_prime = np.argmax(action_selector.predict(self.sess, batch_next_states), axis=1) # pick the a_prime'th entry from each row of the Q prediction # note: this also yields predictions for terminal states, but those are masked out below q_values_next = self.Q_target.predict( self.sess, batch_next_states)[np.arange(self.batch_size), a_prime] # 2.1 compute td targets: # if done, there is no next state to bootstrap from td_targets = batch_rewards + np.where( batch_dones, 0, self.discount_factor * q_values_next) # 2.2 update the Q (current) network self.Q_current.update(self.sess, batch_states, batch_actions, td_targets) # 2.3 call soft update for target network # the target network's own update op performs the soft (Polyak) update internally self.Q_target.update(self.sess) def act(self, state, deterministic): """ This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action) Args: state: current state input deterministic: if True, the agent should execute the argmax action (False in training, True in evaluation) Returns: action id """ # get Q-values for the current state from the current network Q_values = np.squeeze( self.Q_current.predict(self.sess, np.expand_dims(state, axis=0))) argmax_a = np.argmax(Q_values) if deterministic: # take greedy action return argmax_a if self.boltzmann: # interaction between Boltzmann exploration and epsilon: # epsilon controls the temperature of the softmax function, # so that, as before, higher epsilon -> more exploration action_probs = softmax(Q_values, temperature=1 / (1 - self.epsilon)**2) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) else: if np.random.uniform() > self.epsilon: # choose the best action action = argmax_a else: # explore if self.random_probs is None: action = np.random.randint(self.num_actions, size=1)[0] else: action = np.random.choice(np.arange(self.num_actions), p=self.random_probs) # we decay epsilon AFTER we've checked it # (nb: if deterministic, epsilon never decays, but in that case it does not matter) if self.epsilon_decay > 0: self.epsilon *= (1 - self.epsilon_decay) return action def load(self, file_name): self.saver.restore(self.sess, file_name)
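# The Boltzmann branch of act() calls a softmax helper with a temperature argument that is not
# defined in this snippet. A minimal, numerically stable version consistent with how it is
# called above (the author's actual implementation may differ):
import numpy as np

def softmax(q_values, temperature=1.0):
    """Boltzmann distribution over Q-values; higher temperature -> closer to uniform."""
    scaled = np.asarray(q_values, dtype=np.float64) / temperature
    scaled -= np.max(scaled)          # subtract the max for numerical stability
    exp_q = np.exp(scaled)
    return exp_q / np.sum(exp_q)

# e.g. softmax([1.0, 2.0], temperature=1.0)  -> approx. [0.27, 0.73]
#      softmax([1.0, 2.0], temperature=10.0) -> approx. [0.48, 0.52]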
random.seed(hyper_params['seed']) assert "NoFrameskip" in hyper_params[ 'env'], "Require environment with no frameskip" env = create_env(0, 1) env.seed(hyper_params['seed']) #env = NoopResetEnv(env, noop_max=30) env = MaxAndSkipEnv(env, skip=4) #env = EpisodicLifeEnv(env) #env = FireResetEnv(env) env = WarpFrame(env) env = PyTorchFrame(env) env = ClipRewardEnv(env) env = FrameStack(env, 3) replay_buffer = ReplayBuffer(hyper_params['replay_buffer_size']) agent = DQNAgent(env.observation_space, env.action_space, replay_buffer, use_double_dqn=hyper_params['use_double_dqn'], lr=hyper_params['learning_rate'], batch_size=hyper_params['batch_size'], gamma=hyper_params['discount_factor']) eps_timesteps = hyper_params['eps_fraction'] * float( hyper_params['num_steps']) episode_rewards = [0.0] loss = [0.0] policy_actions = unpickle_object('action_map')
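# The snippet above indexes a hyper_params dictionary whose definition is not shown. The key
# names below are exactly the ones the code uses; the values are illustrative assumptions only.
hyper_params = {
    'seed': 42,                      # RNG seed for random and the environment
    'env': 'PongNoFrameskip-v4',     # must contain "NoFrameskip" (see the assert above)
    'replay_buffer_size': int(1e5),  # transitions kept in the replay buffer
    'use_double_dqn': True,          # use the double-DQN target computation
    'learning_rate': 1e-4,
    'batch_size': 32,
    'discount_factor': 0.99,
    'num_steps': int(1e6),           # total environment steps
    'eps_fraction': 0.1,             # fraction of num_steps over which epsilon is annealed
}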
class DQNAgent: def __init__(self, Q, Q_target, num_actions, discount_factor=0.99, batch_size=64, epsilon=0.05, act_probabilities=None, double_q=False, buffer_capacity=100000, prefill_bs_percentage=5): """ Q-Learning agent for off-policy TD control using Function Approximation. Finds the optimal greedy policy while following an epsilon-greedy policy. Args: Q: Action-Value function estimator (Neural Network) Q_target: Slowly updated target network to calculate the targets. num_actions: Number of actions of the environment. discount_factor: gamma, discount factor of future rewards. batch_size: Number of samples per batch. epsilon: Chance to sample a random action. Float betwen 0 and 1. """ self.Q = Q self.Q_target = Q_target self.epsilon = epsilon self.num_actions = num_actions self.batch_size = batch_size self.discount_factor = discount_factor # define replay buffer self.replay_buffer = ReplayBuffer(capacity=buffer_capacity, min_fill=prefill_bs_percentage * batch_size) # Start tensorflow session self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver() # <JAB> if act_probabilities is None: self.act_probabilities = np.ones(num_actions) / num_actions else: self.act_probabilities = act_probabilities self.double_dqn = double_q def train(self, state, action, next_state, reward, terminal): """ This method stores a transition to the replay buffer and updates the Q networks. """ # TODO: # 1. add current transition to replay buffer # 2. sample next batch and perform batch update: # 2.1 compute td targets: # td_target = reward + discount * argmax_a Q_target(next_state_batch, a) # 2.2 update the Q network # self.Q.update(...) # 2.3 call soft update for target network # self.Q_target.update(...) # <JAB> self.replay_buffer.add_transition(state, action, next_state, reward, terminal) # Let the buffer fill up, otherwise we will burn up a lot of $#!+¥ states early on if self.replay_buffer.has_min_items(): buffer = self.replay_buffer.next_batch(self.batch_size) batch_states = buffer[0] batch_actions = buffer[1] batch_next_states = buffer[2] batch_rewards = buffer[3] batch_dones = buffer[4] non_terminal_states = np.logical_not(batch_dones) if self.double_dqn: a_predictions = self.Q.predict(self.sess, batch_next_states) a_predictions = np.argmax(a_predictions, axis=1) action_indexes = [np.arange(len(a_predictions)), a_predictions] q_predictions = self.Q_target.predict(self.sess, batch_next_states) q_predictions = q_predictions[action_indexes] else: q_predictions = self.Q_target.predict(self.sess, batch_next_states) q_predictions = np.max(q_predictions, axis=1) td_target = batch_rewards # If episode is not finished, add predicted Q values to the current rewards td_target[ non_terminal_states] += self.discount_factor * q_predictions[ non_terminal_states] # Update Step self.Q.update(self.sess, batch_states, batch_actions, td_target) self.Q_target.update(self.sess) def act(self, state, deterministic): """ This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action) Args: state: current state input deterministic: if True, the agent should execute the argmax action (False in training, True in evaluation) Returns: action id """ r = np.random.uniform() if deterministic or r > self.epsilon: # <JAB> action_id = np.argmax(self.Q.predict(self.sess, state)) # </JAB> else: # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work. 
# You can sample the agent's actions with different probabilities (they need to sum to 1) so that the agent will prefer to accelerate or go straight. # To see how the agent explores, turn on rendering during training and watch what the agent is doing. # action_id = ... # <JAB> action_id = np.random.choice(np.arange(self.num_actions), p=self.act_probabilities) # </JAB> return action_id def load(self, file_name): self.saver.restore(self.sess, file_name)
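# A concrete (and purely hypothetical) act_probabilities vector for a five-action CarRacing
# discretization. The ordering [straight, left, right, accelerate, brake] and the argument
# values below are assumptions for illustration; only the constructor signature comes from the
# class above, and Q / Q_target are assumed to be already-built networks.
import numpy as np

act_probabilities = np.array([0.30, 0.10, 0.10, 0.45, 0.05])  # biased toward straight/accelerate
assert np.isclose(act_probabilities.sum(), 1.0)               # must form a valid distribution

agent = DQNAgent(Q, Q_target, num_actions=5,
                 act_probabilities=act_probabilities,
                 double_q=True)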