    # (fragment of the DeepQNetwork graph-construction code; the class header
    # is not included in this excerpt)
    self.best_action = tf.argmax(self.q_value, axis=1)  # index of the best action

    def _build_optimizer(self):
        self.target_q = tf.placeholder(shape=[None, 6], dtype=tf.float32)
        self.loss = tf.reduce_mean(tf.square(self.q_value - self.target_q))
        # RMSProp as in the DQN paper; the flattened original also assigned an
        # AdamOptimizer(learning_rate=self.lr) that was immediately overwritten
        self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.update = self.optimizer.minimize(self.loss)


if __name__ == "__main__":
    env = GameEnv('PongDeterministic-v4', 4)
    # env = GameEnv('BreakoutDeterministic-v4', 4)
    dqn = DeepQNetwork(n_actions=6, hist_len=4, name="eval_dqn")
    env.reset()
    replay = ReplayMemory()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(100000):
            dummy_q = np.zeros((64, 6))
            # dummy_q = [dummy_q]
            action = env.env.action_space.sample()
            terminal_life_lost, observation, reward, is_done, info = env.step(action)
            observation = cv2.resize(observation, (84, 84),
                                     interpolation=cv2.INTER_NEAREST)
            replay.add_experience(action, observation, reward, terminal_life_lost)
            if i > 10000:
                states, actions, rewards, new_states, terminal_flags = \
                    replay.get_minibatch()
                loss, _, best_action = sess.run(
                    [dqn.loss, dqn.update, dqn.best_action],
                    # the original call was truncated here; feeding dummy_q as
                    # the target is an assumption for this smoke test
                    feed_dict={dqn.input: states, dqn.target_q: dummy_q})
class Dqn():

    def __init__(self, input_size, nb_action, gamma):
        self.gamma = gamma
        # sliding window of recent rewards, used to compute the mean score
        self.reward_window = []
        self.model = Network(input_size, nb_action)
        # keep 100,000 transitions for the model to learn from
        self.memory = ReplayMemory(100000)
        # Adam optimizer connected to the neural network; a small learning
        # rate makes sure learning doesn't happen too fast
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        # create a fake batch dimension with unsqueeze; the tensor is wrapped
        # in a Variable so gradients can flow through it
        self.prev_state = torch.Tensor(input_size).unsqueeze(0)
        self.prev_action = 0
        self.prev_reward = 0

    def select_action(self, state):
        """Use softmax to sample from the q-values; converting the state to a
        non-gradient (volatile) Variable saves memory and improves performance."""
        # T = 100: increasing the temperature makes the policy more certain
        probs = F.softmax(self.model(Variable(state, volatile=True)) * 100)
        # draw randomly from probs
        action = probs.multinomial()
        return action.data[0, 0]

    def learn(self, batch_state, batch_next_state, batch_reward, batch_action):
        """One Q-learning step on a sampled batch (Markov decision process)."""
        outputs = self.model(batch_state).gather(
            1, batch_action.unsqueeze(1)).squeeze(1)
        next_outputs = self.model(batch_next_state).detach().max(1)[0]
        # target = reward + discounted max q-value of the next state
        target = self.gamma * next_outputs + batch_reward
        # temporal-difference loss (outputs = predictions, target = goal)
        td_loss = F.smooth_l1_loss(outputs, target)
        # reinitialize the optimizer's gradients
        self.optimizer.zero_grad()
        # backpropagate the temporal difference, keeping the graph for reuse
        td_loss.backward(retain_variables=True)
        # update each weight according to how much it contributed to the error
        self.optimizer.step()

    def update(self, reward, new_signal):
        """Store the latest transition, select the next action, and learn."""
        # the signal is the state: 3 sensor signals plus orientation and -orientation
        new_state = torch.Tensor(new_signal).float().unsqueeze(0)
        # update memory, converting the transition to tensors
        self.memory.push((self.prev_state, new_state,
                          torch.LongTensor([int(self.prev_action)]),
                          torch.Tensor([self.prev_reward])))
        # play an action
        action = self.select_action(new_state)
        # the AI starts learning after 100 transitions
        if len(self.memory.memory) > 100:
            batch_state, batch_next_state, batch_action, batch_reward = \
                self.memory.sample(100)
            self.learn(batch_state, batch_next_state, batch_reward, batch_action)
        # update the previous transition
        self.prev_action = action
        self.prev_state = new_state
        self.prev_reward = reward
        self.reward_window.append(reward)
        if len(self.reward_window) > 1000:
            del self.reward_window[0]
        return action

    def score(self):
        """Mean of the stored rewards (+1 in the denominator avoids division by zero)."""
        return sum(self.reward_window) / (len(self.reward_window) + 1.)

    def save(self):
        """Save the current model and optimizer state."""
        torch.save({'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict()},
                   'previous_brain.pth')

    def load(self):
        """Load the saved checkpoint so we can reuse that brain."""
        if os.path.isfile('previous_brain.pth'):
            print("=> loading checkpoint...")
            checkpoint = torch.load('previous_brain.pth')
            self.model.load_state_dict(checkpoint['state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            print("Done.")
        else:
            print("File not found...")
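# For reference, the temporal-difference step in Dqn.learn() can be reproduced
# in isolation. Below is a minimal self-contained sketch on dummy tensors; the
# two-layer network, sizes, and random batch are placeholder assumptions, not
# part of the original project, and it uses the current PyTorch API rather
# than the legacy calls (volatile, retain_variables) above.
import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Sequential(nn.Linear(5, 30), nn.ReLU(), nn.Linear(30, 3))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
gamma = 0.9

batch_state = torch.randn(100, 5)        # dummy batch of states
batch_next_state = torch.randn(100, 5)   # dummy batch of next states
batch_action = torch.randint(0, 3, (100,))
batch_reward = torch.randn(100)

# Q(s, a) for the actions actually taken
outputs = model(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1)
# max_a' Q(s', a'), detached so no gradient flows through the target
next_outputs = model(batch_next_state).detach().max(1)[0]
target = gamma * next_outputs + batch_reward

td_loss = F.smooth_l1_loss(outputs, target)
optimizer.zero_grad()
td_loss.backward()
optimizer.step()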
class Worker():

    def __init__(self, params, num, global_episodes, tvars, global_network):
        self.params = params
        self.name = "worker_" + str(num)
        self.number = num
        self.model_path = self.params.logdir
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_" + str(self.number))
        self.global_network = global_network
        # Create the local copy of the network and the tensorflow op to copy
        # global parameters to the local network
        self.local_AC = AC_network(params, num, tvars, name=self.name)
        self.update_local_ops = self.update_target_graph(tvars,
                                                         self.local_AC.local_vars)
        self.actions = None
        # load the CartPole environment (this worker was adapted from a Doom
        # example, hence the structure)
        self.env = gym.make('CartPole-v0')
        self.myBuffer = ReplayMemory(max_size=self.params.max_ep_length)

    def train(self, sess):
        trainBatch = self.myBuffer.sample(self.total_steps)
        batch_state = np.array(trainBatch[0]).reshape(
            [self.total_steps, self.params.input_dim])
        batch_actions = np.array(trainBatch[1]).reshape(
            [self.total_steps, self.params.num_actions])
        batch_rewards = np.array(trainBatch[2])
        batch_next_state = np.array(trainBatch[3]).reshape(
            [self.total_steps, self.params.input_dim])
        batch_done = np.array(trainBatch[4])
        # 1 for non-terminal transitions, 0 for terminal ones
        end_multiplier = -(batch_done - 1)
        # Here we take the rewards and values from the buffer and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation":
        # self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        # discounted_rewards = discount(self.rewards_plus, gamma)[:-1]
        # self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        # advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        # advantages = discount(advantages, gamma)
        # axis=-1 makes the max per-sample; the flattened original took a
        # single global max over the whole batch
        next_Q = np.max(sess.run(self.local_AC.Qout,
                                 feed_dict={self.local_AC.input_x: batch_next_state}),
                        axis=-1)
        state_value = np.max(sess.run(self.local_AC.Qout,
                                      feed_dict={self.local_AC.input_x: batch_state}),
                             axis=-1)
        batch_target_Q = batch_rewards + (self.params.gamma * next_Q * end_multiplier)
        batch_advantages = batch_target_Q - state_value
        # Update the global network using gradients from the loss and generate
        # network statistics to periodically save
        feed_dict = {
            self.local_AC.input_x: batch_state,
            self.local_AC.target_Q: batch_target_Q,
            self.local_AC.actions: batch_actions,
            self.local_AC.advantages: batch_advantages.reshape(self.total_steps, 1)
        }
        v_l, p_l, e_l, _ = sess.run([self.local_AC.value_loss,
                                     self.local_AC.policy_loss,
                                     self.local_AC.entropy,
                                     self.local_AC.apply_grads],
                                    feed_dict=feed_dict)
        # return v_l / self.total_steps, p_l / self.total_steps, e_l / self.total_steps

    def work(self, sess, coord, saver):
        episode_count = sess.run(self.global_episodes)
        self.total_steps = 0
        print("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                episode_buffer = []
                episode_values = []
                episode_frames = []
                episode_reward = []
                episode_step_count = []
                score = 0
                d = False
                state_input = self.env.reset()
                state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []
                while not d:
                    state_input = state_input.reshape([1, self.params.input_dim])
                    # Run the policy network and get an action to take.
                    curr_policy = sess.run(
                        self.local_AC.probability,
                        feed_dict={self.local_AC.input_x: state_input})
                    # sample the action from the predicted policy
                    action = np.random.choice(np.arange(len(curr_policy)),
                                              p=curr_policy)
                    # step the environment and get new measurements
                    next_state, reward, d, _ = self.env.step(action)
                    next_state = next_state.reshape([1, self.params.input_dim])
                    state_buffer.append(state_input)
                    action_buffer.append([1, 0] if action == 0 else [0, 1])
                    reward_buffer.append(reward if not d or score == 399 else -200)
                    # reward_buffer.append(reward)
                    next_state_buffer.append(next_state)
                    done_buffer.append(d)
                    score += reward
                    self.total_steps += 1
                    state_input = next_state
                self.myBuffer.append(state_buffer, action_buffer, reward_buffer,
                                     next_state_buffer, done_buffer)
                episode_reward.append(score)
                episode_step_count.append(self.total_steps)
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))
                # Update the network using the episode buffer at the end of
                # the episode.
                if self.myBuffer is not None:
                    # v_l, p_l, e_l = self.train(sess)
                    self.train(sess)
                    self.update_Target(self.update_local_ops, sess)
                    self.myBuffer.reset()
                    self.total_steps = 0
                # Periodically save model parameters and summary statistics.
                if episode_count % 10 == 0 and episode_count != 0:
                    if episode_count % 100 == 0 and self.name == 'worker_0':
                        saver.save(sess, self.model_path + '/model-' +
                                   str(episode_count) + '.cptk')
                        print("Saved Model")
                    if self.name == "worker_0":
                        curr_reward = 0
                        for i in range(5):
                            test_done = False
                            state = self.env.reset()
                            while not test_done:
                                state = state.reshape(1, self.params.input_dim)
                                curr_policy = sess.run(
                                    self.global_network.probability,
                                    feed_dict={self.global_network.input_x: state})
                                # sample the action from the predicted policy
                                action = np.random.choice(
                                    np.arange(len(curr_policy)), p=curr_policy)
                                # step the environment and get new measurements
                                next_state, reward, test_done, _ = self.env.step(action)
                                curr_reward += 1
                                state = next_state
                        print("Episode: {}, Current global reward: {:.1f}".format(
                            episode_count, curr_reward / 5))
                        time.sleep(0.5)
                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1
                if episode_count > self.params.total_episodes and self.name == "worker_0":
                    coord.request_stop()

    def update_target_graph(self, from_vars, to_vars):
        op_holder = []
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))
        return op_holder

    def update_Target(self, op_holder, sess):
        '''run the operations defined in update_target_graph'''
        for op in op_holder:
            sess.run(op)
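# The commented-out block in Worker.train() refers to "Generalized Advantage
# Estimation". A minimal self-contained numpy version of that computation is
# below; the discount() helper is an assumption reconstructed from the
# comments, not code that appears elsewhere in this file. Discounting the
# one-step TD residuals by gamma corresponds to GAE with lambda = 1.
import numpy as np

def discount(x, gamma):
    # discounted cumulative sum: y[t] = x[t] + gamma * y[t+1]
    out = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

rewards = np.array([1.0, 1.0, 1.0])
values = np.array([0.5, 0.4, 0.2])
bootstrap_value = 0.0
gamma = 0.99

rewards_plus = np.append(rewards, bootstrap_value)
discounted_rewards = discount(rewards_plus, gamma)[:-1]  # returns R_t
value_plus = np.append(values, bootstrap_value)
# one-step TD residuals, then discounted to form the advantages
advantages = rewards + gamma * value_plus[1:] - value_plus[:-1]
advantages = discount(advantages, gamma)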
class Agent:

    def __init__(self, env, lr=0.00025, batch_size=32, gamma=0.99,
                 n_frames=3000000, start_frame=50000, anneal_frame=10**6,
                 update_freq=5000, hist_len=4, num_reward=200,
                 experience_size=10**6, check_point_path=r'../checkpoints',
                 save_freq=1000, no_ops=10, eval_times=10, restore=False):
        # environment setup
        self.env = GameEnv(env, hist_len)
        # training parameters
        self.lr = lr
        self.batch_size = batch_size
        self.gamma = gamma
        self.hist_len = hist_len
        self.experience_size = experience_size
        # frame-counting parameters (easy to mix up)
        self.n_frames = n_frames        # total number of training frames
        self.start_frame = start_frame  # frame at which epsilon annealing starts
        self.anneal_frame = anneal_frame  # frame at which epsilon reaches its final value
        self.update_freq = update_freq  # update the target network every update_freq frames
        self.num_reward = num_reward    # number of episode rewards to keep
        self.no_ops = no_ops            # maximum number of initial no-op steps
        self.eval_times = eval_times    # number of evaluation episodes (default 10)
        self.sess = tf.Session()
        self.save_freq = save_freq
        self.check_point_path = check_point_path
        n_actions = self.env.get_num_actions()
        self.action_chooser = ChooseAction(n_actions=n_actions,
                                           start_frame=self.start_frame,
                                           annealing_frame=self.anneal_frame)
        self.eval_dqn = DeepQNetwork(n_actions, batch_size=self.batch_size,
                                     lr=self.lr, name='eval_dqn')
        self.target_dqn = DeepQNetwork(n_actions, batch_size=self.batch_size,
                                       name='target_dqn')
        self.replay_memory = ReplayMemory(size=self.experience_size,
                                          batch_size=self.batch_size)
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(name="model")
        self.restore = restore
        self.step = 0

    def _restore(self):
        # restore an earlier checkpoint
        if self.restore:
            print("Checkpoint Path: ", self.check_point_path)
            print("Checkpoint to be Restored:",
                  tf.train.latest_checkpoint(self.check_point_path))
            self.saver.restore(self.sess,
                               tf.train.latest_checkpoint(self.check_point_path))

    def _save(self):
        self.saver.save(self.sess, self.check_point_path + '/model.ckpt',
                        global_step=self.step)

    def _eval(self):
        # evaluate the training result (as required by the paper)
        print("Evaluating...")
        # save the current emulator state
        internal_state, system_state = self.env.clone_state()
        # collected across episodes (the flattened original re-created this
        # list inside the loop, so only the last episode survived)
        eval_rewards = []
        for eval_episodes in range(self.eval_times):
            self.env.reset()
            start_ep = True
            eval_reward = 0
            no_op = 0
            no_ops = np.random.randint(0, self.no_ops)
            while True:
                self.env.render()
                if start_ep:
                    no_op += 1
                    action = 0
                else:
                    # frame count pinned past annealing so evaluation uses the
                    # final epsilon
                    action = self.action_chooser.choose_action(
                        self.sess, 3000000, self.env.state, self.eval_dqn)
                _, _, reward, is_done, _ = self.env.step(action)
                eval_reward += reward
                if no_op >= no_ops:  # >= so a draw of 0 no-ops also ends the no-op phase
                    start_ep = False
                if is_done:
                    eval_rewards.append(eval_reward)
                    break
        avg_eval_rewards = np.mean(eval_rewards)
        print("Evaluation average reward: {}".format(avg_eval_rewards))
        # restore the saved state and continue training
        self.env.restore_state(internal_state, system_state)

    def _learn(self):
        # sample a minibatch from replay memory
        # shapes: [None, 84, 84, 4], bool [64], [64], [None, 84, 84, 4]
        states, score_lost, actions, rewards, new_states = \
            self.replay_memory.get_minibatch()
        # best next action according to the online (eval) network
        best_action = self.sess.run(self.eval_dqn.best_action,
                                    feed_dict={self.eval_dqn.input: new_states})
        # target network's q-values for the next states
        target_q_val = self.sess.run(self.target_dqn.q_value,
                                     feed_dict={self.target_dqn.input: new_states})
        # [batch_size, best_action] --- q-value of the best action in each next state
        target_q_val = target_q_val[range(self.batch_size), best_action]
        # every lost point resets the game, so each lost point counts as a
        # terminal; following the paper, the target at a terminal is just the reward
        target_q = rewards + self.gamma * target_q_val * (1 - score_lost)
        # scatter the targets back into the full q-value matrix for backprop
        pred_q = self.sess.run(self.eval_dqn.q_value,
                               feed_dict={self.eval_dqn.input: states})
        target_q_transition = pred_q.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = actions
        target_q_transition[batch_index, eval_act_index] = target_q
        target_q = target_q_transition
        loss, _ = self.sess.run([self.eval_dqn.loss, self.eval_dqn.update],
                                feed_dict={self.eval_dqn.input: states,
                                           self.eval_dqn.target_q: target_q})
        return pred_q, target_q, loss

    def _update_target_q_network(self):
        eval_vars = self.eval_dqn.variables()
        target_vars = self.target_dqn.variables()
        update_operation = []
        for eval_var, target_var in zip(eval_vars, target_vars):
            update_operation.append(tf.assign(target_var, tf.identity(eval_var)))
        copy_operation = tf.group(*update_operation)  # a single op rather than a list
        self.sess.run(copy_operation)
        # sanity check that the parameters really were copied
        check_before = self.sess.run(eval_vars[0])
        check_after = self.sess.run(target_vars[0])
        assert (check_before == check_after).all(), "Parameters not updated"

    def train(self):
        self._restore()
        self.env.reset()
        start_ep = True
        no_op = 0
        no_ops = np.random.randint(0, self.no_ops)
        train_rewards = []
        train_reward = 0
        num_dones = 0
        print("Training for {} frames...".format(self.n_frames))
        # Training loop
        for elapsed_frame in range(0, self.n_frames):
            self.env.render()
            # pick a new action every 4 frames
            if elapsed_frame % 4 == 0:
                action = self.action_chooser.choose_action(
                    self.sess, elapsed_frame, self.env.state, self.eval_dqn)
            score_lost, observation, reward, is_done, _ = self.env.step(action)
            train_reward += reward
            # has a full game finished (one side reached 20 points)?
            if is_done:
                num_dones += 1
                if len(train_rewards) < self.num_reward:
                    train_rewards.append(train_reward)
                else:
                    train_rewards[num_dones % self.num_reward] = train_reward
                last_reward = sum(train_rewards) / len(train_rewards)
                current_reward = train_reward
                train_reward = 0
                print("Training Reward(average):", last_reward)
                print("Training Reward(current):", current_reward)
            self.replay_memory.add_experience(action, observation[:, :, 0],
                                              reward, score_lost)
            # before start_frame the agent just wanders randomly, purely to
            # populate the replay memory
            if elapsed_frame > self.start_frame:
                _, _, loss = self._learn()
                print('loss:', loss)
                self.step += 1
            if (elapsed_frame % self.update_freq == 0
                    and elapsed_frame > self.start_frame):
                print("Updating target network params", end=', ')
                print("Current number of frames elapsed: {}".format(elapsed_frame))
                self._update_target_q_network()
            if (elapsed_frame % self.save_freq == 0
                    and elapsed_frame > self.start_frame):
                # save the parameters and check the training result
                self._save()
                self._eval()
        print("Training finished")
        self.env.close()
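# For clarity, the target computation in Agent._learn() is the Double-DQN rule
# target = r + gamma * Q_target(s', argmax_a Q_eval(s', a)) * (1 - terminal).
# A tiny self-contained numpy example of that selection/evaluation split
# (all numbers are made up for illustration):
import numpy as np

gamma = 0.99
q_eval_next = np.array([[1.0, 3.0], [2.0, 0.5]])    # online net, next states
q_target_next = np.array([[0.8, 2.5], [1.9, 0.4]])  # target net, next states
rewards = np.array([0.0, 1.0])
terminal = np.array([0, 1])  # second transition lost a point -> terminal

best_action = q_eval_next.argmax(axis=1)            # [1, 0]
chosen = q_target_next[np.arange(2), best_action]   # [2.5, 1.9]
target = rewards + gamma * chosen * (1 - terminal)  # [2.475, 1.0]
print(target)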
class Ddpg_Agent():

    def __init__(self, params):
        self.env = gym.make('CartPole-v0')
        self.params = params
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.main_actor = Policy_network(params, "primary")
            tvars = tf.trainable_variables()
            tact_start_index = int(len(tvars))
            self.target_actor = Policy_network(params, "target")
            tvars = tf.trainable_variables()
            mcri_start_index = int(len(tvars))
            self.main_critic = Value_network(params, "primary")
            tvars = tf.trainable_variables()
            tcri_start_index = int(len(tvars))
            self.target_critic = Value_network(params, "target")
            self.tvars = tf.trainable_variables()
            # slice the flat trainable-variable list into the four networks
            self.main_actor_tvars = self.tvars[:tact_start_index]
            self.target_actor_tvars = self.tvars[tact_start_index:mcri_start_index]
            self.main_critic_tvars = self.tvars[mcri_start_index:tcri_start_index]
            self.target_critic_tvars = self.tvars[tcri_start_index:]
            self.main_actor.backprop(self.main_actor_tvars)
            self.init = tf.global_variables_initializer()
            self.saver = tf.train.Saver()
            if not os.path.exists(self.params.logdir):
                os.mkdir(self.params.logdir)
            self.myBuffer = ReplayMemory(max_size=self.params.max_buffer_size)
            self.running_reward = None
            self.reward_sum = 0
            self.global_step = 0
            self.actor_targetOps = self.update_TargetGraph(
                self.main_actor_tvars, self.target_actor_tvars, self.params.tau)
            self.critic_targetOps = self.update_TargetGraph(
                self.main_critic_tvars, self.target_critic_tvars, self.params.tau)

    def update_TargetGraph(self, main_tfVar, target_tfVar, tau):
        '''Build ops that softly assign the main network's values to the
        target network.
        Args:
            main_tfVar, target_tfVar - training variables (weights, biases, ...)
            tau - update rate (a low tau gives slow target updates)
        Return:
            op_holder - list of tf.assign() ops; input for update_Target'''
        assert len(main_tfVar) == len(target_tfVar)
        total_vars = len(main_tfVar)
        op_holder = []
        for idx, var in enumerate(main_tfVar[0:total_vars]):
            # assign tau * new_value + (1 - tau) * old_value
            op_holder.append(target_tfVar[idx].assign(
                (var.value() * tau) + ((1 - tau) * target_tfVar[idx].value())))
        return op_holder

    def update_Target(self, op_holder, sess):
        '''run the operations defined in update_TargetGraph'''
        for op in op_holder:
            sess.run(op)

    def _load_model(self, sess, load_ckpt):
        if load_ckpt:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(self.params.logdir)
            self.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            # initialize global variables
            print('Initialize variables...')
            sess.run(self.init)

    def train(self):
        with tf.Session(graph=self.graph) as sess:
            self._load_model(sess, self.params.load_model)
            self.total_episodes = self.params.total_episodes
            # Obtain an initial observation of the environment
            state = self.env.reset()
            state_input = state.reshape([1, self.params.input_dim])
            for episode_number in xrange(self.params.total_episodes):
                done = False
                score = 0
                while not done:
                    if self.global_step > self.params.preTrainStep:
                        # Value network update
                        trainBatch = self.myBuffer.sample(self.params.batch_size)
                        batch_state = np.array(trainBatch[0]).reshape(
                            [self.params.batch_size, self.params.input_dim])
                        batch_actions = np.array(trainBatch[1]).reshape(
                            [self.params.batch_size, self.params.num_actions])
                        batch_rewards = np.array(trainBatch[2])
                        batch_next_state = np.array(trainBatch[3]).reshape(
                            [self.params.batch_size, self.params.input_dim])
                        batch_done = np.array(trainBatch[4])
                        # 1 for non-terminal transitions, 0 for terminal ones
                        end_multiplier = -(batch_done - 1)
                        target_action = sess.run(
                            self.target_actor.det_prob,
                            feed_dict={self.target_actor.input_x: batch_next_state})
                        target_action = np.array([[1, 0] if i == 0 else [0, 1]
                                                  for i in target_action])
                        targetQ_all = sess.run(
                            self.target_critic.Qout,
                            feed_dict={self.target_critic.input_x: batch_next_state,
                                       self.target_critic.actions: target_action})
                        nextQ = np.sum(np.multiply(targetQ_all, target_action), axis=-1)
                        targetQ = batch_rewards + (self.params.gamma * nextQ *
                                                   end_multiplier)
                        pred_actions = sess.run(
                            self.main_actor.det_prob,
                            feed_dict={self.main_actor.input_x: batch_state})
                        pred_actions = np.array([[1, 0] if i == 0 else [0, 1]
                                                 for i in pred_actions])
                        # Update the critic with our target values.
                        sess.run(self.main_critic.update_value_model,
                                 feed_dict={self.main_critic.input_x: batch_state,
                                            self.main_critic.target_Q: targetQ,
                                            self.main_critic.actions: batch_actions})
                        self.update_Target(self.critic_targetOps, sess)
                        # Policy (actor) update from the critic's action gradients
                        gradients = sess.run(
                            self.main_critic.action_grads,
                            feed_dict={self.main_critic.input_x: batch_state,
                                       self.main_critic.actions: pred_actions})
                        gradients = np.array(gradients).reshape(
                            self.params.batch_size, self.params.num_actions)
                        sess.run(self.main_actor.optimize,
                                 feed_dict={self.main_actor.input_x: batch_state,
                                            self.main_actor.action_gradient: gradients})
                        self.update_Target(self.actor_targetOps, sess)
                    # Make sure the observation is in a shape the network can handle.
                    state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []
                    actor_noise = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(self.params.num_actions))
                    action = sess.run(self.main_actor.logits,
                                      feed_dict={self.main_actor.input_x: state_input}) \
                        + actor_noise()
                    action = np.argmax(action)
                    # step the environment and get new measurements
                    next_state, reward, done, _ = self.env.step(action)
                    next_state = next_state.reshape([1, self.params.input_dim])
                    state_buffer.append(state_input)
                    action_buffer.append([1, 0] if action == 0 else [0, 1])
                    reward_buffer.append(reward if not done or score == 299 else -100)
                    # reward_buffer.append(reward)
                    next_state_buffer.append(next_state)
                    done_buffer.append(done)
                    state_input = next_state  # move to the next state
                    # add up reward
                    self.reward_sum += reward
                    score += reward
                    self.global_step += 1
                    self.myBuffer.append(state_buffer, action_buffer, reward_buffer,
                                         next_state_buffer, done_buffer)
                if episode_number % self.params.update_freq == 0:
                    self.running_reward = (self.reward_sum
                                           if self.running_reward is None
                                           else self.running_reward * 0.99 +
                                           self.reward_sum * 0.01)
                    print('Current Episode {} Average reward for episode {:.2f}. '
                          'Total average reward {:.2f}.'.format(
                              episode_number,
                              self.reward_sum // self.params.update_freq,
                              self.running_reward // self.params.update_freq))
                    self.reward_sum = 0
                    time.sleep(0.5)
                self.state = self.env.reset()
                state_input = self.state.reshape([1, self.params.input_dim])
                self.global_step += 1
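# The tau-weighted assignment built in update_TargetGraph is a "soft" (Polyak)
# target update: theta_target <- tau * theta_main + (1 - tau) * theta_target.
# A minimal numpy sketch showing how the target parameters trail the main
# parameters (values chosen only for illustration):
import numpy as np

tau = 0.1
theta_main = np.array([1.0, -2.0])
theta_target = np.array([0.0, 0.0])
for step in range(5):
    theta_target = tau * theta_main + (1 - tau) * theta_target
    print(step, theta_target)  # geometrically approaches theta_main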
# Hyperparameters
MAX_STEP_EP = 50   # Maximum number of timesteps in an episode (episode length)
MAX_NUM_EP = 100   # Maximum number of episodes
GAMMA = 0.9        # Discount factor on rewards
# Parameter ranges in the order: pole mass, pole length, cart mass, friction
PAR_RANGES = np.array([[0, 1], [0, 0.75], [1, 2], [0, 1]])
BUFFER_SIZE = 128
EPISODES = 10      # Number of episodes

agent = RDPGAgent(experiment, GAMMA, PAR_RANGES)
random_env = random_cartpole_env(experiment, PAR_RANGES)
replay_buffer = ReplayMemory(BUFFER_SIZE)

for i in range(MAX_NUM_EP):
    # each episode samples different dynamics parameters for the environment
    # (see the sketch after this block)
    random_env.sample_env()
    env, env_parameters = random_env.get_sampled_env()
    state_array = env.reset()
    # the parameters passed into this constructor are yet to be decided
    episode = EpisodeMemory(env, MAX_STEP_EP)
    # The first action is a random sample from the environment's action space,
    # since we need history from the next time step in an episode before the
    # actor network can produce a policy action
    episode_reward = 0
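# PAR_RANGES stores one [low, high] interval per dynamics parameter. Assuming
# random_env.sample_env() draws each parameter uniformly from its interval (an
# assumption -- random_cartpole_env is not shown in this file), the sampling
# reduces to a single vectorized draw:
import numpy as np

par_ranges = np.array([[0, 1], [0, 0.75], [1, 2], [0, 1]])
sampled = np.random.uniform(par_ranges[:, 0], par_ranges[:, 1])
print(sampled)  # -> array([pole_mass, pole_length, cart_mass, friction])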
class AC_Agent():

    def __init__(self, params):
        self.env = gym.make('CartPole-v0')
        # self.env = gym.make('Pong-v0')
        self.params = params
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.actor = Policy_network(params)
            self.main_critic = Value_network(params, "primary")
            self.target_critic = Value_network(params, "target")
            self.init = tf.global_variables_initializer()
            if not os.path.exists(self.params.logdir):
                os.mkdir(self.params.logdir)
            self.saver = tf.train.Saver()
            self.tvars = tf.trainable_variables()
            # the three networks contribute equal-sized slices of the
            # trainable-variable list
            main_start_index = int(len(self.tvars) / 3)
            target_start_index = int(2 * len(self.tvars) / 3)
            self.actor_tvars = self.tvars[:main_start_index]
            self.main_critic_tvars = self.tvars[main_start_index:target_start_index]
            self.target_critic_tvars = self.tvars[target_start_index:]
            # self.actor.backprop(tvars=None)
            self.running_reward = None
            self.reward_sum = 0
            self.episode_number = 0
            rendering = False
            self.global_step = 0
            self.critic_targetOps = self.update_critic_TargetGraph(
                self.main_critic_tvars, self.target_critic_tvars, self.params.tau)
            self.myBuffer = ReplayMemory(max_size=self.params.max_buffer_size)

    def update_critic_TargetGraph(self, main_tfVar, target_tfVar, tau):
        '''Build ops that softly assign the main critic's values to the
        target critic.
        Args:
            main_tfVar, target_tfVar - training variables (weights, biases, ...)
            tau - update rate (a low tau gives slow target updates)
        Return:
            op_holder - list of tf.assign() ops; input for update_critic_Target'''
        assert len(main_tfVar) == len(target_tfVar)
        total_vars = len(main_tfVar)
        op_holder = []
        for idx, var in enumerate(main_tfVar[0:total_vars]):
            # assign tau * new_value + (1 - tau) * old_value
            op_holder.append(target_tfVar[idx].assign(
                (var.value() * tau) + ((1 - tau) * target_tfVar[idx].value())))
        return op_holder

    def update_critic_Target(self, op_holder, sess):
        '''run the operations defined in update_critic_TargetGraph'''
        for op in op_holder:
            sess.run(op)

    def _load_model(self, sess, load_ckpt):
        if load_ckpt:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(self.params.logdir)
            self.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            # initialize global variables
            print('Initialize variables...')
            sess.run(self.init)

    def rendering(self, rendering):
        if self.reward_sum / self.params.update_freq >= 180 or rendering:
            self.env.render()
            rendering = True

    def train(self):
        with tf.Session(graph=self.graph) as sess:
            self._load_model(sess, self.params.load_model)
            self.total_episodes = self.params.total_episodes
            # Obtain an initial observation of the environment
            self.state = self.env.reset()
            # state_input = self.prepro(self.state)
            state_input = self.state.reshape([1, self.params.input_dim])
            for self.episode_number in xrange(self.params.total_episodes):
                done = False
                score = 0
                while not done:
                    if self.global_step > self.params.preTrainStep:
                        # Value network update
                        trainBatch = self.myBuffer.sample(self.params.batch_size)
                        batch_state = np.array(trainBatch[0]).reshape(
                            [self.params.batch_size, self.params.input_dim])
                        batch_actions = np.array(trainBatch[1]).reshape(
                            [self.params.batch_size, self.params.num_actions])
                        batch_rewards = np.array(trainBatch[2])
                        batch_next_state = np.array(trainBatch[3]).reshape(
                            [self.params.batch_size, self.params.input_dim])
                        batch_done = np.array(trainBatch[4])
                        # 1 for non-terminal transitions, 0 for terminal ones
                        end_multiplier = -(batch_done - 1)
                        targetQ_all = sess.run(
                            self.target_critic.Qout,
                            feed_dict={self.target_critic.input_x: batch_next_state})
                        targetQ = batch_rewards + (self.params.gamma *
                                                   np.max(targetQ_all, axis=-1) *
                                                   end_multiplier)
                        predictedQ_all = sess.run(
                            self.main_critic.Qout,
                            feed_dict={self.main_critic.input_x: batch_state})
                        # Update the critic with our target values.
                        sess.run(self.main_critic.update_value_model,
                                 feed_dict={self.main_critic.input_x: batch_state,
                                            self.main_critic.target_Q: targetQ,
                                            self.main_critic.actions: batch_actions})
                        self.update_critic_Target(self.critic_targetOps, sess)
                        # axis=-1 makes the baseline per-sample; the flattened
                        # original took a single global max over the whole batch
                        batch_advantage = (batch_rewards +
                                           (self.params.gamma *
                                            np.max(targetQ_all, axis=-1) *
                                            end_multiplier) -
                                           np.max(predictedQ_all, axis=-1))
                        # Policy network update
                        batch_advantage = batch_advantage.reshape(
                            [self.params.batch_size, 1])
                        sess.run(self.actor.optimize,
                                 feed_dict={self.actor.input_x: batch_state,
                                            self.actor.input_y: batch_actions,
                                            self.actor.advantages: batch_advantage})
                    # Make sure the observation is in a shape the network can handle.
                    state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []
                    # Run the policy network and get an action to take.
                    curr_policy = sess.run(self.actor.probability,
                                           feed_dict={self.actor.input_x: state_input})
                    # sample the action from the predicted policy
                    action = np.random.choice(np.arange(len(curr_policy)),
                                              p=curr_policy)
                    # step the environment and get new measurements
                    next_state, reward, done, _ = self.env.step(action)
                    next_state = next_state.reshape([1, self.params.input_dim])
                    state_buffer.append(state_input)
                    action_buffer.append([1, 0] if action == 0 else [0, 1])
                    reward_buffer.append(reward if not done or score == 299 else -100)
                    # reward_buffer.append(reward)
                    next_state_buffer.append(next_state)
                    done_buffer.append(done)
                    state_input = next_state  # move to the next state
                    # add up reward
                    self.reward_sum += reward
                    score += reward
                    self.global_step += 1
                    self.myBuffer.append(state_buffer, action_buffer, reward_buffer,
                                         next_state_buffer, done_buffer)
                if self.episode_number % self.params.update_freq == 0:
                    self.running_reward = (self.reward_sum
                                           if self.running_reward is None
                                           else self.running_reward * 0.99 +
                                           self.reward_sum * 0.01)
                    print('Current Episode {} Average reward for episode {:.2f}. '
                          'Total average reward {:.2f}.'.format(
                              self.episode_number,
                              self.reward_sum // self.params.update_freq,
                              self.running_reward // self.params.update_freq))
                    self.reward_sum = 0
                    time.sleep(0.5)
                self.state = self.env.reset()
                state_input = self.state.reshape([1, self.params.input_dim])
                self.global_step += 1
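# The end_multiplier trick used throughout these agents turns the done flags
# into a bootstrap mask: done is 0/1, so -(done - 1) is 1 for non-terminal
# steps and 0 for terminal ones, zeroing gamma * Q(s') exactly at episode ends:
import numpy as np

batch_done = np.array([0, 0, 1])
end_multiplier = -(batch_done - 1)  # -> array([1, 1, 0])
rewards = np.array([1.0, 1.0, -100.0])
next_q = np.array([5.0, 4.0, 3.0])
targets = rewards + 0.99 * next_q * end_multiplier  # [5.95, 4.96, -100.0]
print(targets)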
print('---------------------------- vizDoom training script ---------------------------')
print('scenario: {}, agent: {}'.format(hp.scenario, hp.agent))
print('\ntraining parameters:')
print('n_epoch: {}, steps_per_epoch: {}, play_steps: {}'.format(
    hp.n_epoch, hp.steps_per_epoch, hp.play_steps))
print('batch_size: {}, time_size: {}, not_update: {}'.format(
    hp.batch_size, hp.time_size, hp.not_update))
print('tests_per_epoch: {}'.format(hp.tests_per_epoch))

train_env = DoomEnvironment('scenarios/' + hp.scenario + '.cfg', False,
                            hp.train_skiprate)
test_env = DoomEnvironment('scenarios/' + hp.scenario + '.cfg', False,
                           hp.test_skiprate)
er = ReplayMemory(hp.replay_size, hp.screen_size)
policy_net = agent[hp.agent](hp.scenario, 2**train_env.get_n_buttons())
target_net = agent[hp.agent](hp.scenario, 2**train_env.get_n_buttons())
optimizer = torch.optim.RMSprop(policy_net.parameters(), hp.learning_rate)
trainer = Trainer(scenario=hp.scenario,
                  cuda=hp.cuda,
                  environment=train_env,
                  test_environment=test_env,
                  experience_replay=er,
                  policy_net=policy_net,
                  target_net=target_net,
                  optimizer=optimizer,
                  not_update=hp.not_update,
                  log_folder='logs/' + hp.scenario + '/' + hp.agent)
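# The script above builds separate policy_net and target_net instances but
# does not show the initial weight sync (presumably the Trainer handles it).
# A minimal self-contained PyTorch sketch of that DQN-style sync, using a
# placeholder two-layer net rather than this project's agent classes:
import torch.nn as nn

_policy = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 2))
_target = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 2))
_target.load_state_dict(_policy.state_dict())  # copy weights policy -> target
_target.eval()  # the target network is only ever used for inference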
class Agent:
    """Our Wasted Agent :P """

    def __init__(self, sess, config, environment, evaluation_enviroment):
        # Get the session, config, environments, and create a replay memory
        self.sess = sess
        self.config = config
        self.environment = environment
        self.evaluation_enviroment = evaluation_enviroment
        if config.prm:
            self.memory = PrioritizedExperienceReplay(sess, config)
        else:
            self.memory = ReplayMemory(config.state_shape, config.rep_max_size)
        self.init_dirs()
        self.init_cur_epsiode()
        self.init_global_step()
        self.init_epsilon()
        self.init_summaries()
        # Initialize the DQN graph, which contains two networks: Target and Q
        self.estimator = DQN(sess, config, self.environment.n_actions)
        # Initialize all variables
        self.init = tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer())
        self.sess.run(self.init)
        self.saver = tf.train.Saver(max_to_keep=10)
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.sess.graph)
        if config.is_train and not config.cont_training:
            pass
        elif config.is_train and config.cont_training:
            self.load()
        elif config.is_play:
            self.load()
        else:
            raise Exception("Please set a proper mode for training or playing")

    def load(self):
        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_dir)
        if latest_checkpoint:
            print("Loading model checkpoint {}...\n".format(latest_checkpoint))
            self.saver.restore(self.sess, latest_checkpoint)

    def save(self):
        self.saver.save(self.sess, self.checkpoint_dir, self.global_step_tensor)

    def init_dirs(self):
        # Create directories for checkpoints and summaries
        self.checkpoint_dir = os.path.join(self.config.experiment_dir,
                                           "checkpoints/")
        self.summary_dir = os.path.join(self.config.experiment_dir, "summaries/")

    def init_cur_epsiode(self):
        """Create the cur-episode tensor so the training progress itself is saved"""
        with tf.variable_scope('cur_episode'):
            self.cur_episode_tensor = tf.Variable(-1, trainable=False,
                                                  name='cur_epsiode')
            self.cur_epsiode_input = tf.placeholder('int32', None,
                                                    name='cur_episode_input')
            self.cur_episode_assign_op = self.cur_episode_tensor.assign(
                self.cur_epsiode_input)

    def init_global_step(self):
        """Create a global step variable to be a reference to the number of iterations"""
        with tf.variable_scope('step'):
            self.global_step_tensor = tf.Variable(0, trainable=False,
                                                  name='global_step')
            self.global_step_input = tf.placeholder('int32', None,
                                                    name='global_step_input')
            self.global_step_assign_op = self.global_step_tensor.assign(
                self.global_step_input)

    def init_epsilon(self):
        """Create an epsilon variable"""
        with tf.variable_scope('epsilon'):
            self.epsilon_tensor = tf.Variable(self.config.initial_epsilon,
                                              trainable=False, name='epsilon')
            self.epsilon_input = tf.placeholder('float32', None,
                                                name='epsilon_input')
            self.epsilon_assign_op = self.epsilon_tensor.assign(self.epsilon_input)

    def init_summaries(self):
        """Create the summary part of the graph"""
        with tf.variable_scope('summary'):
            self.summary_placeholders = {}
            self.summary_ops = {}
            self.scalar_summary_tags = ['episode.total_reward', 'episode.length',
                                        'evaluation.total_reward',
                                        'evaluation.length', 'epsilon']
            for tag in self.scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder('float32', None,
                                                                name=tag)
                self.summary_ops[tag] = tf.summary.scalar(
                    tag, self.summary_placeholders[tag])

    def init_replay_memory(self):
        # Populate the replay memory with initial experience
        print("initializing replay memory...")
        state = self.environment.reset()
        for i in itertools.count():
            action = self.take_action(state)
            next_state, reward, done = self.observe_and_save(
                state, self.environment.valid_actions[action])
            if done:
                if self.config.prm:
                    if i >= self.config.prm_init_size:
                        break
                else:
                    if i >= self.config.replay_memory_init_size:
                        break
                state = self.environment.reset()
            else:
                state = next_state
        print("finished initializing replay memory")

    def policy_fn(self, fn_type, estimator, n_actions):
        """Define the available policy functions and choose between them"""

        def epsilon_greedy(sess, observation, epsilon):
            actions = np.ones(n_actions, dtype=float) * epsilon / n_actions
            q_values = estimator.predict(np.expand_dims(observation, 0))[0]
            best_action = np.argmax(q_values)
            actions[best_action] += (1.0 - epsilon)
            return actions

        def greedy(sess, observation):
            q_values = estimator.predict(np.expand_dims(observation, 0),
                                         type="target")[0]
            best_action = np.argmax(q_values)
            return best_action

        if fn_type == 'epsilon_greedy':
            return epsilon_greedy
        elif fn_type == 'greedy':
            return greedy
        else:
            raise Exception("Please select a proper policy function")

    def take_action(self, state):
        """Take an action based on the policy function"""
        action_probs = self.policy(self.sess, state,
                                   self.epsilon_tensor.eval(self.sess))
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        return action

    def observe_and_save(self, state, action):
        """Observe the new state and reward, then save the transition in memory"""
        next_state, reward, done = self.environment.step(action)
        self.memory.push(state, next_state, action, reward, done)
        return next_state, reward, done

    def update_target_network(self):
        """Update the target network by copying parameters between the two networks in DQN"""
        self.estimator.update_target_network()

    def add_summary(self, summaries_dict, step):
        """Add the summaries to tensorboard"""
        summary_list = self.sess.run(
            [self.summary_ops[tag] for tag in summaries_dict.keys()],
            {self.summary_placeholders[tag]: value
             for tag, value in summaries_dict.items()})
        for summary in summary_list:
            self.summary_writer.add_summary(summary, step)
        self.summary_writer.flush()

    def train_episodic(self):
        """Train the agent episodically"""
        # Initialize the epsilon step, the policy function, and the replay memory
        self.epsilon_step = ((self.config.initial_epsilon -
                              self.config.final_epsilon) /
                             self.config.exploration_steps)
        self.policy = self.policy_fn(self.config.policy_fn, self.estimator,
                                     self.environment.n_actions)
        self.init_replay_memory()
        for cur_episode in range(self.cur_episode_tensor.eval(self.sess) + 1,
                                 self.config.num_episodes, 1):
            # Save the current checkpoint
            self.save()
            # Update the cur-episode tensor
            self.cur_episode_assign_op.eval(
                session=self.sess,
                feed_dict={self.cur_epsiode_input:
                           self.cur_episode_tensor.eval(self.sess) + 1})
            # Evaluate now and then to see how the agent behaves
            if cur_episode % self.config.evaluate_every == 0:
                self.evaluate(cur_episode / self.config.evaluate_every)
            state = self.environment.reset()
            total_reward = 0
            # Take steps in the environment until the terminal state of the episode
            for t in itertools.count():
                # Update the global step
                self.global_step_assign_op.eval(
                    session=self.sess,
                    feed_dict={self.global_step_input:
                               self.global_step_tensor.eval(self.sess) + 1})
                # Time to update the target estimator?
                if self.global_step_tensor.eval(self.sess) % \
                        self.config.update_target_estimator_every == 0:
                    self.update_target_network()
                # Calculate epsilon for this time step, then take an action,
                # observe, and save
                self.epsilon_assign_op.eval(
                    {self.epsilon_input: max(self.config.final_epsilon,
                                             self.epsilon_tensor.eval(self.sess)
                                             - self.epsilon_step)},
                    self.sess)
                action = self.take_action(state)
                next_state, reward, done = self.observe_and_save(
                    state, self.environment.valid_actions[action])
                # Sample a minibatch from the replay memory
                if self.config.prm:
                    indices_batch, weights_batch, state_batch, next_state_batch, \
                        action_batch, reward_batch, done_batch = self.memory.sample()
                else:
                    state_batch, next_state_batch, action_batch, reward_batch, \
                        done_batch = self.memory.get_batch(self.config.batch_size)
                # Calculate the targets, then compute the loss
                q_values_next = self.estimator.predict(next_state_batch,
                                                       type="target")
                targets_batch = reward_batch + \
                    np.invert(done_batch).astype(np.float32) * \
                    self.config.discount_factor * np.amax(q_values_next, axis=1)
                if self.config.prm:
                    _ = self.estimator.update(state_batch, action_batch,
                                              targets_batch, weights_batch)
                else:
                    _ = self.estimator.update(state_batch, action_batch,
                                              targets_batch)
                total_reward += reward
                if done:  # terminal state, so exit the episode
                    # Add summaries to tensorboard
                    summaries_dict = {
                        'episode.total_reward': total_reward,
                        'episode.length': t,
                        'epsilon': self.epsilon_tensor.eval(self.sess)
                    }
                    self.add_summary(summaries_dict,
                                     self.global_step_tensor.eval(self.sess))
                    break
                state = next_state
        print("Training Finished")

    def train_continous(self):
        # TODO implement training on the global step only
        pass

    def play(self, n_episode=10):
        """Play greedily on the learnt policy"""
        self.policy = self.policy_fn('greedy', self.estimator,
                                     self.environment.n_actions)
        for cur_episode in range(n_episode):
            state = self.environment.reset()
            total_reward = 0
            for t in itertools.count():
                best_action = self.policy(self.sess, state)
                next_state, reward, done = self.environment.step(
                    self.environment.valid_actions[best_action])
                total_reward += reward
                if done:
                    print("Total Reward in Episode " + str(cur_episode) +
                          " = " + str(total_reward))
                    print("Total Length in Episode " + str(cur_episode) +
                          " = " + str(t))
                    break
                state = next_state

    def evaluate(self, local_step):
        print('evaluation #{0}'.format(local_step))
        policy = self.policy_fn('greedy', self.estimator,
                                self.evaluation_enviroment.n_actions)
        for cur_episode in range(self.config.evaluation_episodes):
            state = self.evaluation_enviroment.reset()
            total_reward = 0
            for t in itertools.count():
                best_action = policy(self.sess, state)
                next_state, reward, done = self.evaluation_enviroment.step(
                    self.evaluation_enviroment.valid_actions[best_action])
                total_reward += reward
                if done:
                    # Add summaries to tensorboard
                    summaries_dict = {
                        'evaluation.total_reward': total_reward,
                        'evaluation.length': t
                    }
                    self.add_summary(summaries_dict, local_step * 5 + cur_episode)
                    break
                state = next_state
        print('Finished evaluation #{0}'.format(local_step))
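# The epsilon_greedy policy above returns a probability vector rather than an
# action: every action gets epsilon / n, and the greedy action receives the
# remaining 1 - epsilon on top. A quick numeric check (values made up):
import numpy as np

n_actions, epsilon = 4, 0.2
q_values = np.array([0.1, 0.5, 2.0, -1.0])
probs = np.ones(n_actions) * epsilon / n_actions
probs[np.argmax(q_values)] += 1.0 - epsilon
print(probs)  # [0.05, 0.05, 0.85, 0.05], sums to 1
action = np.random.choice(np.arange(n_actions), p=probs)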
def train():
    parser = argparse.ArgumentParser(
        description='Train an agent in the ViZDoom environment.')
    parser.add_argument('map_name', help='path to the map config')
    parser.add_argument('--output_path', dest='output_path',
                        help='output path for agent checkpoints')
    parser.add_argument('--save_interval', dest='save_interval', default=10,
                        type=int,
                        help='interval, measured in epochs, between each agent checkpoint')
    parser.add_argument('--cuda', dest='cuda', default=False,
                        action='store_true', help='whether to use cuda')
    parser.add_argument('--log_interval', dest='log_interval', default=10,
                        type=int,
                        help='interval between each progress update log')
    parser.add_argument('--score_buffer_size', dest='score_buffer_size',
                        default=50, type=int,
                        help='the number of recent scores kept for computing statistics')
    parser.add_argument('--n_epochs', dest='n_epochs', default=1000, type=int,
                        help='number of epochs')
    parser.add_argument('--epoch_len', dest='epoch_len', default=1024, type=int,
                        help='the length of an epoch')
    parser.add_argument('--lr', dest='lr', default=2.5e-4, type=float,
                        help='learning rate')
    # store_true added; the flattened original omitted it for this flag only
    parser.add_argument('--lr_decay', dest='decay_lr', default=False,
                        action='store_true',
                        help='whether to decay the learning rate each epoch')
    parser.add_argument('--gamma', dest='gamma', default=0.99, type=float,
                        help='discount factor')
    parser.add_argument('--batch_size', dest='batch_size', default=32, type=int,
                        help='batch size')
    parser.add_argument('--alg', dest='alg', default='ppo',
                        choices=['ppo', 'dqn', 'a2c'],
                        help='the algorithm the agent will use')
    parser.add_argument('--nn', dest='nn', default='deepmind_cnn',
                        choices=['deepmind_cnn', 'capsnet'],
                        help='neural network that the agent will use as its feature network')
    parser.add_argument('--frame_skip', dest='frame_skip', default=4, type=int,
                        help='number of frames to skip each action')
    parser.add_argument('--frames_per_state', dest='frames_per_state',
                        default=4, type=int,
                        help='number of frames stacked per state')
    parser.add_argument('--state_w', dest='state_w', default=108, type=int,
                        help='target state width to resize each frame to')
    parser.add_argument('--state_h', dest='state_h', default=60, type=int,
                        help='target state height to resize each frame to')
    parser.add_argument('--state_rgb', dest='rgb', default=False,
                        action='store_true',
                        help='whether to use rgb or gray frames')
    parser.add_argument('--shape_rewards', dest='shape_rewards', default=False,
                        action='store_true',
                        help='whether to use a reward-shaping function specified for the selected map')
    parser.add_argument('--use_default_actions_for_map',
                        dest='use_default_actions', default=False,
                        action='store_true',
                        help='whether to use a default set of actions specified for the selected map')
    parser.add_argument('--ppo_lambda', dest='lam', default=0.95, type=float,
                        help='lambda value for GAE')
    parser.add_argument('--ppo_eps', dest='eps', default=0.1, type=float,
                        help='clipping parameter for PPO')
    parser.add_argument('--ppo_decay_params', dest='ppo_decay', default=False,
                        action='store_true',
                        help='whether to linearly decay the PPO learning rate and epsilon each epoch')
    parser.add_argument('--ppo_ent_coeff', dest='ent_coeff', default=0.01,
                        type=float, help='entropy coefficient for PPO')
    parser.add_argument('--ppo_value_coeff', dest='value_coeff', default=1.0,
                        type=float, help='value coefficient for PPO')
    parser.add_argument('--ppo_opt_epochs', dest='opt_epochs', default=4,
                        type=int, help='number of optimization epochs for PPO')
    parser.add_argument('--dqn_use_ddqn', dest='ddqn', default=False,
                        action='store_true',
                        help='whether to use ddqn instead of dqn')
    parser.add_argument('--dqn_dueling', dest='dueling', default=False,
                        action='store_true',
                        help='whether to use a dueling architecture in dqn')
    parser.add_argument('--dqn_min_eps', dest='min_eps', default=0.01,
                        type=float, help='minimum value of epsilon for dqn')
    parser.add_argument('--dqn_mem_size', dest='memory_size', default=100000,
                        type=int, help='replay memory size for dqn')
    parser.add_argument('--dqn_init_size', dest='init_size', default=10000,
                        type=int,
                        help='number of timesteps before dqn starts learning')
    parser.add_argument('--dqn_q_update_interval', dest='q_update_interval',
                        default=1, type=int,
                        help='the interval between updates of the q function')
    parser.add_argument('--dqn_target_update_interval',
                        dest='target_update_interval', default=1000, type=int,
                        help='the interval between updates of the target q function')
    args = parser.parse_args()

    game = initialize_vizdoom(args.map_name)
    if args.use_default_actions:
        actions = default_actions_for_map(game, args.map_name)
    else:
        actions = all_actions(game)
    reward_fn = default_reward_shaping(args.map_name) if args.shape_rewards else None
    in_channels = args.frames_per_state * (3 if args.rgb else 1)
    if args.nn == 'deepmind_cnn':
        feature_net = CNN(in_channels)
    elif args.nn == 'capsnet':
        feature_net = CapsNet(in_channels)
    if args.alg == 'ppo':
        policy = ActorCriticPolicy(feature_net, len(actions))
        optimizer = torch.optim.Adam(policy.parameters(), lr=args.lr)
        eps_sched = LinearSchedule("eps", args.eps, 1, args.n_epochs,
                                   end_val=1.0 if not args.ppo_decay else 0.0)
        lr_sched = LRWrapper(
            optimizer,
            LinearSchedule("lr", args.lr, 1, args.n_epochs,
                           end_val=1.0 if not args.ppo_decay else 0.0))
        schedules = [lr_sched, eps_sched]
        agent = PPOAgent(policy, optimizer, eps_sched,
                         cuda=args.cuda,
                         n_timesteps=args.epoch_len,
                         batch_size=args.batch_size,
                         opt_epochs=args.opt_epochs,
                         gamma=args.gamma,
                         lam=args.lam,
                         entropy_coeff=args.ent_coeff,
                         value_coeff=args.value_coeff)
    elif args.alg == 'a2c':
        policy = ActorCriticPolicy(feature_net, len(actions))
        optimizer = torch.optim.Adam(policy.parameters(), lr=args.lr)
        lr_sched = LRWrapper(
            optimizer,
            LinearSchedule("lr", args.lr, 1, args.n_epochs,
                           end_val=1.0 if not args.decay_lr else 0.0))
        schedules = [lr_sched]
        agent = A2CAgent(policy, optimizer, args.cuda, args.gamma, args.epoch_len)
    elif args.alg == 'dqn':
        q = QNetwork(feature_net, len(actions))
        tq = QNetwork(feature_net, len(actions))
        optimizer = torch.optim.Adam(q.parameters(), lr=args.lr)
        memory = ReplayMemory(args.memory_size)
        eps_sched = LinearSchedule("eps", 1, 1, args.n_epochs,
                                   end_val=args.min_eps)
        lr_sched = LRWrapper(
            optimizer,
            LinearSchedule("lr", args.lr, 1, args.n_epochs,
                           end_val=1.0 if not args.decay_lr else 0.0))
        schedules = [lr_sched, eps_sched]
        agent = DQNAgent(q, tq, optimizer, memory, eps_sched,
                         cuda=args.cuda,
                         init_steps=args.init_size,
                         q_update_interval=args.q_update_interval,
                         target_update_interval=args.target_update_interval,
                         ddqn=args.ddqn,
                         gamma=args.gamma,
                         batch_size=args.batch_size)

    progress_monitor = ProgressMonitor(args.score_buffer_size,
                                       monitor_interval=args.log_interval)
    env_params = {
        "env": {
            "frame_skip": args.frame_skip,
            "frames_per_state": args.frames_per_state,
            "state_dim": (3 if args.rgb else 1, args.state_h, args.state_w),
            "actions": actions
        },
        "agent": {
            "alg": args.alg,
            "nn": args.nn
        },
        "save_path": args.output_path,
        "save_interval": args.save_interval,
        "progress_monitor": progress_monitor,
        "map_name": args.map_name
    }
    if args.output_path:
        checkpoint_monitor = CheckpointMonitor(env_params, agent)
        monitors = [checkpoint_monitor, progress_monitor]
    else:
        monitors = [progress_monitor]
    generator = TrajectoryGenerator(game, args.n_epochs, args.epoch_len, agent,
                                    shape_reward_fn=reward_fn,
                                    monitors=monitors,
                                    param_schedules=schedules,
                                    **env_params["env"])
    generator.run()
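# Both the PPO and DQN branches above drive exploration and learning-rate
# decay through LinearSchedule("name", start, start_epoch, n_epochs,
# end_val=...). That class is not shown in this file; a minimal sketch of the
# linear interpolation it implies (name and signature are assumptions):
def linear_schedule(start_val, end_val, epoch, n_epochs):
    # clamp progress to [0, 1], then interpolate from start_val to end_val
    frac = min(max(epoch / float(n_epochs), 0.0), 1.0)
    return start_val + frac * (end_val - start_val)

# e.g. epsilon annealed from 1.0 to 0.01 over 1000 epochs:
# linear_schedule(1.0, 0.01, 500, 1000) -> 0.505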