def __init__(self, master, env, task_id, n_iter, start_at_iter=0):
    """Set up a learner thread for one task variation.

    Args:
        master: Owning agent; supplies the shared session, config,
            states placeholder and summary infrastructure.
        env: Environment for this specific task.
        task_id: Index of the task; used to name the summary directory.
        n_iter: Iteration number to learn until.
        start_at_iter: Iteration number to start counting from.
    """
    super(AKTThread, self).__init__()
    self.master = master
    self.config = self.master.config
    self.task_id = task_id
    self.nA = env.action_space.n  # number of discrete actions
    self.n_iter = n_iter
    self.start_at_iter = start_at_iter
    self.add_accum_grad = None  # To be filled in later

    # Builds the per-task head (softmax over sparse representation).
    self.build_networks()
    self.states = self.master.states
    self.session = self.master.session
    self.task_runner = EnvRunner(env, TaskPolicy(self.action, self), self.master.config)
    # Write the summary of each task in a different directory
    self.writer = tf.summary.FileWriter(
        os.path.join(self.master.monitor_path, "task" + str(self.task_id)),
        self.master.session.graph)

    self.optimizer = tf.train.RMSPropOptimizer(
        learning_rate=self.config["learning_rate"],
        decay=self.config["decay"],
        epsilon=self.config["epsilon"])
def __init__(self, env_id, task_id, comm, monitor_path, config, seed=None):
    """Set up a DPPO worker process.

    Args:
        env_id: Identifier passed to ``make`` to build the environment.
        task_id: Worker index; used to name the summary directory.
        comm: MPI-style communicator used for Bcast/gather with the master.
        monitor_path: Base directory for summaries.
        config: Shared configuration dictionary.
        seed: Optional environment seed.
    """
    super(DPPOWorker, self).__init__()
    self.comm = comm
    self.config = config
    self.env = make(env_id)
    self.task_id = task_id
    if seed is not None:
        self.env.seed(seed)
    self.writer = tf.summary.FileWriter(
        os.path.join(monitor_path, "task{}".format(task_id)))
    # Only used (and overwritten) by agents that use an RNN
    self.initial_features = None
    with tf.device('/cpu:0'):
        with tf.variable_scope(
                "new_network"):  # The workers only have 1 network
            self.global_network = self.build_networks()
            self.states = self.global_network.states
            self.action = self.global_network.action
            self.value = self.global_network.value
            self.global_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                tf.get_variable_scope().name)
        # NOTE(review): placed at device scope, outside "new_network" —
        # confirm against the master's variable naming.
        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)
    self.env_runner = EnvRunner(self.env, self, {}, summary_writer=self.writer)
def __init__(self, env, monitor_path, video=True, **usercfg):
    """Set up an Advantage Actor Critic agent.

    Args:
        env: Gym-style environment to learn on.
        monitor_path: Directory for monitor output and summaries.
        video: Whether to record videos while monitoring.
        **usercfg: Configuration overrides.
    """
    super(A2C, self).__init__(**usercfg)
    self.monitor_path = monitor_path
    self.env = wrappers.Monitor(env, monitor_path, force=True, video_callable=(None if video else False))
    # NOTE(review): runner is built before the defaults below are merged,
    # so it only sees the user-supplied config — confirm this is intended.
    self.env_runner = EnvRunner(self.env, self, usercfg)
    self.config.update(
        dict(timesteps_per_batch=10000,
             trajectories_per_batch=10,
             batch_update="timesteps",
             n_iter=100,
             gamma=0.99,
             actor_learning_rate=0.01,
             critic_learning_rate=0.05,
             actor_n_hidden=20,
             critic_n_hidden=20,
             repeat_n_actions=1,
             save_model=False))
    self.config.update(usercfg)
    self.build_networks()
    init = tf.global_variables_initializer()
    # Launch the graph.
    self.session = tf.Session()
    self.session.run(init)
    if self.config["save_model"]:
        tf.add_to_collection("action", self.action)
        tf.add_to_collection("states", self.states)
        self.saver = tf.train.Saver()
    # Placeholders so episode statistics can be written to the same summary op.
    self.rewards = tf.placeholder("float", name="Rewards")
    self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
    summary_actor_loss = tf.summary.scalar("Actor_loss", self.summary_actor_loss)
    summary_critic_loss = tf.summary.scalar("Critic_loss", self.summary_critic_loss)
    summary_rewards = tf.summary.scalar("Rewards", self.rewards)
    summary_episode_lengths = tf.summary.scalar("Episode_lengths", self.episode_lengths)
    self.summary_op = tf.summary.merge([
        summary_actor_loss, summary_critic_loss, summary_rewards,
        summary_episode_lengths
    ])
    self.writer = tf.summary.FileWriter(
        os.path.join(self.monitor_path, "summaries"), self.session.graph)
    return
def __init__(self, envs, monitor_path, **usercfg):
    """Set up a knowledge-transfer agent learning over several task variations.

    Args:
        envs: List of environments, one per task; all are assumed to share
            the same (discrete) action space.
        monitor_path: Directory for monitor output and summaries.
        **usercfg: Configuration overrides.
    """
    super(KnowledgeTransfer, self).__init__(**usercfg)
    self.envs = envs
    self.n_tasks = len(envs)
    self.monitor_path = monitor_path
    self.nA = envs[0].action_space.n  # action count taken from the first env
    self.config.update(
        dict(
            timesteps_per_batch=10000,
            trajectories_per_batch=10,
            batch_update="timesteps",
            n_iter=100,
            switch_at_iter=None,
            gamma=0.99,  # Discount past rewards by a percentage
            decay=0.9,  # Decay of RMSProp optimizer
            epsilon=1e-9,  # Epsilon of RMSProp optimizer
            learning_rate=0.005,
            n_hidden_units=10,
            repeat_n_actions=1,
            n_sparse_units=10,
            feature_extraction=False))
    self.config.update(usercfg)

    self.build_networks()
    # One runner per task, each driven by that task's action tensor.
    self.task_runners = [
        EnvRunner(envs[i], TaskPolicy(action, self), self.config)
        for i, action in enumerate(self.action_tensors)
    ]
    # "save_model" is not in the defaults above; presumably the superclass
    # supplies it — TODO confirm.
    if self.config["save_model"]:
        for action_tensor in self.action_tensors:
            tf.add_to_collection("action", action_tensor)
        tf.add_to_collection("states", self.states)
        self.saver = tf.train.Saver()
def __init__(self, env, monitor_path, video=True, **usercfg):
    """Set up a REINFORCE agent.

    Args:
        env: Gym-style environment to learn on.
        monitor_path: Directory for monitor output and summaries.
        video: Whether to record videos while monitoring.
        **usercfg: Configuration overrides.
    """
    super(REINFORCE, self).__init__(**usercfg)
    self.env = wrappers.Monitor(env, monitor_path, force=True, video_callable=(None if video else False))
    # NOTE(review): runner is built before the defaults below are merged,
    # so it only sees the user-supplied config — confirm this is intended.
    self.env_runner = EnvRunner(self.env, self, usercfg)
    self.monitor_path = monitor_path
    # Default configuration. Can be overwritten using keyword arguments.
    self.config.update(
        dict(
            batch_update="timesteps",
            timesteps_per_batch=1000,
            n_iter=100,
            gamma=0.99,  # Discount past rewards by a percentage
            decay=0.9,  # Decay of RMSProp optimizer
            epsilon=1e-9,  # Epsilon of RMSProp optimizer
            learning_rate=0.05,
            n_hidden_units=20,
            repeat_n_actions=1,
            save_model=False))
    self.config.update(usercfg)

    self.build_network()
    self.make_trainer()

    init = tf.global_variables_initializer()
    # Launch the graph.
    self.session = tf.Session()
    self.session.run(init)
    if self.config["save_model"]:
        tf.add_to_collection("action", self.action)
        tf.add_to_collection("states", self.states)
        self.saver = tf.train.Saver()
    # Placeholders so episode statistics can be written to the same summary op.
    self.rewards = tf.placeholder("float", name="Rewards")
    self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
    summary_loss = tf.summary.scalar("Loss", self.summary_loss)
    summary_rewards = tf.summary.scalar("Rewards", self.rewards)
    summary_episode_lengths = tf.summary.scalar("Episode_lengths", self.episode_lengths)
    self.summary_op = tf.summary.merge(
        [summary_loss, summary_rewards, summary_episode_lengths])
    self.writer = tf.summary.FileWriter(
        os.path.join(self.monitor_path, "task0"), self.session.graph)
class REINFORCE(Agent):
    """REINFORCE policy-gradient agent with a time-dependent baseline."""

    def __init__(self, env, monitor_path, video=True, **usercfg):
        """Set up the agent: environment monitor, config, network and summaries.

        Args:
            env: Gym-style environment to learn on.
            monitor_path: Directory for monitor output and summaries.
            video: Whether to record videos while monitoring.
            **usercfg: Configuration overrides.
        """
        super(REINFORCE, self).__init__(**usercfg)
        self.env = wrappers.Monitor(env, monitor_path, force=True, video_callable=(None if video else False))
        # NOTE(review): runner is built before the defaults below are merged,
        # so it only sees the user-supplied config — confirm this is intended.
        self.env_runner = EnvRunner(self.env, self, usercfg)
        self.monitor_path = monitor_path
        # Default configuration. Can be overwritten using keyword arguments.
        self.config.update(
            dict(
                batch_update="timesteps",
                timesteps_per_batch=1000,
                n_iter=100,
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.05,
                n_hidden_units=20,
                repeat_n_actions=1,
                save_model=False))
        self.config.update(usercfg)

        self.build_network()
        self.make_trainer()

        init = tf.global_variables_initializer()
        # Launch the graph.
        self.session = tf.Session()
        self.session.run(init)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = tf.train.Saver()
        # Placeholders so episode statistics flow into the merged summary op.
        self.rewards = tf.placeholder("float", name="Rewards")
        self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
        summary_loss = tf.summary.scalar("Loss", self.summary_loss)
        summary_rewards = tf.summary.scalar("Rewards", self.rewards)
        summary_episode_lengths = tf.summary.scalar("Episode_lengths", self.episode_lengths)
        self.summary_op = tf.summary.merge(
            [summary_loss, summary_rewards, summary_episode_lengths])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "task0"), self.session.graph)

    def choose_action(self, state):
        """Choose an action by sampling from the policy network."""
        action = self.session.run([self.action], feed_dict={self.states: [state]})[0]
        return action

    def learn(self):
        """Run the REINFORCE learning algorithm for ``n_iter`` iterations."""
        reporter = Reporter()
        config = self.config
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.env_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_state = np.concatenate(
                [trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory["reward"], config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            # Pad returns with zeros so trajectories of unequal length can be averaged.
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))])
                for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths
            summary, _ = self.session.run(
                [self.summary_op, self.train],
                feed_dict={
                    self.states: all_state,
                    self.a_n: all_action,
                    self.adv_n: all_adv,
                    self.episode_lengths: np.mean(episode_lengths),
                    self.rewards: np.mean(episode_rewards)
                })
            self.writer.add_summary(summary, iteration)
            self.writer.flush()

            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
        if self.config["save_model"]:
            self.saver.save(self.session, os.path.join(self.monitor_path, "model"))
class DPPOWorker(object):
    """Distributed Proximal Policy Optimization worker.

    Receives network weights from the master via broadcast, collects
    experience, computes GAE advantages and gathers the processed batch
    back to the master.
    """

    def __init__(self, env_id, task_id, comm, monitor_path, config, seed=None):
        """Set up the worker: environment, summary writer and local network.

        Args:
            env_id: Identifier passed to ``make`` to build the environment.
            task_id: Worker index; used to name the summary directory.
            comm: MPI-style communicator shared with the master (rank 0).
            monitor_path: Base directory for summaries.
            config: Shared configuration dictionary.
            seed: Optional environment seed.
        """
        super(DPPOWorker, self).__init__()
        self.comm = comm
        self.config = config
        self.env = make(env_id)
        self.task_id = task_id
        if seed is not None:
            self.env.seed(seed)
        self.writer = tf.summary.FileWriter(
            os.path.join(monitor_path, "task{}".format(task_id)))
        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None
        with tf.device('/cpu:0'):
            with tf.variable_scope(
                    "new_network"):  # The workers only have 1 network
                self.global_network = self.build_networks()
                self.states = self.global_network.states
                self.action = self.global_network.action
                self.value = self.global_network.value
                self.global_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES,
                    tf.get_variable_scope().name)
            # NOTE(review): placed at device scope, outside "new_network" —
            # confirm against the master's variable naming.
            self._global_step = tf.get_variable(
                "global_step", [],
                tf.int32,
                initializer=tf.constant_initializer(0, dtype=tf.int32),
                trainable=False)
        self.env_runner = EnvRunner(self.env, self, {}, summary_writer=self.writer)

    def build_networks(self):
        """Build the worker's network. Must be implemented by subclasses."""
        raise NotImplementedError

    def run(self):
        """Main worker loop: sync weights, collect steps, gather GAE batches."""
        with tf.Session() as sess, sess.as_default():
            # Pre-allocated numpy buffers matching each trainable variable.
            var_receivers = [
                np.zeros(var.shape.as_list(), dtype=var.dtype.as_numpy_dtype)
                for var in self.global_vars
            ]
            while True:
                # Receive the master's current weights and load them locally.
                for var_receiver, tf_var in zip(var_receivers, self.global_vars):
                    self.comm.Bcast(var_receiver, root=0)
                    tf_var.load(var_receiver)
                experiences = self.env_runner.get_steps(
                    self.config["n_local_steps"], stop_at_trajectory_end=False)
                T = experiences.steps
                # Bootstrap with the critic unless the last step was terminal.
                value = 0 if experiences.terminals[
                    -1] else self.get_critic_value(
                        np.asarray(experiences.states)[None, -1],
                        experiences.features[-1])
                vpred = np.asarray(experiences.values + [value])
                gamma = self.config["gamma"]
                lambda_ = self.config["gae_lambda"]
                terminals = np.append(experiences.terminals, 0)
                # Generalized Advantage Estimation, computed backwards in time.
                gaelam = advantages = np.empty(T, 'float32')
                last_gaelam = 0
                for t in reversed(range(T)):
                    nonterminal = 1 - terminals[t + 1]
                    delta = experiences.rewards[t] + gamma * vpred[
                        t + 1] * nonterminal - vpred[t]
                    gaelam[
                        t] = last_gaelam = delta + gamma * lambda_ * nonterminal * last_gaelam
                returns = advantages + experiences.values
                processed = experiences.states, experiences.actions, advantages, returns, experiences.features[
                    0]
                # Send the processed batch to the master for the actual update.
                self.comm.gather(processed, root=0)

    @property
    def global_step(self):
        """Current value of the global step counter."""
        return self._global_step.eval()

    def get_critic_value(self, state, *rest):
        """Return the critic's value estimate(s) for ``state`` (flattened)."""
        fetches = [self.global_network.value]
        feed_dict = {self.global_network.states: state}
        value = tf.get_default_session().run(fetches, feed_dict=feed_dict)[0].flatten()
        return value

    def choose_action(self, state, *rest):
        """Sample an action and value estimate for a single state."""
        fetches = [self.global_network.action, self.global_network.value]
        feed_dict = {self.global_network.states: [state]}
        action, value = tf.get_default_session().run(fetches, feed_dict=feed_dict)
        return {"action": action, "value": value[0]}

    def get_env_action(self, action):
        """Convert a one-hot/probability action vector to an env action index."""
        return np.argmax(action)

    def new_trajectory(self):
        """Hook called when a new trajectory starts; nothing to reset here."""
        pass
class AKTThread(Thread):
    """Asynchronous knowledge transfer learner thread.

    Used to learn using one specific variation of a task.
    """

    def __init__(self, master, env, task_id, n_iter, start_at_iter=0):
        """Set up the thread: per-task head, runner, writer and optimizer.

        Args:
            master: Owning agent; supplies session, config, shared layers
                and placeholders.
            env: Environment for this task variation.
            task_id: Index of the task; used for scoping and summary dirs.
            n_iter: Iteration number to learn until.
            start_at_iter: Iteration number to start counting from.
        """
        super(AKTThread, self).__init__()
        self.master = master
        self.config = self.master.config
        self.task_id = task_id
        self.nA = env.action_space.n  # number of discrete actions
        self.n_iter = n_iter
        self.start_at_iter = start_at_iter
        self.add_accum_grad = None  # To be filled in later

        self.build_networks()
        self.states = self.master.states
        self.session = self.master.session
        self.task_runner = EnvRunner(env, TaskPolicy(self.action, self), self.master.config)
        # Write the summary of each task in a different directory
        self.writer = tf.summary.FileWriter(
            os.path.join(self.master.monitor_path, "task" + str(self.task_id)),
            self.master.session.graph)

        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.config["learning_rate"],
            decay=self.config["decay"],
            epsilon=self.config["epsilon"])

    def build_networks(self):
        """Build the per-task policy head on top of the master's shared layers."""
        with tf.variable_scope("task{}".format(self.task_id)):
            # Task-specific sparse combination of the shared knowledge base.
            self.sparse_representation = tf.Variable(
                tf.truncated_normal(
                    [self.master.config["n_sparse_units"], self.nA],
                    mean=0.0,
                    stddev=0.02))
            self.probs = tf.nn.softmax(
                tf.matmul(
                    self.master.L1,
                    tf.matmul(self.master.knowledge_base, self.sparse_representation)))

            self.action = tf.squeeze(tf.multinomial(tf.log(self.probs), 1),
                                     name="action")

            # Probability assigned to the action that was actually taken.
            good_probabilities = tf.reduce_sum(tf.multiply(
                self.probs,
                tf.one_hot(tf.cast(self.master.action_taken, tf.int32), self.nA)),
                reduction_indices=[1])
            # 1e-10 guards against log(0).
            eligibility = tf.log(good_probabilities + 1e-10) * self.master.advantage
            self.loss = -tf.reduce_sum(eligibility)

    def run(self):
        """Run the appropriate learning algorithm."""
        if self.master.learning_method == "REINFORCE":
            self.learn_REINFORCE()
        else:
            self.learn_Karpathy()

    def learn_REINFORCE(self):
        """Learn using updates like in the REINFORCE algorithm."""
        reporter = Reporter()
        total_n_trajectories = 0
        iteration = self.start_at_iter
        while iteration < self.n_iter and not self.master.stop_requested:
            iteration += 1
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.task_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_state = np.concatenate(
                [trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory["reward"], self.config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            # Pad returns with zeros so trajectories of unequal length can be averaged.
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))])
                for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths
            results = self.master.session.run(
                [self.loss, self.apply_grad],
                feed_dict={
                    self.master.states: all_state,
                    self.master.action_taken: all_action,
                    self.master.advantage: all_adv
                })
            print("Task:", self.task_id)
            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
            summary = self.master.session.run(
                [self.master.summary_op],
                feed_dict={
                    self.master.loss: results[0],
                    self.master.reward: np.mean(episode_rewards),
                    self.master.episode_length: np.mean(episode_lengths)
                })
            self.writer.add_summary(summary[0], iteration)
            self.writer.flush()

    def learn_Karpathy(self):
        """Learn using updates like in the Karpathy algorithm."""
        iteration = self.start_at_iter
        while iteration < self.n_iter and not self.master.stop_requested:
            # Keep executing episodes until the master requests a stop (e.g. using SIGINT)
            iteration += 1
            trajectory = self.task_runner.get_trajectory()
            reward = sum(trajectory["reward"])
            action_taken = trajectory["action"]
            discounted_episode_rewards = discount_rewards(
                trajectory["reward"], self.config["gamma"])
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            std = np.std(discounted_episode_rewards)
            std = std if std > 0 else 1  # avoid division by zero on constant rewards
            discounted_episode_rewards /= std
            feedback = discounted_episode_rewards

            results = self.master.session.run(
                [self.loss, self.apply_grad],
                feed_dict={
                    self.master.states: trajectory["state"],
                    self.master.action_taken: action_taken,
                    self.master.advantage: feedback
                })
            results = self.master.session.run(
                [self.master.summary_op],
                feed_dict={
                    self.master.loss: results[0],
                    self.master.reward: reward,
                    self.master.episode_length: trajectory["steps"]
                })
            self.writer.add_summary(results[0], iteration)
            self.writer.flush()
def __init__(self, env, monitor_path: str, video: bool = True, **usercfg) -> None:
    """Set up an Advantage Actor Critic agent.

    Args:
        env: Gym-style environment to learn on.
        monitor_path: Directory for monitor output and summaries.
        video: Whether to record videos while monitoring.
        **usercfg: Configuration overrides.
    """
    super(A2C, self).__init__(**usercfg)
    self.monitor_path = monitor_path

    self.env = wrappers.Monitor(env, monitor_path, force=True, video_callable=(None if video else False))

    self.config.update(
        dict(n_iter=100,
             gamma=0.99,
             learning_rate=0.001,
             n_hidden_units=20,
             n_hidden_layers=1,
             gradient_clip_value=0.5,
             n_local_steps=20,
             vf_coef=0.5,
             entropy_coef=0.01,
             loss_reducer="mean",
             save_model=False))
    self.config.update(usercfg)
    # Only used (and overwritten) by agents that use an RNN
    self.initial_features = None
    self.ac_net = None  # Overwritten by build_networks
    self.build_networks()

    self.action = self.ac_net.action
    self.states = self.ac_net.states
    self.actions_taken = self.ac_net.actions_taken
    self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
    self.ret = tf.placeholder(tf.float32, [None], name="return")

    self.actor_loss, self.critic_loss, self.loss = self.make_loss()

    self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  tf.get_variable_scope().name)

    self._global_step = tf.get_variable(
        "global_step", [],
        tf.int32,
        initializer=tf.constant_initializer(0, dtype=tf.int32),
        trainable=False)

    self.optimizer = tf.train.AdamOptimizer(self.config["learning_rate"],
                                            name="optim")
    grads = tf.gradients(self.loss, self.vars)
    grads, _ = tf.clip_by_global_norm(grads, self.config["gradient_clip_value"])

    # Apply gradients to the weights of the master network
    apply_grads = self.optimizer.apply_gradients(zip(grads, self.vars))

    self.n_steps = tf.shape(self.states)[0]
    inc_step = self._global_step.assign_add(self.n_steps)
    self.train_op = tf.group(apply_grads, inc_step)

    init = tf.global_variables_initializer()
    # Launch the graph.
    self.session = tf.Session()
    self.session.run(init)
    if self.config["save_model"]:
        tf.add_to_collection("action", self.action)
        tf.add_to_collection("states", self.states)
        self.saver = FastSaver()
    # Losses are normalized by batch size before being logged.
    n_steps = tf.to_float(self.n_steps)
    actor_loss_summary = tf.summary.scalar(
        "model/actor_loss", tf.squeeze(self.actor_loss / n_steps))
    critic_loss_summary = tf.summary.scalar(
        "model/critic_loss", tf.squeeze(self.critic_loss / n_steps))
    loss_summary = tf.summary.scalar("model/loss",
                                     tf.squeeze(self.loss / n_steps))
    self.loss_summary_op = tf.summary.merge(
        [actor_loss_summary, critic_loss_summary, loss_summary])
    self.writer = tf.summary.FileWriter(
        os.path.join(self.monitor_path, "summaries"), self.session.graph)
    self.env_runner = EnvRunner(self.env, self, usercfg, summary_writer=self.writer)
    return
class A2C(Agent):
    """Advantage Actor Critic agent.

    Collects ``n_local_steps`` of experience per iteration, computes
    discounted returns and GAE-style advantages, and trains an
    actor-critic network built by a subclass.
    """

    def __init__(self, env, monitor_path: str, video: bool = True, **usercfg) -> None:
        """Set up the agent: environment monitor, config, graph and summaries.

        Args:
            env: Gym-style environment to learn on.
            monitor_path: Directory for monitor output and summaries.
            video: Whether to record videos while monitoring.
            **usercfg: Configuration overrides.
        """
        super(A2C, self).__init__(**usercfg)
        self.monitor_path = monitor_path

        self.env = wrappers.Monitor(env, monitor_path, force=True, video_callable=(None if video else False))

        self.config.update(
            dict(n_iter=100,
                 gamma=0.99,
                 learning_rate=0.001,
                 n_hidden_units=20,
                 n_hidden_layers=1,
                 gradient_clip_value=0.5,
                 n_local_steps=20,
                 vf_coef=0.5,
                 entropy_coef=0.01,
                 loss_reducer="mean",
                 save_model=False))
        self.config.update(usercfg)
        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None
        self.ac_net = None  # Overwritten by build_networks
        self.build_networks()

        self.action = self.ac_net.action
        self.states = self.ac_net.states
        self.actions_taken = self.ac_net.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.actor_loss, self.critic_loss, self.loss = self.make_loss()

        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.optimizer = tf.train.AdamOptimizer(self.config["learning_rate"],
                                                name="optim")
        grads = tf.gradients(self.loss, self.vars)
        grads, _ = tf.clip_by_global_norm(grads, self.config["gradient_clip_value"])

        # Apply gradients to the weights of the master network
        apply_grads = self.optimizer.apply_gradients(zip(grads, self.vars))

        self.n_steps = tf.shape(self.states)[0]
        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        init = tf.global_variables_initializer()
        # Launch the graph.
        self.session = tf.Session()
        self.session.run(init)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        # Losses are normalized by batch size before being logged.
        n_steps = tf.to_float(self.n_steps)
        actor_loss_summary = tf.summary.scalar(
            "model/actor_loss", tf.squeeze(self.actor_loss / n_steps))
        critic_loss_summary = tf.summary.scalar(
            "model/critic_loss", tf.squeeze(self.critic_loss / n_steps))
        loss_summary = tf.summary.scalar("model/loss",
                                         tf.squeeze(self.loss / n_steps))
        self.loss_summary_op = tf.summary.merge(
            [actor_loss_summary, critic_loss_summary, loss_summary])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(self.env, self, usercfg, summary_writer=self.writer)
        return

    def build_networks(self):
        """Build the actor-critic network. Must be implemented by subclasses.

        Fix: the original *returned* a ``NotImplementedError`` instance,
        which left ``self.ac_net`` unset and surfaced later as an
        unrelated ``AttributeError``; raising makes abstract use explicit.
        """
        raise NotImplementedError("Abstract method")

    def make_loss(self):
        """Return ``(actor_loss, critic_loss, loss)``. Must be implemented by subclasses.

        Fix: raise instead of returning the exception instance (see
        ``build_networks``).
        """
        raise NotImplementedError("Abstract method")

    @property
    def global_step(self):
        """Current value of the global step counter."""
        return self._global_step.eval(session=self.session)

    def get_critic_value(self, state, features):
        """Return the critic's value estimate(s) for ``state`` (flattened)."""
        return self.session.run([self.ac_net.value],
                                feed_dict={self.states: state})[0].flatten()

    def choose_action(self, state, features) -> dict:
        """Sample an action and value estimate for a single state."""
        action, value = self.session.run(
            [self.ac_net.action, self.ac_net.value],
            feed_dict={self.states: [state]})
        return {"action": action, "value": value[0]}

    def get_env_action(self, action) -> int:
        """Convert a one-hot/probability action vector to an env action index."""
        return np.argmax(action)

    def learn(self):
        """Run learning algorithm"""
        config = self.config
        for _ in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectory = self.env_runner.get_steps(
                self.config["n_local_steps"])
            # Bootstrap with the critic unless the last step was terminal.
            v = 0 if trajectory.terminals[-1] else self.get_critic_value(
                np.asarray(trajectory.states)[None, -1],
                trajectory.features[-1])
            rewards_plus_v = np.asarray(trajectory.rewards + [v])
            vpred_t = np.asarray(trajectory.values + [v])
            delta_t = trajectory.rewards + \
                self.config["gamma"] * vpred_t[1:] - vpred_t[:-1]
            batch_r = discount_rewards(rewards_plus_v, self.config["gamma"])[:-1]
            batch_adv = discount_rewards(delta_t, self.config["gamma"])
            fetches = [self.loss_summary_op, self.train_op, self._global_step]
            states = np.asarray(trajectory.states)
            feed_dict = {
                self.states: states,
                self.actions_taken: np.asarray(trajectory.actions),
                self.advantage: batch_adv,
                self.ret: np.asarray(batch_r)
            }
            feature = trajectory.features[0]
            if feature != [] and feature is not None:
                feed_dict[self.ac_net.rnn_state_in] = feature
            summary, _, global_step = self.session.run(fetches, feed_dict)
            self.writer.add_summary(summary, global_step)
            self.writer.flush()

        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
class A2C(Agent):
    """Advantage Actor Critic agent (trajectory-batch variant).

    Collects whole trajectories per iteration and performs separate
    actor and critic update steps on discounted returns.
    """

    def __init__(self, env, monitor_path, video=True, **usercfg):
        """Set up the agent: environment monitor, config, graph and summaries.

        Args:
            env: Gym-style environment to learn on.
            monitor_path: Directory for monitor output and summaries.
            video: Whether to record videos while monitoring.
            **usercfg: Configuration overrides.
        """
        super(A2C, self).__init__(**usercfg)
        self.monitor_path = monitor_path
        self.env = wrappers.Monitor(env, monitor_path, force=True, video_callable=(None if video else False))
        # NOTE(review): runner is built before the defaults below are merged,
        # so it only sees the user-supplied config — confirm this is intended.
        self.env_runner = EnvRunner(self.env, self, usercfg)
        self.config.update(
            dict(timesteps_per_batch=10000,
                 trajectories_per_batch=10,
                 batch_update="timesteps",
                 n_iter=100,
                 gamma=0.99,
                 actor_learning_rate=0.01,
                 critic_learning_rate=0.05,
                 actor_n_hidden=20,
                 critic_n_hidden=20,
                 repeat_n_actions=1,
                 save_model=False))
        self.config.update(usercfg)
        self.build_networks()
        init = tf.global_variables_initializer()
        # Launch the graph.
        self.session = tf.Session()
        self.session.run(init)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = tf.train.Saver()
        # Placeholders so episode statistics flow into the merged summary op.
        self.rewards = tf.placeholder("float", name="Rewards")
        self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
        summary_actor_loss = tf.summary.scalar("Actor_loss",
                                               self.summary_actor_loss)
        summary_critic_loss = tf.summary.scalar("Critic_loss",
                                                self.summary_critic_loss)
        summary_rewards = tf.summary.scalar("Rewards", self.rewards)
        summary_episode_lengths = tf.summary.scalar("Episode_lengths",
                                                    self.episode_lengths)
        self.summary_op = tf.summary.merge([
            summary_actor_loss, summary_critic_loss, summary_rewards,
            summary_episode_lengths
        ])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        return

    def get_critic_value(self, state):
        """Return the critic's value estimate(s) for ``state`` (flattened)."""
        return self.session.run([self.critic_value],
                                feed_dict={self.states: state})[0].flatten()

    def choose_action(self, state):
        """Choose an action."""
        return self.session.run([self.action],
                                feed_dict={self.states: [state]})[0]

    def learn(self):
        """Run learning algorithm"""
        reporter = Reporter()
        config = self.config
        possible_actions = np.arange(self.env_runner.nA)
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.env_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            # One-hot encode the taken actions.
            all_action = (possible_actions == all_action[:, None]).astype(
                np.float32)
            all_state = np.concatenate(
                [trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            returns = np.concatenate([
                discount_rewards(trajectory["reward"], config["gamma"])
                for trajectory in trajectories
            ])
            qw_new = self.get_critic_value(all_state)

            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths

            # Fix: the original feed_dict literal listed `self.states:
            # all_state` twice; the duplicate key is removed.
            results = self.session.run(
                [self.summary_op, self.critic_train, self.actor_train],
                feed_dict={
                    self.states: all_state,
                    self.critic_target: returns,
                    self.actions_taken: all_action,
                    self.critic_feedback: qw_new,
                    self.critic_rewards: returns,
                    self.rewards: np.mean(episode_rewards),
                    self.episode_lengths: np.mean(episode_lengths)
                })
            self.writer.add_summary(results[0], iteration)
            self.writer.flush()

            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))
def __init__(self, env, monitor_path: str, video=False, **usercfg) -> None:
    """Set up a Proximal Policy Optimization agent.

    Builds an "old" and a "new" network, the clipped surrogate objective
    and extensive model summaries.

    Args:
        env: Gym-style environment to learn on.
        monitor_path: Directory for monitor output and summaries.
        video: Whether to record videos while monitoring.
        **usercfg: Configuration overrides.
    """
    super(PPO, self).__init__(**usercfg)
    self.monitor_path: str = monitor_path
    self.env = wrappers.Monitor(env, monitor_path, force=True, video_callable=(None if video else False))

    self.config.update(
        dict(
            n_hidden_units=20,
            n_hidden_layers=2,
            gamma=0.99,
            gae_lambda=0.95,
            learning_rate=0.001,
            n_epochs=10,
            n_iter=10000,
            batch_size=64,  # Timesteps per training batch
            n_local_steps=256,
            normalize_states=False,
            gradient_clip_value=None,
            adam_epsilon=1e-5,
            vf_coef=0.5,
            entropy_coef=0.01,
            cso_epsilon=0.2  # Clipped surrogate objective epsilon
        ))
    self.config.update(usercfg)

    # "old" network: frozen snapshot used for the probability ratio.
    with tf.variable_scope("old_network"):
        self.old_network = self.build_networks()
        self.old_network_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

    # "new" network: the one actually being trained.
    with tf.variable_scope("new_network"):
        self.new_network = self.build_networks()
        if self.RNN:
            self.initial_features = self.new_network.state_init
        else:
            self.initial_features = None
        self.new_network_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
    self.action = self.new_network.action
    self.value = self.new_network.value
    self.states = self.new_network.states
    self.actions_taken = self.new_network.actions_taken

    self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
    self.ret = tf.placeholder(tf.float32, [None], name="return")

    # Copies the trained weights into the frozen "old" network.
    self.set_old_to_new = tf.group(*[
        v1.assign(v2)
        for v1, v2 in zip(self.old_network_vars, self.new_network_vars)
    ])

    # Probability ratio pi_new / pi_old, computed in log space.
    ratio = tf.exp(self.new_network.action_log_prob -
                   self.old_network.action_log_prob)
    ratio_clipped = tf.clip_by_value(ratio,
                                     1.0 - self.config["cso_epsilon"],
                                     1.0 + self.config["cso_epsilon"])
    # Clipped surrogate objective: pessimistic bound on the policy gain.
    cso_loss = tf.minimum(ratio * self.advantage,
                          ratio_clipped * self.advantage)
    self.actor_loss = -tf.reduce_mean(cso_loss)
    self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
    self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
    # NOTE(review): the entropy term is *added* here while the summary below
    # logs -mean_entropy; typical PPO subtracts an entropy bonus — confirm
    # the sign convention of new_network.entropy.
    self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
        self.config["entropy_coef"] * self.mean_entropy

    grads = tf.gradients(self.loss, self.new_network_vars)

    self._global_step = tf.get_variable(
        "global_step", [],
        tf.int32,
        initializer=tf.constant_initializer(0, dtype=tf.int32),
        trainable=False)

    self.n_steps = tf.shape(self.states)[0]
    self.session = tf.Session()
    if self.config["save_model"]:
        tf.add_to_collection("action", self.action)
        tf.add_to_collection("states", self.states)
        self.saver = FastSaver()

    summary_actor_loss = tf.summary.scalar("model/Actor_loss",
                                           self.actor_loss)
    summary_critic_loss = tf.summary.scalar("model/Critic_loss",
                                            self.critic_loss)
    summary_loss = tf.summary.scalar("model/Loss", self.loss)

    adv_mean, adv_std = tf.nn.moments(self.advantage, axes=[0])
    summary_adv_mean = tf.summary.scalar("model/advantage/mean", adv_mean)
    summary_adv_std = tf.summary.scalar("model/advantage/std", adv_std)

    ratio_mean, ratio_std = tf.nn.moments(ratio, axes=[0])
    summary_ratio_mean = tf.summary.scalar("model/ratio/mean", ratio_mean)
    summary_ratio_std = tf.summary.scalar("model/ratio/std", ratio_std)

    summary_new_log_prob_mean = tf.summary.scalar(
        "model/new_log_prob/mean",
        tf.reduce_mean(self.new_network.action_log_prob))
    summary_old_log_prob_mean = tf.summary.scalar(
        "model/old_log_prob/mean",
        tf.reduce_mean(self.old_network.action_log_prob))

    summary_ret = tf.summary.scalar("model/return/mean",
                                    tf.reduce_mean(self.ret))
    summary_entropy = tf.summary.scalar("model/entropy", -self.mean_entropy)
    summary_grad_norm = tf.summary.scalar("model/grad_global_norm",
                                          tf.global_norm(grads))
    summary_var_norm = tf.summary.scalar(
        "model/var_global_norm", tf.global_norm(self.new_network_vars))
    # Histograms of all trainable weights of the trained network.
    summaries = []
    for v in tf.trainable_variables():
        if "new_network" in v.name:
            summaries.append(tf.summary.histogram(v.name, v))
    summaries += [
        summary_actor_loss, summary_critic_loss, summary_loss,
        summary_adv_mean, summary_adv_std, summary_ratio_mean,
        summary_ratio_std, summary_new_log_prob_mean,
        summary_old_log_prob_mean, summary_ret, summary_entropy,
        summary_grad_norm, summary_var_norm
    ]
    self.model_summary_op = tf.summary.merge(summaries)
    self.writer = tf.summary.FileWriter(
        os.path.join(self.monitor_path, "summaries"), self.session.graph)
    self.env_runner = EnvRunner(
        self.env,
        self,
        usercfg,
        normalize_states=self.config["normalize_states"],
        summary_writer=self.writer)

    # grads before clipping were passed to the summary, now clip and apply them
    if self.config["gradient_clip_value"] is not None:
        grads, _ = tf.clip_by_global_norm(
            grads, self.config["gradient_clip_value"])
    self.optimizer = tf.train.AdamOptimizer(
        learning_rate=self.config["learning_rate"],
        epsilon=self.config["adam_epsilon"],
        name="optim")
    apply_grads = self.optimizer.apply_gradients(
        zip(grads, self.new_network_vars))

    inc_step = self._global_step.assign_add(self.n_steps)
    self.train_op = tf.group(apply_grads, inc_step)

    init = tf.global_variables_initializer()
    self.session.run(init)
    return
class PPO(Agent):
    """Proximal Policy Optimization agent.

    Builds two copies of the policy/value network ("old" and "new"), trains the
    new one with the clipped-surrogate PPO objective plus a value-function loss
    and an entropy term, and periodically copies new -> old. Subclasses must
    implement :meth:`build_networks`.
    """
    # Set to True by recurrent subclasses; selects whether initial RNN state is kept.
    RNN = False

    def __init__(self, env, monitor_path: str, video=False, **usercfg) -> None:
        """Set up the environment monitor, the TF graph, and the training ops.

        Args:
            env: Gym environment to train on (wrapped in a ``wrappers.Monitor``).
            monitor_path: Directory for monitor output, summaries and saved models.
            video: If True, let the Monitor record videos with its default schedule.
            **usercfg: Overrides for the default hyperparameters below
                (applied twice: once into ``self.config`` and once passed to
                ``EnvRunner``).
        """
        super(PPO, self).__init__(**usercfg)
        self.monitor_path: str = monitor_path
        self.env = wrappers.Monitor(
            env,
            monitor_path,
            force=True,
            video_callable=(None if video else False))
        # Default hyperparameters; user-supplied values override them below.
        # NOTE(review): "save_model" is read later in this method and in learn()
        # but is not set here — presumably a default from the Agent base class;
        # verify, otherwise a KeyError is possible.
        self.config.update(
            dict(
                n_hidden_units=20,
                n_hidden_layers=2,
                gamma=0.99,
                gae_lambda=0.95,
                learning_rate=0.001,
                n_epochs=10,
                n_iter=10000,
                batch_size=64,  # Timesteps per training batch
                n_local_steps=256,
                normalize_states=False,
                gradient_clip_value=None,
                adam_epsilon=1e-5,
                vf_coef=0.5,
                entropy_coef=0.01,
                cso_epsilon=0.2  # Clipped surrogate objective epsilon
            ))
        self.config.update(usercfg)

        # Two identical network copies: "old" holds the pre-update policy used
        # as the reference distribution in the PPO ratio; "new" is trained.
        # The variable-scope name determines which trainable variables land in
        # each *_vars collection.
        with tf.variable_scope("old_network"):
            self.old_network = self.build_networks()
            self.old_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

        with tf.variable_scope("new_network"):
            self.new_network = self.build_networks()
            if self.RNN:
                self.initial_features = self.new_network.state_init
            else:
                self.initial_features = None
            self.new_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

        # Convenience handles on the trained ("new") network's tensors.
        self.action = self.new_network.action
        self.value = self.new_network.value
        self.states = self.new_network.states
        self.actions_taken = self.new_network.actions_taken

        # Per-timestep inputs fed at train time: GAE advantages and returns.
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        # Op that copies every new-network variable into the old network
        # (run once per outer training iteration, before the epoch loop).
        self.set_old_to_new = tf.group(*[
            v1.assign(v2)
            for v1, v2 in zip(self.old_network_vars, self.new_network_vars)
        ])

        # Clipped surrogate objective: ratio = pi_new(a|s) / pi_old(a|s),
        # computed in log space for numerical stability, clipped to
        # [1 - cso_epsilon, 1 + cso_epsilon].
        # NOTE(review): make_actor_loss() below wraps the same objective via
        # ppo_loss() but is not used here — the loss is rebuilt inline.
        ratio = tf.exp(self.new_network.action_log_prob -
                       self.old_network.action_log_prob)
        ratio_clipped = tf.clip_by_value(ratio,
                                         1.0 - self.config["cso_epsilon"],
                                         1.0 + self.config["cso_epsilon"])
        cso_loss = tf.minimum(ratio * self.advantage,
                              ratio_clipped * self.advantage)
        self.actor_loss = -tf.reduce_mean(cso_loss)
        self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
        self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
        # Total loss. NOTE(review): the entropy term is *added* with a positive
        # coefficient, and the summary below logs -mean_entropy — this implies
        # new_network.entropy holds negative entropy; confirm against the
        # network implementation.
        self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
            self.config["entropy_coef"] * self.mean_entropy
        grads = tf.gradients(self.loss, self.new_network_vars)

        # Global step counts environment timesteps (incremented by n_steps per
        # train_op run), not parameter updates.
        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)
        self.n_steps = tf.shape(self.states)[0]
        self.session = tf.Session()
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

        # --- TensorBoard summaries ---
        summary_actor_loss = tf.summary.scalar("model/Actor_loss",
                                               self.actor_loss)
        summary_critic_loss = tf.summary.scalar("model/Critic_loss",
                                                self.critic_loss)
        summary_loss = tf.summary.scalar("model/Loss", self.loss)
        # NOTE(review): tf.nn.moments returns (mean, VARIANCE) — the "std"
        # summaries below actually log variance, not standard deviation.
        adv_mean, adv_std = tf.nn.moments(self.advantage, axes=[0])
        summary_adv_mean = tf.summary.scalar("model/advantage/mean", adv_mean)
        summary_adv_std = tf.summary.scalar("model/advantage/std", adv_std)
        ratio_mean, ratio_std = tf.nn.moments(ratio, axes=[0])
        summary_ratio_mean = tf.summary.scalar("model/ratio/mean", ratio_mean)
        summary_ratio_std = tf.summary.scalar("model/ratio/std", ratio_std)
        summary_new_log_prob_mean = tf.summary.scalar(
            "model/new_log_prob/mean",
            tf.reduce_mean(self.new_network.action_log_prob))
        summary_old_log_prob_mean = tf.summary.scalar(
            "model/old_log_prob/mean",
            tf.reduce_mean(self.old_network.action_log_prob))
        summary_ret = tf.summary.scalar("model/return/mean",
                                        tf.reduce_mean(self.ret))
        summary_entropy = tf.summary.scalar("model/entropy", -self.mean_entropy)
        summary_grad_norm = tf.summary.scalar("model/grad_global_norm",
                                              tf.global_norm(grads))
        summary_var_norm = tf.summary.scalar(
            "model/var_global_norm", tf.global_norm(self.new_network_vars))
        # Histogram every trainable variable of the trained network.
        summaries = []
        for v in tf.trainable_variables():
            if "new_network" in v.name:
                summaries.append(tf.summary.histogram(v.name, v))
        summaries += [
            summary_actor_loss, summary_critic_loss, summary_loss,
            summary_adv_mean, summary_adv_std, summary_ratio_mean,
            summary_ratio_std, summary_new_log_prob_mean,
            summary_old_log_prob_mean, summary_ret, summary_entropy,
            summary_grad_norm, summary_var_norm
        ]
        self.model_summary_op = tf.summary.merge(summaries)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(
            self.env,
            self,
            usercfg,
            normalize_states=self.config["normalize_states"],
            summary_writer=self.writer)

        # grads before clipping were passed to the summary, now clip and apply them
        if self.config["gradient_clip_value"] is not None:
            grads, _ = tf.clip_by_global_norm(
                grads, self.config["gradient_clip_value"])
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.config["learning_rate"],
            epsilon=self.config["adam_epsilon"],
            name="optim")
        apply_grads = self.optimizer.apply_gradients(
            zip(grads, self.new_network_vars))
        inc_step = self._global_step.assign_add(self.n_steps)
        # One training step = apply gradients + advance the timestep counter.
        self.train_op = tf.group(apply_grads, inc_step)

        init = tf.global_variables_initializer()
        self.session.run(init)
        return

    def make_actor_loss(self, old_network, new_network, advantage):
        """Build the PPO clipped-surrogate actor loss via the ppo_loss helper.

        NOTE(review): unused by __init__, which constructs the equivalent loss
        inline; kept as an overridable hook — confirm before removing.
        """
        return ppo_loss(old_network.action_log_prob,
                        new_network.action_log_prob,
                        self.config["cso_epsilon"], advantage)

    def build_networks(self):
        """Build and return the policy/value network; implemented by subclasses."""
        raise NotImplementedError

    @property
    def global_step(self):
        """Current value of the global (timestep) counter in the session."""
        return self._global_step.eval(session=self.session)

    def get_critic_value(self, state, *rest):
        """Return the critic's value estimate(s) for ``state`` as a flat array.

        Extra positional args (e.g. RNN features) are accepted and ignored here;
        recurrent subclasses presumably override this.
        """
        return self.session.run([self.value],
                                feed_dict={self.states: state})[0].flatten()

    def choose_action(self, state, *rest):
        """Sample an action and value estimate for a single state.

        Returns:
            dict with "action" (network output) and "value" (scalar estimate).
        """
        action, value = self.session.run([self.action, self.value],
                                         feed_dict={self.states: [state]})
        return {"action": action, "value": value[0]}

    def get_env_action(self, action):
        """Convert the network's action output to an environment action index."""
        return np.argmax(action)

    def get_processed_trajectories(self):
        """Collect n_local_steps of experience and compute GAE advantages.

        Returns:
            Tuple (states, actions, advantages, returns, features) where
            returns = advantages + value predictions (TD(lambda)-style targets).
        """
        experiences = self.env_runner.get_steps(
            self.config["n_local_steps"], stop_at_trajectory_end=False)
        T = experiences.steps
        # Bootstrap value for the state after the last step: 0 if the episode
        # ended there, otherwise the critic's estimate of the last state.
        v = 0 if experiences.terminals[-1] else self.get_critic_value(
            np.asarray(experiences.states)[None, -1], experiences.features[-1])
        vpred = np.asarray(experiences.values + [v])
        gamma = self.config["gamma"]
        lambda_ = self.config["gae_lambda"]
        # Generalized Advantage Estimation, computed backwards over the batch;
        # gaelam and advantages alias the same buffer.
        gaelam = advantages = np.empty(T, 'float32')
        last_gaelam = 0
        for t in reversed(range(T)):
            # A terminal at step t cuts the bootstrap and the GAE recursion.
            nonterminal = 1 - experiences.terminals[t]
            delta = experiences.rewards[t] + gamma * vpred[
                t + 1] * nonterminal - vpred[t]
            gaelam[
                t] = last_gaelam = delta + gamma * lambda_ * nonterminal * last_gaelam
        # Value targets: advantage + baseline prediction.
        rs = advantages + experiences.values
        return experiences.states, experiences.actions, advantages, rs, experiences.features

    def learn(self) -> None:
        """Run learning algorithm"""
        config = self.config
        n_updates = 0
        for _ in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            states, actions, advs, rs, _ = self.get_processed_trajectories()
            advs = np.array(advs)
            # Normalize advantages over the whole batch.
            # NOTE(review): divides by advs.std() with no epsilon — yields
            # NaN/inf if all advantages in a batch are equal.
            advs = (advs - advs.mean()) / advs.std()
            # Freeze the current policy as the "old" reference before updating.
            self.session.run(self.set_old_to_new)

            # Several epochs of shuffled minibatch SGD over the same batch.
            indices = np.arange(len(states))
            for _ in range(self.config["n_epochs"]):
                np.random.shuffle(indices)
                batch_size = self.config["batch_size"]
                for j in range(0, len(states), batch_size):
                    batch_indices = indices[j:(j + batch_size)]
                    batch_states = np.array(states)[batch_indices]
                    batch_actions = np.array(actions)[batch_indices]
                    batch_advs = np.array(advs)[batch_indices]
                    batch_rs = np.array(rs)[batch_indices]
                    losses = [self.actor_loss, self.critic_loss, self.loss]
                    fetches = losses + [self.model_summary_op, self.train_op]
                    # Both networks see the same states/actions; only the new
                    # network's variables receive gradients.
                    feed_dict = {
                        self.states: batch_states,
                        self.old_network.states: batch_states,
                        self.actions_taken: batch_actions,
                        self.old_network.actions_taken: batch_actions,
                        self.advantage: batch_advs,
                        self.ret: batch_rs
                    }
                    results = self.session.run(fetches, feed_dict)
                    # Summaries are indexed by update count, not global step.
                    self.writer.add_summary(results[len(losses)], n_updates)
                    n_updates += 1
                self.writer.flush()

        if self.config["save_model"]:
            self.saver.save(self.session,
                            os.path.join(self.monitor_path, "model"))