def __init__(self, env_id: str, task_id: int, cluster: tf.train.ClusterDef, monitor_path: str, config: dict,
             clip_gradients: bool = True, video: bool = False, seed: Optional[int] = None) -> None:
    super(A3CTask, self).__init__()
    self.task_id = task_id
    self.config = config
    self.clip_gradients = clip_gradients
    self.env = make(env_id)
    self.env.seed(seed)
    if task_id == 0:
        self.env = wrappers.Monitor(
            self.env,
            monitor_path,
            force=True,
            video_callable=(None if video else False))

    # Only used (and overwritten) by agents that use an RNN
    self.initial_features = None

    worker_device = "/job:worker/task:{}/cpu:0".format(task_id)

    # Global network
    shared_device = tf.train.replica_device_setter(
        ps_tasks=1,
        worker_device=worker_device,
        cluster=cluster)
    with tf.device(shared_device):
        with tf.variable_scope("global"):
            self.global_network = self.build_networks()
            self.global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                                 tf.get_variable_scope().name)
            self._global_step = tf.get_variable(
                "global_step", [],
                tf.int32,
                initializer=tf.constant_initializer(0, dtype=tf.int32),
                trainable=False)

    # Local network
    with tf.device(worker_device):
        with tf.variable_scope("local"):
            self.local_network = self.build_networks()
            self.states = self.local_network.states
            self.actions_taken = self.local_network.actions_taken
            self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
            self.ret = tf.placeholder(tf.float32, [None], name="return")
            self.actor_loss, self.critic_loss, self.loss = self.make_loss()
            self.local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                                tf.get_variable_scope().name)
            self.sync_net = create_sync_net_op(self.global_vars, self.local_vars)
            self.n_steps = tf.shape(self.local_network.states)[0]
            inc_step = self._global_step.assign_add(self.n_steps)

    device = shared_device if self.config["shared_optimizer"] else worker_device
    with tf.device(device):
        apply_optim_op = self.make_trainer()
        self.train_op = tf.group(apply_optim_op, inc_step)

    loss_summaries = self.create_summary_losses()
    self.reward = tf.placeholder("float", name="reward")
    tf.summary.scalar("Reward", self.reward)
    self.episode_length = tf.placeholder("float", name="episode_length")
    tf.summary.scalar("Episode_length", self.episode_length)
    self.summary_op = tf.summary.merge(loss_summaries)

    variables_to_save = [v for v in tf.global_variables() if not v.name.startswith("local")]
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()
    saver = FastSaver(variables_to_save)
    # Write the summary of each task in a different directory
    self.writer = tf.summary.FileWriter(os.path.join(monitor_path, "task{}".format(task_id)))

    self.runner = RunnerThread(self.env, self, int(self.config["n_local_steps"]), task_id == 0 and video)

    self.server = tf.train.Server(
        cluster,
        job_name="worker",
        task_index=task_id,
        config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=2))

    def init_fn(scaffold, sess):
        sess.run(init_all_op)

    self.report_uninit_op = tf.report_uninitialized_variables(variables_to_save)

    self.scaffold = tf.train.Scaffold(
        init_op=init_op,
        init_fn=init_fn,
        ready_for_local_init_op=self.report_uninit_op,
        saver=saver,
        ready_op=self.report_uninit_op)

    self.config_proto = tf.ConfigProto(
        device_filters=["/job:ps", "/job:worker/task:{}/cpu:0".format(task_id)])

    self.session = None
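# --- Usage sketch (illustrative, not part of the original module). A3CTask relies on
# build_networks/make_loss/make_trainer/create_summary_losses from a concrete subclass,
# so A3CTaskDiscrete below is a hypothetical name; the environment id, addresses and
# config values are assumptions as well.
import tensorflow as tf

cluster = tf.train.ClusterSpec({
    "ps": ["localhost:12200"],
    "worker": ["localhost:12300", "localhost:12301"],
}).as_cluster_def()

config = {"shared_optimizer": False, "n_local_steps": 20}   # keys read in __init__ above
task = A3CTaskDiscrete(          # hypothetical concrete subclass
    env_id="CartPole-v0",
    task_id=0,                   # task 0 also wraps its environment in a Monitor
    cluster=cluster,
    monitor_path="/tmp/a3c",
    config=config)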
class REINFORCE(Agent):
    """REINFORCE with baselines"""

    def __init__(self, env, monitor_path: str, monitor: bool = False, video: bool = True, **usercfg) -> None:
        super(REINFORCE, self).__init__(**usercfg)
        self.env = env
        if monitor:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False))
        self.monitor_path = monitor_path
        # Default configuration. Can be overwritten using keyword arguments.
        self.config.update(
            dict(
                batch_update="timesteps",
                timesteps_per_batch=1000,
                n_iter=100,
                gamma=0.99,  # Discount past rewards by a percentage
                learning_rate=0.05,
                entropy_coef=1e-3,
                n_hidden_layers=2,
                n_hidden_units=20,
                repeat_n_actions=1,
                save_model=False))
        self.config.update(usercfg)

        self.states = tf.placeholder(
            tf.float32, [None] + list(self.env.observation_space.shape),
            name="states")  # Observation
        self.actions_taken = tf.placeholder(
            tf.float32, name="actions_taken")  # Discrete action
        self.advantage = tf.placeholder(tf.float32, name="advantage")  # Advantage

        self.build_network()
        self.make_trainer()
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

        summary_loss = tf.summary.scalar("model/loss", self.summary_loss)
        summaries = [summary_loss]
        if hasattr(self, "entropy"):
            summary_entropy = tf.summary.scalar("model/entropy", self.entropy)
            summaries += [summary_entropy]
        self.summary_op = tf.summary.merge(summaries)

        self.init_op = tf.global_variables_initializer()
        # Launch the graph.
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=num_cpu,
            intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "task0"), self.session.graph)
        self.env_runner = EnvRunner(self.env, self, usercfg, summary_writer=self.writer)

    def _initialize(self) -> None:
        self.session.run(self.init_op)

    def build_network(self):
        raise NotImplementedError()

    def make_trainer(self):
        raise NotImplementedError()

    def choose_action(self, state, features) -> Dict[str, np.ndarray]:
        """Choose an action."""
        action = self.session.run([self.action], feed_dict={self.states: [state]})[0]
        return {"action": action}

    def learn(self):
        """Run learning algorithm"""
        self._initialize()
        reporter = Reporter()
        config = self.config
        total_n_trajectories = 0
        for iteration in range(config["n_iter"]):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = self.env_runner.get_trajectories()
            total_n_trajectories += len(trajectories)
            all_state = np.concatenate(
                [trajectory.states for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory.rewards, config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))])
                for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory.actions for trajectory in trajectories])
            all_adv = np.concatenate(advs)

            # Do policy gradient update step
            episode_rewards = np.array([
                sum(trajectory.rewards) for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory.rewards) for trajectory in trajectories
            ])  # episode lengths
            # TODO: deal with RNN state
            summary, _ = self.session.run(
                [self.summary_op, self.train],
                feed_dict={
                    self.states: all_state,
                    self.actions_taken: all_action,
                    self.advantage: all_adv
                })
            self.writer.add_summary(summary, iteration)
            self.writer.flush()

            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths, total_n_trajectories)
        if self.config["save_model"]:
            self.saver.save(self.session, os.path.join(self.monitor_path, "model"))
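# --- Sketch of a concrete subclass (illustrative, not part of the original module).
# The base class only requires that build_network() defines self.action and
# self.summary_loss and that make_trainer() defines self.train; the layer sizes,
# the Adam optimizer and the name SimpleREINFORCE are assumptions.
class SimpleREINFORCE(REINFORCE):
    def build_network(self):
        n_actions = self.env.action_space.n
        hidden = tf.contrib.layers.fully_connected(
            inputs=self.states,
            num_outputs=self.config["n_hidden_units"],
            activation_fn=tf.tanh)
        self.probs = tf.contrib.layers.fully_connected(
            inputs=hidden,
            num_outputs=n_actions,
            activation_fn=tf.nn.softmax)
        self.action = tf.squeeze(tf.multinomial(tf.log(self.probs), 1), name="action")
        # REINFORCE loss: -log pi(a|s) * advantage
        taken = tf.one_hot(tf.cast(self.actions_taken, tf.int32), n_actions)
        good_probs = tf.reduce_sum(self.probs * taken, axis=1)
        self.summary_loss = -tf.reduce_mean(tf.log(good_probs) * self.advantage)

    def make_trainer(self):
        optimizer = tf.train.AdamOptimizer(self.config["learning_rate"])
        self.train = optimizer.minimize(self.summary_loss)

# Example use (assuming a Gym environment):
# SimpleREINFORCE(gym.make("CartPole-v0"), "/tmp/reinforce").learn()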
class A2C(Agent):
    """Advantage Actor Critic"""

    def __init__(self, env, monitor_path: str, video: bool = True, **usercfg) -> None:
        super(A2C, self).__init__(**usercfg)
        self.monitor_path = monitor_path
        self.env = wrappers.Monitor(
            env,
            monitor_path,
            force=True,
            video_callable=(None if video else False))
        self.config.update(dict(
            n_iter=100,
            gamma=0.99,
            learning_rate=0.001,
            n_hidden_units=20,
            n_hidden_layers=1,
            gradient_clip_value=0.5,
            n_local_steps=20,
            vf_coef=0.5,
            entropy_coef=0.01,
            loss_reducer="mean",
            save_model=False
        ))
        self.config.update(usercfg)
        # Only used (and overwritten) by agents that use an RNN
        self.initial_features = None
        self.ac_net = None  # Overwritten by build_networks
        self.build_networks()

        self.action = self.ac_net.action
        self.states = self.ac_net.states
        self.actions_taken = self.ac_net.actions_taken
        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")
        self.actor_loss, self.critic_loss, self.loss = self.make_loss()

        self.vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.optimizer = tf.train.AdamOptimizer(
            self.config["learning_rate"], name="optim")
        grads = tf.gradients(self.loss, self.vars)
        grads, _ = tf.clip_by_global_norm(
            grads, self.config["gradient_clip_value"])

        # Apply gradients to the weights of the master network
        apply_grads = self.optimizer.apply_gradients(zip(grads, self.vars))

        self.n_steps = tf.shape(self.states)[0]
        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        self.init_op = tf.global_variables_initializer()
        # Launch the graph.
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=num_cpu,
            intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()
        n_steps = tf.to_float(self.n_steps)
        actor_loss_summary = tf.summary.scalar("model/actor_loss",
                                               tf.squeeze(self.actor_loss / n_steps))
        critic_loss_summary = tf.summary.scalar("model/critic_loss",
                                                tf.squeeze(self.critic_loss / n_steps))
        loss_summary = tf.summary.scalar("model/loss", tf.squeeze(self.loss / n_steps))
        self.loss_summary_op = tf.summary.merge(
            [actor_loss_summary, critic_loss_summary, loss_summary])
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(self.env, self, usercfg, summary_writer=self.writer)

    def _initialize(self):
        self.session.run(self.init_op)

    def build_networks(self):
        raise NotImplementedError("Abstract method")

    def make_loss(self):
        raise NotImplementedError("Abstract method")

    @property
    def global_step(self):
        return self._global_step.eval(session=self.session)

    def get_critic_value(self, state, features):
        return self.session.run([self.ac_net.value], feed_dict={self.states: state})[0].flatten()

    def choose_action(self, state, features) -> dict:
        action, value = self.session.run(
            [self.ac_net.action, self.ac_net.value],
            feed_dict={self.states: [state]})
        return {"action": action, "value": value[0]}

    def get_env_action(self, action) -> int:
        return np.argmax(action)

    def learn(self):
        """Run learning algorithm"""
        self._initialize()
        config = self.config
        for _ in range(int(config["n_iter"])):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectory = self.env_runner.get_steps(int(self.config["n_local_steps"]))
            v = 0 if trajectory.terminals[-1] else self.get_critic_value(
                np.asarray(trajectory.states)[None, -1], trajectory.features[-1])
            rewards_plus_v = np.asarray(trajectory.rewards + [v])
            vpred_t = np.asarray(trajectory.values + [v])
            delta_t = trajectory.rewards + \
                self.config["gamma"] * vpred_t[1:] - vpred_t[:-1]
            batch_r = discount_rewards(
                rewards_plus_v, self.config["gamma"])[:-1]
            batch_adv = discount_rewards(delta_t, self.config["gamma"])
            fetches = [self.loss_summary_op, self.train_op, self._global_step]
            states = np.asarray(trajectory.states)
            feed_dict = {
                self.states: states,
                self.actions_taken: np.asarray(trajectory.actions),
                self.advantage: batch_adv,
                self.ret: np.asarray(batch_r)
            }
            feature = trajectory.features[0]
            if feature != [] and feature is not None:
                feed_dict[self.ac_net.rnn_state_in] = feature
            summary, _, global_step = self.session.run(fetches, feed_dict)
            self.writer.add_summary(summary, global_step)
            self.writer.flush()

        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session, os.path.join(self.monitor_path, "model"))
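# --- Standalone illustration (not from the original module) of the return and
# advantage computation in A2C.learn above, assuming discount_rewards computes
# discounted cumulative sums y[t] = x[t] + gamma * y[t + 1]. Numbers are made up.
import numpy as np

def discount_rewards_ref(x, gamma):
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

rewards = [1.0, 1.0, 1.0]
values = [0.5, 0.4, 0.3]
bootstrap_v = 0.2                     # critic value of the state after the last step
gamma = 0.99

rewards_plus_v = np.asarray(rewards + [bootstrap_v])
vpred_t = np.asarray(values + [bootstrap_v])
delta_t = np.asarray(rewards) + gamma * vpred_t[1:] - vpred_t[:-1]
batch_r = discount_rewards_ref(rewards_plus_v, gamma)[:-1]   # n-step returns
batch_adv = discount_rewards_ref(delta_t, gamma)             # discounted TD residuals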
def __init__(self, env, monitor_path, **usercfg):
    super(DPPO, self).__init__()
    self.env = env
    self.env_name: str = env.spec.id
    self.monitor_path: str = monitor_path
    self.comm = MPI.COMM_SELF

    self.config.update(
        dict(
            n_workers=3,
            n_hidden_units=20,
            n_hidden_layers=2,
            gamma=0.99,
            gae_lambda=0.95,
            learning_rate=2.5e-4,
            n_iter=10000,
            n_epochs=4,
            n_local_steps=128,
            gradient_clip_value=0.5,
            vf_coef=0.5,
            entropy_coef=0.01,
            cso_epsilon=0.1,  # Clipped surrogate objective epsilon
            learn_method="batches",
            batch_size=64,
            save_model=False))
    self.config.update(usercfg)

    self.task_type = None  # To be filled in by subclasses

    self.n_updates: int = 0

    with tf.variable_scope("new_network"):
        self.new_network = self.build_networks()
        if self.RNN:
            self.initial_features = self.new_network.state_init
        else:
            self.initial_features = None
        self.new_network_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

    self._global_step = tf.get_variable(
        "global_step", [],
        tf.int32,
        initializer=tf.constant_initializer(0, dtype=tf.int32),
        trainable=False)

    self.action = self.new_network.action
    self.value = self.new_network.value
    self.states = self.new_network.states
    self.actions_taken = self.new_network.actions_taken
    self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
    self.ret = tf.placeholder(tf.float32, [None], name="return")

    with tf.variable_scope("old_network"):
        self.old_network = self.build_networks()
        self.old_network_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

    self.set_old_to_new = tf.group(*[
        v1.assign(v2)
        for v1, v2 in zip(self.old_network_vars, self.new_network_vars)
    ])

    # Reduces by taking the mean instead of summing
    self.actor_loss = -tf.reduce_mean(
        self.make_actor_loss(self.old_network, self.new_network, self.advantage))
    self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
    self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
    self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
        self.config["entropy_coef"] * self.mean_entropy

    grads = tf.gradients(self.loss, self.new_network_vars)

    self.n_steps = tf.shape(self.states)[0]

    if self.config["save_model"]:
        tf.add_to_collection("action", self.action)
        tf.add_to_collection("states", self.states)
        self.saver = FastSaver()

    summary_actor_loss = tf.summary.scalar("model/Actor_loss", self.actor_loss)
    summary_critic_loss = tf.summary.scalar("model/Critic_loss", self.critic_loss)
    summary_loss = tf.summary.scalar("model/Loss", self.loss)
    summary_entropy = tf.summary.scalar("model/Entropy", -self.mean_entropy)
    summary_grad_norm = tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads))
    summary_var_norm = tf.summary.scalar(
        "model/var_global_norm", tf.global_norm(self.new_network_vars))
    self.model_summary_op = tf.summary.merge([
        summary_actor_loss, summary_critic_loss, summary_loss,
        summary_entropy, summary_grad_norm, summary_var_norm
    ])
    self.writer = tf.summary.FileWriter(
        os.path.join(self.monitor_path, "master"))

    # grads before clipping were passed to the summary, now clip and apply them
    if self.config["gradient_clip_value"] is not None:
        grads, _ = tf.clip_by_global_norm(
            grads, self.config["gradient_clip_value"])
    with tf.variable_scope("optimizer"):
        self.optimizer = tf.train.AdamOptimizer(
            self.config["learning_rate"], name="optim")
        apply_grads = self.optimizer.apply_gradients(
            zip(grads, self.new_network_vars))
        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)
    optimizer_variables = [
        var for var in tf.global_variables()
        if var.name.startswith("optimizer")
    ]
    self.init_op = tf.variables_initializer(
        self.new_network_vars + optimizer_variables + [self._global_step])
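# --- Sketch (an assumption, not the module's actual ppo_loss) of the clipped
# surrogate objective that make_actor_loss is expected to return per timestep:
# ratio = exp(new_log_prob - old_log_prob),
# objective = min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A).
# The wrapping -tf.reduce_mean(...) above then turns the objective into a loss.
import tensorflow as tf

def clipped_surrogate_sketch(old_log_prob, new_log_prob, epsilon, advantage):
    ratio = tf.exp(new_log_prob - old_log_prob)
    clipped_ratio = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon)
    return tf.minimum(ratio * advantage, clipped_ratio * advantage)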
class PPO(Agent):
    """Proximal Policy Optimization agent."""
    RNN = False

    def __init__(self, env, monitor_path: str, monitor: bool = False, video: bool = False, **usercfg) -> None:
        super(PPO, self).__init__(**usercfg)
        self.monitor_path: str = monitor_path
        self.env = env
        if monitor:
            self.env = wrappers.Monitor(
                self.env,
                monitor_path,
                force=True,
                video_callable=(None if video else False))

        self.config.update(
            dict(
                n_hidden_units=20,
                n_hidden_layers=2,
                gamma=0.99,
                gae_lambda=0.95,
                learning_rate=0.001,
                n_epochs=10,
                n_iter=10000,
                batch_size=64,  # Timesteps per training batch
                n_local_steps=256,
                normalize_states=False,
                gradient_clip_value=None,
                adam_epsilon=1e-5,
                vf_coef=0.5,
                entropy_coef=0.01,
                cso_epsilon=0.2,  # Clipped surrogate objective epsilon
                save_model=False))
        self.config.update(usercfg)

        with tf.variable_scope("old_network"):
            self.old_network = self.build_networks()
            self.old_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

        with tf.variable_scope("new_network"):
            self.new_network = self.build_networks()
            if self.RNN:
                self.initial_features = self.new_network.state_init
            else:
                self.initial_features = None
            self.new_network_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
        self.action = self.new_network.action
        self.value = self.new_network.value
        self.states = self.new_network.states
        self.actions_taken = self.new_network.actions_taken

        self.advantage = tf.placeholder(tf.float32, [None], name="advantage")
        self.ret = tf.placeholder(tf.float32, [None], name="return")

        self.set_old_to_new = tf.group(*[
            v1.assign(v2)
            for v1, v2 in zip(self.old_network_vars, self.new_network_vars)
        ])

        self.actor_loss = -tf.reduce_mean(
            self.make_actor_loss(self.old_network, self.new_network, self.advantage))
        self.critic_loss = tf.reduce_mean(tf.square(self.value - self.ret))
        self.mean_entropy = tf.reduce_mean(self.new_network.entropy)
        self.loss = self.actor_loss + self.config["vf_coef"] * self.critic_loss + \
            self.config["entropy_coef"] * self.mean_entropy

        grads = tf.gradients(self.loss, self.new_network_vars)

        self._global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.n_steps = tf.shape(self.states)[0]

        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=num_cpu,
            intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)

        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

        summary_actor_loss = tf.summary.scalar("model/Actor_loss", self.actor_loss)
        summary_critic_loss = tf.summary.scalar("model/Critic_loss", self.critic_loss)
        summary_loss = tf.summary.scalar("model/Loss", self.loss)

        adv_mean, adv_std = tf.nn.moments(self.advantage, axes=[0])
        summary_adv_mean = tf.summary.scalar("model/advantage/mean", adv_mean)
        summary_adv_std = tf.summary.scalar("model/advantage/std", tf.sqrt(adv_std))

        # TODO: get from ppo_loss function
        # ratio_mean, ratio_std = tf.nn.moments(ratio, axes=[0])
        # summary_ratio_mean = tf.summary.scalar("model/ratio/mean", ratio_mean)
        # summary_ratio_std = tf.summary.scalar("model/ratio/std", ratio_std)

        summary_new_log_prob_mean = tf.summary.scalar(
            "model/new_log_prob/mean", tf.reduce_mean(self.new_network.action_log_prob))
        summary_old_log_prob_mean = tf.summary.scalar(
            "model/old_log_prob/mean", tf.reduce_mean(self.old_network.action_log_prob))

        ret_mean, ret_std = tf.nn.moments(self.ret, axes=[0])
        summary_ret_mean = tf.summary.scalar("model/return/mean", ret_mean)
        summary_ret_std = tf.summary.scalar("model/return/std", tf.sqrt(ret_std))
        summary_entropy = tf.summary.scalar("model/entropy", -self.mean_entropy)
        summary_grad_norm = tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads))
        summary_var_norm = tf.summary.scalar(
            "model/var_global_norm", tf.global_norm(self.new_network_vars))

        summaries: List[tf.Tensor] = []
        # Weight summaries: not turned on right now because they take too much space
        # TODO: use config to make this optional
        # for v in tf.trainable_variables():
        #     if "new_network" in v.name:
        #         summaries.append(tf.summary.histogram(v.name, v))
        summaries += self._specific_summaries()
        summaries += [
            summary_actor_loss, summary_critic_loss, summary_loss,
            summary_adv_mean, summary_adv_std,
            # summary_ratio_mean, summary_ratio_std,
            summary_new_log_prob_mean, summary_old_log_prob_mean,
            summary_ret_mean, summary_ret_std,
            summary_entropy, summary_grad_norm, summary_var_norm
        ]
        self.model_summary_op = tf.summary.merge(summaries)
        self.writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"), self.session.graph)
        self.env_runner = EnvRunner(
            self.env,
            self,
            usercfg,
            normalize_states=self.config["normalize_states"],
            summary_writer=self.writer)

        # grads before clipping were passed to the summary, now clip and apply them
        if self.config["gradient_clip_value"] is not None:
            grads, _ = tf.clip_by_global_norm(
                grads, self.config["gradient_clip_value"])
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.config["learning_rate"],
            epsilon=self.config["adam_epsilon"],
            name="optim")
        apply_grads = self.optimizer.apply_gradients(
            zip(grads, self.new_network_vars))

        inc_step = self._global_step.assign_add(self.n_steps)
        self.train_op = tf.group(apply_grads, inc_step)

        self.init_op = tf.global_variables_initializer()

    def _initialize(self):
        self.session.run(self.init_op)

    def _specific_summaries(self) -> List[tf.Tensor]:
        """Summaries that are specific to the variant of the algorithm.
        None (empty list) for the base algorithm."""
        return []

    def make_actor_loss(self, old_network, new_network, advantage):
        return ppo_loss(old_network.action_log_prob, new_network.action_log_prob,
                        self.config["cso_epsilon"], advantage)

    def build_networks(self):
        raise NotImplementedError

    @property
    def global_step(self):
        return self._global_step.eval(session=self.session)

    def get_critic_value(self, state, *rest):
        return self.session.run([self.value], feed_dict={self.states: state})[0].flatten()

    def choose_action(self, state, *rest):
        action, value = self.session.run(
            [self.action, self.value], feed_dict={self.states: [state]})
        return {"action": action, "value": value[0]}

    def get_env_action(self, action):
        return np.argmax(action)

    def get_processed_trajectories(self):
        experiences = self.env_runner.get_steps(
            int(self.config["n_local_steps"]), stop_at_trajectory_end=False)
        T = experiences.steps
        v = 0 if experiences.terminals[-1] else self.get_critic_value(
            np.asarray(experiences.states)[None, -1], experiences.features[-1])
        vpred = np.asarray(experiences.values + [v])
        gamma = self.config["gamma"]
        lambda_ = self.config["gae_lambda"]
        gaelam = advantages = np.empty(T, 'float32')
        last_gaelam = 0
        for t in reversed(range(T)):
            nonterminal = 1 - experiences.terminals[t]
            delta = experiences.rewards[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
            gaelam[t] = last_gaelam = delta + gamma * lambda_ * nonterminal * last_gaelam
        rs = advantages + experiences.values
        return experiences.states, experiences.actions, advantages, rs, experiences.features

    def learn(self):
        """Run learning algorithm"""
        self._initialize()
        config = self.config
        n_updates = 0
        for _ in range(int(config["n_iter"])):
            # Collect trajectories until we get timesteps_per_batch total timesteps
            states, actions, advs, rs, _ = self.get_processed_trajectories()
            advs = np.array(advs)
            normalized_advs = (advs - advs.mean()) / advs.std()
            self.session.run(self.set_old_to_new)

            indices = np.arange(len(states))
            for _ in range(int(self.config["n_epochs"])):
                np.random.shuffle(indices)
                batch_size = int(self.config["batch_size"])
                for j in range(0, len(states), batch_size):
                    batch_indices = indices[j:(j + batch_size)]
                    batch_states = np.array(states)[batch_indices]
                    batch_actions = np.array(actions)[batch_indices]
                    batch_advs = np.array(normalized_advs)[batch_indices]
                    batch_rs = np.array(rs)[batch_indices]
                    fetches = [self.train_op]
                    if (n_updates % 1000) == 0:
                        fetches.append(self.model_summary_op)
                    feed_dict = {
                        self.states: batch_states,
                        self.old_network.states: batch_states,
                        self.actions_taken: batch_actions,
                        self.old_network.actions_taken: batch_actions,
                        self.advantage: batch_advs,
                        self.ret: batch_rs
                    }
                    results = self.session.run(fetches, feed_dict)
                    if (n_updates % 1000) == 0:
                        self.writer.add_summary(results[-1], n_updates)
                    n_updates += 1
            self.writer.flush()

        if self.config["save_model"]:
            self.saver.save(self.session, os.path.join(self.monitor_path, "model"))
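# --- Standalone illustration (not from the original module) of the generalized
# advantage estimation loop in get_processed_trajectories above, with made-up numbers.
import numpy as np

rewards = [1.0, 1.0, 1.0]
values = [0.6, 0.5, 0.4]
terminals = [0, 0, 1]
bootstrap_v = 0.0 if terminals[-1] else 0.3   # critic value of the next state
gamma, lambda_ = 0.99, 0.95

vpred = np.asarray(values + [bootstrap_v])
T = len(rewards)
advantages = np.empty(T, "float32")
last_gaelam = 0.0
for t in reversed(range(T)):
    nonterminal = 1 - terminals[t]
    delta = rewards[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
    advantages[t] = last_gaelam = delta + gamma * lambda_ * nonterminal * last_gaelam
returns = advantages + np.asarray(values)     # regression targets for the value head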
class AsyncKnowledgeTransfer(Agent):
    """Asynchronous learner for variations of a task."""

    def __init__(self, envs, monitor_path, learning_method="REINFORCE", **usercfg):
        super(AsyncKnowledgeTransfer, self).__init__(**usercfg)
        self.envs = envs
        self.learning_method = learning_method
        self.monitor_path = monitor_path
        self.config.update(
            dict(
                timesteps_per_batch=10000,
                trajectories_per_batch=10,
                batch_update="timesteps",
                n_iter=200,
                switch_at_iter=None,  # None to deactivate, otherwise an iteration at which to switch
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.005,
                n_hidden_units=10,
                repeat_n_actions=1,
                n_task_variations=3,
                n_sparse_units=10,
                feature_extraction=False))
        self.config.update(usercfg)

        self.stop_requested = False

        self.session = tf.Session(config=tf.ConfigProto(
            log_device_placement=False, allow_soft_placement=True))

        self.global_step = tf.get_variable(
            "global_step", [],
            tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

        self.build_networks()

        self.loss = tf.placeholder("float", name="loss")
        summary_loss = tf.summary.scalar("Loss", self.loss)
        self.reward = tf.placeholder("float", name="reward")
        summary_rewards = tf.summary.scalar("Reward", self.reward)
        self.episode_length = tf.placeholder("float", name="episode_length")
        summary_episode_lengths = tf.summary.scalar("Episode_length", self.episode_length)
        self.summary_op = tf.summary.merge(
            [summary_loss, summary_rewards, summary_episode_lengths])

        self.jobs = []
        for i, env in enumerate(self.envs):
            self.jobs.append(
                self.make_thread(
                    env,
                    i,
                    self.config["switch_at_iter"]
                    if self.config["switch_at_iter"] is not None and i != len(self.envs) - 1
                    else self.config["n_iter"],
                    start_at_iter=(0 if self.config["switch_at_iter"] is None
                                   or i != len(self.envs) - 1
                                   else self.config["switch_at_iter"])))

        for i, job in enumerate(self.jobs):
            only_sparse = (self.config["switch_at_iter"] is not None
                           and i == len(self.jobs) - 1)
            grads = tf.gradients(
                job.loss,
                (self.shared_vars if not only_sparse else []) + [job.sparse_representation])
            job.apply_grad = job.optimizer.apply_gradients(
                zip(grads,
                    (self.shared_vars if not only_sparse else []) + [job.sparse_representation]),
                global_step=self.global_step)

        self.session.run(tf.global_variables_initializer())

        if self.config["save_model"]:
            for job in self.jobs:
                tf.add_to_collection("action", job.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

    def build_networks(self):
        with tf.variable_scope("shared"):
            self.states = tf.placeholder(
                tf.float32, [None] + list(self.envs[0].observation_space.shape),
                name="states")
            self.action_taken = tf.placeholder(tf.float32, name="action_taken")
            self.advantage = tf.placeholder(tf.float32, name="advantage")

            if self.config["feature_extraction"]:
                self.L1 = tf.contrib.layers.fully_connected(
                    inputs=self.states,
                    num_outputs=self.config["n_hidden_units"],
                    activation_fn=tf.tanh,
                    weights_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.02),
                    biases_initializer=tf.zeros_initializer(),
                    scope="L1")
            else:
                self.L1 = self.states

            self.knowledge_base = tf.Variable(
                tf.truncated_normal(
                    [self.L1.get_shape()[-1].value, self.config["n_sparse_units"]],
                    mean=0.0,
                    stddev=0.02),
                name="knowledge_base")

            self.shared_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

    def signal_handler(self, signal, frame):
        """When a (SIGINT) signal is received, request the threads (via the master)
        to stop after completing an iteration."""
        logging.info("SIGINT signal received: Requesting a stop...")
        self.stop_requested = True

    def learn(self):
        signal.signal(signal.SIGINT, self.signal_handler)
        if self.config["switch_at_iter"] is None:
            idx = None
        else:
            idx = -1
        for job in self.jobs[:idx]:
            job.start()
        for job in self.jobs[:idx]:
            job.join()
        try:
            self.jobs[idx].start()
            self.jobs[idx].join()
        except TypeError:  # idx is None
            pass

        if self.config["save_model"]:
            self.saver.save(self.session, os.path.join(self.monitor_path, "model"))

    def make_thread(self, env, task_id, n_iter, start_at_iter=0):
        return AKTThread(self, env, task_id, n_iter, start_at_iter=start_at_iter)
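# --- Usage sketch (illustrative, not part of the original module). The list of
# environment variations and the monitor path are assumptions; make_env_variations
# is a hypothetical helper returning Gym environments that are variations of one task.
variations = make_env_variations("CartPole-v0", n=3)   # hypothetical helper
agent = AsyncKnowledgeTransfer(variations, "/tmp/akt", learning_method="REINFORCE")
agent.learn()   # starts one AKTThread per task; SIGINT requests a stop after the current iteration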
class KarpathyCNN(Agent):
    """Karpathy policy gradient learner using a convolutional neural network"""

    def __init__(self, env, monitor_path, video=True, **usercfg):
        super(KarpathyCNN, self).__init__(**usercfg)
        self.env = wrappers.Monitor(
            env,
            monitor_path,
            force=True,
            video_callable=(None if video else False))
        self.nA = env.action_space.n
        self.monitor_path = monitor_path
        # Default configuration. Can be overwritten using keyword arguments.
        self.config.update(
            dict(
                # timesteps_per_batch=10000,
                # n_iter=100,
                n_hidden_units=200,
                learning_rate=1e-3,
                batch_size=10,  # Amount of episodes after which to adapt gradients
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.99,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                draw_frequency=50  # Draw a plot every 50 episodes
            ))
        self.config.update(usercfg)
        self.build_network()
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

    def build_network(self):
        image_size = 80
        image_depth = 1  # aka nr. of feature maps. E.g. 3 for RGB images. 1 here because we use grayscale images

        self.states = tf.placeholder(
            tf.float32, [None, image_size, image_size, image_depth], name="states")

        # Convolution layer 1
        depth = 32
        patch_size = 4
        self.w1 = tf.Variable(
            tf.truncated_normal([patch_size, patch_size, image_depth, depth], stddev=0.01))
        self.b1 = tf.Variable(tf.zeros([depth]))
        self.L1 = tf.nn.relu(
            tf.nn.conv2d(self.states, self.w1, strides=[1, 2, 2, 1], padding="SAME") + self.b1)
        self.L1 = tf.nn.max_pool(
            self.L1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")

        # Convolution layer 2
        self.w2 = tf.Variable(
            tf.truncated_normal([patch_size, patch_size, depth, depth], stddev=0.01))
        self.b2 = tf.Variable(tf.zeros([depth]))
        self.L2 = tf.nn.relu(
            tf.nn.conv2d(self.L1, self.w2, strides=[1, 2, 2, 1], padding="SAME") + self.b2)

        # Flatten
        shape = self.L2.get_shape().as_list()
        reshape = tf.reshape(self.L2, [-1, shape[1] * shape[2] * shape[3]])  # -1 for the (unknown) batch size

        # Fully connected layer 1
        self.L3 = tf.contrib.layers.fully_connected(
            inputs=reshape,
            num_outputs=self.config["n_hidden_units"],
            activation_fn=tf.nn.relu,
            weights_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.02),
            biases_initializer=tf.zeros_initializer())

        # Fully connected layer 2
        self.probs = tf.contrib.layers.fully_connected(
            inputs=self.L3,
            num_outputs=self.nA,
            activation_fn=tf.nn.softmax,
            weights_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.02),
            biases_initializer=tf.zeros_initializer())

        self.action = tf.squeeze(tf.multinomial(tf.log(self.probs), 1), name="action")

        # Collect every trainable weight (the convolution kernels/biases above plus the
        # two fully connected layers) so gradients can be accumulated over all of them.
        self.vars = tf.trainable_variables()

        self.action_taken = tf.placeholder(tf.float32, shape=[None, self.nA], name="action_taken")
        self.feedback = tf.placeholder(tf.float32, shape=[None, self.nA], name="feedback")
        loss = tf.reduce_mean(
            tf.squared_difference(self.action_taken, self.probs) * self.feedback)

        self.create_accumulative_grads = create_accumulative_gradients_op(self.vars)
        self.accumulate_grads = add_accumulative_gradients_op(
            self.vars, self.create_accumulative_grads, loss)
        self.reset_accumulative_grads = reset_accumulative_gradients_op(
            self.vars, self.create_accumulative_grads)

        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.config["learning_rate"],
            decay=self.config["decay"],
            epsilon=self.config["epsilon"])
        self.apply_gradients = self.optimizer.apply_gradients(
            zip(self.create_accumulative_grads, self.vars))

        init = tf.global_variables_initializer()

        # Launch the graph.
        num_cpu = multiprocessing.cpu_count()
        tf_config = tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=num_cpu,
            intra_op_parallelism_threads=num_cpu)
        self.session = tf.Session(config=tf_config)
        self.session.run(init)

    def choose_action(self, state):
        return self.session.run([self.action], feed_dict={self.states: [state]})[0]

    def get_trajectory(self, render=False):
        """
        Run the agent-environment loop for one whole episode (trajectory)
        and return a dictionary of results.
        Note that this function returns more than the get_trajectory in the EnvRunner class.
        """
        state = preprocess_image(self.env.reset())
        prev_state = state
        states = []
        actions = []
        rewards = []
        for _ in range(self.config["episode_max_length"]):
            delta = state - prev_state
            action = self.choose_action(delta)
            states.append(delta)
            prev_state = state
            state, rew, done, _ = self.env.step(action)
            state = preprocess_image(state)
            actions.append(action)
            rewards.append(rew)
            if done:
                break
            if render:
                self.env.render()
        return {
            "reward": np.array(rewards),
            "state": np.array(states),
            "action": np.array(actions),
        }

    def learn(self):
        reporter = Reporter()

        self.session.run([self.reset_accumulative_grads])

        iteration = 0  # amount of batches processed
        episode_nr = 0
        episode_lengths = np.zeros(self.config["batch_size"])
        episode_rewards = np.zeros(self.config["batch_size"])
        mean_rewards = []
        while True:  # Keep executing episodes
            trajectory = self.get_trajectory()

            episode_rewards[episode_nr % self.config["batch_size"]] = sum(trajectory["reward"])
            episode_lengths[episode_nr % self.config["batch_size"]] = len(trajectory["reward"])
            episode_nr += 1
            action_taken = (np.arange(self.nA) == trajectory["action"][:, None]).astype(np.float32)  # one-hot encoding

            discounted_episode_rewards = discount_rewards(
                trajectory["reward"], self.config["gamma"])
            # standardize
            discounted_episode_rewards -= np.mean(discounted_episode_rewards)
            std = np.std(discounted_episode_rewards)
            std = std if std > 0 else 1
            discounted_episode_rewards /= std
            feedback = np.reshape(
                np.repeat(discounted_episode_rewards, self.nA),
                (len(discounted_episode_rewards), self.nA))

            self.session.run(
                [self.accumulate_grads],
                feed_dict={
                    self.states: trajectory["state"],
                    self.action_taken: action_taken,
                    self.feedback: feedback
                })
            if episode_nr % self.config["batch_size"] == 0:  # batch is done
                iteration += 1
                self.session.run([self.apply_gradients])
                self.session.run([self.reset_accumulative_grads])
                reporter.print_iteration_stats(iteration, episode_rewards,
                                               episode_lengths, episode_nr)
                mean_rewards.append(episode_rewards.mean())
                if episode_nr % self.config["draw_frequency"] == 0:
                    reporter.draw_rewards(mean_rewards)
        if self.config["save_model"]:
            tf.add_to_collection("action", self.action)
            tf.add_to_collection("states", self.states)
            self.saver.save(self.session, os.path.join(self.monitor_path, "model"))
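# --- Sketch (an assumption, not the module's actual preprocess_image) of a
# preprocessing step compatible with the [None, 80, 80, 1] grayscale placeholder
# defined in build_network above. The crop region and channel choice are guesses
# for an Atari-style 210x160 RGB frame.
import numpy as np

def preprocess_image_sketch(frame):
    frame = frame[35:195]             # crop to the playing area (assumed)
    frame = frame[::2, ::2, 0]        # downsample by 2 and keep a single channel
    return frame.astype(np.float32).reshape(80, 80, 1)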
class KnowledgeTransfer(Agent):
    """Learner for variations of a task."""

    def __init__(self, envs, monitor_path, **usercfg):
        super(KnowledgeTransfer, self).__init__(**usercfg)
        self.envs = envs
        self.n_tasks = len(envs)
        self.monitor_path = monitor_path
        self.nA = envs[0].action_space.n
        self.config.update(
            dict(
                timesteps_per_batch=10000,
                trajectories_per_batch=10,
                batch_update="timesteps",
                n_iter=100,
                switch_at_iter=None,
                gamma=0.99,  # Discount past rewards by a percentage
                decay=0.9,  # Decay of RMSProp optimizer
                epsilon=1e-9,  # Epsilon of RMSProp optimizer
                learning_rate=0.005,
                n_hidden_units=10,
                repeat_n_actions=1,
                n_sparse_units=10,
                feature_extraction=False))
        self.config.update(usercfg)

        self.build_networks()
        self.task_runners = [
            EnvRunner(envs[i], TaskPolicy(action, self), self.config)
            for i, action in enumerate(self.action_tensors)
        ]
        if self.config["save_model"]:
            for action_tensor in self.action_tensors:
                tf.add_to_collection("action", action_tensor)
            tf.add_to_collection("states", self.states)
            self.saver = FastSaver()

    def build_networks(self):
        self.session = tf.Session()

        with tf.variable_scope("shared"):
            self.states = tf.placeholder(
                tf.float32, [None] + list(self.envs[0].observation_space.shape),
                name="states")
            self.action_taken = tf.placeholder(tf.float32, name="action_taken")
            self.advantage = tf.placeholder(tf.float32, name="advantage")

            if self.config["feature_extraction"]:
                L1 = tf.contrib.layers.fully_connected(
                    inputs=self.states,
                    num_outputs=self.config["n_hidden_units"],
                    activation_fn=tf.tanh,
                    weights_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.02),
                    biases_initializer=tf.zeros_initializer(),
                    scope="L1")
            else:
                L1 = self.states

            knowledge_base = tf.Variable(
                tf.truncated_normal(
                    [L1.get_shape()[-1].value, self.config["n_sparse_units"]],
                    mean=0.0,
                    stddev=0.02),
                name="knowledge_base")

        self.shared_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="shared")

        # Every task has its own (sparse) representation
        sparse_representations = [
            tf.Variable(
                tf.truncated_normal(
                    [self.config["n_sparse_units"], self.nA], mean=0.0, stddev=0.02),
                name="sparse%d" % i)
            for i in range(self.n_tasks)
        ]

        self.probs_tensors = [
            tf.nn.softmax(tf.matmul(L1, tf.matmul(knowledge_base, s)))
            for s in sparse_representations
        ]
        self.action_tensors = [
            tf.squeeze(tf.multinomial(tf.log(probs), 1))
            for probs in self.probs_tensors
        ]

        self.optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.config["learning_rate"],
            decay=self.config["decay"],
            epsilon=self.config["epsilon"])

        net_vars = self.shared_vars + sparse_representations
        self.accum_grads = create_accumulative_gradients_op(net_vars, 0)

        self.loss = tf.placeholder("float", name="loss")
        summary_loss = tf.summary.scalar("Loss", self.loss)
        self.rewards = tf.placeholder("float", name="Rewards")
        summary_rewards = tf.summary.scalar("Reward", self.rewards)
        self.episode_lengths = tf.placeholder("float", name="Episode_lengths")
        summary_episode_lengths = tf.summary.scalar("Length", self.episode_lengths)
        self.summary_op = tf.summary.merge(
            [summary_loss, summary_rewards, summary_episode_lengths])

        self.writers = []
        self.losses = []

        regularizer = tf.contrib.layers.l1_regularizer(.05)
        for i, probabilities in enumerate(self.probs_tensors):
            good_probabilities = tf.reduce_sum(
                tf.multiply(probabilities,
                            tf.one_hot(tf.cast(self.action_taken, tf.int32), self.nA)),
                reduction_indices=[1])
            eligibility = tf.log(good_probabilities) * self.advantage
            loss = -tf.reduce_sum(eligibility) + regularizer(sparse_representations[i])
            self.losses.append(loss)
            writer = tf.summary.FileWriter(
                os.path.join(self.monitor_path, "task" + str(i)), self.session.graph)
            self.writers.append(writer)

        # An add op for every task & its loss
        self.add_accum_grads = []
        for i, loss in enumerate(self.losses):
            # Use all variables if the switch-tasks experiment is deactivated or it's not the last task
            all_vars = self.config["switch_at_iter"] is None or i != len(self.losses) - 1
            self.add_accum_grads.append(
                add_accumulative_gradients_op(
                    (self.shared_vars if all_vars else []) + [sparse_representations[i]],
                    ([self.accum_grads[0]] if all_vars else []) + [self.accum_grads[i + 1]],
                    loss,
                    i))

        self.apply_gradients = self.optimizer.apply_gradients(
            zip(self.accum_grads, net_vars))
        self.reset_accum_grads = reset_accumulative_gradients_op(
            net_vars, self.accum_grads, 0)

        self.init_op = tf.global_variables_initializer()

    def _initialize(self):
        self.session.run(self.init_op)

    def learn(self):
        """Run learning algorithm"""
        self._initialize()
        reporter = Reporter()
        config = self.config
        total_n_trajectories = np.zeros(len(self.envs))
        for iteration in range(config["n_iter"]):
            self.session.run([self.reset_accum_grads])
            for i, task_runner in enumerate(self.task_runners):
                if self.config["switch_at_iter"] is not None:
                    if iteration >= self.config["switch_at_iter"] and i != (len(self.task_runners) - 1):
                        continue
                    elif iteration < self.config["switch_at_iter"] and i == len(self.task_runners) - 1:
                        continue
                # Collect trajectories until we get timesteps_per_batch total timesteps
                trajectories = task_runner.get_trajectories()
                total_n_trajectories[i] += len(trajectories)
                all_state = np.concatenate(
                    [trajectory["state"] for trajectory in trajectories])
                # Compute discounted sums of rewards
                rets = [
                    discount_rewards(trajectory["reward"], config["gamma"])
                    for trajectory in trajectories
                ]
                max_len = max(len(ret) for ret in rets)
                padded_rets = [
                    np.concatenate([ret, np.zeros(max_len - len(ret))])
                    for ret in rets
                ]
                # Compute time-dependent baseline
                baseline = np.mean(padded_rets, axis=0)
                # Compute advantage function
                advs = [ret - baseline[:len(ret)] for ret in rets]
                all_action = np.concatenate(
                    [trajectory["action"] for trajectory in trajectories])
                all_adv = np.concatenate(advs)

                # Do policy gradient update step
                episode_rewards = np.array([
                    trajectory["reward"].sum() for trajectory in trajectories
                ])  # episode total rewards
                episode_lengths = np.array([
                    len(trajectory["reward"]) for trajectory in trajectories
                ])  # episode lengths

                results = self.session.run(
                    [self.losses[i], self.add_accum_grads[i], self.accum_grads],
                    feed_dict={
                        self.states: all_state,
                        self.action_taken: all_action,
                        self.advantage: all_adv
                    })
                summary = self.session.run(
                    [self.summary_op],
                    feed_dict={
                        self.loss: results[0],
                        self.rewards: np.mean(episode_rewards),
                        self.episode_lengths: np.mean(episode_lengths)
                    })
                self.writers[i].add_summary(summary[0], iteration)
                self.writers[i].flush()

                print("Task:", i)
                reporter.print_iteration_stats(iteration, episode_rewards,
                                               episode_lengths, total_n_trajectories[i])
            # Apply accumulated gradient after all the gradients of each task are summed
            self.session.run([self.apply_gradients])

        if self.config["save_model"]:
            if not os.path.exists(self.monitor_path):
                os.makedirs(self.monitor_path)
            self.saver.save(self.session, os.path.join(self.monitor_path, "model"))
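# --- Shape illustration (not from the original module) of the shared knowledge-base
# factorization built in build_networks above: each task's policy is
# softmax(L1 @ (knowledge_base @ sparse_i)), so only sparse_i is task-specific.
import numpy as np

n_features, n_sparse_units, n_actions = 4, 10, 2        # assumed sizes
L1 = np.random.randn(3, n_features)                     # batch of 3 (extracted) feature vectors
knowledge_base = np.random.randn(n_features, n_sparse_units)   # shared across tasks
sparse_task0 = np.random.randn(n_sparse_units, n_actions)      # task-specific representation

logits = L1 @ (knowledge_base @ sparse_task0)           # shape (3, n_actions)
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)   # per-task softmax policy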