def test_synchronize(self):
    """Synchronize applies filter buffer onto own filter."""
    filt1 = MeanStdFilter(())
    for i in range(10):
        filt1(i)
    self.assertEqual(filt1.rs.n, 10)
    filt1.reset_buffer()
    self.assertEqual(filt1.buffer.n, 0)

    RemoteWorker = ray.remote(_MockWorker)
    remote_e = RemoteWorker.remote(sample_count=10)
    remote_e.sample.remote()

    FilterManager.synchronize({
        "obs_filter": filt1,
        "rew_filter": filt1.copy(),
    }, [remote_e])

    filters = ray.get(remote_e.get_filters.remote())
    obs_f = filters["obs_filter"]
    self.assertEqual(filt1.rs.n, 20)
    self.assertEqual(filt1.buffer.n, 0)
    self.assertEqual(obs_f.rs.n, filt1.rs.n)
    self.assertEqual(obs_f.buffer.n, filt1.buffer.n)
def __init__(self, registry, env_creator, config, logdir, is_remote):
    self.registry = registry
    self.config = config
    self.logdir = logdir
    self.env = ModelCatalog.get_preprocessor_as_wrapper(
        registry, env_creator(config["env_config"]), config["model"])
    if is_remote:
        config_proto = tf.ConfigProto()
    else:
        config_proto = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=config_proto)
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]

    # Defines the training inputs:
    # The coefficient of the KL penalty.
    self.kl_coeff = tf.placeholder(
        name="newkl", shape=(), dtype=tf.float32)
    # The input observations.
    self.observations = tf.placeholder(
        tf.float32, shape=(None,) + self.env.observation_space.shape)
    # Targets of the value function.
    self.value_targets = tf.placeholder(tf.float32, shape=(None,))
    # Advantage values in the policy gradient estimator.
    self.advantages = tf.placeholder(tf.float32, shape=(None,))

    action_space = self.env.action_space
    self.actions = ModelCatalog.get_action_placeholder(action_space)
    self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, config["model"])
    # Log probabilities from the policy before the policy update.
    self.prev_logits = tf.placeholder(
        tf.float32, shape=(None, self.logit_dim))
    # Value function predictions before the policy update.
    self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

    self.inputs = [
        ("obs", self.observations),
        ("value_targets", self.value_targets),
        ("advantages", self.advantages),
        ("actions", self.actions),
        ("logprobs", self.prev_logits),
        ("vf_preds", self.prev_vf_preds),
    ]
    self.common_policy = self.build_tf_loss([ph for _, ph in self.inputs])

    # References to the model weights.
    self.variables = ray.experimental.TensorFlowVariables(
        self.common_policy.loss, self.sess)
    self.obs_filter = get_filter(
        config["observation_filter"], self.env.observation_space.shape)
    self.rew_filter = MeanStdFilter((), clip=5.0)
    self.filters = {
        "obs_filter": self.obs_filter,
        "rew_filter": self.rew_filter,
    }
    self.sampler = SyncSampler(
        self.env, self.common_policy, self.obs_filter,
        self.config["horizon"], self.config["horizon"])
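# The (name, placeholder) pairs in `self.inputs` above line up with
# SampleBatch columns, so a feed dict for one SGD step can be built
# mechanically. A minimal sketch, assuming `evaluator` is an instance of
# the class above and `batch` is a SampleBatch whose keys match the names:
feed = {ph: batch[name] for name, ph in evaluator.inputs}
loss_value = evaluator.sess.run(evaluator.common_policy.loss, feed_dict=feed)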
class _MockWorker:
    def __init__(self, sample_count=10):
        self._weights = np.array([-10, -10, -10, -10])
        self._grad = np.array([1, 1, 1, 1])
        self._sample_count = sample_count
        self.obs_filter = MeanStdFilter(())
        self.rew_filter = MeanStdFilter(())
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter,
        }

    def sample(self):
        samples_dict = {"observations": [], "rewards": []}
        for i in range(self._sample_count):
            samples_dict["observations"].append(
                self.obs_filter(np.random.randn()))
            samples_dict["rewards"].append(
                self.rew_filter(np.random.randn()))
        return SampleBatch(samples_dict)

    def compute_gradients(self, samples):
        return self._grad * samples.count, {"batch_count": samples.count}

    def apply_gradients(self, grads):
        self._weights += self._grad

    def get_weights(self):
        return self._weights

    def set_weights(self, weights):
        self._weights = weights

    def get_filters(self, flush_after=False):
        obs_filter = self.obs_filter.copy()
        rew_filter = self.rew_filter.copy()
        if flush_after:
            self.obs_filter.reset_buffer()
            self.rew_filter.reset_buffer()
        return {"obs_filter": obs_filter, "rew_filter": rew_filter}

    def sync_filters(self, new_filters):
        assert all(k in new_filters for k in self.filters)
        for k in self.filters:
            self.filters[k].sync(new_filters[k])
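# A minimal sketch of the driver-side merge that the worker methods above
# support. FilterManager.synchronize follows this pattern; the explicit
# loop here is an illustrative assumption, not its actual source.
worker = _MockWorker(sample_count=5)
worker.sample()  # accumulates running stats and buffer deltas
deltas = worker.get_filters(flush_after=True)  # snapshot deltas, then reset
local_filters = {"obs_filter": MeanStdFilter(()), "rew_filter": MeanStdFilter(())}
for name, filt in local_filters.items():
    # Fold the worker's buffered observations into the driver's filter.
    filt.apply_changes(deltas[name], with_buffer=False)
# Push the merged statistics back so the worker starts from the same state.
worker.sync_filters({name: f.copy() for name, f in local_filters.items()})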
class _MockEvaluator(object):
    def __init__(self, sample_count=10):
        self._weights = np.array([-10, -10, -10, -10])
        self._grad = np.array([1, 1, 1, 1])
        self._sample_count = sample_count
        self.obs_filter = MeanStdFilter(())
        self.rew_filter = MeanStdFilter(())
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter,
        }

    def sample(self):
        samples_dict = {"observations": [], "rewards": []}
        for i in range(self._sample_count):
            samples_dict["observations"].append(
                self.obs_filter(np.random.randn()))
            samples_dict["rewards"].append(
                self.rew_filter(np.random.randn()))
        return SampleBatch(samples_dict)

    def compute_gradients(self, samples):
        return self._grad * samples.count, {"batch_count": samples.count}

    def apply_gradients(self, grads):
        self._weights += self._grad

    def get_weights(self):
        return self._weights

    def set_weights(self, weights):
        self._weights = weights

    def get_filters(self, flush_after=False):
        obs_filter = self.obs_filter.copy()
        rew_filter = self.rew_filter.copy()
        if flush_after:
            self.obs_filter.clear_buffer()
            self.rew_filter.clear_buffer()
        return {"obs_filter": obs_filter, "rew_filter": rew_filter}

    def sync_filters(self, new_filters):
        assert all(k in new_filters for k in self.filters)
        for k in self.filters:
            self.filters[k].sync(new_filters[k])
def __init__(self, config, env_creator, worker_index):
    super().__init__(config, env_creator, worker_index)
    self.elite = VBNChromosome(
        number_actions=self.config['number_actions'],
        input_channels=self.config['input_channels'])
    self.opponent = VBNChromosome(
        number_actions=self.config['number_actions'],
        input_channels=self.config['input_channels'])
    self.filter = MeanStdFilter(self.env.reset()[PLAYER_2_ID].shape)
    self.video_path = f'/tmp/video_worker_{worker_index}.mp4'
    self.video_path_eval = f'/tmp/video_worker_{worker_index}_eval.mp4'
def collect_samples(self):
    """Sample game frames from the environment by letting two random
    policies play against each other."""
    env = self.env_creator(self.config)
    obs = env.reset()
    obs_filter = MeanStdFilter(obs[PLAYER_1_ID].shape)
    policy = RandomPolicy(self.config['number_actions'])
    samples = []
    for _ in range(500):
        # Both players act randomly. The per-player action dict and the
        # `select_action` method name are assumptions about RandomPolicy's
        # interface; the original call site passed no actions at all.
        actions = {pid: policy.select_action()
                   for pid in (PLAYER_1_ID, PLAYER_2_ID)}
        obs, _, done, _ = env.step(actions)
        samples += [obs_filter(obs[PLAYER_1_ID]),
                    obs_filter(obs[PLAYER_2_ID])]
        if done[PLAYER_1_ID]:
            obs = env.reset()
    return samples
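# The frames collected above typically serve as a reference batch for
# Virtual Batch Normalization. A minimal sketch of deriving the reference
# statistics; `worker` is assumed to be an instance of the class above, and
# the step that seeds VBNChromosome is omitted since its API is not shown.
samples = worker.collect_samples()
reference = np.stack(samples)  # shape: (n_frames, *obs_shape)
ref_mean, ref_std = reference.mean(axis=0), reference.std(axis=0)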
def testSynchronize(self):
    """Synchronize applies filter buffer onto own filter."""
    filt1 = MeanStdFilter(())
    for i in range(10):
        filt1(i)
    self.assertEqual(filt1.rs.n, 10)
    filt1.clear_buffer()
    self.assertEqual(filt1.buffer.n, 0)

    RemoteEvaluator = ray.remote(_MockEvaluator)
    remote_e = RemoteEvaluator.remote(sample_count=10)
    remote_e.sample.remote()

    FilterManager.synchronize({
        "obs_filter": filt1,
        "rew_filter": filt1.copy(),
    }, [remote_e])

    filters = ray.get(remote_e.get_filters.remote())
    obs_f = filters["obs_filter"]
    self.assertEqual(filt1.rs.n, 20)
    self.assertEqual(filt1.buffer.n, 0)
    self.assertEqual(obs_f.rs.n, filt1.rs.n)
    self.assertEqual(obs_f.buffer.n, filt1.buffer.n)
def testBasic(self):
    for shape in [(), (3,), (3, 4, 4)]:
        filt = MeanStdFilter(shape)
        for i in range(5):
            filt(np.ones(shape))
        self.assertEqual(filt.rs.n, 5)
        self.assertEqual(filt.buffer.n, 5)

        filt2 = MeanStdFilter(shape)
        filt2.sync(filt)
        self.assertEqual(filt2.rs.n, 5)
        self.assertEqual(filt2.buffer.n, 5)

        filt.clear_buffer()
        self.assertEqual(filt.buffer.n, 0)
        self.assertEqual(filt2.buffer.n, 5)

        filt.apply_changes(filt2, with_buffer=False)
        self.assertEqual(filt.buffer.n, 0)
        self.assertEqual(filt.rs.n, 10)

        filt.apply_changes(filt2, with_buffer=True)
        self.assertEqual(filt.buffer.n, 5)
        self.assertEqual(filt.rs.n, 15)
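# Minimal usage sketch of the MeanStdFilter semantics exercised in the test
# above: calling the filter updates both the running statistics (rs) and the
# change buffer, and returns the standardized value.
filt = MeanStdFilter((3,))
x = np.array([1.0, 2.0, 3.0])
y = filt(x)  # rs.n == 1 and buffer.n == 1 after this call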
def __init__(self, registry, env_creator, config, logdir, is_remote):
    self.registry = registry
    self.is_remote = is_remote
    if is_remote:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        devices = ["/cpu:0"]
    else:
        devices = config["devices"]
    self.devices = devices
    self.config = config
    self.logdir = logdir
    self.env = ModelCatalog.get_preprocessor_as_wrapper(
        registry, env_creator(config["env_config"]), config["model"])
    if is_remote:
        config_proto = tf.ConfigProto()
    else:
        config_proto = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=config_proto)
    if config["tf_debug_inf_or_nan"] and not is_remote:
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        self.sess.add_tensor_filter(
            "has_inf_or_nan", tf_debug.has_inf_or_nan)

    # Defines the training inputs:
    # The coefficient of the KL penalty.
    self.kl_coeff = tf.placeholder(
        name="newkl", shape=(), dtype=tf.float32)
    # The input observations.
    self.observations = tf.placeholder(
        tf.float32, shape=(None,) + self.env.observation_space.shape)
    # Targets of the value function.
    self.value_targets = tf.placeholder(tf.float32, shape=(None,))
    # Advantage values in the policy gradient estimator.
    self.advantages = tf.placeholder(tf.float32, shape=(None,))

    action_space = self.env.action_space
    # TODO(rliaw): pull this into model_catalog
    if isinstance(action_space, gym.spaces.Box):
        self.actions = tf.placeholder(
            tf.float32, shape=(None, action_space.shape[0]))
    elif isinstance(action_space, gym.spaces.Discrete):
        self.actions = tf.placeholder(tf.int64, shape=(None,))
    else:
        raise NotImplementedError(
            "action space " + str(type(action_space)) +
            " currently not supported")
    self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space)
    # Log probabilities from the policy before the policy update.
    self.prev_logits = tf.placeholder(
        tf.float32, shape=(None, self.logit_dim))
    # Value function predictions before the policy update.
    self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

    assert config["sgd_batchsize"] % len(devices) == 0, \
        "Batch size must be evenly divisible by devices"
    if is_remote:
        self.batch_size = config["rollout_batchsize"]
        self.per_device_batch_size = config["rollout_batchsize"]
    else:
        self.batch_size = config["sgd_batchsize"]
        self.per_device_batch_size = int(self.batch_size / len(devices))

    def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
        return ProximalPolicyLoss(
            self.env.observation_space, self.env.action_space,
            obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
            self.kl_coeff, self.distribution_class, self.config,
            self.sess, self.registry)

    self.par_opt = LocalSyncParallelOptimizer(
        tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
        self.devices,
        [self.observations, self.value_targets, self.advantages,
         self.actions, self.prev_logits, self.prev_vf_preds],
        self.per_device_batch_size,
        build_loss,
        self.logdir)

    # Metric ops
    with tf.name_scope("test_outputs"):
        policies = self.par_opt.get_device_losses()
        self.mean_loss = tf.reduce_mean(
            tf.stack(values=[policy.loss for policy in policies]), 0)
        self.mean_policy_loss = tf.reduce_mean(
            tf.stack(
                values=[policy.mean_policy_loss for policy in policies]), 0)
        self.mean_vf_loss = tf.reduce_mean(
            tf.stack(values=[policy.mean_vf_loss for policy in policies]), 0)
        self.mean_kl = tf.reduce_mean(
            tf.stack(values=[policy.mean_kl for policy in policies]), 0)
        self.mean_entropy = tf.reduce_mean(
            tf.stack(values=[policy.mean_entropy for policy in policies]), 0)

    # References to the model weights.
    self.common_policy = self.par_opt.get_common_loss()
    self.variables = ray.experimental.TensorFlowVariables(
        self.common_policy.loss, self.sess)
    self.obs_filter = get_filter(
        config["observation_filter"], self.env.observation_space.shape)
    self.rew_filter = MeanStdFilter((), clip=5.0)
    self.filters = {
        "obs_filter": self.obs_filter,
        "rew_filter": self.rew_filter,
    }
    self.sampler = SyncSampler(
        self.env, self.common_policy, self.obs_filter,
        self.config["horizon"], self.config["horizon"])
    self.sess.run(tf.global_variables_initializer())
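# Sketch of the batch-splitting invariant asserted above: each device tower
# receives an equal slice of the SGD batch. The numbers are illustrative.
batch_size = 128
devices = ["/gpu:0", "/gpu:1"]
assert batch_size % len(devices) == 0, \
    "Batch size must be evenly divisible by devices"
per_device_batch_size = batch_size // len(devices)  # 64 samples per tower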
def _init(self):
    self.reward_filter = MeanStdFilter((), clip=5.)
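# A minimal sketch of how a reward filter like the one above is typically
# applied during rollouts (the surrounding loop is an assumption, not shown
# in the source): each raw reward updates the running statistics and comes
# back standardized, clipped to [-5, 5].
reward_filter = MeanStdFilter((), clip=5.)
scaled = [reward_filter(r) for r in [1.0, 10.0, -3.0]]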