def __init__(self, initial_frames, problem):
  self._history_buff = None
  initial_shape = common_layers.shape_list(initial_frames)
  var_shape = [initial_shape[0], problem.frame_height,
               problem.frame_width, problem.num_channels]
  with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    history_buff = tf.get_variable("history_observ", var_shape,
                                   initializer=tf.zeros_initializer,
                                   trainable=False)
  self._history_buff = history_buff
  self._assigned = False

  with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    if FLAGS.autoencoder_path:
      # Feeds for autoencoding.
      problem.setup_autoencoder()
      autoencoder_model = problem.autoencoder_model
      autoencoder_model.set_mode(tf.estimator.ModeKeys.EVAL)
      autoencoded = autoencoder_model.encode(
          tf.expand_dims(initial_frames, axis=1))
      autoencoded_shape = common_layers.shape_list(autoencoded)
      autoencoded = tf.reshape(  # Make 8-bit groups.
          autoencoded, autoencoded_shape[:-1] + [3, 8])
      initial_frames = discretization.bit_to_int(autoencoded, 8)
      initial_frames = tf.to_float(initial_frames)
  self.initial_frames = initial_frames

  with tf.control_dependencies([history_buff.assign(initial_frames)]):
    self._history_buff_id = tf.identity(history_buff)
def testBitToIntOnes(self):
  x_bit = tf.ones(shape=[1, 3], dtype=tf.float32)
  x_int = 7 * tf.ones(shape=[1], dtype=tf.int32)
  diff = discretization.bit_to_int(x_bit, num_bits=3) - x_int
  with self.test_session() as sess:
    tf.global_variables_initializer().run()
    d = sess.run(diff)
    self.assertEqual(d, 0)
def encode_dataset(model, dataset, problem, ae_hparams, autoencoder_path,
                   out_files):
  """Encode all frames in dataset with model and write them out to out_files."""
  batch_size = 8
  dataset = dataset.batch(batch_size)
  examples = dataset.make_one_shot_iterator().get_next()
  images = examples.pop("frame")
  images = tf.cast(images, tf.int32)

  encoded = model.encode(images)
  encoded_frame_height = int(
      math.ceil(problem.frame_height / 2**ae_hparams.num_hidden_layers))
  encoded_frame_width = int(
      math.ceil(problem.frame_width / 2**ae_hparams.num_hidden_layers))
  num_bits = 8
  encoded = tf.reshape(
      encoded, [-1, encoded_frame_height, encoded_frame_width, 3, num_bits])
  encoded = tf.cast(discretization.bit_to_int(encoded, num_bits), tf.uint8)

  pngs = tf.map_fn(tf.image.encode_png, encoded, dtype=tf.string,
                   back_prop=False)

  with tf.Session() as sess:
    autoencoder_saver = tf.train.Saver(tf.global_variables("autoencoder.*"))
    trainer_lib.restore_checkpoint(autoencoder_path, autoencoder_saver, sess,
                                   must_restore=True)

    def generator():
      """Generate examples."""
      while True:
        try:
          pngs_np, examples_np = sess.run([pngs, examples])
          rewards = examples_np["reward"].tolist()
          actions = examples_np["action"].tolist()
          frame_numbers = examples_np["frame_number"].tolist()
          for action, reward, frame_number, png in \
              zip(actions, rewards, frame_numbers, pngs_np):
            yield {
                "action": action,
                "reward": reward,
                "frame_number": frame_number,
                "image/encoded": [png],
                "image/format": ["png"],
                "image/height": [encoded_frame_height],
                "image/width": [encoded_frame_width],
            }
        except tf.errors.OutOfRangeError:
          break

    generator_utils.generate_files(
        generator(), out_files,
        cycle_every_n=problem.total_number_of_frames // 10)
def autoencode_tensor(self, x):
  if self.autoencoder_model is None:
    return x
  shape = [self.raw_frame_height, self.raw_frame_width, self.num_channels]
  with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    self.autoencoder_model.set_mode(tf.estimator.ModeKeys.EVAL)
    # TODO(lukaszkaiser): we assume batch size=1 for now here, change later!
    autoencoded = self.autoencoder_model.encode(
        tf.reshape(x, [1, 1] + shape))
  autoencoded = tf.reshape(
      autoencoded,
      [self.frame_height, self.frame_width, self.num_channels, 8])  # 8-bit groups.
  return discretization.bit_to_int(autoencoded, 8)
def autoencode_tensor(self, x, batch_size=1):
  if self.autoencoder_model is None:
    return x
  shape = [self.raw_frame_height, self.raw_frame_width, self.num_channels]
  with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    self.autoencoder_model.set_mode(tf.estimator.ModeKeys.EVAL)
    autoencoded = self.autoencoder_model.encode(
        tf.reshape(x, [batch_size, 1] + shape))
  autoencoded = tf.reshape(
      autoencoded, [batch_size] + self.frame_shape + [8])  # 8-bit groups.
  if batch_size == 1:
    autoencoded = tf.squeeze(autoencoded, axis=0)
  return discretization.bit_to_int(autoencoded, 8)
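# Hypothetical usage sketch for the batched autoencode_tensor above (the
# `problem` instance and the input tensor are placeholders, and the
# autoencoder is assumed to have been created via problem.setup_autoencoder()
# with its checkpoint restored):
raw_frames = tf.placeholder(
    tf.int32, [4, problem.raw_frame_height, problem.raw_frame_width,
               problem.num_channels])
# Returns integer latent codes of shape [4] + problem.frame_shape, where each
# value packs a group of 8 bottleneck bits into one code in [0, 255].
latent_codes = problem.autoencode_tensor(raw_frames, batch_size=4)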
def get_initial_observations(self):
  initial_frames = self.input_data_iterator.get_next()["inputs"]
  if self.autoencoder_model:
    autoencoded = self.autoencoder_model.encode(
        tf.expand_dims(initial_frames, axis=1))
    autoencoded_shape = common_layers.shape_list(autoencoded)
    autoencoded = tf.reshape(  # Make 8-bit groups.
        autoencoded, autoencoded_shape[:-1] + [3, 8])
    initial_frames = discretization.bit_to_int(autoencoded, 8)
    initial_frames = tf.to_float(initial_frames)
  else:
    initial_frames = tf.cast(initial_frames, tf.float32)
  return initial_frames
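# Shape sketch for the "Make 8-bit groups" reshape above: the autoencoder's
# bottleneck carries 3 * 8 = 24 bits per latent position; regrouping the last
# axis into [3, 8] and applying bit_to_int(..., 8) turns each 8-bit group into
# one integer channel in [0, 255]. Standalone illustration with hypothetical
# shapes (not taken from a real autoencoder):
bits = tf.cast(tf.random_uniform([2, 1, 14, 10, 24]) > 0.5, tf.float32)
grouped = tf.reshape(bits, [2, 1, 14, 10, 3, 8])
codes = discretization.bit_to_int(grouped, 8)  # shape [2, 1, 14, 10, 3]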
def encode_dataset(model, dataset, problem, ae_hparams, autoencoder_path,
                   out_files):
  """Encode all frames in dataset with model and write them out to out_files."""
  batch_size = 8
  dataset = dataset.batch(batch_size)
  examples = dataset.make_one_shot_iterator().get_next()
  images = examples.pop("frame")
  images = tf.expand_dims(images, 1)

  encoded = model.encode(images)
  encoded_frame_height = int(
      math.ceil(problem.frame_height / 2**ae_hparams.num_hidden_layers))
  encoded_frame_width = int(
      math.ceil(problem.frame_width / 2**ae_hparams.num_hidden_layers))
  num_bits = 8
  encoded = tf.reshape(
      encoded, [-1, encoded_frame_height, encoded_frame_width, 3, num_bits])
  encoded = tf.cast(discretization.bit_to_int(encoded, num_bits), tf.uint8)

  pngs = tf.map_fn(tf.image.encode_png, encoded, dtype=tf.string,
                   back_prop=False)

  with tf.Session() as sess:
    autoencoder_saver = tf.train.Saver(tf.global_variables("autoencoder.*"))
    trainer_lib.restore_checkpoint(autoencoder_path, autoencoder_saver, sess,
                                   must_restore=True)

    def generator():
      """Generate examples."""
      while True:
        try:
          pngs_np, examples_np = sess.run([pngs, examples])
          rewards_np = [list(el) for el in examples_np["reward"]]
          actions_np = [list(el) for el in examples_np["action"]]
          pngs_np = [el for el in pngs_np]
          for action, reward, png in zip(actions_np, rewards_np, pngs_np):
            yield {
                "action": action,
                "reward": reward,
                "image/encoded": [png],
                "image/format": ["png"],
                "image/height": [encoded_frame_height],
                "image/width": [encoded_frame_width],
            }
        except tf.errors.OutOfRangeError:
          break

    generator_utils.generate_files(
        generator(), out_files,
        cycle_every_n=problem.total_number_of_frames // 10)
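# Hypothetical call sketch for encode_dataset (names such as `my_problem`,
# `ae_hparams`, `autoencoder_model`, `data_dir`, `output_dir`, and the
# checkpoint path are placeholders for illustration, not identifiers from
# this codebase):
dataset = my_problem.dataset(tf.estimator.ModeKeys.TRAIN, data_dir)
out_files = my_problem.training_filepaths(output_dir, 10, shuffled=True)
encode_dataset(autoencoder_model, dataset, my_problem, ae_hparams,
               autoencoder_path="/path/to/autoencoder_checkpoint",
               out_files=out_files)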
def inject_latent(self, layer, features, filters):
  """Inject a deterministic latent based on the target frame."""
  del filters
  hparams = self.hparams
  final_filters = common_layers.shape_list(layer)[-1]
  filters = hparams.hidden_size
  kernel = (4, 4)
  layer_shape = common_layers.shape_list(layer)
  batch_size = layer_shape[0]
  state_size = hparams.latent_predictor_state_size
  lstm_cell = tf.contrib.rnn.LSTMCell(state_size)
  discrete_predict = tf.layers.Dense(256, name="discrete_predict")
  discrete_embed = tf.layers.Dense(state_size, name="discrete_embed")

  def add_d(layer, d):
    z_mul = tf.layers.dense(d, final_filters, name="unbottleneck_mul")
    if not hparams.complex_addn:
      return layer + z_mul
    layer *= tf.nn.sigmoid(z_mul)
    z_add = tf.layers.dense(d, final_filters, name="unbottleneck_add")
    layer += z_add
    return layer

  if self.is_predicting:
    if hparams.full_latent_tower:
      rand = tf.random_uniform(layer_shape[:-1] + [hparams.bottleneck_bits])
    else:
      layer_pred = tf.reshape(layer, [batch_size, prod(layer_shape[1:])])
      prediction = tf.layers.dense(layer_pred, state_size, name="istate")
      c_state = tf.layers.dense(layer_pred, state_size, name="cstate")
      m_state = tf.layers.dense(layer_pred, state_size, name="mstate")
      state = (c_state, m_state)
      outputs = []
      for i in range(hparams.bottleneck_bits // 8):
        output, state = lstm_cell(prediction, state)
        discrete_logits = discrete_predict(output)
        discrete_samples = common_layers.sample_with_temperature(
            discrete_logits, hparams.latent_predictor_temperature)
        outputs.append(tf.expand_dims(discrete_samples, axis=1))
        prediction = discrete_embed(tf.one_hot(discrete_samples, 256))
      outputs = tf.concat(outputs, axis=1)
      outputs = discretization.int_to_bit(outputs, 8)
      rand = tf.reshape(outputs, [batch_size, 1, 1, hparams.bottleneck_bits])
    d = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0
    return add_d(layer, d), 0.0

  # Embed.
  frames = tf.concat([features["cur_target_frame"], features["inputs"]],
                     axis=-1)
  x = tf.layers.dense(
      frames, filters, name="latent_embed",
      bias_initializer=tf.random_normal_initializer(stddev=0.01))
  x = common_attention.add_timing_signal_nd(x)
  if hparams.full_latent_tower:
    for i in range(hparams.num_compress_steps):
      with tf.variable_scope("latent_downstride%d" % i):
        x = common_layers.make_even_size(x)
        if i < hparams.filter_double_steps:
          filters *= 2
        x = common_attention.add_timing_signal_nd(x)
        x = tf.layers.conv2d(x, filters, kernel,
                             activation=common_layers.belu,
                             strides=(2, 2), padding="SAME")
        x = common_layers.layer_norm(x)
  else:
    x = common_layers.double_discriminator(x)
    x = tf.expand_dims(tf.expand_dims(x, axis=1), axis=1)
  x = tf.layers.dense(x, hparams.bottleneck_bits, name="bottleneck")
  x0 = tf.tanh(x)
  d = x0 + tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x0)) - 1.0 - x0)

  pred_loss = 0.0
  if not hparams.full_latent_tower:
    d_pred = tf.reshape(tf.maximum(tf.stop_gradient(d), 0),
                        [batch_size, hparams.bottleneck_bits // 8, 8])
    d_int = discretization.bit_to_int(d_pred, 8)
    tf.summary.histogram("d_int", tf.reshape(d_int, [-1]))
    d_hot = tf.one_hot(d_int, 256, axis=-1)
    d_pred = discrete_embed(d_hot)
    layer_pred = tf.reshape(layer, [batch_size, prod(layer_shape[1:])])
    prediction0 = tf.layers.dense(layer_pred, state_size, name="istate")
    c_state = tf.layers.dense(layer_pred, state_size, name="cstate")
    m_state = tf.layers.dense(layer_pred, state_size, name="mstate")
    pred = tf.concat([tf.expand_dims(prediction0, axis=1), d_pred], axis=1)
    state = (c_state, m_state)
    outputs = []
    for i in range(hparams.bottleneck_bits // 8):
      output, state = lstm_cell(pred[:, i, :], state)
      outputs.append(tf.expand_dims(output, axis=1))
    outputs = tf.concat(outputs, axis=1)
    d_int_pred = discrete_predict(outputs)
    pred_loss = tf.losses.sparse_softmax_cross_entropy(
        logits=d_int_pred, labels=d_int)
    pred_loss = tf.reduce_mean(pred_loss)

  if hparams.mode == tf.estimator.ModeKeys.TRAIN:
    x += tf.truncated_normal(
        common_layers.shape_list(x), mean=0.0, stddev=0.2)
    x = tf.tanh(x)
    noise = tf.random_uniform(common_layers.shape_list(x))
    noise = 2.0 * tf.to_float(tf.less(hparams.bottleneck_noise, noise)) - 1.0
    x *= noise
    d = x + tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x)) - 1.0 - x)
    p = common_layers.inverse_lin_decay(hparams.discrete_warmup_steps)
    d = tf.where(tf.less(tf.random_uniform([batch_size]), p), d, x)
  return add_d(layer, d), pred_loss
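# Side note on the discretization trick used in inject_latent above: the line
# d = x0 + tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x0)) - 1.0 - x0)
# is a straight-through estimator. The forward value of d is the hard sign of
# x0 in {-1, 1}, while the gradient of d w.r.t. x0 is the identity, so the
# binarization does not block backpropagation. A tiny standalone sketch (not
# part of this codebase):
x0 = tf.tanh(tf.random_normal([2, 4]))            # continuous code in (-1, 1)
hard = 2.0 * tf.to_float(tf.less(0.0, x0)) - 1.0  # hard {-1, 1} values
d = x0 + tf.stop_gradient(hard - x0)              # forward: hard, backward: identity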
def next_frame(self, frames, actions, rewards, target_frame,
               internal_states, video_extra):
  del rewards, video_extra

  hparams = self.hparams
  filters = hparams.hidden_size
  kernel2 = (4, 4)
  action = actions[-1]
  activation_fn = common_layers.belu
  if self.hparams.activation_fn == "relu":
    activation_fn = tf.nn.relu

  # Normalize frames.
  frames = [common_layers.standardize_images(f) for f in frames]

  # Stack the inputs.
  if internal_states is not None and hparams.concat_internal_states:
    # Use the first part of the first internal state if asked to concatenate.
    batch_size = common_layers.shape_list(frames[0])[0]
    internal_state = internal_states[0][0][:batch_size, :, :, :]
    stacked_frames = tf.concat(frames + [internal_state], axis=-1)
  else:
    stacked_frames = tf.concat(frames, axis=-1)
  inputs_shape = common_layers.shape_list(stacked_frames)

  # Update internal states early if requested.
  if hparams.concat_internal_states:
    internal_states = self.update_internal_states_early(
        internal_states, frames)

  # Using non-zero bias initializer below for edge cases of uniform inputs.
  x = tf.layers.dense(
      stacked_frames, filters, name="inputs_embed",
      bias_initializer=tf.random_normal_initializer(stddev=0.01))
  x = common_attention.add_timing_signal_nd(x)

  # Down-stride.
  layer_inputs = [x]
  for i in range(hparams.num_compress_steps):
    with tf.variable_scope("downstride%d" % i):
      layer_inputs.append(x)
      x = tf.nn.dropout(x, 1.0 - self.hparams.dropout)
      x = common_layers.make_even_size(x)
      if i < hparams.filter_double_steps:
        filters *= 2
      x = common_attention.add_timing_signal_nd(x)
      x = tf.layers.conv2d(x, filters, kernel2, activation=activation_fn,
                           strides=(2, 2), padding="SAME")
      x = common_layers.layer_norm(x)

  if self.has_actions:
    with tf.variable_scope("policy"):
      x_flat = tf.layers.flatten(x)
      policy_pred = tf.layers.dense(x_flat, self.hparams.problem.num_actions)
      value_pred = tf.layers.dense(x_flat, 1)
      value_pred = tf.squeeze(value_pred, axis=-1)
  else:
    policy_pred, value_pred = None, None

  # Add embedded action if present.
  if self.has_actions:
    x = common_video.inject_additional_input(x, action, "action_enc",
                                             hparams.action_injection)

  # Inject latent if present. Only for stochastic models.
  norm_target_frame = common_layers.standardize_images(target_frame)
  x, extra_loss = self.inject_latent(x, frames, norm_target_frame, action)

  x_mid = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
  x, internal_states = self.middle_network(x, internal_states)

  # Up-convolve.
  layer_inputs = list(reversed(layer_inputs))
  for i in range(hparams.num_compress_steps):
    with tf.variable_scope("upstride%d" % i):
      x = tf.nn.dropout(x, 1.0 - self.hparams.dropout)
      if self.has_actions:
        x = common_video.inject_additional_input(
            x, action, "action_enc", hparams.action_injection)
      if i >= hparams.num_compress_steps - hparams.filter_double_steps:
        filters //= 2
      x = tf.layers.conv2d_transpose(x, filters, kernel2,
                                     activation=activation_fn,
                                     strides=(2, 2), padding="SAME")
      y = layer_inputs[i]
      shape = common_layers.shape_list(y)
      x = x[:, :shape[1], :shape[2], :]
      x = common_layers.layer_norm(x + y)
      x = common_attention.add_timing_signal_nd(x)

  # Cut down to original size.
  x = x[:, :inputs_shape[1], :inputs_shape[2], :]
  x_fin = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
  if hparams.do_autoregressive_rnn:
    # If enabled, we predict the target frame autoregressively using RNNs.
    # To this end, the current prediction is flattened into one long sequence
    # of sub-pixels, and so is the target frame. Each sub-pixel (RGB value,
    # from 0 to 255) is predicted with an RNN.
    # To avoid doing as many steps as width * height * channels, we only use
    # a number of pixels back, as many as hparams.autoregressive_rnn_lookback.
    with tf.variable_scope("autoregressive_rnn"):
      batch_size = common_layers.shape_list(frames[0])[0]
      # Height, width, channels and lookback are the constants we need.
      h, w = inputs_shape[1], inputs_shape[2]  # 105, 80 on Atari games
      c = hparams.problem.num_channels
      lookback = hparams.autoregressive_rnn_lookback
      assert (h * w) % lookback == 0, "Number of pixels must divide lookback."
      m = (h * w) // lookback  # Batch size multiplier for the RNN.
      # These are logits that will be used as inputs to the RNN.
      rnn_inputs = tf.layers.dense(x, c * 64, name="rnn_inputs")
      # They are of shape [batch_size, h, w, c, 64], reshaping now.
      rnn_inputs = tf.reshape(rnn_inputs, [batch_size * m, lookback * c, 64])
      # Same for the target frame.
      rnn_target = tf.reshape(target_frame, [batch_size * m, lookback * c])
      # Construct rnn starting state: flatten rnn_inputs, apply a relu layer.
      rnn_start_state = tf.nn.relu(
          tf.layers.dense(tf.nn.relu(tf.layers.flatten(rnn_inputs)),
                          256, name="rnn_start_state"))
      # Our RNN function API is on bits, each subpixel has 8 bits.
      total_num_bits = lookback * c * 8
      # We need to provide RNN targets as bits (due to the API).
      rnn_target_bits = discretization.int_to_bit(rnn_target, 8)
      rnn_target_bits = tf.reshape(rnn_target_bits,
                                   [batch_size * m, total_num_bits])
      if self.is_training:
        # Run the RNN in training mode, add its loss to the losses.
        rnn_predict, rnn_loss = discretization.predict_bits_with_lstm(
            rnn_start_state, 128, total_num_bits,
            target_bits=rnn_target_bits, extra_inputs=rnn_inputs)
        extra_loss += rnn_loss
        # We still use non-RNN predictions too in order to guide the network.
        x = tf.layers.dense(x, c * 256, name="logits")
        x = tf.reshape(x, [batch_size, h, w, c, 256])
        rnn_predict = tf.reshape(rnn_predict, [batch_size, h, w, c, 256])
        # Mix non-RNN and RNN predictions so that after warmup the RNN is 90%.
        x = tf.reshape(tf.nn.log_softmax(x), [batch_size, h, w, c * 256])
        rnn_predict = tf.nn.log_softmax(rnn_predict)
        rnn_predict = tf.reshape(rnn_predict, [batch_size, h, w, c * 256])
        alpha = 0.9 * common_layers.inverse_lin_decay(
            hparams.autoregressive_rnn_warmup_steps)
        x = alpha * rnn_predict + (1.0 - alpha) * x
      else:
        # In prediction mode, run the RNN without any targets.
        bits, _ = discretization.predict_bits_with_lstm(
            rnn_start_state, 128, total_num_bits, extra_inputs=rnn_inputs,
            temperature=0.0)  # No sampling from this RNN, just greedy.
        # The output is in bits, get back the predicted pixels.
        bits = tf.reshape(bits, [batch_size * m, lookback * c, 8])
        ints = discretization.bit_to_int(tf.maximum(bits, 0), 8)
        ints = tf.reshape(ints, [batch_size, h, w, c])
        x = tf.reshape(tf.one_hot(ints, 256), [batch_size, h, w, c * 256])
  elif self.is_per_pixel_softmax:
    x = tf.layers.dense(x, hparams.problem.num_channels * 256, name="logits")
  else:
    x = tf.layers.dense(x, hparams.problem.num_channels, name="logits")

  reward_pred = None
  if self.has_rewards:
    # Reward prediction based on middle and final logits.
    reward_pred = tf.concat([x_mid, x_fin], axis=-1)
    reward_pred = tf.nn.relu(
        tf.layers.dense(reward_pred, 128, name="reward_pred"))
    reward_pred = tf.squeeze(reward_pred, axis=1)  # Remove extra dims
    reward_pred = tf.squeeze(reward_pred, axis=1)  # Remove extra dims

  return x, reward_pred, policy_pred, value_pred, extra_loss, internal_states
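# Worked example for the lookback arithmetic in the autoregressive branch
# above (illustrative numbers, not hparams taken from this codebase): a padded
# Atari frame is 105 x 80 with 3 channels, so there are h * w = 8400 spatial
# positions. Any lookback that divides 8400 is valid; lookback = 100 gives a
# batch multiplier m = 84, and each RNN run covers 100 * 3 = 300 sub-pixels,
# i.e. total_num_bits = 2400 bits.
h, w, c, lookback = 105, 80, 3, 100
assert (h * w) % lookback == 0
m = (h * w) // lookback            # 84
total_num_bits = lookback * c * 8  # 2400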
def testBitToIntOnes(self):
  x_bit = tf.ones(shape=[1, 3], dtype=tf.float32)
  x_int = 7 * tf.ones(shape=[1], dtype=tf.int32)
  diff = discretization.bit_to_int(x_bit, num_bits=3) - x_int
  d = self.evaluate(diff)
  self.assertEqual(d, 0)
def testBitToIntZeros(self):
  x_bit = tf.zeros(shape=[1, 10], dtype=tf.float32)
  x_int = tf.zeros(shape=[1], dtype=tf.int32)
  diff = discretization.bit_to_int(x_bit, num_bits=10) - x_int
  d = self.evaluate(diff)
  self.assertEqual(d, 0)
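# A minimal pure-Python sketch of what bit_to_int computes for one bit vector,
# assuming a lower-endian bit order (bit i contributes 2**i). Note the two
# tests above only pin down the all-ones and all-zeros cases, so the ordering
# here is an assumption, not something the tests verify.
def bit_to_int_reference(bits):
  """Reference for a single bit vector: bit i contributes 2**i."""
  return sum(int(b) * 2**i for i, b in enumerate(bits))

assert bit_to_int_reference([1.0, 1.0, 1.0]) == 7  # matches testBitToIntOnes
assert bit_to_int_reference([0.0] * 10) == 0       # matches testBitToIntZeros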
def _setup(self):
  if self.make_extra_debug_info:
    self.report_reward_statistics_every = 10
    self.dones = 0
    self.real_reward = 0
    self.real_env.reset()
    # Slight weirdness to make sim env and real env aligned.
    for _ in range(self.num_input_frames):
      self.real_ob, _, _, _ = self.real_env.step(0)
    self.total_sim_reward, self.total_real_reward = 0.0, 0.0
    self.sum_of_rewards = 0.0
    self.successful_episode_reward_predictions = 0

  in_graph_wrappers = self.in_graph_wrappers + [(atari.MemoryWrapper, {})]
  env_hparams = tf.contrib.training.HParams(
      in_graph_wrappers=in_graph_wrappers,
      problem=self,
      simulated_environment=self.simulated_environment)

  generator_batch_env = batch_env_factory(
      self.environment_spec, env_hparams, num_agents=1, xvfb=False)

  with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    if FLAGS.agent_policy_path:
      policy_lambda = self.collect_hparams.network
    else:
      # When no agent_policy_path is set, just generate random samples.
      policy_lambda = rl.random_policy_fun
    policy_factory = tf.make_template(
        "network",
        functools.partial(policy_lambda,
                          self.environment_spec().action_space,
                          self.collect_hparams),
        create_scope_now_=True,
        unique_name_="network")

  with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    self.collect_hparams.epoch_length = 10
    _, self.collect_trigger_op = collect.define_collect(
        policy_factory, generator_batch_env, self.collect_hparams,
        eval_phase=False, scope="define_collect")

  if FLAGS.autoencoder_path:
    # TODO(lukaszkaiser): remove hard-coded autoencoder params.
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
      self.setup_autoencoder()
      autoencoder_model = self.autoencoder_model
      # Feeds for autoencoding.
      shape = [self.raw_frame_height, self.raw_frame_width,
               self.num_channels]
      self.autoencoder_feed = tf.placeholder(tf.int32, shape=shape)
      autoencoded = autoencoder_model.encode(
          tf.reshape(self.autoencoder_feed, [1, 1] + shape))
      autoencoded = tf.reshape(
          autoencoded,
          [self.frame_height, self.frame_width,
           self.num_channels, 8])  # 8-bit groups.
      self.autoencoder_result = discretization.bit_to_int(autoencoded, 8)
      # Now for autodecoding.
      shape = [self.frame_height, self.frame_width, self.num_channels]
      self.autodecoder_feed = tf.placeholder(tf.int32, shape=shape)
      bottleneck = tf.reshape(
          discretization.int_to_bit(self.autodecoder_feed, 8),
          [1, 1, self.frame_height, self.frame_width, self.num_channels * 8])
      autoencoder_model.set_mode(tf.estimator.ModeKeys.PREDICT)
      self.autodecoder_result = autoencoder_model.decode(bottleneck)

  self.avilable_data_size_op = atari.MemoryWrapper.singleton.speculum.size()
  self.data_get_op = atari.MemoryWrapper.singleton.speculum.dequeue()
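# Hypothetical usage sketch for the autoencoding feeds built in _setup above
# (the session, the restored checkpoint, the `problem` instance, and the
# numpy frame are placeholders, assuming numpy is imported as np):
raw_frame = np.zeros(
    [problem.raw_frame_height, problem.raw_frame_width, problem.num_channels],
    dtype=np.int32)
# Encode one raw frame into per-position 8-bit codes, then decode them back.
encoded_frame = sess.run(problem.autoencoder_result,
                         feed_dict={problem.autoencoder_feed: raw_frame})
decoded_frame = sess.run(problem.autodecoder_result,
                         feed_dict={problem.autodecoder_feed: encoded_frame})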