def _build_networks(self):
    """Creates the online/target network templates and the ops built on them.

    Both templates wrap `self._network_template`; each template shares its
    own set of weights across every call made to it.

    Attributes set by this method:
      self.online_convnet: Template named 'Online'.
      self.target_convnet: Template named 'Target'.
      self._net_outputs, self.linear_features: The pair returned by the
        online template for `self.state_ph`.
      self._next_target_net_outputs_q, self.target_linear_features: The pair
        returned by the target template for `self.state_ph`.
      self.next_qt_max: `tf.reduce_max` over `_next_target_net_outputs_q`.
      self.ddqn_replay_next_target_net_outputs: First output of the online
        template applied to the replayed next states.
      self._q_argmax: Index of the largest online Q-value for the first
        entry of the `state_ph` batch.
      self._replay_net_outputs: First output of the online template applied
        to the replayed states.
      self._replay_next_target_net_outputs: First output of the target
        template applied to the replayed next states.
    """
    online = tf.make_template('Online', self._network_template)
    self.online_convnet = online
    target = tf.make_template('Target', self._network_template)
    self.target_convnet = target

    # Acting-time outputs, for the state placeholder.
    online_pair = online(self.state_ph)
    self._net_outputs, self.linear_features = online_pair
    target_pair = target(self.state_ph)
    self._next_target_net_outputs_q, self.target_linear_features = target_pair
    self.next_qt_max = tf.reduce_max(self._next_target_net_outputs_q)

    # Online network evaluated on the replayed next states.
    self.ddqn_replay_next_target_net_outputs = online(
        self._replay.next_states)[0]

    greedy_actions = tf.argmax(self._net_outputs.q_values, axis=1)
    self._q_argmax = greedy_actions[0]

    # Training-time outputs, for the replayed transitions.
    self._replay_net_outputs = online(self._replay.states)[0]
    self._replay_next_target_net_outputs = target(
        self._replay.next_states)[0]
def _build_networks(self):
    """Creates the Q-value networks used for acting and for training.

    Attributes set by this method:
      self.online_convnet: Template named 'Online' over
        `self._network_template`.
      self.target_convnet: Template named 'Target' over
        `self._network_template`.
      self._net_outputs: Online outputs for `self.state_ph`.
      self._replay_net_outputs: Online outputs for the replayed states.
      self._replay_next_target_net_outputs: Target outputs for the replayed
        next states.
      self._q_argmax: Greedy action for the first entry of the `state_ph`
        batch, chosen according to `self.acting_policy`.

    Raises:
      NotImplementedError: If `self.acting_policy` is neither 'hyperbolic'
        nor 'largest_gamma'. The networks above are already built when this
        is raised.
    """
    # Every call to a template builds new ops on the given input while
    # reusing that template's weights.
    online = tf.make_template('Online', self._network_template)
    self.online_convnet = online
    target = tf.make_template('Target', self._network_template)
    self.target_convnet = target

    self._net_outputs = online(self.state_ph)
    self._replay_net_outputs = online(self._replay.states)
    self._replay_next_target_net_outputs = target(self._replay.next_states)

    policy = self.acting_policy
    if policy == 'hyperbolic':
        acting_q_values = self._net_outputs.hyp_q_value
    elif policy == 'largest_gamma':
        acting_q_values = self._net_outputs.q_values[-1]
    else:
        raise NotImplementedError
    self._q_argmax = tf.argmax(acting_q_values, axis=1)[0]
def _build_all_bisimulation_parts(self):
    """Builds the bisimulation networks, the evaluation ops and the train ops.

    The evaluation part (distance between `source_state_ph` and `state_ph`,
    plus the value of the current state) is always built. When
    `self.evaluate_metric_only` is set the method returns right after that,
    so neither `_train_bisim_op` nor `_bisim_sync_op` is created.
    """
    self.batch_size = tf.shape(self._replay.rewards)[0]
    self._replay_target_outputs = self.target_convnet(self._replay.states)
    self.bisim_horizon_ph = tf.placeholder(tf.float32, ())
    self.online_bisimulation = tf.make_template(
        'OnlineBisim', bisimulation_network)
    self.target_bisimulation = tf.make_template(
        'TargetBisim', bisimulation_network, trainable=False)

    # Placeholder holding an episode's first state, used to evaluate the
    # metric between that state and the one fed through `state_ph`.
    self.source_state_ph = tf.placeholder(
        self.observation_dtype, self.state_ph.shape, name='source_state_ph')
    self._initial_state_net = self.online_convnet(self.source_state_ph)
    source_and_current = tf.concat(
        [
            self._initial_state_net.representation,
            self._net_outputs.representation,
        ],
        1)
    eval_distance = self.online_bisimulation(source_and_current)
    self.state_distances = tf.squeeze(eval_distance)
    self.state_value = tf.reduce_max(self._net_outputs.q_values, axis=1)[0]
    if self.summary_writer is not None:
        tf.summary.scalar('Eval/StateDistances', self.state_distances)
    if self.evaluate_metric_only:
        return

    online_pairs = self._concat_states(
        self._replay_net_outputs.representation)
    self.s1_online_distances = self.online_bisimulation(online_pairs)
    target_pairs = self._concat_states(
        self._replay_next_target_net_outputs.representation)
    self.s2_target_distances = self.target_bisimulation(target_pairs)

    # bisimulation_target = rew_diff + gamma * next_distance.
    raw_target = self._build_bisimulation_target()
    bisimulation_target = tf.stop_gradient(raw_target)
    # Diagonal entries estimate the distance between a state and itself,
    # which is known to be 0, so they are zeroed out of the target.
    identity = tf.diag(tf.ones(self.batch_size, dtype=tf.float32))
    diagonal_mask = tf.reshape(1.0 - identity, (self.batch_size**2, 1))
    bisimulation_target = bisimulation_target * diagonal_mask
    bisimulation_estimate = self.s1_online_distances
    bisimulation_loss = tf.losses.mean_squared_error(
        bisimulation_target, bisimulation_estimate)

    if self.summary_writer is not None:
        average_distance = tf.reduce_mean(bisimulation_estimate)
        average_target = tf.reduce_mean(bisimulation_target)
        average_next_state_dists = tf.reduce_mean(self.next_state_distances)
        tf.summary.scalar('Training/loss', bisimulation_loss)
        tf.summary.scalar('Training/AverageDistance', average_distance)
        tf.summary.scalar('Training/AverageTargetDistance', average_target)
        tf.summary.scalar('Training/AverageNextStateDistance',
                          average_next_state_dists)
        tf.summary.scalar('Training/BisimHorizon', self.bisim_horizon_ph)
        tf.summary.histogram('Training/OnlineDistance',
                             bisimulation_estimate)
        tf.summary.histogram('Training/TargetDistance', bisimulation_target)

    self._train_bisim_op = self.bisim_optimizer.minimize(bisimulation_loss)
    self._bisim_sync_op = self._build_sync_op(
        online_scope='OnlineBisim', target_scope='TargetBisim')
def forward_pass(self, state1, state2):
    """Builds the two encoders, the prior and the discriminator.

    Each state is encoded into a latent distribution of size
    `self.hid_dim // 2`. One sample is drawn from each encoder, the two
    samples are concatenated along axis 1, and the result is fed to the
    discriminator.

    Args:
      state1: tf variable holding the first state.
      state2: tf variable holding the second state.

    Returns:
      A tuple `(encoder1, encoder2, prior, discriminator)`. Per the original
      assignment notes the first three are
      `tfp.distributions.MultivariateNormalDiag` distributions and the last
      is a `tfp.distributions.Bernoulli` distribution; the actual types are
      whatever `self.make_encoder`, `self.make_prior` and
      `self.make_discriminator` return.
    """
    # Templates make repeated calls to forward_pass-built functions reuse
    # the same variables. NOTE(review): the discriminator's template is
    # named 'decoder'; left untouched because renaming it would change
    # variable names.
    make_encoder1 = tf.make_template('encoder1', self.make_encoder)
    make_encoder2 = tf.make_template('encoder2', self.make_encoder)
    make_discriminator = tf.make_template('decoder', self.make_discriminator)

    # Floor division keeps the latent size an integer; `/` is true division
    # on Python 3 and would pass a float as a layer/shape size.
    latent_size = self.hid_dim // 2

    # Encoders
    encoder1 = make_encoder1(
        state1, latent_size, 'z1', n_layers=2, hid_size=self.hid_dim)
    encoder2 = make_encoder2(
        state2, latent_size, 'z2', n_layers=2, hid_size=self.hid_dim)

    # Prior
    prior = self.make_prior(latent_size)

    # Sampled latents, joined along the feature axis.
    z1 = encoder1.sample()
    z2 = encoder2.sample()
    z = tf.concat([z1, z2], axis=1)

    # Discriminator
    discriminator = make_discriminator(
        z, 1, 'discriminator', n_layers=2, hid_size=self.hid_dim)
    return encoder1, encoder2, prior, discriminator
def __init__(self,
             sess,
             reward_scale,
             ipd_scale,
             observation_shape=NATURE_DQN_OBSERVATION_SHAPE,
             resize_shape=PSEUDO_COUNT_OBSERVATION_SHAPE,
             quantization_factor=PSEUDO_COUNT_QUANTIZATION_FACTOR,
             tf_device='/cpu:*',
             optimizer=tf.train.RMSPropOptimizer(
                 learning_rate=0.0001, momentum=0.9, epsilon=0.0001)):
    """Stores the settings and builds the PixelCNN intrinsic-reward graph.

    Args:
      sess: Session object, stored as `self._sess`.
      reward_scale: Stored as `self.reward_scale`.
      ipd_scale: Stored as `self.ipd_scale`.
      observation_shape: Tuple describing one observation; the observation
        placeholder is given shape `(1,) + observation_shape + (1,)`.
      resize_shape: Passed to `self._preprocess` together with the
        observation placeholder.
      quantization_factor: Stored as `self.quantization_factor`.
      tf_device: Device string under which all ops here are created.
      optimizer: Optimizer instance, stored as `self.optimizer`.
        NOTE(review): the default instance is created once, when the
        function is defined, and is therefore shared by every object that
        relies on the default.
    """
    self._sess = sess
    self.reward_scale = reward_scale
    self.ipd_scale = ipd_scale
    self.observation_shape = observation_shape
    self.resize_shape = resize_shape
    self.quantization_factor = quantization_factor
    self.optimizer = optimizer

    with tf.device(tf_device):
        with tf.name_scope('intrinsic_pixelcnn'):
            placeholder_shape = (1,) + observation_shape + (1,)
            self.obs_ph = tf.placeholder(
                tf.uint8, shape=placeholder_shape, name='obs_ph')
            self.preproccessed_obs = self._preprocess(
                self.obs_ph, resize_shape)
            self.iter_ph = tf.placeholder(
                tf.uint32, shape=[], name='iter_num')
            self.eval_ph = tf.placeholder(
                tf.bool, shape=[], name='eval_mode')
            self.network = tf.make_template(
                'PixelCNN', self._network_template)
            # `self.update` is used outside eval mode, `self.virtual_update`
            # in eval mode.
            not_evaluating = tf.logical_not(self.eval_ph)
            self.ipd = tf.cond(
                not_evaluating, self.update, self.virtual_update)
            self.reward = self.ipd_to_reward(self.ipd, self.iter_ph)
def testMakeLogJointFnTemplate(self):
    """Test `make_log_joint_fn` on program returned by tf1.make_template."""

    def variational():
        loc = tf1.get_variable("loc", [])
        qz = ed.Normal(loc=loc, scale=0.5, name="qz")
        return qz

    def true_log_joint(loc, qz):
        log_prob = tf.reduce_sum(
            tfd.Normal(loc=loc, scale=0.5).log_prob(qz))
        return log_prob

    qz_value = 1.23
    variational_template = tf1.make_template("variational", variational)

    # Value under test. Calling the log-joint runs the template, which is
    # what creates the "loc" variable looked up below, so this call must
    # come first.
    log_joint = ed.make_log_joint_fn(variational_template)
    actual_log_prob = log_joint(qz=qz_value)

    # Reference value, computed directly from the template's variable.
    loc = tf1.trainable_variables("variational")[0]
    expected_log_prob = true_log_joint(loc, qz_value)

    with self.cached_session() as sess:
        # `initialize_all_variables` is deprecated in favour of
        # `global_variables_initializer`.
        sess.run(tf1.global_variables_initializer())
        actual_log_prob_, expected_log_prob_ = sess.run(
            [actual_log_prob, expected_log_prob])
        self.assertEqual(actual_log_prob_, expected_log_prob_)
def __init__(self, dummy_inputs, logit_axis, config):
    """Validates the layer hyperparameters and creates the network variables.

    Args:
      dummy_inputs: Inputs used for one forward pass that creates the
        variables; must be truthy.
      logit_axis: Output axis. Its `len` is the size of the last
        fully-connected layer and its `.size` must be non-zero.
      config: Hyperparameter object. Reads `output_init_factor` and
        `dropouts`, optionally `fc_hid_sizes` and `fc_init_factors`, and an
        optional `conv_<field>` attribute per field of `_ConvConfig`.

    Raises:
      ValueError: If the inputs or outputs are empty, or if the per-layer
        hyperparameter lists do not all have the same length.
    """
    self.logit_axis = logit_axis
    self.config = config

    hidden_sizes = getattr(config, 'fc_hid_sizes', [])
    self.fc_sizes = hidden_sizes + [len(logit_axis)]
    hidden_init_factors = getattr(config, 'fc_init_factors', [])
    self.fc_init_factors = hidden_init_factors + [config.output_init_factor]

    if not dummy_inputs:
        raise ValueError('network has size 0 input')
    if logit_axis.size == 0:
        raise ValueError('network has size 0 output')

    # All per-layer fully-connected settings must describe the same number
    # of layers.
    fc_lengths = {
        len(self.fc_sizes),
        len(self.fc_init_factors),
        len(config.dropouts),
    }
    if len(fc_lengths) != 1:
        raise ValueError('invalid hyperparameter config for fc layers')
    self.num_fc_layers = len(self.fc_sizes)

    conv_settings = [
        getattr(config, 'conv_' + field, []) for field in _ConvConfig._fields
    ]
    self._conv_config = _ConvConfig(*conv_settings)
    conv_lengths = {len(values) for values in self._conv_config}
    if len(conv_lengths) != 1:
        raise ValueError('invalid hyperparameter config for conv layers')
    self.num_conv_layers = len(self._conv_config.depths)

    self.fprop = tf.make_template('feedforward', self._fprop)
    # One forward pass in test mode creates the variables.
    self.fprop(dummy_inputs, mode='test')
    template_scope = self.fprop.variable_scope.name
    self.params = contrib_framework.get_variables(scope=template_scope)
def testBijectorConditionKwargs(self):
    """Checks RealNVP with a shift/log-scale fn that takes conditioning kwargs.

    The conditioning tensors `a` and `b` are passed as keyword arguments to
    every bijector method; the test verifies that forward/inverse round-trip
    and that the forward and inverse log-det-Jacobians are negatives of each
    other.
    """
    batch_size = 3
    event_size = 4 * 2
    flat_values = np.linspace(-1.0, 1.0, batch_size * event_size)
    x_ = flat_values.astype(np.float32).reshape((batch_size, event_size))

    conditions = {
        'a': tf.random.normal((batch_size, 4), dtype=tf.float32, seed=584),
        'b': tf.random.normal((batch_size, 2), dtype=tf.float32, seed=9817),
    }

    def _condition_shift_and_log_scale_fn(x0, output_units, a, b):
        conditioned_input = tf.concat((x0, a, b), axis=-1)
        dense_out = tf1.layers.dense(
            inputs=conditioned_input, units=2 * output_units)
        shift, log_scale = tf.split(dense_out, 2, axis=-1)
        return shift, log_scale

    condition_shift_and_log_scale_fn = tf1.make_template(
        'real_nvp_condition_template', _condition_shift_and_log_scale_fn)

    nvp = tfb.RealNVP(
        num_masked=4,
        validate_args=True,
        is_constant_jacobian=False,
        shift_and_log_scale_fn=condition_shift_and_log_scale_fn)

    x = tf.constant(x_)
    forward_x = nvp.forward(x, **conditions)
    # Use identity to invalidate cache.
    inverse_y = nvp.inverse(tf.identity(forward_x), **conditions)
    forward_inverse_y = nvp.forward(inverse_y, **conditions)
    fldj = nvp.forward_log_det_jacobian(x, event_ndims=1, **conditions)
    # Use identity to invalidate cache.
    ildj = nvp.inverse_log_det_jacobian(
        tf.identity(forward_x), event_ndims=1, **conditions)

    self.evaluate(tf1.global_variables_initializer())
    evaluated = self.evaluate(
        [forward_x, inverse_y, forward_inverse_y, ildj, fldj])
    forward_x_, inverse_y_, forward_inverse_y_, ildj_, fldj_ = evaluated

    self.assertStartsWith(nvp.name, 'real_nvp')
    self.assertAllClose(forward_x_, forward_inverse_y_, rtol=1e-5, atol=1e-5)
    self.assertAllClose(x_, inverse_y_, rtol=1e-5, atol=1e-5)
    self.assertAllClose(ildj_, -fldj_, rtol=1e-5, atol=1e-5)
def make_encoder(encoder_cfg, name):
    """Returns a template named `name` that runs `sequence_encoder`.

    The template takes `(inputs, length)` and forwards them, together with
    `encoder_cfg` and the `is_training` value from the enclosing scope, to
    `sequence_encoder`.
    """

    def call_encoder(inputs, length):
        encoder_kwargs = dict(
            inputs=inputs,
            length=length,
            is_training=is_training,
            cfg=encoder_cfg)
        return sequence_encoder(**encoder_kwargs)

    encoder_template = tf.make_template(name, call_encoder)
    return encoder_template
def real_nvp_template(neuron_list, name=None):
    """Builds a shift/log-scale MLP template for a RealNVP bijector.

    Args:
      neuron_list: Iterable of hidden-layer widths; each hidden layer is a
        dense layer followed by a ReLU.
      name: Optional name for the surrounding name scope; defaults to
        'real_nvp_template'. The template itself is always named
        'real_nvp_template'.

    Returns:
      A template `fn(x, output_units, **condition_kwargs)` returning
      `(shift, logscale)`, the two halves of a final dense layer of width
      `2 * output_units`. `condition_kwargs` are accepted but not used.
    """
    scope_name = name or 'real_nvp_template'
    with tf.name_scope(scope_name):

        def _fn(x, output_units, **condition_kwargs):
            hidden = x
            for width in neuron_list:
                hidden = tf.nn.relu(tf1.layers.dense(hidden, width))
            params = tf1.layers.dense(hidden, 2 * output_units)
            shift, logscale = tf.split(params, 2, axis=-1)
            return shift, logscale

        return tf1.make_template('real_nvp_template', _fn)
def __init__(self,
             stoch_size=30,
             deter_size=200,
             min_stddev=0.1,
             layers=1,
             reward_layers=3,
             units=300,
             free_nats=3.0,
             reward_loss_multiplier=10,
             frame_size=(64, 64, 3),
             task=gin.REQUIRED,
             reward_from_frames=False,
             reward_stop_gradient=False,
             include_frames_in_prediction=False,
             activation=tf.nn.relu):
    """Stores the hyperparameters and creates the GRU cell and templates.

    Every argument is stored on the instance under a private attribute.
    `task` is only used to create an environment, from which the action
    space is read. `frame_size` is stored as a list and is also the
    `out_shape` bound to the frame-prediction template; the
    reward-prediction template is bound to `out_shape=[1]`.
    """
    # The action space comes from an environment created by the task.
    self._action_space = task.create_env().action_space

    # Plain hyperparameters.
    self._stoch_size = stoch_size
    self._deter_size = deter_size
    self._min_stddev = min_stddev
    self._num_layers = layers
    self._num_reward_layers = reward_layers
    self._num_units = units
    self._free_nats = free_nats
    self._include_frames_in_prediction = include_frames_in_prediction
    self._activation = activation

    # Recurrent cell plus variable-sharing templates for the sub-networks.
    self._cell = tf.keras.layers.GRUCell(deter_size)
    self._prior_tpl = tf.make_template('prior', self._prior)
    self._posterior_tpl = tf.make_template('posterior', self._posterior)
    self._encoder_tpl = tf.make_template('encoder', self._encoder)

    self._reward_loss_mul = reward_loss_multiplier
    self._frame_size = list(frame_size)
    self._reward_from_frames = reward_from_frames
    self._reward_stop_gradient = reward_stop_gradient

    self._predict_frames_tpl = tf.make_template(
        'predict_frames', self._predict_frames, out_shape=self._frame_size)
    self._predict_reward_tpl = tf.make_template(
        'predict_reward', self._predict_reward, out_shape=[1])
def Wrapper(template_name, **kwargs):
    """Wraps `function` in a template that restores the current gin scope.

    The gin config scope active when the template is constructed is captured
    and re-entered on every call of the template, so the wrapped function
    sees the same configuration at instantiation time.

    Args:
      template_name: Name of the template; also used as its unique name.
      **kwargs: Extra keyword arguments forwarded to `tf.make_template`.

    Returns:
      The template created by `tf.make_template`, with its scope created
      immediately.
    """
    captured_scopes = gin.current_scope()

    def GinWrapper(*call_args, **call_kwargs):
        with gin.config_scope(captured_scopes):
            result = function(*call_args, **call_kwargs)
        return result

    return tf.make_template(
        template_name,
        GinWrapper,
        create_scope_now_=True,
        unique_name_=template_name,
        **kwargs)
def _make_gated_bijector_fn():
    """Returns a template producing a gated `AffineScalar` bijector.

    The template maps `(x, output_units)` to
    `tfb.AffineScalar(shift=(1 - gate) * shift, scale=gate)`, where `shift`
    and the gate logits are the two halves of a dense layer of width
    `2 * output_units` and `gate` is the sigmoid of the logits. A rank-1
    input is given a leading axis for the dense layer, and that axis is
    stripped again from the outputs.
    """

    def _bijector_fn(x, output_units):
        was_rank_one = tensorshape_util.rank(x.shape) == 1
        if was_rank_one:
            x = x[tf.newaxis, ...]

        dense_out = tf1.layers.dense(inputs=x, units=2 * output_units)
        shift, logit_gate = tf.split(dense_out, 2, axis=-1)
        if was_rank_one:
            # Drop the leading axis that was added for the dense layer.
            shift = shift[0]
            logit_gate = logit_gate[0]
        gate = tf.nn.sigmoid(logit_gate)
        return tfb.AffineScalar(shift=(1. - gate) * shift, scale=gate)

    return tf1.make_template('gated_bijector', _bijector_fn)
def init_networks(self):
    """Builds the validation graph on top of the shared 'net' template.

    Creates a placeholder for one single-channel image (the channel axis
    position follows `self.data_format`), runs the network on it with
    `is_training=False`, and adds a softmax over the channel axis of the
    first network output.
    """
    channels_first = self.data_format == 'channels_first'
    if channels_first:
        image_shape = [1] + self.image_size
    else:
        image_shape = self.image_size + [1]
    data_generator_entries = OrderedDict([('image', image_shape)])

    # Template so that train and val graphs share the same weights.
    training_net = tf.make_template('net', self.network)

    # Validation graph.
    val_placeholders = create_placeholders(
        data_generator_entries, shape_prefix=[1])
    self.data_val = val_placeholders['image']
    val_outputs = training_net(
        self.data_val,
        num_labels=self.num_labels,
        is_training=False,
        data_format=self.data_format)
    (self.prediction_val,
     self.local_prediction_val,
     self.spatial_prediction_val) = val_outputs

    channel_axis = 1 if channels_first else 4
    self.prediction_softmax_val = tf.nn.softmax(
        self.prediction_val, axis=channel_axis)
def real_nvp_default_template(
        hidden_layers,
        shift_only=False,
        activation=tf.nn.relu,
        name=None,
        *args,  # pylint: disable=keyword-arg-before-vararg
        **kwargs):
    """Builds a dense-MLP shift/scale template for a RealNVP bijector.

    Args:
      hidden_layers: Iterable of hidden-layer widths.
      shift_only: If true, the template returns `(shift, None)`.
      activation: Activation used by the hidden layers.
      name: Optional name for the surrounding name scope; defaults to
        'real_nvp_default_template'.
      *args: Extra positional arguments forwarded to every
        `tf1.layers.dense` call.
      **kwargs: Extra keyword arguments forwarded to every
        `tf1.layers.dense` call.

    Returns:
      A template `fn(x, output_units)`. Unless `shift_only` is set, its
      second output is `1e-7 + softplus(.)` of the second half of the final
      layer rather than that half itself. The template raises
      `NotImplementedError` if it is called with conditioning kwargs.
    """
    with tf.compat.v1.name_scope(name or 'real_nvp_default_template'):

        def _fn(x, output_units, **condition_kwargs):
            """Fully connected MLP parameterized via `real_nvp_template`."""
            if condition_kwargs:
                raise NotImplementedError(
                    'Conditioning not implemented in the default template.')

            # A rank-1 input gets a leading axis for the dense layers; the
            # axis is removed again from whatever is returned.
            added_leading_axis = tensorshape_util.rank(x.shape) == 1
            if added_leading_axis:
                x = x[tf.newaxis, ...]

            def _restore_rank(tensor):
                return tensor[0] if added_leading_axis else tensor

            for units in hidden_layers:
                x = tf1.layers.dense(
                    inputs=x,
                    units=units,
                    activation=activation,
                    *args,  # pylint: disable=keyword-arg-before-vararg
                    **kwargs)
            width_multiplier = 1 if shift_only else 2
            x = tf1.layers.dense(
                inputs=x,
                units=width_multiplier * output_units,
                activation=None,
                *args,  # pylint: disable=keyword-arg-before-vararg
                **kwargs)

            if shift_only:
                return _restore_rank(x), None
            shift, log_scale = tf.split(x, 2, axis=-1)
            # ** Here is the second modification.
            shift_out = _restore_rank(shift)
            scale_out = 1e-7 + tf.nn.softplus(_restore_rank(log_scale))
            return shift_out, scale_out

        return tf1.make_template('real_nvp_default_template', _fn)
def make_conditioning():
    """Encodes the conditioning sequence and returns the final LM state.

    The conditioning tokens are embedded with a dedicated table in the
    'cond_im' variable scope, optionally regularised with embedding and
    token dropout, optionally scaled by `sqrt(input_embedding_size)`, and
    then run through a language model built from a fresh cell.

    Returns:
      The last state returned by the conditioning language model.

    Raises:
      AssertionError: If `config.embed_once` is false (not implemented).
    """
    if config.embed_once:
        with tf.variable_scope('cond_im', initializer=embedding_initializer):
            embedding = tf.get_variable(
                'embedding',
                [config.conditioning_vocab_size, config.input_embedding_size],
                initializer=embedding_initializer,
                dtype=tf.float32)
            if self.embedding_dropout is not None:
                # Drops whole rows of the table, i.e. entire vocabulary
                # entries.
                embedding = tf.nn.dropout(
                    embedding,
                    1 - self.embedding_dropout,
                    noise_shape=tf.stack(
                        [config.conditioning_vocab_size, 1]))
            embedded_source = tf.nn.embedding_lookup(
                embedding, self.conditioning)
            if self.token_dropout is not None:
                # Token dropout must act on the looked-up embeddings. The
                # previous code applied it to the embedding table after the
                # lookup, with a rank-3 noise shape, and never used the
                # result, so it had no effect on the conditioning input.
                # NOTE(review): the noise shape assumes the conditioning
                # input is time-major with `config.max_time_steps` steps --
                # confirm against the caller.
                embedded_source = tf.nn.dropout(
                    embedded_source,
                    1 - self.token_dropout,
                    noise_shape=tf.stack(
                        [config.max_time_steps, batch_size, 1]))
            if config.scale_input_embeddings:
                embedded_source *= tf.sqrt(
                    tf.cast(config.input_embedding_size, tf.float32))
            conditioning_sources = embedded_source
    else:
        assert False, 'Not implemented.'

    conditioning_cell = make_cell()
    conditioning_lm = tf.make_template('cond_lm', lm_1)
    initial_state = conditioning_cell.zero_state(batch_size, dtype=tf.float32)
    _, conditioning_last_state = conditioning_lm(
        conditioning_cell, initial_state, conditioning_sources,
        self.conditioning_len)
    return conditioning_last_state
def __init__(self, batch_env, step, is_training, should_log, config):
    """Create an instance of the PPO algorithm.

    Besides storing its arguments, the constructor builds the observation
    and reward normalizers, the episode memories, the shared network
    template (called once on dummy inputs so its variables exist), the
    non-trainable variables holding the last state/action/mean/logstd, the
    KL penalty variable and the optimizer.

    Args:
      batch_env: In-graph batch environment.
      step: Integer tensor holding the current training step.
      is_training: Boolean tensor for whether the algorithm should train.
      should_log: Boolean tensor for whether summaries should be returned.
      config: Object containing the agents configuration as attributes.
    """
    self._batch_env = batch_env
    self._step = step
    self._is_training = is_training
    self._should_log = should_log
    self._config = config
    # Streaming normalizers, shaped after a single environment's
    # observation and reward.
    self._observ_filter = normalize.StreamingNormalize(
        self._batch_env.observ[0], center=True, scale=True, clip=5,
        name='normalize_observ')
    self._reward_filter = normalize.StreamingNormalize(
        self._batch_env.reward[0], center=False, scale=True, clip=10,
        name='normalize_reward')
    # Memory stores tuple of observ, action, mean, logstd, reward.
    template = (self._batch_env.observ[0], self._batch_env.action[0],
                self._batch_env.action[0], self._batch_env.action[0],
                self._batch_env.reward[0])
    self._memory = memory.EpisodeMemory(template, config.update_every,
                                        config.max_length, 'memory')
    # Non-trainable counter (second positional argument is `trainable`).
    self._memory_index = tf.Variable(0, False)
    # GPU is used only if requested by the config and actually available.
    use_gpu = self._config.use_gpu and utility.available_gpus()
    with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
        # Create network variables for later calls to reuse.
        action_size = self._batch_env.action.shape[1].value
        self._network = tf.make_template(
            'network', functools.partial(config.network, config, action_size))
        # Dummy call: zero observations with an added axis at position 1
        # and a vector of ones as second argument.
        # NOTE(review): presumably these are a time axis and sequence
        # lengths -- confirm against `config.network`.
        output = self._network(
            tf.zeros_like(self._batch_env.observ)[:, None],
            tf.ones(len(self._batch_env)))
        with tf.variable_scope('ppo_temporary'):
            self._episodes = memory.EpisodeMemory(template, len(batch_env),
                                                  config.max_length,
                                                  'episodes')
            if output.state is None:
                self._last_state = None
            else:
                # Ensure the batch dimension is set.
                tf.contrib.framework.nest.map_structure(
                    lambda x: x.set_shape([len(batch_env)] +
                                          x.shape.as_list()[1:]),
                    output.state)
                # pylint: disable=undefined-variable
                # One non-trainable, zero-initialized variable per entry of
                # the network state structure.
                self._last_state = tf.contrib.framework.nest.map_structure(
                    lambda x: tf.Variable(lambda: tf.zeros_like(x), False),
                    output.state)
            # Non-trainable, zero-initialized variables shaped like the
            # batched action tensor.
            self._last_action = tf.Variable(tf.zeros_like(
                self._batch_env.action), False, name='last_action')
            self._last_mean = tf.Variable(tf.zeros_like(
                self._batch_env.action), False, name='last_mean')
            self._last_logstd = tf.Variable(tf.zeros_like(
                self._batch_env.action), False, name='last_logstd')
    # Non-trainable KL penalty coefficient, initialized from the config.
    self._penalty = tf.Variable(self._config.kl_init_penalty, False,
                                dtype=tf.float32)
    self._optimizer = self._config.optimizer(self._config.learning_rate)
# ys = [tf.placeholder(tf.int32, shape=(args.batch_size,)) for i in range(args.nr_gpu)] # hs = [tf.one_hot(ys[i], num_labels) for i in range(args.nr_gpu)] # else: h_init = None h_sample = [None] * args.nr_gpu hs = h_sample # create the model model_opt = { 'nr_resnet': args.nr_resnet, 'nr_filters': args.nr_filters, 'nr_logistic_mix': args.nr_logistic_mix, 'resnet_nonlinearity': args.resnet_nonlinearity, 'energy_distance': args.energy_distance } model = tf.make_template('model', model_spec) # run once for data dependent initialization of parameters init_pass = model(x_init, h_init, init=True, dropout_p=args.dropout_p, **model_opt) # keep track of moving average # all_params = tf.trainable_variables() # ema = tf.train.ExponentialMovingAverage(decay=args.polyak_decay) # maintain_averages_op = tf.group(ema.apply(all_params)) # ema_params = [ema.average(p) for p in all_params] # get loss gradients over multiple GPUs + sampling
def fit_with_hmc(model,
                 observed_time_series,
                 num_results=100,
                 num_warmup_steps=50,
                 num_leapfrog_steps=15,
                 initial_state=None,
                 initial_step_size=None,
                 chain_batch_shape=(),
                 num_variational_steps=150,
                 variational_optimizer=None,
                 seed=None,
                 name=None):
    """Draw posterior samples using Hamiltonian Monte Carlo (HMC).

    Markov chain Monte Carlo (MCMC) methods are considered the gold standard
    of Bayesian inference; under suitable conditions and in the limit of
    infinitely many draws they generate samples from the true posterior
    distribution. HMC [1] uses gradients of the model's log-density function
    to propose samples, allowing it to exploit posterior geometry. However,
    it is computationally more expensive than variational inference and
    relatively sensitive to tuning.

    This method attempts to provide a sensible default approach for fitting
    StructuralTimeSeries models using HMC. It first runs variational
    inference as a fast posterior approximation, and initializes the HMC
    sampler from the variational posterior, using the posterior standard
    deviations to set per-variable step sizes (equivalently, a diagonal mass
    matrix). During the warmup phase, it adapts the step size to target an
    acceptance rate of 0.75, which is thought to be in the desirable range
    for optimal mixing [2].

    Args:
      model: An instance of `StructuralTimeSeries` representing a
        time-series model. This represents a joint distribution over
        time-series and their parameters with batch shape `[b1, ..., bN]`.
      observed_time_series: `float` `Tensor` of shape
        `concat([sample_shape, model.batch_shape, [num_timesteps, 1]])`
        where `sample_shape` corresponds to i.i.d. observations, and the
        trailing `[1]` dimension may (optionally) be omitted if
        `num_timesteps > 1`. May optionally be an instance of
        `tfp.sts.MaskedTimeSeries`, which includes a mask `Tensor` to
        specify timesteps with missing observations.
      num_results: Integer number of Markov chain draws.
        Default value: `100`.
      num_warmup_steps: Integer number of steps to take before starting to
        collect results. The warmup steps are also used to adapt the step
        size towards a target acceptance rate of 0.75.
        Default value: `50`.
      num_leapfrog_steps: Integer number of steps to run the leapfrog
        integrator for. Total progress per HMC step is roughly proportional
        to `step_size * num_leapfrog_steps`.
        Default value: `15`.
      initial_state: Optional Python `list` of `Tensor`s, one for each model
        parameter, representing the initial state(s) of the Markov
        chain(s). These should have shape `concat([chain_batch_shape,
        param.prior.batch_shape, param.prior.event_shape])`. If `None`, the
        initial state is set automatically using a sample from a
        variational posterior.
        Default value: `None`.
      initial_step_size: Python `list` of `Tensor`s, one for each model
        parameter, representing the step size for the leapfrog integrator.
        Must broadcast with the shape of `initial_state`. Larger step sizes
        lead to faster progress, but too-large step sizes make rejection
        exponentially more likely. If `None`, the step size is set
        automatically using the standard deviation of a variational
        posterior.
        Default value: `None`.
      chain_batch_shape: Batch shape (Python `tuple`, `list`, or `int`) of
        chains to run in parallel.
        Default value: `()` (i.e., a single chain).
      num_variational_steps: Python `int` number of steps to run the
        variational optimization to determine the initial state and step
        sizes.
        Default value: `150`.
      variational_optimizer: Optional `tf.train.Optimizer` instance to use
        in the variational optimization. If `None`, defaults to
        `tf.train.AdamOptimizer(0.1)`.
        Default value: `None`.
      seed: Python integer to seed the random number generator. When a seed
        is given the chain is sampled with `parallel_iterations=1`;
        otherwise `parallel_iterations=10` is used.
      name: Python `str` name prefixed to ops created by this function.
        Default value: `None` (i.e., 'fit_with_hmc').

    Returns:
      samples: Python `list` of `Tensors` representing posterior samples of
        model parameters, with shapes `[concat([[num_results],
        chain_batch_shape, param.prior.batch_shape,
        param.prior.event_shape]) for param in model.parameters]`.
      kernel_results: A (possibly nested) `tuple`, `namedtuple` or `list` of
        `Tensor`s representing internal calculations made within the HMC
        sampler.

    #### Examples

    Assume we've built a structural time-series model:

    ```python
      day_of_week = tfp.sts.Seasonal(
          num_seasons=7,
          observed_time_series=observed_time_series,
          name='day_of_week')
      local_linear_trend = tfp.sts.LocalLinearTrend(
          observed_time_series=observed_time_series,
          name='local_linear_trend')
      model = tfp.sts.Sum(components=[day_of_week, local_linear_trend],
                          observed_time_series=observed_time_series)
    ```

    To draw posterior samples using HMC under default settings:

    ```python
    samples, kernel_results = tfp.sts.fit_with_hmc(
        model, observed_time_series)

    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      samples_, kernel_results_ = sess.run((samples, kernel_results))

    print("acceptance rate: {}".format(
      np.mean(kernel_results_.inner_results.inner_results.is_accepted,
              axis=0)))
    print("posterior means: {}".format(
      {param.name: np.mean(param_draws, axis=0)
       for (param, param_draws) in zip(model.parameters, samples_)}))
    ```

    We can also run multiple chains. This may help diagnose convergence
    issues and allows us to exploit vectorization to draw samples more
    quickly, although warmup still requires the same number of sequential
    steps.

    ```python
    from matplotlib import pylab as plt

    samples, kernel_results = tfp.sts.fit_with_hmc(
      model, observed_time_series, chain_batch_shape=[10])

    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      samples_, kernel_results_ = sess.run((samples, kernel_results))

    print("acceptance rate: {}".format(
      np.mean(kernel_results_.inner_results.inner_results.is_accepted,
              axis=0)))

    # Plot the sampled traces for each parameter. If the chains have mixed,
    # their traces should all cover the same region of state space,
    # frequently crossing over each other.
    for (param, param_draws) in zip(model.parameters, samples_):
      if param.prior.event_shape.ndims > 0:
        print("Only plotting traces for scalar parameters, skipping {}".format(
          param.name))
        continue
      plt.figure(figsize=[10, 4])
      plt.title(param.name)
      plt.plot(param_draws)
      plt.ylabel(param.name)
      plt.xlabel("HMC step")

    # Combining the samples from multiple chains into a single dimension
    # allows us to easily pass sampled parameters to downstream forecasting
    # methods.
    combined_samples_ = [np.reshape(param_draws,
                                    [-1] + list(param_draws.shape[2:]))
                         for param_draws in samples_]
    ```

    For greater flexibility, you may prefer to implement your own sampler
    using the TensorFlow Probability primitives in `tfp.mcmc`. The following
    recipe constructs a basic HMC sampler, using a
    `TransformedTransitionKernel` to incorporate constraints on the
    parameter space.

    ```python
    transformed_hmc_kernel = mcmc.TransformedTransitionKernel(
        inner_kernel=mcmc.SimpleStepSizeAdaptation(
            inner_kernel=mcmc.HamiltonianMonteCarlo(
                target_log_prob_fn=model.joint_log_prob(observed_time_series),
                step_size=step_size,
                num_leapfrog_steps=num_leapfrog_steps,
                state_gradients_are_stopped=True,
                seed=seed),
            num_adaptation_steps = int(0.8 * num_warmup_steps)),
        bijector=[param.bijector for param in model.parameters])

    # Initialize from a Uniform[-2, 2] distribution in unconstrained space.
    initial_state = [tfp.sts.sample_uniform_initial_state(
      param, return_constrained=True) for param in model.parameters]

    samples, kernel_results = tfp.mcmc.sample_chain(
      kernel=transformed_hmc_kernel,
      num_results=num_results,
      current_state=initial_state,
      num_burnin_steps=num_warmup_steps)
    ```

    #### References

    [1]: Radford Neal. MCMC Using Hamiltonian Dynamics. _Handbook of Markov
         Chain Monte Carlo_, 2011. https://arxiv.org/abs/1206.1901
    [2]  M.J. Betancourt, Simon Byrne, and Mark Girolami. Optimizing The
         Integrator Step Size for Hamiltonian Monte Carlo.
         https://arxiv.org/abs/1411.6669

    """
    with tf.name_scope(name or 'fit_with_hmc') as name:
        # Remember whether the caller supplied a seed *before* `seed` is
        # rebound to a SeedStream. The stream object is never None, so
        # testing it later would always select the seeded branch.
        is_seeded = seed is not None
        seed = tfd.SeedStream(seed, salt='StructuralTimeSeries_fit_with_hmc')

        # Initialize state and step sizes from a variational posterior if
        # not specified.
        if initial_step_size is None or initial_state is None:

            # To avoid threading variational distributions through the
            # training while loop, we build our own copy here.
            # `make_template` ensures that our variational distributions
            # share the optimized parameters.
            def make_variational():
                return build_factored_variational_loss(
                    model,
                    observed_time_series,
                    init_batch_shape=chain_batch_shape,
                    seed=seed())

            make_variational = tf1.make_template('make_variational',
                                                 make_variational)
            _, variational_distributions = make_variational()
            minimize_op = _minimize_in_graph(
                build_loss_fn=lambda: make_variational()[
                    0],  # return just the loss.
                num_steps=num_variational_steps,
                optimizer=variational_optimizer)

            # Anything read from the variational posterior must wait for
            # the optimization to finish.
            with tf.control_dependencies([minimize_op]):
                if initial_state is None:
                    initial_state = [
                        tf.stop_gradient(d.sample())
                        for d in variational_distributions.values()
                    ]

                # Set step sizes using the unconstrained variational
                # distribution.
                if initial_step_size is None:
                    initial_step_size = [
                        transformed_q.distribution.stddev()
                        for transformed_q in
                        variational_distributions.values()
                    ]

        # Multiple chains manifest as an extra param batch dimension, so we
        # need to add a corresponding batch dimension to
        # `observed_time_series`.
        observed_time_series = (
            sts_util.pad_batch_dimension_for_multiple_chains(
                observed_time_series, model,
                chain_batch_shape=chain_batch_shape))

        # Run HMC to sample from the posterior on parameters.
        samples, kernel_results = mcmc.sample_chain(
            num_results=num_results,
            current_state=initial_state,
            num_burnin_steps=num_warmup_steps,
            kernel=mcmc.SimpleStepSizeAdaptation(
                inner_kernel=mcmc.TransformedTransitionKernel(
                    inner_kernel=mcmc.HamiltonianMonteCarlo(
                        target_log_prob_fn=model.joint_log_prob(
                            observed_time_series),
                        step_size=initial_step_size,
                        num_leapfrog_steps=num_leapfrog_steps,
                        state_gradients_are_stopped=True,
                        seed=seed()),
                    bijector=[param.bijector for param in model.parameters]),
                num_adaptation_steps=int(num_warmup_steps * 0.8),
                adaptation_rate=tf.convert_to_tensor(
                    value=0.1, dtype=initial_state[0].dtype)),
            # Sequential execution only when a seed was requested.
            parallel_iterations=1 if is_seeded else 10)

        return samples, kernel_results
def fit(net,
        upsample_factor,
        channels_times_layers,
        img_shape,
        image_mode,
        decoder_type,
        upsample_mode,
        filter_size,
        img_name,
        type_measurements,
        num_measurements,
        num_channels_real,
        num_layers,
        act_function,
        y_feed,
        A_feed,
        mask_info1,
        mask_info2,
        mask_feed=None,
        lr_decay_epoch=0,
        lr_decay_rate=0.65,
        LR=0.01,
        OPTIMIZER='adam',
        num_iter=5000,
        find_best=False,
        verbose=False,
        input_size=128,
        random_vector=None,
        selection_mask=None,
        save=False,
        random_array=None):
    """Fit a generative model to a single observation by minimizing the MSE.

    A fresh graph is built in which a fixed, non-trainable uniform-noise
    tensor `z` is passed through `net` (wrapped with `tf.make_template`).
    The network output is pushed through the forward model selected by
    `mask_feed` / `type_measurements`, and the mean squared error against
    the observation is minimized for `num_iter` steps.

    Args:
      net: The generative model; called once on the 4-D noise tensor `z`.
      channels_times_layers: Number of upsample channels, e.g. [k, k, ...].
        Only element 0 is read here, as the channel count of `z`.
      img_shape: Original real image shape, a 4-D shape such as
        [1, 128, 128, 3]. Entries 1 to 3 are read.
      image_mode: '1D', '2D' or '3D'. With '3D' the measurement matrix `A`
        is not applied to the network output.
      upsample_mode: 'bilinear' (the noise is `input_size` wide; its height
        is `img_shape[2]` for '1D' and `input_size` otherwise) or 'none'
        (the noise has the spatial size of the image).
      type_measurements: 'random', 'identity' or 'circulant'. Only consulted
        when `mask_feed` is None.
      num_measurements: Number of measurements for 'random' measurements.
      y_feed: Observation fed into the `y` placeholder.
      A_feed: Measurement matrix fed into the `A` placeholder whenever `A`
        is part of the forward model.
      mask_feed: If not None, the inpainting problem is solved: the network
        output is multiplied elementwise by this mask.
      lr_decay_epoch: If > 0, number of steps between staircase decays of
        the learning rate.
      lr_decay_rate: Decay factor used when `lr_decay_epoch` > 0.
      LR: Initial learning rate.
      OPTIMIZER: 'adam'. 'LBFGS' is recognised but not implemented.
      num_iter: Number of optimization steps.
      find_best: If True, keep the network output from the last step whose
        loss beat the best loss so far by more than 0.5%, and return it
        instead of the final output. Also creates the `log` and `result`
        directories if they do not exist.
      verbose: If True, additionally return the value of `get_num_params()`.
      input_size: Spatial size of the noise for 'bilinear' upsampling.
      random_vector: Vector defining the circulant matrix ('circulant').
      selection_mask: Mask multiplied into the circulant measurements.
      random_array: Array multiplied into the flattened output before the
        circulant operator is applied.
      upsample_factor, decoder_type, filter_size, img_name,
        num_channels_real, num_layers, act_function, mask_info1, mask_info2,
        save: Accepted for interface compatibility; not read here.

    Returns:
      `(mse, out_img)`, or `(mse, out_img, num_params)` if `verbose`.
      `mse` is a list with the loss of every iteration and `out_img` is the
      evaluated network output.

    Raises:
      NotImplementedError: If `OPTIMIZER` is 'LBFGS'.
      ValueError: If `type_measurements`, `upsample_mode`, `image_mode` or
        `OPTIMIZER` has an unsupported value. These cases previously
        surfaced as a `NameError` on an undefined local.
    """
    with tf.Graph().as_default():
        # Global step
        global_step = tf.train.get_or_create_global_step()

        # The measurement matrix A is part of the forward model only for the
        # non-circulant, non-3D compressed sensing / denoising problems.
        uses_matrix = (mask_feed is None and
                       type_measurements != 'circulant' and
                       image_mode != '3D')

        # Set up placeholders
        if mask_feed is None:
            n_input = img_shape[1] * img_shape[2] * img_shape[3]
            if type_measurements == 'random':
                # Compressed sensing with a random matrix.
                A = tf.placeholder(tf.float32,
                                   shape=(n_input, num_measurements),
                                   name='A')  # e.g. [img_wid*img_high*3, 200]
                y = tf.placeholder(tf.float32,
                                   shape=(1, num_measurements),
                                   name='y')  # e.g. [1, 200]
            elif type_measurements == 'identity':
                # Denoising.
                if image_mode != '3D':
                    A = tf.placeholder(tf.float32,
                                       shape=(n_input, n_input),
                                       name='A')
                y = tf.placeholder(tf.float32,
                                   shape=(1, n_input),
                                   name='y')  # e.g. [1, img_wid*img_high*3]
            elif type_measurements == 'circulant':
                # Compressed sensing with a circulant matrix.
                y = tf.placeholder(tf.float32,
                                   shape=(1, n_input),
                                   name='y')  # e.g. [1, img_wid*img_high*3]
            else:
                raise ValueError('Unsupported type_measurements: {!r}'.format(
                    type_measurements))
        else:
            # Inpainting
            y = tf.placeholder(
                tf.float32,
                shape=(1, img_shape[1], img_shape[2], img_shape[3]),
                name='y')

        # Define input uniform noise
        if upsample_mode == 'bilinear':
            # Fix input size and output size.
            width = input_size
            if image_mode == '1D':
                height = int(img_shape[2])
            elif image_mode == '2D' or image_mode == '3D':
                height = input_size
            else:
                raise ValueError(
                    'Unsupported image_mode: {!r}'.format(image_mode))
        elif upsample_mode == 'none':
            width = int(img_shape[1])
            height = int(img_shape[2])
        else:
            raise ValueError(
                'Unsupported upsample_mode: {!r}'.format(upsample_mode))

        z = tf.constant(
            np.random.uniform(
                size=[1, width, height, channels_times_layers[0]]).astype(
                    np.float32) * 1. / 10)
        z = tf.Variable(z, name='z', trainable=False)

        # Deep decoder prior. feed_forward takes a 4D Tensor
        # (batch, width, height, channels) as input.
        feed_forward = tf.make_template("DeepDecoder", net)
        x = feed_forward(z)  # net output, e.g. [1, img_wid, img_high, 3]

        # Inverse problem settings
        def circulant_tf(signal_vector, random_vector, selection_mask):
            """Applies the subsampled circulant measurement operator."""
            signal_vector = tf.cast(signal_vector,
                                    dtype=tf.complex64,
                                    name='circulant_real2complex')
            t = tf.convert_to_tensor(random_vector, dtype=tf.complex64)
            # step 1: F^{-1} @ x
            r1 = tf.signal.ifft(signal_vector, name='circulant_step1_ifft')
            # step 2: Diag() @ F^{-1} @ x
            Ft = tf.signal.fft(t)
            r2 = tf.multiply(r1, Ft, name='circulant_step2_diag')
            # step 3: F @ Diag() @ F^{-1} @ x
            compressive = tf.signal.fft(r2, name='circulant_step3_fft')
            float_compressive = tf.cast(compressive,
                                        tf.float32,
                                        name='circulant_complex2real')
            # step 4: R_{omega} @ C_{t}
            select_compressive = tf.multiply(float_compressive,
                                             selection_mask,
                                             name='circulant_step4_A')
            return select_compressive

        if mask_feed is None:
            # Compressed sensing & Denoising
            if type_measurements == 'circulant':
                # Compressed sensing with Circulant matrix
                flip = tf.convert_to_tensor(random_array, dtype=tf.float32)
                x_circulant = tf.reshape(x, [1, -1]) * flip
                y_hat = circulant_tf(x_circulant, random_vector,
                                     selection_mask)
            else:
                # Compressed sensing with Random matrix & Denoising
                if image_mode != '3D':
                    y_hat = tf.matmul(tf.reshape(x, [1, -1]), A)
                else:
                    y_hat = tf.reshape(x, [1, -1])
        else:
            # Inpainting
            y_hat = x * mask_feed

        # Define loss. The loss function is called directly so that the name
        # `mse` refers only to the per-iteration history below.
        loss = tf.losses.mean_squared_error(y, y_hat)

        # Define learning rate
        if lr_decay_epoch > 0:
            LR = tf.train.exponential_decay(LR,
                                            global_step,
                                            lr_decay_epoch,
                                            lr_decay_rate,
                                            staircase=True)

        # Define optimizer
        if OPTIMIZER == 'adam':
            optimizer = tf.train.AdamOptimizer(LR)
        elif OPTIMIZER == 'LBFGS':
            raise NotImplementedError('LBFGS Optimizer')
        else:
            raise ValueError('Unsupported OPTIMIZER: {!r}'.format(OPTIMIZER))
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)

        # Set up gpu
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.85
        config.log_device_placement = True

        # The config is handed to the session; it used to be built and then
        # dropped, so the settings above never took effect.
        with tf.Session(config=config) as sess:
            # Init
            mse = [0.] * num_iter
            sess.run(tf.global_variables_initializer())

            # Initial deep decoder output
            if find_best:
                if not os.path.exists('log'):
                    os.makedirs('log/')
                if not os.path.exists('result'):
                    os.makedirs('result/')
                best_mse = 1000000.0
                best_img = sess.run(x)

            # Feed dict: y is always fed; A is fed whenever the forward model
            # multiplies by it (this includes 'random' measurements, which
            # previously had no usable feed dict).
            feed_dict = {y: y_feed}
            if uses_matrix:
                feed_dict[A] = A_feed

            # Optimize
            num_params = get_num_params()
            sess.graph.finalize()

            for i in range(num_iter):
                loss_, _ = sess.run([loss, train_op], feed_dict=feed_dict)
                mse[i] = loss_

                # Best net: only accept improvements of more than 0.5%.
                if find_best and best_mse > 1.005 * loss_:
                    best_mse = loss_
                    best_img = sess.run(x)

            # Return final image or best found so far if `find_best`
            if find_best:
                out_img = best_img
            else:
                out_img = sess.run(x)

            if verbose:
                return mse, out_img, num_params
            else:
                return mse, out_img
def __init__(self,
             train_batch_size=4096,
             test_chain_batch_size=4096,
             bijector="iaf",
             log_dir="/tmp/neutra",
             base_learning_rate=1e-3,
             q_base_scale=1.,
             learning_rate_schedule=[[6000, 1e-1]]):
    """Builds the target, the variational approximation and all train/test ops.

    Args:
      train_batch_size: Number of samples drawn from the training
        approximation per step (converted with `int`).
      test_chain_batch_size: Default value of the `test_chain_batch_size`
        placeholder, which is passed to `MakeNeuTra` as `batch_size`.
      bijector: "rnvp", "iaf" or "affine"; any other value selects
        `tfb.Identity`.
      log_dir: Stored as `self.log_dir`.
      base_learning_rate: Learning rate that the schedule factors multiply.
      q_base_scale: Scale of the diagonal normal base distribution.
      learning_rate_schedule: Sequence of `[step, factor]` pairs handed to
        `tf.train.piecewise_constant`; the factor before the first step is
        1.0.
    """
    self.target, self.target_spec = GetTargetSpec()
    with gin.config_scope("train"):
        self.train_target, self.train_target_spec = GetTargetSpec()

    num_dims = self.target_spec.num_dims

    # Pick the function that builds the bijector; None means identity.
    if bijector == "rnvp":
        maker = MakeRNVPBijectorFn
    elif bijector == "iaf":
        maker = MakeIAFBijectorFn
    elif bijector == "affine":
        maker = MakeAffineBijectorFn
    else:
        maker = None

    if maker is None:
        bijector_fn = lambda *args, **kwargs: tfb.Identity()
    else:
        # The template makes the train and test bijectors share variables.
        bijector_fn = tf.make_template("bijector", maker, num_dims=num_dims)

    self.train_bijector = bijector_fn(train=True)
    self.bijector = bijector_fn(train=False)

    train_spec_bijector = self.train_target_spec.bijector
    if train_spec_bijector is not None:
        print("Using train target bijector")
        self.train_bijector = tfb.Chain(
            [train_spec_bijector, self.train_bijector])

    spec_bijector = self.target_spec.bijector
    if spec_bijector is not None:
        print("Using target bijector")
        self.bijector = tfb.Chain([spec_bijector, self.bijector])

    base_loc = tf.zeros(num_dims)
    base_scale = q_base_scale * tf.ones(num_dims)
    q_base = tfd.Independent(tfd.Normal(loc=base_loc, scale=base_scale), 1)
    self.q_x_train = tfd.TransformedDistribution(q_base, self.train_bijector)
    self.q_x = tfd.TransformedDistribution(q_base, self.bijector)

    # Params
    def scalar_placeholder(default, name):
        return tf.placeholder_with_default(default, [], name)

    self.train_batch_size = int(train_batch_size)
    self.test_chain_batch_size = scalar_placeholder(
        test_chain_batch_size, "test_chain_batch_size")
    self.test_batch_size = scalar_placeholder(16384 * 8, "test_batch_size")
    self.test_num_steps = scalar_placeholder(1000, "test_num_steps")
    self.test_num_leapfrog_steps = scalar_placeholder(
        tf.to_int32(2), "test_num_leapfrog_steps")
    self.test_step_size = scalar_placeholder(0.1, "test_step_size")

    # Test
    self.neutra_outputs = MakeNeuTra(
        target=self.target,
        q=self.q_x,
        batch_size=self.test_chain_batch_size,
        num_steps=self.test_num_steps,
        num_leapfrog_steps=self.test_num_leapfrog_steps,
        step_size=self.test_step_size,
    )
    x_chain = self.neutra_outputs.x_chain
    # Flatten the chain, map it through the inverse bijector, restore shape.
    flat_x_chain = tf.reshape(x_chain, [-1, num_dims])
    flat_z_chain = self.bijector.inverse(flat_x_chain)
    self.z_chain = tf.reshape(flat_z_chain, tf.shape(x_chain))

    self.target_samples = self.target.sample(self.test_batch_size)
    self.target_z = self.bijector.inverse(self.target_samples)
    self.q_samples = self.q_x.sample(self.test_batch_size)

    self.target_cov = utils.Covariance(self.target_samples)
    eigvals, eigvecs = tf.linalg.eigh(self.target_cov)
    self.target_eigvals = eigvals
    self.target_eigvecs = eigvecs

    self.cached_target_eigvals = tf.get_local_variable(
        "cached_target_eigvals",
        eigvals.shape,
        initializer=tf.zeros_initializer())
    self.cached_target_eigvecs = tf.get_local_variable(
        "cached_target_eigvecs",
        eigvecs.shape,
        initializer=tf.zeros_initializer())
    assign_eigvals = self.cached_target_eigvals.assign(eigvals)
    assign_eigvecs = self.cached_target_eigvecs.assign(eigvecs)
    self.cached_target_stats_update_op = [
        assign_eigvals,
        assign_eigvecs,
        tf.print("Assigning target stats"),
    ]

    # The two helpers below are only referenced by the disabled entries of
    # `functions`.
    def variance(x):
        x = x - tf.reduce_mean(x, 0, keep_dims=True)
        return tf.square(x)

    def rotated_variance(x):
        x2 = tf.reshape(x, [-1, self.target_spec.num_dims])
        x2 = x2 - tf.reduce_mean(x2, 0, keep_dims=True)
        x2 = tf.matmul(x2, self.cached_target_eigvecs)
        x2 = tf.square(x2)
        return tf.reshape(x2, tf.shape(x))

    functions = [
        ("mean", tf.identity),
        # ("var", variance),
        ("square", tf.square),
        # ("rot_square", rot_square),
        # ("rot_var", rotated_variance),
    ]

    self.cached_target_mean = {}
    self.cached_target_mean_update_op = [tf.print("Assigning target means.")]
    self.neutra_stats = {}
    self.q_stats = {}

    for stat_name, stat_fn in functions:
        target_mean = tf.reduce_mean(stat_fn(self.target_samples), 0)
        cached_mean = tf.get_local_variable(stat_name + "_cached_mean",
                                            target_mean.shape)
        # Prefer the statistics shipped with the target spec over the
        # sample-based estimate.
        known_stats = self.target_spec.stats
        if known_stats is None:
            update = cached_mean.assign(target_mean)
        else:
            update = cached_mean.assign(known_stats[stat_name])
        self.cached_target_mean_update_op.append(update)

        self.cached_target_mean[stat_name] = cached_mean
        self.q_stats[stat_name] = ComputeQStats(
            stat_fn(self.q_samples), cached_mean)
        self.neutra_stats[stat_name] = ComputeChainStats(
            stat_fn(self.neutra_outputs.x_chain), cached_mean,
            self.test_num_leapfrog_steps)

    # Training
    self.train_q_samples = self.q_x_train.sample(self.train_batch_size)
    self.train_log_q_x = self.q_x_train.log_prob(self.train_q_samples)
    # NOTE(review): the KL term evaluates `self.target`, not
    # `self.train_target` -- confirm this is intended.
    target_log_prob = self.target.log_prob(self.train_q_samples)
    self.kl_q_p = tf.reduce_mean(self.train_log_q_x - target_log_prob)

    total_loss = self.kl_q_p
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    if reg_losses:
        tf.logging.info("Regularizing.")
        total_loss = total_loss + tf.add_n(reg_losses)
    self.loss = tf.check_numerics(total_loss, "Loss has NaNs")

    self.global_step = tf.train.get_or_create_global_step()
    steps, factors = zip(*learning_rate_schedule)
    schedule = tf.train.piecewise_constant(self.global_step, steps,
                                           [1.0] + list(factors))
    learning_rate = base_learning_rate * schedule

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.train_op = optimizer.minimize(self.loss,
                                       global_step=self.global_step)

    tf.summary.scalar("kl_q_p", self.kl_q_p)
    tf.summary.scalar("loss", self.loss)

    self.init = [
        tf.global_variables_initializer(),
        tf.local_variables_initializer(),
        tf.print("Initializing variables"),
    ]

    self.saver = tf.train.Saver()
    self.log_dir = log_dir
def init_networks(self):
    """Init training and validation networks.

    Builds the training graph (data queue, templated network, losses and
    Adam train op), initializes all variables in `self.sess`, and then
    builds the single-frame validation graph that reuses the variables
    under the `net/rnn` scope.

    Raises:
      NotImplementedError: If `self.data_format` is not 'channels_first'.
    """
    network_image_size = list(reversed(self.image_size))
    num_instances = 1 if self.bitwise_instance_image else None
    num_instances_val = None

    if self.data_format != 'channels_first':
        # This used to be `assert '<message>'`, which never fails because a
        # non-empty string is truthy; the method then crashed later with a
        # NameError on the undefined entry dictionaries.
        raise NotImplementedError('channels_last not supported')

    data_generator_entries = OrderedDict([
        ('image', [1, self.num_frames] + network_image_size),
        ('instances_merged',
         [num_instances, self.num_frames] + network_image_size),
        ('instances_bac',
         [num_instances, self.num_frames] + network_image_size),
    ])
    data_generator_entries_test_cropped_single_frame = OrderedDict([
        ('image', [1] + network_image_size),
        ('instances_merged', [num_instances_val] + network_image_size),
        ('instances_bac', [num_instances_val] + network_image_size),
    ])
    embedding_normalization_function = lambda x: tf.nn.l2_normalize(
        x, dim=self.channel_axis)

    data_generator_types = {
        'image': tf.float32,
        'instances_merged': self.bitwise_instances_image_type,
        'instances_bac': self.bitwise_instances_image_type
    }

    # create model with shared weights between train and val
    training_net = tf.make_template('net', self.network)

    # build train graph
    self.train_queue = DataGeneratorPadding(self.dataset_train,
                                            self.coord,
                                            data_generator_entries,
                                            batch_size=self.batch_size,
                                            data_types=data_generator_types,
                                            n_threads=4)
    data, instances_tra, instances_bac = self.train_queue.dequeue()
    embeddings_tuple = training_net(
        data,
        num_outputs_embedding=self.num_embeddings,
        is_training=True,
        data_format=self.data_format,
        actual_network=self.actual_network,
        **self.network_parameters)

    # The loss function expects a tuple of embeddings.
    if not isinstance(embeddings_tuple, tuple):
        embeddings_tuple = (embeddings_tuple, )

    loss_reg = get_reg_loss(self.reg_constant, True)

    with tf.name_scope('train_loss'):
        train_losses_dict = self.losses(
            embeddings_tuple,
            instances_tra,
            instances_bac,
            bitwise_instances=self.bitwise_instance_image)
        train_losses_dict['loss_reg'] = loss_reg
        self.loss = tf.reduce_sum(list(train_losses_dict.values()))
        self.train_losses = train_losses_dict

    # solver
    global_step = tf.Variable(self.current_iter)
    learning_rate = tf.train.piecewise_constant(
        global_step, self.learning_rate_boundaries, self.learning_rates)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.optimizer = optimizer.minimize(self.loss, global_step=global_step)

    # initialize variables
    self.sess.run(tf.global_variables_initializer())
    self.sess.run(tf.local_variables_initializer())

    print('Variables')
    for variable in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
        print(variable)

    # build val graph
    val_placeholders_cropped = create_placeholders(
        data_generator_entries_test_cropped_single_frame, shape_prefix=[1])
    self.data_cropped_val = val_placeholders_cropped['image']
    self.instances_cropped_tra_val = val_placeholders_cropped[
        'instances_merged']
    self.instances_cropped_bac_val = val_placeholders_cropped[
        'instances_bac']

    # Reuse the variables created by the training template.
    with tf.variable_scope('net/rnn', reuse=True):
        output_tuple = network_single_frame_with_lstm_states(
            self.data_cropped_val,
            num_outputs_embedding=self.num_embeddings,
            data_format=self.data_format,
            actual_network=self.actual_network,
            **self.network_parameters)
        # Outputs are (lstm input states, lstm output states, *embeddings).
        self.lstm_input_states_cropped_val = output_tuple[0]
        self.lstm_output_states_cropped_val = output_tuple[1]
        self.embeddings_cropped_val = output_tuple[2:]

    if not isinstance(self.embeddings_cropped_val, tuple):
        self.embeddings_cropped_val = (self.embeddings_cropped_val, )

    with tf.variable_scope('loss'):
        val_losses_dict = self.losses(self.embeddings_cropped_val,
                                      self.instances_cropped_tra_val,
                                      self.instances_cropped_bac_val,
                                      bitwise_instances=False)
        val_losses_dict['loss_reg'] = loss_reg
        self.loss_val = tf.reduce_sum(list(val_losses_dict.values()))
        self.val_losses = val_losses_dict

    if not self.normalized_embeddings:
        self.embeddings_cropped_val = tuple(
            embedding_normalization_function(e)
            for e in self.embeddings_cropped_val)
def masked_autoregressive_conditional_template(
        hidden_layers,
        conditional_tensor,
        shift_only=False,
        activation=tf.nn.relu,
        log_scale_min_clip=-3.,
        log_scale_max_clip=3.,
        log_scale_clip_gradient=True,
        name=None,
        *args,  # pylint: disable=keyword-arg-before-vararg
        **kwargs):
    """Build the Masked Autoregressive Density Estimator (Germain et al., 2015).

    This will be wrapped in a make_template to ensure the variables are only
    created once. It takes the input and returns the `loc` ('mu' in [Germain
    et al. (2015)][1]) and `log_scale` ('alpha' in [Germain et al. (2015)][1])
    from the MADE network.

    `conditional_tensor` is concatenated in front of the input along the last
    axis before the MADE layers are applied, and the outputs that belong to
    the conditional dimensions are sliced off again afterwards.

    Warning: This function uses `masked_dense` to create randomly initialized
    `tf.Variables`. It is presumed that these will be fit, just as you would
    any other neural architecture which uses `tf.layers.dense`.

    #### About Hidden Layers

    Each element of `hidden_layers` should be greater than the `input_depth`
    (i.e., `input_depth = tf.shape(input)[-1]` where `input` is the input to
    the neural network). This is necessary to ensure the autoregressivity
    property.

    #### About Clipping

    This function also optionally clips the `log_scale` (but possibly not its
    gradient). This is useful because if `log_scale` is too small/large it
    might underflow/overflow making it impossible for the
    `MaskedAutoregressiveFlow` bijector to implement a bijection.
    Additionally, the `log_scale_clip_gradient` `bool` indicates whether the
    gradient should also be clipped. With `log_scale_clip_gradient = False`
    the gradient is not clipped; this is useful because it still provides
    gradient information (for fitting) yet solves the numerical stability
    problem. I.e., `log_scale_clip_gradient = False` means
    `grad[exp(clip(x))] = grad[x] exp(clip(x))` rather than the usual
    `grad[clip(x)] exp(clip(x))`. Note that the default of this function is
    `True`, i.e. `tf.clip_by_value` is used and the gradient is clipped too.

    Args:
      hidden_layers: Python `list`-like of non-negative integer, scalars
        indicating the number of units in each hidden layer. Required.
      conditional_tensor: `Tensor` to condition on. It is concatenated with
        the input along the last axis, so its rightmost dimension must be
        known prior to graph execution.
      shift_only: Python `bool` indicating if only the `shift` term shall be
        computed. Default: `False`.
      activation: Activation function (callable). Explicitly setting to
        `None` implies a linear activation. Default: `tf.nn.relu`.
      log_scale_min_clip: `float`-like scalar `Tensor`, or a `Tensor` with the
        same shape as `log_scale`. The minimum value to clip by.
        Default: -3.
      log_scale_max_clip: `float`-like scalar `Tensor`, or a `Tensor` with the
        same shape as `log_scale`. The maximum value to clip by. Default: 3.
      log_scale_clip_gradient: Python `bool`. If `True`, `log_scale` is
        clipped with `tf.clip_by_value`; if `False`, with
        `clip_by_value_preserve_gradient`. Default: `True`.
      name: A name for ops managed by this function. Default:
        'masked_autoregressive_conditional_template'.
      *args: `tf.layers.dense` arguments.
      **kwargs: `tf.layers.dense` keyword arguments.

    Returns:
      A template (created with `tf1.make_template`) that maps an input `x`
      to a pair:
        shift: `Float`-like `Tensor` of shift terms (the 'mu' in [Germain et
          al. (2015)][1]).
        log_scale: `Float`-like `Tensor` of log(scale) terms (the 'alpha' in
          [Germain et al. (2015)][1]), or `None` if `shift_only` is `True`.

    Raises:
      NotImplementedError: (when the template is called) if the rightmost
        dimension of the concatenated input is unknown prior to graph
        execution.

    #### References

    [1]: Mathieu Germain, Karol Gregor, Iain Murray, and Hugo Larochelle.
         MADE: Masked Autoencoder for Distribution Estimation. In
         _International Conference on Machine Learning_, 2015.
         https://arxiv.org/abs/1502.03509
    """
    name = name or 'masked_autoregressive_conditional_template'
    with tf.name_scope(name):

        def _fn(x):
            """MADE parameterized via this conditional template."""
            # TODO(b/67594795): Better support of dynamic shape.
            cond_depth = tf.compat.dimension_value(
                tensorshape_util.with_rank_at_least(conditional_tensor.shape,
                                                    1)[-1])
            # Shape of the unconditioned input; the outputs are reshaped back
            # to it once the conditional part has been sliced off.
            input_shape = (np.int32(tensorshape_util.as_list(x.shape))
                           if tensorshape_util.is_fully_defined(x.shape) else
                           tf.shape(x))
            if tensorshape_util.rank(x.shape) == 1:
                x = x[tf.newaxis, ...]
            x = tf.concat([conditional_tensor, x], axis=-1)
            input_depth = tf.compat.dimension_value(
                tensorshape_util.with_rank_at_least(x.shape, 1)[-1])
            if input_depth is None:
                raise NotImplementedError(
                    'Rightmost dimension must be known prior to graph execution.'
                )
            for i, units in enumerate(hidden_layers):
                x = masked_dense(
                    inputs=x,
                    units=units,
                    num_blocks=input_depth,
                    # Only the first layer uses the exclusive mask.
                    exclusive=(i == 0),
                    activation=activation,
                    *args,  # pylint: disable=keyword-arg-before-vararg
                    **kwargs)
            x = masked_dense(
                inputs=x,
                units=(1 if shift_only else 2) * input_depth,
                num_blocks=input_depth,
                activation=None,
                *args,  # pylint: disable=keyword-arg-before-vararg
                **kwargs)
            if shift_only:
                # Drop the outputs that belong to the conditional dimensions.
                x = x[..., cond_depth:]
                x = tf.reshape(x, shape=input_shape)
                return x, None
            else:
                # Two outputs per dimension, so twice as many are dropped.
                x = x[..., 2 * cond_depth:]
                x = tf.reshape(x, shape=tf.concat([input_shape, [2]], axis=0))
                shift, log_scale = tf.unstack(x, num=2, axis=-1)
                which_clip = (tf.clip_by_value if log_scale_clip_gradient else
                              clip_by_value_preserve_gradient)
                log_scale = which_clip(log_scale, log_scale_min_clip,
                                       log_scale_max_clip)
                return shift, log_scale

        return tf1.make_template(name, _fn)
def __init__(self, _sentinel=None, custom_getter=None,
             name=None):  # pylint: disable=invalid-name
    """Performs the initialisation necessary for all AbstractModule instances.

    Every subclass of AbstractModule must begin their constructor with a call
    to this constructor, i.e.

    `super(MySubModule, self).__init__(custom_getter=custom_getter, name=name)`.

    If you instantiate sub-modules in __init__ you must create them within the
    `_enter_variable_scope` context manager to ensure they are in the module's
    variable scope. Alternatively, instantiate sub-modules in `_build`.

    Args:
      _sentinel: Variable that only carries a non-None value if `__init__` was
          called without named parameters. If this is the case, a deprecation
          warning is issued in form of a `ValueError`.
      custom_getter: Callable or dictionary of callables to use as
          custom getters inside the module. If a dictionary, the keys
          correspond to regexes to match variable names. See the
          `tf.get_variable` documentation for information about the
          custom_getter API.
      name: Name of this module. Used to construct the Templated build
          function. If `None` the module's class name is used (converted to
          snake case).

    Raises:
      TypeError: If `name` is not a string.
      TypeError: If a given `custom_getter` is not callable.
      ValueError: If `__init__` was called without named arguments.
    """
    # `collections.Mapping` was removed in Python 3.10; the ABCs live in
    # `collections.abc`. Fall back to `collections` itself on Python 2.
    try:
        from collections import abc as collections_abc  # pylint: disable=g-import-not-at-top
    except ImportError:
        collections_abc = collections

    if _sentinel is not None:
        raise ValueError("Calling AbstractModule.__init__ without named "
                         "arguments is not supported.")

    if name is None:
        name = util.to_snake_case(self.__class__.__name__)
    elif not isinstance(name, six.string_types):
        raise TypeError("Name must be a string, not {} of type {}.".format(
            name, type(name)))

    self._is_connected = False
    self._connected_subgraphs = []

    # If the given custom getter is a dictionary with a per-variable custom
    # getter, wrap it into a single custom getter.
    if isinstance(custom_getter, collections_abc.Mapping):
        self._custom_getter = util.custom_getter_router(
            custom_getter_map=custom_getter,
            # Strip the module's scope name (and the following "/") from the
            # variable name before it is matched against the regexes.
            name_fn=lambda name: name[len(self.scope_name) + 1:])
    elif custom_getter is not None and not callable(custom_getter):
        raise TypeError("Given custom_getter is not callable.")
    else:
        self._custom_getter = custom_getter

    self._template = tf.make_template(name,
                                      self._build_wrapper,
                                      create_scope_now_=True,
                                      custom_getter_=self._custom_getter)

    self._original_name = name
    # The template may have uniquified the scope; keep its last component.
    self._unique_name = self._template.variable_scope.name.split("/")[-1]

    # Copy signature of _build to __call__.
    adapter_fn = getattr(self._build, "__func__", self._build)

    @wrapt.decorator(adapter=adapter_fn)
    def copy_signature(method, unused_instance, args, kwargs):
        return method(*args, **kwargs)

    @copy_signature
    def __call__(instance, *args, **kwargs):  # pylint: disable=invalid-name
        return AbstractModule.__call__(instance, *args, **kwargs)

    # use __dict__ instead of setting directly to avoid a Callable pytype error
    self.__dict__["__call__"] = types.MethodType(__call__, self)

    # Update __call__ and the object docstrings to enable better introspection.
    self.__doc__ = self._build.__doc__
    self.__call__.__func__.__doc__ = self._build.__doc__

    # Keep track of which graph this module has been connected to. Sonnet
    # modules cannot be connected to multiple graphs, as transparent variable
    # sharing is impossible in that case.
    self._graph = None

    # Container for all variables created in this module and its sub-modules.
    self._all_variables = set()

    # Calling `.defun()` causes the module's call method to become wrapped as
    # a graph function.
    self._defun_wrapped = False
def init_networks(self):
    """Builds the training and validation graphs with shared weights.

    The training graph reads batches from a `DataGenerator` queue and
    minimizes the network loss (plus the weighted regularization losses if
    `self.reg_constant > 0`) with Adam. The validation graph applies the
    same templated network to placeholders.
    """
    network_image_size = self.image_size

    if self.data_format == 'channels_first':
        data_generator_entries = OrderedDict([
            ('image', [1] + network_image_size),
            ('labels', [self.num_labels] + network_image_size),
        ])
    else:
        data_generator_entries = OrderedDict([
            ('image', network_image_size + [1]),
            ('labels', network_image_size + [self.num_labels]),
        ])

    # create model with shared weights between train and val
    training_net = tf.make_template('net', self.network)

    # build train graph
    self.train_queue = DataGenerator(self.dataset_train,
                                     self.coord,
                                     data_generator_entries,
                                     batch_size=self.batch_size)
    data, mask = self.train_queue.dequeue()
    prediction, _, _ = training_net(data,
                                    num_labels=self.num_labels,
                                    is_training=True,
                                    data_format=self.data_format)

    # losses
    self.loss_net = self.loss_function(labels=mask,
                                       logits=prediction,
                                       data_format=self.data_format)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        if self.reg_constant > 0:
            reg_losses = tf.get_collection(
                tf.GraphKeys.REGULARIZATION_LOSSES)
            self.loss_reg = self.reg_constant * tf.add_n(reg_losses)
            self.loss = self.loss_net + self.loss_reg
        else:
            # `loss_reg` is reported in the loss dictionaries below, so it
            # must exist even when regularization is disabled; it used to be
            # left unset on this branch.
            self.loss_reg = tf.constant(0.0)
            self.loss = self.loss_net

    self.train_losses = OrderedDict([('loss', self.loss_net),
                                     ('loss_reg', self.loss_reg)])

    # solver
    global_step = tf.Variable(self.current_iter)
    learning_rate = tf.train.piecewise_constant(
        global_step, self.learning_rate_boundaries, self.learning_rates)
    self.optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(self.loss,
                                              global_step=global_step)

    # build val graph
    val_placeholders = tensorflow_train.utils.tensorflow_util.create_placeholders(
        data_generator_entries, shape_prefix=[1])
    self.data_val = val_placeholders['image']
    (self.prediction_val, self.local_prediction_val,
     self.spatial_prediction_val) = training_net(
         self.data_val,
         num_labels=self.num_labels,
         is_training=False,
         data_format=self.data_format)
    # Softmax over the channel axis.
    self.prediction_softmax_val = tf.nn.softmax(
        self.prediction_val,
        axis=1 if self.data_format == 'channels_first' else 4)

    if self.has_validation_groundtruth:
        self.mask_val = val_placeholders['labels']
        # losses
        self.loss_val = self.loss_function(labels=self.mask_val,
                                           logits=self.prediction_val,
                                           data_format=self.data_format)
        self.val_losses = OrderedDict([('loss', self.loss_val),
                                       ('loss_reg', self.loss_reg)])
def __init__(self,
             train_envs,
             test_envs,
             replay_buffer,
             obs_dim,
             action_dim,
             reward_dim,
             env_params_dim,
             seq_len,
             qf1,
             qf2,
             vf,
             policy,
             policy_lr=1e-3,
             qf_lr=1e-3,
             alpha_lr=1e-3,
             gamma=0.99,
             target_entropy='auto',
             tau=0.005,
             no_info_policy=False):
    """Builds the soft actor-critic graph and initializes all variables.

    Args:
      train_envs: Training environments; their ids are stored in
        `self.train_envs_ids`.
      test_envs: Test environments; their ids are stored in
        `self.test_envs_ids`.
      replay_buffer: Stored as `self.replay_buffer`.
      obs_dim: Size of the last axis of the observation placeholders.
      action_dim: Size of the last axis of the action placeholders.
      reward_dim: Size of the last axis of the reward placeholders.
      env_params_dim: Size of the last axis of the environment-parameter
        placeholders.
      seq_len: Length of the sequence axis of the `*_seq_var` placeholders.
      qf1: Network function for the first Q-function; wrapped in a template
        named 'qf1'.
      qf2: Network function for the second Q-function; wrapped in a template
        named 'qf2'.
      vf: Network function for the value function; wrapped twice, as
        'vf_main' and 'vf_target'.
      policy: Network function for the policy; wrapped in a template named
        'policy'.
      policy_lr: Learning rate of the policy optimizer.
      qf_lr: Learning rate of the joint Q/value optimizer.
      alpha_lr: Learning rate of the temperature optimizer.
      gamma: Discount factor used in the Q target.
      target_entropy: Entropy target for the temperature loss; 'auto' uses
        `-np.prod(action_dim)`.
      tau: Interpolation weight of the main value network in the target
        update.
      no_info_policy: Stored as `self.no_info_policy`.
    """
    self.replay_buffer = replay_buffer

    # environment
    self.train_envs = train_envs
    self.train_envs_ids = env_utils.get_env_id(self.train_envs)
    self.test_envs = test_envs
    self.test_envs_ids = env_utils.get_env_id(self.test_envs)

    # dims
    self.obs_dim = obs_dim
    self.action_dim = action_dim
    self.reward_dim = reward_dim
    self.env_params_dim = env_params_dim
    self.target_entropy = target_entropy if target_entropy != 'auto' else -np.prod(
        self.action_dim)

    # logger
    self.logger = EpochLogger()

    # learning rates
    self.policy_lr = policy_lr
    self.qf_lr = qf_lr
    self.vf_lr = qf_lr
    self.alpha_lr = alpha_lr

    # other params
    self.seq_len = seq_len
    self.gamma = gamma
    self.tau = tau
    self.no_info_policy = no_info_policy  # give policy the env params?

    # alpha (temperature), parameterized through its logarithm
    log_alpha = tf.compat.v1.get_variable('log_alpha',
                                          dtype=tf.float32,
                                          initializer=0.)
    self.alpha = tf.exp(log_alpha)

    # placeholders
    self.iteration_var = tf1.placeholder(tf.int64,
                                         shape=None,
                                         name='iteration')
    self.obs_var = tf1.placeholder(tf.float32,
                                   shape=(None, self.obs_dim),
                                   name='obs')
    self.next_obs_var = tf1.placeholder(tf.float32,
                                        shape=(None, self.obs_dim),
                                        name='next_obs')
    self.actions_var = tf1.placeholder(tf.float32,
                                       shape=(None, self.action_dim),
                                       name='actions')
    self.rewards_var = tf1.placeholder(tf.float32,
                                       shape=(None, self.reward_dim),
                                       name='rewards')
    self.terminals_var = tf1.placeholder(tf.float32,
                                         shape=(None, ),
                                         name='terminals')
    self.env_params_var = tf1.placeholder(tf.float32,
                                          shape=(None, self.env_params_dim),
                                          name='env_params')

    # placeholders for sequences
    self.obs_seq_var = tf1.placeholder(tf.float32,
                                       shape=(None, self.seq_len,
                                              self.obs_dim),
                                       name='obs_seq_var')
    self.next_obs_seq_var = tf1.placeholder(tf.float32,
                                            shape=(None, self.seq_len,
                                                   self.obs_dim),
                                            name='next_obs_seq_var')
    self.actions_seq_var = tf1.placeholder(tf.float32,
                                           shape=(None, self.seq_len,
                                                  self.action_dim),
                                           name='actions_seq')
    self.rewards_seq_var = tf1.placeholder(tf.float32,
                                           shape=(None, self.seq_len,
                                                  self.reward_dim),
                                           name='rewards_seq')
    self.terminals_seq_var = tf1.placeholder(tf.float32,
                                             shape=(None, self.seq_len),
                                             name='terminals_seq')
    self.env_params_seq_var = tf1.placeholder(tf.float32,
                                              shape=(None, self.seq_len,
                                                     self.env_params_dim),
                                              name='env_params_seq')

    # templates
    self.qf1 = tf1.make_template('qf1', qf1)
    self.qf2 = tf1.make_template('qf2', qf2)
    self.vf = tf1.make_template('vf_main', vf)
    self.vf_target = tf1.make_template('vf_target', vf)
    self.policy = tf1.make_template('policy', policy)

    # outputs from the networks
    self.qf1_out = self.qf1(
        tf.concat([
            self.obs_seq_var, self.actions_seq_var, self.env_params_seq_var
        ],
                  axis=-1))
    qf2_out = self.qf2(
        tf.concat([
            self.obs_seq_var, self.actions_seq_var, self.env_params_seq_var
        ],
                  axis=-1))
    vf_out = self.vf(
        tf.concat([self.obs_seq_var, self.env_params_seq_var], axis=-1))
    vf_target_out = self.vf_target(
        tf.concat([self.next_obs_seq_var, self.env_params_seq_var],
                  axis=-1))

    sampled_seq_actions, actions_seq_logprobs = self.sample_oracle_actions_sequence(
        policy=self.policy,
        obs=self.obs_seq_var,
        env_params=self.env_params_seq_var)

    # Q-values of the freshly sampled actions.
    qf1_pi_out = self.qf1(
        tf.concat([
            self.obs_seq_var, sampled_seq_actions, self.env_params_seq_var
        ],
                  axis=-1))
    qf2_pi_out = self.qf2(
        tf.concat([
            self.obs_seq_var, sampled_seq_actions, self.env_params_seq_var
        ],
                  axis=-1))

    self.get_sampled_action = self.sample_oracle_action(
        policy=self.policy, obs=self.obs_var, env_params=self.env_params_var)

    # session and init weights. The optimizer variables do not exist yet;
    # they are initialized at the end of this method.
    self.sess = tf.Session()
    init_networks_params = tf.global_variables_initializer()
    self.sess.run(init_networks_params)
    self.saver = tf.train.Saver()
    print(
        'number of parameters:',
        np.sum([
            np.prod(v.get_shape().as_list())
            for v in tf.trainable_variables()
        ]))

    # network parameters
    policy_params = tf.trainable_variables(self.policy.name)
    qf1_params = tf.trainable_variables(self.qf1.name)
    qf2_params = tf.trainable_variables(self.qf2.name)
    vf_params = tf.trainable_variables(self.vf.name)
    vf_target_params = tf.trainable_variables(self.vf_target.name)

    print('policy params', nn_utils.count_vars(self.policy.name),
          policy_params)
    print('QF1', nn_utils.count_vars(self.qf1.name), qf1_params)
    print('QF2', nn_utils.count_vars(self.qf2.name), qf2_params)
    print('VF', nn_utils.count_vars(self.vf.name), vf_params)
    print('VF_target', nn_utils.count_vars(self.vf_target.name),
          vf_target_params)

    # losses
    # NOTE(review): `tf.squeeze` without an axis removes every size-1
    # dimension; this presumably relies on reward_dim == 1 -- confirm.
    self.q_target = tf.stop_gradient(
        tf.squeeze(self.rewards_seq_var) +
        (1. - self.terminals_seq_var) * self.gamma * vf_target_out)
    qf1_loss = 0.5 * tf.reduce_mean((self.q_target - self.qf1_out)**2)
    qf2_loss = 0.5 * tf.reduce_mean((self.q_target - qf2_out)**2)

    min_q_pi = tf.minimum(qf1_pi_out, qf2_pi_out)
    v_target = tf.stop_gradient(min_q_pi -
                                self.alpha * actions_seq_logprobs)
    vf_loss = 0.5 * tf.reduce_mean((v_target - vf_out)**2)

    value_loss = qf1_loss + qf2_loss + vf_loss
    policy_loss = tf.reduce_mean(self.alpha * actions_seq_logprobs -
                                 min_q_pi)
    alpha_loss = -tf.reduce_mean(
        log_alpha *
        tf.stop_gradient(actions_seq_logprobs + self.target_entropy))

    entropy = -tf.reduce_mean(actions_seq_logprobs)

    # The train ops are chained: policy step, then critics, then alpha.
    policy_train_op = tf1.train.AdamOptimizer(
        learning_rate=self.policy_lr).minimize(policy_loss,
                                               var_list=policy_params,
                                               name='policy_opt')

    with tf.control_dependencies([policy_train_op]):
        value_params = qf1_params + qf2_params + vf_params
        critics_train_op = tf1.train.AdamOptimizer(self.qf_lr).minimize(
            value_loss, var_list=value_params, name='qf_vf_opt')

    with tf.control_dependencies([critics_train_op]):
        alpha_train_op = tf1.train.AdamOptimizer(
            self.alpha_lr, name='alpha_opt').minimize(loss=alpha_loss,
                                                      var_list=[log_alpha])

        # Soft update of the target value network, sequenced after the
        # critic step. Both weights are taken from `self.tau`.
        target_update = tf.group([
            tf.assign(v_targ,
                      (1. - self.tau) * v_targ + self.tau * v_main)
            for v_main, v_targ in zip(vf_params, vf_target_params)
        ])

    self.actor_critic_train_step_ops = [
        policy_loss, qf1_loss, qf2_loss, vf_loss, alpha_loss, self.qf1_out,
        qf2_out, vf_out, entropy, policy_train_op, critics_train_op,
        alpha_train_op, target_update
    ]

    # init the rest of variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(vf_params, vf_target_params)
    ])

    # Variables created after the first initializer ran (the optimizer
    # variables) are found by probing each variable for a failed read.
    uninitialized_vars = []
    for var in tf.global_variables():
        try:
            self.sess.run(var)
        except tf.errors.FailedPreconditionError:
            uninitialized_vars.append(var)

    # `tf.initialize_variables` is deprecated in favour of
    # `tf.variables_initializer`.
    init_new_vars_op = tf.variables_initializer(uninitialized_vars)
    self.sess.run(init_new_vars_op)
    self.sess.run(target_init)
def _build_model(self, config):
    """Builds the language-model graph: placeholders, RNN and output logits.

    Creates and stores on `self`:
      global_step_var, learning_rate, num_samples, source, source_len,
      target, softmax_temperature, the per-kind dropout placeholders
      (each is None when the configured rate is not > 0), and, when
      `config.conditioning_separator` is set, conditioning and
      conditioning_len. After the RNN is built it also sets initial_state,
      last_state, cell_outputs, logits and (only when `config.output_once`)
      dropped_cell_outputs.

    Args:
      config: Configuration object. Only the attributes read below are
        required (e.g. max_time_steps, vocab_size, hidden_size, embed_once,
        output_once, mos_num_components, the *_dropout rates, ...).
    """
    self.global_step_var = tf.Variable(
        tf.zeros([], tf.int64), name='global_step', trainable=False)
    self.learning_rate = tf.placeholder(
        tf.float32, shape=[], name='learning_rate')

    ## Input variables

    self.num_samples = tf.placeholder_with_default(
        1, shape=[], name='num_samples')

    # For MT, this is the source language text. For LM, this is not used.
    if config.conditioning_separator:
        assert config.episodic, 'conditioning and non-episodic do not mix.'
        self.conditioning = tf.placeholder(
            dtype=tf.int64, shape=[config.max_time_steps, None],
            name='conditioning')
        self.conditioning_len = tf.placeholder(
            dtype=tf.int64, shape=[None], name='conditioning_len')

    # For plain LM, this is the input text. For MT this is the target language
    # text.
    self.source = tf.placeholder(
        dtype=tf.int64, shape=[config.max_time_steps, None], name='source')
    self.source_len = tf.placeholder(
        dtype=tf.int64, shape=[None], name='source_len')

    # This is the ground truth text to be predicted. A shifted by one version
    # of self.source.
    self.target = tf.placeholder(
        dtype=tf.int64, shape=[config.max_time_steps, None], name='target')

    def maybe_create_dropout_placeholder(configured_dropout_rate, name):
        # A placeholder exists only for dropout kinds that are switched on;
        # the rest of the graph tests the attribute against None.
        if configured_dropout_rate > 0.0:
            return tf.placeholder(tf.float32, shape=[], name=name)
        else:
            return None

    self.embedding_dropout = maybe_create_dropout_placeholder(
        config.embedding_dropout, 'embedding_dropout')
    self.token_dropout = maybe_create_dropout_placeholder(
        config.token_dropout, 'token_dropout')
    self.input_dropout = maybe_create_dropout_placeholder(
        config.input_dropout, 'input_dropout')
    self.inter_layer_dropout = maybe_create_dropout_placeholder(
        config.inter_layer_dropout, 'inter_layer_dropout')
    self.update_dropout = maybe_create_dropout_placeholder(
        config.update_dropout, 'update_dropout')
    self.state_dropout = maybe_create_dropout_placeholder(
        config.state_dropout, 'state_dropout')
    self.flip_prob = maybe_create_dropout_placeholder(
        config.state_dropout_flip_rate, 'flip_prob')
    self.output_dropout = maybe_create_dropout_placeholder(
        config.output_dropout, 'output_dropout')
    self.downprojected_output_dropout = maybe_create_dropout_placeholder(
        config.downprojected_output_dropout, 'downprojected_output_dropout')

    self.softmax_temperature = tf.placeholder_with_default(
        1.0, shape=[], name='softmax_temperature')

    ## Training

    embedding_initializer = tf.variance_scaling_initializer(
        scale=config.embedding_init_factor, mode='fan_out',
        distribution='truncated_normal')
    output_initializer = tf.variance_scaling_initializer(
        scale=config.output_init_factor, mode='fan_in',
        distribution='truncated_normal')
    batch_size = tf.shape(self.source)[1]

    # Dynamic shapes used for reshaping below. Naming: t=time, b=batch,
    # h=hidden, v=vocab, o=output embedding, k=mixture components,
    # s=samples, b0=batch/num_samples; an underscore separates dimensions.
    last_hidden_size = utils.ensure_list(config.hidden_size)[-1]
    tb_h = tf.stack([config.max_time_steps * batch_size, last_hidden_size])
    t_b_v = tf.stack([config.max_time_steps, batch_size, config.vocab_size])
    t_bk_o = tf.stack([
        config.max_time_steps,
        batch_size * (config.mos_num_components or 1),
        config.output_embedding_size
    ])
    tbk_o = tf.stack([
        config.max_time_steps *
        batch_size * (config.mos_num_components or 1),
        config.output_embedding_size
    ])
    t_b0_s_v = tf.stack([
        config.max_time_steps,
        tf.div(batch_size, self.num_samples),
        self.num_samples,
        config.vocab_size
    ])

    if config.embed_once:
        with tf.variable_scope('im', initializer=embedding_initializer):
            embedding = tf.get_variable(
                'embedding',
                [config.vocab_size, config.input_embedding_size],
                initializer=embedding_initializer, dtype=tf.float32)
            if self.embedding_dropout is not None:
                # Drops whole rows of the embedding matrix (one mask entry
                # per vocabulary item).
                embedding = tf.nn.dropout(
                    embedding, 1 - self.embedding_dropout,
                    noise_shape=tf.stack([config.vocab_size, 1]))
            embedded_source = tf.nn.embedding_lookup(embedding, self.source)
            if self.token_dropout is not None:
                # Token dropout acts on the looked-up embeddings: the
                # [time, batch, 1] noise shape matches `embedded_source`,
                # not the [vocab, embedding] table. Applying it to
                # `embedding` left the inputs undropped and corrupted the
                # table that may be shared with the softmax.
                embedded_source = tf.nn.dropout(
                    embedded_source, 1 - self.token_dropout,
                    noise_shape=tf.stack(
                        [config.max_time_steps, batch_size, 1]))
            if config.scale_input_embeddings:
                embedded_source *= tf.sqrt(
                    tf.cast(config.input_embedding_size, tf.float32))
            sources = embedded_source
    else:
        assert self.embedding_dropout is None, 'Not implemented.'
        assert self.token_dropout is None, 'Not implemented.'
        sources = self.source

    def lm_1(cell, initial_state, inputs, input_lens, scope=None):
        # According to tests (2019-03-13) swap_memory carries only a very
        # small penalty so we use it to choose between dynamic_rnn and
        # static_rnn. For some reason, static_rnn can be 2x faster ...
        # sometimes. On the other hand, dynamic_rnn handles memory better
        # even without swap_memory=True.
        if FLAGS.swap_memory:
            return tf.nn.dynamic_rnn(
                cell=cell, inputs=inputs, time_major=True,
                sequence_length=input_lens, initial_state=initial_state,
                swap_memory=FLAGS.swap_memory, dtype=tf.float32,
                scope=scope)
        else:
            return tf.nn.static_rnn(
                cell=cell, inputs=tf.unstack(inputs),
                sequence_length=input_lens, initial_state=initial_state,
                dtype=tf.float32, scope=scope)

    # This is for the config.output_once=True case.
    def output_module_1(outputs):
        with tf.variable_scope('om', initializer=output_initializer):
            # Create the matrix and bias for the final projection into the
            # softmax.
            if config.share_input_and_output_embeddings:
                assert config.embed_once, 'Not implemented.'
                softmax_weights = embedding
                softmax_weights_transpose = True
            else:
                softmax_weights = tf.get_variable(
                    'weights',
                    [config.output_embedding_size, config.vocab_size],
                    dtype=tf.float32)
                softmax_weights_transpose = False
            softmax_bias = tf.get_variable(
                'bias', [1, config.vocab_size],
                initializer=tf.zeros_initializer(), dtype=tf.float32)

            def to_softmax(x, dropout=self.downprojected_output_dropout):
                if dropout is not None:
                    if not config.shared_mask_dropout:
                        x = tf.nn.dropout(x, 1.0 - dropout)
                    else:
                        x = tf.reshape(x, t_bk_o)
                        x = tf.nn.dropout(
                            x, 1.0 - dropout,
                            # same mask for all time steps
                            noise_shape=[
                                1,
                                batch_size * (config.mos_num_components or 1),
                                config.output_embedding_size
                            ])
                        x = tf.reshape(x, tbk_o)
                return (self.softmax_temperature *
                        (tf.matmul(x, softmax_weights,
                                   transpose_b=softmax_weights_transpose) +
                         softmax_bias))

            last_hidden_size = utils.ensure_list(config.hidden_size)[-1]
            outputs_t_b_h = tf.convert_to_tensor(outputs)
            if self.output_dropout is not None:
                if not config.shared_mask_dropout:
                    outputs_t_b_h = tf.nn.dropout(
                        outputs_t_b_h, 1.0 - self.output_dropout)
                else:
                    outputs_t_b_h = tf.nn.dropout(
                        outputs_t_b_h, 1.0 - self.output_dropout,
                        noise_shape=[1, batch_size, last_hidden_size])
            outputs_tb_h = tf.reshape(outputs_t_b_h, tb_h)

            if config.mos_num_components == 0:
                if config.output_embedding_size == last_hidden_size:
                    return (tf.reshape(to_softmax(outputs_tb_h, None), t_b_v),
                            outputs_t_b_h)
                else:
                    downprojected_outputs_tb_o = utils.linear(
                        outputs_tb_h, config.output_embedding_size, False,
                        initializer=utils.orthogonal_initializer(),
                        scope='projection')
                    logits_tb_v = to_softmax(downprojected_outputs_tb_o)
                    return tf.reshape(logits_tb_v, t_b_v), outputs_t_b_h
            else:
                logits_tb_v = utils.mixture_of_softmaxes(
                    outputs_tb_h, config.mos_num_components,
                    config.output_embedding_size, to_softmax)
                return tf.reshape(logits_tb_v, t_b_v), outputs_t_b_h

    # This is for the config.output_once=False case.
    def output_module_per_step_1(outputs_b_h):
        with tf.variable_scope('om', initializer=output_initializer):

            def to_softmax(x, dropout=self.downprojected_output_dropout):
                # Create the matrix and bias for the final projection into
                # the softmax.
                if config.share_input_and_output_embeddings:
                    assert config.embed_once, 'Not implemented.'
                    softmax_weights = embedding
                    softmax_weights_transpose = True
                else:
                    softmax_weights = tf.get_variable(
                        'weights',
                        [config.output_embedding_size, config.vocab_size],
                        dtype=tf.float32)
                    softmax_weights_transpose = False
                softmax_bias = tf.get_variable(
                    'bias', [1, config.vocab_size],
                    initializer=tf.zeros_initializer(), dtype=tf.float32)
                if dropout is not None:
                    x = Dropout(1.0 - dropout,
                                share_mask=config.shared_mask_dropout)(x)
                return (self.softmax_temperature *
                        (tf.matmul(x, softmax_weights,
                                   transpose_b=softmax_weights_transpose) +
                         softmax_bias))

            last_hidden_size = utils.ensure_list(config.hidden_size)[-1]
            # The placeholder is None when output dropout is disabled, so
            # `1.0 - None` must be avoided. `share_mask` follows the config
            # flag, as in `to_softmax` above, rather than receiving the
            # dropout-rate tensor.
            if self.output_dropout is not None:
                outputs_b_h = Dropout(
                    1.0 - self.output_dropout,
                    share_mask=config.shared_mask_dropout)(outputs_b_h)

            if config.mos_num_components == 0:
                if config.output_embedding_size == last_hidden_size:
                    return to_softmax(outputs_b_h, None)
                else:
                    downprojected_outputs_b_o = utils.linear(
                        outputs_b_h, config.output_embedding_size, False,
                        initializer=utils.orthogonal_initializer(),
                        scope='projection')
                    logits_b_v = to_softmax(downprojected_outputs_b_o)
                    return logits_b_v
            else:
                logits_b_v = utils.mixture_of_softmaxes(
                    outputs_b_h, config.mos_num_components,
                    config.output_embedding_size, to_softmax)
                return logits_b_v

    lm = tf.make_template('lm', lm_1)

    def make_cell():
        return build_cell(
            model=config.model,
            num_layers=config.num_layers,
            hidden_size=config.hidden_size,
            layer_norm=config.layer_norm,
            cell_init_factor=config.cell_init_factor,
            shared_mask_dropout=config.shared_mask_dropout,
            input_dropout=self.input_dropout,
            inter_layer_dropout=self.inter_layer_dropout,
            state_dropout=self.state_dropout,
            update_dropout=self.update_dropout,
            state_dropout_flip_rate=self.flip_prob,
            tie_forget_and_input_gates=config.tie_forget_and_input_gates,
            cap_input_gate=config.cap_input_gate,
            forget_bias=config.forget_bias,
            feature_mask_rounds=config.feature_mask_rounds,
            feature_mask_rank=config.feature_mask_rank,
            overlay_rank=config.overlay_rank,
            sparsity_ratio=config.sparsity_ratio,
            cell_clip=config.cell_clip,
            activation_fn=config.activation_fn,
            lstm_skip_connection=config.lstm_skip_connection,
            residual_connections=config.residual_connections)

    def make_conditioning():
        # Runs a separate RNN over the conditioning text and returns its
        # final state, which is then used as the main LM's initial state.
        if config.embed_once:
            with tf.variable_scope('cond_im',
                                   initializer=embedding_initializer):
                embedding = tf.get_variable(
                    'embedding',
                    [config.conditioning_vocab_size,
                     config.input_embedding_size],
                    initializer=embedding_initializer, dtype=tf.float32)
                if self.embedding_dropout is not None:
                    embedding = tf.nn.dropout(
                        embedding, 1 - self.embedding_dropout,
                        noise_shape=tf.stack(
                            [config.conditioning_vocab_size, 1]))
                embedded_source = tf.nn.embedding_lookup(
                    embedding, self.conditioning)
                if self.token_dropout is not None:
                    # As for the main input: drop looked-up tokens, not rows
                    # of the embedding table.
                    embedded_source = tf.nn.dropout(
                        embedded_source, 1 - self.token_dropout,
                        noise_shape=tf.stack(
                            [config.max_time_steps, batch_size, 1]))
                if config.scale_input_embeddings:
                    embedded_source *= tf.sqrt(
                        tf.cast(config.input_embedding_size, tf.float32))
                conditioning_sources = embedded_source
        else:
            assert False, 'Not implemented.'

        conditioning_cell = make_cell()
        conditioning_lm = tf.make_template('cond_lm', lm_1)
        initial_state = conditioning_cell.zero_state(
            batch_size, dtype=tf.float32)
        _, conditioning_last_state = conditioning_lm(
            conditioning_cell, initial_state,
            conditioning_sources, self.conditioning_len)
        return conditioning_last_state

    cell = make_cell()
    if not config.embed_once:
        cell = tf.nn.rnn_cell.EmbeddingWrapper(
            cell, config.vocab_size, config.input_embedding_size,
            initializer=embedding_initializer)
    if config.conditioning_separator:
        self.initial_state = make_conditioning()
    elif config.trainable_initial_state:
        with tf.variable_scope('lm_init'):
            self.initial_state = utils.trainable_initial_state(
                batch_size, cell.state_size)
    else:
        self.initial_state = cell.zero_state(batch_size, dtype=tf.float32)
    outputs, self.last_state = lm(
        cell, self.initial_state, sources, self.source_len)
    self.cell_outputs = tf.convert_to_tensor(outputs)

    if config.output_once:
        output_module = tf.make_template('om', output_module_1)
        logits_, self.dropped_cell_outputs = output_module(outputs)
    else:
        assert config.activation_norm_penalty == 0.0, (
            'activation_norm_penalty not implemented for output_once=False.')
        output_module_per_step = tf.make_template(
            'om', output_module_per_step_1)
        # KLUDGE: calling output_module_per_step here gets rid of the
        # 'rnn/FNCell/' prefix on the variables names so output_once=False
        # and output_once=True checkpoints are compatible.
        output_module_per_step(outputs[0])
        output_cell = utils.FNCell(output_module_per_step, config.vocab_size)
        logits_, _ = tf.nn.dynamic_rnn(
            cell=output_cell,
            inputs=tf.convert_to_tensor(outputs),
            time_major=True,
            sequence_length=self.source_len,
            swap_memory=FLAGS.swap_memory,
            dtype=tf.float32)

    def average_samples():
        # logits has shape t_b_v, where b=b0*num_samples. Separate out
        # the samples in a new dimension.
        logits = tf.reshape(logits_, t_b0_s_v)
        if config.model_average == 'geometric':
            x = tf.reduce_sum(logits, axis=2, keepdims=True)
        elif config.model_average == 'arithmetic':
            log_probs = tf.nn.log_softmax(logits)
            x = tf.reduce_logsumexp(log_probs, axis=2, keepdims=True)
        else:
            assert False, 'Not implemented.'
        # x is t_b0_1_v, tile it to t_b0_s_v.
        x = tf.ones_like(logits) * x
        return tf.reshape(x, t_b_v)

    # With a single sample there is nothing to average.
    self.logits = tf.cond(tf.equal(self.num_samples, 1),
                          lambda: logits_,
                          average_samples)
def _create_network(self, name):
    """Wraps `self._network_template` in a variable-sharing template.

    Args:
      name: Scope name for the template. It was previously ignored and
        'Online' was always used, so any other requested network ended up
        in a uniquified 'Online' scope instead of its own. A falsy `name`
        still falls back to 'Online'.

    Returns:
      The callable produced by `tf.make_template`.
    """
    return tf.make_template(name or 'Online', self._network_template)
def real_nvp_default_template(
        hidden_layers,
        shift_only=False,
        activation=tf.nn.relu,
        name=None,
        *args,  # pylint: disable=keyword-arg-before-vararg
        **kwargs):
    """Returns a templated MLP producing Real NVP shift / log-scale terms.

    The returned callable is created with `make_template`, so the dense
    layers' variables are created once and reused on later calls. It takes
    `(x, output_units, **condition_kwargs)`; conditioning is not supported
    by this default template.

    Arguments:
      hidden_layers: Iterable of unit counts, one dense layer per entry.
      shift_only: Python `bool`. When true only the shift is computed and
        the log-scale result is `None` (i.e. NICE bijector).
      activation: Activation passed to every hidden dense layer; the final
        layer always uses `activation=None`.
      name: Name scope for the ops created here.
        Default: 'real_nvp_default_template'.
      *args: Extra positional arguments forwarded to `tf1.layers.dense`.
      **kwargs: Extra keyword arguments forwarded to `tf1.layers.dense`.

    Returns:
      A template which, when called, returns `(shift, log_scale)`;
      `log_scale` is `None` if `shift_only` is set. A rank-1 input is
      treated as a single example and the results are returned without the
      temporary leading dimension.

    Raises:
      NotImplementedError: (from the returned callable) if any
        `condition_kwargs` are supplied.

    #### References

    [1]: George Papamakarios, Theo Pavlakou, and Iain Murray. Masked
         Autoregressive Flow for Density Estimation. In _Neural Information
         Processing Systems_, 2017. https://arxiv.org/abs/1705.07057
    """
    scope_name = name or 'real_nvp_default_template'
    with tf.name_scope(scope_name):

        def _fn(x, output_units, **condition_kwargs):
            """Fully connected MLP parameterized via `real_nvp_template`."""
            if condition_kwargs:
                raise NotImplementedError(
                    'Conditioning not implemented in the default template.')

            # Dense layers need a batch dimension: add one for vector inputs
            # and remember to strip it from the results.
            was_vector = tensorshape_util.rank(x.shape) == 1
            if was_vector:
                x = x[tf.newaxis, ...]

            def strip_batch(tensor):
                return tensor[0] if was_vector else tensor

            hidden = x
            for width in hidden_layers:
                hidden = tf1.layers.dense(
                    inputs=hidden,
                    units=width,
                    activation=activation,
                    *args,  # pylint: disable=keyword-arg-before-vararg
                    **kwargs)

            # One output block for shift only, two for shift and log-scale.
            num_blocks = 1 if shift_only else 2
            params = tf1.layers.dense(
                inputs=hidden,
                units=num_blocks * output_units,
                activation=None,
                *args,  # pylint: disable=keyword-arg-before-vararg
                **kwargs)

            if shift_only:
                return strip_batch(params), None

            shift, log_scale = tf.split(params, 2, axis=-1)
            return strip_batch(shift), strip_batch(log_scale)

        return tf1.make_template('real_nvp_default_template', _fn)
def fit(net, img_shape, img_name, image_mode, type_measurements,
        num_measurements, y_feed, A_feed, mask_info1, ini_channel = 32,
        mask_feed = None, lr_decay_epoch=0, lr_decay_rate=0.65, LR=0.01,
        OPTIMIZER='adam', num_iter=5000, find_best=False, verbose=False,
        random_vector = None, selection_mask = None, save = False,
        random_array = None):
    """Fits a deep-image-prior style network to measurements `y_feed`.

    A fresh graph is built in which `net` is applied to a fixed (non
    trainable) uniform-noise input; the network weights are optimised so
    that the measured network output matches `y_feed` in mean squared error.

    Args:
      net: Callable building the network; wrapped in `tf.make_template` and
        called on the noise tensor of shape (1, width, height, ini_channel).
      img_shape: Indexable; entries 1, 2 and 3 are used as width, height
        and the last image dimension.
      img_name: Unused here.
      image_mode: When equal to '3D', no measurement matrix is applied in
        the 'random'/'identity' branches.
      type_measurements: One of 'random', 'identity' or 'circulant'. Only
        consulted when `mask_feed` is None.
      num_measurements: Number of measurements for the 'random' case.
      y_feed: Value fed for the measurement placeholder `y`.
      A_feed: Value fed for the measurement matrix placeholder `A` whenever
        the forward model multiplies by `A`.
      mask_info1: Unused here.
      ini_channel: Number of channels of the input noise.
      mask_feed: If given, selects inpainting: the network output is
        multiplied element-wise by this mask.
      lr_decay_epoch: If > 0, `LR` decays exponentially (staircase) with
        this many steps per decay.
      lr_decay_rate: Decay rate used with `lr_decay_epoch`.
      LR: Initial learning rate.
      OPTIMIZER: 'adam' is the only implemented optimizer.
      num_iter: Number of optimisation steps.
      find_best: If True, return the network output from the last step at
        which the loss improved on the best so far by more than 0.5%
        (`best > 1.005 * loss`) instead of the final output. Also creates
        the 'log/' and 'result/' directories.
      verbose: If True, additionally return the parameter count.
      random_vector: Vector defining the circulant operator ('circulant').
      selection_mask: Measurement selection mask ('circulant').
      save: Unused here.
      random_array: Element-wise multiplier applied to the flattened output
        before the circulant operator ('circulant').

    Returns:
      `(mse, out_img)`, or `(mse, out_img, num_params)` if `verbose`, where
      `mse` is the list of per-iteration loss values.

    Raises:
      ValueError: On an unknown `type_measurements` or `OPTIMIZER`.
      NotImplementedError: If `OPTIMIZER == 'LBFGS'`.
    """
    with tf.Graph().as_default():
        # Global step
        global_step = tf.train.get_or_create_global_step()

        # Set up placeholders
        n_input = img_shape[1] * img_shape[2] * img_shape[3]
        width = int(img_shape[1])
        height = int(img_shape[2])

        A = None
        if mask_feed is None:
            if type_measurements == 'random':
                # Compressed sensing with random matrix
                A = tf.placeholder(
                    tf.float32, shape=(n_input, num_measurements), name='A')
                y = tf.placeholder(
                    tf.float32, shape=(1, num_measurements), name='y')
            elif type_measurements == 'identity':
                # Denoising
                if image_mode != '3D':
                    A = tf.placeholder(
                        tf.float32, shape=(n_input, n_input), name='A')
                y = tf.placeholder(tf.float32, shape=(1, n_input), name='y')
            elif type_measurements == 'circulant':
                # Compressed sensing with circulant matrix
                y = tf.placeholder(tf.float32, shape=(1, n_input), name='y')
            else:
                raise ValueError(
                    'Unknown type_measurements: %r' % (type_measurements,))
        else:
            # Inpainting
            y = tf.placeholder(
                tf.float32,
                shape=(1, img_shape[1], img_shape[2], img_shape[3]),
                name='y')

        # Define input uniform noise; it is stored in a non-trainable
        # variable so it stays fixed during optimisation.
        out = tf.constant(
            np.random.uniform(
                size=(1, width, height, ini_channel)).astype(np.float32)
            * 1. / 10)
        out = tf.Variable(out, name='input_noise', trainable=False)

        # Deep image prior
        feed_forward = tf.make_template("DeepImagePrior", net)
        # NOTE(review): presumably x is [1, img_wid, img_high, channels];
        # this depends on `net` - confirm.
        x = feed_forward(out)

        # Inverse problem settings
        def circulant_tf(signal_vector, random_vector_m, selection_mask_m):
            signal_vector = tf.cast(signal_vector, dtype=tf.complex64,
                                    name='circulant_real2complex')
            t = tf.convert_to_tensor(random_vector_m, dtype=tf.complex64)
            # step 1: F^{-1} @ x
            r1 = tf.signal.ifft(signal_vector, name='circulant_step1_ifft')
            # step 2: Diag() @ F^{-1} @ x
            Ft = tf.signal.fft(t)
            r2 = tf.multiply(r1, Ft, name='circulant_step2_diag')
            # step 3: F @ Diag() @ F^{-1} @ x
            compressive = tf.signal.fft(r2, name='circulant_step3_fft')
            float_compressive = tf.cast(compressive, tf.float32,
                                        name='circulant_complex2real')
            # step 4: R_{omega} @ C_{t}
            select_compressive = tf.multiply(
                float_compressive, selection_mask_m, name='circulant_step4_A')
            return select_compressive

        # Tracks whether the forward model reads placeholder `A`, so that
        # `A_feed` is supplied exactly when it is needed.
        uses_A = False
        if mask_feed is None:
            # Compressed sensing & denoising
            if type_measurements == 'circulant':
                flip = tf.convert_to_tensor(random_array, dtype=tf.float32)
                x_circulant = tf.reshape(x, [1, -1]) * flip
                y_hat = circulant_tf(x_circulant, random_vector,
                                     selection_mask)
            else:
                # Compressed sensing with random matrix & denoising
                if image_mode != '3D':
                    y_hat = tf.matmul(tf.reshape(x, [1, -1]), A)
                    uses_A = True
                else:
                    y_hat = tf.reshape(x, [1, -1])
        else:
            # Inpainting
            y_hat = x * mask_feed

        # Define loss
        loss = tf.losses.mean_squared_error(y, y_hat)

        # Define learning rate
        if lr_decay_epoch > 0:
            LR = tf.train.exponential_decay(
                LR, global_step, lr_decay_epoch, lr_decay_rate,
                staircase=True)

        # Define optimizer
        if OPTIMIZER == 'adam':
            optimizer = tf.train.AdamOptimizer(LR)
        elif OPTIMIZER == 'LBFGS':
            raise NotImplementedError('LBFGS Optimizer')
        else:
            raise ValueError('Unknown OPTIMIZER: %r' % (OPTIMIZER,))
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)

        # Set up gpu
        # NOTE(review): this config is built but not passed to tf.Session
        # below, so it currently has no effect. Left as is because wiring it
        # in would change GPU memory and logging behaviour - confirm intent.
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.85
        config.log_device_placement = True

        # Feed dict: `y` is always fed; `A` only when the forward model
        # multiplies by it. This also covers the 'random' case, for which no
        # feed dict used to be built.
        feed_dict = {y: y_feed}
        if uses_A:
            feed_dict[A] = A_feed

        with tf.Session() as sess:
            # Init
            mse_history = [0.] * num_iter
            sess.run(tf.global_variables_initializer())

            # Initial deep decoder output
            if find_best:
                if not os.path.exists('log'):
                    os.makedirs('log/')
                if not os.path.exists('result'):
                    os.makedirs('result/')
                # Checkpoint writing is currently disabled; the saver is
                # still created before the graph is finalized.
                saver = tf.train.Saver(max_to_keep=1)
                best_mse = 1000000.0
                best_img = sess.run(x)

            # Optimize
            num_params = get_num_params()
            sess.graph.finalize()

            for i in range(num_iter):
                loss_, _ = sess.run([loss, train_op], feed_dict=feed_dict)
                mse_history[i] = loss_

                # Best net: only accept improvements larger than 0.5%.
                if find_best and best_mse > 1.005 * loss_:
                    best_mse = loss_
                    best_img = sess.run(x)

            # Return final image or best found so far if `find_best`
            if find_best:
                out_img = best_img
            else:
                out_img = sess.run(x)

            if verbose:
                return mse_history, out_img, num_params
            else:
                return mse_history, out_img