def sample(self, time, outputs, state, name=None):
    with ops.name_scope(name, "ScheduledOutputTrainingHelperSample",
                        [time, outputs, state]):
        sampler = Bernoulli(probs=self._sampling_probability)
        return math_ops.cast(
            sampler.sample(sample_shape=self.batch_size, seed=self._seed),
            dtypes.bool)
def __init__(self, n_in, n_out, model_prob=0.9, model_lam=1e-2,
             activation=None, name="hidden"):
    self.model_prob = model_prob  # probability to keep units
    self.model_lam = model_lam    # l^2 / (2 * tau)
    self.model_bern = Bernoulli(probs=self.model_prob, dtype=tf.float32)
    self.dropout_mask = self.model_bern.sample((n_in,))
    if activation is None:
        self.activation = tf.identity
    else:
        self.activation = activation
    kernel_initializer = tf.initializers.truncated_normal(mean=0.0,
                                                          stddev=0.01)
    # variational parameters
    self.model_M = tf.get_variable(
        "{}_M".format(name),
        initializer=kernel_initializer([n_in, n_out]))
    self.model_m = tf.get_variable("{}_b".format(name),
                                   initializer=tf.zeros([n_out]))
    self.model_W = tf.matmul(tf.diag(self.dropout_mask), self.model_M)
def tensor_rnn_with_feed_prev(cell, inputs, is_training, config,
                              initial_states=None):
    """High Order Recurrent Neural Network Layer"""
    # tuple of 2-d tensors (batch_size, s)
    outputs = []
    prev = None
    is_sample = is_training and initial_states is not None
    with tf.variable_scope("trnn") as varscope:
        if varscope.caching_device is None:
            varscope.set_caching_device(lambda op: op.device)
        inputs_shape = inputs.get_shape().with_rank_at_least(3)
        batch_size = tf.shape(inputs)[0]
        num_steps = inputs_shape[1]
        input_size = int(inputs_shape[2])
        output_size = cell.output_size
        inp_steps = config.inp_steps
        # Scheduled sampling
        dist = Bernoulli(probs=config.sample_prob)
        samples = dist.sample(sample_shape=num_steps)
        if initial_states is None:
            initial_states = []
            for lag in range(config.num_lags):
                initial_state = cell.zero_state(batch_size, dtype=tf.float32)
                initial_states.append(initial_state)
        states_list = initial_states  # list of high order states
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            inp = inputs[:, time_step, :]
            if is_sample and time_step > 0:
                # scheduled sampling: keep the ground-truth input or feed
                # back the previous output, per the Bernoulli draw
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    inp = tf.cond(
                        tf.cast(samples[time_step], tf.bool),
                        lambda: tf.identity(inp),
                        lambda: fully_connected(cell_output, input_size,
                                                activation_fn=tf.sigmoid))
            if not is_training and prev is not None and time_step >= inp_steps:
                # at inference, feed the previous output back into the input
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    inp = fully_connected(cell_output, input_size,
                                          activation_fn=tf.sigmoid)
            states = _list_to_states(states_list)
            # input tensor is [batch_size, num_steps, input_size]
            (cell_output, state) = cell(inp, states)
            states_list = _shift(states_list, state)
            prev = cell_output
            with tf.variable_scope(tf.get_variable_scope(), reuse=False):
                output = fully_connected(cell_output, input_size,
                                         activation_fn=tf.sigmoid)
            outputs.append(output)
        outputs = tf.stack(outputs, 1)
    return outputs, states_list
def sample_v_given_h(self, h0_sample):
    """This function infers the state of visible units given hidden units."""
    pre_sigmoid_v1, v1_mean = self.propdown(h0_sample)
    dist = Bernoulli(probs=v1_mean, dtype=tf.float32)
    v1_sample = dist.sample()
    return [pre_sigmoid_v1, v1_mean, v1_sample]
def tensor_rnn_with_feed_prev(cell, inputs, is_training, config,
                              initial_states=None):
    outputs = []
    cell_output = None
    is_sample = is_training and initial_states is not None
    with tf.variable_scope("trnn") as varscope:
        if varscope.caching_device is None:
            varscope.set_caching_device(lambda op: op.device)
        inputs_shape = inputs.get_shape().with_rank_at_least(3)
        batch_size = tf.shape(inputs)[0]
        num_steps = inputs_shape[1]
        input_size = int(inputs_shape[2])
        output_size = cell.output_size
        inp_steps = config.inp_steps
        acv_func = tf.sigmoid
        dist = Bernoulli(probs=config.sample_prob)
        samples = dist.sample(sample_shape=num_steps)
        if initial_states is None:
            initial_states = []
            for lag in range(config.num_lags):
                initial_state = cell.zero_state(batch_size, dtype=tf.float32)
                initial_states.append(initial_state)
        states_list = initial_states  # list of high order states
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            inp = inputs[:, time_step, :]
            if is_sample and time_step > 0:
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    inp = tf.cond(tf.cast(samples[time_step], tf.bool),
                                  lambda: tf.identity(inp),
                                  lambda: output)
            if (not is_training and cell_output is not None
                    and time_step >= inp_steps):
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    inp = output
            states = _list_to_states(states_list)
            (cell_output, state) = cell(inp, states)
            states_list = _shift(states_list, state)
            with tf.variable_scope(tf.get_variable_scope(), reuse=False):
                output = fully_connected(cell_output, input_size,
                                         activation_fn=acv_func)
            outputs.append(output)
        outputs = tf.stack(outputs, 1)
    return outputs, states_list
def sample(self, n=None):
    if self._bernoulli is None:
        self._bernoulli = Bernoulli(self._steps_probs)
    sample = self._bernoulli.sample(n)
    sample = tf.cumprod(sample, tf.rank(sample) - 1)
    sample = tf.reduce_sum(sample, -1)
    return sample
def sample_h_given_v(self, v0_sample):
    """This function infers the state of hidden units given visible units."""
    # compute the activation of the hidden units given visible samples
    pre_sigmoid_h1, h1_mean = self.propup(v0_sample)
    dist = Bernoulli(probs=h1_mean, dtype=tf.float32)
    h1_sample = dist.sample()
    return [pre_sigmoid_h1, h1_mean, h1_sample]
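# A minimal sketch (not taken from the snippets above) of how the two
# conditional samplers are typically chained into one Gibbs step for an RBM.
# The helper name `gibbs_vhv` is hypothetical; `sample_h_given_v` and
# `sample_v_given_h` are the methods defined above.
def gibbs_vhv(self, v0_sample):
    # v -> h: sample the hidden units from their conditional Bernoulli
    pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
    # h -> v: reconstruct the visible units from the sampled hidden state
    pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h1_sample)
    return [pre_sigmoid_h1, h1_mean, h1_sample,
            pre_sigmoid_v1, v1_mean, v1_sample]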
def rnn_with_feed_prev(cell, inputs, is_training, config, initial_state=None):
    prev = None
    outputs = []
    sample_prob = config.sample_prob  # scheduled sampling probability
    # whether to use scheduled sampling
    is_sample = is_training and initial_state is not None
    with tf.variable_scope("rnn") as varscope:
        if varscope.caching_device is None:
            varscope.set_caching_device(lambda op: op.device)
        inputs_shape = inputs.get_shape().with_rank_at_least(3)
        batch_size = tf.shape(inputs)[0]
        num_steps = inputs_shape[1]
        input_size = int(inputs_shape[2])
        inp_steps = config.inp_steps
        output_size = cell.output_size
        # phased lstm input
        inp_t = tf.expand_dims(tf.range(1, batch_size + 1), 1)
        dist = Bernoulli(probs=config.sample_prob)
        samples = dist.sample(sample_shape=num_steps)
        if initial_state is None:
            initial_state = cell.zero_state(batch_size, dtype=tf.float32)
        state = initial_state
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            inp = inputs[:, time_step, :]
            if is_sample and time_step > 0:
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    inp = tf.cond(
                        tf.cast(samples[time_step], tf.bool),
                        lambda: tf.identity(inp),
                        lambda: fully_connected(cell_output, input_size,
                                                activation_fn=tf.sigmoid))
            if not is_training and prev is not None and time_step >= inp_steps:
                # feeding back output into input
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    inp = fully_connected(prev, input_size,
                                          activation_fn=tf.sigmoid)
            if isinstance(cell._cells[0], tf.contrib.rnn.PhasedLSTMCell):
                (cell_output, state) = cell((inp_t, inp), state)
            else:
                (cell_output, state) = cell(inp, state)
            prev = cell_output
            with tf.variable_scope(tf.get_variable_scope(), reuse=False):
                output = fully_connected(cell_output, input_size,
                                         activation_fn=tf.sigmoid)
            outputs.append(output)
        outputs = tf.stack(outputs, 1)
    return outputs, state
def _make_particles_update(self, n_steps=None, sample=True, G_fed=False):
    """Update negative particles by running Gibbs sampler
    for specified number of steps.
    """
    if n_steps is None:
        n_steps = self._n_gibbs_steps
    with tf.name_scope('gibbs_chain'):
        # initialize hidden and visible particles from Ber(0.5)
        logits = tf.zeros([self._n_runs, self._n_hidden])
        T = Bernoulli(logits=logits).sample(seed=self.make_random_seed())
        self._H = tf.cast(T, dtype=self._tf_dtype)
        self._H_new = tf.cast(T, dtype=self._tf_dtype)
        logits = tf.zeros([self._n_runs, self._n_visible])
        T = Bernoulli(logits=logits).sample(seed=self.make_random_seed())
        self._v = tf.cast(T, dtype=self._tf_dtype)
        self._v_new = tf.cast(T, dtype=self._tf_dtype)

        def cond(step, max_step, v, H, v_new, H_new):
            return step < max_step

        def body(step, max_step, v, H, v_new, H_new):
            v_new, _, H_new, _ = self._make_gibbs_step(H)
            return step + 1, max_step, v_new, H_new, v, H  # swap particles

        _, _, v, H, v_new, H_new = \
            tf.while_loop(cond=cond, body=body,
                          loop_vars=[tf.constant(0),
                                     n_steps,
                                     self._v, self._H,
                                     self._v_new, self._H_new],
                          parallel_iterations=10,
                          back_prop=False)

        v_update = v
        v_new_update = v_new
        H_updates = H
        H_new_updates = H_new
    return v_update, H_updates, v_new_update, H_new_updates
def ret(y_true, y_pred):
    bernoulli = Bernoulli(probs=retain)
    b = tf.cast(bernoulli.sample(sample_shape=tf.shape(y_true)),
                dtype=tf.float32)
    output = y_pred
    mask = tf.maximum(b, y_true)
    output = output * mask
    output = tf.nn.softmax(output)
    loss = keras.losses.categorical_crossentropy(y_true, output)
    return loss
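# A minimal usage sketch: `ret` closes over a free variable `retain`, so it is
# presumably built by a factory and handed to Keras as a custom loss. The
# factory name `make_masked_loss` and the compile call are illustrative
# assumptions, not code from the snippet.
import keras
import tensorflow as tf
from tensorflow.contrib.distributions import Bernoulli

def make_masked_loss(retain):
    def ret(y_true, y_pred):
        bernoulli = Bernoulli(probs=retain)
        b = tf.cast(bernoulli.sample(sample_shape=tf.shape(y_true)),
                    dtype=tf.float32)
        mask = tf.maximum(b, y_true)  # the true class is never masked out
        output = tf.nn.softmax(y_pred * mask)
        return keras.losses.categorical_crossentropy(y_true, output)
    return ret

# model.compile(optimizer='adam', loss=make_masked_loss(retain=0.9))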
def __init__(self, n_in, n_out, model_prob, model_lam):
    self.model_prob = model_prob
    self.model_lam = model_lam
    self.model_bern = Bernoulli(probs=self.model_prob, dtype=tf.float32)
    self.model_M = tf.Variable(
        tf.truncated_normal([n_in, n_out], stddev=0.01))
    self.model_m = tf.Variable(tf.zeros([n_out]))
    self.model_W = tf.matmul(tf.diag(self.model_bern.sample((n_in,))),
                             self.model_M)
def __init__(self, input_data, output_data, model_prob, model_lam):
    self.model_prob = model_prob
    self.model_lam = model_lam
    self.model_bern = Bernoulli(probs=self.model_prob, dtype=tf.float32)
    self.model_M = tf.Variable(
        tf.truncated_normal((input_data, output_data), stddev=0.01))
    self.model_m = tf.Variable(tf.zeros([output_data]))
    self.model_W = tf.matmul(
        tf.diag(self.model_bern.sample((input_data,))), self.model_M)
def __init__(self, n_in, n_out, model_prob=0.9, model_lam=1e-2, name="hidden"):
    self.model_prob = model_prob  # probability to keep units
    self.model_lam = model_lam    # l^2 / (2 * tau)
    self.model_bern = Bernoulli(probs=self.model_prob, dtype=tf.float32)
    self.model_M = tf.get_variable(
        "{}_M".format(name),
        initializer=tf.truncated_normal([n_in, n_out], stddev=0.01))
    self.model_m = tf.get_variable("{}_b".format(name),
                                   initializer=tf.zeros([n_out]))
    self.model_W = tf.matmul(tf.diag(self.model_bern.sample((n_in,))),
                             self.model_M)
def ret(y_true, y_pred):
    bernoulli = Bernoulli(probs=retain)
    b = tf.cast(bernoulli.sample(sample_shape=tf.shape(y_true)),
                dtype=tf.float32)
    output = y_pred
    mask = tf.maximum(b, y_true)
    # masked softmax, computed manually for numerical stability
    exp_output = tf.exp(output - tf.reduce_max(output, axis=1, keep_dims=True))
    exp_output = exp_output * mask + 1e-4
    sum_output = tf.reduce_sum(exp_output, axis=1, keep_dims=True)
    output = exp_output / sum_output
    loss = keras.losses.categorical_crossentropy(y_true, output)
    return loss
def first():
    first_token = self.inputs[:, 0]  # (batch_size, 1)
    select_sampler = Bernoulli(probs=1.0, dtype=tf.bool)
    select_sample = select_sampler.sample(sample_shape=self.batch_size)
    token_rhyme = tf.cast(tf.gather(self.table, first_token), tf.float32)
    return tf.where(
        select_sample,
        tf.log(tf.multiply(token_rhyme, tf.nn.softmax(o_t))),
        tf.log(tf.nn.softmax(o_t)))
class VariationalDense:
    """Variational Dense Layer Class"""

    def __init__(self, n_in, n_out, model_prob=0.9, model_lam=1e-2,
                 name="hidden"):
        self.model_prob = model_prob  # probability to keep units
        self.model_lam = model_lam    # l^2 / (2 * tau)
        self.model_bern = Bernoulli(probs=self.model_prob, dtype=tf.float32)
        self.model_M = tf.get_variable(
            "{}_M".format(name),
            initializer=tf.truncated_normal([n_in, n_out], stddev=0.01))
        self.model_m = tf.get_variable("{}_b".format(name),
                                       initializer=tf.zeros([n_out]))
        self.model_W = tf.matmul(tf.diag(self.model_bern.sample((n_in,))),
                                 self.model_M)

    def __call__(self, X, activation=tf.identity):
        if activation is None:
            activation = tf.identity
        return activation(tf.matmul(X, self.model_W) + self.model_m)

    @property
    def regularization(self):
        return self.model_lam * (
            self.model_prob * tf.reduce_sum(tf.square(self.model_M)) +
            tf.reduce_sum(tf.square(self.model_m)))
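# A minimal usage sketch for the VariationalDense layer above: a two-layer
# network whose loss adds each layer's regularizer. The shapes, placeholder
# names, and learning rate are illustrative assumptions.
X = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])

layer_1 = VariationalDense(784, 256, name="hidden1")
layer_2 = VariationalDense(256, 10, name="hidden2")

h = layer_1(X, activation=tf.nn.relu)
logits = layer_2(h)  # identity activation; softmax applied inside the loss

loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits))
loss += layer_1.regularization + layer_2.regularization
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)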
class NumStepsDistribution(object):
    """Probability distribution used for the number of steps.

    Transforms Bernoulli probabilities of an event = 1 into p(n), where n is
    the number of steps, as described in the AIR paper."""

    def __init__(self, steps_probs):
        """
        :param steps_probs: tensor; Bernoulli success probabilities
        """
        self._steps_probs = steps_probs
        self._joint = bernoulli_to_modified_geometric(steps_probs)
        self._bernoulli = None

    def sample(self, n=None):
        if self._bernoulli is None:
            self._bernoulli = Bernoulli(self._steps_probs)
        sample = self._bernoulli.sample(n)
        sample = tf.cumprod(sample, tf.rank(sample) - 1)
        sample = tf.reduce_sum(sample, -1)
        return sample

    def prob(self, samples=None):
        if samples is None:
            return self._joint
        return sample_from_tensor(self._joint, samples)

    def log_prob(self, samples):
        prob = self.prob(samples)
        prob = clip_preserve(prob, 1e-32, prob)
        return tf.log(prob)
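# A minimal usage sketch for NumStepsDistribution above; the batch size, step
# count, and success probability are illustrative assumptions.
steps_probs = tf.fill([32, 3], 0.75)  # per-step Bernoulli "continue" probs
num_steps_dist = NumStepsDistribution(steps_probs)
n_steps = num_steps_dist.sample(1)    # sampled number of steps per example
p_n = num_steps_dist.prob()           # full p(n) implied by steps_probs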
def make_distribs(self, params):
    """Converts parameters returned by `_build` into probability
    distributions.
    """
    (prior_where_loc, prior_where_scale,
     prior_what_loc, prior_what_scale,
     prop_prob_logit) = params
    what_prior = Normal(prior_what_loc, prior_what_scale)
    where_prior = Normal(prior_where_loc, prior_where_scale)
    prop_prior = Bernoulli(logits=tf.squeeze(prop_prob_logit, -1))
    return what_prior, where_prior, prop_prior
def Dropout(X, prob=0.7, train=tf.constant(False), name='Dropout'):
    from tensorflow.contrib.distributions import Bernoulli
    if not isinstance(prob, float) or prob > 1.0 or prob < 0.0:
        raise ValueError(
            'Encountered illegal value for param (prob), '
            'expecting float between 0 and 1')
    with tf.name_scope(name):
        Dropout_Mask = tf.diag(
            Bernoulli(probs=prob, dtype=tf.float32).sample(
                (tf.shape(X)[-1],)),
            'Dropout_Mask')
        X_dropped = tf.matmul(X, Dropout_Mask)
    return tf.cond(tf.equal(train, tf.constant(True)),
                   lambda: X_dropped, lambda: X)
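# A minimal usage sketch of the Dropout helper above; the input shape and the
# `is_training` placeholder are illustrative assumptions.
X = tf.placeholder(tf.float32, [None, 128])
is_training = tf.placeholder_with_default(False, shape=[])
h = Dropout(X, prob=0.7, train=is_training)  # mask applied only when training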
def init_eval_model(self):
    with tf.name_scope('eval_model'):
        self.eval_alpha_state = tf.placeholder(tf.float32)
        self.eval_rho_state = tf.placeholder(tf.float32)
        self.eval_n_test = tf.placeholder(tf.int32)
        eval_n_minibatch = self.eval_n_test - self.cs

        # Data Placeholder
        with tf.name_scope('input'):
            self.eval_ph = tf.placeholder(tf.int32)
            words = self.eval_ph

        # Index Masks
        with tf.name_scope('context_mask'):
            p_mask = tf.cast(
                tf.range(self.cs / 2, eval_n_minibatch + self.cs / 2),
                tf.int32)
            rows = tf.cast(
                tf.tile(tf.expand_dims(tf.range(0, self.cs / 2), [0]),
                        [eval_n_minibatch, 1]), tf.int32)
            columns = tf.cast(
                tf.tile(tf.expand_dims(tf.range(0, eval_n_minibatch), [1]),
                        [1, self.cs / 2]), tf.int32)
            ctx_mask = tf.concat(
                [rows + columns, rows + columns + self.cs / 2 + 1], 1)

        with tf.name_scope('natural_param'):
            with tf.name_scope('target_word'):
                p_idx = tf.gather(words, p_mask)
                p_rho = tf.squeeze(tf.gather(self.eval_rho_state, p_idx))

            # Negative samples
            with tf.name_scope('negative_samples'):
                self.eval_n_idx = tf.placeholder(tf.int32)
                n_rho = tf.gather(self.eval_rho_state, self.eval_n_idx)

            with tf.name_scope('context'):
                ctx_idx = tf.squeeze(tf.gather(words, ctx_mask))
                ctx_alphas = tf.gather(self.eval_alpha_state, ctx_idx)

            # Natural parameter
            ctx_sum = tf.reduce_sum(ctx_alphas, [1])
            p_eta = tf.expand_dims(
                tf.reduce_sum(tf.multiply(p_rho, ctx_sum), -1), 1)
            n_eta = tf.reduce_sum(
                tf.multiply(n_rho,
                            tf.tile(tf.expand_dims(ctx_sum, 1),
                                    [1, self.ns, 1])), -1)

        # Conditional likelihood
        y_pos = Bernoulli(logits=p_eta)
        y_neg = Bernoulli(logits=n_eta)
        ll_pos = y_pos.log_prob(1.0)
        ll_neg = tf.reduce_mean(y_neg.log_prob(0.0), axis=1)
        self.eval_ll = tf.nn.moments(ll_pos + ll_neg, axes=[0, 1])
def _build_graph(self):
    with tf.variable_scope('vae'):
        self.x = tf.placeholder(tf.float32,
                                shape=[None, self._observation_dim])
        with tf.variable_scope('encoder'):
            encoded = self._encode(self.x, self._latent_dim)

        with tf.variable_scope('latent'):
            self.mean = encoded[:, :self._latent_dim]
            logvar = encoded[:, self._latent_dim:]
            stddev = tf.sqrt(tf.exp(logvar))
            epsilon = tf.random_normal([self._batch_size, self._latent_dim])
            # reparameterization trick
            self.z = self.mean + stddev * epsilon

        with tf.variable_scope('decoder'):
            decoded = self._decode(self.z, self._observation_dim)
            self.obs_mean = decoded
            if self._observation_distribution == 'Gaussian':
                obs_epsilon = tf.random_normal(
                    [self._batch_size, self._observation_dim])
                self.sample = (self.obs_mean
                               + self._observation_std * obs_epsilon)
            else:
                self.sample = Bernoulli(probs=self.obs_mean).sample()

        with tf.variable_scope('loss'):
            with tf.variable_scope('kl-divergence'):
                kl = self._kl_diagnormal_stdnormal(self.mean, logvar)
            if self._observation_distribution == 'Gaussian':
                with tf.variable_scope('gaussian'):
                    obj = self._gaussian_log_likelihood(
                        self.x, self.obs_mean, self._observation_std)
            else:
                with tf.variable_scope('bernoulli'):
                    obj = self._bernoulli_log_likelihood(self.x,
                                                         self.obs_mean)
            self._loss = (kl + obj) / self._batch_size

        with tf.variable_scope('optimizer'):
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate=self._learning_rate)
        with tf.variable_scope('training-step'):
            self._train = optimizer.minimize(self._loss)

        self._sesh = tf.Session()
        init = tf.global_variables_initializer()
        self._sesh.run(init)
def _make_ais(self):
    with tf.name_scope('annealed_importance_sampling'):
        # x_0 ~ Ber(0.5) of size (M, H_1)
        logits = tf.zeros([self._n_ais_runs, self._n_hiddens[0]])
        T = Bernoulli(logits=logits).sample(seed=self.make_random_seed())
        x_0 = tf.cast(T, dtype=self._tf_dtype)

        # x_1 ~ T_1(x_1 | x_0)
        x_1 = self._make_ais_next_sample(x_0, self._delta_beta)

        # -log p_0(x_1)
        log_Z = -self._unnormalized_log_prob_H0(x_1, 0.)

        def cond(log_Z, x, beta, delta_beta):
            return beta < 1. - delta_beta + 1e-5

        def body(log_Z, x, beta, delta_beta):
            # + log p_i(x_i)
            log_Z += self._unnormalized_log_prob_H0(x, beta)
            # x_{i + 1} ~ T_{i + 1}(x_{i + 1} | x_i)
            x_new = self._make_ais_next_sample(x, beta + delta_beta)
            # -log p_i(x_{i + 1})
            log_Z -= self._unnormalized_log_prob_H0(x_new, beta)
            return log_Z, x_new, beta + delta_beta, delta_beta

        log_Z, x_M, _, _ = tf.while_loop(cond=cond, body=body,
                                         loop_vars=[log_Z, x_1,
                                                    self._delta_beta,
                                                    self._delta_beta],
                                         back_prop=False,
                                         parallel_iterations=1)
        # + log p_M(x_M)
        log_Z += self._unnormalized_log_prob_H0(x_M, 1.)

        # + log(Z_0) = (V + H_1 + H_2) * log(2)
        log_Z0 = self._n_visible + self._n_hiddens[0] + self._n_hiddens[1]
        log_Z0 = tf.cast(log_Z0, dtype=self._tf_dtype)
        log_Z0 *= tf.cast(tf.log(2.), dtype=self._tf_dtype)
        log_Z += log_Z0
        tf.add_to_collection('log_Z', log_Z)
class VariationalDense:
    """Variational Dense Layer Class"""

    def __init__(self, n_in, n_out, model_prob=0.9, model_lam=1e-2,
                 activation=None, name="hidden"):
        self.model_prob = model_prob  # probability to keep units
        self.model_lam = model_lam    # l^2 / (2 * tau)
        self.model_bern = Bernoulli(probs=self.model_prob, dtype=tf.float32)
        self.dropout_mask = self.model_bern.sample((n_in,))
        if activation is None:
            self.activation = tf.identity
        else:
            self.activation = activation
        kernel_initializer = tf.initializers.truncated_normal(mean=0.0,
                                                              stddev=0.01)
        # variational parameters
        self.model_M = tf.get_variable(
            "{}_M".format(name),
            initializer=kernel_initializer([n_in, n_out]))
        self.model_m = tf.get_variable("{}_b".format(name),
                                       initializer=tf.zeros([n_out]))
        self.model_W = tf.matmul(tf.diag(self.dropout_mask), self.model_M)

    def __call__(self, X):
        output = self.activation(tf.matmul(X, self.model_W) + self.model_m)
        if self.model_M.shape[1] == 1:
            output = tf.squeeze(output)
        return output

    @property
    def regularization(self):
        return self.model_lam * (
            self.model_prob * tf.reduce_sum(tf.square(self.model_M)) +
            tf.reduce_sum(tf.square(self.model_m)))
def set_input_shape(self, input_shape, reuse):
    batch_size, rows, cols, input_channels = input_shape
    kernel_shape = tuple(self.kernel_shape) + (input_channels,
                                               self.output_channels)
    assert len(kernel_shape) == 4
    assert all(isinstance(e, int) for e in kernel_shape), kernel_shape
    with tf.variable_scope(self.scope_name + '_init', reuse):
        init = tf.truncated_normal(kernel_shape, stddev=0.2, dtype=tf.float32)
        self.kernels = tf.get_variable("k", initializer=init)
        k_summ = tf.summary.histogram(name="k", values=self.kernels)

        if self.binary:
            from tensorflow.contrib.distributions import Bernoulli
            # stochastic binarization of the kernels to {-1, +1}, with the
            # sampling gradient overridden by the registered "QuantizeGrad"
            with self.G.gradient_override_map({"Bernoulli": "QuantizeGrad"}):
                self.kernels = 2. * Bernoulli(
                    probs=hard_sigmoid(self.kernels),
                    dtype=tf.float32).sample() - 1.
        else:
            from tensorflow.contrib.distributions import MultivariateNormalDiag
            with self.G.gradient_override_map(
                    {"MultivariateNormalDiag": "QuantizeGrad"}):
                self.kernels = MultivariateNormalDiag(
                    loc=self.kernels).sample()
        k_rand_summ = tf.summary.histogram(name="k_rand",
                                           values=self.kernels)

        orig_input_batch_size = input_shape[0]
        input_shape = list(input_shape)
        input_shape[0] = 1
        dummy_batch = tf.zeros(input_shape)
        dummy_output = self.fprop(dummy_batch, False)
        output_shape = [int(e) for e in dummy_output.get_shape()]
        output_shape[0] = 1
        self.output_shape = tuple(output_shape)
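# A minimal sketch of the gradient registration the snippet above relies on:
# gradient_override_map swaps the sampling op's gradient for a function
# registered under "QuantizeGrad". Implementing it as a straight-through
# estimator that passes the incoming gradient through unchanged is an
# assumption here, not code taken from the snippet.
import tensorflow as tf

@tf.RegisterGradient("QuantizeGrad")
def _quantize_grad(op, grad):
    # straight-through: treat the stochastic quantization as identity
    # (an op with several inputs would need one gradient per input)
    return grad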
class VariationalDense:
    """Variational Dense Layer Class"""

    def __init__(self, n_in, n_out, model_prob, model_lam):
        self.model_prob = model_prob
        self.model_lam = model_lam
        self.model_bern = Bernoulli(probs=self.model_prob, dtype=tf.float32)
        self.model_M = tf.Variable(
            tf.truncated_normal([n_in, n_out], stddev=0.01))
        self.model_m = tf.Variable(tf.zeros([n_out]))
        self.model_W = tf.matmul(tf.diag(self.model_bern.sample((n_in,))),
                                 self.model_M)

    def __call__(self, X, activation=tf.identity):
        output = activation(tf.matmul(X, self.model_W) + self.model_m)
        if self.model_M.shape[1] == 1:
            output = tf.squeeze(output)
        return output

    @property
    def regularization(self):
        return self.model_lam * (
            self.model_prob * tf.reduce_sum(tf.square(self.model_M)) +
            tf.reduce_sum(tf.square(self.model_m)))
def __call__(self, inputs, seq_len, keep_prob=1.0, is_train=None,
             concat_layers=True):
    outputs = [tf.transpose(inputs, [1, 0, 2])]
    # only 2 layers: the first layer is bidirectional, the second layer
    # gets its input from the first layer's output
    for layer in range(self.num_layers):
        gru_fw, gru_bw = self.grus[layer]
        param_fw, param_bw = self.params[layer]
        init_fw, init_bw = self.inits[layer]
        mask_fw, mask_bw = self.dropout_mask[layer]
        with tf.variable_scope("fw"):
            out_fw, _ = gru_fw(outputs[-1] * mask_fw, init_fw, param_fw)
            if layer == 0:
                b1 = tf.nn.relu(tf.matmul(out_fw, self.b1_w))
                bnd = tf.nn.sigmoid(tf.matmul(b1, self.b2_w))
                gates = Bernoulli(bnd)
                # TODO: initially just take the hidden state from the last
                # layer and try to predict the boundary
                # bnd_input = tf.concat([out_fw, ])
        with tf.variable_scope("bw"):
            inputs_bw = tf.reverse_sequence(
                outputs[-1] * mask_bw, seq_lengths=seq_len,
                seq_dim=0, batch_dim=1)
            out_bw, _ = gru_bw(inputs_bw, init_bw, param_bw)
            out_bw = tf.reverse_sequence(
                out_bw, seq_lengths=seq_len, seq_dim=0, batch_dim=1)
        outputs.append(tf.concat([out_fw, out_bw], axis=2))
    if concat_layers:
        res = tf.concat(outputs[1:], axis=2)
    else:
        res = outputs[-1]
    res = tf.transpose(res, [1, 0, 2])
    return res
def __init__(self, args, d, logdir):
    super(dynamic_bern_emb_model, self).__init__(args, d, logdir)

    with tf.name_scope('model'):
        with tf.name_scope('embeddings'):
            self.alpha = tf.Variable(self.alpha_init,
                                     name='alpha',
                                     trainable=self.alpha_trainable)
            self.rho_t = {}
            for t in range(-1, self.T):
                self.rho_t[t] = tf.Variable(
                    self.rho_init
                    + 0.001 * tf.random_normal([self.L, self.K]) / self.K,
                    name='rho_' + str(t))

        with tf.name_scope('priors'):
            global_prior = Normal(loc=0.0, scale=self.sig)
            local_prior = Normal(loc=0.0, scale=self.sig / 100.0)

            self.log_prior = tf.reduce_sum(
                global_prior.log_prob(self.alpha))
            self.log_prior += tf.reduce_sum(
                global_prior.log_prob(self.rho_t[-1]))
            # random-walk prior over consecutive time slices
            for t in range(self.T):
                self.log_prior += tf.reduce_sum(
                    local_prior.log_prob(self.rho_t[t] - self.rho_t[t - 1]))

        with tf.name_scope('likelihood'):
            self.placeholders = {}
            self.y_pos = {}
            self.y_neg = {}
            self.ll_pos = 0.0
            self.ll_neg = 0.0
            for t in range(self.T):
                # Index Masks
                p_mask = tf.range(int(self.cs / 2),
                                  self.n_minibatch[t] + int(self.cs / 2))
                rows = tf.tile(
                    tf.expand_dims(tf.range(0, int(self.cs / 2)), [0]),
                    [self.n_minibatch[t], 1])
                columns = tf.tile(
                    tf.expand_dims(tf.range(0, self.n_minibatch[t]), [1]),
                    [1, int(self.cs / 2)])
                ctx_mask = tf.concat(
                    [rows + columns,
                     rows + columns + int(self.cs / 2) + 1], 1)

                # Data Placeholder
                self.placeholders[t] = tf.placeholder(
                    tf.int32, shape=(self.n_minibatch[t] + self.cs))

                # Target and Context Indices
                p_idx = tf.gather(self.placeholders[t], p_mask)
                ctx_idx = tf.squeeze(
                    tf.gather(self.placeholders[t], ctx_mask))

                # Negative samples
                unigram_logits = tf.tile(
                    tf.expand_dims(tf.log(tf.constant(self.unigram)), [0]),
                    [self.n_minibatch[t], 1])
                n_idx = tf.multinomial(unigram_logits, self.ns)

                # Context vectors
                ctx_alphas = tf.gather(self.alpha, ctx_idx)
                p_rho = tf.squeeze(tf.gather(self.rho_t[t], p_idx))
                n_rho = tf.gather(self.rho_t[t], n_idx)

                # Natural parameter
                ctx_sum = tf.reduce_sum(ctx_alphas, [1])
                p_eta = tf.expand_dims(
                    tf.reduce_sum(tf.multiply(p_rho, ctx_sum), -1), 1)
                n_eta = tf.reduce_sum(
                    tf.multiply(n_rho,
                                tf.tile(tf.expand_dims(ctx_sum, 1),
                                        [1, self.ns, 1])), -1)

                # Conditional likelihood
                self.y_pos[t] = Bernoulli(logits=p_eta)
                self.y_neg[t] = Bernoulli(logits=n_eta)
                self.ll_pos += tf.reduce_sum(self.y_pos[t].log_prob(1.0))
                self.ll_neg += tf.reduce_sum(self.y_neg[t].log_prob(0.0))

        self.loss = -(self.n_epochs * (self.ll_pos + self.ll_neg)
                      + self.log_prior)
def __init__(self, args, d, logdir):
    super(bern_emb_model, self).__init__(args, d, logdir)
    self.n_minibatch = self.n_minibatch.sum()

    with tf.name_scope('model'):
        # Data Placeholder
        with tf.name_scope('input'):
            self.placeholders = tf.placeholder(tf.int32)
            self.words = self.placeholders

        # Index Masks
        with tf.name_scope('context_mask'):
            self.p_mask = tf.cast(
                tf.range(int(self.cs / 2),
                         self.n_minibatch + int(self.cs / 2)), tf.int32)
            rows = tf.cast(
                tf.tile(tf.expand_dims(tf.range(0, int(self.cs / 2)), [0]),
                        [self.n_minibatch, 1]), tf.int32)
            columns = tf.cast(
                tf.tile(tf.expand_dims(tf.range(0, self.n_minibatch), [1]),
                        [1, int(self.cs / 2)]), tf.int32)
            self.ctx_mask = tf.concat(
                [rows + columns,
                 rows + columns + int(self.cs / 2) + 1], 1)

        with tf.name_scope('embeddings'):
            self.rho = tf.Variable(self.rho_init, name='rho')
            self.alpha = tf.Variable(self.alpha_init,
                                     name='alpha',
                                     trainable=self.alpha_trainable)

        with tf.name_scope('priors'):
            prior = Normal(loc=0.0, scale=self.sig)
            if self.alpha_trainable:
                self.log_prior = tf.reduce_sum(
                    prior.log_prob(self.rho) + prior.log_prob(self.alpha))
            else:
                self.log_prior = tf.reduce_sum(prior.log_prob(self.rho))

        with tf.name_scope('natural_param'):
            # Target and Context Indices
            with tf.name_scope('target_word'):
                self.p_idx = tf.gather(self.words, self.p_mask)
                self.p_rho = tf.squeeze(tf.gather(self.rho, self.p_idx))

            # Negative samples
            with tf.name_scope('negative_samples'):
                unigram_logits = tf.tile(
                    tf.expand_dims(tf.log(tf.constant(self.unigram)), [0]),
                    [self.n_minibatch, 1])
                self.n_idx = tf.multinomial(unigram_logits, self.ns)
                self.n_rho = tf.gather(self.rho, self.n_idx)

            with tf.name_scope('context'):
                self.ctx_idx = tf.squeeze(
                    tf.gather(self.words, self.ctx_mask))
                self.ctx_alphas = tf.gather(self.alpha, self.ctx_idx)

            # Natural parameter
            ctx_sum = tf.reduce_sum(self.ctx_alphas, [1])
            self.p_eta = tf.expand_dims(
                tf.reduce_sum(tf.multiply(self.p_rho, ctx_sum), -1), 1)
            self.n_eta = tf.reduce_sum(
                tf.multiply(self.n_rho,
                            tf.tile(tf.expand_dims(ctx_sum, 1),
                                    [1, self.ns, 1])), -1)

        # Conditional likelihood
        self.y_pos = Bernoulli(logits=self.p_eta)
        self.y_neg = Bernoulli(logits=self.n_eta)
        self.ll_pos = tf.reduce_sum(self.y_pos.log_prob(1.0))
        self.ll_neg = tf.reduce_sum(self.y_neg.log_prob(0.0))
        self.log_likelihood = self.ll_pos + self.ll_neg

        scale = 1.0 * self.N / self.n_minibatch
        self.loss = -(self.n_epochs * self.log_likelihood + self.log_prior)
def bernoulli_log_probs(args):
    from tensorflow.contrib.distributions import Bernoulli
    mu, x = args
    log_px = Bernoulli(probs=mu, name='dec_bernoulli').log_prob(x)
    return log_px
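# A minimal usage sketch: the (mu, x) tuple argument suggests this helper is
# meant to be wrapped in a Keras Lambda layer; `decoder_mean` and `x_input`
# are hypothetical tensors, not names from the snippet.
from keras.layers import Lambda
log_px = Lambda(bernoulli_log_probs, name='log_px')([decoder_mean, x_input])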
def minimize(self, loss, var_list=None, global_step=None):
    orig_graph_view = None
    trainable_vars = (var_list if var_list is not None
                      else tf.trainable_variables())

    if self.inputs is not None:
        seed_ops = [t.op for t in self.inputs]
        result = list(seed_ops)
        wave = set(seed_ops)
        while wave:  # adapted from graph_editor.select
            new_wave = set()
            for op in wave:
                for new_t in op.outputs:
                    if new_t == loss:
                        continue
                    for new_op in new_t.consumers():
                        if new_op not in result:
                            new_wave.add(new_op)
            for op in new_wave:
                if op not in result:
                    result.append(op)
            wave = new_wave
        orig_graph_view = ge.sgv(result)
    else:
        orig_graph_view = ge.sgv(self.work_graph)

    self.global_step_tensor = (
        tf.Variable(0, name='global_step', trainable=False)
        if global_step is None else global_step)

    # Perturbations
    deltas = {}
    n_perturbations = {}
    p_perturbations = {}
    with tf.name_scope("Perturbator"):
        self.c_t = tf.div(
            self.c,
            tf.pow(tf.add(tf.cast(self.global_step_tensor, tf.float32),
                          tf.constant(1, dtype=tf.float32)),
                   self.gamma),
            name="SPSA_ct")
        for var in trainable_vars:
            self.num_params += self._mul_dims(var.get_shape())
            var_name = var.name.split(':')[0]
            # Rademacher perturbation: delta = 1 - 2 * Ber(0.5) in {-1, +1}
            random = Bernoulli(tf.fill(var.get_shape(), 0.5),
                               dtype=tf.float32)
            deltas[var] = tf.subtract(
                tf.constant(1, dtype=tf.float32),
                tf.scalar_mul(tf.constant(2, dtype=tf.float32),
                              random.sample(1)[0]),
                name="SPSA_delta")
            c_t_delta = tf.scalar_mul(tf.reshape(self.c_t, []), deltas[var])
            n_perturbations[var_name + '/read:0'] = tf.subtract(
                var, c_t_delta, name="perturb_n")
            p_perturbations[var_name + '/read:0'] = tf.add(
                var, c_t_delta, name="perturb_p")

    # Evaluator
    with tf.name_scope("Evaluator"):
        _, self.ninfo = self._clone_model(orig_graph_view,
                                          n_perturbations, 'N_Eval')
        _, self.pinfo = self._clone_model(orig_graph_view,
                                          p_perturbations, 'P_Eval')

    # Weight Updater
    optimizer_ops = []
    with tf.control_dependencies([loss]):
        with tf.name_scope('Updater'):
            a_t = self.a / tf.pow(
                tf.add(tf.cast(self.global_step_tensor, tf.float32),
                       tf.constant(1, dtype=tf.float32)),
                self.alpha)
            for var in trainable_vars:
                l_pos = self.pinfo.transformed(loss)
                l_neg = self.ninfo.transformed(loss)
                # SPSA gradient estimate from the two perturbed losses
                ghat = (l_pos - l_neg) / (tf.constant(2, dtype=tf.float32)
                                          * self.c_t * deltas[var])
                optimizer_ops.append(tf.assign_sub(var, a_t * ghat))

    grp = control_flow_ops.group(*optimizer_ops)
    with tf.control_dependencies([grp]):
        tf.assign_add(self.global_step_tensor,
                      tf.constant(1, dtype=self.global_step_tensor.dtype))
    return grp
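# A minimal usage sketch for the SPSA-style minimize() above; the optimizer
# class name and the hyperparameter values (Spall's standard alpha and gamma
# exponents) are illustrative assumptions.
opt = SPSAOptimizer(a=0.01, c=0.01, alpha=0.602, gamma=0.101)
train_op = opt.minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(100):
        sess.run(train_op)  # two perturbed evaluations + one update per step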