def _compareGradient(self, shape, axis, exclusive, reverse):
    """Compare the analytic and numeric Jacobians of `tf.cumprod`.

    The input holds the values 1..8 reshaped to `shape` (float64). The
    Jacobian pair returned by `tf.test.compute_gradient` must agree within
    an absolute and relative tolerance of 1e-8.

    Args:
        shape: Shape the eight input values are reshaped to.
        axis: Axis forwarded to `tf.cumprod`.
        exclusive: Forwarded to `tf.cumprod`.
        reverse: Forwarded to `tf.cumprod`.
    """
    values = np.arange(1, 9).reshape(shape).astype(np.float64)
    with self.test_session():
        tensor = tf.convert_to_tensor(values)
        product = tf.cumprod(tensor, axis, exclusive, reverse)
        jacobians = tf.test.compute_gradient(
            tensor, shape, product, shape, x_init_value=values, delta=1)
    theoretical, numerical = jacobians
    self.assertAllClose(theoretical, numerical, rtol=1e-8, atol=1e-8)
def to_simplex(x):
    """Map a real vector of length `(K-1)` onto a simplex of dimension `K`.

    Uses a backward stick breaking construction.

    Args:
      x: tf.Tensor.
        A 1-D or 2-D tensor.

    Returns:
      tf.Tensor.
      A tensor of same shape as input but with last dimension of
      size `K`.

    Raises:
      InvalidArgumentError.
        If the input has Inf or NaN values.

    #### Notes

    x as a 3-D or higher tensor is not guaranteed to be supported.
    """
    x = tf.cast(x, dtype=tf.float32)
    # Tie the finiteness check to `x` so it runs whenever `x` is consumed.
    finite_check = tf.verify_tensor_all_finite(x, msg='')
    x = control_flow_ops.with_dependencies([finite_check], x)

    if isinstance(x, (tf.Tensor, tf.Variable)):
        shape = x.get_shape().as_list()
    else:
        shape = x.shape

    # A 1-D input works along axis 0; anything else is treated as a batch
    # of rows and works along axis 1.
    is_vector = len(shape) == 1
    if is_vector:
        K_minus_one = shape[0]
    else:
        n_rows = shape[0]
        K_minus_one = shape[1]

    remaining = tf.cast(K_minus_one - tf.range(K_minus_one), dtype=tf.float32)
    eq = -tf.log(remaining)
    z = tf.sigmoid(eq + x)

    if is_vector:
        unit = tf.constant([1.0])
        axis = 0
    else:
        unit = tf.ones([n_rows, 1])
        axis = 1

    # pil: break fractions with a trailing 1; piu: leftover fractions with a
    # leading 1. Their running product times pil gives the simplex weights.
    pil = tf.concat([z, unit], axis)
    piu = tf.concat([unit, 1.0 - z], axis)
    S = tf.cumprod(piu, axis=axis)
    return S * pil
def create_tf_graph_for_simulate_paths():
    """Build the placeholders and the simulated-path tensor.

    Seven float32 placeholders are created, in this order: S, K, dt, T,
    sigma, r, dw. The path tensor is

        S_T = S * cumprod(exp((r - sigma**2 / 2) * dt
                              + sigma * sqrt(dt) * dw), axis=1)

    `K` and `T` are created and returned but do not enter `S_T`.

    Returns:
        The tuple (S, K, dt, T, sigma, r, dw, S_T).
    """
    # Creation order matters for the auto-generated placeholder names.
    S, K, dt, T, sigma, r, dw = (tf.placeholder(tf.float32) for _ in range(7))

    drift = (r - sigma**2 / 2) * dt
    diffusion = sigma * tf.sqrt(dt) * dw
    step_growth = tf.exp(drift + diffusion)
    S_T = S * tf.cumprod(step_growth, axis=1)

    return (S, K, dt, T, sigma, r, dw, S_T)
def _compare(self, x, axis, reverse, use_gpu=False):
    """Check `tf.cumprod` against a NumPy reference, optionally reversed.

    The reference reverses `x` along `axis` (when `reverse` is true), takes
    `np.cumprod`, and reverses back.

    Args:
        x: Input array.
        axis: Axis along which the cumulative product is taken.
        reverse: Whether the product is accumulated from the end.
        use_gpu: Forwarded to `self.test_session`.
    """
    np_out = x
    if reverse:
        np_out = numpy_reverse(np_out, axis)
    np_out = np.cumprod(np_out, axis=axis)
    if reverse:
        np_out = numpy_reverse(np_out, axis)

    with self.test_session(use_gpu=use_gpu):
        # Pass `reverse` by keyword: with the
        # `cumprod(x, axis, exclusive, reverse)` signature the third
        # positional slot is `exclusive`, so a positional `reverse` would
        # request an exclusive (not reversed) product.
        tf_out = tf.cumprod(x, axis, reverse=reverse).eval()

    self.assertAllClose(np_out, tf_out)
def at(ut, N):
    """
    Return the allocation weighting given the updated usage vector.

    Args:
        ut: Usage vector.
        N: Number of entries requested from `tf.nn.top_k` and size of the
            output `TensorArray`.

    Returns:
        The packed `TensorArray` holding the allocation weights scattered
        back to the positions given by the top-k indices.
    """
    # top_k on the negated usages returns the smallest usages first.
    negated_sorted, free_list = tf.nn.top_k(-1 * ut, N)
    # Undo the negation to get the original positive usages.
    usage_ascending = negated_sorted * -1

    # exclusive=True makes the first element of the cumulative product 1
    # rather than the first element of the given tensor.
    preceding_product = tf.cumprod(usage_ascending, exclusive=True)
    sorted_allocation = (1 - usage_ascending) * preceding_product

    # Scatter the sorted weights back to their original locations.
    container = tf.TensorArray(tf.float32, N)
    return container.scatter(free_list, sorted_allocation).pack()
def testInvalidAxis(self):
    """`tf.cumprod` rejects out-of-range and non-scalar axes.

    For a [2, 5] input, axes -3 and 2 must fail with the range message and
    a list-valued axis must fail with the scalar message.
    """
    x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
    input_tensor = tf.convert_to_tensor(x)

    out_of_range = "Expected scan axis in the range [-2, 2)"
    cases = (
        (-3, out_of_range),
        (2, out_of_range),
        ([0], "axis must be a scalar"),
    )

    with self.test_session(use_gpu=True):
        for bad_axis, expected in cases:
            # Bind `expected` as a default so each predicate keeps its own
            # message.
            with self.assertRaisesWithPredicateMatch(
                    tf.errors.InvalidArgumentError,
                    lambda e, expected=expected: expected in str(e)):
                tf.cumprod(input_tensor, bad_axis).eval()
def _compare(self, x, axis, exclusive, reverse):
    """Assert that `tf.cumprod` on GPU matches the NumPy-based reference.

    Args:
        x: Input array.
        axis: Axis forwarded to both implementations.
        exclusive: Forwarded to both implementations.
        reverse: Forwarded to both implementations.
    """
    expected = handle_options(np.cumprod, x, axis, exclusive, reverse)
    with self.test_session(use_gpu=True):
        actual = tf.cumprod(x, axis, exclusive, reverse).eval()
    self.assertAllClose(expected, actual)
def discounted_reduce_sum(X, discount, axis=-1):
    """Sum `X` along `axis` after weighting it by powers of `discount`.

    The weights are the (non-exclusive) cumulative product of a tensor
    filled with `discount`, so the first element along `axis` is weighted
    by `discount` itself. When `discount` equals 1.0 the weight is the
    scalar 1.0.

    Args:
        X: Tensor to reduce.
        discount: Per-step discount factor.
        axis: Axis along which to discount and sum.

    Returns:
        `tf.reduce_sum(X * weights, axis=axis)`.
    """
    weights = (
        tf.cumprod(discount * tf.ones_like(X), axis=axis)
        if discount != 1.0
        else 1.0
    )
    weighted = X * weights
    return tf.reduce_sum(weighted, axis=axis)
def SB_Conv2d(inp, ksize, S=128, padding='SAME', strides=[1, 1, 1, 1],
              bias=True, train=True, reuse=False, sbp=False, temp_bern=0.5,
              temp_cat=0.5, activation='lwta', name='conv'):
    """
    Convolutional layer for the SB-LWTA model, incorporating local competition.

    Parameters:
        inp: 4d tensor
            The input to the current layer.
        ksize: 5-element size specification
            The size of the kernels. The first three entries are the leading
            dimensions of the weight variables; the last two entries are the
            number of blocks (K) and the units in each block (U).
        S: int
            Not used by this function.
        padding: str
            The padding for the conv operation. Default: SAME.
            (see tf conv documentation).
        strides: 4-element list
            The strides for the conv operation. Default: [1,1,1,1]
            (see tf conv). The default list is never mutated here.
        bias: boolean
            Flag denoting the use of bias.
        train: boolean
            Flag to alternate between train or not branches.
        reuse: boolean
            Flag to reuse or not the variables of the layer.
        sbp: boolean
            Flag to enable or disable the stick breaking process.
        temp_bern: float
            The temperature for the bernoulli relaxation.
        temp_cat: float
            The temperature for the categorical relaxation.
        activation: String
            Select the activation function for the current layer; one of
            'lwta', 'relu' or 'maxout'.
        name: str
            The name of the current layer. The activation name is appended
            to it to form the variable scope.

    Returns:
        out: tensor
            The output of the layer after the masked convolution operation,
            the addition of bias (if bias==True) and the chosen activation.
        mW: 4d tensor
            The mean of the weights, of shape
            [ksize[0], ksize[1], ksize[2], K * U].
        masked_mw: 4d tensor
            The mean of the weights multiplied by the mask `z`.
        masked_sw: 4d tensor
            The squared (softplus-transformed) scale of the weights
            multiplied by the mask `z`.
        z: tensor or float
            The mask applied to the weights: the tiled relaxed Bernoulli
            sample (train) or Bernoulli sample (test) when sbp is True,
            otherwise the scalar 1.

    Raises:
        ValueError: if `activation` is not one of the supported names.
    """
    K = ksize[-2]
    U = ksize[-1]
    tau = 1e-2

    name = name + '_' + activation

    with tf.variable_scope(name, reuse=reuse):

        # variables for the weights
        mW = tf.get_variable(
            'mW', [ksize[0], ksize[1], ksize[2], K * U],
            initializer=tf.contrib.layers.xavier_initializer(),
            dtype=tf.float32)

        sW = tf.get_variable(
            'sW', [ksize[0], ksize[1], ksize[2], K * U],
            initializer=tf.constant_initializer(-5.),
            constraint=lambda x: tf.clip_by_value(x, -7., x),
            dtype=tf.float32)
        sW = tf.nn.softplus(sW)

        # variables and construction for the stick breaking process
        if sbp:
            # posterior concentrations for the Kumaraswamy distribution
            conc1 = variable_on_cpu(
                'sb_t_u_1', [K],
                initializer=tf.constant_initializer(3.),
                constraint=lambda x: tf.clip_by_value(x, -6., x),
                dtype=tf.float32)

            conc0 = variable_on_cpu(
                'sb_t_u_2', [K],
                initializer=tf.constant_initializer(1.),
                constraint=lambda x: tf.clip_by_value(x, -6., x),
                dtype=tf.float32)
            conc1 = tf.nn.softplus(conc1)
            conc0 = tf.nn.softplus(conc0)

            # stick breaking construction
            q_u = kumaraswamy_sample(
                conc1, conc0, sample_shape=[inp.get_shape()[1].value, K])
            pi = tf.cumprod(q_u)

            # posterior bernoulli (relaxed) probabilities
            t_pi = tf.get_variable(
                'sb_t_pi', [K],
                initializer=tf.initializers.random_uniform(-5., 1.),
                constraint=lambda x: tf.clip_by_value(x, -7., 600.),
                dtype=tf.float32)
            t_pi = tf.nn.sigmoid(t_pi)

        biases = 0.
        if bias:
            biases = variable_on_cpu('bias', [K * U],
                                     tf.constant_initializer(0.0))

        z = 1.

        # train branch
        if train:

            # reparametrizable normal sample
            eps = tf.stop_gradient(tf.random_normal(mW.get_shape()))
            W = mW + eps * sW

            re = tf.ones_like(W)

            # stick breaking kl and operations
            if sbp:
                z_sample = bin_concrete_sample(t_pi, temp_bern)
                z = tf.tile(z_sample, [U])
                W *= z

                kl_sticks = tf.reduce_sum(
                    kumaraswamy_kl(tf.ones_like(conc1), tf.ones_like(conc0),
                                   conc1, conc0, q_u))
                kl_z = tf.reduce_sum(
                    bin_concrete_kl(pi, t_pi, temp_bern, z_sample))

                tf.add_to_collection('kl_loss', kl_sticks)
                tf.add_to_collection('kl_loss', kl_z)

                tf.summary.scalar('kl_sticks', kl_sticks)
                tf.summary.scalar('kl_z', kl_z)

                # if probability of activation is smaller than tau, it's
                # inactive
                tf.summary.scalar(
                    'sparsity',
                    tf.reduce_sum(
                        tf.cast(tf.greater(t_pi / (1. + t_pi), tau),
                                tf.float32)) * U)

            # add the kl terms to the collection
            kl_weights = tf.reduce_sum(
                normal_kl(tf.zeros_like(mW), tf.ones_like(sW), mW, sW, W))
            tf.add_to_collection('losses', kl_weights)
            tf.summary.scalar('kl_weights', kl_weights)

            # convolution operation
            lam = tf.nn.conv2d(inp, W, strides=strides,
                               padding=padding) + biases

            # choose activation based on input
            if activation == 'lwta':
                assert U > 1, 'The number of competing units should be larger than 1'

                # reshape the response so the last axis indexes the U
                # competing units of each block
                lam_re = tf.reshape(
                    lam, [-1, lam.get_shape()[1], lam.get_shape()[2], K, U])

                prbs = tf.nn.softmax(lam_re) + 1e-5
                prbs /= tf.reduce_sum(prbs, -1, keepdims=True)

                # draw relaxed sample and apply activation
                xi = concrete_sample(prbs, temp_cat)

                out = lam_re * xi
                out = tf.reshape(out, tf.shape(lam))

                # add the relative kl terms
                kl_xi = tf.reduce_mean(
                    tf.reduce_sum(
                        concrete_kl(tf.ones_like(lam_re) / U, prbs, xi),
                        [1]))

                tf.add_to_collection('kl_loss', kl_xi)
                tf.summary.scalar('kl_xi', kl_xi)

            elif activation == 'relu':
                # apply relu
                out = tf.nn.relu(lam)

            elif activation == 'maxout':
                # apply maxout activation
                lam_re = tf.reshape(
                    lam, [-1, lam.get_shape()[1], lam.get_shape()[2], K, U])
                out = tf.reduce_max(lam_re, -1, keepdims=False)

            else:
                # Fail loudly: `out` would otherwise be undefined at the
                # return statement below.
                raise ValueError(
                    'Activation: {} not implemented.'.format(activation))

        # test branch, same with train but replace samples with means
        else:
            re = tf.ones_like(mW)
            z = 1.

            # if sbp is active calculate mask and draw samples
            if sbp:
                mask = tf.cast(tf.greater(t_pi, tau), tf.float32)
                z = Bernoulli(probs=mask * t_pi, name="q_z_test",
                              dtype=tf.float32).sample()
                z = tf.tile(z, [U])
                re = tf.tile(mask * t_pi, [U])

            # convolution operation
            lam = tf.nn.conv2d(inp, re * mW, strides=strides,
                               padding=padding) + biases

            if activation == 'lwta':
                # calculate probabilities of activation
                lam_re = tf.reshape(
                    lam, [-1, lam.get_shape()[1], lam.get_shape()[2], K, U])
                prbs = tf.nn.softmax(lam_re) + 1e-5
                prbs /= tf.reduce_sum(prbs, -1, keepdims=True)

                # draw sample for activated units
                out = lam_re * concrete_sample(prbs, 0.01)
                out = tf.reshape(out, tf.shape(lam))

            elif activation == 'relu':
                # apply relu
                out = tf.nn.relu(lam)

            elif activation == 'maxout':
                # apply maxout operation
                lam_re = tf.reshape(
                    lam, [-1, lam.get_shape()[1], lam.get_shape()[2], K, U])
                out = tf.reduce_max(lam_re, -1)

            else:
                # Fail loudly: `out` would otherwise be undefined at the
                # return statement below.
                raise ValueError(
                    'Activation: {} not implemented.'.format(activation))

    return out, mW, z * mW, z * sW**2, z
def ngrams(strings, ngram_range):
    """Create a tensor of n-grams.

    Given a vector of strings, return a sparse matrix containing the ngrams
    from each string. Each row in the output sparse tensor contains the set
    of ngrams from the corresponding element in the input tensor.

    The output ngrams include all whitespace and punctuation from the
    original strings.

    Example:

    strings = ['ab: c', 'wxy.']
    ngrams_range = (1,3)

    output is a sparse tensor with

    indices = [[0, 0], [0, 1], ..., [0, 11], [1, 0], [1, 1], ..., [1, 8]]
    values = ['a', 'ab', 'ab:', 'b', 'b:', 'b: ', ':', ': ', ': c', ' ',
              ' c', 'c', 'w', 'wx', 'wxy', 'x', 'xy', 'xy.', 'y', 'y.', '.']
    dense_shape = [2, 12]

    Args:
      strings: A tensor of strings with size [batch_size,].
      ngram_range: A pair with the range (inclusive) of ngram sizes to
        return.

    Returns:
      A SparseTensor containing all ngrams from each element of the input.
      When no ngram is produced at all, the second dense dimension is 0.

    Raises:
      ValueError: if ngram_range[0] < 1 or ngram_range[1] < ngram_range[0]
    """
    # This function is implemented as follows. First we split the input. If
    # the input is ['abcd', 'q', 'xyz'] then the split operation returns a
    # SparseTensor with
    #
    # indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [2, 0], [2, 1], [2, 2]]
    # values=['a', 'b', 'c', 'd', 'q', 'x', 'y', 'z']
    # dense_shape=[3, 4]
    #
    # We then create shifts of the values and first column of indices,
    # buffering to avoid overrunning the end of the array, so the shifted
    # values (if we are creating ngrams up to size 3) are
    #
    # shifted_batch_indices[0]=[0, 0, 0, 0, 1, 2, 2, 2]
    # shifted_chars[0]=['a', 'b', 'c', 'd', 'q', 'x', 'y', 'z']
    #
    # shifted_batch_indices[1]=[0, 0, 0, 1, 2, 2, 2, -1]
    # shifted_chars[1]=['b', 'c', 'd', 'q', 'x', 'y', 'z', '']
    #
    # shifted_batch_indices[2]=[0, 0, 1, 2, 2, 2, -1, -1]
    # shifted_chars[2]=['c', 'd', 'q', 'x', 'y', 'z', '', '']
    #
    # These shifted ngrams are used to create the ngrams as follows. We use
    # tf.string_join to join shifted_chars[:k] to create k-grams. The batch
    # that the first of these belonged to is given by
    # shifted_batch_indices[0]. However some of these will cross the
    # boundaries between 'batches' and so we create a boolean mask which is
    # True when shifted_indices[:k] are all equal.
    #
    # This results in tensors of ngrams, their batch indices and a boolean
    # mask, which we then use to construct the output SparseTensor.

    # Validate first so an invalid range does not leave stray ops behind.
    if ngram_range[0] < 1 or ngram_range[1] < ngram_range[0]:
        raise ValueError('Invalid ngram_range: %r' % (ngram_range, ))

    chars = tf.string_split(strings, delimiter='')

    def _sliding_windows(values, num_shifts, fill_value):
        # Pad the tail so every shifted slice has the length of `values`.
        buffered_values = tf.concat(
            [values, tf.fill([num_shifts - 1], fill_value)], 0)
        return [
            tf.slice(buffered_values, [i], tf.shape(values))
            for i in range(num_shifts)
        ]

    shifted_batch_indices = _sliding_windows(
        chars.indices[:, 0], ngram_range[1] + 1,
        tf.constant(-1, dtype=tf.int64))
    shifted_chars = _sliding_windows(chars.values, ngram_range[1] + 1, '')

    # Construct a tensor of the form
    # [['a', 'ab', 'abc'], ['b', 'bc', 'bcd'], ...]
    def _string_join(tensors):
        if tensors:
            return tf.string_join(tensors)
        else:
            return

    ngrams_array = [
        _string_join(shifted_chars[:k])
        for k in range(ngram_range[0], ngram_range[1] + 1)
    ]
    ngrams_tensor = tf.stack(ngrams_array, 1)

    # Construct a boolean mask for whether each ngram in ngram_tensor is
    # valid, in that each character came from the same batch. The running
    # product stays 1 only while every shifted batch index equals the first.
    valid_ngram = tf.equal(
        tf.cumprod(
            tf.to_int32(
                tf.equal(tf.stack(shifted_batch_indices, 1),
                         tf.expand_dims(shifted_batch_indices[0], 1))),
            axis=1), 1)
    valid_ngram = valid_ngram[:, (ngram_range[0] - 1):ngram_range[1]]

    # Construct a tensor with the batch that each ngram in ngram_tensor
    # belongs to.
    batch_indices = tf.tile(
        tf.expand_dims(chars.indices[:, 0], 1),
        [1, ngram_range[1] + 1 - ngram_range[0]])

    # Apply the boolean mask and construct a SparseTensor with the given
    # indices and values, where another index is added to give the position
    # within a batch.
    batch_indices = tf.boolean_mask(batch_indices, valid_ngram)
    ngrams_tensor = tf.boolean_mask(ngrams_tensor, valid_ngram)
    instance_indices = segment_indices(batch_indices)

    # reduce_max over an empty tensor yields the smallest representable
    # value; clamp at -1 so an empty result gets a second dimension of 0
    # instead of a negative size.
    dense_shape_second_dim = tf.maximum(
        tf.reduce_max(instance_indices), -1) + 1

    return tf.SparseTensor(
        tf.stack([batch_indices, instance_indices], 1),
        ngrams_tensor,
        tf.stack([
            tf.size(strings, out_type=tf.int64),
            dense_shape_second_dim
        ], 0))
def _compare(self, x, axis, exclusive, reverse, use_gpu=False):
    """Assert that `tf.cumprod` matches the NumPy-based reference.

    Args:
        x: Input array.
        axis: Axis forwarded to both implementations.
        exclusive: Forwarded to both implementations.
        reverse: Forwarded to both implementations.
        use_gpu: Forwarded to `self.test_session`.
    """
    expected = handle_options(np.cumprod, x, axis, exclusive, reverse)
    with self.test_session(use_gpu=use_gpu):
        actual = tf.cumprod(x, axis, exclusive, reverse).eval()
    self.assertAllClose(expected, actual)
def _build(self, batch_size, horizon, behavioral, per_decision, normalize=False, truncate_at=np.infty):
    """Build the graph and callables for return and importance-weight stats.

    Nothing is rebuilt when the argument list equals `self._setting`.
    Otherwise the discounted returns are computed from `self.rew` split into
    `batch_size` episodes, and estimates of the mean and variance of the
    (optionally importance-weighted) return are built together with their
    gradients. The resulting tensors and `U.function` callables are stored
    on `self`.

    Args:
        batch_size: Number of episodes `self.rew` is split into.
        horizon: Used with `batch_size` to size the `mask` placeholder
            (`[batch_size*horizon, 1]`).
        behavioral: `None` for the on-policy case; otherwise an object whose
            `pd.logp(self.ac_in)` gives the behavioral log-probabilities.
        per_decision: Use cumulative (per-step) importance weights instead
            of one weight per trajectory.
        normalize: Use self-normalized importance weights.
        truncate_at: Upper clip value for unnormalized importance weights.

    NOTE(review): `np.infty` was removed in NumPy 2.0; `np.inf` is the
    portable spelling.
    NOTE(review): `ess` is not assigned anywhere on the per-decision path,
    so building `_get_ess` / `_get_iw_stats` appears to fail there with an
    unbound local -- confirm and define the intended value.
    NOTE(review): the block nesting below was reconstructed from the data
    flow; in particular confirm that the `U.flatgrad` calls belong to the
    off-policy branch only.
    """
    if [batch_size, horizon, behavioral, per_decision, normalize, truncate_at]!=self._setting:

        #checkpoint = time.time()
        self._setting = [batch_size, horizon, behavioral, per_decision, normalize, truncate_at]

        self.mask = tf.placeholder(name="mask", dtype=tf.float32, shape=[batch_size*horizon, 1])
        # Regroup the flat reward tensor into one row per episode.
        rews_by_episode = tf.split(self.rew, batch_size)
        rews_by_episode = tf.stack(rews_by_episode)
        # Broadcast gamma to the reward shape, then take the exclusive
        # running product so step t is weighted by gamma**t (first is 1).
        disc = self.gamma + 0*rews_by_episode
        disc = tf.cumprod(disc, axis=1, exclusive=True)
        disc_rews = rews_by_episode * disc
        rets = tf.reduce_sum(disc_rews, axis=1)

        if behavioral is None:
            #On policy
            avg_J, var_J = tf.nn.moments(tf.reduce_sum(disc_rews, axis=1), axes=[0])
            grad_avg_J = tf.constant(0)
            grad_var_J = tf.constant(0)
            avg_iw = tf.constant(1)
            var_iw = tf.constant(0)
            max_iw = tf.constant(1)
            ess = batch_size
        else:
            #Off policy -> importance weighting :(
            log_ratios = self.logprobs - behavioral.pd.logp(self.ac_in)
            log_ratios = tf.expand_dims(log_ratios, axis=1)
            # Multiply by the fed mask before regrouping by episode.
            log_ratios = tf.multiply(log_ratios, self.mask)
            log_ratios_by_episode = tf.split(log_ratios, batch_size)
            log_ratios_by_episode = tf.stack(log_ratios_by_episode)

            if per_decision:
                #Per-decision
                # Weight at step t is the product of the ratios up to t.
                iw = tf.exp(tf.cumsum(log_ratios_by_episode, axis=1))
                if not normalize:
                    #Per-decision, unnormalized (possibly truncated)
                    iw = tf.clip_by_value(iw, 0, truncate_at)
                    weighted_rets = tf.reduce_sum(tf.multiply(disc_rews,iw), axis=1)
                    avg_J, var_J = tf.nn.moments(weighted_rets, axes=[0])
                else:
                    #Per-decision, self-normalized
                    iw = batch_size*iw/tf.reduce_sum(iw, axis=0)
                    avg_J_t = tf.reduce_mean(disc_rews* iw, axis=0)
                    avg_J = tf.reduce_sum(avg_J_t)
                    var_J = 1./batch_size * tf.reduce_sum(disc**2 * tf.reduce_mean(iw**2 * (rews_by_episode - avg_J_t)**2, axis=0)) # To be checked
                    weighted_rets = tf.reduce_sum(tf.multiply(disc_rews,iw), axis=1)
                # Effective per-trajectory weight: weighted over plain return.
                eff_iw = weighted_rets/rets
                avg_iw, var_iw = tf.nn.moments(eff_iw, axes=[0])
                max_iw = tf.reduce_max(eff_iw)
            else:
                #Per-trajectory
                iw = tf.exp(tf.reduce_sum(log_ratios_by_episode, axis=1))
                if not normalize:
                    #Per trajectory, unnormalized (possibly truncated)
                    iw = tf.clip_by_value(iw, 0, truncate_at)
                    weighted_rets = tf.multiply(rets, iw)
                    avg_J, var_J = tf.nn.moments(weighted_rets, axes=[0])
                    avg_iw, var_iw = tf.nn.moments(iw, axes=[0])
                    # Rounded (sum w)^2 / sum(w^2).
                    ess = tf.round(tf.reduce_sum(iw)**2 / tf.reduce_sum(iw**2))
                else:
                    #Per-trajectory, self-normalized
                    iw = batch_size*iw/tf.reduce_sum(iw, axis=0)
                    avg_J = tf.reduce_mean(rets*iw, axis=0)
                    var_J = 1./batch_size * tf.reduce_mean(iw**2 * (rets - avg_J)**2)
                    avg_iw = tf.reduce_mean(iw, axis=0)
                    var_iw = 1./batch_size * tf.reduce_mean((iw - 1)**2)
                    ess = tf.round(tf.reduce_sum(iw)**2 / tf.reduce_sum(iw**2))
                max_iw = tf.reduce_max(iw)

            grad_avg_J = U.flatgrad(avg_J, self.get_param())
            grad_var_J = U.flatgrad(var_J, self.get_param())

        # Statistics of the plain (unweighted) discounted returns.
        avg_ret, var_ret = tf.nn.moments(tf.reduce_sum(disc_rews, axis=1), axes=[0])
        max_ret = tf.reduce_max(tf.reduce_sum(disc_rews, axis=1))

        self._avg_J = avg_J
        self._var_J = var_J
        self._grad_avg_J = grad_avg_J
        self._grad_var_J = grad_var_J
        # Every callable takes the same five inputs.
        self._get_avg_J = U.function([self.ob, self.ac_in, self.rew, self.gamma, self.mask], [avg_J])
        self._get_var_J = U.function([self.ob, self.ac_in, self.rew, self.gamma, self.mask], [var_J])
        self._get_grad_J = U.function([self.ob, self.ac_in, self.rew, self.gamma, self.mask], [grad_avg_J])
        self._get_grad_var_J = U.function([self.ob, self.ac_in, self.rew, self.gamma, self.mask], [grad_var_J])
        self._get_all = U.function([self.ob, self.ac_in, self.rew, self.gamma, self.mask], [avg_J, var_J, grad_avg_J, grad_var_J])
        self._get_ess = U.function([self.ob, self.ac_in, self.rew, self.gamma, self.mask], [ess])
        self._get_iw_stats = U.function([self.ob, self.ac_in, self.rew, self.gamma, self.mask], [avg_iw, var_iw, max_iw, ess])
        self._get_ret_stats = U.function([self.ob, self.ac_in, self.rew, self.gamma, self.mask], [avg_ret, var_ret, max_ret])
def __init__(self, sess, rnn_size, layer_size, decoder_vocab_size, embedding_dim, k, lr):
    """Build the interactive decoder graph, its rewards, losses and train op.

    Args:
        sess: Session object, stored on `self.sess`.
        rnn_size: Size passed to `_get_simple_lstm` and used for the
            `[1, rnn_size]` initial-state placeholder.
        layer_size: Passed to `_get_simple_lstm`.
        decoder_vocab_size: Width of the vocabulary-sized placeholders, the
            output dense layer and the one-hot encodings.
        embedding_dim: Second dimension of the decoder embedding variable.
        k: Stored as `self._k` and passed to the embedding helper.
        lr: Learning rate handed to the Adam optimizer.

    NOTE(review): the extent of the `tf.variable_scope('decoder')` block
    was reconstructed; confirm that it ends after
    `dynamic_interactive_decode(decoder)`.
    """
    self.sess = sess
    self._k = k
    self.lr = lr
    # Reward scales used when converting hits into immediate rewards, and
    # the discount applied when accumulating them.
    self.postive_imediate_reward = 1.0
    self.negative_imediate_reward = 0.2
    self.account_ratio = 0.9
    self.rnn_size = rnn_size
    self.interesting = tf.placeholder(tf.float32, shape=[None, decoder_vocab_size], name='interest')
    self.history_masking = tf.placeholder(tf.float32, shape=[None, decoder_vocab_size], name='history')
    decoder_cell = self._get_simple_lstm(rnn_size, layer_size)
    self.rnn_init_state = tf.placeholder(tf.float32, [1, rnn_size], name='rnn_state')
    decoder_embedding = tf.Variable(tf.truncated_normal(shape=[decoder_vocab_size, embedding_dim], stddev=0.1),
                                    name='decoder_embedding')
    self.start_tokens = tf.placeholder(tf.int32, shape=[None], name='start_tokens')
    self.start_hit = tf.placeholder(tf.float32, shape=[None], name='start_hit')
    self.mem = tf.placeholder(tf.float32, shape=[None, decoder_vocab_size], name='mem')
    self.sequence_length = tf.placeholder(tf.int32, shape=[None], name='seq_length')
    helper = InteractiveGreedyEmbeddingHelper(decoder_embedding, self._k, self.start_tokens, self.start_hit,
                                              decoder_vocab_size, self.sequence_length)
    with tf.variable_scope('decoder'):
        # The output layer applies softmax, so the decoder's rnn_output
        # holds probabilities rather than raw logits.
        fc_layer = Dense(decoder_vocab_size, activation=tf.nn.softmax)
        decoder = ExternalMemInteractiveDecoder(decoder_cell, helper, self.rnn_init_state, self.history_masking,
                                                self.interesting, self.mem, self.rnn_size, fc_layer)
        self.logits, self.final_state, self.final_history_masking, self.hit, self.final_sequence_lengths = \
            dynamic_interactive_decode(decoder)
    self.hit = self.hit.hit

    # Rewards are computed on the time-reversed hit sequence: a positive hit
    # is scaled by the positive reward, otherwise (hit - 1) is scaled by the
    # negative reward.
    reverse_hit = tf.reverse_sequence(self.hit, self.sequence_length, seq_dim=1)
    self.reverse_imediate_reward = tf.where(reverse_hit > 0, reverse_hit * self.postive_imediate_reward,
                                            (reverse_hit - 1) * self.negative_imediate_reward)
    self.imediate_reward = tf.reverse_sequence(self.reverse_imediate_reward, self.sequence_length, seq_dim=1)

    # Accumulate the discounted reward over the reversed sequence with a
    # while loop. Only row 0 of the reward tensor is read.
    initial_time = tf.constant(0, dtype=tf.int32)
    # Multiplying by 0.0 yields a zero scalar with the reward's dtype.
    initial_pre_reward = self.reverse_imediate_reward[0, 0] * 0.0
    output_ta = tf.TensorArray(dtype=tf.float32, size=1, dynamic_size=True)

    def cond(time, pre_reward, output_ta_l):
        # Continue while `time` is below every entry of sequence_length.
        return tf.reduce_all(time < self.sequence_length)

    def body(time, pre_reward, output_ta_l):
        pre_reward = self.reverse_imediate_reward[0, time] + self.account_ratio * pre_reward
        output_ta_l = output_ta_l.write(time, pre_reward)
        return time + 1, pre_reward, output_ta_l

    res = tf.while_loop(cond, body, loop_vars=[initial_time, initial_pre_reward, output_ta])
    # Flip the accumulated rewards back to forward time order; they are
    # treated as constants during optimization.
    self.cumsum_reward = tf.reverse_sequence([res[-1].stack()], self.sequence_length, seq_dim=1)
    self.cumsum_reward = tf.stop_gradient(self.cumsum_reward)

    self.rnn_output = self.logits.rnn_output
    self.sample_ids = self.logits.sample_id
    self.onehot_sample = tf.one_hot(self.sample_ids, depth=decoder_vocab_size, axis=-1)
    self.target = tf.placeholder(tf.int32, shape=[None, None], name='target')
    self.onehot_target = tf.one_hot(self.target, depth=decoder_vocab_size, axis=-1)
    # Running product of account_ratio along axis 1 (ratio, ratio**2, ...),
    # shaped like cumsum_reward.
    self.gt_ratio = tf.cumprod((self.cumsum_reward * 0 + 1) * self.account_ratio, axis=1)
    self.gt_ratio = tf.stop_gradient(self.gt_ratio)
    self.is_reinforce = tf.placeholder(tf.int32, shape=[], name='isReinfoce')
    # NOTE(review): the [-1, vocab] log-probabilities are multiplied by
    # cumsum_reward and gt_ratio, whose second dimension is time -- confirm
    # the intended broadcasting.
    self.reinforce_cross_entropy = tf.reduce_mean(-tf.reduce_sum(tf.log(1e-8 +
                                                                        tf.reshape(self.rnn_output,
                                                                                   [-1, decoder_vocab_size])) *
                                                                 tf.reshape(self.onehot_sample,
                                                                            [-1, decoder_vocab_size]) *
                                                                 self.cumsum_reward * self.gt_ratio, axis=-1),
                                                  name='reinfolearn')
    # NOTE(review): unlike the loss above, this reduce_sum has no axis (it
    # sums over all elements) and the name is attached to the inner
    # reduce_sum -- confirm this is intended.
    self.supervised_cross_entropy = tf.reduce_mean(-tf.reduce_sum(
        tf.log(1e-8 + tf.reshape(self.rnn_output, [-1, decoder_vocab_size])) *
        tf.reshape(self.onehot_target, [-1, decoder_vocab_size]), name='mem_suplearn'))
    # A positive is_reinforce selects the reinforce loss, otherwise the
    # supervised loss.
    self.cost = tf.cond(self.is_reinforce > 0, lambda: self.reinforce_cross_entropy,
                        lambda: self.supervised_cross_entropy)
    self.train_opt = tf.train.AdamOptimizer(self.lr, epsilon=1e-4)
    gradients = self.train_opt.compute_gradients(self.cost)
    # Clip each gradient element to [-10, 10]; variables without a gradient
    # are skipped.
    capped_gradients = [(tf.clip_by_value(grad, -10., 10.), var) for grad, var in
                        gradients if grad is not None]
    self.train_op = self.train_opt.apply_gradients(capped_gradients)
def ngrams(tokens, ngram_range, separator, name=None):
    """Create a `SparseTensor` of n-grams.

    Given a `SparseTensor` of tokens, returns a `SparseTensor` containing
    the ngrams that can be constructed from each row.

    `separator` is inserted between each pair of tokens, so " " would be an
    appropriate choice if the tokens are words, while "" would be an
    appropriate choice if they are characters.

    Example:

    `tokens` is a `SparseTensor` with

    indices = [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2], [1, 3]]
    values = ['One', 'was', 'Johnny', 'Two', 'was', 'a', 'rat']
    dense_shape = [2, 4]

    If we set
    ngrams_range = (1,3)
    separator = ' '

    output is a `SparseTensor` with

    indices = [[0, 0], [0, 1], [0, 2], ..., [1, 6], [1, 7], [1, 8]]
    values = ['One', 'One was', 'One was Johnny', 'was', 'was Johnny',
              'Johnny', 'Two', 'Two was', 'Two was a', 'was', 'was a',
              'was a rat', 'a', 'a rat', 'rat']
    dense_shape = [2, 9]

    Args:
      tokens: a two-dimensional `SparseTensor` of dtype `tf.string`
        containing tokens that will be used to construct ngrams.
      ngram_range: A pair with the range (inclusive) of ngram sizes to
        return.
      separator: a string that will be inserted between tokens when ngrams
        are constructed.
      name: (Optional) A name for this operation.

    Returns:
      A `SparseTensor` containing all ngrams from each row of the input.

    Raises:
      ValueError: if ngram_range[0] < 1 or ngram_range[1] < ngram_range[0]
    """
    # Implementation outline. Starting from the `SparseTensor`
    #
    # indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [2, 0], [2, 1], [2, 2]]
    # values=['a', 'b', 'c', 'd', 'q', 'x', 'y', 'z']
    # dense_shape=[3, 4]
    #
    # we build shifted copies of the values and of the row-index column,
    # padding the tail so no slice runs off the end. For ngrams up to
    # size 3:
    #
    # shifted_rows[0]=[0, 0, 0, 0, 1, 2, 2, 2]
    # shifted_tokens[0]=['a', 'b', 'c', 'd', 'q', 'x', 'y', 'z']
    #
    # shifted_rows[1]=[0, 0, 0, 1, 2, 2, 2, -1]
    # shifted_tokens[1]=['b', 'c', 'd', 'q', 'x', 'y', 'z', '']
    #
    # shifted_rows[2]=[0, 0, 1, 2, 2, 2, -1, -1]
    # shifted_tokens[2]=['c', 'd', 'q', 'x', 'y', 'z', '', '']
    #
    # Joining shifted_tokens[:k] with `separator` gives the k-grams. A
    # k-gram starting at a position belongs to the row in shifted_rows[0],
    # but some k-grams straddle two rows, so a boolean mask keeps only those
    # whose shifted row indices all match. The ngrams, their row indices and
    # the mask are then combined into the output SparseTensor.
    with tf.name_scope(name, 'ngrams'):
        if ngram_range[0] < 1 or ngram_range[1] < ngram_range[0]:
            raise ValueError('Invalid ngram_range: %r' % (ngram_range,))
        min_n, max_n = ngram_range[0], ngram_range[1]

        def _sliding_windows(values, num_shifts, fill_value):
            # Pad the tail, then take `num_shifts` slices of equal length.
            padded = tf.concat(
                [values, tf.fill([num_shifts - 1], fill_value)], 0)
            windows = []
            for offset in range(num_shifts):
                windows.append(tf.slice(padded, [offset], tf.shape(values)))
            return windows

        shifted_rows = _sliding_windows(
            tokens.indices[:, 0], max_n + 1, tf.constant(-1, dtype=tf.int64))
        shifted_tokens = _sliding_windows(tokens.values, max_n + 1, '')

        # Column j of the stacked result holds the (min_n + j)-grams.
        def _string_join(tensors):
            if not tensors:
                return None
            return tf.string_join(tensors, separator=separator)

        per_size = []
        for k in range(min_n, max_n + 1):
            per_size.append(_string_join(shifted_tokens[:k]))
        ngrams_tensor = tf.stack(per_size, 1)

        # The running product stays 1 only while every shifted row index
        # equals the row index of the first token of the ngram.
        same_row = tf.equal(tf.stack(shifted_rows, 1),
                            tf.expand_dims(shifted_rows[0], 1))
        all_same_so_far = tf.cumprod(tf.to_int32(same_row), axis=1)
        valid_ngram = tf.equal(all_same_so_far, 1)
        valid_ngram = valid_ngram[:, (min_n - 1):max_n]

        # Row index of every candidate ngram, one column per ngram size.
        row_of_ngram = tf.tile(tf.expand_dims(tokens.indices[:, 0], 1),
                               [1, max_n + 1 - min_n])

        # Keep only valid ngrams, then number them within each row.
        row_of_ngram = tf.boolean_mask(row_of_ngram, valid_ngram)
        ngrams_tensor = tf.boolean_mask(ngrams_tensor, valid_ngram)
        position_in_row = segment_indices(row_of_ngram)

        # Clamp at -1 so the second dimension is 0 when there are no ngrams.
        widest_row = tf.maximum(tf.reduce_max(position_in_row), -1) + 1
        out_indices = tf.stack([row_of_ngram, position_in_row], 1)
        out_shape = tf.stack([tokens.dense_shape[0], widest_row])
        return tf.SparseTensor(
            indices=out_indices,
            values=ngrams_tensor,
            dense_shape=out_shape)
def build_Q_expansion_graph(self, obs, first_rewards, first_done, worldmodel, rollout_len=1, model_ensembling=False):
    """Build the graph that expands Q-targets along a world-model rollout.

    Starting from `obs`, `worldmodel` is unrolled for `rollout_len` steps
    inside a `tf.while_loop`, with actions produced by
    `self.build_evalution_graph`. For every (source frame, target frame) pair
    along the rollout a candidate target is formed from the discounted,
    continuation-weighted rewards plus the discounted `self.old_Q` estimate at
    the target frame. The candidates are then summarised by their mean over
    all samples and by a confidence weight.

    Args:
      obs: Observations the rollout starts from.
        NOTE(review): presumably shaped [batch, obs_dim]; when
        `model_ensembling` is set it is tiled to
        [batch, transition_sample_n, obs_dim] -- confirm against callers.
      first_rewards: Rewards that are prepended (along axis 1) to the rewards
        returned by `worldmodel.get_rewards`. Tiled here to
        [batch, transition_sample_n, reward_sample_n].
      first_done: Done flags used as the initial `done` state of the rollout
        loop. Tiled to [batch, transition_sample_n] when `model_ensembling`.
      worldmodel: Object providing `get_ensemble_idx_info`,
        `init_extra_info`, `transition` and `get_rewards`.
      rollout_len: Number of model steps to take; the result covers
        `rollout_len + 1` frames.
      model_ensembling: If True, `ensemble_idxs` is passed to
        `worldmodel.transition`; otherwise the transition outputs are
        averaged (`next_obs` over axis -2, `next_dones` over axis -1).

    Returns:
      A tuple `(target_means, target_confidence, Q_guesses,
      reached_this_point_to_guess_prob)`:
        target_means: Mean of the sampled candidate targets, indexed
          [batch, source frame, target frame].
        target_confidence: Weights for the candidates, masked to the upper
          triangle and normalised to sum to 1 over axis 2.
        Q_guesses, reached_this_point_to_guess_prob: Only computed when
          `self.value_expansion["tdk_trick"]` is truthy, otherwise `None`.
    """
    ### this sets up the machinery for having multiple parallel rollouts, each of which has a single consistent transition
    ensemble_idxs, transition_sample_n, reward_sample_n = worldmodel.get_ensemble_idx_info()
    # Without a Bayesian config there is a single Q sample.
    q_sample_n = self.bayesian_config["eval_sample_count"] if self.bayesian_config is not False else 1
    # Broadcast the first rewards over the transition and reward sample axes.
    first_rewards = tf.tile(
        tf.expand_dims(tf.expand_dims(first_rewards, 1), 1),
        [1, transition_sample_n, reward_sample_n])
    first_rewards.set_shape([None, transition_sample_n, reward_sample_n])
    if model_ensembling:
        # One copy of the start state per transition sample.
        obs = tf.tile(tf.expand_dims(obs, 1), [1, transition_sample_n, 1])
        obs.set_shape([None, transition_sample_n, self.obs_dim])
        first_done = tf.tile(tf.expand_dims(first_done, 1), [1, transition_sample_n])
        first_done.set_shape([None, transition_sample_n])

    ### below, we use a while loop to actually do the iterative model rollout
    extra_info = worldmodel.init_extra_info(obs)

    # One fixed-size TensorArray slot per rollout step.
    action_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)
    obs_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)
    done_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)
    extra_info_ta = tf.TensorArray(size=rollout_len, dynamic_size=False, dtype=tf.float32)

    def rollout_loop_body(r_i, xxx_todo_changeme):
        # The loop state arrives as a single tuple and is unpacked here.
        (obs, done, extra_info, action_ta, obs_ta, dones_ta, extra_info_ta) = xxx_todo_changeme
        # The policy sees the observation through stop_gradient.
        action_pretanh, action = self.build_evalution_graph(
            tf.stop_gradient(obs), get_full_info=True)

        if model_ensembling:
            next_obs, next_dones, next_extra_info = worldmodel.transition(
                obs, action, extra_info, ensemble_idxs=ensemble_idxs)
        else:
            next_obs, next_dones, next_extra_info = worldmodel.transition(
                obs, action, extra_info)
            # Collapse the extra axis of the transition outputs by averaging.
            next_obs = tf.reduce_mean(next_obs, -2)
            next_dones = tf.reduce_mean(next_dones, -1)

        # Record the state this step started from, not the one it produced.
        action_ta = action_ta.write(r_i, action)
        obs_ta = obs_ta.write(r_i, obs)
        dones_ta = dones_ta.write(r_i, done)
        extra_info_ta = extra_info_ta.write(r_i, extra_info)
        return r_i + 1, (next_obs, next_dones, next_extra_info, action_ta, obs_ta, dones_ta, extra_info_ta)

    _, (final_obs, final_done, final_extra_info, action_ta, obs_ta, done_ta, extra_info_ta) = tf.while_loop(
        lambda r_i, _: r_i < rollout_len,
        rollout_loop_body,
        [0, (obs, first_done, extra_info, action_ta, obs_ta, done_ta, extra_info_ta)])

    # Action for the frame reached after the last rollout step.
    final_action_pretanh, final_action = self.build_evalution_graph(
        tf.stop_gradient(final_obs), get_full_info=True)

    ### compile the TensorArrays into useful tensors
    # Stacked arrays are step-major; reshape, then transpose so that axis 0 is
    # the batch and axis 1 is the rollout step.
    obss = obs_ta.stack()
    obss = tf.reshape(
        obss, tf.stack([rollout_len, -1, transition_sample_n, self.obs_dim]))
    obss = tf.transpose(obss, [1, 0, 2, 3])
    final_obs = tf.reshape(
        final_obs, tf.stack([-1, 1, transition_sample_n, self.obs_dim]))
    all_obss = tf.concat([obss, final_obs], 1)
    next_obss = all_obss[:, 1:]

    dones = done_ta.stack()
    dones = tf.reshape(dones, tf.stack([rollout_len, -1, transition_sample_n]))
    dones = tf.transpose(dones, [1, 0, 2])
    final_done = tf.reshape(final_done, tf.stack([-1, 1, transition_sample_n]))
    all_dones = tf.concat([dones, final_done], 1)

    actions = action_ta.stack()
    actions = tf.reshape(
        actions,
        tf.stack([rollout_len, -1, transition_sample_n, self.action_dim]))
    actions = tf.transpose(actions, [1, 0, 2, 3])
    final_action = tf.reshape(
        final_action,
        tf.stack([-1, 1, transition_sample_n, self.action_dim]))
    all_actions = tf.concat([actions, final_action], 1)

    # Running product of (1 - done) along the step axis.
    continue_probs = tf.cumprod(1. - all_dones, axis=1)
    rewards = worldmodel.get_rewards(obss, actions, next_obss)
    # `rawrew` is only referenced by the commented-out debug prints below.
    rawrew = rewards = tf.concat(
        [tf.expand_dims(first_rewards, 1), rewards], 1)

    ### TDK trick means we have to guess at every timestep
    if self.value_expansion["tdk_trick"]:
        guess_info = tf.concat([obss, actions], -1)
        Q_guesses = self.Q(guess_info, reduce_mode="random")
        Q_guesses = tf.reduce_mean(
            Q_guesses, -1
        )  # make it so there's only one guess per rollout length, which is the mean of the guesses under all the various model rollouts
        reached_this_point_to_guess_prob = tf.reduce_mean(
            continue_probs, -1)
    else:
        Q_guesses = None
        reached_this_point_to_guess_prob = None

    ### use the Q function at every timestep to get value estimates
    target_info = tf.concat([all_obss, all_actions], -1)
    Q_targets = self.old_Q(target_info, reduce_mode="none")

    rollout_frames = rollout_len + 1  # if we take N steps, we have N+1 frames

    ### create "decay-exponent matrix" of size [1,ROLLOUT_FRAMES,ROLLOUT_FRAMES,1]. the first ROLLOUT_FRAMES corresponds to the index of the source, the second to the target.
    # ts_count_mat[i, j] == j - i.
    ts_count_mat = (tf.cast(
        tf.reshape(tf.range(rollout_frames), [1, rollout_frames]) -
        tf.reshape(tf.range(rollout_frames), [rollout_frames, 1]),
        tf.float32))
    # matrix_band_part(..., 0, -1) keeps the upper triangle (j >= i) only.
    reward_coeff_matrix = tf.matrix_band_part(
        tf.ones([rollout_frames, rollout_frames]), 0, -1) * self.discount**ts_count_mat
    # Values carry one more factor of the discount than rewards.
    value_coeff_matrix = tf.matrix_band_part(
        tf.ones([rollout_frames, rollout_frames]), 0, -1) * self.discount**(1. + ts_count_mat)
    reward_coeff_matrix = tf.reshape(
        reward_coeff_matrix, [1, rollout_frames, rollout_frames, 1, 1])
    value_coeff_matrix = tf.reshape(
        value_coeff_matrix, [1, rollout_frames, rollout_frames, 1, 1])

    ### similarly, create a "done" matrix
    # continue_probs shifted one step later along axis 1, with ones in front.
    shifted_continue_probs = tf.concat([
        tf.expand_dims(tf.ones_like(continue_probs[:, 0]), 1),
        continue_probs[:, :-1]
    ], 1)
    # Entry [i, j] is the target-frame probability divided by the
    # source-frame probability; 1e-8 guards the division.
    reward_continue_matrix = tf.expand_dims(
        shifted_continue_probs, 1) / tf.expand_dims(
            shifted_continue_probs + 1e-8, 2)
    value_continue_matrix = tf.expand_dims(
        continue_probs, 1) / tf.expand_dims(shifted_continue_probs + 1e-8, 2)
    reward_continue_matrix = tf.expand_dims(reward_continue_matrix, -1)
    value_continue_matrix = tf.expand_dims(value_continue_matrix, -1)

    ### apply the discounting factors to the rewards and values
    rewards = tf.expand_dims(
        rewards, 1) * reward_coeff_matrix * reward_continue_matrix
    # Accumulate the weighted rewards along the target-frame axis.
    rewards = tf.cumsum(rewards, axis=2)
    values = tf.expand_dims(Q_targets, 1) * value_coeff_matrix * value_continue_matrix

    ### compute the targets using the Bellman equation
    sampled_targets = tf.expand_dims(
        rewards, -2) * self.reward_scale + tf.expand_dims(values, -1)

    ### flatten out the various sources of variance (transition, reward, and Q-function ensembles) to get a set of estimates for each candidate target
    sampled_targets = tf.reshape(
        sampled_targets,
        tf.stack([
            -1, rollout_frames, rollout_frames,
            transition_sample_n * reward_sample_n * q_sample_n
        ]))

    ### compute the mean and variance for each candidate target
    target_means, target_variances = tf.nn.moments(sampled_targets, 3)

    ### compute the confidence, either using the full covariance matrix, or approximating all the estimators as independent
    if self.value_expansion["covariances"]:
        targetdiffs = sampled_targets - tf.expand_dims(target_means, 3)
        # Sum over samples of products of deviations (not divided by the
        # sample count).
        target_covariances = tf.einsum(
            "abij,abjk->abik", targetdiffs,
            tf.transpose(targetdiffs, [0, 1, 3, 2]))
        # Solve (cov + 1e-3 * I) x = 1; matrix_band_part(ones, 0, 0) keeps
        # only the diagonal, i.e. it is an identity matrix.
        target_confidence = tf.squeeze(
            tf.matrix_solve(
                target_covariances + tf.expand_dims(
                    tf.expand_dims(
                        tf.matrix_band_part(
                            tf.ones(tf.shape(target_covariances)[-2:]), 0,
                            0) * 1e-3, 0), 0),
                tf.ones(
                    tf.concat([
                        tf.shape(target_covariances)[:-1],
                        tf.constant([1])
                    ], 0))), -1)
    else:
        # Inverse-variance weighting; 1e-8 avoids division by zero.
        target_confidence = 1. / (target_variances + 1e-8)

    ### normalize so weights sum to 1
    # Zero the entries below the diagonal before normalising over axis 2.
    target_confidence *= tf.matrix_band_part(
        tf.ones([1, rollout_frames, rollout_frames]), 0, -1)
    target_confidence = target_confidence / tf.reduce_sum(
        target_confidence, axis=2, keepdims=True)

    ### below here is a bunch of debugging Print statements that I use as a sanity check:
    # target_confidence = tf.Print(target_confidence, [], message="raw rewards")
    # target_confidence = tf.Print(target_confidence, [rawrew[0,:,0,0]], summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [], message="\n", summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [(1. - all_dones)[0,:,0]], message="contin", summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [continue_probs[0,:,0]], message="cum_contin", summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [shifted_continue_probs[0,:,0]], message="shifted contin", summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [], message="reward_coeff")
    # for i in range(rollout_len+1): target_means = tf.Print(target_means, [reward_coeff_matrix[0,i,:,0,0]], summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [], message="reward_continue")
    # for i in range(rollout_len+1): target_means = tf.Print(target_means, [reward_continue_matrix[0,i,:,0,0]], summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [], message="value_coeff")
    # for i in range(rollout_len+1): target_means = tf.Print(target_means, [value_coeff_matrix[0,i,:,0,0]], summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [], message="value_continue")
    # for i in range(rollout_len+1): target_means = tf.Print(target_means, [value_continue_matrix[0,i,:,0,0]], summarize=rollout_len+1)
    # target_confidence = tf.Print(target_confidence, [], message="rewards")
    # for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [rewards[0,i,:,0,0]], summarize=rollout_len+1)
    # target_confidence = tf.Print(target_confidence, [], message="target Qs")
    # target_confidence = tf.Print(target_confidence, [Q_targets[0,:,0,0]], summarize=rollout_len+1)
    # target_confidence = tf.Print(target_confidence, [], message="values")
    # for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [values[0,i,:,0,0]], summarize=rollout_len+1)
    # target_confidence = tf.Print(target_confidence, [], message="target_means")
    # for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [target_means[0,i,:]], summarize=rollout_len+1)
    # target_confidence = tf.Print(target_confidence, [], message="target_variance")
    # for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [target_variances[0,i,:]], summarize=rollout_len+1)
    # target_confidence = tf.Print(target_confidence, [], message="target_confidence")
    # for i in range(rollout_len+1): target_confidence = tf.Print(target_confidence, [target_confidence[0,i,:]], summarize=rollout_len+1)
    # target_means = tf.Print(target_means, [target_confidence, action_lls, tf.shape(Q_targets)], message="\n\n", summarize=10)

    return target_means, target_confidence, Q_guesses, reached_this_point_to_guess_prob
def discounted_reduce_sum(X, discount, axis=-1):
    """Sum `X` along `axis` with geometrically decaying weights.

    When `discount` differs from 1.0, the weights are the (inclusive)
    cumulative product along `axis` of a tensor shaped like `X` and filled
    with `discount`, so the entry at position `k` (0-based) is scaled by
    `discount ** (k + 1)`. When `discount` equals 1.0 the weight is the
    scalar 1.0 and the result is a plain sum.

    Args:
      X: Tensor to reduce.
      discount: Per-step discount factor.
      axis: Axis along which to weight and sum; defaults to the last axis.

    Returns:
      The weighted sum of `X` over `axis`.
    """
    if discount != 1.0:
        # A tensor of `discount` values whose running product yields the
        # successive powers of the discount along `axis`.
        per_step = discount * tf.ones_like(X)
        weights = tf.cumprod(per_step, axis=axis)
    else:
        weights = 1.0
    weighted = X * weights
    return tf.reduce_sum(weighted, axis=axis)