def __call__(self, inputs, states, scope=None):
    """Run one step of the cell using a tensor-train bilinear product.

    Args:
        inputs: input tensor for this step (used in ``tf.matmul`` with a 2-D
            weight, so presumably ``[batch, num_inputs]`` -- TODO confirm).
        states: previous state tensor.
        scope: optional variable scope; defaults to the class name.

    Returns:
        A ``(output, new_state)`` pair; both elements are the same tensor.
    """
    with tf.variable_scope(
            scope or type(self).__name__,
            initializer=tf.random_normal_initializer(stddev=0.01)):
        # get the tensor
        if self._separate_pad:
            t_shape = [self._num_outputs,
                       self._num_outputs,
                       self._num_inputs]
            vec_a = inputs
            vec_b = states
        else:
            # Append a constant column of ones to each operand so the
            # bilinear form also carries the linear and bias terms.
            t_shape = [self._num_outputs + 1,
                       self._num_outputs,
                       self._num_inputs + 1]
            batch_size = inputs.get_shape()[0].value
            vec_a = tf.concat(
                axis=1, values=[inputs, tf.ones([batch_size, 1])])
            # Fix: the original padded `inputs` a second time here, so the
            # previous state never entered the product in this branch.
            vec_b = tf.concat(
                axis=1, values=[states, tf.ones([batch_size, 1])])
        tensor = get_tt_3_tensor(t_shape, self._ranks, name='W')
        result = bilinear_product_tt_3(vec_a, tensor, vec_b)

        if self._separate_pad:
            # TODO possible weightnorm
            # Explicit linear terms and bias, since nothing was padded.
            D = tf.get_variable(
                'D', [self._num_inputs, self._num_outputs],
                initializer=tf.uniform_unit_scaling_initializer(1.2))
            E = tf.get_variable(
                'E', [self._num_outputs, self._num_outputs],
                initializer=tf.uniform_unit_scaling_initializer(1.2))
            b = tf.get_variable(
                'b', [self._num_outputs],
                initializer=tf.constant_initializer(0.0))
            z = tf.nn.bias_add(
                tf.matmul(inputs, D) + tf.matmul(states, E), b)
            result = result + z

        result = self._nonlin(result)
    return result, result
def __init__(
    self,
    num_units,
    activation = simple_act,
    input_weights_init = tf.uniform_unit_scaling_initializer(factor=1.0),
    recc_weights_init = tf.uniform_unit_scaling_initializer(factor=0.1),
    sigma = 1.0,
    update_gate = True,
    dt = 1.0
):
    """Store the cell hyper-parameters and reset all lazily created state.

    A falsy ``sigma`` (``None`` or ``0``) is replaced by ``1.0``.
    """
    # Hyper-parameters.
    self._num_units = num_units
    self._activation = activation
    self._dt = dt
    self._sigma = sigma or 1.0
    self._update_gate = update_gate

    # Variable slots start out empty; nothing is created in the constructor.
    for slot in ("W", "U", "bias",
                 "W_u", "U_u", "bias_u",
                 "W_s", "U_s", "bias_s",
                 "sigma"):
        setattr(self, slot, None)

    # Initializers for the input and recurrent weights.
    self.input_weights_init = input_weights_init
    self.recc_weights_init = recc_weights_init

    self._sensitivity = False
    self.states_info = []
    self.update_info = []
def _init_parameters(self):
    """Create whichever of the W, F and R variables are still unset."""

    def scaled_init(scale):
        # All three variables share the same initializer family.
        return tf.uniform_unit_scaling_initializer(factor=scale)

    if self.W is None:
        w_shape = [self._filters_num + self._num_units, self._num_units]
        self.W = vs.get_variable(
            "W", w_shape, initializer=scaled_init(weight_init_factor))

    if self.F is None:
        self.F = vs.get_variable(
            "F", [L, filters_num],
            initializer=scaled_init(weight_init_factor))

    if self.R is None:
        # R uses half the scale of the other two.
        self.R = vs.get_variable(
            "R", [L, 1],
            initializer=scaled_init(weight_init_factor*0.5))
def testInitializerIdentical(self):
    """Two initializers with the same factor and seed pass `identicaltest`."""
    for use_gpu in (False, True):
        # First the default factor, then an explicit factor of 1.5.
        for factor_args in ((), (1.5,)):
            first = tf.uniform_unit_scaling_initializer(*factor_args, seed=1)
            second = tf.uniform_unit_scaling_initializer(*factor_args, seed=1)
            self.assertTrue(identicaltest(self, first, second, use_gpu))
def testInitializerDifferent(self):
    """Initializers differing in seed or factor fail `identicaltest`."""
    for use_gpu in (False, True):
        seed_one = tf.uniform_unit_scaling_initializer(seed=1)
        seed_two = tf.uniform_unit_scaling_initializer(seed=2)
        scaled = tf.uniform_unit_scaling_initializer(1.5, seed=1)
        # Every pair differs in the seed, the factor, or both.
        pairs = (
            (seed_one, seed_two),
            (seed_one, scaled),
            (seed_two, scaled),
        )
        for left, right in pairs:
            self.assertFalse(identicaltest(self, left, right, use_gpu))
def testInitializerIdentical(self):
    """Same factor, seed and dtype pass `identicaltest` for each dtype."""
    for dtype in (tf.float32, tf.float64):
        # First the default factor, then an explicit factor of 1.5.
        for factor_args in ((), (1.5,)):
            first = tf.uniform_unit_scaling_initializer(
                *factor_args, seed=1, dtype=dtype)
            second = tf.uniform_unit_scaling_initializer(
                *factor_args, seed=1, dtype=dtype)
            self.assertTrue(identicaltest(self, first, second))
def testInitializerDifferent(self):
    """Initializers differing in seed or factor fail `identicaltest`."""
    for dtype in (tf.float32, tf.float64):
        seed_one = tf.uniform_unit_scaling_initializer(seed=1, dtype=dtype)
        seed_two = tf.uniform_unit_scaling_initializer(seed=2, dtype=dtype)
        scaled = tf.uniform_unit_scaling_initializer(
            1.5, seed=1, dtype=dtype)
        # Every pair differs in the seed, the factor, or both.
        pairs = (
            (seed_one, seed_two),
            (seed_one, scaled),
            (seed_two, scaled),
        )
        for left, right in pairs:
            self.assertFalse(identicaltest(self, left, right))
def sharded_variable(name, shape, num_shards, dtype=tf.float32, transposed=False):
    """Create a 2-D variable split row-wise into ``num_shards`` pieces.

    Args:
        name: base variable name; shard ``i`` is named ``name + '_%d' % i``.
        shape: two-element shape ``[rows, cols]`` of the full variable.
        num_shards: number of shards to create.
        dtype: dtype of the shards and of their initializer.
        transposed: kept for interface compatibility only. The original
            code built the same initializer in both branches, so this
            flag has never changed the result.

    Returns:
        A list of ``num_shards`` variables, each shaped
        ``[ceil(rows / num_shards), cols]``.
    """
    # Integer ceiling division; avoids the float round-trip of int(a / b).
    shard_size = (shape[0] + num_shards - 1) // num_shards
    initializer = tf.uniform_unit_scaling_initializer(dtype=dtype)
    return [
        tf.get_variable(name + '_%d' % i,
                        [shard_size, shape[1]],
                        initializer=initializer,
                        dtype=dtype)
        for i in range(num_shards)
    ]
def make_variable(name, shape, initializer, weight_decay=None, lr_mult=1, decay_mult=1):
    """Create a variable, optionally frozen or L2-regularised.

    * ``lr_mult == 0``: the variable is non-trainable and uses ``initializer``.
    * otherwise: the variable uses ``tf.uniform_unit_scaling_initializer()``
      (the ``initializer`` argument is not consulted), and gets an L2
      regularizer of ``weight_decay * decay_mult`` when ``weight_decay`` is
      given.

    Variables with a positive ``lr_mult`` are added to the collection named
    ``str(lr_mult)``.
    """
    if lr_mult == 0:
        var = tf.get_variable(
            name, shape, initializer=initializer, trainable=False)
    else:
        options = {'initializer': tf.uniform_unit_scaling_initializer()}
        if weight_decay is not None:
            options['regularizer'] = tf.contrib.layers.l2_regularizer(
                weight_decay*decay_mult)
        var = tf.get_variable(name, shape, **options)

    if lr_mult > 0:
        tf.add_to_collection(str(lr_mult), var)
    return var
def _init_parameters(self):
    """Create the filter variable "F" of shape [filter_size, input_size, layer_size]."""
    return tf.get_variable(
        "F",
        [self._filter_size, self._input_size, self._layer_size],
        initializer=tf.uniform_unit_scaling_initializer(
            factor=c.weight_init_factor),
    )


def __call__(self, input, state, scope=None):
    """Advance the membrane potential ``u`` and activation ``a`` by one step.

    Args:
        input: tensor convolved with the filter bank ``F``.
        state: pair ``(u, a)`` from the previous step.
        scope: unused.

    Returns:
        ``((new_u, new_a), (new_u, new_a))``.
    """
    ####
    if self._params is None:
        self._params = self._init_parameters()
    x = input
    u, a = state
    F = self._params
    ####
    # Fixes relative to the original:
    #  * tf.conv1d does not exist; the op lives at tf.nn.conv1d.
    #  * tf.nn.conv1d needs a padding argument.
    #    NOTE(review): 'SAME' is chosen so that b and fb keep the time length
    #    of u and a, which `-u + b - fb` requires -- confirm.
    #  * the closing parenthesis of tf.transpose was misplaced, so F was
    #    passed to transpose and matmul received a single argument.
    b = tf.nn.conv1d(x, F, 1, 'SAME')
    # Per-tap Gram matrix F^T F, applied to the activations below.
    Fc = tf.matmul(tf.transpose(F, (0, 2, 1)), F)
    fb = tf.nn.conv1d(a, Fc, 1, 'SAME')

    # Same output as the old Python 2 print statements, valid on 2 and 3.
    print("b %s" % (b.get_shape(),))
    print("Fc %s" % (Fc.get_shape(),))
    print("fb %s" % (fb.get_shape(),))

    du = - u + b - fb
    new_u = u + c.epsilon * du / c.tau
    new_a = tf.nn.relu(new_u - c.lam)
    ####
    return (new_u, new_a), (new_u, new_a)
def testTransformerAutoencoder(self):
    """Build the autoencoder in TRAIN mode and check loss keys, shapes and signs."""
    hparams = imagetransformer_latent_tiny()
    hparams.mode = tf.estimator.ModeKeys.TRAIN

    # Codebook geometry derived from the hyper-parameters.
    block_dim = int(hparams.hidden_size // hparams.num_blocks)
    bits_per_block = hparams.bottleneck_bits / (
        hparams.num_residuals * hparams.num_blocks)
    block_v_size = int(2**bits_per_block)
    means = tf.get_variable(
        name="means",
        shape=[hparams.num_residuals,
               hparams.num_blocks,
               block_v_size,
               block_dim],
        initializer=tf.uniform_unit_scaling_initializer())

    bottleneck_kwargs = dict(
        hidden_size=hparams.hidden_size,
        z_size=hparams.bottleneck_bits,
        filter_size=hparams.filter_size,
        startup_steps=hparams.startup_steps,
        bottleneck_kind=hparams.bottleneck_kind,
        num_blocks=hparams.num_blocks,
        num_residuals=hparams.num_residuals,
        reshape_method=hparams.reshape_method,
        beta=hparams.vq_beta,
        decay=hparams.vq_decay,
        soft_em=hparams.soft_em,
        num_samples=hparams.num_samples,
        epsilon=hparams.vq_epsilon,
        ema=hparams.ema,
        means=means,
    )
    hparams.bottleneck = functools.partial(
        discretization.discrete_bottleneck, **bottleneck_kwargs)

    batch_size = hparams.batch_size
    image_shape = (batch_size,
                   hparams.img_len,
                   hparams.img_len,
                   hparams.hidden_size)
    targets = tf.random_uniform(list(image_shape), minval=-1., maxval=1.)
    tf.train.create_global_step()

    # No inputs and no target-space id are supplied.
    decoder_output, losses, cache = latent_layers.transformer_autoencoder(
        None, targets, None, hparams)

    self.assertEqual(set(six.iterkeys(losses)),
                     {"extra", "extra_loss", "latent_pred"})

    self.evaluate(tf.global_variables_initializer())
    fetches = [decoder_output, losses["extra_loss"], losses["latent_pred"]]
    decoder_output_, extra_loss_, latent_pred_ = self.evaluate(fetches)

    self.assertEqual(decoder_output_.shape, image_shape)
    self.assertEqual(extra_loss_.shape, (batch_size,))
    self.assertEqual(latent_pred_.shape, (batch_size,))
    self.assertAllGreaterEqual(extra_loss_, 0.)
    self.assertAllGreaterEqual(latent_pred_, 0.)
    self.assertEqual(cache, None)
def __call__(self, inputs, states, scope=None):
    """One step: CP bilinear product plus input projection, with a residual state.

    Returns ``(result, result)`` where ``result`` is the non-linearity output
    (optionally layer-normalised) added to ``states``.
    """
    with tf.variable_scope(scope or type(self).__name__):
        # The decomposed tensor lives in its own sub-scope so it can have a
        # dedicated initializer; reuse is inherited from the outer scope.
        with tf.variable_scope('tensor',
                               initializer=init.orthonormal_init(0.5)):
            tensor_dims = [self.input_size, self.output_size, self.state_size]
            tensor = get_cp_tensor(tensor_dims, self.rank, 'W',
                                   weightnorm=False, trainable=True)
        combination = bilinear_product_cp(inputs, tensor, states)

        # Linear projection of the input.
        input_weights = tf.get_variable(
            'U',
            shape=[self.input_size, self._input_projection],
            initializer=tf.uniform_unit_scaling_initializer(1.4))
        input_proj = tf.matmul(inputs, input_weights)

        # Bias applied before the non-linearity.
        bias = tf.get_variable(
            'b',
            shape=[self.output_size],
            initializer=tf.constant_initializer(0.0))

        activations = combination + input_proj + bias
        if self.layernorm == 'pre':
            activations = layer_normalise(activations)

        result = self._nonlinearity(activations)
        if self.layernorm == 'post':
            result = layer_normalise(result)

        # Residual connection to the previous state.
        result = result + states
    return result, result
def FullyConnected(x, out_dim, W_init=None, b_init=None, nl=tf.nn.relu, use_bias=True):
    """
    Fully-Connected layer.

    :param x: a tensor; it is passed through `batch_flatten` first.
    :param out_dim: output dimension
    :param W_init: initializer for W. default to
        `tf.uniform_unit_scaling_initializer(factor=1.43)`.
    :param b_init: initializer for b. default to `tf.constant_initializer()`.
    :param nl: nonlinearity, called as `nl(tensor, name='output')`.
        default to `relu`.
    :param use_bias: whether to use bias. a boolean default to True
    :returns: the result of `nl` applied to the affine map of the flattened input
    """
    flat = batch_flatten(x)
    in_dim = flat.get_shape().as_list()[1]

    W_init = (tf.uniform_unit_scaling_initializer(factor=1.43)
              if W_init is None else W_init)
    b_init = tf.constant_initializer() if b_init is None else b_init

    W = tf.get_variable('W', [in_dim, out_dim], initializer=W_init)
    if use_bias:
        b = tf.get_variable('b', [out_dim], initializer=b_init)
        prod = tf.nn.xw_plus_b(flat, W, b)
    else:
        prod = tf.matmul(flat, W)
    return nl(prod, name='output')
def setup_loss_critic(critic):
    """Attach the squared-error Q loss and its clipped-gradient update op to `critic`.

    Sets ``target_qt``, ``q_loss``, ``gradient_norm``, ``param_norm`` and
    ``updates`` on ``critic``. Starts from ``critic.outputs``.
    """
    with tf.variable_scope(
            "rl", initializer=tf.uniform_unit_scaling_initializer(1.0)):
        # Target scores are fed in with the same layout as critic.outputs
        # ([T, batch_size, vocab_size] according to the original notes).
        critic.target_qt = tf.placeholder(
            tf.float32,
            shape=[None, None, critic.vocab_size],
            name="q_action_score")

        squared_error = tf.square(critic.outputs - critic.target_qt)
        critic.q_loss = tf.reduce_mean(squared_error)
        # Note: the variance term (lambda*C) is not added yet.

        make_optimizer = nlc_model.get_optimizer(FLAGS.optimizer)
        opt = make_optimizer(critic.learning_rate)

        trainable = tf.trainable_variables()
        raw_grads = tf.gradients(critic.q_loss, trainable)
        clipped_grads, _ = tf.clip_by_global_norm(
            raw_grads, FLAGS.max_gradient_norm)

        # The reported norm is that of the unclipped gradients.
        critic.gradient_norm = tf.global_norm(raw_grads)
        critic.param_norm = tf.global_norm(trainable)
        critic.updates = opt.apply_gradients(
            zip(clipped_grads, trainable), global_step=critic.global_step)
def _fully_connected(self, x, out_dim):
    """Flatten `x` to [batch_size, -1] and apply an affine map to `out_dim` units."""
    flat = tf.reshape(x, [self._params.batch_size, -1])
    in_dim = flat.get_shape()[1]
    weights = tf.get_variable(
        'DW',
        [in_dim, out_dim],
        initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
    offsets = tf.get_variable(
        'biases',
        [out_dim],
        initializer=tf.constant_initializer())
    return tf.nn.xw_plus_b(flat, weights, offsets)
def __init__(self,FLAGS):
    """Create the decoder's float64 weight column ``W`` and scalar bias ``b``."""
    # Q: an LSTM could be used in the decoder too, but it may be better not
    # to increase the number of parameters too much.
    self.state_size = FLAGS.state_size
    self.maxSentenceLength = FLAGS.maxSentenceLength

    xavier = tf.contrib.layers.xavier_initializer()
    with vs.variable_scope("decoder", initializer=xavier):
        # W picks up the scope's Xavier initializer.
        self.W = tf.get_variable(
            "W", shape=(self.state_size, 1), dtype=tf.float64)
        # b overrides it with unit scaling.
        self.b = tf.get_variable(
            "b",
            shape=(1,),
            dtype=tf.float64,
            initializer=tf.uniform_unit_scaling_initializer(1.0))
def _fully_connected(self, x, out_dim, name=''):
    """Affine layer on the flattened input, created inside variable scope `name`.

    The variable names are additionally prefixed with `name`
    (``name + 'DW'`` and ``name + 'biases'``).
    """
    with tf.variable_scope(name):
        flat = tf.reshape(x, [self._batch_size, -1])
        in_dim = flat.get_shape()[1]
        weights = tf.get_variable(
            name+'DW',
            [in_dim, out_dim],
            initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
        offsets = tf.get_variable(
            name+'biases',
            [out_dim],
            initializer=tf.constant_initializer())
        return tf.nn.xw_plus_b(flat, weights, offsets)
def _fully_connected(self, x, out_dim):
    """Flatten `x` to [batch_size, -1] and return ``x * w + b``."""
    # Reshape the input into a 2-D tensor of shape [N, -1].
    flat = tf.reshape(x, [self.hps.batch_size, -1])
    in_dim = flat.get_shape()[1]
    # Weights w: uniform init in [-sqrt(3/dim), sqrt(3/dim)] * factor.
    weights = tf.get_variable(
        'DW',
        [in_dim, out_dim],
        initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
    # Bias b: created with tf.constant_initializer() defaults.
    offsets = tf.get_variable(
        'biases',
        [out_dim],
        initializer=tf.constant_initializer())
    # Compute x * w + b.
    return tf.nn.xw_plus_b(flat, weights, offsets)
def __call__(self, inputs, states, scope=None):
    """One step of the cell: CP bilinear product of state and input.

    With ``_separate_pad`` the linear terms and bias are separate variables;
    otherwise both operands get a column of ones appended.
    Returns ``(result, result)``.
    """
    with tf.variable_scope(scope or type(self).__name__,
                           initializer=init.spectral_normalised_init(0.5)):
        # Pick the operands and the shape of the decomposed tensor.
        if self._separate_pad:
            shape = [self._num_units, self._num_units, self._num_inputs]
            vec_a, vec_b = states, inputs
        else:
            shape = [self._num_units+1, self._num_units, self._num_inputs+1]
            batch = inputs.get_shape()[0].value
            vec_b = tf.concat(
                axis=1, values=[inputs, tf.ones([batch, 1])])
            batch = inputs.get_shape()[0].value
            vec_a = tf.concat(
                axis=1, values=[states, tf.ones([batch, 1])])

        tensor = get_cp_tensor(shape, self._rank, 'W',
                               weightnorm=self._weightnorm)
        result = bilinear_product_cp(vec_a, tensor, vec_b)

        if self._separate_pad:
            # TODO: use the new handy things
            in_shape = [self._num_inputs, self._num_units]
            rec_shape = [self._num_units, self._num_units]
            if self._weightnorm:
                in_weights = get_weightnormed_matrix(
                    in_shape, name='input_weights')
                rec_weights = get_weightnormed_matrix(
                    rec_shape,
                    name='recurrent_weights',
                    V_init=init.identity_initializer())
            else:
                in_weights = tf.get_variable(
                    'input_weights', in_shape, tf.float32,
                    initializer=tf.uniform_unit_scaling_initializer())
                rec_weights = tf.get_variable(
                    'recurrent_weights', rec_shape, tf.float32,
                    initializer=init.identity_initializer())
            bias = tf.get_variable(
                'bias', [self._num_units],
                initializer=tf.constant_initializer(0.0))
            linear_part = tf.nn.bias_add(
                tf.matmul(vec_a, rec_weights) + tf.matmul(vec_b, in_weights),
                bias)
            result += linear_part

        result = self._nonlinearity(result)
    return result, result
def getModel(input):
    """Two stacked sigmoid layers applied as ``W . input + b``.

    Reads the layer sizes from the enclosing ``self.config`` and stores the
    first weight matrix on ``self._test``.
    """
    # 2-layer NN
    with tf.variable_scope(
            "NN",
            initializer=tf.uniform_unit_scaling_initializer(factor=1.15)):
        # First layer parameters.
        W_1 = tf.get_variable(
            "W_1", [self.config.num_hidden_1, input.get_shape()[0]])
        self._test = W_1
        b_1 = tf.get_variable("b_1", [self.config.num_hidden_1, 1])
        # Second layer parameters.
        W_2 = tf.get_variable(
            "W_2", [self.config.num_hidden_2, self.config.num_hidden_1])
        b_2 = tf.get_variable("b_2", [self.config.num_hidden_2, 1])

        hidden = tf.sigmoid(tf.matmul(W_1, input) + b_1)
        output = tf.sigmoid(tf.matmul(W_2, hidden) + b_2)
    return output
def __init__(self, embedding_dim, num_embeddings, commitment_cost, name='vq_layer'):
    """Store the quantizer settings and create the trainable 'embedding' variable.

    The variable has shape ``[embedding_dim, num_embeddings]``.
    """
    super(VectorQuantizer, self).__init__(name=name)
    self._embedding_dim = embedding_dim
    self._num_embeddings = num_embeddings
    self._commitment_cost = commitment_cost

    with self._enter_variable_scope():
        self._w = tf.get_variable(
            'embedding',
            [embedding_dim, num_embeddings],
            initializer=tf.uniform_unit_scaling_initializer(),
            trainable=True)
def fc(inputs, w_shape, b_shape):
    """Fully connected layer: ``inputs . weights + bias``.

    Args:
        inputs: tensor multiplied on the left of the weight matrix.
        w_shape: shape of the "weights" variable.
        b_shape: shape of the "bias" variable; it must broadcast against
            the matmul result.

    Returns:
        The affine output tensor.
    """
    w = tf.get_variable(
        "weights",
        w_shape,
        initializer=tf.truncated_normal_initializer(
            dtype=tf.float32, stddev=0.36),
        regularizer=tf.nn.l2_loss)
    b = tf.get_variable(
        "bias",
        b_shape,
        initializer=tf.uniform_unit_scaling_initializer(
            factor=0.1, seed=10, dtype=tf.float32))
    # Fix: the bias variable was created but never added to the output,
    # leaving a trainable variable with no gradient.
    return tf.matmul(inputs, w) + b
def init_models(eval_config, rnn_config, sess):
    """Build the train/valid/test/generation models and the session helpers.

    Args:
        eval_config: config used for the test and image-generation models.
        rnn_config: config used for the training and validation models.
        sess: session passed to `load_alexnet` and `saver.restore`.

    Returns:
        ``(global_step_tensor, m, merged, mtest, mvalid, mgen, image_gen,
        image_train, saver, writer)``.
    """
    # NOTE(review): the extent of the two `with` blocks below was
    # reconstructed from a flattened source -- confirm against the original.
    global_step_tensor = tf.Variable(0, trainable=False, name='global_step')
    print('Creating rnn model')
    # No scope-level initializer when restoring from a checkpoint.
    if flags.FLAGS.restore:
        initializer = None
    else:
        initializer = tf.uniform_unit_scaling_initializer()
    # First pass through the "model" scope (reuse=None).
    with tf.variable_scope("model", reuse=None, initializer=initializer):
        train_image_tensor = tf.placeholder(
            np.float32,
            (rnn_config.batch_size, rnn_config.image_size,
             rnn_config.image_size, 3),
            'input_image')
        m = MultiModal(is_training=True, config=rnn_config,
                       image_tensor=train_image_tensor,
                       global_step_tensor=global_step_tensor)
        m.load_alexnet('models/alexnet_weights.npy', sess)
    # Not referenced again in this function.
    variables_to_save = tf.trainable_variables() + [global_step_tensor]
    #print(variables_to_save)
    # The remaining models re-enter the same scope with reuse=True.
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        mvalid = MultiModal(is_training=False, config=rnn_config,
                            image_tensor=train_image_tensor,
                            global_step_tensor=global_step_tensor)
        test_image_tensor = tf.placeholder(
            np.float32,
            (eval_config.batch_size, eval_config.image_size,
             eval_config.image_size, 3),
            'test_input_image')
        mtest = MultiModal(is_training=False, config=eval_config,
                           image_tensor=test_image_tensor,
                           global_step_tensor=global_step_tensor)
        # Trainable image batch: zeros everywhere except the first entry,
        # which is loaded from disk.
        initial_value = np.zeros(
            (eval_config.batch_size, eval_config.image_size,
             eval_config.image_size, 3)).astype(np.float32)
        initial_value[0,:,:,:] = skimage.img_as_float(
            skimage.io.imread('data/test_image.jpg'))
        image_gen = tf.Variable(initial_value, trainable=True)
        mgen = MultiModal(is_training=False, config=eval_config,
                          image_tensor=image_gen,
                          global_step_tensor=global_step_tensor)
    # Optimise the image itself against the generation model's cost.
    gradients = tf.gradients(mgen.cost, [image_gen])
    print(gradients)
    optimizer = tf.train.AdamOptimizer(0.1)
    image_train = optimizer.apply_gradients(zip(gradients, [image_gen]))
    merged = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter("logs/")
    saver = tf.train.Saver()
    # Initialise everything first; a checkpoint, if found, overwrites it.
    tf.initialize_all_variables().run()
    if flags.FLAGS.restore:
        checkpoint = tf.train.latest_checkpoint(os.path.abspath('ckpts/'))
        if checkpoint:
            print('Restoring from checkpoint: {}'.format(checkpoint))
            saver.restore(sess, checkpoint)
    return global_step_tensor, m, merged, mtest, mvalid, mgen, image_gen, image_train, saver, writer
def conv1d_log(x, num_filters, filter_length, name, dilation=1, causal=True, kernel_initializer=tf.uniform_unit_scaling_initializer(1.0), biases_initializer=tf.constant_initializer(0.0)):
    """Fast 1D convolution with causal padding and dilation, plus parameter means.

    Args:
      x: The [mb, time, channels] float tensor that we convolve.
      num_filters: The number of filter maps in the convolution.
      filter_length: The integer length of the filter.
      name: The name of the scope for the variables.
      dilation: The amount of dilation.
      causal: Whether or not this is a causal convolution.
      kernel_initializer: The kernel initialization function.
      biases_initializer: The biases initialization function.

    Returns:
      y: The output of the 1D convolution.
      W_mean: The mean of the kernel variable.
      biases_mean: The mean of the biases variable.
    """
    batch_size, length, num_input_channels = x.get_shape().as_list()
    assert length % dilation == 0

    with tf.variable_scope(name):
        weights = tf.get_variable(
            'W',
            shape=[1, filter_length, num_input_channels, num_filters],
            initializer=kernel_initializer)
        biases = tf.get_variable(
            'biases', shape=[num_filters], initializer=biases_initializer)

    x_ttb = time_to_batch(x, dilation)
    if causal:
        # Causal: pad on the left only and convolve without extra padding.
        padding = 'VALID'
        if filter_length > 1:
            x_ttb = tf.pad(x_ttb, [[0, 0], [filter_length - 1, 0], [0, 0]])
    else:
        padding = 'SAME'

    W_mean = tf.reduce_mean(weights)
    biases_mean = tf.reduce_mean(biases)

    # Run the 1-D convolution as a 2-D one with a height of 1.
    ttb_dims = x_ttb.get_shape().as_list()
    x_4d = tf.reshape(x_ttb, [ttb_dims[0], 1, ttb_dims[1], num_input_channels])
    y = tf.nn.conv2d(x_4d, weights, [1, 1, 1, 1], padding=padding)
    y = tf.nn.bias_add(y, biases)

    out_dims = y.get_shape().as_list()
    y = tf.reshape(y, [out_dims[0], out_dims[2], num_filters])
    y = batch_to_time(y, dilation)
    y.set_shape([batch_size, length, num_filters])
    return y, W_mean, biases_mean
def __init__(self, FLAGS):
    """Create the classifier's float64 weight column ``U`` and scalar bias ``b``."""
    self.numClasses = FLAGS.numClasses
    self.maxSentenceLength = FLAGS.maxSentenceLength

    xavier = tf.contrib.layers.xavier_initializer()
    with vs.variable_scope("classifier", initializer=xavier):
        # Single-output head. An earlier, disabled variant sized U as
        # (maxSentenceLength, numClasses) and b as (numClasses,).
        self.U = tf.get_variable(
            "U", shape=(self.maxSentenceLength, 1), dtype=tf.float64)
        self.b = tf.get_variable(
            "b",
            shape=(1,),
            dtype=tf.float64,
            initializer=tf.uniform_unit_scaling_initializer(1.0))
def _fully_connected(self, x, out_dim):
    """Final affine layer; the batch size is taken dynamically from `x`."""
    # Static product of every dimension except the leading (batch) one.
    flat_width = 1
    for axis in range(1, len(x.shape)):
        flat_width *= int(x.shape[axis])

    flat = tf.reshape(x, [tf.shape(x)[0], -1])
    weights = tf.get_variable(
        'DW',
        [flat_width, out_dim],
        initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
    offsets = tf.get_variable(
        'biases',
        [out_dim],
        initializer=tf.constant_initializer())
    return tf.nn.xw_plus_b(flat, weights, offsets)
def _fully_connected(self, x, out_dim):
    """Final affine layer; also exposes its operands on the instance.

    `x` is used as given (no reshape). The input, weights and biases are
    stored as ``self.fc_x``, ``self.fc_w`` and ``self.fc_b``.
    """
    in_dim = x.get_shape()[1]
    weights = tf.get_variable(
        'DW',
        [in_dim, out_dim],
        initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
    offsets = tf.get_variable(
        'biases',
        [out_dim],
        initializer=tf.constant_initializer())
    self.fc_x, self.fc_w, self.fc_b = x, weights, offsets
    return tf.nn.xw_plus_b(x, weights, offsets)
def _fc_layer(self, input_tensor, n_out, n_in=None, activation=tf.identity):
    """
    Fully connected layer: ``activation(input_tensor . W + b)``.

    :param input_tensor: tensor multiplied with the weight matrix
    :param n_out: int, the number of output units
    :param n_in: int, the number of input units; taken from the last static
        dimension of `input_tensor` when omitted
    :param activation: activation function, identity by default
    """
    if n_in is None:
        n_in = input_tensor.get_shape().as_list()[-1]

    weights = self._get_variable(
        "fc_weight",
        [n_in, n_out],
        initializer=tf.uniform_unit_scaling_initializer(factor=1.0),
        is_fc_layer=True)
    biases = self._get_variable(
        "fc_bias",
        [n_out,],
        initializer=tf.zeros_initializer,
        is_fc_layer=True)

    pre_activation = tf.matmul(input_tensor, weights) + biases
    return activation(pre_activation)
def __init__(self, vocab_size, size, num_layers, max_gradient_norm, batch_size, learning_rate, learning_rate_decay_factor, dropout, FLAGS, forward_only=False, optimizer="adam"):
    """Build the model graph, and the training ops unless `forward_only`."""
    # Static configuration.
    self.size = size
    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.num_layers = num_layers
    self.keep_prob_config = 1.0 - dropout

    # Learning-rate bookkeeping.
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    decayed_rate = self.learning_rate * learning_rate_decay_factor
    self.learning_rate_decay_op = self.learning_rate.assign(decayed_rate)
    self.global_step = tf.Variable(0, trainable=False)

    # Feeds.
    self.keep_prob = tf.placeholder(tf.float32)
    self.source_tokens = tf.placeholder(tf.int32, shape=[None, None])
    self.target_tokens = tf.placeholder(tf.int32, shape=[None, None])
    self.source_mask = tf.placeholder(tf.int32, shape=[None, None])
    self.target_mask = tf.placeholder(tf.int32, shape=[None, None])
    self.beam_size = tf.placeholder(tf.int32)
    self.target_length = tf.reduce_sum(self.target_mask, reduction_indices=0)
    self.FLAGS = FLAGS

    # One decoder-state placeholder per layer; outputs start out empty.
    self.decoder_state_input = [
        tf.placeholder(tf.float32, shape=[None, size])
        for _ in xrange(num_layers)
    ]
    self.decoder_state_output = []

    with tf.variable_scope(
            "NLC", initializer=tf.uniform_unit_scaling_initializer(1.0)):
        self.setup_embeddings()
        self.setup_encoder()
        self.setup_decoder()
        self.setup_loss()
        self.setup_beam()

    params = tf.trainable_variables()
    if not forward_only:
        opt = get_optimizer(optimizer)(self.learning_rate)
        raw_grads = tf.gradients(self.losses, params)
        clipped_grads, _ = tf.clip_by_global_norm(raw_grads, max_gradient_norm)
        # The reported norm is that of the unclipped gradients.
        self.gradient_norm = tf.global_norm(raw_grads)
        self.param_norm = tf.global_norm(params)
        self.updates = opt.apply_gradients(
            zip(clipped_grads, params), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.keep)
def linear(args, output_size, bias, bias_start=0.0, scope=None):
    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

    Args:
        args: a 2D Tensor or a list of 2D, batch x n, Tensors.
        output_size: int, second dimension of W[i].
        bias: boolean, whether to add a bias term or not. ``None`` is
            treated like ``False``.
        bias_start: starting value to initialize the bias; 0 by default.
        scope: VariableScope for the created subgraph; defaults to "Linear".

    Returns:
        A 2D Tensor with shape [batch x output_size] equal to
        sum_i(args[i] * W[i]), where W[i]s are newly created matrices.

    Raises:
        ValueError: if some of the arguments has unspecified or wrong shape.
    """
    if not isinstance(args, (list, tuple)):
        args = [args]

    # Calculate the total size of arguments on dimension 1.
    total_arg_size = 0
    shapes = [a.get_shape().as_list() for a in args]
    for shape in shapes:
        if len(shape) != 2:
            raise ValueError(
                "Linear is expecting 2D arguments: %s" % str(shapes))
        if not shape[1]:
            raise ValueError(
                "Linear expects shape[1] of arguments: %s" % str(shapes))
        total_arg_size += shape[1]

    # Now the computation.
    with tf.variable_scope(scope or "Linear"):
        matrix = tf.get_variable(
            "Matrix", [total_arg_size, output_size],
            initializer=tf.uniform_unit_scaling_initializer())
        if len(args) == 1:
            res = tf.matmul(args[0], matrix)
        else:
            # NOTE(review): positional (dim, values) order is the pre-1.0
            # tf.concat signature; left as-is to match the file's TF version.
            res = tf.matmul(tf.concat(1, args), matrix)
        # Fix: `bias` is documented as a boolean, but the original tested
        # `bias is None`, so passing False still added a bias term.
        if not bias:
            return res
        bias_term = tf.get_variable(
            "Bias", [output_size],
            initializer=tf.constant_initializer(bias_start))
    return res + bias_term
def __init__(self, encoder, *args):
    """
    Initializes your System

    :param encoder: an encoder that you constructed in train.py
    :param args: positional extras read by index: ``args[0]`` is the config
        object, ``args[1]`` the pretrained embeddings and ``args[2]`` the
        number of examples per epoch
    """
    # NOTE(review): the extent of the `with` block below was reconstructed
    # from a flattened source -- confirm against the original.
    self.encoder = encoder
    # ==== set up placeholder tokens ========
    # TMP TO REMOVE START
    self.config = args[0]  # FLAG
    self.pretrained_embeddings = args[1]  # embeddings
    self.num_per_epoch = args[2]
    # self.saver = args[2]
    # max_question_length = 66
    # max_context_length = 35
    # embedding_size = 50
    # label_size = 2
    # TMP TO REMOVE END
    self.question_placeholder = tf.placeholder(
        tf.int64,
        (None, self.config.max_question_length, self.config.n_features))
    print(self.question_placeholder)
    self.question_length_placeholder = tf.placeholder(tf.int64, (None, ))
    self.context_placeholder = tf.placeholder(
        tf.int64,
        (None, self.config.max_context_length, self.config.n_features))
    self.context_length_placeholder = tf.placeholder(tf.int64, (None, ))
    self.start_labels_placeholder = tf.placeholder(tf.int64, (None, ))
    self.end_labels_placeholder = tf.placeholder(tf.int64, (None, ))
    self.mask_placeholder = tf.placeholder(
        tf.float32, (None, self.config.max_context_length))
    # ==== assemble pieces ====
    with tf.variable_scope(
            "qa", initializer=tf.uniform_unit_scaling_initializer(1.0)):
        self.setup_embeddings()
        # self.preds = self.setup_system()
        u_pred_s, u_pred_e = self.setup_system()
        self.preds = (self.exp_mask(u_pred_s), self.exp_mask(u_pred_e)
                      )  # mask the start and end predictions
        self.loss = self.setup_loss(self.preds)
    # ==== set up training/updating procedure ====
    optfn = get_optimizer(self.config.optimizer)
    self.global_step = tf.contrib.framework.get_or_create_global_step()
    # Decay interval in steps: batches per epoch times epochs per decay.
    num_batches_per_epoch = (self.num_per_epoch / self.config.batch_size)
    self.decay_steps = int(num_batches_per_epoch *
                           self.config.num_epochs_per_decay)
    # Decay the learning rate exponentially based on the number of steps.
    self.lr = tf.train.exponential_decay(
        self.config.learning_rate,
        self.global_step,
        self.decay_steps,
        self.config.learning_rate_decay_factor,
        staircase=True)
    tf.summary.scalar('learning_rate', self.lr)
    # Merges the summaries registered up to this point.
    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
    self.summary_op = tf.summary.merge(summaries)
    self.train_op = optfn(self.lr).minimize(self.loss,
                                            global_step=self.global_step)
    self.saver = tf.train.Saver()
def add_embeddings(self):
    """Look up `self.q`, `self.aplus` and `self.aminus` in one shared embedding table.

    Returns the three lookups in that order.
    """
    with tf.variable_scope('embedding'):
        table_shape = [self.config.vocab_size, self.config.embedding_size]
        table = tf.get_variable(
            'embeddings',
            shape=table_shape,
            initializer=tf.uniform_unit_scaling_initializer())
        q_vecs = tf.nn.embedding_lookup(table, self.q)
        pos_vecs = tf.nn.embedding_lookup(table, self.aplus)
        neg_vecs = tf.nn.embedding_lookup(table, self.aminus)
        return q_vecs, pos_vecs, neg_vecs
def add_embeddings(self):
    """Look up `self.q` and `self.a` in the embedding table.

    The table is a frozen copy of `self.config.embeddings` when that is set,
    and a freshly initialised trainable variable otherwise.
    """
    with tf.variable_scope('embedding'):
        if self.config.embeddings is None:
            table = tf.get_variable(
                'embeddings',
                shape=[self.config.vocab_size, self.config.embedding_size],
                initializer=tf.uniform_unit_scaling_initializer())
        else:
            table = tf.Variable(
                self.config.embeddings, name="embeddings", trainable=False)
        q_vecs = tf.nn.embedding_lookup(table, self.q)
        a_vecs = tf.nn.embedding_lookup(table, self.a)
        return q_vecs, a_vecs
def __init__(self, encoder, decoder, embed_path):
    """
    Initializes your System

    :param encoder: an encoder that you constructed in train.py
    :param decoder: a decoder that you constructed in train.py
    :param embed_path: stored on the instance as ``self.embed_path``
    """
    # NOTE(review): the extent of the `with` block below was reconstructed
    # from a flattened source -- confirm against the original.
    self.encoder = encoder
    self.decoder = decoder
    self.embed_path = embed_path
    # ==== set up placeholder tokens ========
    self.context = tf.placeholder(tf.int32, shape=(None, context_max_len))
    self.context_m = tf.placeholder(tf.bool, shape=(None, context_max_len))
    self.question = tf.placeholder(tf.int32, shape=(None, question_max_len))
    self.question_m = tf.placeholder(tf.bool, shape=(None, question_max_len))
    self.answer_s = tf.placeholder(tf.int32, shape=(None, ))
    self.answer_e = tf.placeholder(tf.int32, shape=(None, ))
    # ==== assemble pieces ====
    with tf.variable_scope(
            "qa", initializer=tf.uniform_unit_scaling_initializer(1.0)):
        self.setup_embeddings()
        self.setup_system()
        self.setup_loss()
    # ==== set up training/updating procedure ====
    self.global_step = tf.Variable(cfg.start_steps, trainable=False)
    # The starting learning rate is fed at run time.
    self.starter_learning_rate = tf.placeholder(tf.float32, name='start_lr')
    # Staircase decay: multiply by 0.9 every 1000 steps.
    learning_rate = tf.train.exponential_decay(
        self.starter_learning_rate, self.global_step, 1000, 0.9,
        staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)
    self.optimizer = tf.train.AdamOptimizer(learning_rate)
    # grad_var = self.optimizer.compute_gradients(self.final_loss)
    # grad = [i[0] for i in grad_var]
    # var = [i[1] for i in grad_var]
    # self.grad_norm = tf.global_norm(grad)
    # tf.summary.scalar('grad_norm', self.grad_norm)
    # grad, use_norm = tf.clip_by_global_norm(grad, max_grad_norm)
    #
    # self.train_op = self.optimizer.apply_gradients(zip(grad, var), global_step=self.global_step)
    gradients = self.optimizer.compute_gradients(self.final_loss)
    # Clip each gradient element-wise to [-clip_by_val, clip_by_val].
    capped_gvs = [(tf.clip_by_value(grad, -clip_by_val, clip_by_val), var)
                  for grad, var in gradients]
    # The summarised norm is that of the unclipped gradients.
    grad = [x[0] for x in gradients]
    self.grad_norm = tf.global_norm(grad)
    tf.summary.scalar('grad_norm', self.grad_norm)
    self.train_op = self.optimizer.apply_gradients(
        capped_gvs, global_step=self.global_step)
    self.saver = tf.train.Saver()
    self.merged = tf.summary.merge_all()
def encoder(self, is_training=False, hidden_layers=5, kernel_size=3, channels=[200] * 5, dropout_emb=0.2, dropout_hidden=0.2, use_wn=True, use_bn=False):
    """Stack of gated 1-D convolutions over ``self.inputs``.

    Each layer computes ``w * sigmoid(v)`` from two parallel convolutions,
    masks padded positions and optionally applies dropout. The final hidden
    output is stored in ``self.fc1``; ``self.inputs`` is replaced by its
    dropout-wrapped version when ``dropout_emb`` is non-zero.

    Args:
        is_training: not read by this method; the train/eval switch is the
            ``self.is_train`` tensor.
        hidden_layers: number of gated convolution layers.
        kernel_size: filter width of every layer.
        channels: output channels per layer, indexed by layer number
            (never mutated here).
        dropout_emb: dropout rate on the input; falsy disables it.
        dropout_hidden: dropout rate on each layer output; falsy disables it.
        use_wn: apply weight normalisation to both filters.
        use_bn: apply batch normalisation to both branches.
    """
    # Define the encoder
    with tf.variable_scope(
            self.scope,
            reuse=self.reuse,
            initializer=tf.uniform_unit_scaling_initializer()):
        # NOTE: the mask length is hard-coded to 64 time steps.
        masks = tf.cast(tf.sequence_mask(self.seq_lengths, maxlen=64),
                        FLOAT_TYPE)

        # Dropout on embedding output.
        if dropout_emb:
            self.inputs = tf.cond(
                self.is_train,
                lambda: tf.nn.dropout(self.inputs, 1 - dropout_emb),
                lambda: self.inputs)

        hidden_output = self.inputs
        pre_channels = self.inputs.get_shape()[-1].value

        for i in range(hidden_layers):
            k = kernel_size
            cur_channels = channels[i]
            filter_shape = [k, pre_channels, cur_channels]

            filter_w = tf.get_variable(
                'filter_w_%d' % i, shape=filter_shape, dtype=FLOAT_TYPE)
            filter_v = tf.get_variable(
                'filter_v_%d' % i, shape=filter_shape, dtype=FLOAT_TYPE)
            bias_b = tf.get_variable(
                'bias_b_%d' % i, shape=[cur_channels],
                initializer=tf.zeros_initializer(dtype=FLOAT_TYPE))
            bias_c = tf.get_variable(
                'bias_c_%d' % i, shape=[cur_channels],
                initializer=tf.zeros_initializer(dtype=FLOAT_TYPE))

            # Weight normalization.
            if use_wn:
                epsilon = 1e-12
                g_w = tf.get_variable(
                    'g_w_%d' % i, shape=[k, 1, cur_channels],
                    dtype=FLOAT_TYPE)
                g_v = tf.get_variable(
                    'g_v_%d' % i, shape=[k, 1, cur_channels],
                    dtype=FLOAT_TYPE)
                # Perform wn
                filter_w = g_w * filter_w / (tf.sqrt(
                    tf.reduce_sum(filter_w**2, 1, keep_dims=True)) + epsilon)
                filter_v = g_v * filter_v / (tf.sqrt(
                    tf.reduce_sum(filter_v**2, 1, keep_dims=True)) + epsilon)

            w = tf.nn.conv1d(hidden_output, filter_w, 1, 'SAME') + bias_b
            v = tf.nn.conv1d(hidden_output, filter_v, 1, 'SAME') + bias_c

            if use_bn:
                # Fix: the inputs were swapped (w = BN(v), v = BN(w)), which
                # discarded the filter_w branch. Each branch now normalises
                # its own activations.
                w = layers.batch_norm(inputs=w, decay=0.9,
                                      is_training=self.is_train,
                                      center=True, scale=True,
                                      scope='BatchNorm_w_%d' % i)
                v = layers.batch_norm(inputs=v, decay=0.9,
                                      is_training=self.is_train,
                                      center=True, scale=True,
                                      scope='BatchNorm_v_%d' % i)

            # Gated linear unit.
            hidden_output = w * tf.nn.sigmoid(v)

            # Mask paddings.
            hidden_output = hidden_output * tf.expand_dims(masks, -1)

            # Dropout on hidden output.
            if dropout_hidden:
                hidden_output = tf.cond(
                    self.is_train,
                    lambda: tf.nn.dropout(hidden_output, 1 - dropout_hidden),
                    lambda: hidden_output)

            pre_channels = cur_channels

        self.fc1 = hidden_output
def init_embedding(self):
    """Create the user, item and aspect embedding variables.

    The variables are stored in the dict ``self.embedding`` under the keys
    ``user_embedding``, ``item_embedding`` and ``aspect_embedding``. Each is
    a float32 variable with ``self.dim`` columns, initialized with a
    unit-scaling uniform initializer (factor 1.0).
    """
    # (variable name, number of rows), in creation order.
    # NOTE(review): the aspect table is sized by self.num_user, exactly as in
    # the original code -- confirm this is intended and not an aspect count.
    table_rows = (
        ("user_embedding", self.num_user),
        ("item_embedding", self.num_item),
        ("aspect_embedding", self.num_user),
    )

    self.embedding = {}
    for table_name, rows in table_rows:
        self.embedding[table_name] = tf.get_variable(
            table_name,
            shape=(rows, self.dim),
            dtype=tf.float32,
            initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
def create_model(hps, vocab_size, classes_size):
    """Build a hand-written single-layer LSTM text classifier graph.

    Pipeline: embedding lookup -> LSTM unrolled over ``hps.encoded_length``
    steps -> dense ReLU layer with dropout -> dense logits -> softmax
    cross-entropy loss, trained with Adam on globally-clipped gradients.

    Args:
        hps: Hyper-parameter object. Reads ``encoded_length``, ``batch_size``,
            ``embedding_size``, ``num_lstm_nodes`` (indexable),
            ``num_fc_nodes``, ``clip_lstm_grads`` and ``learning_rate``.
        vocab_size: Number of rows of the embedding matrix.
        classes_size: Number of output classes (width of the logits).

    Returns:
        A 3-tuple ``((inputs, outputs, keep_prob), (loss, accuracy),
        (train_op, global_step))`` of placeholders, metric tensors and
        training ops.
    """
    # Input definitions.
    encoded_length = hps.encoded_length
    batch_size = hps.batch_size
    num_nodes = hps.num_lstm_nodes[0]

    inputs = tf.placeholder(tf.int32, (batch_size, encoded_length))
    outputs = tf.placeholder(tf.int32, (batch_size, ))
    # Keep probability for dropout.
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    # Non-trainable counter recording which step training has reached.
    global_step = tf.Variable(
        tf.zeros([], tf.int64), name='global_step', trainable=False)

    # Embedding layer, initialized uniformly in [-1, 1).
    embedding_init = tf.random_uniform_initializer(-1.0, 1.)
    with tf.variable_scope('embedding', initializer=embedding_init):
        embedding = tf.get_variable(
            'embedding',
            [vocab_size, hps.embedding_size],  # size of embedding matrix
            tf.float32)
        # The time loop below indexes this tensor, so the lookup must be
        # live code (it was previously lost inside a comment).
        embedded_inputs = tf.nn.embedding_lookup(embedding, inputs)

    # LSTM layer.
    scale = 1.0 / math.sqrt(
        hps.embedding_size + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)

    def _generate_params_for_lstm_cell(x_size, h_size, bias_size):
        """Create the input weights, recurrent weights and bias of one gate.

        :param x_size: shape of the input-to-gate weight matrix
        :param h_size: shape of the hidden-to-gate weight matrix
        :param bias_size: shape of the gate bias (zero-initialized)
        :return: tuple ``(x_weights, h_weights, bias)``
        """
        x_w = tf.get_variable('x_weights', x_size)
        h_w = tf.get_variable('h_weights', h_size)
        b = tf.get_variable(
            'bias', bias_size, initializer=tf.constant_initializer(0.0))
        return x_w, h_w, b

    # All four gates share the same parameter shapes.
    x_size = [hps.embedding_size, num_nodes]
    h_size = [num_nodes, num_nodes]
    bias_size = [1, num_nodes]

    with tf.variable_scope('lstm', initializer=lstm_init):
        with tf.variable_scope('inputs'):
            ix_w, ih_w, ib = _generate_params_for_lstm_cell(
                x_size=x_size, h_size=h_size, bias_size=bias_size)
        with tf.variable_scope('outputs'):
            ox_w, oh_w, ob = _generate_params_for_lstm_cell(
                x_size=x_size, h_size=h_size, bias_size=bias_size)
        with tf.variable_scope('forget'):
            fx_w, fh_w, fb = _generate_params_for_lstm_cell(
                x_size=x_size, h_size=h_size, bias_size=bias_size)
        # Candidate memory (tanh branch).
        with tf.variable_scope('memory'):
            cx_w, ch_w, cb = _generate_params_for_lstm_cell(
                x_size=x_size, h_size=h_size, bias_size=bias_size)

        # Initial cell state and hidden state (zeros, not trained).
        state = tf.Variable(
            tf.zeros([batch_size, num_nodes]), trainable=False)
        h = tf.Variable(
            tf.zeros([batch_size, num_nodes]), trainable=False)

        # Unroll the LSTM: one cell application per token position.
        for i in range(encoded_length):
            embedd_input = embedded_inputs[:, i, :]
            # Pin the static shape. Uses the same `embedding_size` attribute
            # as the weight shapes above so the matmuls always line up.
            embedd_input = tf.reshape(
                embedd_input, [batch_size, hps.embedding_size])

            forget_gate = tf.sigmoid(
                tf.matmul(embedd_input, fx_w) + tf.matmul(h, fh_w) + fb)
            input_gate = tf.sigmoid(
                tf.matmul(embedd_input, ix_w) + tf.matmul(h, ih_w) + ib)
            mid_state = tf.tanh(
                tf.matmul(embedd_input, cx_w) + tf.matmul(h, ch_w) + cb)
            output_gate = tf.sigmoid(
                tf.matmul(embedd_input, ox_w) + tf.matmul(h, oh_w) + ob)

            # The cell state must be carried in the same name that `h` reads;
            # the previous code wrote to an undefined `state_C`.
            state = mid_state * input_gate + state * forget_gate
            h = output_gate * tf.tanh(state)
        last = h

    # Fully connected layers.
    fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('fc', initializer=fc_init):
        fc1 = tf.layers.dense(
            last, hps.num_fc_nodes, activation=tf.nn.relu, name='fc1')
        fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
        logits = tf.layers.dense(fc1_dropout, classes_size, name='fc2')

    # Loss, predictions and accuracy.
    with tf.name_scope('metrics'):
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=outputs)
        loss = tf.reduce_mean(softmax_loss)
        y_pred = tf.argmax(tf.nn.softmax(logits), 1, output_type=tf.int32)
        correct_pred = tf.equal(outputs, y_pred)
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    with tf.name_scope('train_op'):
        trainable_vars = tf.trainable_variables()
        # Show every trainable variable.
        for var in trainable_vars:
            print('variable name: %s' % var)
        # Clip the global norm of all gradients before applying them.
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(loss, trainable_vars), hps.clip_lstm_grads)
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(
            zip(grads, trainable_vars), global_step=global_step)

    return ((inputs, outputs, keep_prob),
            (loss, accuracy),
            (train_op, global_step))
def __init__(self, encoder, decoder, embed_path):
    """Build the QA graph: placeholders, model, loss and training op.

    :param encoder: an encoder that you constructed in train.py
    :param decoder: a decoder that you constructed in train.py
    :param embed_path: stored on ``self.embed_path`` (presumably the
        embedding file read by ``setup_embeddings`` -- confirm)

    Reads the module-level names ``cfg``, ``context_max_len``,
    ``question_max_len`` and ``clip_by_val``.
    """
    self.embed_path = embed_path
    # Stored for the global-norm clipping alternative; the active code path
    # below clips by value instead.
    self.max_grad_norm = cfg.max_grad_norm
    self.encoder = encoder
    self.decoder = decoder

    # ==== set up placeholder tokens ========
    # Context/question token ids with their boolean masks; first dimension
    # is the batch.
    self.context = tf.placeholder(tf.int32, (None, context_max_len))
    self.context_m = tf.placeholder(tf.bool, (None, context_max_len))
    self.question = tf.placeholder(tf.int32, (None, question_max_len))
    self.question_m = tf.placeholder(tf.bool, (None, question_max_len))
    self.answer_s = tf.placeholder(tf.int32, (None, ))
    self.answer_e = tf.placeholder(tf.int32, (None, ))

    # ==== assemble pieces ====
    with tf.variable_scope(
            "qa",
            initializer=tf.uniform_unit_scaling_initializer(1.0, )):
        self.setup_embeddings()
        self.setup_system()
        self.setup_loss()

    # ==== set up training/updating procedure ====
    self.global_step = tf.Variable(0, trainable=False)
    # The initial learning rate is fed at run time.
    self.starter_learning_rate = tf.placeholder(tf.float32, name='start_lr')

    # TODO: choose how to adapt learning rate at will
    learning_rate = tf.train.exponential_decay(
        self.starter_learning_rate,
        self.global_step,
        1000,
        0.96,
        staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)
    self.optimizer = tf.train.AdamOptimizer(learning_rate)

    gradients = self.optimizer.compute_gradients(self.final_loss)
    # compute_gradients yields a None gradient for variables the loss does
    # not depend on; tf.clip_by_value cannot take None, so skip those pairs.
    capped_gvs = [
        (tf.clip_by_value(grad, -clip_by_val, clip_by_val), var)
        for grad, var in gradients
        if grad is not None
    ]
    # The reported norm is that of the unclipped gradients.
    grad = [x[0] for x in gradients]
    self.grad_norm = tf.global_norm(grad)
    tf.summary.scalar('grad_norm', self.grad_norm)
    self.train_op = self.optimizer.apply_gradients(
        capped_gvs, global_step=self.global_step)

    self.saver = tf.train.Saver()
    self.merged = tf.summary.merge_all()
def __init__(self, *args, **kwargs):
    """Initialize the model and bind the discrete bottleneck to its hparams.

    After the parent constructor runs, ``self._hparams.bottleneck`` is set to
    a ``functools.partial`` of ``discretization.discrete_bottleneck`` with the
    relevant hyper-parameters pre-filled. For the ``dvq`` and
    ``gumbel-softmax-dvq`` bottleneck kinds, the codebook variables (and
    optional projection / EMA shadow variables) are created here and bound
    into the partial as well.

    Raises:
        ValueError: if ``reshape_method`` is neither ``"project"`` nor
            ``"slice"`` for a DVQ bottleneck.
    """
    super(TransformerAE, self).__init__(*args, **kwargs)
    self.predict_mask = 1.0

    # Define bottleneck function
    # NOTE(review): this call mixes `self._hparams` and `self.hparams`;
    # presumably the latter is a public alias of the former -- confirm.
    self._hparams.bottleneck = functools.partial(
        discretization.discrete_bottleneck,
        hidden_size=self._hparams.hidden_size,
        z_size=self._hparams.z_size,
        filter_size=self._hparams.filter_size,
        bottleneck_kind=self._hparams.bottleneck_kind,
        num_blocks=self._hparams.num_blocks,
        num_residuals=self.hparams.num_residuals,
        reshape_method=self._hparams.reshape_method,
        beta=self._hparams.beta,
        ema=self._hparams.ema,
        epsilon=self._hparams.epsilon,
        decay=self._hparams.decay,
        random_top_k=self._hparams.random_top_k,
        soft_em=self.hparams.soft_em,
        num_samples=self.hparams.num_samples,
        softmax_k=self._hparams.softmax_k,
        temperature_warmup_steps=self._hparams.temperature_warmup_steps,
        do_hard_gumbel_softmax=self._hparams.do_hard_gumbel_softmax,
        num_flows=self._hparams.num_flows,
        approximate_gs_entropy=self._hparams.approximate_gs_entropy,
        discrete_mix=self._hparams.d_mix,
        noise_dev=self._hparams.noise_dev,
        startup_steps=self.hparams.startup_steps,
        summary=_DO_SUMMARIES)

    # Set the discretization bottleneck specific things here
    if self._hparams.bottleneck_kind in ["dvq", "gumbel-softmax-dvq"]:
        # Bits per residual, width of each block, and codebook size per
        # block (2 ** bits-per-block, truncated to an int).
        z_size_per_residual = self._hparams.z_size / self._hparams.num_residuals
        block_dim = int(self._hparams.hidden_size // self._hparams.num_blocks)
        block_v_size = 2**(z_size_per_residual / self._hparams.num_blocks)
        block_v_size = int(block_v_size)

        if self._hparams.reshape_method == "project":
            tf.logging.info("Using projections for DVQ")
            tf.logging.info("Trainable projections = {}".format(
                self._hparams.trainable_projections))

            projection_tensors = tf.get_variable(
                name="projection",
                shape=[
                    self._hparams.num_residuals, self._hparams.num_blocks,
                    self._hparams.hidden_size, block_dim
                ],
                initializer=tf.initializers.glorot_uniform(),
                trainable=self._hparams.trainable_projections)

            # Re-wrap the existing partial so the projection is bound too.
            self._hparams.bottleneck = functools.partial(
                self._hparams.bottleneck,
                projection_tensors=projection_tensors)
        elif self._hparams.reshape_method == "slice":
            tf.logging.info("Using slices for DVQ")
        else:
            raise ValueError("Unknown reshape method")

        # Codebook: one set of `block_v_size` vectors per (residual, block).
        means = tf.get_variable(
            name="means",
            shape=[
                self._hparams.num_residuals, self._hparams.num_blocks,
                block_v_size, block_dim
            ],
            initializer=tf.uniform_unit_scaling_initializer())

        # Create the shadow variables if we are using EMA
        ema_count = None
        ema_means = None
        if self._hparams.ema:
            ema_count = []
            for i in range(self._hparams.num_residuals):
                ema_count_i = tf.get_variable(
                    "ema_count_{}".format(i),
                    [self._hparams.num_blocks, block_v_size],
                    initializer=tf.constant_initializer(0),
                    trainable=False)
                ema_count.append(ema_count_i)
            with tf.colocate_with(means):
                ema_means = []
                for i in range(self._hparams.num_residuals):
                    # The initializer ignores its arguments and returns
                    # slice `i` of the initial value of `means`.
                    # NOTE(review): the lambda closes over the loop variable
                    # `i`; this is only correct if tf.get_variable invokes
                    # the initializer before the next iteration -- confirm.
                    ema_means_i = tf.get_variable(
                        "ema_means_{}".format(i), [
                            self._hparams.num_blocks, block_v_size, block_dim
                        ],
                        initializer=(
                            lambda shape, dtype=None, partition_info=None,  # pylint: disable=g-long-lambda
                            verify_shape=None: means.initialized_value()[i]
                        ),
                        trainable=False)
                    ema_means.append(ema_means_i)

        # Update bottleneck
        self._hparams.bottleneck = functools.partial(
            self._hparams.bottleneck,
            means=means,
            ema_count=ema_count,
            ema_means=ema_means)
def new_weight(self, shape, name, uniform=False, stddev=0.1):
    """Create (or fetch) a weight variable called ``name`` with ``shape``.

    :param shape: shape of the variable
    :param name: variable name passed to ``tf.get_variable``
    :param uniform: if truthy, use a unit-scaling uniform initializer with
        ``factor=stddev``; otherwise a normal initializer with that stddev
    :param stddev: standard deviation (normal) or scaling factor (uniform)
    :return: the variable returned by ``tf.get_variable``
    """
    initializer = (
        tf.uniform_unit_scaling_initializer(factor=stddev)
        if uniform
        else tf.random_normal_initializer(stddev=stddev)
    )
    return tf.get_variable(name=name, shape=shape, initializer=initializer)
# Matrix with dimensions (batch_size by maximum question length) self.questions_placeholder = tf.placeholder(tf.int32, shape=(None, None)) # Matrix with dimensions (batch_size by 2) where the second dimension is a binary indicator for each word. 0 represents the score for when word is not part of the answer and 1 represents the score when it is # TODO: confirm if this is score in fact or something else self.answers_placeholder = tf.placeholder(tf.int32, shape=(None, None)) # Placeholders for bidirectional lstm #self.passage_sequence_lengths = tf.placeholder(tf.int32, [None]) #self.question_sequence_lengths = tf.placeholder(tf.int32, [None]) # Create global step counter so we can track and save how many batches # we've completed. #self.global_step = tf.Variable(0, name='global_step', trainable=False) # The ordering of indices in the currently running shuffled batch #self.idxs = tf.Variable(tf.zeros(self.size_train_dataset, dtype=tf.int32), \ name='idxs', trainable=False) # ==== assemble pieces ==== with tf.variable_scope("qa", initializer=tf.uniform_unit_scaling_initializer(1.0)): self.preds = self.setup_predictions() # Creates embeddings and prediction self.loss = self.setup_loss(self.preds) # Creates loss computation self.train_op, self.grad_norm = self.setup_learning(self.loss) # Creates optimizer i.e. updates parameters in model # Create model saver self.saver = tf.train.Saver()
def _extra_init(self):
    """Run the parent's extra initialization, then set ``self.w_init``.

    ``self.w_init`` is a unit-scaling uniform initializer with factor 1.43.
    An earlier fixed-range uniform alternative (+/- sqrt(3) * 0.04) is no
    longer used.
    """
    super()._extra_init()
    weight_initializer = tf.uniform_unit_scaling_initializer(1.43)
    self.w_init = weight_initializer
def decode_spectrum(encoded_spectrum, intensity_inputs, decoder_inputs_emb,
                    keep_conv, keep_dense, scope):
    """Build the decoding graph that combines an intensity model and an LSTM.

    The LSTM is first run once on ``encoded_spectrum`` from a zero state to
    obtain its initial state. Then, for every decoding step ``i``, an
    intensity sub-network processes ``intensity_inputs[i]`` and the LSTM
    consumes the projected ``decoder_inputs_emb[i]``; the two are combined
    into one logit tensor according to ``deepnovo_config.FLAGS``.

    Args:
        encoded_spectrum: Tensor fed to the LSTM cell as its first input.
        intensity_inputs: Sequence indexed by decoding step; each element is
            the intensity tensor for that step.
        decoder_inputs_emb: Sequence of embedded decoder inputs, one per
            step; each is multiplied by an
            ``[embedding_size, num_units]`` projection matrix.
        keep_conv: Keep probability for dropout after the convolutions.
        keep_dense: Keep probability for dropout on dense layers and on the
            LSTM cell inputs/outputs.
        scope: Variable scope under which all variables are created.

    Returns:
        A tuple ``(outputs, dense1_W_penalty)`` where ``outputs`` is a list
        with one logit tensor per decoding step and ``dense1_W_penalty`` is
        the L2 penalty on the first dense layer's weights, scaled by
        ``deepnovo_config.l2_loss_weight``.

    Exits the process via ``sys.exit()`` if neither ``FLAGS.use_intensity``
    nor ``FLAGS.use_lstm`` is set.
    """
    single_cell = rnn_cell.BasicLSTMCell(num_units=deepnovo_config.num_units,
                                         state_is_tuple=True)
    if deepnovo_config.num_layers > 1:
        # One distinct cell object per layer; repeating `single_cell` would
        # share a single cell instance across layers.
        stacked_rnn = [
            rnn_cell.BasicLSTMCell(num_units=deepnovo_config.num_units,
                                   state_is_tuple=True)
            for _ in range(deepnovo_config.num_layers)
        ]
        cell = rnn_cell.MultiRNNCell(cells=stacked_rnn, state_is_tuple=True)
    else:
        cell = single_cell
    cell = rnn_cell.DropoutWrapper(cell,
                                   input_keep_prob=keep_dense,
                                   output_keep_prob=keep_dense)

    with variable_scope.variable_scope(scope):

        # INTENSITY-Model Parameters
        # intensity input [128, 27, 2, 10]
        if deepnovo_config.FLAGS.shared:  # shared-weight
            dense1_input_size = (deepnovo_config.num_ion *
                                 deepnovo_config.WINDOW_SIZE)
            dense1_output_size = deepnovo_config.num_units * 2
            dense1_W = variable_scope.get_variable(
                name="dense1_W_0",
                shape=[dense1_input_size, dense1_output_size],
                initializer=tf.uniform_unit_scaling_initializer(1.43))
            dense1_B = variable_scope.get_variable(
                name="dense1_B_0",
                shape=[dense1_output_size],
                initializer=tf.constant_initializer(0.1))
            dense_linear_W = variable_scope.get_variable(
                name="dense_linear_W",
                shape=[dense1_output_size, 1])
            dense_linear_B = variable_scope.get_variable(
                name="dense_linear_B",
                shape=[1],
                initializer=tf.constant_initializer(0.1))
            # The penalty is part of the return value, so it must exist in
            # this branch too; previously it was only defined for the
            # joint-weight model and the return raised NameError here.
            dense1_W_penalty = tf.multiply(tf.nn.l2_loss(dense1_W),
                                           deepnovo_config.l2_loss_weight,
                                           name='dense1_W_penalty')
        else:  # joint-weight
            # conv1: [128, 8, 20, 26] >> [128, 8, 20, 64] with kernel [1, 3, 26, 64]
            conv1_weights = tf.get_variable(
                name="conv1_weights",
                shape=[1, 3, deepnovo_config.vocab_size, 64],
                initializer=tf.uniform_unit_scaling_initializer(1.43))
            conv1_biases = tf.get_variable(
                name="conv1_biases",
                shape=[64],
                initializer=tf.constant_initializer(0.1))

            # conv2: [128, 8, 20, 64] >> [128, 8, 20, 64] with kernel [1, 2, 64, 64]
            conv2_weights = tf.get_variable(
                name="conv2_weights",
                shape=[1, 2, 64, 64],
                initializer=tf.uniform_unit_scaling_initializer(1.43))
            conv2_biases = tf.get_variable(
                name="conv2_biases",
                shape=[64],
                initializer=tf.constant_initializer(0.1))

            # max_pool: [128, 8, 20, 64] >> [128, 8, 10, 64]

            # dense1: 4D >> [128, 512]
            dense1_input_size = deepnovo_config.num_ion * (
                deepnovo_config.WINDOW_SIZE // 2) * 64
            dense1_output_size = deepnovo_config.num_units
            dense1_weights = tf.get_variable(
                "dense1_weights",
                shape=[dense1_input_size, dense1_output_size],
                initializer=tf.uniform_unit_scaling_initializer(1.43))
            dense1_biases = tf.get_variable(
                "dense1_biases",
                shape=[dense1_output_size],
                initializer=tf.constant_initializer(0.1))

            # for testing
            dense1_W_penalty = tf.multiply(tf.nn.l2_loss(dense1_weights),
                                           deepnovo_config.l2_loss_weight,
                                           name='dense1_W_penalty')

        # Weights of the layer that fuses the concatenated intensity and
        # LSTM features.
        dense_concat_W = variable_scope.get_variable(
            name="dense_concat_W",
            shape=[deepnovo_config.num_units * 2, deepnovo_config.num_units],
            initializer=tf.uniform_unit_scaling_initializer(1.43))
        dense_concat_B = variable_scope.get_variable(
            name="dense_concat_B",
            shape=[deepnovo_config.num_units],
            initializer=tf.constant_initializer(0.1))

        # DECODING - SPECTRUM as Input 0
        with variable_scope.variable_scope("LSTM_cell"):
            input0 = encoded_spectrum
            print('input0 = encoded_spectrum:', encoded_spectrum)
            batch_size = array_ops.shape(input0)[0]
            zero_state = cell.zero_state(batch_size=batch_size,
                                         dtype=tf.float32)
            # Only the resulting state is kept; the output is discarded.
            _, lstm_state_0 = cell(inputs=input0, state=zero_state)

        # DECODING - lstm_input_projected
        with variable_scope.variable_scope("LSTM_input_projected"):
            lstm_input_projected_W = variable_scope.get_variable(
                name="lstm_input_projected_W",
                shape=[
                    deepnovo_config.embedding_size, deepnovo_config.num_units
                ])
            lstm_input_projected_B = variable_scope.get_variable(
                name="lstm_input_projected_B",
                shape=[deepnovo_config.num_units],
                initializer=tf.constant_initializer(0.1))

        # DECODING LOOP
        outputs = []
        AA_1 = decoder_inputs_emb[0]  # padding [AA_1, AA_2, ?] with GO/EOS
        lstm_state = lstm_state_0
        for i, AA_2 in enumerate(decoder_inputs_emb):

            if i > 0:
                # Variables created implicitly in step 0 (the _linear
                # projections) are reused from step 1 onwards.
                # to-do-later: bring variable definitions out of the loop
                variable_scope.get_variable_scope().reuse_variables()

            # INTENSITY-Model
            candidate_intensity = intensity_inputs[i]  # [128, 27, 2, 10]

            if deepnovo_config.FLAGS.shared:  # shared-weight
                candidate_intensity_reshape = tf.reshape(
                    candidate_intensity,
                    shape=[-1, dense1_input_size])  # [128*27, 2*10]
                layer_dense1_input = candidate_intensity_reshape
                layer_dense1 = tf.nn.relu(
                    tf.matmul(layer_dense1_input, dense1_W) +
                    dense1_B)  # [128*27, 1024]
                layer_dense1_drop = tf.nn.dropout(layer_dense1, keep_dense)
                layer_dense1_output = (
                    tf.matmul(layer_dense1_drop, dense_linear_W) +
                    dense_linear_B)  # [128*27,1]

                # Intensity output
                intensity_output = tf.reshape(
                    layer_dense1_output,
                    shape=[-1, deepnovo_config.vocab_size])  # [128,27]
            else:  # joint-weight
                # image_batch: [128, 26, 8, 20] >> [128, 8, 20, 26]
                # This is a bug, should be fixed at the input processing later.
                image_batch = tf.transpose(
                    candidate_intensity, perm=[0, 2, 3, 1])  # [128,8,20,26]

                # conv1: [128, 8, 20, 26] >> [128, 8, 20, 64] with kernel [1, 3, 26, 64]
                conv1 = tf.nn.relu(
                    tf.nn.conv2d(image_batch,
                                 conv1_weights,
                                 strides=[1, 1, 1, 1],
                                 padding='SAME') + conv1_biases)

                # conv2: [128, 8, 20, 64] >> [128, 8, 20, 64] with kernel [1, 2, 64, 64]
                conv2 = tf.nn.relu(
                    tf.nn.conv2d(conv1,
                                 conv2_weights,
                                 strides=[1, 1, 1, 1],
                                 padding='SAME') + conv2_biases)
                conv2 = tf.nn.max_pool(conv2,
                                       ksize=[1, 1, 3, 1],
                                       strides=[1, 1, 2, 1],
                                       padding='SAME')  # [128, 8, 10, 64]
                conv2 = tf.nn.dropout(conv2, keep_conv)

                # dense1: 4D >> [128, 512]
                dense1_input = tf.reshape(
                    conv2, [-1, dense1_input_size])  # 2D flatten
                dense1 = tf.nn.relu(
                    tf.matmul(dense1_input, dense1_weights) +
                    dense1_biases)  # [128, 512]

                dropout1 = tf.nn.dropout(dense1, keep_dense, name="dropout1")

                intensity_output = dropout1

            # logit_linear: project the intensity features to vocab size.
            with variable_scope.variable_scope("intensity_output_projected"):
                intensity_output_projected = rnn_cell_impl._linear(  # TODO(nh2tran): _linear
                    args=intensity_output,
                    output_size=deepnovo_config.vocab_size,  # [128,27]
                    bias=True,
                    bias_initializer=None,
                    kernel_initializer=None)

            # LSTM-Model
            # NOTE: AA_1_projected is built but not consumed below; it is
            # kept so the constructed graph matches the original.
            AA_1_projected = (tf.matmul(AA_1, lstm_input_projected_W) +
                              lstm_input_projected_B)
            AA_2_projected = (tf.matmul(AA_2, lstm_input_projected_W) +
                              lstm_input_projected_B)

            with variable_scope.variable_scope("LSTM_cell"):
                # The cell's variables were created by the Input-0 call.
                variable_scope.get_variable_scope().reuse_variables()
                lstm_output, lstm_state = cell(inputs=AA_2_projected,
                                               state=lstm_state)
                AA_1 = AA_2

            with variable_scope.variable_scope("lstm_output_projected"):
                lstm_output_projected = rnn_cell_impl._linear(  # TODO(nh2tran): _linear
                    args=lstm_output,
                    output_size=deepnovo_config.vocab_size,  # [128,27]
                    bias=True,
                    bias_initializer=None,
                    kernel_initializer=None)

            # LSTM-Intensity Connection-Model >> OUTPUT
            if (deepnovo_config.FLAGS.use_intensity and
                    deepnovo_config.FLAGS.use_lstm):
                # Concatenate both feature vectors and fuse them.
                concat = tf.concat(axis=1,
                                   values=[intensity_output, lstm_output])
                concat_dense = tf.nn.relu(
                    tf.matmul(concat, dense_concat_W) + dense_concat_B)
                concat_drop = tf.nn.dropout(concat_dense, keep_dense)

                with variable_scope.variable_scope("output_logit"):
                    output_logit = rnn_cell_impl._linear(
                        args=concat_drop,  # TODO(nh2tran): _linear
                        output_size=deepnovo_config.vocab_size,  # [128,27]
                        bias=True,
                        bias_initializer=None,
                        kernel_initializer=None)
            elif deepnovo_config.FLAGS.use_intensity:
                # intensity only (without LSTM >> up to 10% loss, especially at AA-accuracy?)
                output_logit = intensity_output_projected
            elif deepnovo_config.FLAGS.use_lstm:
                output_logit = lstm_output_projected
            else:
                print("ERROR: wrong LSTM-Intensity model specified!")
                sys.exit()

            outputs.append(output_logit)

    return (outputs, dense1_W_penalty)
def resnet_v1_siamese(input_shape,
                      depth,
                      num_classes=10,
                      weight_decay=0.0,
                      embedding_activation='leaky-relu',
                      embedding_aux_loss='cosine',
                      reduce_variance=False,
                      reduce_jacobian_loss=False,
                      load_weights='',
                      reduce_juccobian_coeff=0.01):
    '''
    Build a two-input (siamese) ResNet whose branches share all layers.

    :param input_shape: shape of one input image (without the batch axis)
    :param depth: must be 6n+2 (8, 14, 20, 32, ...)
    :param num_classes: number of classes
    :param weight_decay: coefficient of the l2 regularization
    :param embedding_activation: activation used in the residual stack and
        before the embedding; 'leaky-relu' selects LeakyReLU(alpha=0.1), any
        other value is passed to a Keras Activation layer
    :param embedding_aux_loss: auxiliary output computed from the two
        embeddings: 'cosine' (normalized dot product) or 'margin' (both
        embeddings stacked on a new last axis)
    :param reduce_variance: bool, if True both embeddings are added to the
        model outputs
    :param reduce_jacobian_loss: bool, if True the scaled norm of the
        gradient of each softmax output w.r.t. its input is added to the
        model outputs
    :param load_weights: path to pretrained single-branch weights to load,
        must be with identical configuration; '' skips loading
    :param reduce_juccobian_coeff: scale applied to the Jacobian norm outputs
    :return: a Keras Model with inputs [in1, in2]
    :raises ValueError: if depth is not 6n+2 or embedding_aux_loss is unknown
    '''
    if (depth - 2) % 6 != 0:
        raise ValueError('depth should be 6n+2 (eg 20, 32, 44 in [a])')
    # Fail early: an unknown value used to surface only after the whole
    # network was built, as an unbound `aux_out`.
    if embedding_aux_loss not in ('cosine', 'margin'):
        raise ValueError(
            "embedding_aux_loss should be 'cosine' or 'margin', got %r" %
            (embedding_aux_loss,))

    # Start model definition.
    num_filters = 16
    num_res_blocks = int((depth - 2) / 6)

    in1 = Input(shape=input_shape)
    in2 = Input(shape=input_shape)
    conv_layer_list = []
    x1, x2, conv_layer_list = resnet_layer_siamese(
        in1,
        in2,
        activation=embedding_activation,
        conv_first=True,
        weight_decay=weight_decay,
        conv_layer_list=conv_layer_list)

    # Instantiate the stack of residual units
    first_iter = True
    for stack in range(3):
        for res_block in range(num_res_blocks):
            strides = 1
            if stack > 0 and res_block == 0:  # first layer but not first stack
                strides = 2  # downsample
            if not first_iter:
                # The same layer objects are applied to both branches so the
                # two towers share parameters.
                bn = BatchNormalization(momentum=0.9)
                if embedding_activation == 'leaky-relu':
                    lr = LeakyReLU(alpha=0.1)
                else:
                    lr = Activation(embedding_activation)
                y1 = bn(x1)
                y1 = lr(y1)
                y2 = bn(x2)
                y2 = lr(y2)
            else:
                y1 = x1
                y2 = x2

            y1, y2, conv_layer_list = resnet_layer_siamese(
                y1,
                y2,
                num_filters=num_filters,
                strides=strides,
                activation=embedding_activation,
                conv_first=True,
                weight_decay=weight_decay,
                conv_layer_list=conv_layer_list)
            y1, y2, conv_layer_list = resnet_layer_siamese(
                y1,
                y2,
                num_filters=num_filters,
                activation=None,
                conv_first=True,
                weight_decay=weight_decay,
                batch_normalization=False,
                conv_layer_list=conv_layer_list)
            if stack > 0 and res_block == 0:  # first layer but not first stack
                # Shortcut: pool spatially, then pad channels to match y.
                ap = AveragePooling2D(2, 2, 'valid')
                x1 = ap(x1)
                x2 = ap(x2)
                x1 = Lambda(pad_depth,
                            arguments={'desired_channels': y1.shape[-1]})(x1)
                x2 = Lambda(pad_depth,
                            arguments={'desired_channels': y2.shape[-1]})(x2)
            x1 = keras.layers.add([x1, y1])
            x2 = keras.layers.add([x2, y2])
            first_iter = False
        num_filters *= 2

    bn = BatchNormalization(momentum=0.9)
    x1 = bn(x1)
    x2 = bn(x2)

    if embedding_activation == 'leaky-relu':
        lr = LeakyReLU(alpha=0.1)
    elif embedding_activation == 'tanh':
        print("Using tanh activation")
        lr = Activation('tanh')
    else:
        # Without this branch the code silently reused the last `lr` created
        # in the loop above (or failed if the loop never created one).
        lr = Activation(embedding_activation)
    x1 = lr(x1)
    x2 = lr(x2)

    ap = AveragePooling2D(pool_size=int(input_shape[0] / 4), name='bottleneck')
    x1 = ap(x1)
    x2 = ap(x2)
    emb1 = Flatten()(x1)
    emb2 = Flatten()(x2)

    dense = Dense(
        num_classes,
        name='logits',
        kernel_initializer=tf.uniform_unit_scaling_initializer(factor=1.0),
        kernel_regularizer=l2(weight_decay),
        bias_initializer=tf.constant_initializer())
    logits1 = dense(emb1)
    logits2 = dense(emb2)

    output1 = Activation('softmax', name='main_output1')(logits1)
    output2 = Activation('softmax', name='main_output2')(logits2)

    if embedding_aux_loss == 'cosine':
        aux_out = Dot(1, normalize=True)([emb1, emb2])
    else:  # 'margin' (validated at the top of the function)
        print("using margin loss")
        aux_out = Lambda(lambda l: K.concatenate(
            (K.expand_dims(l[0], axis=-1), K.expand_dims(l[1], axis=-1)),
            axis=-1))([emb1, emb2])

    output_list = [output1, output2, aux_out]
    if reduce_variance:
        output_list += [emb1, emb2]
    if reduce_jacobian_loss:
        # Scaled L2 norm of d(output)/d(input), reduced over axes 1-3.
        jacobian_output1 = Lambda(lambda l: reduce_juccobian_coeff * K.sqrt(
            K.sum(K.pow(K.gradients(output1, l)[0], 2), axis=(1, 2, 3))),
                                  output_shape=[1])(in1)
        jacobian_output2 = Lambda(lambda l: reduce_juccobian_coeff * K.sqrt(
            K.sum(K.pow(K.gradients(output2, l)[0], 2), axis=(1, 2, 3))),
                                  output_shape=[1])(in2)
        output_list += [jacobian_output1, jacobian_output2]

    model = Model(inputs=[in1, in2], outputs=output_list)
    if load_weights != '':
        # Load the single-branch checkpoint through each tower's sub-model.
        temp_model = keras.Model(model.input[0],
                                 model.get_layer('main_output1').output)
        temp_model.load_weights(load_weights)
        temp_model = keras.Model(model.input[1],
                                 model.get_layer('main_output2').output)
        temp_model.load_weights(load_weights)
    return model
def __init__(self, pretrained_embeddings, flags):
    """Build the QA graph: placeholders, model, loss and training op.

    :param pretrained_embeddings: stored on ``self.pretrained_embeddings``
    :param flags: configuration object; reads ``state_size``,
        ``output_size``, ``question_size``, ``embedding_size``, ``dropout``,
        ``learning_rate``, ``grad_clip`` and ``max_gradient_norm``
    """
    self.pretrained_embeddings = pretrained_embeddings
    self.flags = flags

    # Sizes and rates copied from the flags.
    self.h_size = flags.state_size
    self.p_size = flags.output_size
    self.q_size = flags.question_size
    self.embed_size = flags.embedding_size
    self.dropout = flags.dropout

    # Encoder and decoder are given the complement of the dropout flag.
    complement = 1.0 - flags.dropout
    self.encoder = Encoder(hidden_size=self.h_size, dropout=complement)
    self.decoder = Decoder(hidden_size=self.h_size,
                           output_size=self.p_size,
                           dropout=complement)

    # ==== set up placeholder tokens ========
    self.context_placeholder = tf.placeholder(
        tf.int32, shape=(None, self.p_size), name='context_placeholder')
    self.question_placeholder = tf.placeholder(
        tf.int32, shape=(None, self.q_size), name='question_placeholder')
    self.answer_span_placeholder = tf.placeholder(
        tf.int32, shape=(None, 2), name='answer_span_placeholder')
    self.mask_q_placeholder = tf.placeholder(
        tf.int32, shape=(None, ), name='mask_q_placeholder')
    self.mask_ctx_placeholder = tf.placeholder(
        tf.int32, shape=(None, ), name='mask_ctx_placeholder')
    self.dropout_placeholder = tf.placeholder(
        tf.float32, shape=(), name='dropout_placeholder')

    # ==== assemble pieces ====
    default_init = tf.uniform_unit_scaling_initializer(1.0)
    with tf.variable_scope("qa", initializer=default_init):
        self.setup_embeddings()
        self.setup_system()
        self.setup_loss()

    # ==== set up training/updating procedure ====
    self.global_step = tf.Variable(0, trainable=False)
    # Constant learning rate (no decay schedule is applied).
    self.starter_learning_rate = flags.learning_rate
    self.learning_rate = self.starter_learning_rate

    self.optimizer = get_optimizer("adam")
    if not self.flags.grad_clip:
        # No gradient clipping: self.optimizer stays the optimizer factory.
        self.train_op = self.optimizer(self.learning_rate).minimize(
            self.loss, global_step=self.global_step)
    else:
        # Gradient clipping: clip each available gradient by its own norm.
        self.optimizer = self.optimizer(self.learning_rate)
        grads = [
            (gradient, variable) if gradient is None else
            (tf.clip_by_norm(gradient, self.flags.max_gradient_norm), variable)
            for gradient, variable in
            self.optimizer.compute_gradients(self.loss)
        ]
        self.train_op = self.optimizer.apply_gradients(
            grads, global_step=self.global_step)

    self.saver = tf.train.Saver()
# cell_type = ThetaRNNCell if c.num_of_layers > 1: cells = rc.MultiRNNCell( [cell_type(c.net_size) for _ in xrange(c.num_of_layers)]) net_out_size = cells._cells[-1].state_size else: cells = cell_type(c.net_size) net_out_size = cells.state_size forecast_steps = c.forecast_ms #init = lambda shape, dtype: np.reshape(-np.sqrt(3) / np.sqrt(shape[0]) + np.random.random((shape[0], shape[3])) * 2.0*np.sqrt(3) / np.sqrt(shape[0]), (shape[0], 1, 1, shape[3])) init = lambda shape, dtype: generate_dct_dictionary(shape[0], shape[ 3]).reshape(shape[0], 1, 1, shape[3]) recov_init = tf.uniform_unit_scaling_initializer(factor=1.0) input = tf.placeholder(tf.float32, shape=(1, c.seq_size, 1, 1), name="Input") target = tf.placeholder(tf.float32, shape=(1, c.seq_size, 1, 1), name="Target") filter = vs.get_variable("W", [c.filter_len, 1, 1, c.filters_num], initializer=init) bias = vs.get_variable( "b", [c.filters_num], initializer=lambda shape, dtype: np.zeros(c.filters_num)) recov_filter = vs.get_variable("Wr", [c.filter_len, 1, 1, net_out_size], initializer=recov_init) state = tf.placeholder(tf.float32, shape=(c.batch_size, cells.state_size), name="State")
def testDuplicatedInitializer(self):
    """Assert duplicated_initializer is falsy for the unit-scaling initializer.

    The check runs twice, with ``use_gpu`` False and then True, each time
    with a fresh default-constructed initializer and 1 as the last argument.
    """
    for use_gpu in (False, True):
        initializer = tf.uniform_unit_scaling_initializer()
        is_duplicated = duplicated_initializer(self, initializer, use_gpu, 1)
        self.assertFalse(is_duplicated)
def __init__(self, encoder, decoder, rev_vocab, args):
    """Initialize the QA system and build its TensorFlow graph.

    :param encoder: an encoder that you constructed in train.py
    :param decoder: a decoder that you constructed in train.py
    :param rev_vocab: stored on the instance as ``self.rev_vocab``
    :param args: configuration object; every hyper parameter copied in the
        first section below is read from an attribute of the same (or a
        closely matching) name on ``args``
    """
    # ==== Setup hyper parameters =======
    self.max_length_passage = args.max_passage_length
    self.max_length_question = args.max_question_length
    self.embedding_size = args.embedding_size
    self.embed_path = args.embed_path
    self.learning_rate = args.learning_rate
    self.epochs = args.epochs
    self.start_epoch = args.start_epoch
    self.batch_size = args.batch_size
    self.max_gradient_norm = args.max_gradient_norm
    self.train_dir = args.train_dir
    self.saved_name = args.saved_name
    self.eval_num_samples = args.eval_num_samples
    self.val_and_save_num_batches = args.val_and_save_num_batches
    self.val_cost_frac = args.val_cost_frac
    self.size_train_dataset = args.size_train_dataset
    self.sigma_threshold = args.sigma_threshold

    # ==== Set encoder and decoder
    self.encoder = encoder
    self.decoder = decoder
    self.rev_vocab = rev_vocab

    # ==== Load any data we need ========
    # First load word embeddings. We assume it's glove. The archive is
    # opened in a `with` block so its file handle is closed once the array
    # has been read, instead of being left open.
    with np.load(self.embed_path) as embed_archive:
        self.pretrained_embeddings = embed_archive['glove']

    # ==== set up placeholder tokens ========
    # The first dimension is the batch_size and second dimension represents
    # maximum passage length
    self.passages_placeholder = tf.placeholder(tf.int32, shape=(None, None))
    # The first dimension is the batch_size and second dimension represents
    # maximum question length
    self.questions_placeholder = tf.placeholder(tf.int32,
                                                shape=(None, None))
    # The first dimension is the batch_size and second dimension represents
    # binary indicator for each word:
    # 0 represents the word is not part of the answer. 1 represents it is.
    self.answers_placeholder = tf.placeholder(tf.int32, shape=(None, None))

    # Need masks for both passages and questions
    self.mask_passage_placeholder = tf.placeholder(tf.bool,
                                                   shape=(None, None))
    self.mask_question_placeholder = tf.placeholder(tf.bool,
                                                    shape=(None, None))

    # Placeholders for bidirectional lstm
    # TODO: Question: is there a better way of doing this??
    # This is constant list of batch_size where each index represents the
    # number of words in the passage
    self.passage_sequence_lengths = tf.placeholder(tf.int32, [None])
    self.question_sequence_lengths = tf.placeholder(tf.int32, [None])

    # Create global step counter so we can track and save how many batches
    # we've completed.
    self.global_step = tf.Variable(0, name='global_step', trainable=False)

    # The ordering of indices in the currently running shuffled batch
    self.idxs = tf.Variable(
        tf.zeros(self.size_train_dataset, dtype=tf.int32),
        name='idxs',
        trainable=False)

    # ==== assemble pieces ====
    with tf.variable_scope(
            "qa", initializer=tf.uniform_unit_scaling_initializer(1.0)):
        # Creates embeddings and prediction
        self.preds = self.setup_predictions()
        # Creates loss computation
        self.loss = self.setup_loss(self.preds)
        # Creates optimizer i.e. updates parameters in model
        self.train_op, self.grad_norm, self.new_grad_norm = (
            self.setup_learning(self.loss))

    # ==== set up training/updating procedure ====
    # Create model saver
    self.saver = tf.train.Saver()
def initialize(sess):
    """Seed the RNGs, prepare task data, and build or restore the model.

    Args:
      sess: session used to run the variable initializer and to restore a
        checkpoint when one exists.

    Returns:
      A tuple (model, min_length, max_length, checkpoint_dir, curriculum,
      ensemble), where `ensemble` is the list of checkpoint paths found in
      the comma-separated FLAGS.ensemble directories.
    """
    job_id = FLAGS.jobid
    if job_id >= 0:
        data.log_filename = os.path.join(FLAGS.train_dir, "log%d" % job_id)
    data.print_out("NN ", newline=False)

    # One seed drives TensorFlow, `random` and NumPy.
    seed = FLAGS.random_seed + max(0, job_id)
    tf.set_random_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    # Sanity-check the bins and drop those beyond the evaluation range.
    assert data.bins
    min_length = 3
    max_length = min(FLAGS.max_length, data.bins[-1])
    assert max_length + 1 > min_length
    while len(data.bins) > 1 and data.bins[-2] > max_length + EXTRA_EVAL:
        data.bins = data.bins[:-1]
    assert data.bins[0] > FLAGS.rx_step
    data.forward_max = max(FLAGS.forward_max, data.bins[-1])
    nclass = min(FLAGS.niclass, FLAGS.noclass)
    data_size = FLAGS.train_data_size if FLAGS.mode == 0 else 1000

    # Generate data per task: every short length, the last two bins, and
    # the forward_max length (with its own sample count).
    end_size = 4 * 1024 if FLAGS.mode > 0 else 1024
    for task in FLAGS.task.split("-"):
        for length in xrange(max_length + EXTRA_EVAL - 1):
            data.init_data(task, length, data_size, nclass)
        data.init_data(task, data.bins[-2], data_size, nclass)
        data.init_data(task, data.bins[-1], data_size, nclass)
        data.init_data(task, data.forward_max, end_size, nclass)

    # Log the hyper parameters as a single line.
    curriculum = FLAGS.curriculum_bound
    net_msg = ("layers %d kw %d h %d kh %d relax %d batch %d noise %.2f task %s"
               % (FLAGS.nconvs, FLAGS.kw, FLAGS.height, FLAGS.kh,
                  FLAGS.rx_step, FLAGS.batch_size, FLAGS.grad_noise_scale,
                  FLAGS.task))
    data_msg = "data %d %s" % (FLAGS.train_data_size, net_msg)
    full_msg = ("cut %.2f pull %.3f lr %.2f iw %.2f cr %.2f nm %d d%.4f gn %.2f %s"
                % (FLAGS.cutoff, FLAGS.pull_incr, FLAGS.lr, FLAGS.init_weight,
                   curriculum, FLAGS.nmaps, FLAGS.dropout, FLAGS.max_grad_norm,
                   data_msg))
    data.print_out(full_msg)

    # Make sure the (job-specific) checkpoint directory exists.
    dir_suffix = "" if job_id < 0 else str(job_id)
    checkpoint_dir = os.path.join(FLAGS.train_dir,
                                  "neural_gpu%s" % dir_suffix)
    if not gfile.IsDirectory(checkpoint_dir):
        data.print_out("Creating checkpoint directory %s." % checkpoint_dir)
        gfile.MkDir(checkpoint_dir)

    # Build the model under a scope-wide default initializer.
    default_init = tf.uniform_unit_scaling_initializer(
        factor=1.8 * FLAGS.init_weight)
    tf.get_variable_scope().set_initializer(default_init)
    model = neural_gpu.NeuralGPU(
        FLAGS.nmaps, FLAGS.nmaps, FLAGS.niclass, FLAGS.noclass, FLAGS.dropout,
        FLAGS.rx_step, FLAGS.max_grad_norm, FLAGS.cutoff, FLAGS.nconvs,
        FLAGS.kw, FLAGS.kh, FLAGS.height, FLAGS.mode, FLAGS.lr,
        FLAGS.iw_batches, FLAGS.pull, FLAGS.pull_incr, min_length + 3)
    data.print_out("Created model.")
    sess.run(tf.initialize_all_variables())
    data.print_out("Initialized variables.")

    # Restore parameters when a checkpoint is present.
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if ckpt and gfile.Exists(ckpt.model_checkpoint_path):
        data.print_out("Reading model parameters from %s"
                       % ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)

    # Collect checkpoints of ensemble members, skipping empty entries and
    # directories without a usable checkpoint.
    ensemble = []
    for ensemble_dir in FLAGS.ensemble.split(","):
        if not ensemble_dir:
            continue
        member_ckpt = tf.train.get_checkpoint_state(ensemble_dir)
        if not (member_ckpt
                and gfile.Exists(member_ckpt.model_checkpoint_path)):
            continue
        data.print_out("Found ensemble model %s"
                       % member_ckpt.model_checkpoint_path)
        ensemble.append(member_ckpt.model_checkpoint_path)

    return (model, min_length, max_length, checkpoint_dir, curriculum,
            ensemble)
def lstm_layer(inp,
               length=None,
               state=None,
               memory=None,
               num_nodes=None,
               backward=False,
               clip=50.0,
               reg_func=tf.nn.l2_loss,
               weight_reg=False,
               weight_collection="LSTMWeights",
               bias_reg=False,
               stddev=None,
               seed=None,
               decode=False,
               use_native_weights=False,
               name=None):
    """Adds ops for an LSTM layer.

    This adds ops for the following operations:
      input => (forward-LSTM|backward-LSTM) => output

    The direction of the LSTM is determined by `backward`. If it is false,
    the forward LSTM is used, the backward one otherwise.

    Args:
      inp: A 3-D tensor of shape [`batch_size`, `max_length`, `feature_dim`].
      length: A 1-D tensor of shape [`batch_size`] and type int64. Each
        element represents the length of the corresponding sequence in `inp`.
        Only consulted when `backward` is true, to pick `tf.reverse_sequence`
        over `tf.reverse`.
      state: If specified, uses it as the initial state. Defaults to zeros
        of shape [`batch_size`, `num_nodes`].
      memory: If specified, uses it as the initial memory. Defaults to zeros
        of shape [`batch_size`, `num_nodes`].
      num_nodes: The number of LSTM cells.
      backward: If true, reverses the `inp` before adding the ops. The output
        is also reversed so that the direction is the same as `inp`.
      clip: Value used to clip the cell values.
      reg_func: Function used for the weight regularization such as
        `tf.nn.l2_loss`.
      weight_reg: If true, regularize the filter weights with `reg_func`.
      weight_collection: Collection to add the weights to for regularization.
      bias_reg: If true, regularize the bias vector with `reg_func`.
      stddev: Standard deviation used to initialize the variables. When
        falsy, a uniform unit scaling initializer is used instead of a
        truncated normal one.
      seed: Seed used to initialize the variables.
      decode: If true, does not add ops which are not used for inference.
      use_native_weights: If true, uses weights in the same format as the
        native implementations.
      name: Name of the op.

    Returns:
      A tuple `(out, mem)`. `out` is the first value returned by
      `rnn.variable_lstm` (reversed back when `backward` is true),
      documented as a 3-D tensor of shape
      [`batch_size`, `max_length`, `num_nodes`]. `mem` is the third value
      returned by `rnn.variable_lstm` (presumably the final memory; confirm
      against that helper).
    """
    # NOTE(review): the nesting of the scopes below was reconstructed from a
    # flattened source; confirm block extents against the original file.
    with tf.variable_scope(name):
        # For the backward direction, flip the time axis of the input.
        if backward:
            if length is None:
                inp = tf.reverse(inp, [False, True, False])
            else:
                inp = tf.reverse_sequence(inp, length, 1, 0)

        num_prev = inp.get_shape()[2]
        if stddev:
            initializer = tf.truncated_normal_initializer(stddev=stddev,
                                                          seed=seed)
        else:
            initializer = tf.uniform_unit_scaling_initializer(seed=seed)

        if use_native_weights:
            # A single fused matrix holds both the input and the recurrent
            # weights; slice it into the two parts used below.
            with tf.variable_scope("LSTMCell"):
                w = tf.get_variable(
                    "W_0",
                    shape=[num_prev + num_nodes, 4 * num_nodes],
                    initializer=initializer,
                    dtype=tf.float32)
                w_i_m = tf.slice(w, [0, 0], [num_prev, 4 * num_nodes],
                                 name="w_i_m")
                w_m_m = tf.reshape(
                    tf.slice(w, [num_prev, 0], [num_nodes, 4 * num_nodes]),
                    [num_nodes, 4, num_nodes],
                    name="w_m_m")
        else:
            w_i_m = tf.get_variable("w_i_m", [num_prev, 4 * num_nodes],
                                    initializer=initializer)
            w_m_m = tf.get_variable("w_m_m", [num_nodes, 4, num_nodes],
                                    initializer=initializer)

        # Regularization terms are only registered outside decode mode.
        if not decode and weight_reg:
            tf.add_to_collection(weight_collection,
                                 reg_func(w_i_m, name="w_i_m_reg"))
            tf.add_to_collection(weight_collection,
                                 reg_func(w_m_m, name="w_m_m_reg"))

        # Flatten batch and time so the input projection is one matmul.
        batch_size = shapes.tensor_dim(inp, dim=0)
        num_frames = shapes.tensor_dim(inp, dim=1)
        prev = tf.reshape(inp, tf.pack([batch_size * num_frames, num_prev]))

        if use_native_weights:
            with tf.variable_scope("LSTMCell"):
                b = tf.get_variable(
                    "B",
                    shape=[4 * num_nodes],
                    initializer=tf.zeros_initializer,
                    dtype=tf.float32)
                biases = tf.identity(b, name="biases")
        else:
            biases = tf.get_variable(
                "biases", [4 * num_nodes],
                initializer=tf.constant_initializer(0.0))
        if not decode and bias_reg:
            tf.add_to_collection(
                weight_collection, reg_func(
                    biases, name="biases_reg"))
        prev = tf.nn.xw_plus_b(prev, w_i_m, biases)

        # Restore the [batch, time, 4, num_nodes] layout for the recurrence.
        prev = tf.reshape(prev,
                          tf.pack([batch_size, num_frames, 4, num_nodes]))
        if state is None:
            state = tf.fill(tf.pack([batch_size, num_nodes]), 0.0)
        if memory is None:
            memory = tf.fill(tf.pack([batch_size, num_nodes]), 0.0)
        out, _, mem = rnn.variable_lstm(prev, state, memory, w_m_m, clip=clip)

        # Undo the input reversal so `out` is aligned with the original `inp`.
        if backward:
            if length is None:
                out = tf.reverse(out, [False, True, False])
            else:
                out = tf.reverse_sequence(out, length, 1, 0)

    return out, mem
def _create_inference(self, item_input, is_reuse):
    """Build the rating-prediction ops for `self.user_input` and `item_input`.

    A dropout-regularised element-wise product of user and item embeddings
    is concatenated with an attention-weighted aspect interaction vector and
    multiplied by `self.all_weights['W_out']`.

    :param item_input: indices used to look up item embeddings and item
        aspect histories.
    :param is_reuse: forwarded as `reuse` to the dense layers created for
        the user side and for the two attention layers.
    :return: the tensor produced by the final matmul (op name 'prediction').
    """
    weights = self.all_weights

    with tf.name_scope('global_module'):
        user_vec = tf.nn.embedding_lookup(weights['user_embed'],
                                          self.user_input)
        item_vec = tf.nn.embedding_lookup(weights['item_embed'], item_input)
        mf_product = tf.multiply(user_vec, item_vec)
        mf_interact = tf.nn.dropout(mf_product, keep_prob=self.dropout_keep)

    with tf.name_scope('aspect_module'):
        # History indices, then the aspect embeddings they point at.
        user_hist = tf.nn.embedding_lookup(
            weights['user_history_aspect'], self.user_input, name='u_hist')
        item_hist = tf.nn.embedding_lookup(
            weights['item_history_aspect'], item_input, name='v_hist')
        user_aspects = tf.nn.embedding_lookup(
            weights['aspect_embed'], user_hist, name='u_hist_a_embs')
        item_aspects = tf.nn.embedding_lookup(
            weights['aspect_embed'], item_hist, name='v_hist_a_embs')

        # One shared bias-free projection: the item side always reuses the
        # variables created (or reused) by the user side.
        user_aspects = tf.layers.dense(
            user_aspects,
            units=self.num_aspect_factor,
            name='aspect_embed_trans',
            kernel_initializer=tf.uniform_unit_scaling_initializer(
                factor=1.0),
            use_bias=False,
            reuse=is_reuse)
        item_aspects = tf.layers.dense(
            item_aspects,
            units=self.num_aspect_factor,
            name='aspect_embed_trans',
            kernel_initializer=tf.uniform_unit_scaling_initializer(
                factor=1.0),
            use_bias=False,
            reuse=True)

        # Multiply by the mask looked up for each history entry.
        user_mask = tf.nn.embedding_lookup(
            weights['mask_lookup_table'], user_hist,
            name='user_mask_padding')
        item_mask = tf.nn.embedding_lookup(
            weights['mask_lookup_table'], item_hist,
            name='item_mask_padding')
        user_aspects = tf.multiply(user_mask, user_aspects,
                                   'u_hist_a_embs_masked')
        item_aspects = tf.multiply(item_mask, item_aspects,
                                   'v_hist_a_embs_masked')

    with tf.name_scope('aspect_interact'):
        user_unit = tf.nn.l2_normalize(user_aspects, dim=-1)
        item_unit = tf.nn.l2_normalize(item_aspects, dim=-1)
        # Broadcast to pair every user aspect with every item aspect.
        user_grid = tf.expand_dims(user_unit, 2)
        item_grid = tf.expand_dims(item_unit, 1)
        interact = tf.multiply(user_grid, item_grid)

    with tf.name_scope('aspect_level_attention'):
        item_axis_logits = tf.layers.dense(
            interact, units=1, name='att_l2_1', reuse=is_reuse)
        att_l2 = tf.nn.softmax(item_axis_logits, dim=2)

    with tf.name_scope("user_level_attention"):
        item_summary = tf.reduce_sum(item_unit, axis=1, keep_dims=True)
        item_summary = tf.tile(item_summary, [1, self.MaxPerUser, 1])
        user_axis_input = item_summary * user_unit
        user_axis_logits = tf.layers.dense(
            user_axis_input, units=1, name='att_l1_1', reuse=is_reuse)
        att_l1 = tf.nn.softmax(user_axis_logits, dim=1)

    with tf.name_scope('attach_attention'):
        # Collapse axis 2 first, then axis 1, each under its attention.
        per_user_aspect = tf.reduce_sum(tf.multiply(att_l2, interact),
                                        axis=2)
        aspect_interact = tf.reduce_sum(
            tf.multiply(att_l1, per_user_aspect), axis=1)
        aspect_interact = tf.nn.dropout(aspect_interact, self.dropout_keep)

    with tf.name_scope('concatenate'):
        interact_vector = tf.concat([mf_interact, aspect_interact], axis=-1)

    with tf.name_scope('prediction'):
        rating_preds = tf.matmul(interact_vector, weights['W_out'],
                                 name='prediction')

    return rating_preds
def get_instance(args):
    """Create a uniform unit scaling initializer seeded with `SEED`.

    `args` is a mapping; its optional 'factor' entry (default 1.0) is
    converted to float and used as the scaling factor.
    """
    scale = float(args.get('factor', 1.0))
    initializer = tf.uniform_unit_scaling_initializer(scale, seed=SEED)
    return initializer
def add_hl(self, q_embed, aplus_embed, aminus_embed):
    """Apply one shared tanh layer to the three embedded inputs.

    Each input is flattened to rows of `config.embedding_size`, projected
    with the shared weights and bias, passed through tanh and reshaped to
    [-1, config.sequence_length, config.hidden_size]. An L2 penalty on the
    weights is added to the 'total_loss' collection.

    :return: tuple (h_q, h_ap, h_am), in the order of the arguments.
    """
    cfg = self.config
    with tf.variable_scope('HL'):
        W = tf.get_variable(
            'weights',
            shape=[cfg.embedding_size, cfg.hidden_size],
            initializer=tf.uniform_unit_scaling_initializer())
        b = tf.get_variable(
            'biases',
            initializer=tf.constant(0.1, shape=[cfg.hidden_size]))

        def hidden(embed):
            # Flatten -> affine -> tanh -> restore the sequence layout.
            flat = tf.reshape(embed, [-1, cfg.embedding_size])
            activated = tf.nn.tanh(tf.matmul(flat, W) + b)
            return tf.reshape(
                activated, [-1, cfg.sequence_length, cfg.hidden_size])

        h_q = hidden(q_embed)
        h_ap = hidden(aplus_embed)
        h_am = hidden(aminus_embed)

        l2_penalty = 0.5 * cfg.l2_reg_lambda * tf.nn.l2_loss(W)
        tf.add_to_collection('total_loss', l2_penalty)
        return h_q, h_ap, h_am
def autoencoder(data,
                corrupt_prob,
                dimensions,
                beta=0.01,
                rho=0.4,
                activation=tf.nn.sigmoid,
                lamb=0.01,
                gamma=0.01):
    """Build an autoencoder graph with separate encoder and decoder weights.

    The input is cast to float32 and blended with `corrupt(x)` according to
    `corrupt_prob`. Every layer computes
    `activation(transpose(transpose(W @ input) + b))` with `W` of shape
    [n_output, n_input], where n_input is the static size of axis 0 of the
    layer input.

    Args:
      data: input tensor (cast to float32).
      corrupt_prob: weight of the corrupted input in the blend.
      dimensions: layer sizes; the encoder uses `dimensions[1:]`, the
        decoder `dimensions[:-1]` reversed.
      beta: weight of the KL term.
      rho: target value compared against the mean encoder activation.
      activation: non-linearity applied in every layer.
      lamb: weight-decay coefficient (halved, times mean of W squared).
      gamma: reconstruction-loss coefficient (halved).

    Returns:
      A dict with keys 'x', 'encoder_out', 'reconstruction',
      'corrupt_prob', 'cost', 'noise_input', 'kl', 'weight_decay_J',
      'ae_loss', 'kl_loss', 'W_list' and 'b_list' (the last two hold the
      encoder weights and biases only).
    """
    weight_init = tf.uniform_unit_scaling_initializer(factor=1.0,
                                                      seed=24,
                                                      dtype=tf.float32)

    def layer_output(weights, bias, layer_in):
        # The bias has shape [1, n_output], hence the double transpose.
        return activation(
            tf.transpose(tf.transpose(tf.matmul(weights, layer_in)) + bias))

    x = tf.cast(data, tf.float32)
    current_input = corrupt(x) * corrupt_prob + x * (1 - corrupt_prob)
    noise_input = current_input
    weight_decay_J = 0

    # ---- encoder ----
    print("========= encoder begin ==========")
    encoder = []
    encoder_b = []
    for layer_i, n_output in enumerate(dimensions[1:]):
        n_input = int(current_input.get_shape()[0])
        print("encoder : layer_i - n_output - n_input", layer_i, n_output, n_input)
        enc_W = tf.get_variable("W1_" + str(layer_i),
                                shape=[n_output, n_input],
                                initializer=weight_init)
        enc_b = tf.Variable(tf.zeros([1, n_output]))
        encoder.append(enc_W)
        encoder_b.append(enc_b)
        current_input = layer_output(enc_W, enc_b, current_input)
        weight_decay_J += (lamb / 2.0) * (tf.reduce_mean(enc_W**2))
    print("========= encoder finish =========")

    # Latent representation.
    encoder_out = current_input
    print(encoder_out.shape)

    # ---- decoder (its own weights, layer sizes mirrored) ----
    print("========= decoder begin ==========")
    for layer_i, n_output in enumerate(dimensions[:-1][::-1]):
        print("decoder : layer_i - n_output", layer_i, n_output)
        n_input = int(current_input.get_shape()[0])
        dec_W = tf.get_variable("W2_" + str(layer_i),
                                shape=[n_output, n_input],
                                initializer=weight_init)
        dec_b = tf.Variable(tf.zeros([1, n_output]))
        current_input = layer_output(dec_W, dec_b, current_input)
        weight_decay_J += (lamb / 2.0) * (tf.reduce_mean(dec_W**2))
    print("========= decoder finish =========")

    reconstruction = current_input

    # KL-style sparsity penalty between rho and the mean activations.
    rhohats = tf.reduce_mean(tf.transpose(encoder_out), 0)
    kl = tf.reduce_mean(rho * tf.log(rho / rhohats) +
                        (1 - rho) * tf.log((1 - rho) / (1 - rhohats)))

    ae_loss = (gamma / 2.0) * tf.reduce_mean(tf.square(reconstruction - x))
    kl_loss = beta * kl
    cost = ae_loss + kl_loss + weight_decay_J

    return {
        'x': x,
        'encoder_out': encoder_out,
        'reconstruction': reconstruction,
        'corrupt_prob': corrupt_prob,
        'cost': cost,
        'noise_input': noise_input,
        'kl': kl,
        'weight_decay_J': weight_decay_J,
        'ae_loss': ae_loss,
        'kl_loss': kl_loss,
        'W_list': encoder,
        'b_list': encoder_b
    }
# NOTE(review): the `return` below is the last statement of a definition
# whose header lies outside the visible source; its indentation is unknown.
return FHNOutputTuple(V, W), FHNStateTuple(V, W)

# Initial state: two placeholders of shape [batch_size, net_size].
state = FHNStateTuple(
    tf.placeholder(tf.float32, [batch_size, net_size], name="V"),
    tf.placeholder(tf.float32, [batch_size, net_size], name="W"),
)
cell = FHNCell(net_size, basic_v_relation)
# NOTE(review): `input` and `filter` below shadow Python builtins.
input = tf.placeholder(tf.float32,
                       shape=(batch_size, seq_size, 1, 1),
                       name="Input")
filter = vs.get_variable(
    "E", [L, 1, 1, filters_num],
    initializer=tf.uniform_unit_scaling_initializer(factor=weight_init_factor))
# Convolve along axis 1 of the input with stride `strides`.
conv_out = tf.nn.conv2d(input,
                        filter,
                        strides=[1, strides, 1, 1],
                        padding='SAME')
# Swap the first two axes so time comes first, then drop the size-1 axis 2.
conv_out = tf.transpose(conv_out, [1, 0, 2, 3])
conv_out = tf.squeeze(conv_out, squeeze_dims=[2])
net_out, finstate = rnn.dynamic_rnn(cell,
                                    conv_out,
                                    initial_state=state,
                                    time_major=True)
# Append a trailing axis to the V output, then swap axes 0/1 and 2/3.
V = tf.expand_dims(net_out.V, 3)
V = tf.transpose(V, [1, 0, 3, 2])
def __init__(self,
             src_vocab_size,
             tgt_vocab_size,
             env_vocab_size,
             size,
             num_layers,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             dropout,
             FLAGS,
             forward_only=False,
             optimizer="adam"):
    """Build the model graph and, unless `forward_only`, the update op.

    The encoder variant is chosen from `FLAGS.co_attn`, `FLAGS.seq` and
    `FLAGS.cat_attn` (checked in that order); when none is set the
    reverse-attention encoder is used. `FLAGS.seed` seeds the default
    initializer and `FLAGS.keep` bounds the number of kept checkpoints.
    """
    # Plain hyper parameters.
    self.size = size
    self.src_vocab_size = src_vocab_size
    self.tgt_vocab_size = tgt_vocab_size
    self.env_vocab_size = env_vocab_size
    self.batch_size = batch_size
    self.num_layers = num_layers
    self.keep_prob_config = 1.0 - dropout

    # Learning rate lives in the graph so it can be decayed via an op.
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    decayed_rate = self.learning_rate * learning_rate_decay_factor
    self.learning_rate_decay_op = self.learning_rate.assign(decayed_rate)
    self.global_step = tf.Variable(0, trainable=False)

    # Graph inputs.
    self.keep_prob = tf.placeholder(tf.float32)
    self.source_tokens = tf.placeholder(
        tf.int32, shape=[None, None], name="source_tokens")
    self.target_tokens = tf.placeholder(
        tf.int32, shape=[None, None], name="target_tokens")
    self.source_mask = tf.placeholder(
        tf.int32, shape=[None, None], name="source_mask")
    self.target_mask = tf.placeholder(
        tf.int32, shape=[None, None], name="target_mask")
    self.ctx_tokens = tf.placeholder(
        tf.int32, shape=[None, None], name="ctx_tokens")
    self.ctx_mask = tf.placeholder(
        tf.int32, shape=[None, None], name="ctx_mask")
    self.beam_size = tf.placeholder(tf.int32)
    self.target_length = tf.reduce_sum(self.target_mask,
                                       reduction_indices=0)

    self.FLAGS = FLAGS

    # One float placeholder per layer for the decoder state input.
    self.decoder_state_output = []
    self.decoder_state_input = [
        tf.placeholder(tf.float32, shape=[None, size])
        for _ in xrange(num_layers)
    ]

    # The seeded initializer fixes the randomness of variable creation.
    seeded_init = tf.uniform_unit_scaling_initializer(
        1.0, seed=self.FLAGS.seed)
    with tf.variable_scope("Logic", initializer=seeded_init):
        self.setup_embeddings()
        self.setup_encoder()

        # Pick the encoding strategy; attention is the "normal" case.
        if FLAGS.co_attn:
            encode = self.rev_coattn_encode
        elif FLAGS.seq:
            encode = self.sequence_encode
        elif FLAGS.cat_attn:
            encode = self.concate_encode
        else:
            encode = self.rev_attention_encode
        self.encoder_output = encode()

        self.setup_decoder(self.encoder_output)
        self.setup_loss()
        self.setup_beam()

    trainable = tf.trainable_variables()
    if not forward_only:
        opt = get_optimizer(optimizer)(self.learning_rate)
        raw_grads = tf.gradients(self.losses, trainable)
        clipped_grads = tf.clip_by_global_norm(raw_grads,
                                               max_gradient_norm)[0]
        # The reported gradient norm is the norm before clipping.
        self.gradient_norm = tf.global_norm(raw_grads)
        self.param_norm = tf.global_norm(trainable)
        self.updates = opt.apply_gradients(zip(clipped_grads, trainable),
                                           global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(),
                                max_to_keep=FLAGS.keep)
def encode_spectrum(encoder_inputs, intensity_inputs_forward,
                    intensity_inputs_backward, decoder_inputs_forward,
                    decoder_inputs_backward, keep_conv, keep_dense):
    """Encode `encoder_inputs[0]` with a small CNN and pass it to `embed_labels`.

    Two 1x4 convolutions with ReLU, a max-pool with stride 4 along axis 2,
    dropout (`keep_conv`), then a ReLU dense layer of
    `deepnovo_config.embedding_size` units with dropout (`keep_dense`).

    Returns:
      Whatever `embed_labels` returns for the encoded spectrum and the
      remaining arguments, which are forwarded unchanged.
    """
    with variable_scope.variable_scope("embedding_rnn_seq2seq"):

        def weight(var_name, var_shape):
            # Weight matrices share the 1.43-scaled uniform initializer.
            return variable_scope.get_variable(
                name=var_name,
                shape=var_shape,
                initializer=tf.uniform_unit_scaling_initializer(1.43))

        def bias(var_name, length):
            # Biases start at 0.1.
            return variable_scope.get_variable(
                name=var_name,
                shape=[length],
                initializer=tf.constant_initializer(0.1))

        # Spectrum reshaped to [-1, 1, MZ_SIZE, 1].
        spectrum = tf.reshape(encoder_inputs[0],
                              [-1, 1, deepnovo_config.MZ_SIZE, 1])

        # Variables are created in the order conv1, conv2, dense1.
        conv1_W = weight("conv1_W", [1, 4, 1, 4])
        conv1_B = bias("conv1_B", 4)
        conv2_W = weight("conv2_W", [1, 4, 4, 4])
        conv2_B = bias("conv2_B", 4)

        dense1_input_size = 1 * (deepnovo_config.MZ_SIZE // (4)) * 4
        dense1_output_size = deepnovo_config.embedding_size
        dense1_W = weight("dense1_W",
                          [dense1_input_size, dense1_output_size])
        dense1_B = bias("dense1_B", dense1_output_size)

        # Convolutional stack.
        unit_strides = [1, 1, 1, 1]
        conv1 = tf.nn.relu(
            tf.nn.conv2d(spectrum, conv1_W, strides=unit_strides,
                         padding='SAME') + conv1_B)
        conv2 = tf.nn.relu(
            tf.nn.conv2d(conv1, conv2_W, strides=unit_strides,
                         padding='SAME') + conv2_B)
        pooled = tf.nn.max_pool(conv2,
                                ksize=[1, 1, 6, 1],
                                strides=[1, 1, 4, 1],
                                padding='SAME')
        pooled = tf.nn.dropout(pooled, keep_conv)

        # Dense layer on the flattened feature map.
        flat = tf.reshape(pooled, [-1, dense1_input_size])
        dense1 = tf.nn.relu(tf.matmul(flat, dense1_W) + dense1_B)
        dense1 = tf.nn.dropout(dense1, keep_dense)
        print('dense1 in encode_spectrum:', dense1)

        # The encoded spectrum is input 0 of the label embedding step.
        encoded_spectrum = dense1
        return embed_labels(encoded_spectrum, intensity_inputs_forward,
                            intensity_inputs_backward,
                            decoder_inputs_forward,
                            decoder_inputs_backward, keep_conv, keep_dense)
def spectrum_cnn2(spectrum, keep_conv=.75, keep_dense=.5):
    """Map a spectrum to a single output value per example with a small CNN.

    Two 1x4 convolutions with ReLU, a max-pool with stride 4 along axis 2,
    dropout, a 512-unit ReLU dense layer with dropout, then two linear
    fully connected layers (100 units and 1 unit).

    Args:
      spectrum: input tensor; reshaped to [-1, 1, data_utils.MZ_SIZE, 1].
      keep_conv: keep probability of the dropout applied after pooling.
        Defaults to the previously hard-coded .75; pass 1.0 to disable
        dropout (e.g. at inference).
      keep_dense: keep probability of the dropout applied after the dense
        layer. Defaults to the previously hard-coded .5.

    Returns:
      The output tensor of the final 1-unit fully connected layer.
    """
    # define variables
    with variable_scope.variable_scope("spectrum_cnn2"):
        input_layer = tf.reshape(spectrum, [-1, 1, data_utils.MZ_SIZE, 1])

        W1 = tf.get_variable(
            "W1", [1, 4, 1, 4],
            initializer=tf.contrib.layers.xavier_initializer(seed=0))
        B1 = tf.get_variable("B1", [4],
                             initializer=tf.constant_initializer(0.1))
        W2 = tf.get_variable(
            "W2", [1, 4, 4, 4],
            initializer=tf.contrib.layers.xavier_initializer(seed=0))
        B2 = tf.get_variable("B2", [4],
                             initializer=tf.constant_initializer(0.1))

        # Convolutional stack followed by pooling and dropout.
        Z1 = tf.nn.conv2d(input_layer, W1, strides=[1, 1, 1, 1],
                          padding='SAME')
        A1 = tf.nn.relu(Z1 + B1)
        Z2 = tf.nn.conv2d(A1, W2, strides=[1, 1, 1, 1], padding='SAME')
        A2 = tf.nn.relu(Z2 + B2)
        P2 = tf.nn.max_pool(A2,
                            ksize=[1, 1, 6, 1],
                            strides=[1, 1, 4, 1],
                            padding="SAME")
        D2 = tf.nn.dropout(P2, keep_conv)

        # Dense layer on the flattened feature map.
        dense1_input_size = 1 * (data_utils.MZ_SIZE // (4)) * 4
        dense1_output_size = 512
        dense1_W = variable_scope.get_variable(
            name="dense1_W",
            shape=[dense1_input_size, dense1_output_size],
            initializer=tf.uniform_unit_scaling_initializer(1.43))
        dense1_B = variable_scope.get_variable(
            name="dense1_B",
            shape=[dense1_output_size],
            initializer=tf.constant_initializer(0.1))

        Z3 = tf.reshape(D2, [-1, dense1_input_size])
        Z3 = tf.nn.relu(tf.matmul(Z3, dense1_W) + dense1_B)
        Z3 = tf.nn.dropout(Z3, keep_dense)

        # Two linear heads: 100 units, then the single output unit.
        Z5 = tf.contrib.layers.fully_connected(Z3,
                                               num_outputs=100,
                                               activation_fn=None)
        print(Z5.shape)
        Z6 = tf.contrib.layers.fully_connected(
            Z5,
            num_outputs=1,  # data_utils.vocab_size,
            activation_fn=None)
        print(Z6.shape)
        return Z6
def uniform_unit_scaling(params):
    """Return a default-constructed uniform unit scaling initializer.

    `params` is accepted but not used.
    """
    initializer = tf.uniform_unit_scaling_initializer()
    return initializer
def initialize_attention_func(self, input_size, attention_states):
    """Create the attention variables and store an attention closure.

    Nothing is returned. The closure is stored on `self._attention_func`
    and the `W_3` / `b_3` variables on `self._W_3` / `self._b_3`.

    Args:
      input_size: used for the shapes of `W_3`
        ([input_size + attention_size, input_size]) and `b_3`.
      attention_states: tensor whose static shape supplies the attention
        length (axis 1) and the attention size (last axis).
    """
    # NOTE(review): the extent of the variable scope below was reconstructed
    # from a flattened source; confirm which statements belong inside it.
    # Get shape of attention states (the outputs from the encoder cell)
    attention_states_shape = attention_states.get_shape().as_list()
    attention_size = attention_states_shape[-1]
    attention_length = attention_states_shape[1]

    # Define W_2
    with tf.variable_scope('attention'):
        # Since we unroll the cell state tuples we will have two vectors
        # for each rnn cell (the hidden state vector c_t and the output
        # vector h_t)
        unrolled_state_length = 2 * self.state_size * self.num_cells
        W_2 = tf.get_variable(
            name='W_2',
            shape=[unrolled_state_length, attention_size],
            initializer=tf.uniform_unit_scaling_initializer(),
            dtype=tf.float32)
        b_2 = tf.get_variable(name='b_2',
                              shape=[attention_size],
                              initializer=tf.constant_initializer(),
                              dtype=tf.float32)
        # W_3 / b_3 are not used in this method; they are only created here
        # and exposed on the instance at the end.
        W_3 = tf.get_variable(
            name='W_3',
            shape=[input_size + attention_size, input_size],
            initializer=tf.uniform_unit_scaling_initializer(),
            dtype=tf.float32)
        b_3 = tf.get_variable(name='b_3',
                              shape=[input_size],
                              initializer=tf.constant_initializer(),
                              dtype=tf.float32)

        # Reshape hidden encoder state `h_t`.
        h_t = tf.reshape(attention_states,
                         shape=[-1, attention_length, 1, attention_size])
        k = tf.get_variable(shape=[1, 1, attention_size, attention_size],
                            name='attention_W')
        v = tf.get_variable(shape=[attention_size], name='attention_v')

        # Compute W_1 * h_t using a 1-by-1 convolution
        W1_ht = tf.nn.conv2d(input=h_t,
                             filter=k,
                             strides=[1, 1, 1, 1],
                             padding='SAME',
                             name='W1_ht')

    # Define attention function
    def attention_func(state):
        '''
        Computes attention-weighted context vector c_t from a given
        RNN StateTuple.
        '''
        # If the query is a tuple, flatten it
        # (e.g. when using bidirectional encoder).
        if is_sequence(state):
            query_list = flatten(state)
            state = tf.concat(query_list, axis=1)
        with tf.variable_scope('attention'):
            # Compute W_2 * d_t
            W2_dt = projection(state, W=W_2, b=b_2)
            # Reshaped so it broadcasts against W1_ht over the length axis.
            W2_dt = tf.reshape(W2_dt, [-1, 1, 1, attention_size])

            # Compute attention mask:
            # v.T * tanh(W_1 * h_t + W_2 * d_t)
            u = tf.reduce_sum(v * tf.tanh(W1_ht + W2_dt), [2, 3])

            # Compute attention mask - alphas
            alpha = tf.nn.softmax(u, name='alpha-weights')

            # Compute the attention-weighted context vector c_t.
            c_t = tf.reduce_sum(
                tf.reshape(alpha, [-1, attention_length, 1, 1]) * h_t,
                [1, 2])
        return c_t

    self._attention_func = attention_func
    self._W_3 = W_3
    self._b_3 = b_3