def _create_embedders(self):
    """Create input placeholders, word-embedding tables and the encoder graph.

    Builds the source/target token placeholders, one embedding matrix per
    side, looks the tokens up, and then delegates to the network builder
    selected by ``self.network_mode``. An unsupported mode prints an error
    and terminates the process via ``exit(-1)``.
    """
    seq_shape = [None, self.MAX_SEQ_LENGTH]

    # Placeholders for the input batch.
    self._src_input_data = tf.placeholder(
        tf.int32, seq_shape, name='source_sequence')
    self._tgt_input_data = tf.placeholder(
        tf.int32, seq_shape, name='target_sequence')
    self._labels = tf.placeholder(tf.int64, [None], name='targetSpace_labels')
    self._src_lens = tf.placeholder(tf.int32, [None], name='source_seq_lenths')
    self._tgt_lens = tf.placeholder(tf.int32, [None], name='target_seq_lenths')

    # Word embedding tables, one per vocabulary.
    self.src_word_embedding = tf.get_variable(
        'src_word_embedding',
        [self.src_vocab_size, self.word_embed_size],
        initializer=tf.random_uniform_initializer(-0.25, 0.25))
    self.tgt_word_embedding = tf.get_variable(
        'tgt_word_embedding',
        [self.tgt_vocab_size, self.word_embed_size],
        initializer=tf.random_uniform_initializer(-0.25, 0.25))

    # Map token ids to their embedding vectors.
    self.src_input_distributed = tf.nn.embedding_lookup(
        self.src_word_embedding, self._src_input_data, name='dist_source')
    self.tgt_input_distributed = tf.nn.embedding_lookup(
        self.tgt_word_embedding, self._tgt_input_data, name='dist_target')

    # Pick the network builder for the configured mode, then run it.
    mode = self.network_mode
    if mode == 'source-encoder-only':
        build_network = self._source_encoder_only_network
    elif mode == 'dual-encoder':
        build_network = self._dual_encoder_network
    elif mode == 'shared-encoder':
        build_network = self._shared_encoder_network
    else:
        print('Error!! Unsupported network mode: %s. Please specify on: source-encoder-only, dual-encoder or shared-encoder.' % mode)
        exit(-1)
    build_network()
def xavier_init(input_size, output_size, uniform=True):
    """Return a Xavier/Glorot-style initializer for a layer.

    The previous implementation passed a non-existent ``stdevv`` keyword to
    ``tf.random_uniform_initializer`` in both branches, so every call raised
    ``TypeError``. The uniform branch now uses a symmetric range and the
    non-uniform branch a truncated normal with the computed ``stddev``.

    Args:
        input_size: Number of input units (fan-in).
        output_size: Number of output units (fan-out).
        uniform: If True, sample uniformly from ``[-r, r)`` with
            ``r = sqrt(6 / (fan_in + fan_out))``; otherwise sample from a
            truncated normal with ``stddev = sqrt(3 / (fan_in + fan_out))``.

    Returns:
        A TensorFlow initializer object.
    """
    fan_sum = input_size + output_size
    if uniform:
        # Plain Python arithmetic keeps the bounds as floats, not graph ops.
        init_range = (6.0 / fan_sum) ** 0.5
        return tf.random_uniform_initializer(-init_range, init_range)
    stddev = (3.0 / fan_sum) ** 0.5
    return tf.truncated_normal_initializer(stddev=stddev)
def weight(name, shape, init='he', range=None):
    """Create a weight variable and register its L2 loss.

    :param name: Variable name
    :param shape: Tensor shape
    :param init: Init mode: 'xavier', 'he' (default), 'normal' or 'uniform'.
        Any other value falls back to ``tf.constant_initializer()``.
    :param range: Half-width of the sampling interval; required (and only
        read) when ``init == 'uniform'``.
    :return: The created variable
    :raises ValueError: if ``init == 'uniform'`` and ``range`` is None
    """
    if init == 'xavier':
        fan_in, fan_out = _get_dims(shape)
        limit = math.sqrt(6.0 / (fan_in + fan_out))
        initializer = tf.random_uniform_initializer(-limit, limit)
    elif init == 'he':
        fan_in, _ = _get_dims(shape)
        initializer = tf.random_normal_initializer(
            stddev=math.sqrt(2.0 / fan_in))
    elif init == 'normal':
        initializer = tf.random_normal_initializer(stddev=0.1)
    elif init == 'uniform':
        if range is None:
            raise ValueError("range must not be None if uniform init is used.")
        initializer = tf.random_uniform_initializer(-range, range)
    else:
        # Unrecognised mode: default constant initializer.
        initializer = tf.constant_initializer()

    var = tf.get_variable(name, shape, initializer=initializer)
    # Collect the L2 penalty of every weight under the 'l2' collection.
    tf.add_to_collection('l2', tf.nn.l2_loss(var))
    return var
def __init__(self, state_size, num_obs, steps_per_obs, sigma_min=1e-5,
             dtype=tf.float32, random_seed=None):
    """Store the configuration and create one mean/sigma pair per timestep.

    ``num_timesteps`` is ``num_obs * steps_per_obs + 1``. For each timestep
    a ``snt.Linear`` module (``self.mus``) and a ``q_sigma_<t>`` variable of
    shape ``[state_size]`` (``self.sigmas``, numbered from 1) are created.
    """
    self.state_size = state_size
    self.sigma_min = sigma_min
    self.dtype = dtype
    self.steps_per_obs = steps_per_obs
    self.num_obs = num_obs
    self.num_timesteps = num_obs * steps_per_obs + 1

    # The same initializer dict is shared by every Linear module.
    linear_initializers = {
        "w": tf.random_uniform_initializer(seed=random_seed),
        "b": tf.zeros_initializer
    }

    mean_modules = []
    for _ in xrange(self.num_timesteps):
        mean_modules.append(
            snt.Linear(output_size=state_size,
                       initializers=linear_initializers))
    self.mus = mean_modules

    sigma_vars = []
    for step in xrange(self.num_timesteps):
        sigma_vars.append(
            tf.get_variable(
                shape=[state_size],
                dtype=self.dtype,
                name="q_sigma_%d" % (step + 1),
                initializer=tf.random_uniform_initializer(seed=random_seed)))
    self.sigmas = sigma_vars
def __call__(self, inputs, state, scope=None):
    """Run the wrapped cells in sequence, gating each cell's previous state.

    For every cell in ``self._cells`` the matching slice of ``state`` is
    taken, combined with a sigmoid gate computed from that slice and the
    full previous state, and passed to the cell together with the output of
    the preceding cell.

    Returns:
        A pair ``(output of the last cell, concatenation of the new states
        along axis 1)``.
    """
    with tf.variable_scope(scope or type(self).__name__):
        # Conveniently the concatenation of all hidden states at t-1
        h_star_t_prev = state
        u_g = tf.get_variable("u_g", [self.state_size],
                              initializer=tf.random_uniform_initializer(-0.1, 0.1))
        cur_state_pos = 0
        cur_inp = inputs
        new_states = []
        for i, cell in enumerate(self._cells):
            with tf.variable_scope("Cell%d" % i):
                # Slice this cell's columns out of the concatenated state.
                cur_state = array_ops.slice(
                    state, [0, cur_state_pos], [-1, cell.state_size])
                # NOTE(review): the scope name contains a space; TF scope-name
                # validation may reject it -- confirm against the TF version used.
                with tf.variable_scope("Global Reset"):
                    w_g = tf.get_variable("w_g", cell.state_size,
                                          initializer=tf.random_uniform_initializer(-0.1, 0.1))
                    # NOTE(review): element-wise products of tensors with
                    # different widths (cell.state_size vs self.state_size);
                    # presumably relies on matching sizes -- TODO confirm.
                    g = tf.sigmoid(tf.mul(w_g, cur_state) + tf.mul(u_g, h_star_t_prev))
                    U = tf.get_variable("U", [cell.state_size],
                                        initializer=tf.random_uniform_initializer(-0.1, 0.1))
                    # NOTE(review): U is rank 1 while tf.matmul normally expects
                    # matrices, and reduce_sum without an axis collapses to a
                    # scalar -- verify this is the intended computation.
                    cur_state = tf.reduce_sum(g * tf.matmul(cur_state, U))
                cur_state_pos += cell.state_size
                cur_inp, new_state = cell(cur_inp, cur_state)
                new_states.append(new_state)
    return cur_inp, array_ops.concat(1, new_states)
def build_lstm_forward(H, x, googlenet, phase, reuse):
    """Build the forward graph: GoogLeNet features -> LSTM -> boxes/confidences.

    Args:
        H: Hyper-parameter dict; only ``H['arch']`` is read here.
        x: Input image tensor; ``117.`` is subtracted from it.
        googlenet: Passed through to ``googlenet_load.model``.
        phase: Dropout is only applied when this equals ``'train'``.
        reuse: Forwarded to the ``'decoder'`` variable scope.

    Returns:
        ``(pred_boxes, pred_logits, pred_confidences)``.
    """
    arch = H['arch']
    grid_size = arch['grid_width'] * arch['grid_height']
    outer_size = grid_size * arch['batch_size']
    is_training = phase == 'train'

    input_mean = 117.
    x -= input_mean

    Z = googlenet_load.model(x, googlenet, H)
    with tf.variable_scope('decoder', reuse=reuse):
        scale_down = 0.01
        if arch['early_dropout'] and is_training:
            Z = tf.nn.dropout(Z, 0.5)
        lstm_input = tf.reshape(
            Z * scale_down, (arch['batch_size'] * grid_size, 1024))
        lstm_outputs = build_lstm_inner(lstm_input, H)

        box_steps = []
        logit_steps = []
        for step in range(arch['rnn_len']):
            step_output = lstm_outputs[step]
            if arch['late_dropout'] and is_training:
                step_output = tf.nn.dropout(step_output, 0.5)
            # One projection per time step for boxes and for confidences.
            box_weights = tf.get_variable(
                'box_ip%d' % step,
                shape=(arch['lstm_size'], 4),
                initializer=tf.random_uniform_initializer(-0.1, 0.1))
            conf_weights = tf.get_variable(
                'conf_ip%d' % step,
                shape=(arch['lstm_size'], 2),
                initializer=tf.random_uniform_initializer(-0.1, 0.1))
            scaled_boxes = tf.matmul(step_output, box_weights) * 50
            box_steps.append(tf.reshape(scaled_boxes, [outer_size, 1, 4]))
            step_logits = tf.matmul(step_output, conf_weights)
            logit_steps.append(tf.reshape(step_logits, [outer_size, 1, 2]))

        pred_boxes = tf.concat(1, box_steps)
        pred_logits = tf.concat(1, logit_steps)
        # Softmax over the two classes, computed on a flattened view.
        pred_logits_squash = tf.reshape(
            pred_logits, [outer_size * arch['rnn_len'], 2])
        pred_confidences_squash = tf.nn.softmax(pred_logits_squash)
        pred_confidences = tf.reshape(
            pred_confidences_squash, [outer_size, arch['rnn_len'], 2])

    return pred_boxes, pred_logits, pred_confidences
def count_sketch(probs, project_size):
    """Project a tensor to ``project_size`` columns with a count sketch.

    Args:
        probs: A `Tensor` whose second dimension is statically known.
        project_size: output size (`int`)

    Returns:
        The sketch `Tensor` with static shape [batch_size, project_size].
    """
    scope_name = 'CountSketch_' + probs.name.replace(':', '_')
    with tf.variable_scope(scope_name) as scope:
        input_size = int(probs.get_shape()[1])

        # The hash (h) and sign (s) variables must be sampled only once per
        # input tensor: reuse them if this scope has been built before.
        if scope.name in tf.get_collection('__countsketch'):
            scope.reuse_variables()
        tf.add_to_collection('__countsketch', scope.name)

        hash_var = tf.get_variable(
            'h', [input_size],
            initializer=tf.random_uniform_initializer(0, project_size),
            trainable=False)
        sign_var = tf.get_variable(
            's', [input_size],
            initializer=tf.random_uniform_initializer(0, 2),
            trainable=False)

        bucket = tf.cast(hash_var, 'int32')
        sign = tf.cast(tf.floor(sign_var) * 2 - 1, 'int32')  # 1 or -1

        sk = _sketch_op.count_sketch(probs, bucket, sign, project_size)
        sk.set_shape([probs.get_shape()[0], project_size])
        return sk
def build(self):
    """Create the output weights ``W`` and ``T``, the LSTM cell and bias ``b``."""

    def uniform_init():
        # Alternative tried previously:
        # tf.truncated_normal_initializer(stddev=0.01)
        return tf.random_uniform_initializer(-0.2, 0.2)

    with tf.name_scope('weigths'):
        self.W = tf.get_variable(
            name='lstm_weights',
            shape=[self.hidden_dim, self.nb_classes],
            initializer=uniform_init(),
        )
        self.T = tf.get_variable(
            name='feat_weights',
            shape=[self.feat_size, self.nb_classes],
            initializer=uniform_init(),
        )
        self.lstm_fw = tf.contrib.rnn.LSTMCell(self.hidden_dim)

    with tf.name_scope('biases'):
        # The name "bias" is attached to the zeros op, not to the Variable.
        initial_bias = tf.zeros([self.nb_classes], name="bias")
        self.b = tf.Variable(initial_bias)
def compute_feedback(self, inputs, full_state, layer_sizes, scope=None):
    """Accumulate the gated feedback term over all layers.

    For each entry of ``layer_sizes`` a gate is computed from ``inputs`` and
    ``full_state``, the matching column slice of ``full_state`` is taken,
    and the gated, projected slice plus a bias is added to a running sum.
    ``scope`` is accepted but not used.

    Returns:
        The accumulated ``summation_term`` tensor.
    """
    # NOTE(review): the scope name contains a space; TF scope-name validation
    # may reject it -- confirm against the TF version in use.
    with tf.variable_scope("Global Reset"):
        cur_state_pos = 0
        full_state_size = sum(layer_sizes)
        summation_term = tf.get_variable("summation", self.state_size,
                                         initializer=tf.constant_initializer())
        for i, layer_size in enumerate(layer_sizes):
            with tf.variable_scope("Cell%d" % i):
                # Compute global reset gate
                w_g = tf.get_variable("w_g", self.input_size,
                                      initializer=tf.random_uniform_initializer(-0.1, 0.1))
                u_g = tf.get_variable("u_g", full_state_size,
                                      initializer=tf.random_uniform_initializer(-0.1, 0.1))
                # NOTE(review): w_g / u_g are created from scalar sizes, which
                # presumably yields rank-1 variables, yet they feed tf.matmul
                # -- verify the intended shapes.
                g__i_j = tf.sigmoid(tf.matmul(inputs, w_g) + tf.matmul(full_state, u_g))
                # Accumulate sum
                h_t_1 = \
                    tf.slice(
                        full_state,
                        [0, cur_state_pos],
                        [-1, layer_size]
                    )
                cur_state_pos += layer_size
                U = tf.get_variable("U", [self.input_size, self._num_units],
                                    initializer=tf.random_uniform_initializer(-0.1, 0.1))
                b = tf.get_variable("b", self.state_size,
                                    initializer=tf.constant_initializer(1.))
                # NOTE(review): the operand order here is matmul(U, h_t_1);
                # confirm the shapes are compatible in this order.
                summation_term = tf.add(summation_term, g__i_j * tf.matmul(U, h_t_1) + b)
    return summation_term
def sin_bank(x, bank_size, length, scope=None):
    """Return ``sin(x * bank + shift)`` for learned ``bank``/``shift`` vectors.

    Both variables have shape ``[bank_size]`` and are initialised uniformly
    in ``[0, length)``. A histogram summary of ``bank`` is emitted unless the
    current variable scope is in reuse mode.
    """
    with tf.variable_op_scope([x], scope, "SinBank") as scope:

        def make_param(var_name):
            return tf.get_variable(
                var_name,
                dtype=tf.float32,
                shape=[bank_size, ],
                initializer=tf.random_uniform_initializer(0.0, length))

        bank = make_param("bank")
        shift = make_param("shift")

        if not tf.get_variable_scope().reuse:
            tf.histogram_summary(bank.name, bank)

        phase = x * bank + shift
        return tf.sin(phase)
def _build_net(self):
    """Build the policy network, its loss and the training op.

    Creates three placeholders (observations, taken actions, action values),
    two tanh hidden layers of ``H`` units (``H`` is a module-level name), a
    tanh output layer of ``self.n_actions`` units followed by a softmax
    stored in ``self.all_act_prob``, and an Adam training op in
    ``self.train`` that minimises the value-weighted negative log
    probability of the taken actions.

    A dead ``loss = tf.log(self.all_act_prob)`` statement was removed: it
    was overwritten before use and only added an unused op to the graph.
    """
    with tf.name_scope('inputs'):
        self.tf_obs = tf.placeholder(
            tf.float32, [None, self.n_features], name="observations")
        self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions")
        self.tf_vt = tf.placeholder(tf.float32, [None, ], name="action_values")

    layer_1 = tf.layers.dense(
        inputs=self.tf_obs,
        units=H,
        activation=tf.nn.tanh,
        kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
        bias_initializer=tf.constant_initializer(0),
        name='h_layer1',
    )
    layer_2 = tf.layers.dense(
        inputs=layer_1,
        units=H,
        activation=tf.nn.tanh,
        kernel_initializer=tf.random_uniform_initializer(-0.23, 0.23),
        bias_initializer=tf.constant_initializer(0),
        name='h_layer2',
    )
    all_act = tf.layers.dense(
        inputs=layer_2,
        units=self.n_actions,
        activation=tf.nn.tanh,
        kernel_initializer=tf.random_uniform_initializer(-0.23, 0.23),
        bias_initializer=tf.constant_initializer(0),
        name='output'
    )
    self.all_act_prob = tf.nn.softmax(all_act, name='act_prob')

    with tf.name_scope('loss'):
        # The one-hot mask keeps only the log-probability of the taken action.
        neg_log_prob = tf.reduce_sum(
            -tf.log(self.all_act_prob) * tf.one_hot(self.tf_acts, self.n_actions),
            axis=1)
        loss = tf.reduce_mean(neg_log_prob * self.tf_vt)

    with tf.name_scope('optimizer'):
        self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)
def dense_layer(self, input, out_dim, name, func=tf.nn.relu):
    """Fully connected layer ``func(input @ w + b)`` under scope ``name``.

    Weights and bias are both drawn uniformly from ``[-d, d)`` with
    ``d = 1 / sqrt(in_dim)``, where ``in_dim`` is the last static dimension
    of ``input``. Pass ``func=None`` to skip the activation.
    """
    in_dim = input.get_shape().as_list()[-1]
    bound = 1.0 / np.sqrt(in_dim)

    with tf.variable_scope(name):
        w = tf.get_variable(
            'w',
            dtype=tf.float32,
            shape=[in_dim, out_dim],
            initializer=tf.random_uniform_initializer(-bound, bound))
        b = tf.get_variable(
            'b',
            shape=[out_dim],
            initializer=tf.random_uniform_initializer(-bound, bound))

        pre_activation = tf.matmul(input, w) + b
        output = pre_activation if func is None else func(pre_activation)

    return output
def __init__(self, embedding_dim=100, batch_size=64, n_hidden=100,
             learning_rate=0.01, n_class=3, max_sentence_len=50, l2_reg=0.,
             display_step=4, n_iter=100, type_=''):
    """Store hyper-parameters and create embeddings, inputs and softmax params.

    The word embedding is loaded from ``FLAGS.embedding_file_path`` and kept
    as a constant tensor. Forward and backward inputs get separate
    placeholders; the softmax weight/bias for the bi-LSTM output are created
    with an L2 regularizer of strength ``l2_reg``.
    """
    # Hyper-parameters.
    self.embedding_dim = embedding_dim
    self.batch_size = batch_size
    self.n_hidden = n_hidden
    self.learning_rate = learning_rate
    self.n_class = n_class
    self.max_sentence_len = max_sentence_len
    self.l2_reg = l2_reg
    self.display_step = display_step
    self.n_iter = n_iter
    self.type_ = type_

    # Pre-trained embedding, held fixed as a constant.
    # Alternatives kept for reference:
    #   self.word_embedding = tf.Variable(self.w2v, name='word_embedding')
    #   self.word_id_mapping = load_word_id_mapping(FLAGS.word_id_file_path)
    #   self.word_embedding = tf.Variable(
    #       tf.random_uniform([len(self.word_id_mapping), self.embedding_dim], -0.1, 0.1),
    #       name='word_embedding')
    self.word_id_mapping, self.w2v = load_w2v(
        FLAGS.embedding_file_path, self.embedding_dim)
    self.word_embedding = tf.constant(self.w2v, name='word_embedding')

    self.dropout_keep_prob = tf.placeholder(tf.float32)

    sentence_shape = [None, self.max_sentence_len]
    label_shape = [None, self.n_class]
    with tf.name_scope('inputs'):
        self.x = tf.placeholder(tf.int32, sentence_shape)
        self.y = tf.placeholder(tf.int32, label_shape)
        self.sen_len = tf.placeholder(tf.int32, None)

        self.x_bw = tf.placeholder(tf.int32, sentence_shape)
        self.y_bw = tf.placeholder(tf.int32, label_shape)
        self.sen_len_bw = tf.placeholder(tf.int32, [None])

        self.target_words = tf.placeholder(tf.int32, [None, 1])

    def softmax_param(var_name, shape):
        # Small uniform init plus L2 regularisation for the output layer.
        return tf.get_variable(
            name=var_name,
            shape=shape,
            initializer=tf.random_uniform_initializer(-0.003, 0.003),
            regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
        )

    with tf.name_scope('weights'):
        self.weights = {
            'softmax_bi_lstm': softmax_param(
                'bi_lstm_w', [2 * self.n_hidden, self.n_class])
        }

    with tf.name_scope('biases'):
        self.biases = {
            'softmax_bi_lstm': softmax_param('bi_lstm_b', [self.n_class])
        }
def testRandomInitializer(self):
    """Partitioned slices differ without a seed and match with a fixed seed."""

    def init_and_read_slices(initializer):
        # Create the two partitions, initialise them, return flat values.
        part0, part1 = tf.create_partitioned_variables(
            [20, 12], [1, 2], initializer)
        tf.global_variables_initializer().run()
        return part0.eval().flatten(), part1.eval().flatten()

    # Sanity check that the slices uses a different seed when using a random
    # initializer function.
    with self.test_session():
        val0, val1 = init_and_read_slices(tf.random_uniform_initializer())
        self.assertTrue(np.linalg.norm(val0 - val1) > 1e-6)

    # Negative test that proves that slices have the same values if
    # the random initializer uses a seed.
    with self.test_session():
        val0, val1 = init_and_read_slices(
            tf.random_uniform_initializer(seed=201))
        self.assertAllClose(val0, val1)
def get_params(self):
    """Create the controller weights/biases and the per-head parameters.

    Returns:
        ``(weights, biases)`` dicts. Both start with ``"hidden"`` and
        ``"output"`` entries and are then passed to ``add_head_params`` for
        every head, first as a write head and then as a read head.
    """
    n_first_layer = self.n_inputs + self.n_heads * self.mem_ncols
    init_min, init_max = -0.1, 0.1

    def make_weight(var_name, shape):
        return tf.get_variable(
            name=var_name,
            shape=shape,
            initializer=tf.random_uniform_initializer(init_min, init_max),
        )

    def make_bias(var_name, size):
        return tf.get_variable(
            name=var_name,
            shape=[size],
            initializer=tf.constant_initializer(0),
        )

    weights = {
        "hidden": make_weight("hidden_weight", [n_first_layer, self.n_hidden]),
        "output": make_weight("output_weight", [self.n_hidden, self.n_outputs]),
    }
    biases = {
        "hidden": make_bias("hidden_bias", self.n_hidden),
        "output": make_bias("output_bias", self.n_outputs),
    }

    for head in xrange(self.n_heads):
        for is_write in (True, False):
            self.add_head_params(
                weights=weights,
                biases=biases,
                i=head,
                init_min=init_min,
                init_max=init_max,
                is_write=is_write,
            )

    return weights, biases
def __call__(self, inputs, state, full_state, layer_sizes, scope=None):
    """Run one step of the gated-feedback GRU cell.

    Variables are created explicitly rather than through TensorFlow's
    built-in cell helpers.

    Fixes relative to the previous version:
      * the output block used ``with tf.Variable("Output")``, which is not a
        context manager; it is now a ``tf.variable_scope``;
      * bias variables passed their initializer as the third positional
        argument of ``tf.get_variable`` (which is ``dtype``); it is now
        passed as ``initializer=``;
      * scope names contained spaces, which TensorFlow scope-name validation
        rejects; they now use underscores.

    :param inputs: 2D Tensor with shape [batch_size x self.input_size]
    :param state: 2D Tensor with shape [batch_size x self.state_size]
    :param full_state: 2D Tensor with shape [batch_size x self.full_state_size]
    :param layer_sizes: per-layer sizes, forwarded to ``compute_feedback``
    :param scope: VariableScope for the created subgraph; defaults to class name
    :return: ``(h_t, h_t)`` -- the new state, which is also the output
    """
    with tf.variable_scope(scope or type(self).__name__):
        # Only the first half of ``state`` is used as the previous hidden state.
        h_t_prev, _ = tf.split(1, 2, state)
        x_t = inputs

        with tf.variable_scope("Update_Gate"):
            W_z = tf.get_variable(
                "W_z", [self.input_size, self._num_units],
                initializer=tf.random_uniform_initializer(-0.1, 0.1))
            # NOTE(review): U_z/U_r multiply h_t_prev but are shaped
            # [input_size, num_units]; presumably this relies on
            # input_size == num_units -- TODO confirm.
            U_z = tf.get_variable(
                "U_z", [self.input_size, self._num_units],
                initializer=tf.random_uniform_initializer(-0.1, 0.1))
            b_z = tf.get_variable(
                "b_z", [self._num_units],
                initializer=tf.constant_initializer(0.0))
            z_t = tf.sigmoid(
                tf.matmul(x_t, W_z) + tf.matmul(h_t_prev, U_z) + b_z,
                name="z_t")

        with tf.variable_scope("Reset_Gate"):
            W_r = tf.get_variable(
                "W_r", [self.input_size, self._num_units],
                initializer=tf.random_uniform_initializer(-0.1, 0.1))
            U_r = tf.get_variable(
                "U_r", [self.input_size, self._num_units],
                initializer=tf.random_uniform_initializer(-0.1, 0.1))
            b_r = tf.get_variable(
                "b_r", [self._num_units],
                initializer=tf.constant_initializer(1.0))
            r_t = tf.sigmoid(
                tf.matmul(x_t, W_r) + tf.matmul(h_t_prev, U_r) + b_r,
                name="r_t")

        with tf.variable_scope("Candidate"):
            # New memory content
            W = tf.get_variable(
                "W", [self.input_size, self._num_units],
                initializer=tf.random_uniform_initializer(-0.1, 0.1))
            # NOTE(review): ``b`` is created but never added to the
            # candidate below; kept so the variable set is unchanged.
            b = tf.get_variable(
                "b", [self._num_units],
                initializer=tf.constant_initializer(0.0))
            summation_term = self.compute_feedback(x_t, full_state, layer_sizes)
            hc_t = tf.tanh(tf.matmul(x_t, W) + tf.mul(r_t, summation_term))

        with tf.variable_scope("Output"):
            # Interpolate between the candidate and the previous state.
            h_t = tf.mul(z_t, hc_t) + tf.mul((1 - z_t), h_t_prev)

    return h_t, h_t
def testBlockGRUToGRUCellSingleStep(self):
    """A single step of GRUBlockCell matches the basic GRUCell."""
    with self.test_session(use_gpu=self._use_gpu, graph=tf.Graph()) as sess:
        batch_size, cell_size, input_size = 4, 5, 6

        seed = 1994
        initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=seed)

        # Inputs
        x = tf.zeros([batch_size, input_size])
        h = tf.zeros([batch_size, cell_size])

        # Values for the inputs.
        x_value = np.random.rand(batch_size, input_size)
        h_value = np.random.rand(batch_size, cell_size)

        def run_single_step(scope_name, cell_factory):
            # Build the cell under its own scope with the shared seeded
            # initializer, initialise variables, and evaluate one step.
            with tf.variable_scope(scope_name, initializer=initializer):
                output = cell_factory(cell_size)(x, h)
                sess.run([tf.initialize_all_variables()])
                return sess.run([output], {x: x_value, h: h_value})

        # Output from the basic GRU cell implementation.
        basic_res = run_single_step("basic", tf.nn.rnn_cell.GRUCell)

        # Output from the block GRU cell implementation.
        block_res = run_single_step("block", gru_ops.GRUBlockCell)

        self.assertEqual(len(block_res), len(basic_res))
        for block, basic in zip(block_res, basic_res):
            self.assertAllClose(block, basic)
def testLSTMBasicToBlockPeeping(self):
    """The fused peephole LSTM matches LSTMCell in outputs and gradients."""
    with self.test_session(use_gpu=self._use_gpu) as sess:
        batch_size = 2
        input_size = 3
        cell_size = 4
        sequence_length = 5

        inputs = [
            tf.convert_to_tensor(
                np.random.randn(batch_size, input_size), dtype=tf.float32)
            for _ in range(sequence_length)
        ]

        initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=19890212)

        def evaluate(rnn_outputs, weights):
            # Initialise, then fetch outputs, input grads and weight grads.
            sess.run([tf.initialize_all_variables()])
            output_values = sess.run(rnn_outputs)
            input_grads = sess.run(tf.gradients(rnn_outputs, inputs))
            weight_grads = sess.run(tf.gradients(rnn_outputs, weights))
            return output_values, input_grads, weight_grads

        with tf.variable_scope("basic", initializer=initializer):
            cell = tf.nn.rnn_cell.LSTMCell(
                cell_size, use_peepholes=True, state_is_tuple=True)
            outputs, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32)
            basic_outputs, basic_grads, basic_wgrads = evaluate(
                outputs, tf.trainable_variables())

        with tf.variable_scope("block", initializer=initializer):
            w = tf.get_variable(
                "w",
                shape=[input_size + cell_size, cell_size * 4],
                dtype=tf.float32)
            b = tf.get_variable(
                "b",
                shape=[cell_size * 4],
                dtype=tf.float32,
                initializer=tf.zeros_initializer)
            wci = tf.get_variable("wci", shape=[cell_size], dtype=tf.float32)
            wcf = tf.get_variable("wcf", shape=[cell_size], dtype=tf.float32)
            wco = tf.get_variable("wco", shape=[cell_size], dtype=tf.float32)

            _, _, _, _, _, _, outputs = fused_lstm(
                tf.convert_to_tensor(sequence_length, dtype=tf.int64),
                inputs,
                w,
                b,
                wci=wci,
                wcf=wcf,
                wco=wco,
                cell_clip=0,
                use_peephole=True)
            block_outputs, block_grads, block_wgrads = evaluate(
                outputs, [w, b, wci, wcf, wco])

        self.assertAllClose(basic_outputs, block_outputs)
        self.assertAllClose(basic_grads, block_grads)
        # Weight gradients are compared with a looser tolerance.
        for basic, block in zip(basic_wgrads, block_wgrads):
            self.assertAllClose(basic, block, rtol=1e-2, atol=1e-2)
def train(data_dir, checkpoint_path, config):
    """Train the model on the name data and checkpoint after every epoch.

    Args:
        data_dir: path to the data for the model (see data_utils for data format)
        checkpoint_path: the path to save the trained model checkpoints
        config: one of the configs that specify the model and how it should
            be run and trained

    Returns:
        None
    """
    # Prepare Name data.
    print("Reading Name data in %s" % data_dir)
    names, counts = data_utils.read_names(data_dir)

    with tf.Graph().as_default(), tf.Session() as session:
        scale = config.init_scale
        initializer = tf.random_uniform_initializer(-scale, scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = NamignizerModel(is_training=True, config=config)

        tf.global_variables_initializer().run()

        for epoch in range(config.max_max_epoch):
            # The learning rate is constant for the first max_epoch epochs
            # and decays geometrically afterwards.
            decay_steps = max(epoch - config.max_epoch, 0.0)
            lr_decay = config.lr_decay ** decay_steps
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (epoch + 1, session.run(m.lr)))
            train_perplexity = run_epoch(
                session, m, names, counts, config.epoch_size, m.train_op,
                verbose=True)
            print("Epoch: %d Train Perplexity: %.3f" %
                  (epoch + 1, train_perplexity))

            m.saver.save(session, checkpoint_path, global_step=epoch)
def __init__(self, session, np_matrix, rank, learning_rate=0.1):
    """Build a factorisation ``np_matrix ~ W @ H`` with a non-negativity penalty.

    ``W`` and ``H`` are placed on parameter-server tasks 0 and 1. The loss is
    the squared reconstruction error plus ``INFINITY`` times the sum of
    ``|v| - v`` over both factors, minimised with plain gradient descent.
    """
    n_rows, n_cols = np_matrix.shape[0], np_matrix.shape[1]
    matrix = tf.constant(np_matrix, dtype=tf.float32)

    scale = 2 * np.sqrt(np_matrix.mean() / rank)
    initializer = tf.random_uniform_initializer(maxval=scale)

    with tf.device('/job:ps/task:0'):
        self.matrix_W = tf.get_variable(
            "W", (n_rows, rank), initializer=initializer
        )
    with tf.device("/job:ps/task:1"):
        self.matrix_H = tf.get_variable(
            "H", (rank, n_cols), initializer=initializer
        )

    matrix_WH = tf.matmul(self.matrix_W, self.matrix_H)
    residual = matrix - matrix_WH
    f_norm = tf.reduce_sum(tf.pow(residual, 2))

    # |v| - v is zero for non-negative entries and positive otherwise.
    nn_w = tf.reduce_sum(tf.abs(self.matrix_W) - self.matrix_W)
    nn_h = tf.reduce_sum(tf.abs(self.matrix_H) - self.matrix_H)
    constraint = INFINITY * (nn_w + nn_h)

    self.loss = f_norm + constraint
    self.constraint = constraint
    self.session = session

    descent = tf.train.GradientDescentOptimizer(learning_rate)
    self.optimizer = descent.minimize(self.loss)
def modular_layer(inputs, modules: ModulePool, parallel_count: int, context: ModularContext):
    """Apply a layer whose modules are chosen per example by a controller.

    A dense controller produces logits over ``modules.module_count`` choices
    for each of ``parallel_count`` slots. How the selection is obtained
    depends on ``context.mode``:

    * E_STEP: the stored best selection followed by sampled selections
      (the first sample is dropped);
    * M_STEP: the stored best selection;
    * EVALUATION: the mode of the controller distribution.

    Raises:
        ValueError: for any other mode.
    """
    with tf.variable_scope(None, 'modular_layer'):
        inputs = context.begin_modular(inputs)

        # Controller: flattened input -> categorical over modules per slot.
        flat_inputs = tf.layers.flatten(inputs)
        n_logits = modules.module_count * parallel_count
        logits = tf.layers.dense(flat_inputs, n_logits)
        logits = tf.reshape(logits, [-1, parallel_count, modules.module_count])
        ctrl = tfd.Categorical(logits)

        # Persistent per-datapoint selection, randomly initialised.
        initializer = tf.random_uniform_initializer(
            maxval=modules.module_count, dtype=tf.int32)
        shape = [context.dataset_size, parallel_count]
        best_selection_persistent = tf.get_variable(
            'best_selection', shape, tf.int32, initializer)

        mode = context.mode
        if mode == ModularMode.E_STEP:
            # 1 x batch_size x 1
            stored = tf.gather(
                best_selection_persistent, context.data_indices)[tf.newaxis]
            # sample_size x batch_size x 1
            sampled = tf.reshape(
                ctrl.sample(), [context.sample_size, -1, parallel_count])
            stacked = tf.concat([stored, sampled[1:]], axis=0)
            selection = tf.reshape(stacked, [-1, parallel_count])
        elif mode == ModularMode.M_STEP:
            selection = tf.gather(
                best_selection_persistent, context.data_indices)
        elif mode == ModularMode.EVALUATION:
            selection = ctrl.mode()
        else:
            raise ValueError('Invalid modular mode')

        context.layers.append(
            ModularLayerAttributes(selection, best_selection_persistent, ctrl))

        return run_modules(
            inputs, selection, modules.module_fnc, modules.output_shape)
def xavier_init( n_inputs, n_outputs, uniform=True ):
    """Return a Xavier-style initializer for the given fan-in and fan-out.

    With ``uniform=True`` values are sampled uniformly from ``[-r, r)`` where
    ``r = sqrt(6 / (n_inputs + n_outputs))``; otherwise from a truncated
    normal with ``stddev = sqrt(3 / (n_inputs + n_outputs))``.
    """
    fan_total = n_inputs + n_outputs
    if not uniform:
        return tf.truncated_normal_initializer(
            stddev=tf.sqrt(3.0 / fan_total))
    bound = tf.sqrt(6.0 / fan_total)
    return tf.random_uniform_initializer(-bound, bound)
def testSharingWeightsWithDifferentNamescope(self):
    """Reusing a variable scope under different name scopes shares weights."""
    num_units = 3
    input_size = 5
    batch_size = 2
    num_proj = 4
    num_steps = 10

    with self.test_session(graph=tf.Graph()) as sess:
        initializer = tf.random_uniform_initializer(-1, 1, seed=self._seed)
        # The same placeholder object is used for every time step.
        inputs = num_steps * [
            tf.placeholder(tf.float32, shape=(None, input_size))]
        cell = rnn_cell.LSTMCell(
            num_units,
            input_size,
            use_peepholes=True,
            num_proj=num_proj,
            initializer=initializer)

        with tf.name_scope("scope0"):
            with tf.variable_scope("share_scope"):
                outputs0, _ = rnn.rnn(cell, inputs, dtype=tf.float32)

        with tf.name_scope("scope1"):
            with tf.variable_scope("share_scope", reuse=True):
                outputs1, _ = rnn.rnn(cell, inputs, dtype=tf.float32)

        tf.initialize_all_variables().run()
        input_value = np.random.randn(batch_size, input_size)
        fetched = sess.run(
            outputs0 + outputs1, feed_dict={inputs[0]: input_value})

        first_run = fetched[:10]
        second_run = fetched[10:]
        self.assertEqual(len(first_run), len(second_run))
        for out0, out1 in zip(first_run, second_run):
            self.assertAllEqual(out0, out1)
def __call__(self, inputs, state, scope=None):
    """Run one step of the cell.

    ``state`` is split in two along axis 1 into the previous cell state and
    the previous output. The input gate is fixed to 1; forget and output
    gates use peephole weights.

    Returns:
        ``(y, concat([c, y]))`` -- the new output and the new packed state.
    """
    with tf.variable_scope(scope or type(self).__name__):
        initializer = tf.random_uniform_initializer(-0.1, 0.1)

        def make_var(var_name, shape):
            return tf.get_variable(
                var_name, shape, initializer=initializer, dtype=inputs.dtype)

        c_prev, y_prev = tf.split(1, 2, state)

        in_shape = [self.input_size, self._num_blocks]
        rec_shape = [self._num_blocks, self._num_blocks]
        bias_shape = [1, self._num_blocks]
        peep_shape = [self._num_blocks]

        # Input, recurrent, bias and peephole parameters.
        W_z = make_var("W_z", in_shape)
        W_f = make_var("W_f", in_shape)
        W_o = make_var("W_o", in_shape)

        R_z = make_var("R_z", rec_shape)
        R_f = make_var("R_f", rec_shape)
        R_o = make_var("R_o", rec_shape)

        b_z = make_var("b_z", bias_shape)
        b_f = make_var("b_f", bias_shape)
        b_o = make_var("b_o", bias_shape)

        p_f = make_var("p_f", peep_shape)
        p_o = make_var("p_o", peep_shape)

        block_activation = output_activation = tf.tanh

        # Block input.
        z = block_activation(
            tf.matmul(inputs, W_z) + tf.matmul(y_prev, R_z) + b_z)
        # Input gate is the constant 1.
        input_gate = 1
        forget_gate = tf.sigmoid(
            tf.matmul(inputs, W_f) + tf.matmul(y_prev, R_f)
            + tf.mul(c_prev, p_f) + b_f)
        c = tf.mul(input_gate, z) + tf.mul(forget_gate, c_prev)
        output_gate = tf.sigmoid(
            tf.matmul(inputs, W_o) + tf.matmul(y_prev, R_o)
            + tf.mul(c, p_o) + b_o)
        y = tf.mul(output_activation(c), output_gate)

        return y, tf.concat(1, [c, y])
def _testDoubleInput(self, use_gpu):
    """A sharded peephole LSTM fed float64 inputs yields float64 outputs."""
    num_units = 3
    input_size = 5
    batch_size = 2
    num_proj = 4
    num_proj_shards = 4
    num_unit_shards = 2

    with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess:
        initializer = tf.random_uniform_initializer(-1, 1, seed=self._seed)
        # One float64 placeholder, repeated for all 10 time steps.
        inputs = 10 * [tf.placeholder(tf.float64)]

        cell = rnn_cell.LSTMCell(
            num_units,
            input_size=input_size,
            use_peepholes=True,
            num_proj=num_proj,
            num_unit_shards=num_unit_shards,
            num_proj_shards=num_proj_shards,
            initializer=initializer)

        zero_state = cell.zero_state(batch_size, tf.float64)
        outputs, _ = rnn.rnn(cell, inputs, initial_state=zero_state)
        self.assertEqual(len(outputs), len(inputs))

        tf.initialize_all_variables().run()
        input_value = np.asarray(
            np.random.randn(batch_size, input_size), dtype=np.float64)
        values = sess.run(outputs, feed_dict={inputs[0]: input_value})
        self.assertEqual(values[0].dtype, input_value.dtype)
def testSharingWeightsWithReuse(self):
    """``reuse=True`` shares weights; a different scope gets fresh ones."""
    num_units = 3
    input_size = 5
    batch_size = 2
    num_proj = 4

    with self.test_session(graph=tf.Graph()) as sess:
        initializer = tf.random_uniform_initializer(-1, 1, seed=self._seed)
        inputs = 10 * [
            tf.placeholder(tf.float32, shape=(None, input_size))]
        cell = rnn_cell.LSTMCell(
            num_units,
            input_size,
            use_peepholes=True,
            num_proj=num_proj,
            initializer=initializer)

        with tf.variable_scope("share_scope"):
            outputs0, _ = rnn.rnn(cell, inputs, dtype=tf.float32)
        with tf.variable_scope("share_scope", reuse=True):
            outputs1, _ = rnn.rnn(cell, inputs, dtype=tf.float32)
        with tf.variable_scope("diff_scope"):
            outputs2, _ = rnn.rnn(cell, inputs, dtype=tf.float32)

        tf.initialize_all_variables().run()
        input_value = np.random.randn(batch_size, input_size)
        fetched = sess.run(
            outputs0 + outputs1 + outputs2,
            feed_dict={inputs[0]: input_value})

        original = fetched[:10]
        shared = fetched[10:20]
        separate = fetched[20:]
        self.assertEqual(len(original), len(shared))
        self.assertEqual(len(original), len(separate))

        for o1, o2, o3 in zip(original, shared, separate):
            # Same weights used by both RNNs so outputs should be the same.
            self.assertAllEqual(o1, o2)
            # Different weights used so outputs should be different.
            self.assertTrue(np.linalg.norm(o1-o3) > 1e-6)
def build(self, _):
    """Create the trainable ``[vocab_size, embedding_dim]`` embedding matrix.

    The positional argument is ignored.
    """
    kernel_shape = [self.vocab_size, self.embedding_dim]
    kernel_init = tf.random_uniform_initializer(-0.1, 0.1)
    self.embedding = self.add_variable(
        "embedding_kernel",
        shape=kernel_shape,
        dtype=tf.float32,
        initializer=kernel_init,
        trainable=True)
def _testProjSharding(self, use_gpu):
    """An LSTM with sharded units and projection builds and runs."""
    num_units = 3
    input_size = 5
    batch_size = 2
    num_proj = 4
    num_proj_shards = 4
    num_unit_shards = 2

    with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess:
        initializer = tf.random_uniform_initializer(
            -0.01, 0.01, seed=self._seed)
        # The same placeholder is reused for all 10 time steps.
        step_input = tf.placeholder(tf.float32, shape=(None, input_size))
        inputs = 10 * [step_input]

        cell = rnn_cell.LSTMCell(
            num_units,
            input_size=input_size,
            use_peepholes=True,
            num_proj=num_proj,
            num_unit_shards=num_unit_shards,
            num_proj_shards=num_proj_shards,
            initializer=initializer)

        outputs, _ = rnn.rnn(cell, inputs, dtype=tf.float32)
        self.assertEqual(len(outputs), len(inputs))

        tf.initialize_all_variables().run()
        input_value = np.random.randn(batch_size, input_size)
        sess.run(outputs, feed_dict={inputs[0]: input_value})
def uniform(shape=None, minval=0, maxval=None, dtype=tf.float32,
            seed=None, name='Uniform'):
    """Uniform.

    Initialization with random values from a uniform distribution.

    The generated values follow a uniform distribution in the range
    `[minval, maxval)`. The lower bound `minval` is included in the range,
    while the upper bound `maxval` is excluded.

    For floats, the default range is `[0, 1)`. For ints, at least `maxval`
    must be specified explicitly.

    In the integer case, the random integers are slightly biased unless
    `maxval - minval` is an exact power of two. The bias is small for values
    of `maxval - minval` significantly smaller than the range of the output
    (either `2**32` or `2**64`).

    Args:
        shape: List of `int`. A shape to initialize a Tensor (optional).
            Any value other than `None`, including an empty shape, yields a
            Tensor; previously an empty shape was treated as "not given".
        minval: Lower bound of the range (inclusive). Default: 0.
        maxval: Upper bound of the range (exclusive). `None` leaves the
            default to TensorFlow.
        dtype: The tensor data type.
        seed: `int`. Used to create a random seed for the distribution.
        name: name of the op.

    Returns:
        The Initializer, or an initialized `Tensor` if shape is specified.

    """
    # Compare against None explicitly: truthiness would misclassify an empty
    # (scalar) shape and raises for multi-element array-like shapes.
    if shape is not None:
        return tf.random_uniform(shape=shape, minval=minval, maxval=maxval,
                                 seed=seed, dtype=dtype, name=name)
    with get_name_scope(name):
        return tf.random_uniform_initializer(minval=minval, maxval=maxval,
                                             seed=seed, dtype=dtype)
def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers,
             dropout_ratio, use_cudnn_rnn=True):
    """Assemble the embedding, recurrent and output layers of the model.

    The recurrent part is a ``cudnn_rnn.CudnnLSTM`` when ``use_cudnn_rnn``
    is true, otherwise the file's ``RNN`` layer built with the keep ratio
    ``1 - dropout_ratio``. All three layers are registered through
    ``track_layer``.
    """
    super(PTBModel, self).__init__()

    self.keep_ratio = 1 - dropout_ratio
    self.use_cudnn_rnn = use_cudnn_rnn

    embedding_layer = Embedding(vocab_size, embedding_dim)
    self.embedding = self.track_layer(embedding_layer)

    if not self.use_cudnn_rnn:
        self.rnn = RNN(hidden_dim, num_layers, self.keep_ratio)
    else:
        self.rnn = cudnn_rnn.CudnnLSTM(
            num_layers, hidden_dim, dropout=dropout_ratio)
    self.track_layer(self.rnn)

    # Projection back to vocabulary logits.
    output_layer = tf.layers.Dense(
        vocab_size,
        kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
    self.linear = self.track_layer(output_layer)

    self._output_shape = [-1, embedding_dim]
def build_input(self, feats, reuse):
    """Look up unigram embeddings and clip the image features.

    The embedding initializer is chosen by ``self.config.embd_init_type``
    ('uniform', 'random_normal' or 'truncated_normal'); any other value
    leaves the choice to ``tf.get_variable``'s default. The two normal
    variants draw a fresh seed from ``random.randint(1, 10000)``.

    Returns:
        ``(uni_embd, img_feat)``.
    """
    with tf.variable_scope('Input', reuse=reuse):
        # Unigram
        init_type = self.config.embd_init_type
        if init_type == 'uniform':
            embd_initializer = tf.random_uniform_initializer(-1., 1.)
        elif init_type == 'random_normal':
            seed = random.randint(1, 10000)
            embd_initializer = tf.random_normal_initializer(
                self.config.norm_mean, self.config.norm_std, seed=seed)
        elif init_type == 'truncated_normal':
            seed = random.randint(1, 10000)
            embd_initializer = tf.truncated_normal_initializer(
                self.config.norm_mean, self.config.norm_std, seed=seed)
        else:
            embd_initializer = None

        uni_embd_var = tf.get_variable(
            'uni_embd',
            [self.vocab_size, self.embd_size],
            initializer=embd_initializer)

        # Negative ids are mapped onto their absolute value (-1 -> 1).
        unigram_ids = tf.abs(feats['unigram'])
        uni_embd = tf.nn.embedding_lookup(uni_embd_var, unigram_ids)

        # Img Feature
        img_feat = tf.clip_by_value(feats['img_feat'], -100, 100)

    return uni_embd, img_feat
def main_word2vec_basic():
    """Train, visualize and evaluate a skip-gram word2vec model on text8.

    The steps are: load the text8 corpus, pick hyperparameters from
    `FLAGS.model`, build the vocabulary, build a TensorLayer
    `Word2vecEmbeddingInputlayer` trained with NCE loss and Adagrad, run the
    training loop (periodically printing the loss, nearest neighbours and
    saving to `<model_file_name>.npz` / `.npy`), plot the embeddings with
    t-SNE, and finally score the analogy questions in `questions-words.txt`.

    NOTE(review): the hyperparameters are only assigned when `FLAGS.model` is
    one of "one", "two", "three" or "four"; any other value reaches the
    `num_steps` computation with unbound names.
    NOTE(review): `xrange` is used below -- presumably imported from a
    compatibility module elsewhere in the file; confirm for Python 3.
    """
    tf.logging.set_verbosity(tf.logging.DEBUG)
    tl.logging.set_verbosity(tl.logging.DEBUG)

    # sess = tf.InteractiveSession()
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    # Step 1: Download the data, read the context into a list of strings.
    # Set hyperparameters.
    words = tl.files.load_matt_mahoney_text8_dataset()
    data_size = len(words)
    print(data_size)  # 17005207
    print(
        words[0:10]
    )  # ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
    # exit()

    resume = False  # load existing model, data and dictionaries
    _UNK = "_UNK"

    if FLAGS.model == "one":
        # toy setting (tensorflow/examples/tutorials/word2vec/word2vec_basic.py)
        vocabulary_size = 50000  # maximum number of word in vocabulary
        batch_size = 128
        embedding_size = 128  # Dimension of the embedding vector (hidden layer).
        skip_window = 1  # How many words to consider left and right.
        num_skips = 2  # How many times to reuse an input to generate a label.
        # (should be double of 'skip_window' so as to
        # use both left and right words)
        num_sampled = 64  # Number of negative examples to sample.
        # more negative samples, higher loss
        learning_rate = 1.0
        n_epoch = 20
        model_file_name = "model_word2vec_50k_128"
        # Eval 2084/15851 accuracy = 15.7%
    if FLAGS.model == "two":
        # (tensorflow/models/embedding/word2vec.py)
        vocabulary_size = 80000
        batch_size = 20  # Note: small batch_size need more steps for a Epoch
        embedding_size = 200
        skip_window = 5
        num_skips = 10
        num_sampled = 100
        learning_rate = 0.2
        n_epoch = 15
        model_file_name = "model_word2vec_80k_200"
        # 7.9%
    if FLAGS.model == "three":
        # (tensorflow/models/embedding/word2vec_optimized.py)
        vocabulary_size = 80000
        batch_size = 500
        embedding_size = 200
        skip_window = 5
        num_skips = 10
        num_sampled = 25
        learning_rate = 0.025
        n_epoch = 20
        model_file_name = "model_word2vec_80k_200_opt"
        # bad 0%
    if FLAGS.model == "four":
        # see: Learning word embeddings efficiently with noise-contrastive estimation
        vocabulary_size = 80000
        batch_size = 100
        embedding_size = 600
        skip_window = 5
        num_skips = 10
        num_sampled = 25
        learning_rate = 0.03
        n_epoch = 200 * 10
        model_file_name = "model_word2vec_80k_600"
        # bad

    num_steps = int(
        (data_size / batch_size) * n_epoch)  # total number of iteration

    print('%d Steps in a Epoch, total Epochs %d' % (int(data_size / batch_size), n_epoch))
    print('   learning_rate: %f' % learning_rate)
    print('   batch_size: %d' % batch_size)

    # Step 2: Build the dictionary and replace rare words with 'UNK' token.
    print()
    if resume:
        print("Load existing data and dictionaries" + "!" * 10)
        all_var = tl.files.load_npy_to_any(name=model_file_name + '.npy')
        data = all_var['data']
        count = all_var['count']
        dictionary = all_var['dictionary']
        reverse_dictionary = all_var['reverse_dictionary']
    else:
        data, count, dictionary, reverse_dictionary = tl.nlp.build_words_dataset(
            words, vocabulary_size, True, _UNK)

    print(
        'Most 5 common words (+UNK)', count[:5]
    )  # [['UNK', 418391], (b'the', 1061396), (b'of', 593677), (b'and', 416629), (b'one', 411764)]
    print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
    # [5243, 3081, 12, 6, 195, 2, 3135, 46, 59, 156] [b'anarchism', b'originated', b'as', b'a', b'term', b'of', b'abuse', b'first', b'used', b'against']

    del words  # Hint to reduce memory.

    # Step 3: Function to generate a training batch for the Skip-Gram model.
    # Two small demo batches are printed; both start at data_index=0.
    print()

    batch, labels, data_index = tl.nlp.generate_skip_gram_batch(data=data, \
        batch_size=8, num_skips=4, skip_window=2, data_index=0)
    for i in range(8):
        print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

    batch, labels, data_index = tl.nlp.generate_skip_gram_batch(data=data, \
        batch_size=8, num_skips=2, skip_window=1, data_index=0)
    for i in range(8):
        print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

    # Step 4: Build a Skip-Gram model.
    print()

    # We pick a random validation set to sample nearest neighbors. Here we limit the
    # validation samples to the words that have a low numeric ID, which by
    # construction are also the most frequent.
    valid_size = 16  # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)
    # a list of 'valid_size' integers smaller than 'valid_window'
    # print(valid_examples)   # [90 85 20 33 35 62 37 63 88 38 82 58 83 59 48 64]

    # n_epoch = int(num_steps / batch_size)

    # train_inputs is a row vector, a input is an integer id of single word.
    # train_labels is a column vector, a label is an integer id of single word.
    # valid_dataset is a column vector, a valid set is an integer id of single word.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Look up embeddings for inputs.
    emb_net = tl.layers.Word2vecEmbeddingInputlayer(
        inputs=train_inputs,
        train_labels=train_labels,
        vocabulary_size=vocabulary_size,
        embedding_size=embedding_size,
        num_sampled=num_sampled,
        nce_loss_args={},
        E_init=tf.random_uniform_initializer(minval=-1.0, maxval=1.0),
        E_init_args={},
        nce_W_init=tf.truncated_normal_initializer(
            stddev=float(1.0 / np.sqrt(embedding_size))),
        nce_W_init_args={},
        nce_b_init=tf.constant_initializer(value=0.0),
        nce_b_init_args={},
        name='word2vec_layer',
    )

    # Construct the optimizer. Note: AdamOptimizer is very slow in this case
    cost = emb_net.nce_cost
    train_params = emb_net.all_params
    # train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost, var_list=train_params)
    train_op = tf.train.AdagradOptimizer(learning_rate, initial_accumulator_value=0.1, use_locking=False).minimize(
        cost, var_list=train_params)

    # Compute the cosine similarity between minibatch examples and all embeddings.
    # For simple visualization of validation set.
    normalized_embeddings = emb_net.normalized_embeddings
    valid_embed = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embed, normalized_embeddings, transpose_b=True)
    # multiply all valid word vector with all word vector.
    # transpose_b=True, normalized_embeddings is transposed before multiplication.

    # Step 5: Start training.
    sess.run(tf.global_variables_initializer())
    if resume:
        print("Load existing model" + "!" * 10)
        # Load from ckpt or npz file
        # saver = tf.train.Saver()
        # saver.restore(sess, model_file_name+'.ckpt')
        tl.files.load_and_assign_npz_dict(name=model_file_name + '.npz', sess=sess)

    emb_net.print_params(False)
    emb_net.print_layers()

    # save vocabulary to txt
    tl.nlp.save_vocab(count, name='vocab_text8.txt')

    average_loss = 0
    step = 0
    print_freq = 2000
    while step < num_steps:
        # start_time is reset every iteration, so the "took" value printed
        # below covers only the current step, not the whole print interval.
        start_time = time.time()
        batch_inputs, batch_labels, data_index = tl.nlp.generate_skip_gram_batch(data=data, \
            batch_size=batch_size, num_skips=num_skips, skip_window=skip_window, data_index=data_index)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        # We perform one update step by evaluating the train_op (including it
        # in the list of returned values for sess.run()
        _, loss_val = sess.run([train_op, cost], feed_dict=feed_dict)
        average_loss += loss_val

        if step % print_freq == 0:
            # At step 0 the accumulator holds a single loss, so it is
            # printed without dividing.
            if step > 0:
                average_loss /= print_freq
            print("Average loss at step %d/%d. loss: %f took: %fs" % \
                (step, num_steps, average_loss, time.time() - start_time))
            average_loss = 0

        # Prints out nearby words given a list of words.
        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % (print_freq * 5) == 0:
            sim = similarity.eval(session=sess)
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors to print
                # Index 0 of the sorted similarities is skipped
                # (presumably the query word itself).
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)

        if (step % (print_freq * 20) == 0) and (step != 0):
            print("Save model, data and dictionaries" + "!" * 10)
            # Save to ckpt or npz file
            # saver = tf.train.Saver()
            # save_path = saver.save(sess, model_file_name+'.ckpt')
            tl.files.save_npz_dict(emb_net.all_params, name=model_file_name + '.npz', sess=sess)
            tl.files.save_any_to_npy(save_dict={
                'data': data,
                'count': count,
                'dictionary': dictionary,
                'reverse_dictionary': reverse_dictionary
            }, name=model_file_name + '.npy')

        # if step == num_steps-1:
        #     keeptrain = input("Training %d finished enter 1 to keep training: " % num_steps)
        #     if keeptrain == '1':
        #         step = 0
        #         learning_rate = float(input("Input new learning rate: "))
        #         train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
        step += 1

    # Step 6: Visualize the normalized embedding matrix by t-SNE.
    print()

    final_embeddings = sess.run(normalized_embeddings)  #.eval()
    tl.visualize.tsne_embedding(final_embeddings, reverse_dictionary, plot_only=500, \
        second=5, saveable=False, name='word2vec_basic')

    # Step 7: Evaluate by analogy questions. see tensorflow/models/embedding/word2vec_optimized.py
    print()
    # from tensorflow/models/embedding/word2vec.py
    analogy_questions = tl.nlp.read_analogies_file(
        eval_file='questions-words.txt', word2id=dictionary)
    # The eval feeds three vectors of word ids for a, b, c, each of
    # which is of size N, where N is the number of analogies we want to
    # evaluate in one batch.
    analogy_a = tf.placeholder(dtype=tf.int32)  # [N]
    analogy_b = tf.placeholder(dtype=tf.int32)  # [N]
    analogy_c = tf.placeholder(dtype=tf.int32)  # [N]
    # Each row of a_emb, b_emb, c_emb is a word's embedding vector.
    # They all have the shape [N, emb_dim]
    a_emb = tf.gather(normalized_embeddings, analogy_a)  # a's embs
    b_emb = tf.gather(normalized_embeddings, analogy_b)  # b's embs
    c_emb = tf.gather(normalized_embeddings, analogy_c)  # c's embs
    # We expect that d's embedding vectors on the unit hyper-sphere is
    # near: c_emb + (b_emb - a_emb), which has the shape [N, emb_dim].
    #   Bangkok Thailand Tokyo Japan -> Thailand - Bangkok = Japan - Tokyo
    #   Japan = Tokyo + (Thailand - Bangkok)
    #   d = c + (b - a)
    target = c_emb + (b_emb - a_emb)
    # Compute cosine distance between each pair of target and vocab.
    # dist has shape [N, vocab_size].
    dist = tf.matmul(target, normalized_embeddings, transpose_b=True)
    # For each question (row in dist), find the top 'n_answer' words.
    n_answer = 4
    _, pred_idx = tf.nn.top_k(dist, n_answer)

    def predict(analogy):
        """Predict the top 4 answers for analogy questions."""
        idx, = sess.run(
            [pred_idx], {
                analogy_a: analogy[:, 0],
                analogy_b: analogy[:, 1],
                analogy_c: analogy[:, 2]
            })
        return idx

    # Evaluate analogy questions and reports accuracy.
    #  i.e. How many questions we get right at precision@1.
    correct = 0
    total = analogy_questions.shape[0]
    start = 0
    while start < total:
        # Questions are scored in slices of 2500 rows.
        limit = start + 2500
        sub = analogy_questions[start:limit, :]  # question
        idx = predict(sub)  # 4 answers for each question
        # print('question:', tl.nlp.word_ids_to_words(sub[0], reverse_dictionary))
        # print('answers:', tl.nlp.word_ids_to_words(idx[0], reverse_dictionary))
        start = limit
        for question in xrange(sub.shape[0]):
            for j in xrange(n_answer):
                # if one of the top 4 answers in correct, win !
                if idx[question, j] == sub[question, 3]:
                    # Bingo! We predicted correctly. E.g., [italy, rome, france, paris].
                    print(
                        j + 1, tl.nlp.word_ids_to_words([idx[question, j]], reverse_dictionary), ':',
                        tl.nlp.word_ids_to_words(sub[question, :], reverse_dictionary))
                    correct += 1
                    break
                elif idx[question, j] in sub[question, :3]:
                    # We need to skip words already in the question.
                    continue
                else:
                    # The correct label is not the precision@1
                    break
    print("Eval %4d/%d accuracy = %4.1f%%" % (correct, total, correct * 100.0 / total))
def _build_sequence(placeholders, config):
    '''Core of the sequence model: LSTM encoder plus a self-feeding decoder.

    Args:
        placeholders: mapping that provides 'encoder_inputs',
            'dropout_input_keep_prob' and 'decoder_inputs_length'.
        config: object exposing `vocab_size`, `embedding_size`,
            `hidden_units` and `batch_size`.

    Returns:
        Tuple `(decoder_prediction, decoder_logits, inference_set)` where
        `inference_set` is `(mean_encoder_inputs_embedded,
        mean_encoder_outputs, final_cell_state, final_hidden_state)`.
    '''
    with tf.name_scope('sequence_variables'):
        # Initialize embeddings to have variance=1, encoder and decoder share the same embeddings
        sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
        initializer = tf.random_uniform_initializer(-sqrt3, sqrt3, dtype=tf.float32)
        embeddings = tf.get_variable(
            name='word_embedding_matrix',
            shape=[config.vocab_size, config.embedding_size],
            initializer=initializer,
            dtype=tf.float32)
        # Projection from decoder outputs to vocabulary logits; used both
        # inside the decoding loop and for the final logits.
        projection_weights = tf.Variable(tf.random_uniform(
            [config.hidden_units, config.vocab_size], -1, 1),
                                         dtype=tf.float32,
                                         name='projection_weights')
        projection_bias = tf.Variable(tf.zeros([config.vocab_size]),
                                      dtype=tf.float32,
                                      name='projection_bias')
        encoder_inputs_embedded = tf.nn.embedding_lookup(
            embeddings,
            placeholders['encoder_inputs'],
            name='encoder_inputs_embedded')

    with tf.name_scope('encoder_sequence'):
        encoder_cell = tf.contrib.rnn.LSTMCell(config.hidden_units)
        encoder_cell = tf.contrib.rnn.DropoutWrapper(
            encoder_cell,
            input_keep_prob=placeholders['dropout_input_keep_prob'])
        # time_major=True: inputs are laid out as [max_time, batch, ...].
        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
            encoder_cell,
            encoder_inputs_embedded,
            dtype=tf.float32,
            time_major=True,
            scope='encoder')

    with tf.name_scope('inference'):
        ## transpose the dimension of embedded input to [batch_size, max_time, embedded_size]
        encoder_inputs_embedded_ = tf.transpose(encoder_inputs_embedded,
                                                [1, 0, 2])
        mean_encoder_inputs_embedded = tf.reduce_mean(encoder_inputs_embedded_,
                                                      axis=1)
        ## change the dimension to [batch_size, max_time, cell.output_size]
        encoder_outputs_ = tf.transpose(encoder_outputs, [1, 0, 2])
        mean_encoder_outputs = tf.reduce_mean(encoder_outputs_, axis=1)
        final_cell_state = encoder_final_state[0]
        final_hidden_state = encoder_final_state[1]

    with tf.name_scope('decoder_sequence'):
        decoder_cell = tf.contrib.rnn.LSTMCell(config.hidden_units)
        ## one extra step on top of the requested length, to account for
        ## the leading _GO token
        decoder_lengths = placeholders[
            'decoder_inputs_length'] + 1
        ## create the embedded _GO
        assert TOKEN_DICT[_GO] == 1
        # NOTE(review): this tensor of ones holds the _GO id (asserted above)
        # but the op is named 'EOS'.
        go_time_slice = tf.ones([config.batch_size],
                                dtype=tf.int32,
                                name='EOS')
        go_step_embedded = tf.nn.embedding_lookup(embeddings, go_time_slice)

        def loop_fn_initial():
            '''returns the expected sets of outputs for the initial LSTM unit.
            the external variable `encoder_final_state` is used as initial_cell_state
            '''
            initial_elements_finished = (0 >= decoder_lengths
                                         )  # all False at the initial step
            initial_input = go_step_embedded
            initial_cell_state = encoder_final_state
            initial_cell_output = None
            initial_loop_state = None  # we don't need to pass any additional information
            return (initial_elements_finished, initial_input,
                    initial_cell_state, initial_cell_output,
                    initial_loop_state)

        def loop_fn_transition(time, previous_output, previous_state,
                               previous_loop_state):
            '''create the outputs for next LSTM unit
            The previous output is projected to vocabulary logits and the
            embedding of the argmax token becomes the next input, instead of
            feeding the target sequence as in `dynamic_rnn`.
            '''

            def get_next_input():
                output_logits = tf.add(
                    tf.matmul(previous_output, projection_weights),
                    projection_bias)
                prediction = tf.argmax(output_logits, axis=1)
                next_input = tf.nn.embedding_lookup(embeddings, prediction)
                return next_input

            elements_finished = (
                time >= decoder_lengths
            )  # this operation produces boolean tensor of [batch_size]
            # defining if corresponding sequence has ended
            cur_input = get_next_input()
            cur_state = previous_state
            cur_output = previous_output
            loop_state = None
            return (elements_finished, cur_input, cur_state, cur_output,
                    loop_state)

        def loop_fn(time, previous_output, previous_state,
                    previous_loop_state):
            # Dispatch to the initial or the transition step of raw_rnn.
            if previous_state is None:  # time == 0
                assert previous_output is None and previous_state is None
                return loop_fn_initial()
            else:
                return loop_fn_transition(time, previous_output,
                                          previous_state, previous_loop_state)

        decoder_outputs_tensor_array, decoder_final_state, _ = tf.nn.raw_rnn(
            decoder_cell, loop_fn)
        decoder_outputs = decoder_outputs_tensor_array.stack()

    with tf.name_scope('outputs_projection'):
        ## project the last hidden output from LSTM unit outputs to the word matrix
        decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(
            tf.shape(decoder_outputs))
        # Flatten time and batch so one matmul projects every step.
        decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, decoder_dim))
        decoder_logits_flat = tf.add(
            tf.matmul(decoder_outputs_flat, projection_weights),
            projection_bias)
        decoder_logits = tf.reshape(
            decoder_logits_flat,
            (decoder_max_steps, decoder_batch_size, config.vocab_size))
        decoder_prediction = tf.argmax(decoder_logits, 2)
        tf.summary.histogram('{}_histogram'.format('decoder_prediction'),
                             decoder_prediction)
    inference_set = (mean_encoder_inputs_embedded, mean_encoder_outputs,
                     final_cell_state, final_hidden_state)
    return decoder_prediction, decoder_logits, inference_set
def __init__(self,
             vocab_size,
             buckets,
             size,
             num_layers,
             batch_size,
             num_softmax_samples,
             do_decode,
             num_gpus=2,
             train_and_test=False):
    """Build the bucketed seq2seq graph: placeholders, encoder, decoder, loss.

    :param vocab_size: number of words; one embedding matrix is shared by
        the encoder and decoder inputs
    :param buckets: list of (encoder_length, decoder_length) pairs; the last
        bucket determines how many placeholders are created
    :param size: number of units in each LSTM cell (also the embedding width)
    :param num_layers: number of stacked encoder layers
    :param batch_size: fixed batch size of every placeholder
    :param num_softmax_samples: number of classes sampled by the sampled
        softmax loss; when 0 (or when decoding) the full sequence loss is used
    :param do_decode: training or testing; affects the seq2seq decoding
        process (feeds back the argmax prediction when true)
    :param num_gpus: number of GPUs
    :param train_and_test: training and prediction run together; when true
        the variable scopes are opened with reuse=True
    """
    self._cur_gpu = 0  # used to automatically choose between GPU and CPU
    self._num_gpus = num_gpus  # number of GPUs
    self.sess = None  # TF session; if None a new one must be created later
    self.buckets = buckets
    self.global_step = tf.Variable(
        0, trainable=False)  # tensor recording the number of training steps
    encoder_inputs = []  # encoder inputs
    decoder_inputs = []
    target_inputs = []
    loss_weight_inputs = []
    # placeholders for every encoder input position
    for i in range(buckets[-1][0]):
        encoder_inputs.append(
            tf.placeholder(tf.int32,
                           shape=[batch_size],
                           name="encoder{}".format(i)))
    squence_length = tf.placeholder(tf.int32, [batch_size],
                                    name='squence_length')
    self.squence_length = squence_length
    # placeholders for every decoder output position
    for i in range(buckets[-1][1]):
        decoder_inputs.append(
            tf.placeholder(tf.int32,
                           shape=[batch_size],
                           name="decoder{}".format(i)))
        target_inputs.append(
            tf.placeholder(tf.int64,
                           shape=[batch_size],
                           name="target{}".format(i)))
        loss_weight_inputs.append(
            tf.placeholder(tf.float32,
                           shape=[batch_size],
                           name="loss_weight{}".format(i)))
    encoder_inputs_buckets = {}
    decoder_inputs_buckets = {}
    target_inputs_buckets = {}
    loss_weight_inputs_buckets = {}
    # per-bucket encoder / decoder / target slices
    # (each bucket uses a prefix of the full placeholder lists)
    for bucket_id, bucket in enumerate(buckets):
        encoder_inputs_buckets[bucket_id] = encoder_inputs[0:bucket[0]]
        decoder_inputs_buckets[bucket_id] = decoder_inputs[0:bucket[1]]
        target_inputs_buckets[bucket_id] = target_inputs[0:bucket[1]]
        loss_weight_inputs_buckets[bucket_id] = loss_weight_inputs[
            0:bucket[1]]
    self.encoder_inputs_buckets = encoder_inputs_buckets
    self.decoder_inputs_buckets = decoder_inputs_buckets
    self.target_inputs_buckets = target_inputs_buckets
    self.loss_weight_inputs_buckets = loss_weight_inputs_buckets

    # embedding for all encoder and decoder inputs
    with tf.variable_scope(
            'embedding',
            reuse=True if train_and_test else None), tf.device('/cpu:0'):
        embedding = tf.get_variable(
            'embedding', [vocab_size, size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=1e-4))
        # every word look up a word vector.
        emb_encoder_inputs = [
            tf.nn.embedding_lookup(embedding, x) for x in encoder_inputs
        ]
        emb_decoder_inputs = [
            tf.nn.embedding_lookup(embedding, x) for x in decoder_inputs
        ]
    encoder_embedding_buckets = {}
    decoder_embedding_buckets = {}
    # per-bucket embedded encoder / decoder inputs
    for i, bucket in enumerate(buckets):
        encoder_embedding_buckets[i] = emb_encoder_inputs[0:bucket[0]]
        decoder_embedding_buckets[i] = emb_decoder_inputs[0:bucket[1]]

    # buckets are needed here: one encoder stack per bucket, sharing weights
    encoder_output_buckets = {}
    encoder_state_buckets = {}
    device = self._next_device()
    for bucket_id, bucket in enumerate(buckets):
        encoder_input_embedding = encoder_embedding_buckets[bucket_id]
        # Each layer consumes the outputs of the layer below it.
        for layer_id in range(num_layers):
            with tf.variable_scope(
                    "encoder%d" % layer_id,
                    reuse=(True if bucket_id > 0 else None) or
                    (True if train_and_test else None)), tf.device(device):
                cell = LSTMCell(num_units=size,
                                initializer=tf.random_uniform_initializer(
                                    -0.1, 0.1, seed=123),
                                state_is_tuple=True)
                encoder_input_embedding, state = static_rnn(
                    cell=cell,
                    inputs=encoder_input_embedding,
                    sequence_length=squence_length,
                    dtype=tf.float32)
        # Outputs and state of the top layer represent the bucket.
        output = encoder_input_embedding
        encoder_output_buckets[bucket_id] = output
        encoder_state_buckets[bucket_id] = state

    with tf.variable_scope('output_projection',
                           reuse=True if train_and_test else None):
        w = tf.get_variable(
            'w', [size, vocab_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=1e-4))
        w_t = tf.transpose(w)
        v = tf.get_variable(
            'v', [vocab_size],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=1e-4))
    # When decoding, feed the embedded argmax of the previous step back in.
    loop_function = _extract_argmax_and_embed(embedding,
                                              (w, v)) if do_decode else None
    cell = LSTMCell(size,
                    initializer=tf.random_uniform_initializer(-0.1,
                                                              0.1,
                                                              seed=123),
                    state_is_tuple=True)
    decoder_output_buckets = {}
    decoder_state_buckets = {}
    device = self._next_device()
    for bucket_id, bucket in enumerate(buckets):
        with tf.variable_scope(
                "decoder",
                reuse=(True if bucket_id > 0 else None) or
                (True if train_and_test else None)), tf.device(device):
            # Stack the per-step encoder outputs along axis 1 to form the
            # attention states.
            t = tf.concat(values=[
                tf.reshape(x, [-1, 1, size])
                for x in encoder_output_buckets[bucket_id]
            ],
                          axis=1)
            decoder_output, decoder_state = attention_decoder(
                decoder_inputs=decoder_embedding_buckets[bucket_id],
                initial_state=encoder_state_buckets[bucket_id],
                attention_states=t,
                cell=cell,
                num_heads=1,
                loop_function=loop_function,
                initial_state_attention=do_decode)
            decoder_output_buckets[bucket_id] = decoder_output
            decoder_state_buckets[bucket_id] = decoder_state
    model_output_buckets = {}  # output logits
    model_output_predict_buckets = {}
    model_output_predict_merger_buckets = {}
    model_output_accuracy = {}
    device = self._next_device()
    for bucket_id, bucket in enumerate(buckets):
        model_output = []
        model_output_predict = []
        model_accuracy = []
        with tf.variable_scope(
                "output",
                reuse=(True if bucket_id > 0 else None) or
                (True if train_and_test else None)), tf.device(device):
            for j in range(len(decoder_output_buckets[bucket_id])):
                # Project the decoder output to logits and compare the
                # argmax with the target of the same step.
                output = tf.nn.xw_plus_b(
                    decoder_output_buckets[bucket_id][j], w, v)
                predict = tf.argmax(input=output,
                                    axis=1,
                                    name="predict_{}_{}".format(
                                        bucket_id, j))
                accuracy_bool = tf.equal(
                    x=target_inputs_buckets[bucket_id][j], y=predict)
                model_accuracy.append(
                    tf.reduce_mean(
                        tf.cast(x=accuracy_bool, dtype=tf.float32)))
                model_output.append(output)
                model_output_predict.append(
                    tf.reshape(tensor=predict, shape=[-1, 1]))
            model_output_buckets[bucket_id] = model_output
            model_output_predict_buckets[bucket_id] = model_output_predict
            model_output_predict_merger_buckets[bucket_id] = tf.concat(
                values=model_output_predict, axis=1)
            # Mean of the per-step accuracies over the bucket's decoder length.
            model_output_accuracy[bucket_id] = tf.add_n(inputs=model_accuracy, name="bucket_id_{}".format(bucket_id)) / \
                                               buckets[bucket_id][1]
    self.model_output_buckets = model_output_buckets
    self.model_output_predict_buckets = model_output_predict_buckets
    self.model_output_predict_merger_buckets = model_output_predict_merger_buckets
    self.model_output_accuracy = model_output_accuracy

    def sampled_loss_func(labels, logits):  # TF 1.0's conventions are stricter
        # Sampled softmax loss; `logits` here receives the raw decoder
        # outputs, which are projected with w_t / v inside the loss.
        with tf.device('/cpu:0'):  # Try gpu.
            labels = tf.reshape(labels, [-1, 1])
            local_w_t = tf.cast(w_t, tf.float32)
            local_b = tf.cast(v, tf.float32)
            local_inputs = tf.cast(logits, tf.float32)
            return tf.cast(
                tf.nn.sampled_softmax_loss(weights=local_w_t,
                                           biases=local_b,
                                           labels=labels,
                                           inputs=local_inputs,
                                           num_sampled=num_softmax_samples,
                                           num_classes=vocab_size),
                tf.float32)

    device = self._next_device()
    loss_buckets = {}
    for bucket_id, bucket in enumerate(buckets):
        with tf.variable_scope(
                'loss',
                reuse=(True if bucket_id > 0 else None) or
                (True if train_and_test else None)), tf.device(device):
            if num_softmax_samples != 0 and not do_decode:
                # the inputs differ here because this branch replaces the softmax function
                loss = sequence_loss_by_example(
                    logits=decoder_output_buckets[bucket_id],
                    targets=target_inputs_buckets[bucket_id],
                    weights=loss_weight_inputs_buckets[bucket_id],
                    average_across_timesteps=True,
                    softmax_loss_function=sampled_loss_func)
                # loss = sequence_loss(logits=model_output_buckets[bucket_id],
                #                      targets=target_inputs_buckets[bucket_id],
                #                      weights=loss_weight_inputs_buckets[bucket_id]
                #                      )
            else:
                loss = sequence_loss(
                    logits=model_output_buckets[bucket_id],
                    targets=target_inputs_buckets[bucket_id],
                    weights=loss_weight_inputs_buckets[bucket_id])
            loss_buckets[bucket_id] = tf.reduce_mean(loss)  # compute the mean loss
    self.loss_buckets = loss_buckets
def main(_):
    """Train a PTB language model, reporting train/valid/test perplexity.

    Builds three `PTBModel` instances (train, valid, test) that share the
    variables of the "Model" scope, runs `config.max_max_epoch` epochs under
    a `tf.train.Supervisor`, and optionally saves a checkpoint.

    Raises:
        ValueError: if `--data_path` is not set.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")

    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data

    config = get_config()
    # The test model is evaluated one token at a time.
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model", reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True, config=config,
                             input_=train_input)
            # tf.scalar_summary was removed in TensorFlow 1.0;
            # tf.summary.scalar is its replacement.
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)

        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data,
                                   name="ValidInput")
            # reuse=True: share the weights created by the training model.
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)

        with tf.name_scope("Test"):
            test_input = PTBInput(config=eval_config, data=test_data,
                                  name="TestInput")
            with tf.variable_scope("Model", reuse=True,
                                   initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        with sv.managed_session() as session:
            for i in range(config.max_max_epoch):
                # Keep the initial rate for the first `max_epoch` epochs,
                # then decay it geometrically.
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)

                print("Epoch: %d Learning rate: %.3f" %
                      (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f" %
                      (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f" %
                      (i + 1, valid_perplexity))

            test_perplexity = run_epoch(session, mtest)
            print("Test Perplexity: %.3f" % test_perplexity)

            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path,
                              global_step=sv.global_step)
def train(data_dir, save_dir, best_dir, config):
    """Prepare the data and begin training.

    Args:
        data_dir: Directory holding the corpus and `word_freq.txt`.
        save_dir: Directory used to restore and save the running model.
        best_dir: Directory handed to the validation pass of `run_epoch`.
        config: Configuration object; it is read for hyper-parameters and
            updated in place with vocabulary sizes, word frequencies and
            `save_dir`.
    """
    # Hyper-parameters read once up front
    batch_size = config.batch_size
    timesteps = config.timesteps
    num_epochs = config.epochs

    # Text and vocabulary
    data_loader = DataLoader(
        data_dir,
        mode='train',
        tokenize_func=lmmrl_tokenizer,
        encode_func=lmmrl_encoder,
        word_markers=config.include_word_markers,
        max_word_length=config.max_word_length,
    )

    # Batch iterators for the two splits share the same geometry
    batching = {'batch_size': batch_size, 'timesteps': timesteps}
    train_batch_loader = BatchLoader(data_loader, mode='train', **batching)
    val_batch_loader = BatchLoader(data_loader, mode='val', **batching)

    # Vocabulary sizes are only known after loading
    config.word_vocab_size = len(data_loader.vocabs['words'])
    config.char_vocab_size = len(data_loader.vocabs['chars'])

    # Session options: thread counts left at 0, GPU memory grown on demand
    cfg_proto = tf.ConfigProto(intra_op_parallelism_threads=0,
                               inter_op_parallelism_threads=0)
    cfg_proto.gpu_options.allow_growth = True

    # Word frequency information: whitespace-separated tokens
    freq_path = os.path.join(data_dir, 'word_freq.txt')
    with open(freq_path, encoding='utf-8') as freq_file:
        config['freq'] = freq_file.read().split()

    # Model creation
    config.save_dir = save_dir
    model = Model(config)

    with tf.Session(config=cfg_proto, graph=model.graph) as sess:
        # Restore a saved model, or initialize fresh weights
        uniform_init = tf.random_uniform_initializer(-0.05, 0.05)
        with tf.variable_scope("model", reuse=None, initializer=uniform_init):
            steps_done = restore_model(sess, model, save_dir)
        logger.info("Loaded %d completed steps", steps_done)

        first_epoch = model.epoch_cntr.eval()
        lr = config.initial_learning_rate

        # A finalized graph cannot grow, which guards against memory leakage
        sess.graph.finalize()

        previous_val_ppl = 10000
        for epoch in range(first_epoch, num_epochs):
            logger.info("Epoch %d / %d", epoch+1, num_epochs)

            # Training pass
            run_epoch(sess, model, train_batch_loader, 'train',
                      save_dir=save_dir, lr=lr)

            # Fine-tuning after every epoch
            sess.run(model.update_unknown)
            model.fine_tune(sess)

            # Validation pass
            val_ppl = run_epoch(sess, model, val_batch_loader, 'val',
                                best_dir=best_dir)

            # Decay the learning rate when validation did not improve
            no_improvement = val_ppl >= previous_val_ppl
            previous_val_ppl = val_ppl
            if no_improvement:
                lr = lr * config.lr_decay
                logger.info("Decaying learning rate to %.4f", lr)

            # Advance the stored epoch counter
            sess.run([model.incr_epoch])
def w_initializer(dim_in, dim_out):
    """Return a uniform initializer with range +/- sqrt(6 / (dim_in + dim_out)).

    Args:
        dim_in: Input dimension of the weight.
        dim_out: Output dimension of the weight.

    Returns:
        A `tf.random_uniform_initializer` symmetric around zero.
    """
    fan_total = dim_in + dim_out
    bound = math.sqrt(6.0 / fan_total)
    lower, upper = -bound, bound
    return tf.random_uniform_initializer(lower, upper)
def word2vec(batch_gen):
    """Build the graph for the word2vec model and train it.

    Args:
        batch_gen: Iterator yielding `(centers, targets)` pairs that are fed
            to the `center_words` and `target_words` placeholders.

    The average loss is printed every `SKIP_STEP` steps and the graph is
    written to './graphs/no_frills/'.
    """
    # Step 1: define the placeholders for input and output
    # center_words have to be int to work on embedding lookup
    with tf.name_scope('data'):
        center_word = tf.placeholder(tf.int32, [BATCH_SIZE],
                                     name='center_words')
        # NOTE(review): nce_loss is called below with its default num_true,
        # so this presumably requires SKIP_WINDOW == 1 -- confirm.
        y = tf.placeholder(tf.int32, [BATCH_SIZE, SKIP_WINDOW],
                           name='target_words')

    # Step 2: define weights. In word2vec, it's actually the weights that we care about
    # vocab size x embed size
    # initialized to random uniform -1 to 1
    with tf.name_scope('embedding_matrix'):
        embed_matrix = tf.get_variable(
            'WordEmbedding', [VOCAB_SIZE, EMBED_SIZE],
            tf.float32,
            initializer=tf.random_uniform_initializer(-1.0, 1.0))

    # Step 3: define the inference
    # get the embed of input words using tf.nn.embedding_lookup
    with tf.name_scope('loss'):
        embed = tf.nn.embedding_lookup(embed_matrix, center_word,
                                       name='embed')

        # Step 4: construct variables for NCE loss
        # nce_weight (vocab size x embed size), initialized to
        # truncated_normal stddev=1.0 / (EMBED_SIZE ** 0.5)
        # bias: vocab size, initialized to 0
        nce_weight = tf.get_variable(
            'nce_weight', [VOCAB_SIZE, EMBED_SIZE],
            initializer=tf.truncated_normal_initializer(
                stddev=1.0 / (EMBED_SIZE**0.5)))
        nce_bias = tf.get_variable('nce_bias', [VOCAB_SIZE],
                                   initializer=tf.zeros_initializer())

        # define loss function to be NCE loss function
        # need to get the mean across the batch
        # note: the embedding of the center words is the input, not the
        # center words themselves
        nce_loss = tf.nn.nce_loss(nce_weight, nce_bias, y, embed,
                                  NUM_SAMPLED, VOCAB_SIZE)
        loss = tf.reduce_mean(nce_loss, 0)

    # Step 5: define optimizer
    optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # accumulates the loss over the last SKIP_STEP steps
        total_loss = 0.0
        writer = tf.summary.FileWriter('./graphs/no_frills/', sess.graph)
        try:
            for index in range(NUM_TRAIN_STEPS):
                centers, targets = next(batch_gen)
                train_dict = {center_word: centers, y: targets}
                _, loss_batch = sess.run([optimizer, loss],
                                         feed_dict=train_dict)
                total_loss += loss_batch
                if (index + 1) % SKIP_STEP == 0:
                    print('Average loss at step {}: {:5.1f}'.format(
                        index, total_loss / SKIP_STEP))
                    total_loss = 0.0
        finally:
            # Close the writer even when training is interrupted (for
            # example when batch_gen runs out) so the event file is flushed.
            writer.close()
def __init__(self,
             layer_name,
             filter_size,
             num_hidden_in,
             num_hidden,
             seq_shape,
             x_shape_in,
             tln=False,
             initializer=None):
    """Initialize the spatio-temporal Conv LSTM cell.

    Args:
        layer_name: name of this layer (stored on the instance).
        filter_size: kernel size passed to the state and input convolutions.
        num_hidden_in: fan-in used to scale the initial weights of the
            ``time_state_to_state`` and ``spatio_state_to_state`` convolutions.
        num_hidden: number of output channels of the cell; the three gate
            convolutions produce ``4 * num_hidden`` channels.
        seq_shape: indexable shape; element 0 is stored as ``batch``,
            element 2 as ``height`` and element 3 as ``width``.
        x_shape_in: fan-in used to scale the initial weights of the
            ``input_to_state`` convolution.
        tln: stored as ``self.layer_norm`` (whether to apply tensor layer
            normalization).
        initializer: ``None`` or ``-1`` selects a uniform initializer with
            range ``sqrt(6 / (dim_in + dim_out))``; any other value ``r``
            selects a uniform initializer on ``[-r, r]``.
    """
    super(SpatioTemporalLSTMCell, self).__init__()

    self.layer_name = layer_name  # name of this layer
    self.filter_size = filter_size  # convolution kernel size
    self.num_hidden_in = num_hidden_in  # size of the hidden input
    self.num_hidden = num_hidden  # number of hidden units
    self.batch = seq_shape[0]  # batch size
    self.height = seq_shape[2]  # image height
    self.width = seq_shape[3]  # image width
    self.x_shape_in = x_shape_in  # number of input channels
    self.layer_norm = tln  # whether to normalize
    self._forget_bias = 1.0  # forget-gate bias

    def w_initializer(dim_in, dim_out):
        # Uniform range scaled by fan-in + fan-out.
        random_range = math.sqrt(6.0 / (dim_in + dim_out))
        return tf.random_uniform_initializer(-random_range, random_range)

    def fixed_range_initializer(dim_in, dim_out):
        # The dims are accepted only so that both factories share the
        # (dim_in, dim_out) call signature used for every layer below.
        return tf.random_uniform_initializer(-initializer, initializer)

    # self.initializer is always a factory ``(dim_in, dim_out) -> initializer``.
    # Previously a numeric ``initializer`` stored an initializer *instance*
    # here, which was then called with the two dims as (shape, dtype).
    if initializer is None or initializer == -1:
        self.initializer = w_initializer
    else:
        self.initializer = fixed_range_initializer

    # Build the layers. Conv2D positional arguments are
    # (output channels, kernel size, stride).
    # h
    self.t_cc = layers.Conv2D(
        self.num_hidden * 4,
        self.filter_size,
        1,
        padding='same',
        kernel_initializer=self.initializer(self.num_hidden_in,
                                            self.num_hidden * 4),
        name='time_state_to_state')
    # m
    self.s_cc = layers.Conv2D(
        self.num_hidden * 4,
        self.filter_size,
        1,
        padding='same',
        kernel_initializer=self.initializer(self.num_hidden_in,
                                            self.num_hidden * 4),
        name='spatio_state_to_state')
    # x
    self.x_cc = layers.Conv2D(
        self.num_hidden * 4,
        self.filter_size,
        1,
        padding='same',
        kernel_initializer=self.initializer(self.x_shape_in,
                                            self.num_hidden * 4),
        name='input_to_state')
    # c: 1x1 convolution named 'cell_reduce'; its initializer assumes a
    # fan-in of 2 * num_hidden.
    self.c_cc = layers.Conv2D(
        self.num_hidden,
        1,
        1,
        padding='same',
        kernel_initializer=self.initializer(self.num_hidden * 2,
                                            self.num_hidden),
        name='cell_reduce')

    # bn
    self.bn_t_cc = tensor_layer_norm('st_time_state_to_state')
    self.bn_s_cc = tensor_layer_norm('st_spatio_state_to_state')
    self.bn_x_cc = tensor_layer_norm('st_input_to_state')
def get_decoder_cell(rnn_size):
    """Return an LSTM cell with ``rnn_size`` units for the decoder.

    The cell's weights use a uniform initializer on [-0.1, 0.1] with a fixed
    seed of 2.
    """
    weight_init = tf.random_uniform_initializer(-0.1, 0.1, seed=2)
    return tf.contrib.rnn.LSTMCell(rnn_size, initializer=weight_init)
def make_cell(rnn_size, keep_prob):
    """Return an LSTM cell with ``rnn_size`` units wrapped in output dropout.

    The LSTM weights use a uniform initializer on [-0.1, 0.1]; ``keep_prob``
    is forwarded as the wrapper's ``output_keep_prob``.
    """
    weight_init = tf.random_uniform_initializer(-0.1, 0.1)
    lstm = tf.contrib.rnn.LSTMCell(rnn_size, initializer=weight_init)
    return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
itertools.product(*(range(args.distinct_nums) for _ in range(args.num_terms))))) np.random.shuffle(product_array) def data(): # return np.random.randint(args.distinct_nums, # size=[args.distinct_nums, 1562]) return product_array.transpose() def target(data): return np.sum(data, 0) init = tf.random_uniform_initializer() with tf.Session() as sess, tf.variable_scope("", initializer=init): # embeddings inputs = tf.placeholder(tf.int32, shape=[args.num_terms, args.batch_size], name='inputs') with tf.device('/cpu:0'): embeddings = tf.Variable(tf.random_uniform( [args.vocabulary_size, embedding_size], -1.0, 1.0), name='embeddings') lookups = tf.nn.embedding_lookup(embeddings, inputs, name='lookups') inputs_list = tf.unpack(lookups) # GRU cell = tf.nn.rnn_cell.GRUCell(args.num_cells) cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=args.keep_prob)
def __init__(
        self,
        input_vocab_size,  # size of the input vocabulary
        target_vocab_size,  # size of the output vocabulary
        batch_size=32,  # size of a data batch
        embedding_size=300,  # embedding dimension of both vocabularies
        mode="train",  # 'train' or 'decode' (enforced below)
        hidden_units=256,  # RNN hidden size, same for encoder and decoder
        depth=1,  # number of RNN layers in encoder and decoder
        beam_width=0,  # beam-search hyper-parameter used when decoding
        cell_type="lstm",  # RNN cell type: lstm or gru
        dropout=0.2,  # fraction dropped, must satisfy 0 <= dropout < 1
        use_dropout=False,  # whether to use dropout
        use_residual=False,  # whether to use residual connections
        optimizer='adam',  # which optimizer to use
        learning_rate=1e-3,  # learning rate
        min_learning_rate=1e-5,  # minimum learning rate
        decay_steps=50000,  # number of decay steps
        max_gradient_norm=5.0,  # gradient-norm clipping coefficient
        max_decode_step=None,  # maximum decode length, may be very large
        attention_type='Bahdanau',  # attention type to use
        bidirectional=False,  # whether to use a bidirectional encoder
        time_major=False,  # whether tensors are time-major
        seed=0,  # random seed for some inter-layer operations
        parallel_iterations=None,  # number of RNN iterations run in parallel
        share_embedding=False,  # share one embedding between encoder/decoder
        pretrained_embedding=False):  # whether a pretrained embedding is used
    """Store the hyper-parameters, validate them and build the model.

    Every argument is stored on the instance under the same name, except that
    ``cell_type`` is lower-cased, ``dropout`` is stored as
    ``keep_prob = 1.0 - dropout`` and a non-int ``parallel_iterations`` falls
    back to ``batch_size``. After validation a ``keep_prob`` placeholder and a
    non-trainable ``global_step`` variable are created and
    ``self.build_model()`` is called.

    Raises:
        AssertionError: if ``cell_type`` is not gru/lstm, if
            ``share_embedding`` is set with different vocabulary sizes, if
            ``mode`` is not 'train'/'decode', if ``dropout`` is outside
            [0, 1), if ``attention_type`` is not Bahdanau/Luong, if
            ``beam_width`` is not smaller than ``target_vocab_size``, or if
            ``optimizer`` is not one of the supported names.
    """
    self.input_vocab_size = input_vocab_size
    self.target_vocab_size = target_vocab_size
    self.batch_size = batch_size
    self.embedding_size = embedding_size
    self.hidden_units = hidden_units
    self.depth = depth
    self.cell_type = cell_type.lower()
    self.use_dropout = use_dropout
    self.use_residual = use_residual
    self.attention_type = attention_type
    self.mode = mode
    self.optimizer = optimizer
    self.learning_rate = learning_rate
    self.min_learning_rate = min_learning_rate
    self.decay_steps = decay_steps
    self.max_gradient_norm = max_gradient_norm
    self.keep_prob = 1.0 - dropout
    self.seed = seed
    self.pretrained_embedding = pretrained_embedding
    self.bidirectional = bidirectional

    if isinstance(parallel_iterations, int):
        self.parallel_iterations = parallel_iterations
    else:
        self.parallel_iterations = batch_size

    self.time_major = time_major
    self.share_embedding = share_embedding

    # Uniform random initializer used for variable initialization.
    self.initializer = tf.random_uniform_initializer(-0.05,
                                                     0.05,
                                                     dtype=tf.float32)

    assert self.cell_type in ('gru', 'lstm'), 'cell_type 应该是GRU 或者是 LSTM'

    if share_embedding:
        assert input_vocab_size == target_vocab_size, \
            '如果share_embedding 为True 那么两个vocab_size 必须一样'

    assert mode in (
        'train',
        'decode'), 'mode 必须是train 或者是decode , 而不是{}'.format(mode)

    # The message used to claim "<= 1" although 1.0 itself is rejected.
    assert 0.0 <= dropout < 1.0, 'dropout 必须大于等于0 且小于1'

    # 'loung' is a historical misspelling of 'luong'; both are accepted so
    # existing callers keep working and the correct spelling is not rejected.
    # NOTE(review): confirm which spelling the attention-building code
    # elsewhere in this class compares against.
    assert attention_type.lower() in (
        'bahdanau', 'luong',
        'loung'), 'attention_type 必须是bahdanau 或者是 luong'

    assert beam_width < target_vocab_size, \
        'beam_width {} 应该小于target_vocab_size{}'.format(
            beam_width, target_vocab_size)

    self.keep_prob_placeholder = tf.placeholder(tf.float32,
                                                shape=[],
                                                name='keep_prob')
    self.global_step = tf.Variable(0, trainable=False, name='global_step')

    # Beam search is used for decoding only when a positive width is given.
    self.beam_width = beam_width
    self.use_beamsearch_decode = self.beam_width > 0
    self.max_decode_step = max_decode_step

    assert self.optimizer.lower() in ('adadelta', 'adam', 'rmsprop', 'momentum', 'sgd'), \
        'optimizer 必须是下列之一: adadelta, adam, rmsprop, momentum, sgd '

    self.build_model()
def _build_model(self):
    """Add the model to the graph under the 'sentiment' variable scope.

    Steps performed by the code below:
      * create a uniform and a truncated-normal initializer from ``hps``;
      * create the ``embedding`` variable and look up ``self._enc_batch``;
      * run ``self._add_encoder`` on the masked embeddings;
      * project every encoder step to 2 logits with ``w`` / ``v``;
      * compute ``sequence_loss`` with ``self._weight`` in the targets position
        and ``self._enc_padding_mask`` as weights, once plain and once
        multiplied by ``self.reward``;
      * store ``self._cost``, ``self._reward_cost``, ``self.max_output`` and
        an Adagrad optimizer object (no ``minimize`` call is made here).

    NOTE(review): the source indentation was lost. The extent of the
    ``if FLAGS.run_method == 'auto-encoder'`` branch and of the 'embedding'
    scope below is a reconstruction - confirm against the original file.
    With this layout any other ``run_method`` leaves ``emb_enc_inputs``
    undefined.
    """
    hps = self._hps
    vsize = self._vocab.size()  # size of the vocabulary

    with tf.variable_scope('sentiment'):
        # Some initializers
        self.rand_unif_init = tf.random_uniform_initializer(
            -hps.rand_unif_init_mag, hps.rand_unif_init_mag, seed=123)
        self.trunc_norm_init = tf.truncated_normal_initializer(
            stddev=hps.trunc_norm_init_std)

        # Add embedding matrix
        with tf.variable_scope('embedding'):
            embedding = tf.get_variable('embedding', [vsize, hps.emb_dim],
                                        dtype=tf.float32,
                                        initializer=self.trunc_norm_init)
            if FLAGS.run_method == 'auto-encoder':
                # per the original note: tensor with shape
                # (batch_size, max_enc_steps, emb_size)
                emb_enc_inputs = tf.nn.embedding_lookup(
                    embedding, self._enc_batch
                )
                # Multiply embeddings by the padding mask (broadcast over the
                # last axis).
                emb_enc_inputs = emb_enc_inputs * tf.expand_dims(
                    self._enc_padding_mask, axis=-1)

        hiddenstates = self._add_encoder(emb_enc_inputs, self._enc_lens)

        # Linear projection of every encoder step to 2 logits.
        w = tf.get_variable(
            'w', [hps.hidden_dim * 2, 2],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=1e-4))
        v = tf.get_variable(
            'v', [2],
            dtype=tf.float32,
            initializer=tf.truncated_normal_initializer(stddev=1e-4))

        # Flatten (batch, step) so xw_plus_b sees a 2-D matrix, then restore.
        hiddenstates = tf.reshape(hiddenstates,
                                  [hps.batch_size * hps.max_enc_steps, -1])
        logits = tf.nn.xw_plus_b(hiddenstates, w, v)
        logits = tf.reshape(logits, [hps.batch_size, hps.max_enc_steps, 2])

        # Per-example loss (averaged over time steps, not over the batch).
        loss = tf.contrib.seq2seq.sequence_loss(
            logits,
            self._weight,
            self._enc_padding_mask,
            average_across_timesteps=True,
            average_across_batch=False)

        self.max_output = tf.argmax(logits, axis=-1)

        # Same loss as above, scaled by self.reward.
        reward_loss = tf.contrib.seq2seq.sequence_loss(
            logits,
            self._weight,
            self._enc_padding_mask,
            average_across_timesteps=True,
            average_across_batch=False) * self.reward

        # Update the cost
        self._cost = tf.reduce_mean(loss)
        self._reward_cost = tf.reduce_mean(reward_loss)
        self.optimizer = tf.train.AdagradOptimizer(
            self._hps.lr, initial_accumulator_value=self._hps.adagrad_init_acc)
def __call__(self, x, prev_state):
    """Run one step of the memory-augmented cell.

    Args:
        x: input tensor for this step; concatenated with the previous read
            vectors along axis 1 to form the controller input.
        prev_state: dict with keys 'read_vector_list', 'controller_state',
            'w_r_list', 'M' and 'w_u' as produced by the previous step.

    Returns:
        A pair ``(NTM_output, state)``: the controller output concatenated
        with the new read vectors along axis 1, and the new state dict with
        keys 'controller_state', 'read_vector_list', 'w_r_list', 'w_w_list',
        'w_u' and 'M'.

    Raises:
        ValueError: if ``self.k_strategy`` is neither 'summary' nor
            'separate' (previously this surfaced as an UnboundLocalError).
    """
    if self.k_strategy == 'summary':
        # key + sigmoid gate
        num_parameter_per_head = self.memory_vector_dim + 1
    elif self.k_strategy == 'separate':
        # key + separate write vector + sigmoid gate
        num_parameter_per_head = self.memory_vector_dim * 2 + 1
    else:
        raise ValueError(
            "k_strategy must be 'summary' or 'separate', got %r" %
            (self.k_strategy,))

    prev_read_vector_list = prev_state['read_vector_list']
    prev_controller_state = prev_state['controller_state']

    controller_input = tf.concat([x] + prev_read_vector_list, axis=1)
    with tf.variable_scope('controller', reuse=self.reuse):
        controller_output, controller_state = self.controller(
            controller_input, prev_controller_state)

    total_parameter_num = num_parameter_per_head * self.head_num

    # Project the controller output to the parameters of all heads. The
    # projection variables are reused from the second step onwards.
    with tf.variable_scope('o2p', reuse=(self.step > 0) or self.reuse):
        o2p_w = tf.get_variable(
            'o2p_w', [controller_output.get_shape()[1], total_parameter_num],
            initializer=tf.random_uniform_initializer(minval=-0.1,
                                                      maxval=0.1))
        o2p_b = tf.get_variable(
            'o2p_b', [total_parameter_num],
            initializer=tf.random_uniform_initializer(minval=-0.1,
                                                      maxval=0.1))
        parameters = tf.nn.xw_plus_b(controller_output, o2p_w, o2p_b)
    head_parameter_list = tf.split(parameters, self.head_num, axis=1)

    prev_w_r_list = prev_state['w_r_list']
    prev_M = prev_state['M']
    prev_w_u = prev_state['w_u']
    prev_indices, prev_w_lu = self.least_used(prev_w_u)

    w_r_list = []
    w_w_list = []
    k_list = []
    a_list = []
    for i, head_parameter in enumerate(head_parameter_list):
        with tf.variable_scope('addressing_head_%d' % i):
            k = tf.tanh(head_parameter[:, 0:self.memory_vector_dim],
                        name='k')
            if self.k_strategy == 'separate':
                a = tf.tanh(
                    head_parameter[:, self.memory_vector_dim:
                                   self.memory_vector_dim * 2],
                    name='a')
            sig_alpha = tf.sigmoid(head_parameter[:, -1:], name='sig_alpha')
            w_r = self.read_head_addressing(k, prev_M)
            w_w = self.write_head_addressing(sig_alpha, prev_w_r_list[i],
                                             prev_w_lu)
        w_r_list.append(w_r)
        w_w_list.append(w_w)
        k_list.append(k)
        if self.k_strategy == 'separate':
            a_list.append(a)

    w_u = self.gamma * prev_w_u + tf.add_n(w_r_list) + tf.add_n(
        w_w_list)  # eq (20)

    # Set least used memory location computed from w_(t-1)^u to zero
    M_ = prev_M * tf.expand_dims(
        1. - tf.one_hot(prev_indices[:, -1], self.memory_size), axis=2)

    # Writing: 'summary' writes the key itself, 'separate' writes the
    # dedicated vector ``a``.
    M = M_
    with tf.variable_scope('writing'):
        for i in range(self.head_num):
            w = tf.expand_dims(w_w_list[i], axis=2)
            if self.k_strategy == 'summary':
                k = tf.expand_dims(k_list[i], axis=1)
            elif self.k_strategy == 'separate':
                k = tf.expand_dims(a_list[i], axis=1)
            M = M + tf.matmul(w, k)

    # Reading: weighted sum of the memory with each head's read weights.
    read_vector_list = []
    with tf.variable_scope('reading'):
        for i in range(self.head_num):
            read_vector = tf.reduce_sum(
                tf.expand_dims(w_r_list[i], axis=2) * M, axis=1)
            read_vector_list.append(read_vector)

    # controller_output -> NTM output
    NTM_output = tf.concat([controller_output] + read_vector_list, axis=1)

    state = {
        'controller_state': controller_state,
        'read_vector_list': read_vector_list,
        'w_r_list': w_r_list,
        'w_w_list': w_w_list,
        'w_u': w_u,
        'M': M,
    }

    self.step += 1
    return NTM_output, state
def __init__(self, sess, config, api, log_dir, scope=None):
    """Build the whole training graph of the dialog VAE model.

    The constructor creates the input placeholders, the word embedding, the
    sentence-level RNN encoding of user/system utterances, the state-level
    ``VAECell``, the ``dynamic_vae`` losses (plus an optional labeled loss when
    ``config.with_label_loss`` is set), a scalar summary, and finally calls
    ``self.optimize`` and creates a ``tf.train.Saver``.

    Args:
        sess: stored as ``self.sess`` and forwarded to ``self.optimize``.
        config: configuration object; the attributes read here are
            ``n_state``, ``cell_type``, ``encoding_cell_size``,
            ``state_cell_size``, ``keep_prob``, ``num_layer``, ``max_utt_len``,
            ``with_label_loss``, ``init_lr``, ``lr_decay`` and ``embed_size``.
        api: only ``len(api.vocab)`` is read (vocabulary size).
        log_dir: forwarded to ``self.optimize``.
        scope: stored as ``self.scope``.

    NOTE(review): the source indentation was lost; the nesting of the
    name/variable scopes below is a reconstruction. Variable names (and
    therefore checkpoints) depend on it - confirm against the original file.
    """
    self.sess = sess
    self.config = config
    self.n_state = config.n_state
    self.n_vocab = len(api.vocab)
    self.cell_type = config.cell_type
    self.encoding_cell_size = config.encoding_cell_size
    self.state_cell_size = config.state_cell_size
    self.keep_prob = config.keep_prob
    self.num_layer = config.num_layer
    self.max_utt_len = config.max_utt_len
    self.scope = scope
    # NOTE(review): this local is not used below; the code re-reads
    # self.config.with_label_loss each time.
    with_label_loss = self.config.with_label_loss

    with tf.name_scope("io"):
        self.global_t = tf.placeholder(dtype=tf.int32, name="global_t")
        self.usr_input_sent = tf.placeholder(dtype=tf.int32,
                                             shape=(None, None,
                                                    self.max_utt_len),
                                             name="user_input")
        # NOTE(review): same op name "user_input" as the placeholder above.
        self.sys_input_sent = tf.placeholder(dtype=tf.int32,
                                             shape=(None, None,
                                                    self.max_utt_len),
                                             name="user_input")
        self.dialog_length_mask = tf.placeholder(dtype=tf.int32,
                                                 shape=(None),
                                                 name="dialog_length_mask")
        self.usr_full_mask = tf.placeholder(dtype=tf.int32,
                                            shape=(None, None,
                                                   self.max_utt_len),
                                            name="usr_full_mask")
        self.sys_full_mask = tf.placeholder(dtype=tf.int32,
                                            shape=(None, None,
                                                   self.max_utt_len),
                                            name="sys_full_mask")
        # Dynamic size of axis 1 of the user input.
        max_dialog_len = tf.shape(self.usr_input_sent)[1]

        self.learning_rate = tf.Variable(float(config.init_lr),
                                         trainable=False,
                                         name="learning_rate")
        # Op that multiplies the learning rate by config.lr_decay.
        self.learning_rate_decay_op = self.learning_rate.assign(
            tf.multiply(self.learning_rate, config.lr_decay))
        # NOTE(review): second "global_t" placeholder; it replaces the
        # attribute assigned at the top of this scope.
        self.global_t = tf.placeholder(dtype=tf.int32, name="global_t")
        self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")

    if self.config.with_label_loss:
        with tf.name_scope("labeled_id"):
            # per the original note: batch_size, dialog_len, max_utt_len
            self.labeled_usr_input_sent = tf.placeholder(
                dtype=tf.int32,
                shape=(None, None, self.max_utt_len),
                name="labeled_user_input"
            )
            # NOTE(review): same op name as the labeled user placeholder.
            self.labeled_sys_input_sent = tf.placeholder(
                dtype=tf.int32,
                shape=(None, None, self.max_utt_len),
                name="labeled_user_input")
            self.labeled_dialog_length_mask = tf.placeholder(
                dtype=tf.int32,
                shape=(None),
                name="labeled_dialog_length_mask")
            self.labeled_usr_full_mask = tf.placeholder(
                dtype=tf.int32,
                shape=(None, None, self.max_utt_len),
                name="labeled_usr_full_mask")
            self.labeled_sys_full_mask = tf.placeholder(
                dtype=tf.int32,
                shape=(None, None, self.max_utt_len),
                name="labeled_sys_full_mask")
            self.labeled_labels = tf.placeholder(tf.int32,
                                                 shape=(None, None),
                                                 name="labeled_labels")

    with variable_scope.variable_scope("sent_embedding"):
        self.W_embedding = tf.get_variable(
            "W_embedding", [self.n_vocab, config.embed_size],
            dtype=tf.float32)
        # Zero out the embedding row of token id 0.
        embedding_mask = tf.constant(
            [0 if i == 0 else 1 for i in range(self.n_vocab)],
            dtype=tf.float32,
            shape=[self.n_vocab, 1])
        W_embedding = self.W_embedding * embedding_mask

        # Shape comments below are the original author's example sizes.
        usr_input_embedding = tf.nn.embedding_lookup(
            W_embedding, tf.reshape(self.usr_input_sent,
                                    [-1]))  # (8000, 300)
        usr_input_embedding = tf.reshape(
            usr_input_embedding,
            [-1, self.max_utt_len, self.config.embed_size
             ])  # (160, 50, 300)
        sys_input_embedding = tf.nn.embedding_lookup(
            W_embedding, tf.reshape(self.sys_input_sent,
                                    [-1]))  # (8000, 300)
        sys_input_embedding = tf.reshape(
            sys_input_embedding,
            [-1, self.max_utt_len, self.config.embed_size
             ])  # (160, 50, 300)

        if self.config.with_label_loss:
            labeled_usr_input_embedding = tf.nn.embedding_lookup(
                W_embedding, tf.reshape(self.labeled_usr_input_sent,
                                        [-1]))  # (8000, 300)
            labeled_usr_input_embedding = tf.reshape(
                labeled_usr_input_embedding,
                [-1, self.max_utt_len, self.config.embed_size
                 ])  # (160, 50, 300)
            labeled_sys_input_embedding = tf.nn.embedding_lookup(
                W_embedding, tf.reshape(self.labeled_sys_input_sent,
                                        [-1]))  # (8000, 300)
            labeled_sys_input_embedding = tf.reshape(
                labeled_sys_input_embedding,
                [-1, self.max_utt_len, self.config.embed_size
                 ])  # (160, 50, 300)

    with variable_scope.variable_scope("sent_level"):
        self.encoding_cell = self.get_rnncell(self.cell_type,
                                              self.encoding_cell_size,
                                              self.keep_prob,
                                              num_layer=self.num_layer)

        # One shared sentence RNN ("sent_embedding_rnn"): the first call
        # creates it, every later call passes reuse=True.
        usr_input_embedding, usr_sent_size = get_rnn_encode(
            usr_input_embedding,
            self.encoding_cell,
            scope="sent_embedding_rnn")
        sys_input_embedding, sys_sent_size = get_rnn_encode(
            sys_input_embedding,
            self.encoding_cell,
            scope="sent_embedding_rnn",
            reuse=True)

        # Element [1] of the encoder result is reshaped back to
        # (-1, max_dialog_len, sent_size[0]).
        usr_input_embedding = tf.reshape(
            usr_input_embedding[1],
            [-1, max_dialog_len, usr_sent_size[0]])
        sys_input_embedding = tf.reshape(
            sys_input_embedding[1],
            [-1, max_dialog_len, sys_sent_size[0]])

        if self.config.with_label_loss:
            labeled_usr_input_embedding, labeled_usr_sent_size = get_rnn_encode(
                labeled_usr_input_embedding,
                self.encoding_cell,
                scope="sent_embedding_rnn",
                reuse=True)
            labeled_sys_input_embedding, labeled_sys_sent_size = get_rnn_encode(
                labeled_sys_input_embedding,
                self.encoding_cell,
                scope="sent_embedding_rnn",
                reuse=True)
            labeled_usr_input_embedding = tf.reshape(
                labeled_usr_input_embedding[1],
                [-1, max_dialog_len, labeled_usr_sent_size[0]])
            labeled_sys_input_embedding = tf.reshape(
                labeled_sys_input_embedding[1],
                [-1, max_dialog_len, labeled_sys_sent_size[0]])

        if config.keep_prob < 1.0:
            usr_input_embedding = tf.nn.dropout(usr_input_embedding,
                                                config.keep_prob)
            sys_input_embedding = tf.nn.dropout(sys_input_embedding,
                                                config.keep_prob)
            if self.config.with_label_loss:
                labeled_usr_input_embedding = tf.nn.dropout(
                    labeled_usr_input_embedding, config.keep_prob)
                labeled_sys_input_embedding = tf.nn.dropout(
                    labeled_sys_input_embedding, config.keep_prob)

        # Concatenate user and system encodings along axis 2; per the
        # original note: (batch, dialog_len, embedding_size*2) (16, 10, 400)
        joint_embedding = tf.concat(
            [usr_input_embedding, sys_input_embedding], 2, "joint_embedding"
        )
        if self.config.with_label_loss:
            labeled_joint_embedding = tf.concat(
                [labeled_usr_input_embedding, labeled_sys_input_embedding],
                2, "labeled_joint_embedding"
            )

    with variable_scope.variable_scope("state_level"):
        usr_state_vocab_matrix = tf.get_variable(
            "usr_state_vocab_distribution", [self.n_state, self.n_vocab],
            dtype=tf.float32,
            initializer=tf.random_uniform_initializer())
        sys_state_vocab_matrix = tf.get_variable(
            "sys_state_vocab_distribution", [self.n_state, self.n_vocab],
            dtype=tf.float32,
            initializer=tf.random_uniform_initializer())
        # Softmax over the last axis (the vocabulary axis of the
        # [n_state, n_vocab] matrices).
        self.usr_state_vocab_matrix = tf.nn.softmax(
            usr_state_vocab_matrix, -1)
        self.sys_state_vocab_matrix = tf.nn.softmax(
            sys_state_vocab_matrix, -1)

        # NOTE(review): built with encoding_cell_size although
        # self.state_cell_size is also stored - confirm this is intended.
        self.state_cell = self.get_rnncell(self.cell_type,
                                           self.encoding_cell_size,
                                           self.keep_prob,
                                           num_layer=self.num_layer,
                                           activation=tf.nn.tanh)
        self.VAE_cell = VAECell(num_units=300,
                                state_cell=self.state_cell,
                                num_zt=self.config.n_state,
                                vocab_size=self.n_vocab,
                                max_utt_len=self.max_utt_len,
                                config=config,
                                use_peepholes=False,
                                cell_clip=None,
                                initializer=None,
                                num_proj=None,
                                proj_clip=None,
                                num_unit_shards=None,
                                num_proj_shards=None,
                                forget_bias=1.0,
                                state_is_tuple=True,
                                activation=None,
                                reuse=None,
                                name=None)

        # Original author's notes on the expected decoder inputs:
        # dec_input_embeding = placeholder(float32, (16, max_dialog_len, 50, 300))
        # dec_seq_lens = placeholder(float32, (16, max_dialog_len))
        # output_tokens = ((16, max_dialog_len, 50), int32)
        # sequence_length = (tf.int32, (16))
        dec_input_embedding_usr = tf.nn.embedding_lookup(
            W_embedding, self.usr_input_sent)  # (16, 10, 50, 300)
        dec_input_embedding_sys = tf.nn.embedding_lookup(
            W_embedding, self.sys_input_sent)  # (16, 10, 50, 300)
        dec_input_embedding = [
            dec_input_embedding_usr, dec_input_embedding_sys
        ]

        # Utterance lengths: number of non-zero mask entries along axis 2.
        dec_seq_lens_usr = tf.reduce_sum(tf.sign(self.usr_full_mask), 2)
        dec_seq_lens_sys = tf.reduce_sum(tf.sign(self.sys_full_mask), 2)
        dec_seq_lens = [dec_seq_lens_usr, dec_seq_lens_sys]

        # The decoder targets are the input token ids themselves.
        output_tokens_usr = self.usr_input_sent
        output_tokens_sys = self.sys_input_sent
        output_tokens = [output_tokens_usr, output_tokens_sys]

        if self.config.with_label_loss:
            labeled_dec_input_embedding_usr = tf.nn.embedding_lookup(
                W_embedding,
                self.labeled_usr_input_sent)  # (16, 10, 50, 300)
            labeled_dec_input_embedding_sys = tf.nn.embedding_lookup(
                W_embedding,
                self.labeled_sys_input_sent)  # (16, 10, 50, 300)
            labeled_dec_input_embedding = [
                labeled_dec_input_embedding_usr,
                labeled_dec_input_embedding_sys
            ]

            labeled_dec_seq_lens_usr = tf.reduce_sum(
                tf.sign(self.labeled_usr_full_mask), 2)
            labeled_dec_seq_lens_sys = tf.reduce_sum(
                tf.sign(self.labeled_sys_full_mask), 2)
            labeled_dec_seq_lens = [
                labeled_dec_seq_lens_usr, labeled_dec_seq_lens_sys
            ]

            labeled_output_tokens_usr = self.labeled_usr_input_sent
            labeled_output_tokens_sys = self.labeled_sys_input_sent
            labeled_output_tokens = [
                labeled_output_tokens_usr, labeled_output_tokens_sys
            ]

    with variable_scope.variable_scope(
            "dynamic_VAE_loss") as dynamic_vae_scope:
        self.initial_prev_z = tf.placeholder(
            tf.float32, (None, self.config.n_state), 'initial_prev_z')
        losses, z_ts, p_ts, bow_logits1, bow_logits2 = dynamic_vae(
            self.VAE_cell,
            joint_embedding,
            dec_input_embedding,
            dec_seq_lens,
            output_tokens,
            z_t_size=self.config.n_state,
            sequence_length=self.dialog_length_mask,
            initial_state=None,
            dtype=tf.float32,
            parallel_iterations=None,
            swap_memory=False,
            time_major=False,
            scope=None,
            initial_prev_z=self.initial_prev_z)

        if self.config.with_label_loss:
            # Second pass over the labeled batch shares the variables of the
            # first pass.
            dynamic_vae_scope.reuse_variables()
            # NOTE(review): unlike the call above, no initial_prev_z is
            # passed here - confirm dynamic_vae's default is acceptable.
            labeled_losses, labeled_z_ts, labeled_pts, labeled_bow_logits1, labeled_bow_logits2 = dynamic_vae(
                self.VAE_cell,
                labeled_joint_embedding,
                labeled_dec_input_embedding,
                labeled_dec_seq_lens,
                labeled_output_tokens,
                z_t_size=self.config.n_state,
                sequence_length=self.labeled_dialog_length_mask,
                initial_state=None,
                dtype=tf.float32,
                parallel_iterations=None,
                swap_memory=False,
                time_major=False,
                scope=None)
            self.labeled_z_ts = labeled_z_ts
            # 1.0 where the labeled user mask has any non-zero entry along
            # axis 2, else 0.0.
            self.labeled_z_ts_mask = tf.to_float(
                tf.sign(tf.reduce_sum(self.labeled_usr_full_mask, 2)))

            # Cross-entropy between the labeled z_t logits and the labels,
            # masked, summed, then divided by the total of both labeled
            # full masks.
            labeled_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.labeled_z_ts, labels=self.labeled_labels)
            labeled_loss = tf.reduce_sum(labeled_loss *
                                         self.labeled_z_ts_mask)
            labeled_loss = labeled_loss / tf.to_float(
                tf.reduce_sum(self.labeled_usr_full_mask) +
                tf.reduce_sum(self.labeled_sys_full_mask))
            self.labeled_loss = tf.identity(labeled_loss,
                                            name="labeled_loss")

    z_ts = tf.nn.softmax(z_ts)  # (16, 10, 12)
    z_ts_mask = tf.to_float(
        tf.sign(tf.reduce_sum(self.usr_full_mask, 2)))  # (16, 10)
    z_ts_mask = tf.expand_dims(z_ts_mask, 2)  # (16, 10, 1)
    self.z_ts = z_ts * z_ts_mask
    self.p_ts = p_ts
    self.bow_logits1 = bow_logits1
    self.bow_logits2 = bow_logits2

    # Sum of all losses divided by the total of both full masks.
    loss_avg = tf.reduce_sum(losses) / tf.to_float(
        tf.reduce_sum(self.usr_full_mask) +
        tf.reduce_sum(self.sys_full_mask))
    if self.config.with_label_loss:
        loss_avg = loss_avg + self.labeled_loss
    loss_avg = tf.identity(loss_avg, name="loss_average")

    self.basic_loss = loss_avg
    tf.summary.scalar("basic_loss", self.basic_loss)

    self.summary_op = tf.summary.merge_all()

    self.optimize(sess=sess,
                  config=config,
                  loss=self.basic_loss,
                  log_dir=log_dir)

    self.saver = tf.train.Saver(tf.global_variables(),
                                write_version=tf.train.SaverDef.V2)
def multi_encoder(encoder_inputs, encoders, encoder_input_length, other_inputs=None, **kwargs):
    """
    Build multiple encoders according to the configuration in `encoders`, reading from `encoder_inputs`.
    The result is a list of the outputs produced by those encoders (for each time-step), a single
    blended encoder state, the output lengths and a similarity score.

    :param encoder_inputs: list of tensors of shape (batch_size, input_length), one tensor for each encoder.
    :param encoders: list of encoder configurations
    :param encoder_input_length: list of tensors of shape (batch_size,) (one tensor for each encoder)
    :param other_inputs: optional tensor concatenated to every (non-'raw') encoder's inputs along axis 2
    :param kwargs: accepted but not used in this function
    :return:
      encoder outputs: a list of tensors of shape (batch_size, input_length, encoder_cell_size), hidden states of the
        encoders.
      encoder state: `code * (1 - sim_score) + exemplar * sim_score`, where `code` and `exemplar` are the
        final states of the encoders named 'code' and 'exemplar'
      new_encoder_input_length: list of tensors of shape (batch_size,) with the length of the encoder outputs
        (here always the unchanged entries of `encoder_input_length`).
      sim_score: output of a bias-free sigmoid dense layer with one unit applied to the concatenation of the
        'code' and 'exemplar_code' encoder states

    NOTE(review): encoders named 'code', 'exemplar_code' and 'exemplar' appear to be required; otherwise the
    corresponding values stay None when they are used at the end of this function.
    """
    encoder_states = []
    encoder_outputs = []

    # create embeddings in the global scope (allows sharing between encoder and decoder)
    embedding_variables = []
    for encoder in encoders:
        if encoder.binary:
            # binary encoders get no embedding; keep the list index-aligned with `encoders`
            embedding_variables.append(None)
            continue
        # inputs are token ids, which need to be mapped to vectors (embeddings)
        embedding_shape = [encoder.vocab_size, encoder.embedding_size]

        if encoder.embedding_initializer == 'sqrt3':
            initializer = tf.random_uniform_initializer(-math.sqrt(3), math.sqrt(3))
        else:
            initializer = None

        device = '/cpu:0' if encoder.embeddings_on_cpu else None
        with tf.device(device):
            # embeddings can take a very large amount of memory, so
            # storing them in GPU memory can be impractical
            embedding = get_variable('embedding_{}'.format(encoder.name), shape=embedding_shape,
                                     initializer=initializer)
        embedding_variables.append(embedding)

    new_encoder_input_length = []

    for i, encoder in enumerate(encoders):
        # note: this mutates the encoder configuration object
        if encoder.use_lstm is False:
            encoder.cell_type = 'GRU'

        with tf.variable_scope('encoder_{}'.format(encoder.name)):
            encoder_inputs_ = encoder_inputs[i]
            encoder_input_length_ = encoder_input_length[i]

            def get_cell(input_size=None, reuse=False):
                # Build one RNN cell for the current encoder, optionally wrapped with dropout.
                if encoder.cell_type.lower() == 'lstm':
                    cell = CellWrapper(BasicLSTMCell(encoder.cell_size, reuse=reuse))
                elif encoder.cell_type.lower() == 'dropoutgru':
                    cell = DropoutGRUCell(encoder.cell_size, reuse=reuse, layer_norm=encoder.layer_norm,
                                          input_size=input_size, input_keep_prob=encoder.rnn_input_keep_prob,
                                          state_keep_prob=encoder.rnn_state_keep_prob)
                else:
                    cell = GRUCell(encoder.cell_size, reuse=reuse, layer_norm=encoder.layer_norm)

                # DropoutGRUCell already receives its keep probabilities above
                if encoder.use_dropout and encoder.cell_type.lower() != 'dropoutgru':
                    cell = DropoutWrapper(cell, input_keep_prob=encoder.rnn_input_keep_prob,
                                          output_keep_prob=encoder.rnn_output_keep_prob,
                                          state_keep_prob=encoder.rnn_state_keep_prob,
                                          variational_recurrent=encoder.pervasive_dropout,
                                          dtype=tf.float32, input_size=input_size)
                return cell

            embedding = embedding_variables[i]

            batch_size = tf.shape(encoder_inputs_)[0]
            time_steps = tf.shape(encoder_inputs_)[1]

            if embedding is not None:
                # flatten to 1-D ids, look up, then restore (batch_size, time_steps, embedding_size)
                flat_inputs = tf.reshape(encoder_inputs_, [tf.multiply(batch_size, time_steps)])
                flat_inputs = tf.nn.embedding_lookup(embedding, flat_inputs)
                encoder_inputs_ = tf.reshape(flat_inputs,
                                             tf.stack([batch_size, time_steps, flat_inputs.get_shape()[1].value]))

            if encoder.cell_type.lower() == 'raw':
                # 'raw' encoders run no RNN: the (embedded) inputs are the outputs and the state is zeros
                encoder_outputs.append(encoder_inputs_)
                encoder_states.append(tf.zeros([batch_size, encoder.cell_size]))
                new_encoder_input_length.append(encoder_input_length_)
                continue

            if other_inputs is not None:
                encoder_inputs_ = tf.concat([encoder_inputs_, other_inputs], axis=2)

            if encoder.use_dropout:
                # first dropout: noise shared over the last axis (drops whole time-step vectors)
                noise_shape = [1, time_steps, 1] if encoder.pervasive_dropout else [batch_size, time_steps, 1]
                encoder_inputs_ = tf.nn.dropout(encoder_inputs_, keep_prob=encoder.word_keep_prob,
                                                noise_shape=noise_shape)

                # second dropout: per-feature noise
                size = tf.shape(encoder_inputs_)[2]
                noise_shape = [1, 1, size] if encoder.pervasive_dropout else [batch_size, time_steps, size]
                encoder_inputs_ = tf.nn.dropout(encoder_inputs_, keep_prob=encoder.embedding_keep_prob,
                                                noise_shape=noise_shape)

            if encoder.input_layers:
                for j, layer_size in enumerate(encoder.input_layers):
                    if encoder.input_layer_activation is not None and encoder.input_layer_activation.lower() == 'relu':
                        activation = tf.nn.relu
                    else:
                        activation = tf.tanh

                    encoder_inputs_ = dense(encoder_inputs_, layer_size, activation=activation, use_bias=True,
                                            name='layer_{}'.format(j))
                    if encoder.use_dropout:
                        encoder_inputs_ = tf.nn.dropout(encoder_inputs_, keep_prob=encoder.input_layer_keep_prob)

            # Contrary to Theano's RNN implementation, states after the sequence length are zero
            # (while Theano repeats last state)
            # NOTE(review): inter_layer_keep_prob is assigned but not used below.
            inter_layer_keep_prob = None if not encoder.use_dropout else encoder.inter_layer_keep_prob

            parameters = dict(
                inputs=encoder_inputs_, sequence_length=encoder_input_length_,
                dtype=tf.float32, parallel_iterations=encoder.parallel_iterations
            )

            input_size = encoder_inputs_.get_shape()[2].value
            state_size = (encoder.cell_size * 2 if encoder.cell_type.lower() == 'lstm' else encoder.cell_size)

            def get_initial_state(name='initial_state'):
                # Trainable initial state tiled over the batch, or None when not trained.
                if encoder.train_initial_states:
                    initial_state = get_variable(name, initializer=tf.zeros(state_size))
                    return tf.tile(tf.expand_dims(initial_state, axis=0), [batch_size, 1])
                else:
                    return None

            if encoder.bidir:
                rnn = lambda reuse: stack_bidirectional_dynamic_rnn(
                    cells_fw=[get_cell(input_size if j == 0 else 2 * encoder.cell_size, reuse=reuse)
                              for j in range(encoder.layers)],
                    cells_bw=[get_cell(input_size if j == 0 else 2 * encoder.cell_size, reuse=reuse)
                              for j in range(encoder.layers)],
                    initial_states_fw=[get_initial_state('initial_state_fw')] * encoder.layers,
                    initial_states_bw=[get_initial_state('initial_state_bw')] * encoder.layers,
                    time_pooling=encoder.time_pooling, pooling_avg=encoder.pooling_avg,
                    **parameters)

                initializer = CellInitializer(encoder.cell_size) if encoder.orthogonal_init else None
                with tf.variable_scope(tf.get_variable_scope(), initializer=initializer):
                    try:
                        encoder_outputs_, _, encoder_states_ = rnn(reuse=False)
                    except ValueError:
                        # Multi-task scenario where we're reusing the same RNN parameters
                        encoder_outputs_, _, encoder_states_ = rnn(reuse=True)
            else:
                if encoder.time_pooling or encoder.final_state == 'concat_last':
                    raise NotImplementedError

                if encoder.layers > 1:
                    cell = MultiRNNCell([get_cell(input_size if j == 0 else encoder.cell_size)
                                         for j in range(encoder.layers)])
                    initial_state = (get_initial_state(),) * encoder.layers
                else:
                    cell = get_cell(input_size)
                    initial_state = get_initial_state()

                encoder_outputs_, encoder_states_ = auto_reuse(tf.nn.dynamic_rnn)(cell=cell,
                                                                                  initial_state=initial_state,
                                                                                  **parameters)

            # backward half of the output features at the first time step
            last_backward = encoder_outputs_[:, 0, encoder.cell_size:]
            # forward half of the output features at position (length - 1) of every sequence
            indices = tf.stack([tf.range(batch_size), encoder_input_length_ - 1], axis=1)
            last_forward = tf.gather_nd(encoder_outputs_[:, :, :encoder.cell_size], indices)
            last_forward.set_shape([None, encoder.cell_size])

            if encoder.final_state == 'concat_last':
                # concats last states of all backward layers (full LSTM states)
                encoder_state_ = tf.concat(encoder_states_, axis=1)
            elif encoder.final_state == 'average':
                # masked mean of the outputs over time
                mask = tf.sequence_mask(encoder_input_length_, maxlen=tf.shape(encoder_outputs_)[1], dtype=tf.float32)
                mask = tf.expand_dims(mask, axis=2)
                encoder_state_ = tf.reduce_sum(mask * encoder_outputs_, axis=1) / tf.reduce_sum(mask, axis=1)
            elif encoder.final_state == 'average_inputs':
                # masked mean of the inputs over time
                mask = tf.sequence_mask(encoder_input_length_, maxlen=tf.shape(encoder_inputs_)[1], dtype=tf.float32)
                mask = tf.expand_dims(mask, axis=2)
                encoder_state_ = tf.reduce_sum(mask * encoder_inputs_, axis=1) / tf.reduce_sum(mask, axis=1)
            elif encoder.bidir and encoder.final_state == 'last_both':
                encoder_state_ = tf.concat([last_forward, last_backward], axis=1)
            elif encoder.bidir and not encoder.final_state == 'last_forward':
                # last backward hidden state
                encoder_state_ = last_backward
            else:
                # last forward hidden state
                encoder_state_ = last_forward

            if encoder.bidir and encoder.bidir_projection:
                encoder_outputs_ = dense(encoder_outputs_, encoder.cell_size, use_bias=False, name='bidir_projection')

            encoder_outputs.append(encoder_outputs_)
            encoder_states.append(encoder_state_)
            new_encoder_input_length.append(encoder_input_length_)

    # Pick out the states of the specially named encoders.
    exemplar_code = None
    code = None
    exemplar = None
    ast = None
    for i, encoder in enumerate(encoders):
        res = encoder_states[i]
        # NOTE(review): case-sensitive comparison here, while the loop above uses cell_type.lower().
        if encoder.cell_type == 'raw':
            # Intended as the mean of the raw outputs over time.
            # NOTE(review): tf.shape's out_type is documented as int32/int64, and tf.constant is given an
            # expression involving a tensor - this branch looks likely to fail at graph-build time; confirm.
            time_steps = tf.shape(encoder_outputs[i], out_type=tf.float32)[1]
            res = tf.einsum('ijk->ik', encoder_outputs[i])
            res = tf.scalar_mul(tf.constant(1.0/time_steps), res)
        if encoder.name == 'exemplar_code':
            exemplar_code = res
        if encoder.name == 'code':
            code = res
        if encoder.name == 'exemplar':
            exemplar = res
        if encoder.name == 'ast':
            # collected but not used below
            ast = res

    # Sigmoid similarity score between the 'code' and 'exemplar_code' states.
    activation = tf.nn.sigmoid
    sim_score = dense(tf.concat([code, exemplar_code], axis=1), 1, use_bias=False, activation=activation, name='sim_score')

    # Blend of the 'code' and 'exemplar' states weighted by the similarity score.
    encoder_state = code * (1-sim_score) + exemplar * sim_score
    return encoder_outputs, encoder_state, new_encoder_input_length, sim_score
def _add_seq2seq(self):
    """Add the whole sequence-to-sequence model to the graph.

    Built in this order: the initializers, the non-trainable embedding table
    and the encoder/decoder input lookups, the topic distribution / topic
    words / topic representation, the encoder, the decoder, the output
    projection (with a per-step topic term added to the logits), the final
    word distributions and, depending on ``hps.mode``, either the losses
    ('train'/'eval') or the top-k outputs of one beam-search step ('decode').

    Attributes assigned on ``self``: ``rand_unif_init``, ``trunc_norm_init``,
    ``embedding_init``, ``topic_distribution``, ``topic_words``,
    ``_enc_states``, ``_dec_in_state``, ``_dec_out_state``, ``attn_dists``,
    ``p_gens``, ``coverage``; in 'train'/'eval' mode ``_loss`` (plus
    ``_coverage_loss`` and ``_total_loss`` when ``hps.coverage`` is set); in
    'decode' mode ``_topk_ids`` and ``_topk_log_probs``.

    NOTE(review): the nesting of the ``with``/``if`` bodies below was
    reconstructed from a whitespace-collapsed source -- confirm it against
    the original file, since it decides variable-scope names.
    """
    hps = self._hps
    vsize = self._vocab.size()  # size of the vocabulary

    with tf.variable_scope('seq2seq'):
        # Some initializers
        self.rand_unif_init = tf.random_uniform_initializer(
            -hps.rand_unif_init_mag, hps.rand_unif_init_mag, seed=123)
        self.trunc_norm_init = tf.truncated_normal_initializer(
            stddev=hps.trunc_norm_init_std)

        # Add embedding matrix (shared by the encoder and decoder inputs)
        with tf.variable_scope('embedding'):
            # embedding = tf.get_variable('embedding', [vsize, hps.emb_dim], dtype=tf.float32, initializer=self.trunc_norm_init)
            # The table is created as zeros and is not trainable; running
            # self.embedding_init copies self.word_embedding into it.
            embedding = tf.Variable(tf.constant(0.0,
                                                shape=[vsize, hps.emb_dim]),
                                    trainable=False,
                                    name="word_embedding_w")
            self.embedding_init = embedding.assign(self.word_embedding)
            if hps.mode == "train":
                self._add_emb_vis(embedding)  # add to tensorboard
            emb_enc_inputs = tf.nn.embedding_lookup(
                embedding, self._enc_batch
            )  # tensor with shape (batch_size, max_enc_steps, emb_size)
            emb_dec_inputs = [
                tf.nn.embedding_lookup(embedding, x)
                for x in tf.unstack(self._dec_batch, axis=1)
            ]  # list length max_dec_steps containing shape (batch_size, emb_size)

        # Map article to topic distribution(batch_size, topic_num, 1)
        self.topic_distribution = self._article_topic_distribution(
            emb_enc_inputs, hps)

        # Extract topic words(batch_size, seq_num, 1)
        self.topic_words = self._extract_topic_words(
            emb_enc_inputs, self.topic_distribution, hps)

        # calculate final results based on topic_representation
        mapped_term_frequencies = self._map_term_frequency(hps)
        mu, log_sigma, kl_divergence = self._cal_topic_representation(
            mapped_term_frequencies, hps)

        # get article topic representation
        topic_additions = self._topic_representation(mu, log_sigma)

        # Add the encoder.
        enc_outputs, fw_st, bw_st = self._add_encoder(
            emb_enc_inputs, self._enc_lens)
        self._enc_states = enc_outputs

        # Our encoder is bidirectional and our decoder is unidirectional so we need to reduce the final encoder hidden state to the right size to be the initial decoder hidden state
        self._dec_in_state = self._reduce_states(fw_st, bw_st)

        # Add the decoder.
        with tf.variable_scope('decoder'):
            decoder_outputs, self._dec_out_state, self.attn_dists, self.p_gens, self.coverage = self._add_decoder(
                emb_dec_inputs)

        # Add the output projection to obtain the vocabulary distribution
        with tf.variable_scope('output_projection'):
            w = tf.get_variable('w', [hps.hidden_dim, vsize],
                                dtype=tf.float32,
                                initializer=self.trunc_norm_init)
            v = tf.get_variable('v', [vsize],
                                dtype=tf.float32,
                                initializer=self.trunc_norm_init)
            vocab_scores = [
            ]  # vocab_scores is the vocabulary distribution before applying softmax. Each entry on the list corresponds to one decoder step
            for i, output in enumerate(decoder_outputs):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                # Row i of topic_additions, sliced to size (1, vsize).
                tmp_topic_addition = tf.slice(topic_additions, [i, 0],
                                              [1, vsize])
                # Column i of the squeezed topic_words, sliced to size
                # (batch_size, 1).
                # NOTE(review): tf.squeeze has no axis argument, so every
                # size-1 dimension is dropped -- confirm this is safe when
                # batch_size == 1.
                topic_filter = tf.slice(tf.squeeze(self.topic_words),
                                        [0, i], [hps.batch_size, 1])
                logits = tf.nn.xw_plus_b(output, w, v)
                # Topic term added to the logits of this step.
                semantic = tf.multiply(topic_filter, tmp_topic_addition)
                vocab_scores.append(logits +
                                    semantic)  # apply the linear layer

            vocab_dists = [
                tf.nn.softmax(s) for s in vocab_scores
            ]  # The vocabulary distributions. List length max_dec_steps of (batch_size, vsize) arrays. The words are in the order they appear in the vocabulary file.

        # For pointer-generator model, calc final distribution from copy distribution and vocabulary distribution
        if FLAGS.pointer_gen:
            final_dists = self._calc_final_dist(vocab_dists, self.attn_dists)
        else:  # final distribution is just vocabulary distribution
            final_dists = vocab_dists

        if hps.mode in ['train', 'eval']:
            # Calculate the loss
            with tf.variable_scope('loss'):
                if FLAGS.pointer_gen:
                    # Calculate the loss per step
                    # This is fiddly; we use tf.gather_nd to pick out the probabilities of the gold target words
                    loss_per_step = [
                    ]  # will be list length max_dec_steps containing shape (batch_size)
                    batch_nums = tf.range(
                        0, limit=hps.batch_size)  # shape (batch_size)
                    for dec_step, dist in enumerate(final_dists):
                        targets = self._target_batch[:, dec_step]  # The indices of the target words. shape (batch_size)
                        indices = tf.stack((batch_nums, targets),
                                           axis=1)  # shape (batch_size, 2)
                        gold_probs = tf.gather_nd(
                            dist, indices
                        )  # shape (batch_size). prob of correct words on this step
                        losses = -tf.log(gold_probs)
                        loss_per_step.append(losses)

                    # Apply dec_padding_mask and get loss
                    self._loss = _mask_and_avg(loss_per_step,
                                               self._dec_padding_mask)

                else:  # baseline model
                    self._loss = tf.contrib.seq2seq.sequence_loss(
                        tf.stack(vocab_scores, axis=1), self._target_batch,
                        self._dec_padding_mask
                    )  # this applies softmax internally

                # The topic model's kl_divergence is subtracted from the loss.
                # NOTE(review): the sign convention of kl_divergence is set
                # by _cal_topic_representation -- confirm.
                self._loss = self._loss - kl_divergence

                tf.summary.scalar('loss', self._loss)

                # Calculate coverage loss from the attention distributions
                if hps.coverage:
                    with tf.variable_scope('coverage_loss'):
                        self._coverage_loss = _coverage_loss(
                            self.attn_dists, self._dec_padding_mask)
                        tf.summary.scalar('coverage_loss',
                                          self._coverage_loss)
                    self._total_loss = self._loss + hps.cov_loss_wt * self._coverage_loss
                    tf.summary.scalar('total_loss', self._total_loss)

    if hps.mode == "decode":
        # We run decode beam search mode one decoder step at a time
        assert len(
            final_dists
        ) == 1  # final_dists is a singleton list containing shape (batch_size, extended_vsize)
        final_dists = final_dists[0]
        topk_probs, self._topk_ids = tf.nn.top_k(
            final_dists, hps.batch_size * 2
        )  # take the k largest probs. note batch_size=beam_size in decode mode
        self._topk_log_probs = tf.log(topk_probs)
# Load the filter table: every non-blank line of the file is one row of
# whitespace-separated integer ids, appended to `filtert` in file order.
with open("../filtertWN18.txt", 'r') as f:
    idx = -1  # index of the most recently appended row
    for x in f:
        # str.split() without an argument drops the trailing newline and
        # never yields empty tokens, so a blank line or repeated spaces can
        # no longer reach int('') and crash the load.
        tokens = x.split()
        if tokens:
            filtert.append([int(i) for i in tokens])
            idx += 1
        else:
            print('length:0')

# tf.placeholder()
trainable = []  # list of trainable parameters

# Both embedding tables are drawn uniformly from [-bound, bound].
bound = 6 / math.sqrt(embed_dim)

ent_embedding = tf.get_variable(
    "ent_embedding", [n_entity, embed_dim],
    initializer=tf.random_uniform_initializer(minval=-bound,
                                              maxval=bound,
                                              seed=345))
# Disabled: entity projection table.
# ent_projecting = tf.get_variable(
#     "ent_projecting", [n_entity, embed_dim],
#     initializer=tf.random_uniform_initializer(minval=-bound,
#                                               maxval=bound, seed=347))
trainable.append(ent_embedding)
# trainable.append(ent_projecting)

rel_embedding = tf.get_variable(
    "rel_embedding", [n_relation, embed_dim],
    initializer=tf.random_uniform_initializer(minval=-bound,
                                              maxval=bound,
                                              seed=346))
# Disabled: relation projection table.
# rel_projecting = tf.get_variable(
#     "rel_projecting", [n_relation, embed_dim],
#     initializer=tf.random_uniform_initializer(minval=-bound,
#                                               maxval=bound, seed=348))
trainable.append(rel_embedding)
# trainable.append(rel_projecting)
def attention_decoder(decoder_inputs, initial_state, attention_states, encoders, decoder, encoder_input_length,
                      feed_previous=0.0, align_encoder_id=0, feed_argmax=True, sim_score=0.0, **kwargs):
    """
    Build an attention decoder that is unrolled with ``tf.while_loop``.

    :param decoder_inputs: int32 tensor of shape (batch_size, output_length)
    :param initial_state: initial state of the decoder (usually the final state of the encoder),
      as a float32 tensor of shape (batch_size, initial_state_size). This state is mapped to the
      correct state size for the decoder.
    :param attention_states: list of tensors of shape (batch_size, input_length, encoder_cell_size),
      the hidden states of the encoder(s) (one tensor for each encoder).
    :param encoders: configuration of the encoders
    :param decoder: configuration of the decoder
    :param encoder_input_length: list of int32 tensors of shape (batch_size,), tells for each encoder,
     the true length of each sequence in the batch (sequences in the same batch are padded to all have the same
     length).
    :param feed_previous: scalar tensor corresponding to the probability to use previous decoder output
      instead of the ground truth as input for the decoder (1 when decoding, between 0 and 1 when training)
    :param feed_argmax: boolean tensor, when True the greedy decoder outputs the word with the highest
    probability (argmax). When False, it samples a word from the probability distribution (softmax).
    :param align_encoder_id: outputs attention weights for this encoder. Also used when predicting edit operations
    (pred_edits), to specify which encoder reads the sequence to post-edit (MT).
    :param sim_score: forwarded unchanged to ``multi_attention`` (its meaning is defined there).
    :param kwargs: accepted but not used in this function.

    :return: a 7-tuple ``(outputs, weights, states, attns, samples, get_logits, initial_data)``:
      ``outputs`` are the per-step values returned by ``generate`` (its last layer projects to
      ``decoder.vocab_size``), ``weights`` the per-step attention weights of encoder ``align_encoder_id``,
      ``states`` the per-step decoder states, ``attns`` the per-step context vectors and ``samples`` the
      per-step predicted symbols, all transposed so that the batch dimension comes first.
      ``get_logits`` is the single-step function used for beam-search decoding and ``initial_data`` is the
      concatenated initial state it expects.
    """
    assert not decoder.pred_maxout_layer or decoder.cell_size % 2 == 0, 'cell size must be a multiple of 2'

    # NOTE: this mutates the caller's decoder configuration object.
    if decoder.use_lstm is False:
        decoder.cell_type = 'GRU'

    embedding_shape = [decoder.vocab_size, decoder.embedding_size]
    if decoder.embedding_initializer == 'sqrt3':
        initializer = tf.random_uniform_initializer(-math.sqrt(3), math.sqrt(3))
    else:
        initializer = None

    device = '/cpu:0' if decoder.embeddings_on_cpu else None
    # The embedding variable is named after this decoder, or after
    # decoder.share_emb when that is set.
    if decoder.share_emb is None:
        with tf.device(device):
            embedding = get_variable('embedding_{}'.format(decoder.name), shape=embedding_shape,
                                     initializer=initializer)
    else:
        with tf.device(device):
            embedding = get_variable('embedding_{}'.format(decoder.share_emb), shape=embedding_shape,
                                     initializer=initializer)

    input_shape = tf.shape(decoder_inputs)
    batch_size = input_shape[0]
    time_steps = input_shape[1]

    scope_name = 'decoder_{}'.format(decoder.name)
    scope_name += '/' + '_'.join(encoder.name for encoder in encoders)

    def embed(input_):
        """Look up the embeddings of ``input_`` and apply the configured word/embedding dropout."""
        embedded_input = tf.nn.embedding_lookup(embedding, input_)
        # These two names shadow the outer input_shape/batch_size on purpose:
        # they describe the tensor being embedded here.
        input_shape = tf.shape(embedded_input)
        batch_size = input_shape[0]

        if decoder.use_dropout and decoder.word_keep_prob is not None:
            # noise_shape has 1 in the last axis: each word vector is kept or dropped as a whole.
            noise_shape = [1, 1] if decoder.pervasive_dropout else [batch_size, 1]
            embedded_input = tf.nn.dropout(embedded_input, keep_prob=decoder.word_keep_prob, noise_shape=noise_shape)
        if decoder.use_dropout and decoder.embedding_keep_prob is not None:
            size = tf.shape(embedded_input)[1]
            noise_shape = [1, size] if decoder.pervasive_dropout else [batch_size, size]
            embedded_input = tf.nn.dropout(embedded_input, keep_prob=decoder.embedding_keep_prob,
                                           noise_shape=noise_shape)

        return embedded_input

    def get_cell(input_size=None, reuse=False):
        """Build the (possibly multi-layer) recurrent cell described by the decoder config."""
        cells = []

        for j in range(decoder.layers):
            # Only the first layer sees the external input; deeper layers see cell outputs.
            input_size_ = input_size if j == 0 else decoder.cell_size

            if decoder.cell_type.lower() == 'lstm':
                cell = CellWrapper(BasicLSTMCell(decoder.cell_size, reuse=reuse))
            elif decoder.cell_type.lower() == 'dropoutgru':
                cell = DropoutGRUCell(decoder.cell_size, reuse=reuse, layer_norm=decoder.layer_norm,
                                      input_size=input_size_, input_keep_prob=decoder.rnn_input_keep_prob,
                                      state_keep_prob=decoder.rnn_state_keep_prob)
            else:
                cell = GRUCell(decoder.cell_size, reuse=reuse, layer_norm=decoder.layer_norm)

            # DropoutGRUCell receives its keep probabilities directly, so it is not wrapped again.
            if decoder.use_dropout and decoder.cell_type.lower() != 'dropoutgru':
                cell = DropoutWrapper(cell, input_keep_prob=decoder.rnn_input_keep_prob,
                                      output_keep_prob=decoder.rnn_output_keep_prob,
                                      state_keep_prob=decoder.rnn_state_keep_prob,
                                      variational_recurrent=decoder.pervasive_dropout,
                                      dtype=tf.float32, input_size=input_size_)
            cells.append(cell)

        if len(cells) == 1:
            return cells[0]
        else:
            return CellWrapper(MultiRNNCell(cells))

    def look(state, input_, prev_weights=None, pos=None):
        """Run ``multi_attention``; return the context and the weights of encoder ``align_encoder_id``."""
        # prev_weights / pos are only passed for the aligned encoder; every other encoder gets None.
        prev_weights_ = [prev_weights if i == align_encoder_id else None for i in range(len(encoders))]
        pos_ = None
        if decoder.pred_edits:
            pos_ = [pos if i == align_encoder_id else None for i in range(len(encoders))]
        if decoder.attn_prev_word:
            state = tf.concat([state, input_], axis=1)

        parameters = dict(hidden_states=attention_states, encoder_input_length=encoder_input_length,
                          encoders=encoders, aggregation_method=decoder.aggregation_method, sim_score=sim_score)
        context, new_weights = multi_attention(state, pos=pos_, prev_weights=prev_weights_, **parameters)

        if decoder.context_mapping:
            with tf.variable_scope(scope_name):
                activation = tf.nn.tanh if decoder.context_mapping_activation == 'tanh' else None
                use_bias = not decoder.context_mapping_no_bias
                context = dense(context, decoder.context_mapping, use_bias=use_bias, activation=activation,
                                name='context_mapping')

        return context, new_weights[align_encoder_id]

    def update(state, input_, context=None, symbol=None):
        """Advance the recurrent cell by one step; return ``(output, new_state)``."""
        if context is not None and decoder.rnn_feed_attn:
            input_ = tf.concat([input_, context], axis=1)
        input_size = input_.get_shape()[1].value

        initializer = CellInitializer(decoder.cell_size) if decoder.orthogonal_init else None
        with tf.variable_scope(tf.get_variable_scope(), initializer=initializer):
            try:
                output, new_state = get_cell(input_size)(input_, state)
            except ValueError:  # auto_reuse doesn't work with LSTM cells
                output, new_state = get_cell(input_size, reuse=True)(input_, state)

        if decoder.skip_update and decoder.pred_edits and symbol is not None:
            # Where the symbol is DEL, the previous state is kept instead of the new one.
            is_del = tf.equal(symbol, utils.DEL_ID)
            new_state = tf.where(is_del, state, new_state)

        if decoder.cell_type.lower() == 'lstm' and decoder.use_lstm_full_state:
            output = new_state

        return output, new_state

    def update_pos(pos, symbol, max_pos=None):
        """Increase ``pos`` by one wherever ``symbol`` is KEEP or DEL (only when ``pred_edits`` is set)."""
        if not decoder.pred_edits:
            return pos

        is_keep = tf.equal(symbol, utils.KEEP_ID)
        is_del = tf.equal(symbol, utils.DEL_ID)
        is_not_ins = tf.logical_or(is_keep, is_del)

        pos = beam_search.resize_like(pos, symbol)
        # NOTE(review): resize_like is applied to max_pos before the None check below --
        # confirm that it tolerates max_pos=None.
        max_pos = beam_search.resize_like(max_pos, symbol)

        pos += tf.to_float(is_not_ins)
        if max_pos is not None:
            pos = tf.minimum(pos, tf.to_float(max_pos))
        return pos

    def generate(state, input_, context):
        """Project state, (optionally) previous word and context to ``decoder.vocab_size`` outputs."""
        if decoder.pred_use_lstm_state is False:  # for back-compatibility
            state = state[:,-decoder.cell_size:]

        projection_input = [state, context]
        if decoder.use_previous_word:
            projection_input.insert(1, input_)  # for back-compatibility

        output_ = tf.concat(projection_input, axis=1)

        if decoder.pred_deep_layer:
            deep_layer_size = decoder.pred_deep_layer_size or decoder.embedding_size
            if decoder.layer_norm:
                output_ = dense(output_, deep_layer_size, use_bias=False, name='deep_output')
                output_ = tf.contrib.layers.layer_norm(output_, activation_fn=tf.nn.tanh, scope='output_layer_norm')
            else:
                output_ = dense(output_, deep_layer_size, activation=tf.tanh, use_bias=True, name='deep_output')

            if decoder.use_dropout:
                size = tf.shape(output_)[1]
                noise_shape = [1, size] if decoder.pervasive_dropout else None
                output_ = tf.nn.dropout(output_, keep_prob=decoder.deep_layer_keep_prob, noise_shape=noise_shape)
        else:
            if decoder.pred_maxout_layer:
                maxout_size = decoder.maxout_size or decoder.cell_size
                output_ = dense(output_, maxout_size, use_bias=True, name='maxout')
                if decoder.old_maxout:  # for back-compatibility with old models
                    output_ = tf.nn.pool(tf.expand_dims(output_, axis=2), window_shape=[2], pooling_type='MAX',
                                         padding='SAME', strides=[2])
                    output_ = tf.squeeze(output_, axis=2)
                else:
                    # Element-wise max of the two halves along axis 1.
                    output_ = tf.maximum(*tf.split(output_, num_or_size_splits=2, axis=1))

            if decoder.pred_embed_proj:
                # intermediate projection to embedding size (before projecting to vocabulary size)
                # this is useful to reduce the number of parameters, and
                # to use the output embeddings for output projection (tie_embeddings parameter)
                output_ = dense(output_, decoder.embedding_size, use_bias=False, name='softmax0')

        if decoder.tie_embeddings and (decoder.pred_embed_proj or decoder.pred_deep_layer):
            # Tied embeddings: the transposed embedding matrix is the output projection.
            bias = get_variable('softmax1/bias', shape=[decoder.vocab_size])
            output_ = tf.matmul(output_, tf.transpose(embedding)) + bias
        else:
            output_ = dense(output_, decoder.vocab_size, use_bias=True, name='softmax1')
        return output_

    # The state width is doubled for 'lstm' cells and multiplied by the number of layers.
    state_size = (decoder.cell_size * 2 if decoder.cell_type.lower() == 'lstm' else decoder.cell_size) * decoder.layers

    if decoder.use_dropout:
        initial_state = tf.nn.dropout(initial_state, keep_prob=decoder.initial_state_keep_prob)

    # Map the provided initial state to the decoder's state size.
    with tf.variable_scope(scope_name):
        if decoder.layer_norm:
            initial_state = dense(initial_state, state_size, use_bias=False, name='initial_state_projection')
            initial_state = tf.contrib.layers.layer_norm(initial_state, activation_fn=tf.nn.tanh,
                                                         scope='initial_state_layer_norm')
        else:
            initial_state = dense(initial_state, state_size, use_bias=True, name='initial_state_projection',
                                  activation=tf.nn.tanh)

    if decoder.cell_type.lower() == 'lstm' and decoder.use_lstm_full_state:
        initial_output = initial_state
    else:
        initial_output = initial_state[:, -decoder.cell_size:]

    time = tf.constant(0, dtype=tf.int32, name='time')
    outputs = tf.TensorArray(dtype=tf.float32, size=time_steps)
    samples = tf.TensorArray(dtype=tf.int64, size=time_steps)
    # decoder_inputs is transposed first, so each TensorArray element holds one time step.
    inputs = tf.TensorArray(dtype=tf.int64, size=time_steps).unstack(tf.to_int64(tf.transpose(decoder_inputs)))

    states = tf.TensorArray(dtype=tf.float32, size=time_steps)
    weights = tf.TensorArray(dtype=tf.float32, size=time_steps)
    attns = tf.TensorArray(dtype=tf.float32, size=time_steps)

    initial_symbol = inputs.read(0)  # first symbol is BOS
    initial_input = embed(initial_symbol)
    initial_pos = tf.zeros([batch_size], tf.float32)
    initial_weights = tf.zeros(tf.shape(attention_states[align_encoder_id])[:2])
    with tf.variable_scope('decoder_{}'.format(decoder.name)):
        initial_context, _ = look(initial_output, initial_input, pos=initial_pos, prev_weights=initial_weights)
    # Flat per-example layout [state | context | pos | attention weights]; get_logits splits it back.
    initial_data = tf.concat([initial_state, initial_context, tf.expand_dims(initial_pos, axis=1), initial_weights],
                             axis=1)
    context_size = initial_context.shape[1].value

    def get_logits(state, ids, time):  # for beam-search decoding
        """Run one decoding step on a flat state; return ``(new_flat_state, logits)``."""
        with tf.variable_scope('decoder_{}'.format(decoder.name)):
            state, context, pos, prev_weights = tf.split(state, [state_size, context_size, 1, -1], axis=1)
            input_ = embed(ids)

            pos = tf.squeeze(pos, axis=1)
            # The position is left unchanged on the very first step.
            pos = tf.cond(tf.equal(time, 0),
                          lambda: pos,
                          lambda: update_pos(pos, ids, encoder_input_length[align_encoder_id]))

            if decoder.cell_type.lower() == 'lstm' and decoder.use_lstm_full_state:
                output = state
            else:
                # output is always the right-most part of state. However, this only works at test time,
                # because different dropout operations can be used on state and output.
                output = state[:, -decoder.cell_size:]

            if decoder.conditional_rnn:
                with tf.variable_scope('conditional_1'):
                    output, state = update(state, input_)
            elif decoder.update_first:
                output, state = update(state, input_, None, ids)
            elif decoder.generate_first:
                output, state = tf.cond(tf.equal(time, 0),
                                        lambda: (output, state),
                                        lambda: update(state, input_, context, ids))

            context, new_weights = look(output, input_, pos=pos, prev_weights=prev_weights)

            if decoder.conditional_rnn:
                with tf.variable_scope('conditional_2'):
                    output, state = update(state, context)
            elif not decoder.generate_first:
                output, state = update(state, input_, context, ids)

            logits = generate(output, input_, context)

            pos = tf.expand_dims(pos, axis=1)
            state = tf.concat([state, context, pos, new_weights], axis=1)
            return state, logits

    def _time_step(time, input_, input_symbol, pos, state, output, outputs, states, weights, attns, prev_weights,
                   samples):
        """Body of the decoding while-loop: one update / look / generate step."""
        if decoder.conditional_rnn:
            with tf.variable_scope('conditional_1'):
                output, state = update(state, input_)
        elif decoder.update_first:
            output, state = update(state, input_, None, input_symbol)

        context, new_weights = look(output, input_, pos=pos, prev_weights=prev_weights)

        if decoder.conditional_rnn:
            with tf.variable_scope('conditional_2'):
                output, state = update(state, context)
        elif not decoder.generate_first:
            output, state = update(state, input_, context, input_symbol)

        output_ = generate(output, input_, context)

        # Three candidate sources for the next input symbol.
        argmax = lambda: tf.argmax(output_, 1)
        target = lambda: inputs.read(time + 1)
        softmax = lambda: tf.squeeze(tf.multinomial(tf.log(tf.nn.softmax(output_)), num_samples=1),
                                     axis=1)

        # Feed the ground-truth symbol only when a next step exists and a uniform draw is >= feed_previous.
        use_target = tf.logical_and(time < time_steps - 1, tf.random_uniform([]) >= feed_previous)
        predicted_symbol = tf.case([
            (use_target, target),
            (tf.logical_not(feed_argmax), softmax)],
            default=argmax)   # default case is useful for beam-search

        predicted_symbol.set_shape([None])
        predicted_symbol = tf.stop_gradient(predicted_symbol)
        samples = samples.write(time, predicted_symbol)

        input_ = embed(predicted_symbol)
        pos = update_pos(pos, predicted_symbol, encoder_input_length[align_encoder_id])

        attns = attns.write(time, context)
        weights = weights.write(time, new_weights)
        states = states.write(time, state)
        outputs = outputs.write(time, output_)

        if not decoder.conditional_rnn and not decoder.update_first and decoder.generate_first:
            output, state = update(state, input_, context, predicted_symbol)

        return (time + 1, input_, predicted_symbol, pos, state, output, outputs, states, weights, attns, new_weights,
                samples)

    with tf.variable_scope('decoder_{}'.format(decoder.name)):
        # NOTE(review): loop_vars passes (..., weights, states, ...) while _time_step declares
        # (..., states, weights, ...). The two TensorArrays are created with the same dtype and size
        # above and the results are unpacked in the body's order, so the swap looks harmless --
        # confirm before reordering.
        _, _, _, new_pos, new_state, _, outputs, states, weights, attns, new_weights, samples = tf.while_loop(
            cond=lambda time, *_: time < time_steps,
            body=_time_step,
            loop_vars=(time, initial_input, initial_symbol, initial_pos, initial_state, initial_output, outputs,
                       weights, states, attns, initial_weights, samples),
            parallel_iterations=decoder.parallel_iterations,
            swap_memory=decoder.swap_memory)

    outputs = outputs.stack()
    weights = weights.stack()  # batch_size, encoders, output time, input time
    states = states.stack()
    attns = attns.stack()
    samples = samples.stack()

    # put batch_size as first dimension
    outputs = tf.transpose(outputs, perm=(1, 0, 2))
    weights = tf.transpose(weights, perm=(1, 0, 2))
    states = tf.transpose(states, perm=(1, 0, 2))
    attns = tf.transpose(attns, perm=(1, 0, 2))
    samples = tf.transpose(samples)

    return outputs, weights, states, attns, samples, get_logits, initial_data
def create_model(session, run_options, run_metadata):
    """Build a ``SeqModelDistributed`` from FLAGS and initialise or restore its parameters.

    Parameters are restored from ``FLAGS.saved_model_dir`` when the mode is
    DUMP_LSTM, BEAM_DECODE or FORCE_DECODE, or when ``FLAGS.fromScratch`` is
    off and a checkpoint exists; otherwise they are initialised from scratch.

    :param session: session used to load or initialise the parameters
    :param run_options: forwarded to the model constructor
    :param run_metadata: forwarded to the model constructor
    :return: the constructed model
    :raises ValueError: if the mode requires a saved model but no checkpoint
        is found in ``FLAGS.saved_model_dir``
    """
    device_strs = FLAGS.NN.split(",")
    devices_per_model = [get_device_address(x) for x in device_strs]
    num_models = FLAGS.num_models
    dtype = FLAGS.tf_dtype

    # FLAGS.p == 0.0 means "keep the default initializer".
    initializer = None
    if FLAGS.p != 0.0:
        initializer = tf.random_uniform_initializer(-FLAGS.p, FLAGS.p)

    # Both modules export the same class name; pick the implementation here.
    if FLAGS.dynamic_rnn:
        from seqModelDistributed_dynamic import SeqModelDistributed
    else:
        from seqModelDistributed import SeqModelDistributed

    with tf.variable_scope("", initializer=initializer):
        model = SeqModelDistributed(FLAGS._buckets,
                                    FLAGS.size,
                                    FLAGS.real_vocab_size_from,
                                    FLAGS.real_vocab_size_to,
                                    FLAGS.num_layers,
                                    FLAGS.max_gradient_norm,
                                    FLAGS.batch_size,
                                    FLAGS.learning_rate,
                                    FLAGS.learning_rate_decay_factor,
                                    optimizer=FLAGS.optimizer,
                                    dropoutRate=FLAGS.keep_prob,
                                    dtype=dtype,
                                    devices_per_model=devices_per_model,
                                    topk_n=FLAGS.beam_size,
                                    run_options=run_options,
                                    run_metadata=run_metadata,
                                    with_attention=FLAGS.attention,
                                    beam_search=FLAGS.beam_search,
                                    beam_buckets=_beam_buckets,
                                    with_sampled_softmax=FLAGS.with_sampled_softmax,
                                    n_samples=FLAGS.n_samples,
                                    attention_style=FLAGS.attention_style,
                                    attention_scale=FLAGS.attention_scale,
                                    num_models=num_models,
                                    tie_input_output_embedding=FLAGS.tie_input_output_embedding,
                                    variational_dropout=FLAGS.variational_dropout)

    ckpt = tf.train.get_checkpoint_state(FLAGS.saved_model_dir)
    # if FLAGS.recommend or (not FLAGS.fromScratch) and ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
    restore_required = FLAGS.mode in ("DUMP_LSTM", "BEAM_DECODE", "FORCE_DECODE")
    if restore_required or (not FLAGS.fromScratch and ckpt):
        if not ckpt:
            # get_checkpoint_state returns None when the directory holds no
            # checkpoint; fail with a clear message instead of an
            # AttributeError on `ckpt.model_checkpoint_path`.
            raise ValueError(
                "No checkpoint found in %r, but mode %r requires a saved model"
                % (FLAGS.saved_model_dir, FLAGS.mode))
        if FLAGS.load_from_best:
            best_model_path = os.path.join(
                os.path.dirname(ckpt.model_checkpoint_path), "best-0")
            model.load_parameters(session, best_model_path)
        else:
            model.load_parameters(session, ckpt.model_checkpoint_path)

        # Beam-search variables are not part of the restored parameters, so
        # they are initialised separately.
        if FLAGS.mode == 'BEAM_DECODE':
            session.run(tf.variables_initializer(model.beam_search_vars))
    else:
        model.init_parameters_from_scratch(session)
    return model
def add_word_embedding(self):
    """Create the word-embedding table and embed the input ids ``self.x``.

    Sets ``self.embedding`` (a float32 variable of shape
    [vocab_size, embedding_dim], initialised uniformly in [-1, 1]) and
    ``self.embedded`` (the lookup of ``self.x`` in that table).
    """
    table_shape = [self.vocab_size, self.embedding_dim]
    uniform_init = tf.random_uniform_initializer(-1.0, 1.0)
    self.embedding = tf.get_variable('word_embedding',
                                     table_shape,
                                     dtype=tf.float32,
                                     initializer=uniform_init)

    # forward activation of the input network
    self.embedded = tf.nn.embedding_lookup(self.embedding, self.x)
def __init__(self, is_training, length, leaking_rate=0.2, initLen=50):
    """Build the reservoir graph, its linear read-out and (when training) the train op.

    :param is_training: when true, the first ``initLen`` steps are left out of
        the read-out and a gradient-descent train op is created
    :param length: number of time steps to unroll
    :param leaking_rate: mixing factor between the previous state and the new
        tanh activation
    :param initLen: number of initial steps skipped while training
    """
    batch_size = FLAGS.batch_size
    num_steps = length
    inSize = FLAGS.input_dim
    resSize = FLAGS.hidden_dim
    self.batch_size = batch_size
    self.num_steps = num_steps
    self.inSize = inSize
    self.resSize = resSize

    self._input_data = tf.placeholder(tf.float32,
                                      [batch_size, length, inSize])

    # While training, targets only cover the steps after the first initLen.
    target_steps = length - initLen if is_training else length
    self._targets = tf.placeholder(
        tf.float32, [batch_size, target_steps, FLAGS.output_dim])

    # Input and reservoir weights are fed in, not learned.
    Win = tf.placeholder(tf.float32, [inSize, resSize])
    self._Win = Win
    W = tf.placeholder(tf.float32, [resSize, resSize])
    self._W = W

    zero_state = array_ops.zeros(array_ops.pack([batch_size, resSize]),
                                 dtype=tf.float32)
    zero_state.set_shape([None, resSize])
    self._initial_state = zero_state
    # self._initial_state = np.zeros((batch_size, resSize), dtype=np.float32)

    collected = []
    state = self._initial_state
    with tf.variable_scope("ESN"):
        for step in range(num_steps):
            # Leaky-integrator update of the reservoir state.
            decayed = (1 - leaking_rate) * state
            drive = tf.matmul(self._input_data[:, step, :], Win) + tf.matmul(state, W)
            state = decayed + leaking_rate * tf.nn.tanh(drive)
            # Evaluation keeps every step; training skips the first initLen.
            if not is_training or step >= initLen:
                collected.append(
                    tf.concat(1, [self._input_data[:, step, :], state]))
    self._final_state = state

    # Linear read-out over the concatenated [input, state] features.
    V_size = inSize + resSize
    hidden_output = tf.reshape(tf.concat(1, collected), [-1, V_size])
    V = tf.get_variable(
        "v",
        shape=[V_size, FLAGS.output_dim],
        dtype=tf.float32,
        initializer=tf.random_uniform_initializer(-tf.sqrt(1. / V_size),
                                                  tf.sqrt(1. / V_size)))
    b = tf.get_variable("b",
                        shape=[FLAGS.output_dim],
                        dtype=tf.float32,
                        initializer=tf.constant_initializer(0.1))
    logits = tf.add(tf.matmul(hidden_output, V), b)

    target = tf.reshape(self._targets, [-1, FLAGS.output_dim])
    # Half the summed squared error drives the gradients; the mean squared
    # error is what gets reported as the cost.
    training_loss = tf.reduce_sum(tf.pow(logits - target, 2)) / 2
    mse = tf.reduce_mean(tf.pow(logits - target, 2))
    self._cost = mse
    self._logits = logits

    if is_training:
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        raw_grads = tf.gradients(training_loss, tvars)
        grads, _ = tf.clip_by_global_norm(raw_grads, FLAGS.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self._train_op = optimizer.apply_gradients(zip(grads, tvars))
################################################################################ inp = tf.placeholder(dtype, [2, 4, 6, 5], 'input') conv = tf.layers.conv2d(inputs=inp, filters=4, kernel_size=[3, 5], padding='VALID', activation=tf.nn.elu, bias_initializer=tf.random_normal_initializer()) save(inp, conv, prefix + 'padding_valid') ################################################################################ inp = tf.placeholder(dtype, [3, 2, 3, 4], 'input') conv = tf.layers.conv2d(inputs=inp, filters=4, kernel_size=[1, 1], activation=tf.nn.tanh, bias_initializer=tf.random_uniform_initializer( 0, 1)) conv2 = tf.layers.conv2d(inputs=inp, filters=4, kernel_size=[1, 1], activation=tf.nn.sigmoid, bias_initializer=None) eltwise_add_mul = (inp * 0.31 + 2 * conv) * conv2 save(inp, eltwise_add_mul, prefix + 'eltwise_add_mul') ################################################################################ inp = tf.placeholder(dtype, [1, 4, 5, 1], 'input') conv = tf.layers.conv2d(inputs=inp, filters=4, kernel_size=[3, 1], padding='VALID') padded = tf.pad(conv, [[0, 0], [0, 2], [0, 0], [0, 0]]) merged = tf.concat([padded, inp], axis=3)
def _add_seq2seq(self):
    """Build the attention seq2seq graph.

    Creates, under the 'seq2seq' scope: the shared embedding, a stack of
    bidirectional LSTM encoder layers, the attention decoder, the output
    projection and the loss. In 'decode' mode it also sets ``self._outputs``,
    ``self._topk_log_probs`` and ``self._topk_ids``. Always sets
    ``self._enc_top_states``, ``self._dec_in_state``, ``self._dec_out_state``
    and ``self._loss``.
    """
    hps = self._hps
    vsize = self._vocab.NumIds()

    with tf.variable_scope('seq2seq'):
        # Transpose each input and unstack it into a list along the first axis.
        encoder_inputs = tf.unstack(tf.transpose(self._articles))
        decoder_inputs = tf.unstack(tf.transpose(self._abstracts))
        targets = tf.unstack(tf.transpose(self._targets))
        loss_weights = tf.unstack(tf.transpose(self._loss_weights))
        article_lens = self._article_lens

        # Embedding shared by the input and outputs.
        with tf.variable_scope('embedding'), tf.device('/cpu:0'):
            emb_table = tf.get_variable(
                'embedding', [vsize, hps.emb_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            emb_encoder_inputs = [
                tf.nn.embedding_lookup(emb_table, ids)
                for ids in encoder_inputs
            ]
            emb_decoder_inputs = [
                tf.nn.embedding_lookup(emb_table, ids)
                for ids in decoder_inputs
            ]

        # Stacked bidirectional encoder: each layer consumes the outputs of
        # the layer below it.
        layer_inputs = emb_encoder_inputs
        for layer_i in xrange(hps.enc_layers):
            with tf.variable_scope('encoder%d' % layer_i), tf.device(
                    self._next_device()):
                fw_init = tf.random_uniform_initializer(-0.1, 0.1, seed=123)
                cell_fw = tf.contrib.rnn.LSTMCell(hps.num_hidden,
                                                  initializer=fw_init,
                                                  state_is_tuple=False)
                bw_init = tf.random_uniform_initializer(-0.1, 0.1, seed=113)
                cell_bw = tf.contrib.rnn.LSTMCell(hps.num_hidden,
                                                  initializer=bw_init,
                                                  state_is_tuple=False)
                layer_inputs, fw_state, _ = tf.contrib.rnn.static_bidirectional_rnn(
                    cell_fw,
                    cell_bw,
                    layer_inputs,
                    dtype=tf.float32,
                    sequence_length=article_lens)
        encoder_outputs = layer_inputs

        with tf.variable_scope('output_projection'):
            proj_w = tf.get_variable(
                'w', [hps.num_hidden, vsize],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            proj_w_t = tf.transpose(proj_w)
            proj_b = tf.get_variable(
                'v', [vsize],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope('decoder'), tf.device(self._next_device()):
            # When decoding, use model output from the previous step
            # for the next step.
            if hps.mode == 'decode':
                feed_prev_fn = _extract_argmax_and_embed(
                    emb_table, (proj_w, proj_b), update_embedding=False)
            else:
                feed_prev_fn = None

            dec_cell = tf.contrib.rnn.LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=113),
                state_is_tuple=False)

            # One (batch, 1, 2 * num_hidden) slice per encoder step, joined
            # along axis 1 for the attention mechanism.
            attn_shape = [hps.batch_size, 1, 2 * hps.num_hidden]
            encoder_outputs = [
                tf.reshape(step_out, attn_shape)
                for step_out in encoder_outputs
            ]
            self._enc_top_states = tf.concat(axis=1, values=encoder_outputs)
            self._dec_in_state = fw_state
            # During decoding, follow up _dec_in_state are fed from beam_search.
            # dec_out_state are stored by beam_search for next step feeding.
            decoder_outputs, self._dec_out_state = tf.contrib.legacy_seq2seq.attention_decoder(
                emb_decoder_inputs,
                self._dec_in_state,
                self._enc_top_states,
                dec_cell,
                num_heads=1,
                loop_function=feed_prev_fn,
                initial_state_attention=(hps.mode == 'decode'))

        with tf.variable_scope('output'), tf.device(self._next_device()):
            model_outputs = []
            for step, dec_out in enumerate(decoder_outputs):
                if step > 0:
                    tf.get_variable_scope().reuse_variables()
                model_outputs.append(tf.nn.xw_plus_b(dec_out, proj_w, proj_b))

        if hps.mode == 'decode':
            with tf.variable_scope('decode_output'), tf.device('/cpu:0'):
                best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                tf.logging.info('best_outputs%s', best_outputs[0].get_shape())
                best_columns = [
                    tf.reshape(x, [hps.batch_size, 1]) for x in best_outputs
                ]
                self._outputs = tf.concat(axis=1, values=best_columns)

                last_log_probs = tf.log(tf.nn.softmax(model_outputs[-1]))
                self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                    last_log_probs, hps.batch_size * 2)

        with tf.variable_scope('loss'), tf.device(self._next_device()):

            def sampled_loss_func(inputs, labels):
                """Sampled-softmax loss for one decoder step."""
                with tf.device('/cpu:0'):  # Try gpu.
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        weights=proj_w_t,
                        biases=proj_b,
                        labels=labels,
                        inputs=inputs,
                        num_sampled=hps.num_softmax_samples,
                        num_classes=vsize)

            use_sampled = hps.num_softmax_samples != 0 and hps.mode == 'train'
            if use_sampled:
                self._loss = seq2seq_lib.sampled_sequence_loss(
                    decoder_outputs, targets, loss_weights, sampled_loss_func)
            else:
                self._loss = tf.contrib.legacy_seq2seq.sequence_loss(
                    model_outputs, targets, loss_weights)
            # The reported loss is capped at 12.0.
            tf.summary.scalar('loss', tf.minimum(12.0, self._loss))
def __init__(self, stochastic=False, use_slope=True, variational_dropout=False,
             vocabulary_size=283, label_size=50, rnnSize=256, n_layers=3,
             dropout=0.5, zoneout=0.1, embedding_size=None, dtype=tf.float32,
             clip=0.35, k_width=3, name='hlstm', conv_filter=3, mid_filter=25,
             batch_size=128):
    """Build the whole training graph: conv front-end, SRU stack, CTC loss.

    All graph construction happens here. After construction the instance
    exposes the placeholders (``x``, ``label``, ``seq_len``, ``is_train``,
    ``lr``), the outputs (``logit``, ``logsoftmax``, ``sentence``), the
    metrics (``loss``, ``cer``), the train op (``optimizer``) and a
    ``saver``.

    Args:
        stochastic: Stored on the instance; not otherwise used here.
        use_slope: Stored on the instance; not otherwise used here.
        variational_dropout: Accepted but not used by this constructor.
        vocabulary_size: Stored as ``self.inputSize``; not otherwise used.
        label_size: Number of output classes (width of the dense layer),
            stored as ``self.outputSize``.
        rnnSize: Unit count passed to every ``SRU_layer`` and channel count
            of the depthwise filters for layers >= 1.
        n_layers: Total number of SRU layers.
        dropout: Value handed to ``tf.nn.dropout`` while ``is_train`` is
            true (1.0 is used otherwise).
            NOTE(review): it is passed as the second positional argument,
            which is presumably ``keep_prob`` in this TF version - confirm.
        zoneout: Passed to ``SRU_layer``.
        embedding_size: Accepted but not used by this constructor.
        dtype: Dtype the dropout value is cast to.
        clip: Global-norm threshold for gradient clipping.
        k_width: Passed to ``SRU_layer`` as ``fwidth``.
        name: Stored on the instance; not otherwise used here.
        conv_filter: Side length of the square kernel of both conv layers.
        mid_filter: Extent along the time axis of the depthwise filter
            applied before each SRU layer.
        batch_size: Static batch size. It is used as the leading dimension
            of a reshape and passed to ``SRU_layer``, so fed batches have
            to contain exactly this many examples.
    """
    self.rnnSize = rnnSize
    self.inputSize = vocabulary_size
    self.outputSize = label_size
    self.stochastic = stochastic
    self.dtype = dtype
    self.dropout = dropout
    self.n_layers = n_layers
    self.clip = clip
    self.name = name
    self.use_slope = use_slope
    self.zoneout = zoneout
    self.batch_size = batch_size
    self.k_width = k_width
    self.conv_filter = conv_filter
    self.mid_filter = mid_filter

    # Placeholders fed at run time.
    # x is 4-D: (batch, time, 40, 3).
    self.x = tf.placeholder(tf.float32, [None, None, 40, 3], name='x')
    # Sparse label sequences consumed by the CTC loss and edit distance.
    self.label = tf.sparse_placeholder(tf.int32, name='label')
    # One length per batch element.
    self.seq_len = tf.placeholder(tf.int32, [None], name='seq_len')
    self.is_train = tf.placeholder(tf.bool, [], name='train')
    self.lr = tf.placeholder(tf.float32, [], name='lr')

    # Dropout value switches with the is_train flag: self.dropout when
    # training, 1.0 otherwise.
    dropout_p = tf.where(self.is_train, self.dropout, 1.0)
    dropout_p = tf.cast(dropout_p, dtype=self.dtype)

    # Kept as a public attribute; nothing in this constructor fills it.
    self.lstm_cells = []

    # Convolutional front-end: two conv + batch-norm + tanh stages.
    # conv0 strides both the time axis and the feature axis by 2, conv1
    # strides only the feature axis by 2.
    conv_filter_size = (self.conv_filter, self.conv_filter)
    h = tf.layers.conv2d(self.x, 32, conv_filter_size, (2, 2), 'same',
                         use_bias=False, name='conv0')
    h = tf.contrib.layers.batch_norm(h, center=True, scale=True,
                                     is_training=self.is_train, decay=0.9,
                                     epsilon=1e-3, scope='bn0')
    h = tf.nn.tanh(h, name='tanh0')
    h = tf.layers.conv2d(h, 32, conv_filter_size, (1, 2), 'same',
                         use_bias=False, name='conv1')
    h = tf.contrib.layers.batch_norm(h, center=True, scale=True,
                                     is_training=self.is_train, decay=0.9,
                                     epsilon=1e-3, scope='bn1')
    h = tf.nn.tanh(h, name='tanh1')

    # Flatten axes 2 and 3 into one feature axis:
    # ([0]: batch_size, [1]: seq_len, [2]*[3]: feature dimension).
    # With 'same' padding the 40-wide axis shrinks 40 -> 20 -> 10, and
    # 10 positions * 32 filters = 320 features per frame.
    conv_feat_dim = 320
    h_shape = tf.shape(h)
    h = tf.reshape(h, [batch_size, h_shape[1], 1, conv_feat_dim])

    # Layer 0: depthwise convolution along time, then the first SRU layer.
    depthwise_filter = tf.get_variable(
        'QRNN_conv0_filter', shape=[mid_filter, 1, conv_feat_dim, 1],
        trainable=True)
    h = tf.nn.depthwise_conv2d(h, depthwise_filter, [1, 1, 1, 1],
                               padding='SAME', name='QRNN_conv0')
    h = tf.squeeze(h, axis=[-2])
    sru_layer = SRU_layer(self.rnnSize, batch_size=self.batch_size,
                          fwidth=self.k_width, pool_type='ifo',
                          zoneout=self.zoneout, name='QRNN_layer0',
                          infer=tf.logical_not(self.is_train),
                          skip=True, skip_embedding=True)
    sru_h, last_state = sru_layer(h)
    # noise_shape is 1 on axis 1, so one dropout mask is shared along it.
    sru_h = tf.nn.dropout(
        sru_h, dropout_p,
        noise_shape=[tf.shape(sru_h)[0], 1, tf.shape(sru_h)[2]])

    # Remaining layers repeat the depthwise-conv + SRU + dropout pattern.
    for i in range(1, self.n_layers):
        sru_h = tf.expand_dims(sru_h, -2)
        depthwise_filter = tf.get_variable(
            'QRNN_conv{}_filter'.format(i),
            shape=[mid_filter, 1, self.rnnSize, 1], trainable=True)
        sru_h = tf.nn.depthwise_conv2d(sru_h, depthwise_filter,
                                       [1, 1, 1, 1], padding='SAME',
                                       name='QRNN_conv{}'.format(i))
        sru_h = tf.squeeze(sru_h, axis=[-2])
        sru_layer = SRU_layer(self.rnnSize, batch_size=self.batch_size,
                              fwidth=self.k_width, pool_type='ifo',
                              zoneout=self.zoneout,
                              name='QRNN_layer{}'.format(i),
                              infer=tf.logical_not(self.is_train),
                              skip=True, skip_embedding=False)
        sru_h, last_state = sru_layer(sru_h)
        sru_h = tf.nn.dropout(
            sru_h, dropout_p,
            noise_shape=[tf.shape(sru_h)[0], 1, tf.shape(sru_h)[2]])

    # Per-frame classifier: flatten to 2-D for the dense layer, then
    # restore the (batch, time, classes) layout.
    h_shape = tf.shape(sru_h)
    output_h = tf.reshape(sru_h, [-1, self.rnnSize])
    with tf.variable_scope('dense'):
        dense = tf.layers.dense(
            output_h, self.outputSize,
            kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
    self.logit = tf.reshape(dense,
                            [h_shape[0], h_shape[1], self.outputSize])
    self.logsoftmax = tf.nn.log_softmax(self.logit)

    # NOTE(review): conv0 strides the time axis by 2, so the logits have
    # fewer frames than self.x. seq_len presumably has to be given in
    # post-convolution frames - confirm against the feeding code.
    self.loss = tf.nn.ctc_loss(inputs=self.logit, labels=self.label,
                               sequence_length=self.seq_len,
                               time_major=False)
    self.loss = tf.reduce_mean(self.loss)

    # Adam with global-norm gradient clipping. The train op is built under
    # a control dependency on the UPDATE_OPS collection (presumably the
    # batch-norm moving-average updates) so those ops run with each step.
    opt = tf.train.AdamOptimizer(self.lr)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        grad, var = zip(*opt.compute_gradients(self.loss))
        clipped_gradients, _ = tf.clip_by_global_norm(grad, clip)
        self.optimizer = opt.apply_gradients(zip(clipped_gradients, var))

    # Greedy CTC decoding (the decoder takes time-major input) and the
    # mean edit distance between the decoded output and the labels.
    self.sentence, _ = tf.nn.ctc_greedy_decoder(
        tf.transpose(self.logit, (1, 0, 2)), self.seq_len)
    self.cer = tf.reduce_mean(
        tf.edit_distance(tf.cast(self.sentence[0], tf.int32), self.label))

    self.saver = tf.train.Saver()
def _latest_checkpoint_epoch(model_dir):
    """Return the highest epoch among ``<model_dir>/model/model.ckpt-<n>.meta``.

    The epoch is taken from the text after the LAST '-' of each path, so a
    dash elsewhere in the directory name cannot corrupt the parse.

    Raises:
        ValueError: if no matching checkpoint meta file exists.
    """
    from glob import glob

    meta_paths = glob(model_dir + '/model/model.ckpt-*.meta')
    if not meta_paths:
        raise ValueError(
            "No checkpoint matching model.ckpt-*.meta found in %s/model"
            % model_dir)
    suffix_len = len('.meta')
    return max(int(path.rsplit('-', 1)[1][:-suffix_len])
               for path in meta_paths)


def _build_ptb_models(config, eval_config, initializer=None):
    """Create the train / valid / test PTBModel instances sharing variables.

    Returns:
        (m, mvalid, mtest, merged_summaries_for_test)
    """
    with tf.variable_scope("model", reuse=None, initializer=initializer):
        m = PTBModel(is_training=True, config=config)
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        mtest = PTBModel(is_training=False, config=eval_config)
        # Merged here, before mvalid is built, so the op covers exactly the
        # summaries attached so far.
        merged_summaries_for_test = tf.merge_all_summaries()
        mvalid = PTBModel(is_training=False, config=config)
    return m, mvalid, mtest, merged_summaries_for_test


def main(_):
    """Train a PTB language model, optionally resuming from a checkpoint.

    Reads ``FLAGS.data_path`` (required), ``FLAGS.importmodeldir`` (resume
    from the newest checkpoint found there), ``FLAGS.exportmodeldir``
    (write a checkpoint after every epoch and once more at the end) and
    ``FLAGS.logdir`` (summary output). Prints the train and validation
    perplexity per epoch and the test perplexity after training.

    Raises:
        ValueError: if ``--data_path`` is not set, or if
            ``--importmodeldir`` contains no checkpoint.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")

    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data

    config = get_config()
    # Evaluation on the test set runs one token at a time.
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        resume = FLAGS.importmodeldir is not None
        if resume:
            # Variables are restored below, so no initializer is needed.
            last_epoch = _latest_checkpoint_epoch(FLAGS.importmodeldir)
            initializer = None
        else:
            initializer = tf.random_uniform_initializer(-config.init_scale,
                                                        config.init_scale)

        m, mvalid, mtest, merged_summaries_for_test = _build_ptb_models(
            config, eval_config, initializer)

        # One Saver for the whole run: building a new one per epoch keeps
        # adding save/restore ops to the graph. max_to_keep=0 disables
        # pruning so every epoch's checkpoint stays on disk, as before.
        saver = tf.train.Saver(max_to_keep=0)

        if resume:
            saver.restore(
                session,
                FLAGS.importmodeldir + '/model/model.ckpt-{}'.format(last_epoch))
            initial_epoch = last_epoch + 1
        else:
            tf.initialize_all_variables().run()
            initial_epoch = 0

        init_logs()
        init_model_persistance()

        train_writer = tf.train.SummaryWriter(FLAGS.logdir + "/train", session.graph)
        valid_writer = tf.train.SummaryWriter(FLAGS.logdir + "/valid", session.graph)
        test_writer = tf.train.SummaryWriter(FLAGS.logdir + "/test", session.graph)

        # Evaluation passes do not train; reuse a single no-op instead of
        # adding a fresh one to the graph every epoch.
        eval_op = tf.no_op()

        for i in range(initial_epoch, config.max_max_epoch):
            # Learning rate is constant for the first max_epoch epochs and
            # decays geometrically afterwards.
            lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            train_perplexity = run_epoch(session, m, train_data, m.train_op,
                                         verbose=True, summary_op=None,
                                         summary_writer=train_writer)
            print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
            valid_perplexity = run_epoch(session, mvalid, valid_data, eval_op,
                                         summary_op=None, summary_writer=None)
            print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

            if FLAGS.exportmodeldir is not None:
                saver.save(session,
                           FLAGS.exportmodeldir + "/model/model.ckpt",
                           global_step=i)

        test_perplexity = run_epoch(session, mtest, test_data, eval_op,
                                    summary_op=merged_summaries_for_test,
                                    summary_writer=test_writer)
        if FLAGS.exportmodeldir is not None:
            saver.save(session,
                       FLAGS.exportmodeldir + "/model/model.ckpt",
                       global_step=config.max_max_epoch)
        print("Test Perplexity: %.3f" % test_perplexity)
def add_word_embedding_layer(self):
    """Replace ``self._cursor`` with the embedding lookup of its current value.

    Creates (or fetches) the ``'encoder'`` variable of shape
    ``[vocab_size, embedding_dims]``, initialised uniformly in [-1, 1],
    and uses the current ``self._cursor`` as the ids for the lookup.
    """
    table_shape = [self.vocab_size, self.embedding_dims]
    uniform_init = tf.random_uniform_initializer(-1.0, 1.0)
    embedding_table = tf.get_variable(
        'encoder', table_shape, tf.float32, uniform_init)

    looked_up = tf.nn.embedding_lookup(embedding_table, self._cursor)
    self._cursor = looked_up
def _build_word_char_embeddings(self):
    '''
    Build token embeddings from characters: char embedding -> CNN with
    max pooling -> optional highway layers -> optional projection.

    Sets self.tokens_characters, self.embedding_weights,
    self.char_embedding, self.token_embedding_layers and self.embedding
    (plus the *_reverse counterparts when self.bidirectional is true).

    options contains key 'char_cnn': {

        'n_characters': 262,

        # includes the start / end characters
        'max_characters_per_token': 50,

        'filters': [
            [1, 32],
            [2, 32],
            [3, 64],
            [4, 128],
            [5, 256],
            [6, 512],
            [7, 512]
        ],
        'activation': 'tanh',

        # for the character embedding
        'embedding': {'dim': 16}

        # for highway layers
        # if omitted, then no highway layers
        'n_highway': 2,
    }

    Raises ValueError if 'activation' is neither 'tanh' nor 'relu'.
    '''
    batch_size = self.options['batch_size']
    unroll_steps = self.options['unroll_steps']
    projection_dim = self.options['lstm']['projection_dim']

    cnn_options = self.options['char_cnn']
    filters = cnn_options['filters']
    n_filters = sum(f[1] for f in filters)
    max_chars = cnn_options['max_characters_per_token']
    char_embed_dim = cnn_options['embedding']['dim']
    n_chars = cnn_options['n_characters']

    # Validate up front: previously an unknown activation left
    # `activation` and `w_init` unbound and failed much later with a
    # NameError deep inside graph construction.
    activation_name = cnn_options['activation']
    if activation_name == 'tanh':
        activation = tf.nn.tanh
    elif activation_name == 'relu':
        activation = tf.nn.relu
    else:
        raise ValueError(
            "Unsupported char_cnn activation %r; expected 'tanh' or 'relu'"
            % (activation_name,))

    # the input character ids
    self.tokens_characters = tf.placeholder(
        DTYPE_INT, shape=(batch_size, unroll_steps, max_chars),
        name='tokens_characters')

    # the character embeddings
    with tf.device("/cpu:0"):
        self.embedding_weights = tf.get_variable(
            "char_embed", [n_chars, char_embed_dim], dtype=DTYPE,
            initializer=tf.random_uniform_initializer(-1.0, 1.0))
        # shape (batch_size, unroll_steps, max_chars, embed_dim)
        self.char_embedding = tf.nn.embedding_lookup(
            self.embedding_weights, self.tokens_characters)

        if self.bidirectional:
            self.tokens_characters_reverse = tf.placeholder(
                DTYPE_INT, shape=(batch_size, unroll_steps, max_chars),
                name='tokens_characters_reverse')
            self.char_embedding_reverse = tf.nn.embedding_lookup(
                self.embedding_weights, self.tokens_characters_reverse)

    # the convolutions
    def make_convolutions(inp, reuse):
        # One conv + max-pool-over-characters + activation per filter
        # width; the per-width outputs are concatenated on the last axis.
        with tf.variable_scope('CNN', reuse=reuse):
            convolutions = []
            for i, (width, num) in enumerate(filters):
                if activation_name == 'relu':
                    # Kim et al 2015, +/- 0.05
                    w_init = tf.random_uniform_initializer(
                        minval=-0.05, maxval=0.05)
                elif activation_name == 'tanh':
                    # glorot init
                    w_init = tf.random_normal_initializer(
                        mean=0.0,
                        stddev=np.sqrt(1.0 / (width * char_embed_dim)))
                w = tf.get_variable(
                    "W_cnn_%s" % i, [1, width, char_embed_dim, num],
                    initializer=w_init, dtype=DTYPE)
                b = tf.get_variable(
                    "b_cnn_%s" % i, [num], dtype=DTYPE,
                    initializer=tf.constant_initializer(0.0))

                conv = tf.nn.conv2d(
                    inp, w, strides=[1, 1, 1, 1], padding="VALID") + b
                # now max pool over every character position left by the
                # VALID convolution (max_chars - width + 1 of them)
                conv = tf.nn.max_pool(
                    conv, [1, 1, max_chars - width + 1, 1],
                    [1, 1, 1, 1], 'VALID')

                # activation
                conv = activation(conv)
                conv = tf.squeeze(conv, squeeze_dims=[2])

                convolutions.append(conv)

        return tf.concat(convolutions, 2)

    # for first model, this is False, for others it's True
    reuse = tf.get_variable_scope().reuse
    embedding = make_convolutions(self.char_embedding, reuse)

    self.token_embedding_layers = [embedding]

    if self.bidirectional:
        # re-use the CNN weights from forward pass
        embedding_reverse = make_convolutions(
            self.char_embedding_reverse, True)

    # for highway and projection layers:
    #   flatten from (batch_size, n_tokens, dim) to (-1, dim)
    n_highway = cnn_options.get('n_highway')
    use_highway = n_highway is not None and n_highway > 0
    use_proj = n_filters != projection_dim

    if use_highway or use_proj:
        embedding = tf.reshape(embedding, [-1, n_filters])
        if self.bidirectional:
            embedding_reverse = tf.reshape(embedding_reverse,
                                           [-1, n_filters])

    # set up weights for projection
    if use_proj:
        with tf.variable_scope('CNN_proj'):
            W_proj_cnn = tf.get_variable(
                "W_proj", [n_filters, projection_dim],
                initializer=tf.random_normal_initializer(
                    mean=0.0, stddev=np.sqrt(1.0 / n_filters)),
                dtype=DTYPE)
            b_proj_cnn = tf.get_variable(
                "b_proj", [projection_dim],
                initializer=tf.constant_initializer(0.0),
                dtype=DTYPE)

    # apply highway layers
    def high(x, ww_carry, bb_carry, ww_tr, bb_tr):
        carry_gate = tf.nn.sigmoid(tf.matmul(x, ww_carry) + bb_carry)
        transform_gate = tf.nn.relu(tf.matmul(x, ww_tr) + bb_tr)
        return carry_gate * transform_gate + (1.0 - carry_gate) * x

    if use_highway:
        highway_dim = n_filters

        for i in range(n_highway):
            with tf.variable_scope('CNN_high_%s' % i):
                W_carry = tf.get_variable(
                    'W_carry', [highway_dim, highway_dim],
                    # glorot init
                    initializer=tf.random_normal_initializer(
                        mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
                    dtype=DTYPE)
                b_carry = tf.get_variable(
                    'b_carry', [highway_dim],
                    initializer=tf.constant_initializer(-2.0),
                    dtype=DTYPE)
                W_transform = tf.get_variable(
                    'W_transform', [highway_dim, highway_dim],
                    initializer=tf.random_normal_initializer(
                        mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
                    dtype=DTYPE)
                b_transform = tf.get_variable(
                    'b_transform', [highway_dim],
                    initializer=tf.constant_initializer(0.0),
                    dtype=DTYPE)

            # The forward and reverse directions share each layer's weights.
            embedding = high(embedding, W_carry, b_carry,
                             W_transform, b_transform)
            if self.bidirectional:
                embedding_reverse = high(embedding_reverse,
                                         W_carry, b_carry,
                                         W_transform, b_transform)
            self.token_embedding_layers.append(
                tf.reshape(embedding,
                           [batch_size, unroll_steps, highway_dim]))

    # finally project down to projection dim if needed
    if use_proj:
        embedding = tf.matmul(embedding, W_proj_cnn) + b_proj_cnn
        if self.bidirectional:
            embedding_reverse = tf.matmul(embedding_reverse, W_proj_cnn) \
                + b_proj_cnn
        self.token_embedding_layers.append(
            tf.reshape(embedding,
                       [batch_size, unroll_steps, projection_dim]))

    # reshape back to (batch_size, tokens, dim)
    if use_highway or use_proj:
        shp = [batch_size, unroll_steps, projection_dim]
        embedding = tf.reshape(embedding, shp)
        if self.bidirectional:
            embedding_reverse = tf.reshape(embedding_reverse, shp)

    # at last assign attributes for remainder of the model
    self.embedding = embedding
    if self.bidirectional:
        self.embedding_reverse = embedding_reverse
# 3x3 tic-tac-toe board stored as a flat vector of 9 squares.
# Each square takes one of three values: -1 for X, 0 for empty, 1 for O.
environment = np.zeros(9, dtype=np.int8)

# Reward scheme:
#   game not finished yet -> 0
#   game finished         -> +10
#   wrong move            -> -50 (heavy penalty)

# Inputs of the "predict" and "target" networks: batches of 9-value states.
state_input = tf.placeholder(tf.float32, [None, 9], name="state_input")
target_state_input = tf.placeholder(
    tf.float32, [None, 9], name="target_state_input")

# First layer (9 -> 80 units, leaky ReLU) of the predict network.
# NOTE(review): weight_stddev is assigned before each layer, but none of the
# initializers in this section read it; random_uniform_initializer() is
# called with its default range. Confirm whether a stddev-scaled initializer
# was intended.
weight_stddev = (2.0 / 9) ** 0.5
predict_w1 = tf.get_variable(
    "predict_w1", (9, 80), initializer=tf.random_uniform_initializer())
predict_b1 = tf.Variable(tf.zeros(80), name="predict_b1")
predict_layer_1_output = tf.nn.leaky_relu(
    tf.matmul(state_input, predict_w1) + predict_b1)

# Same first layer for the target network, with its own variables.
weight_stddev = (2.0 / 9) ** 0.5
target_w1 = tf.get_variable(
    "target_w1", (9, 80), initializer=tf.random_uniform_initializer())
target_b1 = tf.Variable(tf.zeros(80), name="target_b1")
target_layer_1_output = tf.nn.leaky_relu(
    tf.matmul(target_state_input, target_w1) + target_b1)

# Second layer (80 -> 50 units); the game has 9 available actions.
weight_stddev = (2.0 / 80) ** 0.5
predict_w2 = tf.get_variable(
    "predict_w2", (80, 50), initializer=tf.random_uniform_initializer())