def _create_embedders(self):
    """Create input placeholders and embedding lookups, then build the encoder network.

    The encoder builder is selected by ``self.network_mode``. An unsupported
    mode prints an error message and terminates the process with ``exit(-1)``.
    """
    # Placeholders for the token-id sequences, the labels and the sequence lengths.
    seq_shape = [None, self.MAX_SEQ_LENGTH]
    self._src_input_data = tf.placeholder(tf.int32, seq_shape, name='source_sequence')
    self._tgt_input_data = tf.placeholder(tf.int32, seq_shape, name='target_sequence')
    self._labels = tf.placeholder(tf.int64, [None], name='targetSpace_labels')
    self._src_lens = tf.placeholder(tf.int32, [None], name='source_seq_lenths')
    self._tgt_lens = tf.placeholder(tf.int32, [None], name='target_seq_lenths')

    # One trainable embedding table per side, uniformly initialised in [-0.25, 0.25].
    self.src_word_embedding = tf.get_variable(
        'src_word_embedding',
        [self.src_vocab_size, self.word_embed_size],
        initializer=tf.random_uniform_initializer(-0.25, 0.25))
    self.tgt_word_embedding = tf.get_variable(
        'tgt_word_embedding',
        [self.tgt_vocab_size, self.word_embed_size],
        initializer=tf.random_uniform_initializer(-0.25, 0.25))

    # Map token ids to their embedding vectors.
    self.src_input_distributed = tf.nn.embedding_lookup(
        self.src_word_embedding, self._src_input_data, name='dist_source')
    self.tgt_input_distributed = tf.nn.embedding_lookup(
        self.tgt_word_embedding, self._tgt_input_data, name='dist_target')

    # Supported modes paired with the name of the method that builds them.
    network_builders = (
        ('source-encoder-only', '_source_encoder_only_network'),
        ('dual-encoder', '_dual_encoder_network'),
        ('shared-encoder', '_shared_encoder_network'),
    )
    for mode_name, builder_attr in network_builders:
        if self.network_mode == mode_name:
            getattr(self, builder_attr)()
            return

    print('Error!! Unsupported network mode: %s. Please specify on: source-encoder-only, dual-encoder or shared-encoder.' % self.network_mode )
    exit(-1)
def xavier_init(input_size, output_size, uniform=True):
    """Return a Xavier/Glorot-style initializer for a layer.

    Args:
      input_size: number of input units (fan-in).
      output_size: number of output units (fan-out).
      uniform: if true, return a uniform initializer over
        ``[-sqrt(6 / (fan_in + fan_out)), +sqrt(6 / (fan_in + fan_out))]``;
        otherwise return a truncated-normal initializer with
        ``stddev = sqrt(3 / (fan_in + fan_out))``.

    Returns:
      A TensorFlow initializer object.
    """
    # The previous code passed a non-existent `stdevv` keyword to
    # tf.random_uniform_initializer, so every call raised TypeError.
    if uniform:
        init_range = tf.sqrt(6.0 / (input_size + output_size))
        return tf.random_uniform_initializer(-init_range, init_range)
    stddev = tf.sqrt(3.0 / (input_size + output_size))
    return tf.truncated_normal_initializer(stddev=stddev)
Exemple #3
0
def weight(name, shape, init='he', range=None):
    """Create a weight variable and register its L2 loss.

    :param name: Variable name
    :param shape: Tensor shape
    :param init: Init mode: 'xavier', 'normal', 'uniform' or 'he' (default).
        Any other value falls back to ``tf.constant_initializer()``.
    :param range: Half-width of the interval for 'uniform' init; required in
        that mode and ignored by the others.
    :return: Variable
    :raises ValueError: if ``init == 'uniform'`` and ``range`` is None.
    """
    if init == 'xavier':
        fan_in, fan_out = _get_dims(shape)
        limit = math.sqrt(6.0 / (fan_in + fan_out))
        chosen_init = tf.random_uniform_initializer(-limit, limit)
    elif init == 'he':
        fan_in, _ = _get_dims(shape)
        chosen_init = tf.random_normal_initializer(stddev=math.sqrt(2.0 / fan_in))
    elif init == 'normal':
        chosen_init = tf.random_normal_initializer(stddev=0.1)
    elif init == 'uniform':
        if range is None:
            raise ValueError("range must not be None if uniform init is used.")
        chosen_init = tf.random_uniform_initializer(-range, range)
    else:
        # Unknown modes silently use the constant initializer.
        chosen_init = tf.constant_initializer()

    variable = tf.get_variable(name, shape, initializer=chosen_init)
    # Every weight contributes its L2 loss to the 'l2' collection.
    tf.add_to_collection('l2', tf.nn.l2_loss(variable))
    return variable
Exemple #4
0
  def __init__(self,
               state_size,
               num_obs,
               steps_per_obs,
               sigma_min=1e-5,
               dtype=tf.float32,
               random_seed=None):
    """Store the configuration and create per-timestep mean layers and sigma variables.

    One ``snt.Linear`` module and one ``q_sigma_<t>`` variable are created for
    each of the ``num_obs * steps_per_obs + 1`` timesteps.
    """
    self.state_size = state_size
    self.num_obs = num_obs
    self.steps_per_obs = steps_per_obs
    self.sigma_min = sigma_min
    self.dtype = dtype
    self.num_timesteps = num_obs * steps_per_obs + 1

    # The same initializer dict is shared by every Linear module.
    linear_initializers = {
        "w": tf.random_uniform_initializer(seed=random_seed),
        "b": tf.zeros_initializer,
    }
    self.mus = [
        snt.Linear(output_size=state_size, initializers=linear_initializers)
        for _ in xrange(self.num_timesteps)
    ]

    # Sigma variables are named q_sigma_1 .. q_sigma_<num_timesteps>.
    self.sigmas = [
        tf.get_variable(
            name="q_sigma_%d" % (step + 1),
            shape=[state_size],
            dtype=self.dtype,
            initializer=tf.random_uniform_initializer(seed=random_seed))
        for step in xrange(self.num_timesteps)
    ]
    def __call__(self, inputs, state, scope=None):
        """Run the stacked cells in ``self._cells`` once, gating each cell's state.

        For every cell, its slice of ``state`` is taken, passed through a
        "global reset" computation that also looks at the full previous state,
        and then fed to the cell together with the previous cell's output.

        :param inputs: input to the first cell
        :param state: concatenation of all cells' states along axis 1
        :param scope: optional variable scope; defaults to the class name
        :return: (output of the last cell, concatenation of all new states along axis 1)
        """
        with tf.variable_scope(scope or type(self).__name__):
            # Conveniently the concatenation of all hidden states at t-1
            h_star_t_prev = state
            u_g = tf.get_variable("u_g", [self.state_size],
                                  initializer=tf.random_uniform_initializer(-0.1, 0.1))
            cur_state_pos = 0
            cur_inp = inputs
            new_states = []
            for i, cell in enumerate(self._cells):
                with tf.variable_scope("Cell%d" % i):
                    # Columns [cur_state_pos, cur_state_pos + cell.state_size) belong to this cell.
                    cur_state = array_ops.slice(
                            state, [0, cur_state_pos], [-1, cell.state_size])
                    # NOTE(review): scope names containing a space may be rejected by
                    # TensorFlow's scope-name validation - confirm against the TF version in use.
                    with tf.variable_scope("Global Reset"):
                        w_g = tf.get_variable("w_g", cell.state_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1))
                        # NOTE(review): the two summands have widths cell.state_size and
                        # self.state_size; presumably these only agree for a single cell - verify.
                        g = tf.sigmoid(tf.mul(w_g, cur_state) + tf.mul(u_g, h_star_t_prev))
                        U = tf.get_variable("U", [cell.state_size],
                                            initializer=tf.random_uniform_initializer(-0.1, 0.1))
                        # NOTE(review): U is created 1-D but used in tf.matmul, and reduce_sum
                        # without an axis collapses the result to a scalar before it is handed
                        # to the cell as its state - TODO confirm this is intended.
                        cur_state = tf.reduce_sum(g * tf.matmul(cur_state, U))

                    cur_state_pos += cell.state_size
                    # The output of this cell becomes the input of the next one.
                    cur_inp, new_state = cell(cur_inp, cur_state)
                    new_states.append(new_state)

        return cur_inp, array_ops.concat(1, new_states)
Exemple #6
0
def build_lstm_forward(H, x, googlenet, phase, reuse):
    """Build the forward pass: GoogLeNet features -> LSTM -> box and confidence heads.

    Args:
      H: hyper-parameter dict; the ``'arch'`` entry supplies grid and batch
        sizes, ``rnn_len``, ``lstm_size`` and the two dropout flags.
      x: input passed to ``googlenet_load.model`` after subtracting the mean 117.
      googlenet: forwarded unchanged to ``googlenet_load.model``.
      phase: dropout is applied only when this equals ``'train'``.
      reuse: ``reuse`` flag for the ``'decoder'`` variable scope.

    Returns:
      ``(pred_boxes, pred_logits, pred_confidences)``, reshaped to
      ``[outer_size, rnn_len, 4]``, ``[outer_size, rnn_len, 2]`` and
      ``[outer_size, rnn_len, 2]`` respectively.
    """
    arch = H['arch']
    grid_size = arch['grid_width'] * arch['grid_height']
    outer_size = grid_size * arch['batch_size']
    is_training = phase == 'train'

    input_mean = 117.
    x -= input_mean
    Z = googlenet_load.model(x, googlenet, H)

    with tf.variable_scope('decoder', reuse=reuse):
        scale_down = 0.01
        if arch['early_dropout'] and is_training:
            Z = tf.nn.dropout(Z, 0.5)
        lstm_input = tf.reshape(Z * scale_down, (arch['batch_size'] * grid_size, 1024))
        lstm_outputs = build_lstm_inner(lstm_input, H)

        box_steps = []
        logit_steps = []
        for step in range(arch['rnn_len']):
            step_output = lstm_outputs[step]
            if arch['late_dropout'] and is_training:
                step_output = tf.nn.dropout(step_output, 0.5)

            # Each step has its own linear heads for box regression and confidence.
            box_weights = tf.get_variable(
                'box_ip%d' % step,
                shape=(arch['lstm_size'], 4),
                initializer=tf.random_uniform_initializer(-0.1, 0.1))
            conf_weights = tf.get_variable(
                'conf_ip%d' % step,
                shape=(arch['lstm_size'], 2),
                initializer=tf.random_uniform_initializer(-0.1, 0.1))

            scaled_boxes = tf.matmul(step_output, box_weights) * 50
            box_steps.append(tf.reshape(scaled_boxes, [outer_size, 1, 4]))
            step_logits = tf.matmul(step_output, conf_weights)
            logit_steps.append(tf.reshape(step_logits, [outer_size, 1, 2]))

        pred_boxes = tf.concat(1, box_steps)
        pred_logits = tf.concat(1, logit_steps)

        # Softmax is applied on a flattened [N, 2] view and then reshaped back.
        flat_logits = tf.reshape(pred_logits, [outer_size * arch['rnn_len'], 2])
        flat_confidences = tf.nn.softmax(flat_logits)
        pred_confidences = tf.reshape(flat_confidences,
                                      [outer_size, arch['rnn_len'], 2])
    return pred_boxes, pred_logits, pred_confidences
def count_sketch(probs, project_size):
    """ Projects a tensor with the `_sketch_op.count_sketch` op.

    Args:
      probs: A `Tensor`; its second dimension must be statically known.
      project_size: output size (`int`)

    Returns:
      A projected sketch `Tensor` with static shape [batch_size, project_size].
    """
    scope_name = 'CountSketch_' + probs.name.replace(':', '_')
    with tf.variable_scope(scope_name) as scope:
        input_size = int(probs.get_shape()[1])

        # h and s must be sampled only once per scope: if this scope was
        # already recorded in the collection, reuse its variables.
        seen_scopes = tf.get_collection('__countsketch')
        if scope.name in seen_scopes:
            scope.reuse_variables()
        tf.add_to_collection('__countsketch', scope.name)

        h_var = tf.get_variable(
            'h', [input_size],
            initializer=tf.random_uniform_initializer(0, project_size),
            trainable=False)
        s_var = tf.get_variable(
            's', [input_size],
            initializer=tf.random_uniform_initializer(0, 2),
            trainable=False)

        hash_index = tf.cast(h_var, 'int32')
        # floor(s) * 2 - 1 maps the sampled values to 1 or -1.
        sign = tf.cast(tf.floor(s_var) * 2 - 1, 'int32')

        sketch = _sketch_op.count_sketch(probs, hash_index, sign, project_size)
        sketch.set_shape([probs.get_shape()[0], project_size])
        return sketch
    def build(self):
        """Create the output weights, the forward LSTM cell and the bias.

        Sets ``self.W`` (hidden_dim x nb_classes), ``self.T``
        (feat_size x nb_classes), ``self.lstm_fw`` and ``self.b``.
        """
        with tf.name_scope('weigths'):
            # Both projection matrices are uniformly initialised in [-0.2, 0.2].
            self.W = tf.get_variable(
                name='lstm_weights',
                shape=[self.hidden_dim, self.nb_classes],
                initializer=tf.random_uniform_initializer(-0.2, 0.2),
            )
            self.T = tf.get_variable(
                name='feat_weights',
                shape=[self.feat_size, self.nb_classes],
                initializer=tf.random_uniform_initializer(-0.2, 0.2),
            )
            self.lstm_fw = tf.contrib.rnn.LSTMCell(self.hidden_dim)

        with tf.name_scope('biases'):
            # Zero-initialised bias; the name "bias" is given to the zeros op,
            # not to the Variable itself.
            initial_bias = tf.zeros([self.nb_classes], name="bias")
            self.b = tf.Variable(initial_bias)
    def compute_feedback(self, inputs, full_state, layer_sizes, scope=None):
        """Accumulate the gated feedback term over all layers.

        For each entry of ``layer_sizes`` a reset gate is computed from
        ``inputs`` and ``full_state``, the corresponding slice of
        ``full_state`` is projected, and the gated result is added to a
        running sum.

        :param inputs: input tensor for the current step
        :param full_state: concatenation of all layers' previous states along axis 1
        :param layer_sizes: iterable with the state width of each layer
        :param scope: unused in this method
        :return: the accumulated summation tensor
        """
        # NOTE(review): scope names containing a space may be rejected by
        # TensorFlow's scope-name validation - confirm against the TF version in use.
        with tf.variable_scope("Global Reset"):
            cur_state_pos = 0
            full_state_size = sum(layer_sizes)
            summation_term = tf.get_variable("summation", self.state_size, initializer=tf.constant_initializer())
            for i, layer_size in enumerate(layer_sizes):
                with tf.variable_scope("Cell%d" % i):
                    # Compute global reset gate
                    # NOTE(review): w_g and u_g are created from scalar sizes (1-D) but are
                    # used as matmul operands - TODO confirm the intended shapes.
                    w_g = tf.get_variable("w_g", self.input_size, initializer=tf.random_uniform_initializer(-0.1, 0.1))
                    u_g = tf.get_variable("u_g", full_state_size, initializer=tf.random_uniform_initializer(-0.1, 0.1))
                    g__i_j = tf.sigmoid(tf.matmul(inputs, w_g) + tf.matmul(full_state, u_g))

                    # Accumulate sum
                    # Columns [cur_state_pos, cur_state_pos + layer_size) are this layer's previous state.
                    h_t_1 = \
                        tf.slice(
                                full_state,
                                [0, cur_state_pos],
                                [-1, layer_size]
                        )
                    cur_state_pos += layer_size
                    U = tf.get_variable("U", [self.input_size, self._num_units],
                                        initializer=tf.random_uniform_initializer(-0.1, 0.1))
                    b = tf.get_variable("b", self.state_size, initializer=tf.constant_initializer(1.))
                    # NOTE(review): matmul(U, h_t_1) puts the weight on the left of a
                    # batch-major slice; presumably matmul(h_t_1, U) was meant - verify.
                    summation_term = tf.add(summation_term, g__i_j * tf.matmul(U, h_t_1) + b)

        return summation_term
Exemple #10
0
def sin_bank(x, bank_size, length, scope=None):
    """Apply a bank of sinusoids ``sin(x * bank + shift)`` to ``x``.

    ``bank`` and ``shift`` are float32 variables of shape ``[bank_size]``,
    each initialised uniformly in ``[0, length)``. A histogram summary of
    ``bank`` is added only when the current variable scope is not reusing.
    """
    with tf.variable_op_scope([x], scope, "SinBank"):
        frequencies = tf.get_variable(
            "bank",
            shape=[bank_size],
            dtype=tf.float32,
            initializer=tf.random_uniform_initializer(0.0, length))
        phases = tf.get_variable(
            "shift",
            shape=[bank_size],
            dtype=tf.float32,
            initializer=tf.random_uniform_initializer(0.0, length))

        reusing = tf.get_variable_scope().reuse
        if not reusing:
            tf.histogram_summary(frequencies.name, frequencies)

        return tf.sin(x * frequencies + phases)
Exemple #11
0
    def _build_net(self):
        """Build the policy network, its loss and its training op.

        Creates three placeholders (``self.tf_obs``, ``self.tf_acts``,
        ``self.tf_vt``), two hidden tanh layers of ``H`` units, an output layer
        of ``self.n_actions`` units followed by a softmax
        (``self.all_act_prob``), and an Adam training op (``self.train``).
        The loss is the mean over the batch of the negative log-probability
        of the taken action, weighted by ``self.tf_vt``.
        """
        with tf.name_scope('inputs'):
            self.tf_obs = tf.placeholder(tf.float32, [None, self.n_features], name="observations")
            self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions")
            self.tf_vt = tf.placeholder(tf.float32, [None, ], name="action_values")

        layer_1 = tf.layers.dense(
            inputs=self.tf_obs,
            units=H,
            activation=tf.nn.tanh,
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
            bias_initializer=tf.constant_initializer(0),
            name='h_layer1',
        )
        layer_2 = tf.layers.dense(
            inputs=layer_1,
            units=H,
            activation=tf.nn.tanh,
            kernel_initializer=tf.random_uniform_initializer(-0.23, 0.23),
            bias_initializer=tf.constant_initializer(0),
            name='h_layer2',
        )

        # NOTE(review): the output layer applies tanh before the softmax, which
        # bounds the logits to [-1, 1] - confirm this is intended.
        all_act = tf.layers.dense(
            inputs=layer_2,
            units=self.n_actions,
            activation=tf.nn.tanh,
            kernel_initializer=tf.random_uniform_initializer(-0.23, 0.23),
            bias_initializer=tf.constant_initializer(0),
            name='output'
        )

        self.all_act_prob = tf.nn.softmax(all_act, name='act_prob')

        # A stray `loss = tf.log(self.all_act_prob)` used to sit here; it was
        # overwritten below and only added an unused op to the graph.
        with tf.name_scope('loss'):
            # The one-hot mask keeps only the log-probability of the taken action.
            neg_log_prob = tf.reduce_sum(
                -tf.log(self.all_act_prob) * tf.one_hot(self.tf_acts, self.n_actions),
                axis=1)
            loss = tf.reduce_mean(neg_log_prob * self.tf_vt)

        with tf.name_scope('optimizer'):
            self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)
Exemple #12
0
    def dense_layer(self, input, out_dim, name, func=tf.nn.relu):
        """Fully connected layer ``func(input @ w + b)`` under variable scope ``name``.

        Both ``w`` and ``b`` are initialised uniformly in
        ``[-1/sqrt(in_dim), 1/sqrt(in_dim)]``, where ``in_dim`` is the static
        size of the last dimension of ``input``. Passing ``func=None`` returns
        the affine output without an activation.
        """
        fan_in = input.get_shape().as_list()[-1]
        bound = 1.0 / np.sqrt(fan_in)
        with tf.variable_scope(name):
            weights = tf.get_variable(
                'w',
                dtype=tf.float32,
                shape=[fan_in, out_dim],
                initializer=tf.random_uniform_initializer(-bound, bound))
            bias = tf.get_variable(
                'b',
                shape=[out_dim],
                initializer=tf.random_uniform_initializer(-bound, bound))

            affine = tf.matmul(input, weights) + bias
            output = affine if func is None else func(affine)

        return output
Exemple #13
0
    def __init__(self, embedding_dim=100, batch_size=64, n_hidden=100, learning_rate=0.01,
                 n_class=3, max_sentence_len=50, l2_reg=0., display_step=4, n_iter=100, type_=''):
        """Store hyper-parameters, load word vectors and create placeholders and softmax parameters.

        Word vectors are loaded with ``load_w2v`` from
        ``FLAGS.embedding_file_path`` and kept as a constant (non-trainable)
        tensor.
        """
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.n_hidden = n_hidden
        self.learning_rate = learning_rate
        self.n_class = n_class
        self.max_sentence_len = max_sentence_len
        self.l2_reg = l2_reg
        self.display_step = display_step
        self.n_iter = n_iter
        self.type_ = type_

        # Pre-trained embeddings are held as a tf.constant, so they are not trained.
        self.word_id_mapping, self.w2v = load_w2v(FLAGS.embedding_file_path, self.embedding_dim)
        self.word_embedding = tf.constant(self.w2v, name='word_embedding')

        self.dropout_keep_prob = tf.placeholder(tf.float32)
        sentence_shape = [None, self.max_sentence_len]
        label_shape = [None, self.n_class]
        with tf.name_scope('inputs'):
            # Forward-direction inputs.
            self.x = tf.placeholder(tf.int32, sentence_shape)
            self.y = tf.placeholder(tf.int32, label_shape)
            self.sen_len = tf.placeholder(tf.int32, None)

            # Backward-direction inputs.
            self.x_bw = tf.placeholder(tf.int32, sentence_shape)
            self.y_bw = tf.placeholder(tf.int32, label_shape)
            self.sen_len_bw = tf.placeholder(tf.int32, [None])

            self.target_words = tf.placeholder(tf.int32, [None, 1])

        # Softmax weights and bias share the same small-uniform init and L2 regulariser settings.
        with tf.name_scope('weights'):
            softmax_w = tf.get_variable(
                name='bi_lstm_w',
                shape=[2 * self.n_hidden, self.n_class],
                initializer=tf.random_uniform_initializer(-0.003, 0.003),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
            )
            self.weights = {'softmax_bi_lstm': softmax_w}

        with tf.name_scope('biases'):
            softmax_b = tf.get_variable(
                name='bi_lstm_b',
                shape=[self.n_class],
                initializer=tf.random_uniform_initializer(-0.003, 0.003),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
            )
            self.biases = {'softmax_bi_lstm': softmax_b}
 def testRandomInitializer(self):
     """Partition slices differ with an unseeded random initializer and match with a seeded one."""

     def _evaluate_slices(initializer):
         # Create a [20, 12] variable split in two along axis 1 and return
         # the flattened values of both slices.
         with self.test_session():
             slice0, slice1 = tf.create_partitioned_variables([20, 12], [1, 2], initializer)
             tf.global_variables_initializer().run()
             return slice0.eval().flatten(), slice1.eval().flatten()

     # Sanity check: without a seed the slices are initialised differently.
     unseeded0, unseeded1 = _evaluate_slices(tf.random_uniform_initializer())
     self.assertTrue(np.linalg.norm(unseeded0 - unseeded1) > 1e-6)

     # Negative test: with a fixed seed both slices get the same values.
     seeded0, seeded1 = _evaluate_slices(tf.random_uniform_initializer(seed=201))
     self.assertAllClose(seeded0, seeded1)
  def get_params(self):
    """Create the controller's weights and biases, including the per-head parameters.

    Returns:
      ``(weights, biases)``: two dicts keyed by ``"hidden"`` and ``"output"``,
      further populated by ``self.add_head_params`` for every head, once as
      a write head and once as a read head.
    """
    init_min = -0.1
    init_max = 0.1
    # The first layer sees the external inputs plus one memory row per head.
    n_first_layer = self.n_inputs + self.n_heads * self.mem_ncols

    hidden_weight = tf.get_variable(
        name="hidden_weight",
        shape=[n_first_layer, self.n_hidden],
        initializer=tf.random_uniform_initializer(init_min, init_max),
    )
    output_weight = tf.get_variable(
        name="output_weight",
        shape=[self.n_hidden, self.n_outputs],
        initializer=tf.random_uniform_initializer(init_min, init_max),
    )
    weights = {"hidden": hidden_weight, "output": output_weight}

    hidden_bias = tf.get_variable(
        name="hidden_bias",
        shape=[self.n_hidden],
        initializer=tf.constant_initializer(0),
    )
    output_bias = tf.get_variable(
        name="output_bias",
        shape=[self.n_outputs],
        initializer=tf.constant_initializer(0),
    )
    biases = {"hidden": hidden_bias, "output": output_bias}

    # For each head: write parameters first, then read parameters.
    for head_index in xrange(self.n_heads):
      for is_write in (True, False):
        self.add_head_params(
            weights=weights,
            biases=biases,
            i=head_index,
            init_min=init_min,
            init_max=init_max,
            is_write=is_write,
        )

    return weights, biases
    def __call__(self, inputs, state, full_state, layer_sizes, scope=None):
        """
        Recurrence functionality here
        In contrast to tensorflow implementation, variables will be more explicit
        :param inputs: 2D Tensor with shape [batch_size x self.input_size]
        :param state: 2D Tensor with shape [batch_size x self.state_size];
            it is split in two halves along axis 1 and the first half is used
            as the previous hidden state
        :param full_state: 2D Tensor with shape [batch_size x self.full_state_size]
        :param layer_sizes: per-layer state sizes, forwarded to compute_feedback
        :param scope: VariableScope for the created subgraph; defaults to class name
        :return:
            h_t - Output: A 2D Tensor with shape [batch_size x self.output_size]
            h_t - New state: A 2D Tensor with shape [batch_size x self.state_size].
            (the new state is also the output in a GRU cell)
        """
        with tf.variable_scope(scope or type(self).__name__):
            h_t_prev, _ = tf.split(1, 2, state)
            x_t = inputs
            # NOTE(review): scope names containing a space ("Update Gate",
            # "Reset Gate") may be rejected by TensorFlow's scope-name
            # validation - confirm against the TF version in use. They are
            # left unchanged here so existing variable names are not altered.
            with tf.variable_scope("Update Gate"):
                W_z = tf.get_variable("W_z", [self.input_size, self._num_units],
                                      initializer=tf.random_uniform_initializer(-0.1, 0.1))
                # NOTE(review): U_z / U_r multiply h_t_prev but are shaped with
                # input_size rows; presumably input_size == num_units - verify.
                U_z = tf.get_variable("U_z", [self.input_size, self._num_units],
                                      initializer=tf.random_uniform_initializer(-0.1, 0.1))
                # The initializer must be passed by keyword: the third
                # positional parameter of tf.get_variable is `dtype`.
                b_z = tf.get_variable("b_z", [self._num_units],
                                      initializer=tf.constant_initializer(0.0))

                z_t = tf.sigmoid(tf.matmul(x_t, W_z) + tf.matmul(h_t_prev, U_z) + b_z, name="z_t")

            with tf.variable_scope("Reset Gate"):
                W_r = tf.get_variable("W_r", [self.input_size, self._num_units],
                                      initializer=tf.random_uniform_initializer(-0.1, 0.1))
                U_r = tf.get_variable("U_r", [self.input_size, self._num_units],
                                      initializer=tf.random_uniform_initializer(-0.1, 0.1))
                b_r = tf.get_variable("b_r", [self._num_units],
                                      initializer=tf.constant_initializer(1.0))

                r_t = tf.sigmoid(tf.matmul(x_t, W_r) + tf.matmul(h_t_prev, U_r) + b_r, name="r_t")

            with tf.variable_scope("Candidate"):
                # New memory content
                W = tf.get_variable("W", [self.input_size, self._num_units],
                                    initializer=tf.random_uniform_initializer(-0.1, 0.1))

                # NOTE(review): `b` is created but never added to the candidate
                # activation - confirm whether that is intended.
                b = tf.get_variable("b", [self._num_units],
                                    initializer=tf.constant_initializer(0.0))

                summation_term = self.compute_feedback(x_t, full_state, layer_sizes)
                hc_t = tf.tanh(tf.matmul(x_t, W) + tf.mul(r_t, summation_term))

            # tf.Variable is not a context manager; a variable scope is used instead.
            with tf.variable_scope("Output"):
                h_t = tf.mul(z_t, hc_t) + tf.mul((1 - z_t), h_t_prev)

        return h_t, h_t
Exemple #17
0
    def testBlockGRUToGRUCellSingleStep(self):
        """A single step of GRUBlockCell matches the basic GRUCell under the same seeded init."""
        with self.test_session(use_gpu=self._use_gpu, graph=tf.Graph()) as sess:
            batch_size = 4
            cell_size = 5
            input_size = 6

            seed = 1994
            initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=seed)

            # Graph inputs; their values are overridden through the feed dict.
            x = tf.zeros([batch_size, input_size])
            h = tf.zeros([batch_size, cell_size])
            x_value = np.random.rand(batch_size, input_size)
            h_value = np.random.rand(batch_size, cell_size)
            feed = {x: x_value, h: h_value}

            def _single_step(scope_name, make_cell):
                # Build the cell inside its own scope, initialise, and run one step.
                with tf.variable_scope(scope_name, initializer=initializer):
                    step_output = make_cell()(x, h)
                    sess.run([tf.initialize_all_variables()])
                    return sess.run([step_output], feed)

            # Output from the basic GRU cell implementation.
            basic_res = _single_step("basic", lambda: tf.nn.rnn_cell.GRUCell(cell_size))
            # Output from the block GRU cell implementation.
            block_res = _single_step("block", lambda: gru_ops.GRUBlockCell(cell_size))

            self.assertEqual(len(block_res), len(basic_res))
            for block, basic in zip(block_res, basic_res):
                self.assertAllClose(block, basic)
Exemple #18
0
  def testLSTMBasicToBlockPeeping(self):
    """Peephole LSTMCell run through tf.nn.rnn agrees with fused_lstm.

    Outputs and input gradients are compared with default tolerances;
    weight gradients are compared with rtol=atol=1e-2.
    """
    with self.test_session(use_gpu=self._use_gpu) as sess:
      batch_size = 2
      input_size = 3
      cell_size = 4
      sequence_length = 5

      # One random [batch_size, input_size] tensor per time step.
      inputs = []
      for _ in range(sequence_length):
        inp = tf.convert_to_tensor(
            np.random.randn(batch_size, input_size),
            dtype=tf.float32)
        inputs.append(inp)

      # The same seeded initializer is used for both scopes.
      initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=19890212)
      with tf.variable_scope("basic", initializer=initializer):
        cell = tf.nn.rnn_cell.LSTMCell(cell_size,
                                       use_peepholes=True,
                                       state_is_tuple=True)
        outputs, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32)

        sess.run([tf.initialize_all_variables()])
        basic_outputs = sess.run(outputs)
        basic_grads = sess.run(tf.gradients(outputs, inputs))
        # tf.trainable_variables() is read before the "block" variables are
        # created - presumably it holds only the basic cell's variables here;
        # verify no other variables exist in the graph.
        basic_wgrads = sess.run(tf.gradients(outputs, tf.trainable_variables()))

      with tf.variable_scope("block", initializer=initializer):
        # Explicit parameters for the fused op: combined weights, bias and
        # the three peephole vectors.
        w = tf.get_variable("w",
                            shape=[input_size + cell_size, cell_size * 4],
                            dtype=tf.float32)
        b = tf.get_variable("b",
                            shape=[cell_size * 4],
                            dtype=tf.float32,
                            initializer=tf.zeros_initializer)

        wci = tf.get_variable("wci", shape=[cell_size], dtype=tf.float32)
        wcf = tf.get_variable("wcf", shape=[cell_size], dtype=tf.float32)
        wco = tf.get_variable("wco", shape=[cell_size], dtype=tf.float32)

        # Only the last of the seven values returned by fused_lstm is used.
        _, _, _, _, _, _, outputs = fused_lstm(
            tf.convert_to_tensor(sequence_length,
                                 dtype=tf.int64),
            inputs,
            w,
            b,
            wci=wci,
            wcf=wcf,
            wco=wco,
            cell_clip=0,
            use_peephole=True)

        sess.run([tf.initialize_all_variables()])
        block_outputs = sess.run(outputs)
        block_grads = sess.run(tf.gradients(outputs, inputs))
        block_wgrads = sess.run(tf.gradients(outputs, [w, b, wci, wcf, wco]))

      self.assertAllClose(basic_outputs, block_outputs)
      self.assertAllClose(basic_grads, block_grads)
      # Weight gradients are compared pairwise with a looser tolerance.
      for basic, block in zip(basic_wgrads, block_wgrads):
        self.assertAllClose(basic, block, rtol=1e-2, atol=1e-2)
Exemple #19
0
def train(data_dir, checkpoint_path, config):
    """Train a NamignizerModel on the name data and checkpoint after every epoch.

    Args:
        data_dir: path to the data for the model (see data_utils for data
            format)
        checkpoint_path: the path to save the trained model checkpoints
        config: configuration object that specifies the model and how it
            should be run and trained
    Returns:
        None
    """
    # Prepare Name data.
    print("Reading Name data in %s" % data_dir)
    names, counts = data_utils.read_names(data_dir)

    with tf.Graph().as_default(), tf.Session() as session:
        uniform_init = tf.random_uniform_initializer(-config.init_scale,
                                                     config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=uniform_init):
            model = NamignizerModel(is_training=True, config=config)

        tf.global_variables_initializer().run()

        for epoch in range(config.max_max_epoch):
            # The learning rate stays constant for the first max_epoch epochs,
            # then decays geometrically.
            decay_steps = max(epoch - config.max_epoch, 0.0)
            lr_decay = config.lr_decay ** decay_steps
            model.assign_lr(session, config.learning_rate * lr_decay)

            current_lr = session.run(model.lr)
            print("Epoch: %d Learning rate: %.3f" % (epoch + 1, current_lr))

            train_perplexity = run_epoch(session, model, names, counts,
                                         config.epoch_size, model.train_op,
                                         verbose=True)
            print("Epoch: %d Train Perplexity: %.3f" %
                  (epoch + 1, train_perplexity))

            model.saver.save(session, checkpoint_path, global_step=epoch)
    def __init__(self, session, np_matrix, rank,
                 learning_rate=0.1):
        """Build the graph for factorising ``np_matrix`` as ``W @ H`` with a non-negativity penalty.

        ``W`` is placed on ``/job:ps/task:0`` and ``H`` on ``/job:ps/task:1``.
        The loss is the squared Frobenius error plus ``INFINITY`` times the sum
        of ``|v| - v`` over both factors, which is zero only when every entry
        is non-negative.
        """
        target = tf.constant(np_matrix, dtype=tf.float32)
        # Factors start uniformly in [0, scale).
        scale = 2 * np.sqrt(np_matrix.mean() / rank)
        factor_init = tf.random_uniform_initializer(maxval=scale)
        n_rows, n_cols = np_matrix.shape[0], np_matrix.shape[1]

        with tf.device('/job:ps/task:0'):
            self.matrix_W = tf.get_variable(
                "W", (n_rows, rank), initializer=factor_init
            )
        with tf.device("/job:ps/task:1"):
            self.matrix_H = tf.get_variable(
                "H", (rank, n_cols), initializer=factor_init
            )

        reconstruction = tf.matmul(self.matrix_W, self.matrix_H)
        squared_error = tf.reduce_sum(tf.pow(target - reconstruction, 2))

        # |v| - v is 0 for v >= 0 and 2|v| for v < 0.
        negativity_w = tf.reduce_sum(tf.abs(self.matrix_W) - self.matrix_W)
        negativity_h = tf.reduce_sum(tf.abs(self.matrix_H) - self.matrix_H)
        penalty = INFINITY * (negativity_w + negativity_h)

        self.loss = squared_error + penalty
        self.constraint = penalty

        self.session = session
        descent = tf.train.GradientDescentOptimizer(learning_rate)
        self.optimizer = descent.minimize(self.loss)
Exemple #21
0
def modular_layer(inputs, modules: ModulePool, parallel_count: int, context: ModularContext):
    """Select modules per example and run them on ``inputs``.

    A dense controller produces categorical logits over the module pool for
    each of ``parallel_count`` slots. How the selection is obtained depends on
    ``context.mode``:

    * ``E_STEP``: the stored best selection concatenated with fresh samples
      (the first sample is dropped), flattened to ``[-1, parallel_count]``.
    * ``M_STEP``: the stored best selection for ``context.data_indices``.
    * ``EVALUATION``: the mode of the controller distribution.

    Raises:
        ValueError: if ``context.mode`` is none of the above.
    """
    with tf.variable_scope(None, 'modular_layer'):
        inputs = context.begin_modular(inputs)

        # Controller: flatten -> dense -> [batch, parallel_count, module_count] logits.
        flattened = tf.layers.flatten(inputs)
        controller_out = tf.layers.dense(flattened, modules.module_count * parallel_count)
        logits = tf.reshape(controller_out, [-1, parallel_count, modules.module_count])
        ctrl = tfd.Categorical(logits)

        # Persistent per-datapoint selection, initialised with random module indices.
        selection_init = tf.random_uniform_initializer(maxval=modules.module_count, dtype=tf.int32)
        best_selection_persistent = tf.get_variable(
            'best_selection',
            shape=[context.dataset_size, parallel_count],
            dtype=tf.int32,
            initializer=selection_init)

        mode = context.mode
        if mode == ModularMode.E_STEP:
            # Stored selection with a new leading axis added via tf.newaxis.
            stored = tf.gather(best_selection_persistent, context.data_indices)[tf.newaxis]
            # Samples reshaped to [sample_size, -1, parallel_count].
            sampled = tf.reshape(ctrl.sample(), [context.sample_size, -1, parallel_count])
            stacked = tf.concat([stored, sampled[1:]], axis=0)
            selection = tf.reshape(stacked, [-1, parallel_count])
        elif mode == ModularMode.M_STEP:
            selection = tf.gather(best_selection_persistent, context.data_indices)
        elif mode == ModularMode.EVALUATION:
            selection = ctrl.mode()
        else:
            raise ValueError('Invalid modular mode')

        layer_attrs = ModularLayerAttributes(selection, best_selection_persistent, ctrl)
        context.layers.append(layer_attrs)

        return run_modules(inputs, selection, modules.module_fnc, modules.output_shape)
Exemple #22
0
def xavier_init( n_inputs, n_outputs, uniform=True ):
    """Return a Xavier-style initializer scaled by the layer's fan-in and fan-out.

    Args:
        n_inputs: Number of input units of the layer.
        n_outputs: Number of output units of the layer.
        uniform: When true, return a uniform initializer over
            ``[-r, r]`` with ``r = sqrt(6 / (n_inputs + n_outputs))``;
            otherwise return a truncated-normal initializer with
            ``stddev = sqrt(3 / (n_inputs + n_outputs))``.

    Returns:
        A TensorFlow initializer object.
    """
    fan_total = n_inputs + n_outputs
    if not uniform:
        return tf.truncated_normal_initializer( stddev=tf.sqrt( 3.0 / fan_total ) )
    limit = tf.sqrt( 6.0 / fan_total )
    return tf.random_uniform_initializer( -limit, limit )
Exemple #23
0
  def testSharingWeightsWithDifferentNamescope(self):
    """Check that two RNNs sharing a variable scope give identical outputs.

    The two RNNs are built under different name scopes but the same
    variable scope (the second with ``reuse=True``), so their outputs for
    the same fed input are asserted to be equal.
    """
    num_units, input_size, batch_size, num_proj = 3, 5, 2, 4
    num_steps = 10
    with self.test_session(graph=tf.Graph()) as sess:
      weight_init = tf.random_uniform_initializer(-1, 1, seed=self._seed)
      # The same placeholder object is used for every time step.
      step_input = tf.placeholder(tf.float32, shape=(None, input_size))
      inputs = [step_input] * num_steps
      cell = rnn_cell.LSTMCell(
          num_units, input_size, use_peepholes=True,
          num_proj=num_proj, initializer=weight_init)

      with tf.name_scope("scope0"), tf.variable_scope("share_scope"):
        first_outputs, _ = rnn.rnn(cell, inputs, dtype=tf.float32)
      with tf.name_scope("scope1"), tf.variable_scope("share_scope", reuse=True):
        second_outputs, _ = rnn.rnn(cell, inputs, dtype=tf.float32)

      tf.initialize_all_variables().run()
      feed_value = np.random.randn(batch_size, input_size)
      fetched = sess.run(
          first_outputs + second_outputs, feed_dict={inputs[0]: feed_value})
      first_values = fetched[:num_steps]
      second_values = fetched[num_steps:]
      self.assertEqual(len(first_values), len(second_values))
      for from_first, from_second in zip(first_values, second_values):
        self.assertAllEqual(from_first, from_second)
Exemple #24
0
    def __call__(self, inputs, state, scope=None):
        """Run one step of the recurrent cell.

        ``state`` is split in two along axis 1 into the previous cell value
        and the previous output. The block input, forget gate and output
        gate each have an input weight, a recurrent weight and a bias; the
        forget and output gates additionally use peephole weights. The input
        gate is the constant 1.

        Args:
            inputs: Input tensor for this step; its dtype is used for all
                variables created here.
            state: Previous state tensor holding the cell value and output
                concatenated along axis 1.
            scope: Optional variable scope; defaults to the class name.

        Returns:
            A pair ``(y, new_state)`` where ``new_state`` is the new cell
            value and ``y`` concatenated along axis 1.
        """
        with tf.variable_scope(scope or type(self).__name__):
            uniform_init = tf.random_uniform_initializer(-0.1, 0.1)

            def make_var(var_name, var_shape):
                return tf.get_variable(
                    var_name, var_shape, initializer=uniform_init, dtype=inputs.dtype)

            cell_prev, out_prev = tf.split(1, 2, state)

            # Input weights.
            in_z = make_var("W_z", [self.input_size, self._num_blocks])
            in_f = make_var("W_f", [self.input_size, self._num_blocks])
            in_o = make_var("W_o", [self.input_size, self._num_blocks])

            # Recurrent weights.
            rec_z = make_var("R_z", [self._num_blocks, self._num_blocks])
            rec_f = make_var("R_f", [self._num_blocks, self._num_blocks])
            rec_o = make_var("R_o", [self._num_blocks, self._num_blocks])

            # Biases.
            bias_z = make_var("b_z", [1, self._num_blocks])
            bias_f = make_var("b_f", [1, self._num_blocks])
            bias_o = make_var("b_o", [1, self._num_blocks])

            # Peephole weights.
            peep_f = make_var("p_f", [self._num_blocks])
            peep_o = make_var("p_o", [self._num_blocks])

            block_input = tf.tanh(
                tf.matmul(inputs, in_z) + tf.matmul(out_prev, rec_z) + bias_z)
            input_gate = 1
            forget_gate = tf.sigmoid(
                tf.matmul(inputs, in_f) + tf.matmul(out_prev, rec_f)
                + tf.mul(cell_prev, peep_f) + bias_f)
            cell = tf.mul(input_gate, block_input) + tf.mul(forget_gate, cell_prev)
            output_gate = tf.sigmoid(
                tf.matmul(inputs, in_o) + tf.matmul(out_prev, rec_o)
                + tf.mul(cell, peep_o) + bias_o)
            output = tf.mul(tf.tanh(cell), output_gate)

            return output, tf.concat(1, [cell, output])
Exemple #25
0
  def _testDoubleInput(self, use_gpu):
    """Run a sharded, projected LSTM on float64 input and check the output dtype.

    Args:
      use_gpu: Forwarded to ``self.test_session``.
    """
    num_units, input_size, batch_size = 3, 5, 2
    num_proj, num_proj_shards, num_unit_shards = 4, 4, 2
    num_steps = 10
    with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess:
      weight_init = tf.random_uniform_initializer(-1, 1, seed=self._seed)
      # One shape-less float64 placeholder reused for every time step.
      inputs = [tf.placeholder(tf.float64)] * num_steps

      cell = rnn_cell.LSTMCell(
          num_units,
          input_size=input_size,
          use_peepholes=True,
          num_proj=num_proj,
          num_unit_shards=num_unit_shards,
          num_proj_shards=num_proj_shards,
          initializer=weight_init)

      start_state = cell.zero_state(batch_size, tf.float64)
      outputs, _ = rnn.rnn(cell, inputs, initial_state=start_state)

      self.assertEqual(len(outputs), len(inputs))

      tf.initialize_all_variables().run()
      feed_value = np.asarray(
          np.random.randn(batch_size, input_size), dtype=np.float64)
      fetched = sess.run(outputs, feed_dict={inputs[0]: feed_value})
      self.assertEqual(fetched[0].dtype, feed_value.dtype)
Exemple #26
0
  def testSharingWeightsWithReuse(self):
    """Check weight sharing via ``reuse=True`` against a separate scope.

    Three RNNs are built from the same cell: two under "share_scope" (the
    second reusing variables) and one under "diff_scope". Outputs of the
    first two must match exactly; the third must differ.
    """
    num_units, input_size, batch_size, num_proj = 3, 5, 2, 4
    num_steps = 10
    with self.test_session(graph=tf.Graph()) as sess:
      weight_init = tf.random_uniform_initializer(-1, 1, seed=self._seed)
      # The same placeholder object is used for every time step.
      step_input = tf.placeholder(tf.float32, shape=(None, input_size))
      inputs = [step_input] * num_steps
      cell = rnn_cell.LSTMCell(
          num_units, input_size, use_peepholes=True,
          num_proj=num_proj, initializer=weight_init)

      with tf.variable_scope("share_scope"):
        shared_a, _ = rnn.rnn(cell, inputs, dtype=tf.float32)
      with tf.variable_scope("share_scope", reuse=True):
        shared_b, _ = rnn.rnn(cell, inputs, dtype=tf.float32)
      with tf.variable_scope("diff_scope"):
        separate, _ = rnn.rnn(cell, inputs, dtype=tf.float32)

      tf.initialize_all_variables().run()
      feed_value = np.random.randn(batch_size, input_size)
      fetched = sess.run(
          shared_a + shared_b + separate, feed_dict={inputs[0]: feed_value})
      shared_a_values = fetched[:num_steps]
      shared_b_values = fetched[num_steps:2 * num_steps]
      separate_values = fetched[2 * num_steps:]
      self.assertEqual(len(shared_a_values), len(shared_b_values))
      self.assertEqual(len(shared_a_values), len(separate_values))
      for out_a, out_b, out_sep in zip(
          shared_a_values, shared_b_values, separate_values):
        # Both RNNs under "share_scope" use the same weights.
        self.assertAllEqual(out_a, out_b)
        # The RNN under "diff_scope" has its own weights.
        self.assertTrue(np.linalg.norm(out_a-out_sep) > 1e-6)
Exemple #27
0
 def build(self, _):
   """Create the trainable embedding matrix.

   The variable has shape ``[vocab_size, embedding_dim]``, dtype float32
   and is initialised uniformly in ``[-0.1, 0.1)``. The positional
   argument is ignored.
   """
   table_shape = [self.vocab_size, self.embedding_dim]
   uniform_init = tf.random_uniform_initializer(-0.1, 0.1)
   self.embedding = self.add_variable(
       "embedding_kernel",
       shape=table_shape,
       dtype=tf.float32,
       initializer=uniform_init,
       trainable=True)
Exemple #28
0
  def _testProjSharding(self, use_gpu):
    """Build and run an LSTM with sharded unit and projection weights.

    Only checks that one output is produced per input step and that the
    graph runs with a fed input.

    Args:
      use_gpu: Forwarded to ``self.test_session``.
    """
    num_units, input_size, batch_size = 3, 5, 2
    num_proj, num_proj_shards, num_unit_shards = 4, 4, 2
    num_steps = 10
    with self.test_session(use_gpu=use_gpu, graph=tf.Graph()) as sess:
      weight_init = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed)

      # The same placeholder object is used for every time step.
      step_input = tf.placeholder(tf.float32, shape=(None, input_size))
      inputs = [step_input] * num_steps

      cell = rnn_cell.LSTMCell(
          num_units,
          input_size=input_size,
          use_peepholes=True,
          num_proj=num_proj,
          num_unit_shards=num_unit_shards,
          num_proj_shards=num_proj_shards,
          initializer=weight_init)

      outputs, _ = rnn.rnn(cell, inputs, dtype=tf.float32)

      self.assertEqual(len(outputs), len(inputs))

      tf.initialize_all_variables().run()
      feed_value = np.random.randn(batch_size, input_size)
      sess.run(outputs, feed_dict={inputs[0]: feed_value})
def uniform(shape=None, minval=0, maxval=None, dtype=tf.float32, seed=None, name='Uniform'):
    """Uniform.

    Initialization with random values drawn from a uniform distribution
    over `[minval, maxval)`: `minval` is included, `maxval` is excluded.

    For floats, the default range is `[0, 1)`.  For ints, at least `maxval`
    must be specified explicitly.

    In the integer case, the random integers are slightly biased unless
    `maxval - minval` is an exact power of two.  The bias is small for values of
    `maxval - minval` significantly smaller than the range of the output (either
    `2**32` or `2**64`).

    Args:
        shape: List of `int`. A shape to initialize a Tensor (optional).
        minval: Lower bound of the range (inclusive).
        maxval: Upper bound of the range (exclusive).
        dtype: The tensor data type.
        seed: `int`. Used to create a random seed for the distribution.
        name: name of the op.

    Returns:
        The Initializer, or an initialized `Tensor` if shape is specified.

    """
    distribution_args = dict(minval=minval, maxval=maxval, seed=seed, dtype=dtype)
    if not shape:
        # No (or empty) shape: hand back an initializer object.
        with get_name_scope(name):
            return tf.random_uniform_initializer(**distribution_args)
    # A shape was given: produce the random tensor directly.
    return tf.random_uniform(shape=shape, name=name, **distribution_args)
Exemple #30
0
  def __init__(self,
               vocab_size,
               embedding_dim,
               hidden_dim,
               num_layers,
               dropout_ratio,
               use_cudnn_rnn=True):
    """Assemble the embedding, recurrent and output layers of the model.

    Args:
      vocab_size: Vocabulary size; used for the embedding and as the number
        of units of the final dense layer.
      embedding_dim: Embedding dimension; also the last dimension of
        ``self._output_shape``.
      hidden_dim: Hidden size passed to the recurrent layer.
      num_layers: Number of recurrent layers.
      dropout_ratio: Dropout ratio; ``self.keep_ratio`` is ``1 - dropout_ratio``.
      use_cudnn_rnn: If true use ``cudnn_rnn.CudnnLSTM``, otherwise ``RNN``.
    """
    super(PTBModel, self).__init__()

    self.keep_ratio = 1 - dropout_ratio
    self.use_cudnn_rnn = use_cudnn_rnn

    embedding_layer = Embedding(vocab_size, embedding_dim)
    self.embedding = self.track_layer(embedding_layer)

    self.rnn = (
        cudnn_rnn.CudnnLSTM(num_layers, hidden_dim, dropout=dropout_ratio)
        if self.use_cudnn_rnn
        else RNN(hidden_dim, num_layers, self.keep_ratio))
    self.track_layer(self.rnn)

    output_projection = tf.layers.Dense(
        vocab_size,
        kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
    self.linear = self.track_layer(output_projection)
    self._output_shape = [-1, embedding_dim]
    def build_input(self, feats, reuse):
        """Build the input features: unigram embeddings and clipped image features.

        The embedding initializer is chosen by ``self.config.embd_init_type``
        ('uniform', 'random_normal' or 'truncated_normal'); any other value
        leaves the initializer unspecified. The two normal variants draw a
        random seed in ``[1, 10000]``.

        Args:
            feats: Mapping with at least the keys 'unigram' and 'img_feat'.
            reuse: Passed to the 'Input' variable scope.

        Returns:
            A pair ``(uni_embd, img_feat)``.
        """
        with tf.variable_scope('Input', reuse=reuse):
            # Unigram embedding table
            init_type = self.config.embd_init_type
            if init_type == 'uniform':
                embd_init = tf.random_uniform_initializer(-1., 1.)
            elif init_type == 'random_normal':
                embd_init = tf.random_normal_initializer(
                    self.config.norm_mean, self.config.norm_std,
                    seed=random.randint(1, 10000))
            elif init_type == 'truncated_normal':
                embd_init = tf.truncated_normal_initializer(
                    self.config.norm_mean, self.config.norm_std,
                    seed=random.randint(1, 10000))
            else:
                embd_init = None
            embd_table = tf.get_variable(
                'uni_embd', [self.vocab_size, self.embd_size], initializer=embd_init)

            # Absolute value of the ids is looked up, so -1 maps to row 1.
            unigram_ids = tf.abs(feats['unigram'])
            uni_embd = tf.nn.embedding_lookup(embd_table, unigram_ids)

            # Image feature, clipped to [-100, 100]
            img_feat = tf.clip_by_value(feats['img_feat'], -100, 100)

        return uni_embd, img_feat
def main_word2vec_basic():
    """Train and evaluate a skip-gram word2vec model on the text8 dataset.

    The function runs end to end as a script:

    1. Load the text8 word list and pick hyperparameters from ``FLAGS.model``.
    2. Build (or, when ``resume`` is true, load) the vocabulary and id data.
    3. Print two small demo skip-gram batches.
    4. Build the embedding network with an NCE cost and an Adagrad optimizer.
    5. Train, periodically printing the average loss and nearest neighbours
       of a random validation set, and periodically saving model and data.
    6. Visualize the normalized embeddings with t-SNE.
    7. Evaluate on the analogy questions in 'questions-words.txt' and print
       the accuracy.
    """
    tf.logging.set_verbosity(tf.logging.DEBUG)
    tl.logging.set_verbosity(tl.logging.DEBUG)
    # sess = tf.InteractiveSession()
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    # Step 1: Download the data, read the context into a list of strings.
    # Set hyperparameters.
    words = tl.files.load_matt_mahoney_text8_dataset()
    data_size = len(words)
    print(data_size)  # 17005207
    print(
        words[0:10]
    )  # ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
    # exit()

    resume = False  # load existing model, data and dictionaries
    _UNK = "_UNK"

    # NOTE(review): the four branches below are independent `if`s with no
    # fallback. If FLAGS.model is none of "one".."four", no hyperparameter is
    # assigned and the num_steps computation further down raises NameError.
    if FLAGS.model == "one":
        # toy setting (tensorflow/examples/tutorials/word2vec/word2vec_basic.py)
        vocabulary_size = 50000  # maximum number of word in vocabulary
        batch_size = 128
        embedding_size = 128  # Dimension of the embedding vector (hidden layer).
        skip_window = 1  # How many words to consider left and right.
        num_skips = 2  # How many times to reuse an input to generate a label.
        #     (should be double of 'skip_window' so as to
        #     use both left and right words)
        num_sampled = 64  # Number of negative examples to sample.
        #     more negative samples, higher loss
        learning_rate = 1.0
        n_epoch = 20
        model_file_name = "model_word2vec_50k_128"
        # Eval 2084/15851 accuracy = 15.7%
    if FLAGS.model == "two":
        # (tensorflow/models/embedding/word2vec.py)
        vocabulary_size = 80000
        batch_size = 20  # Note: small batch_size need more steps for a Epoch
        embedding_size = 200
        skip_window = 5
        num_skips = 10
        num_sampled = 100
        learning_rate = 0.2
        n_epoch = 15
        model_file_name = "model_word2vec_80k_200"
        # 7.9%
    if FLAGS.model == "three":
        # (tensorflow/models/embedding/word2vec_optimized.py)
        vocabulary_size = 80000
        batch_size = 500
        embedding_size = 200
        skip_window = 5
        num_skips = 10
        num_sampled = 25
        learning_rate = 0.025
        n_epoch = 20
        model_file_name = "model_word2vec_80k_200_opt"
        # bad 0%
    if FLAGS.model == "four":
        # see: Learning word embeddings efficiently with noise-contrastive estimation
        vocabulary_size = 80000
        batch_size = 100
        embedding_size = 600
        skip_window = 5
        num_skips = 10
        num_sampled = 25
        learning_rate = 0.03
        n_epoch = 200 * 10
        model_file_name = "model_word2vec_80k_600"
        # bad

    num_steps = int(
        (data_size / batch_size) * n_epoch)  # total number of iteration

    print('%d Steps in a Epoch, total Epochs %d' %
          (int(data_size / batch_size), n_epoch))
    print('   learning_rate: %f' % learning_rate)
    print('   batch_size: %d' % batch_size)

    # Step 2: Build the dictionary and replace rare words with 'UNK' token.
    print()
    if resume:
        print("Load existing data and dictionaries" + "!" * 10)
        all_var = tl.files.load_npy_to_any(name=model_file_name + '.npy')
        data = all_var['data']
        count = all_var['count']
        dictionary = all_var['dictionary']
        reverse_dictionary = all_var['reverse_dictionary']
    else:
        data, count, dictionary, reverse_dictionary = tl.nlp.build_words_dataset(
            words, vocabulary_size, True, _UNK)

    print(
        'Most 5 common words (+UNK)', count[:5]
    )  # [['UNK', 418391], (b'the', 1061396), (b'of', 593677), (b'and', 416629), (b'one', 411764)]
    print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
    # [5243, 3081, 12, 6, 195, 2, 3135, 46, 59, 156] [b'anarchism', b'originated', b'as', b'a', b'term', b'of', b'abuse', b'first', b'used', b'against']

    del words  # Hint to reduce memory.

    # Step 3: Function to generate a training batch for the Skip-Gram model.
    # Two small demo batches are generated and printed; both start from
    # data_index=0, and the data_index returned by the second call is the
    # starting position used by the training loop below.
    print()

    batch, labels, data_index = tl.nlp.generate_skip_gram_batch(data=data, \
        batch_size=8, num_skips=4, skip_window=2, data_index=0)
    for i in range(8):
        print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
              reverse_dictionary[labels[i, 0]])

    batch, labels, data_index = tl.nlp.generate_skip_gram_batch(data=data, \
        batch_size=8, num_skips=2, skip_window=1, data_index=0)
    for i in range(8):
        print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
              reverse_dictionary[labels[i, 0]])

    # Step 4: Build a Skip-Gram model.
    print()

    # We pick a random validation set to sample nearest neighbors. Here we limit the
    # validation samples to the words that have a low numeric ID, which by
    # construction are also the most frequent.
    valid_size = 16  # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)
    # a list of 'valid_size' integers smaller than 'valid_window'
    # print(valid_examples)   # [90 85 20 33 35 62 37 63 88 38 82 58 83 59 48 64]
    # n_epoch = int(num_steps / batch_size)

    # train_inputs has shape [batch_size]; each entry is the integer id of a word.
    # train_labels has shape [batch_size, 1]; each entry is the integer id of a word.
    # valid_dataset holds the 'valid_size' validation word ids as an int32 constant.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Look up embeddings for inputs.
    emb_net = tl.layers.Word2vecEmbeddingInputlayer(
        inputs=train_inputs,
        train_labels=train_labels,
        vocabulary_size=vocabulary_size,
        embedding_size=embedding_size,
        num_sampled=num_sampled,
        nce_loss_args={},
        E_init=tf.random_uniform_initializer(minval=-1.0, maxval=1.0),
        E_init_args={},
        nce_W_init=tf.truncated_normal_initializer(
            stddev=float(1.0 / np.sqrt(embedding_size))),
        nce_W_init_args={},
        nce_b_init=tf.constant_initializer(value=0.0),
        nce_b_init_args={},
        name='word2vec_layer',
    )

    # Construct the optimizer. Note: AdamOptimizer is very slow in this case
    cost = emb_net.nce_cost
    train_params = emb_net.all_params
    # train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost, var_list=train_params)
    train_op = tf.train.AdagradOptimizer(learning_rate,
                                         initial_accumulator_value=0.1,
                                         use_locking=False).minimize(
                                             cost, var_list=train_params)

    # Compute the cosine similarity between minibatch examples and all embeddings.
    # For simple visualization of validation set.
    normalized_embeddings = emb_net.normalized_embeddings
    valid_embed = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embed,
                           normalized_embeddings,
                           transpose_b=True)
    # multiply all valid word vector with all word vector.
    # transpose_b=True, normalized_embeddings is transposed before multiplication.

    # Step 5: Start training.
    sess.run(tf.global_variables_initializer())
    if resume:
        print("Load existing model" + "!" * 10)
        # Load from ckpt or npz file
        # saver = tf.train.Saver()
        # saver.restore(sess, model_file_name+'.ckpt')
        tl.files.load_and_assign_npz_dict(name=model_file_name + '.npz',
                                          sess=sess)

    emb_net.print_params(False)
    emb_net.print_layers()

    # save vocabulary to txt
    tl.nlp.save_vocab(count, name='vocab_text8.txt')

    average_loss = 0
    step = 0
    print_freq = 2000
    while step < num_steps:
        # start_time is reset every iteration, so the "took" value printed
        # below is the duration of the current step only.
        start_time = time.time()
        batch_inputs, batch_labels, data_index = tl.nlp.generate_skip_gram_batch(data=data, \
            batch_size=batch_size, num_skips=num_skips, skip_window=skip_window, data_index=data_index)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        # We perform one update step by evaluating the train_op (including it
        # in the list of returned values for sess.run()
        _, loss_val = sess.run([train_op, cost], feed_dict=feed_dict)
        average_loss += loss_val

        if step % print_freq == 0:
            # At step 0 the accumulated value is a single loss, so it is
            # printed without dividing.
            if step > 0:
                average_loss /= print_freq
            print("Average loss at step %d/%d. loss: %f took: %fs" % \
                (step, num_steps, average_loss, time.time() - start_time))
            average_loss = 0
        # Prints out nearby words given a list of words.
        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        # NOTE(review): xrange is not a Python 3 builtin; presumably it is
        # imported elsewhere in the file (e.g. from six.moves) - confirm.
        if step % (print_freq * 5) == 0:
            sim = similarity.eval(session=sess)
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors to print
                # Index 0 of the descending sort is skipped (presumably the
                # query word itself).
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)

        if (step % (print_freq * 20) == 0) and (step != 0):
            print("Save model, data and dictionaries" + "!" * 10)
            # Save to ckpt or npz file
            # saver = tf.train.Saver()
            # save_path = saver.save(sess, model_file_name+'.ckpt')
            tl.files.save_npz_dict(emb_net.all_params,
                                   name=model_file_name + '.npz',
                                   sess=sess)
            tl.files.save_any_to_npy(save_dict={
                'data': data,
                'count': count,
                'dictionary': dictionary,
                'reverse_dictionary': reverse_dictionary
            },
                                     name=model_file_name + '.npy')

        # if step == num_steps-1:
        #     keeptrain = input("Training %d finished enter 1 to keep training: " % num_steps)
        #     if keeptrain == '1':
        #         step = 0
        #         learning_rate = float(input("Input new learning rate: "))
        #         train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
        step += 1

    # Step 6: Visualize the normalized embedding matrix by t-SNE.
    print()

    final_embeddings = sess.run(normalized_embeddings)  #.eval()
    tl.visualize.tsne_embedding(final_embeddings, reverse_dictionary, plot_only=500, \
        second=5, saveable=False, name='word2vec_basic')

    # Step 7: Evaluate by analogy questions. see tensorflow/models/embedding/word2vec_optimized.py
    print()

    #  from tensorflow/models/embedding/word2vec.py
    analogy_questions = tl.nlp.read_analogies_file(
        eval_file='questions-words.txt', word2id=dictionary)
    # The eval feeds three vectors of word ids for a, b, c, each of
    # which is of size N, where N is the number of analogies we want to
    # evaluate in one batch.
    analogy_a = tf.placeholder(dtype=tf.int32)  # [N]
    analogy_b = tf.placeholder(dtype=tf.int32)  # [N]
    analogy_c = tf.placeholder(dtype=tf.int32)  # [N]
    # Each row of a_emb, b_emb, c_emb is a word's embedding vector.
    # They all have the shape [N, emb_dim]
    a_emb = tf.gather(normalized_embeddings, analogy_a)  # a's embs
    b_emb = tf.gather(normalized_embeddings, analogy_b)  # b's embs
    c_emb = tf.gather(normalized_embeddings, analogy_c)  # c's embs
    # We expect that d's embedding vectors on the unit hyper-sphere is
    # near: c_emb + (b_emb - a_emb), which has the shape [N, emb_dim].
    #   Bangkok Thailand Tokyo Japan -> Thailand - Bangkok = Japan - Tokyo
    #   Japan = Tokyo + (Thailand - Bangkok)
    #   d = c + (b - a)
    target = c_emb + (b_emb - a_emb)
    # Compute cosine distance between each pair of target and vocab.
    # dist has shape [N, vocab_size].
    dist = tf.matmul(target, normalized_embeddings, transpose_b=True)
    # For each question (row in dist), find the top 'n_answer' words.
    n_answer = 4
    _, pred_idx = tf.nn.top_k(dist, n_answer)

    def predict(analogy):
        """Predict the top 4 answers for analogy questions."""
        idx, = sess.run(
            [pred_idx], {
                analogy_a: analogy[:, 0],
                analogy_b: analogy[:, 1],
                analogy_c: analogy[:, 2]
            })
        return idx

    # Evaluate analogy questions and reports accuracy.
    #  i.e. How many questions we get right at precision@1.
    # Questions are processed in slices of 2500 rows.
    correct = 0
    total = analogy_questions.shape[0]
    start = 0
    while start < total:
        limit = start + 2500
        sub = analogy_questions[start:limit, :]  # question
        idx = predict(sub)  # 4 answers for each question
        # print('question:', tl.nlp.word_ids_to_words(sub[0], reverse_dictionary))
        # print('answers:', tl.nlp.word_ids_to_words(idx[0], reverse_dictionary))
        start = limit
        for question in xrange(sub.shape[0]):
            for j in xrange(n_answer):
                # Walk the ranked answers: a hit counts as correct, an answer
                # that repeats one of the question words is skipped, and any
                # other answer ends the search for this question.
                if idx[question, j] == sub[question, 3]:
                    # Bingo! We predicted correctly. E.g., [italy, rome, france, paris].
                    print(
                        j + 1,
                        tl.nlp.word_ids_to_words([idx[question, j]],
                                                 reverse_dictionary), ':',
                        tl.nlp.word_ids_to_words(sub[question, :],
                                                 reverse_dictionary))
                    correct += 1
                    break
                elif idx[question, j] in sub[question, :3]:
                    # We need to skip words already in the question.
                    continue
                else:
                    # The correct label is not the precision@1
                    break
    print("Eval %4d/%d accuracy = %4.1f%%" %
          (correct, total, correct * 100.0 / total))
def _build_sequence(placeholders, config):
    '''Build the core encoder/decoder graph of the sequence model.

    An LSTM encoder (with input dropout) reads the embedded, time-major
    encoder inputs. An LSTM decoder driven by ``tf.nn.raw_rnn`` starts from
    the encoder's final state and the embedded GO token, and feeds the
    embedding of its own argmax prediction back in as the next input.

    Args:
        placeholders: Mapping that provides the tensors 'encoder_inputs',
            'dropout_input_keep_prob' and 'decoder_inputs_length'.
        config: Object providing ``vocab_size``, ``embedding_size``,
            ``hidden_units`` and ``batch_size``.

    Returns:
        A tuple ``(decoder_prediction, decoder_logits, inference_set)`` where
        ``inference_set`` is ``(mean_encoder_inputs_embedded,
        mean_encoder_outputs, final_cell_state, final_hidden_state)``.
    '''

    with tf.name_scope('sequence_variables'):
        # Initialize embeddings to have variance=1, encoder and decoder share the same embeddings
        sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
        initializer = tf.random_uniform_initializer(-sqrt3,
                                                    sqrt3,
                                                    dtype=tf.float32)
        embeddings = tf.get_variable(
            name='word_embedding_matrix',
            shape=[config.vocab_size, config.embedding_size],
            initializer=initializer,
            dtype=tf.float32)

        # Projection from decoder outputs to vocabulary logits.
        projection_weights = tf.Variable(tf.random_uniform(
            [config.hidden_units, config.vocab_size], -1, 1),
                                         dtype=tf.float32,
                                         name='projection_weights')

        projection_bias = tf.Variable(tf.zeros([config.vocab_size]),
                                      dtype=tf.float32,
                                      name='projection_bias')

        encoder_inputs_embedded = tf.nn.embedding_lookup(
            embeddings,
            placeholders['encoder_inputs'],
            name='encoder_inputs_embedded')

    with tf.name_scope('encoder_sequence'):
        encoder_cell = tf.contrib.rnn.LSTMCell(config.hidden_units)
        encoder_cell = tf.contrib.rnn.DropoutWrapper(
            encoder_cell,
            input_keep_prob=placeholders['dropout_input_keep_prob'])
        # time_major=True: the encoder inputs are laid out as
        # [max_time, batch_size, ...].
        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
            encoder_cell,
            encoder_inputs_embedded,
            dtype=tf.float32,
            time_major=True,
            scope='encoder')

    with tf.name_scope('inference'):
        ## transpose the dimension of embedded input to [batch_size, max_time, embedded_size]
        encoder_inputs_embedded_ = tf.transpose(encoder_inputs_embedded,
                                                [1, 0, 2])
        # Average over the time axis.
        mean_encoder_inputs_embedded = tf.reduce_mean(encoder_inputs_embedded_,
                                                      axis=1)

        ## change the dimension to [batch_size, max_time, cell.output_size]
        encoder_outputs_ = tf.transpose(encoder_outputs, [1, 0, 2])
        mean_encoder_outputs = tf.reduce_mean(encoder_outputs_, axis=1)

        # The two components of the encoder's final LSTM state.
        final_cell_state = encoder_final_state[0]
        final_hidden_state = encoder_final_state[1]

    with tf.name_scope('decoder_sequence'):
        decoder_cell = tf.contrib.rnn.LSTMCell(config.hidden_units)
        ## one extra decoding step is added to the given lengths
        decoder_lengths = placeholders[
            'decoder_inputs_length'] + 1  ## consider the first <_GO>
        ## create the embedded _GO
        # The GO token id must be 1 because the slice below is built with tf.ones.
        assert TOKEN_DICT[_GO] == 1
        # NOTE(review): this op is named 'EOS' although it holds the GO token id.
        go_time_slice = tf.ones([config.batch_size],
                                dtype=tf.int32,
                                name='EOS')
        go_step_embedded = tf.nn.embedding_lookup(embeddings, go_time_slice)

        def loop_fn_initial():
            '''Return the raw_rnn loop values for the first decoder step.

            The embedded GO token is the first input and the external
            variable `encoder_final_state` is used as the initial cell state.
            '''
            initial_elements_finished = (0 >= decoder_lengths
                                         )  # all False at the initial step
            initial_input = go_step_embedded
            initial_cell_state = encoder_final_state
            initial_cell_output = None
            initial_loop_state = None  # we don't need to pass any additional information
            return (initial_elements_finished, initial_input,
                    initial_cell_state, initial_cell_output,
                    initial_loop_state)

        def loop_fn_transition(time, previous_output, previous_state,
                               previous_loop_state):
            '''Return the raw_rnn loop values for every step after the first.

            The previous output is projected onto the vocabulary and the
            embedding of the argmax word becomes the next input, instead of
            feeding a target sequence as with `dynamic_rnn`.
            '''
            def get_next_input():
                # Embed the most likely word according to the previous output.
                output_logits = tf.add(
                    tf.matmul(previous_output, projection_weights),
                    projection_bias)
                prediction = tf.argmax(output_logits, axis=1)
                next_input = tf.nn.embedding_lookup(embeddings, prediction)
                return next_input

            elements_finished = (
                time >= decoder_lengths
            )  # this operation produces boolean tensor of [batch_size]
            # defining if corresponding sequence has ended
            cur_input = get_next_input()
            cur_state = previous_state
            cur_output = previous_output
            loop_state = None
            return (elements_finished, cur_input, cur_state, cur_output,
                    loop_state)

        def loop_fn(time, previous_output, previous_state,
                    previous_loop_state):
            # Dispatch to the initial or the transition function; a missing
            # previous state marks the first call.
            if previous_state is None:  # time == 0
                assert previous_output is None and previous_state is None
                return loop_fn_initial()
            else:
                return loop_fn_transition(time, previous_output,
                                          previous_state, previous_loop_state)

        decoder_outputs_tensor_array, decoder_final_state, _ = tf.nn.raw_rnn(
            decoder_cell, loop_fn)
        # Stack the per-step outputs into a single tensor.
        decoder_outputs = decoder_outputs_tensor_array.stack()

    with tf.name_scope('outputs_projection'):
        ## project the decoder outputs of every step onto the vocabulary
        decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(
            tf.shape(decoder_outputs))
        # Flatten steps and batch so one matmul covers all positions.
        decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, decoder_dim))
        decoder_logits_flat = tf.add(
            tf.matmul(decoder_outputs_flat, projection_weights),
            projection_bias)
        decoder_logits = tf.reshape(
            decoder_logits_flat,
            (decoder_max_steps, decoder_batch_size, config.vocab_size))
        decoder_prediction = tf.argmax(decoder_logits, 2)
    tf.summary.histogram('{}_histogram'.format('decoder_prediction'),
                         decoder_prediction)

    inference_set = (mean_encoder_inputs_embedded, mean_encoder_outputs,
                     final_cell_state, final_hidden_state)
    return decoder_prediction, decoder_logits, inference_set
    def __init__(self,
                 vocab_size,
                 buckets,
                 size,
                 num_layers,
                 batch_size,
                 num_softmax_samples,
                 do_decode,
                 num_gpus=2,
                 train_and_test=False):
        """
        Build the bucketed seq2seq graph: placeholders, shared embedding,
        stacked LSTM encoder, attention decoder, output projection,
        per-bucket accuracy and per-bucket loss.

        :param vocab_size:  vocabulary size; used for the shared embedding and the output projection
        :param buckets:  sequence of buckets; bucket[0] / bucket[1] are used as the encoder / decoder
                         lengths, and the placeholders are sized from the last bucket
        :param size:  number of units of each LSTM cell (also the embedding width)
        :param num_layers:  number of stacked encoder layers
        :param batch_size:  fixed batch size used in every placeholder shape
        :param num_softmax_samples:  number of classes sampled by the sampled-softmax loss;
                                     0 selects the plain sequence loss instead
        :param do_decode:  training or testing; affects the decoding process of the seq2seq model
        :param num_gpus:  number of GPUs
        :param train_and_test:  run training and prediction together; when true every variable
                                scope below is opened with reuse=True
        """
        self._cur_gpu = 0  # used to automatically choose between GPU and CPU devices
        self._num_gpus = num_gpus  # number of GPUs
        self.sess = None  # TF session; if None, a new one has to be created later
        self.buckets = buckets
        self.global_step = tf.Variable(
            0, trainable=False)  # a tensor recording how many training steps have been run

        encoder_inputs = []  # encoder inputs
        decoder_inputs = []
        target_inputs = []
        loss_weight_inputs = []

        # Placeholders for all encoder input token ids (one per time step
        # of the last bucket).
        for i in range(buckets[-1][0]):
            encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[batch_size],
                               name="encoder{}".format(i)))
        squence_length = tf.placeholder(tf.int32, [batch_size],
                                        name='squence_length')
        self.squence_length = squence_length
        # Placeholders for all decoder-side token ids: decoder inputs,
        # targets and loss weights (one of each per decoder time step).
        for i in range(buckets[-1][1]):
            decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[batch_size],
                               name="decoder{}".format(i)))
            target_inputs.append(
                tf.placeholder(tf.int64,
                               shape=[batch_size],
                               name="target{}".format(i)))
            loss_weight_inputs.append(
                tf.placeholder(tf.float32,
                               shape=[batch_size],
                               name="loss_weight{}".format(i)))
        encoder_inputs_buckets = {}
        decoder_inputs_buckets = {}
        target_inputs_buckets = {}
        loss_weight_inputs_buckets = {}
        # Per-bucket views of the encoder / decoder / target placeholders:
        # each bucket uses a prefix of the full placeholder lists.
        for bucket_id, bucket in enumerate(buckets):
            encoder_inputs_buckets[bucket_id] = encoder_inputs[0:bucket[0]]
            decoder_inputs_buckets[bucket_id] = decoder_inputs[0:bucket[1]]
            target_inputs_buckets[bucket_id] = target_inputs[0:bucket[1]]
            loss_weight_inputs_buckets[bucket_id] = loss_weight_inputs[
                0:bucket[1]]

        self.encoder_inputs_buckets = encoder_inputs_buckets
        self.decoder_inputs_buckets = decoder_inputs_buckets
        self.target_inputs_buckets = target_inputs_buckets
        self.loss_weight_inputs_buckets = loss_weight_inputs_buckets

        # Embeddings for the whole encoder and decoder side (one table
        # shared by both).
        with tf.variable_scope(
                'embedding',
                reuse=True if train_and_test else None), tf.device('/cpu:0'):
            embedding = tf.get_variable(
                'embedding', [vocab_size, size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            # every word look up a word vector.
            emb_encoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in encoder_inputs
            ]
            emb_decoder_inputs = [
                tf.nn.embedding_lookup(embedding, x) for x in decoder_inputs
            ]
        encoder_embedding_buckets = {}
        decoder_embedding_buckets = {}
        # Per-bucket embedded encoder / decoder inputs.
        for i, bucket in enumerate(buckets):
            encoder_embedding_buckets[i] = emb_encoder_inputs[0:bucket[0]]
            decoder_embedding_buckets[i] = emb_decoder_inputs[0:bucket[1]]
        # Buckets are needed from here on.
        encoder_output_buckets = {}
        encoder_state_buckets = {}
        device = self._next_device()
        for bucket_id, bucket in enumerate(buckets):
            encoder_input_embedding = encoder_embedding_buckets[bucket_id]
            # Each layer consumes the outputs of the previous layer; variables
            # are reused for every bucket after the first (or always when
            # train_and_test is set).
            for layer_id in range(num_layers):
                with tf.variable_scope(
                        "encoder%d" % layer_id,
                        reuse=(True if bucket_id > 0 else None) or
                    (True if train_and_test else None)), tf.device(device):
                    cell = LSTMCell(num_units=size,
                                    initializer=tf.random_uniform_initializer(
                                        -0.1, 0.1, seed=123),
                                    state_is_tuple=True)
                    encoder_input_embedding, state = static_rnn(
                        cell=cell,
                        inputs=encoder_input_embedding,
                        sequence_length=squence_length,
                        dtype=tf.float32)
                # Overwritten on every layer, so the bucket ends up holding
                # the outputs and state of the last layer.
                output = encoder_input_embedding
                encoder_output_buckets[bucket_id] = output
                encoder_state_buckets[bucket_id] = state
        # Projection from the decoder output (size) to the vocabulary.
        with tf.variable_scope('output_projection',
                               reuse=True if train_and_test else None):
            w = tf.get_variable(
                'w', [size, vocab_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            w_t = tf.transpose(w)
            v = tf.get_variable(
                'v', [vocab_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        # A loop function is only passed to the decoder when decoding.
        loop_function = _extract_argmax_and_embed(embedding,
                                                  (w,
                                                   v)) if do_decode else None
        # A single decoder cell object is shared by all buckets.
        cell = LSTMCell(size,
                        initializer=tf.random_uniform_initializer(-0.1,
                                                                  0.1,
                                                                  seed=123),
                        state_is_tuple=True)
        decoder_output_buckets = {}
        decoder_state_buckets = {}
        device = self._next_device()
        for bucket_id, bucket in enumerate(buckets):
            with tf.variable_scope(
                    "decoder",
                    reuse=(True if bucket_id > 0 else None)
                    or (True if train_and_test else None)), tf.device(device):
                # Join the per-step encoder outputs along a new axis 1 to form
                # the attention states.
                t = tf.concat(values=[
                    tf.reshape(x, [-1, 1, size])
                    for x in encoder_output_buckets[bucket_id]
                ],
                              axis=1)
                decoder_output, decoder_state = attention_decoder(
                    decoder_inputs=decoder_embedding_buckets[bucket_id],
                    initial_state=encoder_state_buckets[bucket_id],
                    attention_states=t,
                    cell=cell,
                    num_heads=1,
                    loop_function=loop_function,
                    initial_state_attention=do_decode)
                decoder_output_buckets[bucket_id] = decoder_output
                decoder_state_buckets[bucket_id] = decoder_state
        model_output_buckets = {}  # output logits
        model_output_predict_buckets = {}
        model_output_predict_merger_buckets = {}
        model_output_accuracy = {}
        device = self._next_device()
        for bucket_id, bucket in enumerate(buckets):
            model_output = []
            model_output_predict = []
            model_accuracy = []
            with tf.variable_scope(
                    "output",
                    reuse=(True if bucket_id > 0 else None)
                    or (True if train_and_test else None)), tf.device(device):
                for j in range(len(decoder_output_buckets[bucket_id])):
                    # Project the decoder output of step j to vocabulary
                    # logits and take the argmax as the prediction.
                    output = tf.nn.xw_plus_b(
                        decoder_output_buckets[bucket_id][j], w, v)
                    predict = tf.argmax(input=output,
                                        axis=1,
                                        name="predict_{}_{}".format(
                                            bucket_id, j))
                    # Per-step accuracy over the batch; the loss weights are
                    # not applied here, so every position counts equally.
                    accuracy_bool = tf.equal(
                        x=target_inputs_buckets[bucket_id][j], y=predict)
                    model_accuracy.append(
                        tf.reduce_mean(
                            tf.cast(x=accuracy_bool, dtype=tf.float32)))
                    model_output.append(output)
                    model_output_predict.append(
                        tf.reshape(tensor=predict, shape=[-1, 1]))
            model_output_buckets[bucket_id] = model_output
            model_output_predict_buckets[bucket_id] = model_output_predict
            # One column per decoder step.
            model_output_predict_merger_buckets[bucket_id] = tf.concat(
                values=model_output_predict, axis=1)
            # Average of the per-step accuracies over the bucket's decoder length.
            model_output_accuracy[bucket_id] = tf.add_n(inputs=model_accuracy, name="bucket_id_{}".format(bucket_id)) / \
                                               buckets[bucket_id][1]
        self.model_output_buckets = model_output_buckets
        self.model_output_predict_buckets = model_output_predict_buckets
        self.model_output_predict_merger_buckets = model_output_predict_merger_buckets
        self.model_output_accuracy = model_output_accuracy

        # Sampled-softmax replacement for the softmax loss. Despite its name,
        # `logits` receives the raw decoder outputs (see the call below) and
        # is passed on as `inputs` to tf.nn.sampled_softmax_loss.
        def sampled_loss_func(labels, logits):  # the TF 1.0 API is stricter about this signature
            with tf.device('/cpu:0'):  # Try gpu.
                labels = tf.reshape(labels, [-1, 1])
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(v, tf.float32)
                local_inputs = tf.cast(logits, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(weights=local_w_t,
                                               biases=local_b,
                                               labels=labels,
                                               inputs=local_inputs,
                                               num_sampled=num_softmax_samples,
                                               num_classes=vocab_size),
                    tf.float32)

        device = self._next_device()
        loss_buckets = {}
        for bucket_id, bucket in enumerate(buckets):
            with tf.variable_scope(
                    'loss',
                    reuse=(True if bucket_id > 0 else None)
                    or (True if train_and_test else None)), tf.device(device):
                if num_softmax_samples != 0 and not do_decode:
                    # The inputs differ between the two branches because this
                    # one replaces the softmax function: it is fed the decoder
                    # outputs instead of the projected logits.
                    loss = sequence_loss_by_example(
                        logits=decoder_output_buckets[bucket_id],
                        targets=target_inputs_buckets[bucket_id],
                        weights=loss_weight_inputs_buckets[bucket_id],
                        average_across_timesteps=True,
                        softmax_loss_function=sampled_loss_func)
                    # loss = sequence_loss(logits=model_output_buckets[bucket_id],
                    #                      targets=target_inputs_buckets[bucket_id],
                    #                      weights=loss_weight_inputs_buckets[bucket_id]
                    #                      )
                else:
                    loss = sequence_loss(
                        logits=model_output_buckets[bucket_id],
                        targets=target_inputs_buckets[bucket_id],
                        weights=loss_weight_inputs_buckets[bucket_id])
                loss_buckets[bucket_id] = tf.reduce_mean(loss)  # compute the mean loss
        self.loss_buckets = loss_buckets
Exemple #35
0
def main(_):
    """Train, validate and test the PTB language model.

    Three `PTBModel` instances are built under the shared "Model" variable
    scope (the validation and test models reuse the training variables),
    then the training model is run for `config.max_max_epoch` epochs with a
    decaying learning rate. The test model uses a batch size and step count
    of 1.

    Args:
        _: unused positional argument.

    Raises:
        ValueError: if `FLAGS.data_path` is not set.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")

    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data

    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        with tf.name_scope("Train"):
            train_input = PTBInput(config=config,
                                   data=train_data,
                                   name="TrainInput")
            with tf.variable_scope("Model",
                                   reuse=None,
                                   initializer=initializer):
                m = PTBModel(is_training=True,
                             config=config,
                             input_=train_input)
            # tf.scalar_summary was removed in TensorFlow 1.0;
            # tf.summary.scalar is its replacement.
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)

        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config,
                                   data=valid_data,
                                   name="ValidInput")
            with tf.variable_scope("Model",
                                   reuse=True,
                                   initializer=initializer):
                mvalid = PTBModel(is_training=False,
                                  config=config,
                                  input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)

        with tf.name_scope("Test"):
            test_input = PTBInput(config=eval_config,
                                  data=test_data,
                                  name="TestInput")
            with tf.variable_scope("Model",
                                   reuse=True,
                                   initializer=initializer):
                mtest = PTBModel(is_training=False,
                                 config=eval_config,
                                 input_=test_input)

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        with sv.managed_session() as session:
            for i in range(config.max_max_epoch):
                # The rate stays at its initial value for the first
                # `max_epoch` epochs and then decays geometrically.
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)

                print("Epoch: %d Learning rate: %.3f" %
                      (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session,
                                             m,
                                             eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f" %
                      (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f" %
                      (i + 1, valid_perplexity))

            test_perplexity = run_epoch(session, mtest)
            print("Test Perplexity: %.3f" % test_perplexity)

            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session,
                              FLAGS.save_path,
                              global_step=sv.global_step)
Exemple #36
0
def train(data_dir, save_dir, best_dir, config):
	"""Prepare the data and begin training.

	Loads the corpus and vocabularies from `data_dir`, builds the model,
	restores any previous state from `save_dir`, then runs epoch-based
	training with a validation pass after every epoch. The learning rate is
	multiplied by `config.lr_decay` whenever validation perplexity fails to
	improve on the previous epoch.

	Args:
		data_dir: directory holding the corpus and `word_freq.txt`.
		save_dir: directory the model is restored from and passed to the
			training epochs as `save_dir`.
		best_dir: directory passed to the validation epochs as `best_dir`.
		config: configuration object; read by attribute and updated in place
			with the vocabulary sizes, word frequencies and `save_dir`.
	"""
	# Create variables
	batch_size = config.batch_size
	timesteps = config.timesteps
	num_epochs = config.epochs
	# Load the text and vocabulary
	data_loader = DataLoader(
		data_dir, 
		mode='train', 
		tokenize_func=lmmrl_tokenizer, 
		encode_func=lmmrl_encoder, 
		word_markers=config.include_word_markers,
		max_word_length=config.max_word_length
	)
	# Prepare batches for training and validation
	train_batch_loader = BatchLoader(data_loader, batch_size=batch_size, timesteps=timesteps, mode='train')
	val_batch_loader = BatchLoader(data_loader, batch_size=batch_size, timesteps=timesteps, mode='val')

	# update vocabulary sizes
	config.word_vocab_size = len(data_loader.vocabs['words'])
	config.char_vocab_size = len(data_loader.vocabs['chars'])

	# Run on GPU by default
	cfg_proto = tf.ConfigProto(intra_op_parallelism_threads=0, inter_op_parallelism_threads=0)
	cfg_proto.gpu_options.allow_growth = True

	##########################################################################
	# Load word frequency information
	##########################################################################
	with open(os.path.join(data_dir, 'word_freq.txt'), encoding='utf-8') as f:
		freq = f.read().split()
		# NOTE(review): config is accessed by attribute everywhere else in this
		# function; item assignment assumes it also supports __setitem__ -- confirm.
		config['freq'] = freq
	##########################################################################

	# Create model
	config.save_dir = save_dir
	model = Model(config)

	with tf.Session(config=cfg_proto, graph=model.graph) as sess:
		# Restore model/Initialize weights
		initializer = tf.random_uniform_initializer(-0.05, 0.05)
		with tf.variable_scope("model", reuse=None, initializer=initializer):
			steps_done = restore_model(sess, model, save_dir)
		
		logger.info("Loaded %d completed steps", steps_done)

		# Find starting epoch
		start_epoch = model.epoch_cntr.eval()

		# Start epoch-based training
		lr = config.initial_learning_rate
		
		# Finalize graph to prevent memory leakage
		sess.graph.finalize()
		last_val_ppl = 10000
		
		for epoch in range(start_epoch, num_epochs):
			logger.info("Epoch %d / %d", epoch+1, num_epochs)
			# train
			run_epoch(sess, model, train_batch_loader, 'train', save_dir=save_dir, lr=lr)
			# fine-tune after every epoch
			# (this line used to be indented with spaces inside a tab-indented
			# body, which Python 3 rejects with a TabError)
			sess.run(model.update_unknown)
			model.fine_tune(sess)
			# validate
			val_ppl = run_epoch(sess, model, val_batch_loader, 'val', best_dir=best_dir)
			# update learning rate conditionally
			if val_ppl >= last_val_ppl:
				lr *= config.lr_decay
				logger.info("Decaying learning rate to %.4f", lr)
			last_val_ppl = val_ppl
			# increment epoch
			sess.run([model.incr_epoch])
 def w_initializer(dim_in, dim_out):
     """Return a uniform initializer over [-r, r], r = sqrt(6 / (dim_in + dim_out))."""
     fan_total = dim_in + dim_out
     bound = math.sqrt(6.0 / fan_total)
     return tf.random_uniform_initializer(-bound, bound)
Exemple #38
0
def word2vec(batch_gen):
    """Construct the word2vec graph and train it with an NCE loss.

    Args:
        batch_gen: iterator; every ``next()`` call must yield a
            ``(centers, targets)`` pair that is fed to the two placeholders.
    """
    # Inputs. Center-word ids stay integer so they can index the embedding
    # table.
    with tf.name_scope('data'):
        center_ids = tf.placeholder(tf.int32, [BATCH_SIZE],
                                    name='center_words')
        target_ids = tf.placeholder(tf.int32, [BATCH_SIZE, SKIP_WINDOW],
                                    name='target_words')

    # The embedding table (vocab size x embed size) is the part of the model
    # we actually care about; it starts out uniform in [-1, 1].
    with tf.name_scope('embedding_matrix'):
        uniform_init = tf.random_uniform_initializer(-1.0, 1.0)
        embedding_table = tf.get_variable('WordEmbedding',
                                          [VOCAB_SIZE, EMBED_SIZE],
                                          tf.float32,
                                          initializer=uniform_init)

    with tf.name_scope('loss'):
        # The NCE inputs are the embeddings of the center words, not the ids.
        center_vectors = tf.nn.embedding_lookup(embedding_table,
                                                center_ids,
                                                name='embed')

        # NCE parameters: truncated-normal weights scaled by
        # 1 / sqrt(EMBED_SIZE) and a zero bias.
        nce_stddev = 1.0 / (EMBED_SIZE**0.5)
        nce_weight = tf.get_variable(
            'nce_weight', [VOCAB_SIZE, EMBED_SIZE],
            initializer=tf.truncated_normal_initializer(stddev=nce_stddev))
        nce_bias = tf.get_variable('nce_bias', [VOCAB_SIZE],
                                   initializer=tf.zeros_initializer())

        # Per-example NCE loss, averaged across the batch.
        per_example_loss = tf.nn.nce_loss(nce_weight, nce_bias, target_ids,
                                          center_vectors, NUM_SAMPLED,
                                          VOCAB_SIZE)
        loss = tf.reduce_mean(per_example_loss, 0)

    train_op = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        writer = tf.summary.FileWriter('./graphs/no_frills/', sess.graph)
        # Loss accumulated since the last report.
        running_loss = 0.0
        for step in range(NUM_TRAIN_STEPS):
            centers, targets = next(batch_gen)
            _, batch_loss = sess.run(
                [train_op, loss],
                feed_dict={center_ids: centers, target_ids: targets})
            running_loss += batch_loss
            if (step + 1) % SKIP_STEP != 0:
                continue
            print('Average loss at step {}: {:5.1f}'.format(
                step, running_loss / SKIP_STEP))
            running_loss = 0.0
        writer.close()
    def __init__(self,
                 layer_name,
                 filter_size,
                 num_hidden_in,
                 num_hidden,
                 seq_shape,
                 x_shape_in,
                 tln=False,
                 initializer=None):
        """Initialize the spatio-temporal conv LSTM cell.

        Args:
            layer_name: name of this layer; stored on the instance.
            filter_size: kernel size handed to the three state/input
                convolutions.
            num_hidden_in: fan-in used when sizing the default initializer
                of the two state-to-state convolutions.
            num_hidden: number of hidden channels; the gate convolutions
                output ``num_hidden * 4`` channels.
            seq_shape: indexable shape; items 0, 2 and 3 are stored as
                batch, height and width.
            x_shape_in: fan-in used when sizing the default initializer of
                the input-to-state convolution.
            tln: whether to apply tensor layer normalization (stored as
                ``self.layer_norm``).
            initializer: ``None`` or ``-1`` selects a uniform initializer
                whose range depends on fan-in/fan-out; any other number
                ``r`` selects a uniform initializer over ``[-r, r]``.
        """
        super(SpatioTemporalLSTMCell, self).__init__()
        self.layer_name = layer_name  # name of this layer
        self.filter_size = filter_size  # convolution kernel size
        self.num_hidden_in = num_hidden_in  # hidden-state input size
        self.num_hidden = num_hidden  # number of hidden channels
        self.batch = seq_shape[0]  # batch_size
        self.height = seq_shape[2]  # image height
        self.width = seq_shape[3]  # image width
        self.x_shape_in = x_shape_in  # number of input channels
        self.layer_norm = tln  # whether to normalize
        self._forget_bias = 1.0  # forget-gate bias

        def w_initializer(dim_in, dim_out):
            random_range = math.sqrt(6.0 / (dim_in + dim_out))
            return tf.random_uniform_initializer(-random_range, random_range)

        # `self.initializer` is always called below as
        # `self.initializer(dim_in, dim_out)`, so both branches must store a
        # factory with that signature. Previously the numeric branch stored
        # the initializer object itself, which was then invoked with the two
        # dims as if they were (shape, dtype).
        if initializer is None or initializer == -1:  # default initialization
            self.initializer = w_initializer
        else:

            def fixed_range_initializer(dim_in, dim_out):
                # The range is fixed by the caller; the dims are ignored.
                return tf.random_uniform_initializer(-initializer,
                                                     initializer)

            self.initializer = fixed_range_initializer

        # Build the layers.
        # h
        self.t_cc = layers.Conv2D(
            self.num_hidden * 4,  # number of output channels
            self.filter_size,
            1,
            padding='same',  # kernel size, stride, padding mode
            kernel_initializer=self.initializer(self.num_hidden_in,
                                                self.num_hidden * 4),
            name='time_state_to_state')

        # m
        self.s_cc = layers.Conv2D(
            self.num_hidden * 4,  # number of output channels
            self.filter_size,
            1,
            padding='same',  # kernel size, stride, padding mode
            kernel_initializer=self.initializer(self.num_hidden_in,
                                                self.num_hidden * 4),
            name='spatio_state_to_state')

        # x
        self.x_cc = layers.Conv2D(
            self.num_hidden * 4,  # number of output channels
            self.filter_size,
            1,
            padding='same',  # kernel size, stride, padding mode
            kernel_initializer=self.initializer(self.x_shape_in,
                                                self.num_hidden * 4),
            name='input_to_state')

        # c
        self.c_cc = layers.Conv2D(
            self.num_hidden,  # number of output channels
            1,
            1,
            padding='same',  # kernel size, stride, padding mode
            kernel_initializer=self.initializer(self.num_hidden * 2,
                                                self.num_hidden),
            name='cell_reduce')

        # Normalization layers; created regardless of `tln`.
        self.bn_t_cc = tensor_layer_norm('st_time_state_to_state')
        self.bn_s_cc = tensor_layer_norm('st_spatio_state_to_state')
        self.bn_x_cc = tensor_layer_norm('st_input_to_state')
Exemple #40
0
 def get_decoder_cell(rnn_size):
     """Return an LSTM cell of `rnn_size` units, uniform-initialized in [-0.1, 0.1] with seed 2."""
     weight_init = tf.random_uniform_initializer(-0.1, 0.1, seed=2)
     return tf.contrib.rnn.LSTMCell(rnn_size, initializer=weight_init)
Exemple #41
0
 def make_cell(rnn_size, keep_prob):
     """Return an LSTM cell of `rnn_size` units wrapped with output dropout.

     The cell weights are uniform-initialized in [-0.1, 0.1]; `keep_prob` is
     passed to the wrapper as the output keep probability.
     """
     uniform_init = tf.random_uniform_initializer(-0.1, 0.1)
     lstm = tf.contrib.rnn.LSTMCell(rnn_size, initializer=uniform_init)
     return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
Exemple #42
0
        itertools.product(*(range(args.distinct_nums)
                            for _ in range(args.num_terms)))))
# Shuffle the rows of product_array in place (module-level side effect).
np.random.shuffle(product_array)


def data():
    """Return the module-level `product_array` with its axes reversed."""
    transposed = np.transpose(product_array)
    return transposed


def target(data):
    """Return the sum of `data` along its first axis."""
    totals = np.sum(data, axis=0)
    return totals


# Initializer handed to the root variable scope opened below.
init = tf.random_uniform_initializer()
with tf.Session() as sess, tf.variable_scope("", initializer=init):
    # embeddings
    # Integer ids are fed as a [num_terms, batch_size] matrix.
    inputs = tf.placeholder(tf.int32,
                            shape=[args.num_terms, args.batch_size],
                            name='inputs')
    with tf.device('/cpu:0'):
        # The embedding table is created with an explicit uniform initial
        # value drawn between -1.0 and 1.0.
        embeddings = tf.Variable(tf.random_uniform(
            [args.vocabulary_size, embedding_size], -1.0, 1.0),
                                 name='embeddings')
        lookups = tf.nn.embedding_lookup(embeddings, inputs, name='lookups')
    # NOTE(review): tf.unpack was renamed tf.unstack in TensorFlow 1.0 --
    # confirm which TensorFlow version this script targets.
    inputs_list = tf.unpack(lookups)

    # GRU
    # The GRU cell is wrapped so that dropout is applied to its outputs.
    cell = tf.nn.rnn_cell.GRUCell(args.num_cells)
    cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=args.keep_prob)
Exemple #43
0
    def __init__(
        self,
        input_vocab_size,  # size of the input vocabulary
        target_vocab_size,  # size of the output vocabulary
        batch_size=32,  # size of a data batch
        embedding_size=300,  # embedding dimension of the input and output vocabularies
        mode="train",  # 'train' or 'decode' (checked by an assert below)
        hidden_units=256,  # RNN hidden size, same for encoder and decoder
        depth=1,  # number of RNN layers in encoder and decoder
        beam_width=0,  # beam-search hyper-parameter used for decoding
        cell_type="lstm",  # RNN cell type: lstm or gru
        dropout=0.2,  # fraction of activations to drop, in [0, 1)
        use_dropout=False,  # whether to use dropout
        use_residual=False,  # whether to use residual connections
        optimizer='adam',  # which optimizer to use
        learning_rate=1e-3,  # learning rate
        min_learning_rate=1e-5,  # minimum learning rate
        decay_steps=50000,  # number of decay steps
        max_gradient_norm=5.0,  # gradient-clipping coefficient
        max_decode_step=None,  # maximum decode length, may be very large
        attention_type='Bahdanau',  # attention type to use
        bidirectional=False,  # whether to use a bidirectional encoder
        time_major=False,  # whether time is the leading axis during computation
        seed=0,  # random seed for some of the layer operations
        parallel_iterations=None,  # number of RNN loop iterations run in parallel
        share_embedding=False,  # whether encoder and decoder share one embedding
        pretrained_embedding=False):  # whether a pretrained embedding is used
        """Store the hyper-parameters, validate them and build the model.

        Validation is done with asserts, so invalid arguments raise
        AssertionError. When `parallel_iterations` is not an int it falls
        back to `batch_size`. The constructor ends by calling
        `self.build_model()`.
        """
        # Vocabulary and size settings.
        self.input_vocab_size = input_vocab_size
        self.target_vocab_size = target_vocab_size
        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.hidden_units = hidden_units
        self.depth = depth

        # Architecture switches.
        self.cell_type = cell_type.lower()
        self.use_dropout = use_dropout
        self.use_residual = use_residual
        self.attention_type = attention_type
        self.mode = mode

        # Optimization settings.
        self.optimizer = optimizer
        self.learning_rate = learning_rate
        self.min_learning_rate = min_learning_rate
        self.decay_steps = decay_steps
        self.max_gradient_norm = max_gradient_norm
        self.keep_prob = 1.0 - dropout
        self.seed = seed
        self.pretrained_embedding = pretrained_embedding
        self.bidirectional = bidirectional

        self.parallel_iterations = (parallel_iterations if isinstance(
            parallel_iterations, int) else batch_size)
        self.time_major = time_major
        self.share_embedding = share_embedding
        # Uniform random initializer used for variable initialization.
        self.initializer = tf.random_uniform_initializer(-0.05,
                                                         0.05,
                                                         dtype=tf.float32)

        # Argument checks; the messages are runtime strings and are kept
        # verbatim.
        assert self.cell_type in ('gru', 'lstm'), 'cell_type 应该是GRU 或者是 LSTM'
        if share_embedding:
            assert input_vocab_size == target_vocab_size, '如果share_embedding 为True 那么两个vocab_size 必须一样'
        assert mode in ('train', 'decode'), (
            'mode 必须是train 或者是decode , 而不是{}'.format(mode))
        assert 0.0 <= dropout < 1.0, 'dropout 必须大于等于0 且小于等于1'
        assert attention_type.lower() in ('bahdanau', 'loung'), (
            'attention_type 必须是bahdanau 或者是 loung')
        assert beam_width < target_vocab_size, (
            'beam_width {} 应该小于target_vocab_size{}'.format(
                beam_width, target_vocab_size))

        self.keep_prob_placeholder = tf.placeholder(tf.float32,
                                                    shape=[],
                                                    name='keep_prob')
        self.global_step = tf.Variable(0, trainable=False, name='global_step')

        # Beam search is used for decoding only when a positive width is given.
        self.beam_width = beam_width
        self.use_beamsearch_decode = bool(self.beam_width > 0)
        self.max_decode_step = max_decode_step

        assert self.optimizer.lower() in (
            'adadelta', 'adam', 'rmsprop', 'momentum', 'sgd'), (
                'optimizer 必须是下列之一: adadelta, adam, rmsprop, momentum, sgd ')
        self.build_model()
    def _build_model(self):
        """Add the sentiment model to the graph.

        Creates the embedding table, encodes the input batch, projects every
        encoder position to 2 classes, and defines the plain and
        reward-weighted sequence losses together with an Adagrad optimizer.
        """
        hparams = self._hps
        vocab_size = self._vocab.size()  # size of the vocabulary

        with tf.variable_scope('sentiment'):
            # Initializers kept on the instance.
            self.rand_unif_init = tf.random_uniform_initializer(
                -hparams.rand_unif_init_mag,
                hparams.rand_unif_init_mag,
                seed=123)
            self.trunc_norm_init = tf.truncated_normal_initializer(
                stddev=hparams.trunc_norm_init_std)

            # Embedding matrix.
            with tf.variable_scope('embedding'):
                embedding = tf.get_variable('embedding',
                                            [vocab_size, hparams.emb_dim],
                                            dtype=tf.float32,
                                            initializer=self.trunc_norm_init)
                # NOTE(review): `hiddenstates` is only bound in auto-encoder
                # mode; any other run_method fails further down with an
                # unbound-local error.
                if FLAGS.run_method == 'auto-encoder':
                    # Embed the encoder batch and zero out padded positions
                    # with the padding mask before encoding.
                    emb_enc_inputs = tf.nn.embedding_lookup(
                        embedding, self._enc_batch)
                    padding_mask = tf.expand_dims(self._enc_padding_mask,
                                                  axis=-1)
                    emb_enc_inputs = emb_enc_inputs * padding_mask
                    hiddenstates = self._add_encoder(emb_enc_inputs,
                                                     self._enc_lens)

            # Binary projection applied to every encoder position.
            w = tf.get_variable(
                'w', [hparams.hidden_dim * 2, 2],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            v = tf.get_variable(
                'v', [2],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            flat_rows = hparams.batch_size * hparams.max_enc_steps
            hiddenstates = tf.reshape(hiddenstates, [flat_rows, -1])
            logits = tf.nn.xw_plus_b(hiddenstates, w, v)
            logits = tf.reshape(
                logits, [hparams.batch_size, hparams.max_enc_steps, 2])

            def per_example_loss():
                # Sequence loss averaged over time steps but not over the
                # batch; `self._weight` is passed as the targets and the
                # padding mask as the weights.
                return tf.contrib.seq2seq.sequence_loss(
                    logits,
                    self._weight,
                    self._enc_padding_mask,
                    average_across_timesteps=True,
                    average_across_batch=False)

            loss = per_example_loss()
            self.max_output = tf.argmax(logits, axis=-1)

            # Same loss, built a second time and scaled by the reward.
            reward_loss = per_example_loss() * self.reward

            # Update the cost
            self._cost = tf.reduce_mean(loss)
            self._reward_cost = tf.reduce_mean(reward_loss)
            self.optimizer = tf.train.AdagradOptimizer(
                hparams.lr,
                initial_accumulator_value=hparams.adagrad_init_acc)
Exemple #45
0
    def __call__(self, x, prev_state):
        """Run one step of the memory-augmented cell.

        The controller consumes the current input together with the previous
        read vectors; its output is projected to per-head addressing
        parameters, the memory is updated (one slot per batch row is zeroed,
        then every head writes) and finally read back.

        Args:
            x: input tensor for this step. It is concatenated with the
                previous read vectors along axis 1 (assumes rank 2,
                (batch, features) - TODO confirm against caller).
            prev_state: dict produced by the previous step. The keys
                'controller_state', 'read_vector_list', 'w_r_list', 'w_u'
                and 'M' are read here.

        Returns:
            A tuple (NTM_output, state). NTM_output is the controller output
            concatenated with the new read vectors along axis 1; state is the
            dict to pass as prev_state to the next step.

        Raises:
            ValueError: if self.k_strategy is neither 'summary' nor
                'separate'.
        """
        prev_read_vector_list = prev_state['read_vector_list']
        prev_controller_vector_list = prev_state['controller_state']

        controller_input = tf.concat([x] + prev_read_vector_list, axis=1)
        with tf.variable_scope('controller', reuse=self.reuse):
            controller_output, controller_state = self.controller(controller_input, prev_controller_vector_list)

        # Every head needs a key k and one gate value; 'separate' adds a
        # dedicated write vector a of the same width as k.
        if self.k_strategy == 'summary':
            num_parameter_per_head = self.memory_vector_dim + 1
        elif self.k_strategy == 'separate':
            num_parameter_per_head = self.memory_vector_dim * 2 + 1
        else:
            # Fail early with a clear message instead of an UnboundLocalError below.
            raise ValueError(
                "Unsupported k_strategy: %r (expected 'summary' or 'separate')" % (self.k_strategy,))
        separate_write_vector = self.k_strategy == 'separate'

        total_parameter_num = num_parameter_per_head * self.head_num
        # The projection variables are created on the first step and reused afterwards.
        with tf.variable_scope('o2p', reuse=(self.step > 0) or self.reuse):
            o2p_w = tf.get_variable('o2p_w', [controller_output.get_shape()[1], total_parameter_num],
                                    initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))
            o2p_b = tf.get_variable('o2p_b', [total_parameter_num],
                                    initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))
            parameters = tf.nn.xw_plus_b(controller_output, o2p_w, o2p_b)
        head_parameter_list = tf.split(parameters, self.head_num, axis=1)

        prev_w_r_list = prev_state['w_r_list']
        prev_M = prev_state['M']
        prev_w_u = prev_state['w_u']
        prev_indices, prev_w_lu = self.least_used(prev_w_u)

        w_r_list = []
        w_w_list = []
        k_list = []
        a_list = []

        for i, head_parameter in enumerate(head_parameter_list):
            with tf.variable_scope('addressing_head_%d' % i):
                k = tf.tanh(head_parameter[:, 0:self.memory_vector_dim], name='k')
                if separate_write_vector:
                    a = tf.tanh(head_parameter[:, self.memory_vector_dim:self.memory_vector_dim * 2], name='a')
                # The last column of each head's parameters drives the write gate.
                sig_alpha = tf.sigmoid(head_parameter[:, -1:], name='sig_alpha')
                w_r = self.read_head_addressing(k, prev_M)
                w_w = self.write_head_addressing(sig_alpha, prev_w_r_list[i], prev_w_lu)
            w_r_list.append(w_r)
            w_w_list.append(w_w)
            k_list.append(k)
            if separate_write_vector:
                a_list.append(a)

        w_u = self.gamma * prev_w_u + tf.add_n(w_r_list) + tf.add_n(w_w_list)  # eq (20)

        # Set least used memory location computed from w_(t-1)^u to zero
        M_ = prev_M * tf.expand_dims(1. - tf.one_hot(prev_indices[:, -1], self.memory_size), axis=2)

        # Writing: each head adds the outer product of its write weights and
        # its content vector (k in 'summary' mode, a in 'separate' mode).
        M = M_
        written_content_list = a_list if separate_write_vector else k_list
        with tf.variable_scope('writing'):
            for w_w, content in zip(w_w_list, written_content_list):
                w = tf.expand_dims(w_w, axis=2)
                k = tf.expand_dims(content, axis=1)
                M = M + tf.matmul(w, k)

        # Reading: weighted sum of the memory rows for every head.
        read_vector_list = []
        with tf.variable_scope('reading'):
            for w_r in w_r_list:
                read_vector = tf.reduce_sum(tf.expand_dims(w_r, axis=2) * M, axis=1)
                read_vector_list.append(read_vector)

        # controller_output -> NTM output
        NTM_output = tf.concat([controller_output] + read_vector_list, axis=1)
        state = {
            'controller_state': controller_state,
            'read_vector_list': read_vector_list,
            'w_r_list': w_r_list,
            'w_w_list': w_w_list,
            'w_u': w_u,
            'M': M,
        }

        self.step += 1
        return NTM_output, state
Exemple #46
0
    def __init__(self, sess, config, api, log_dir, scope=None):
        """Build the full training graph of the dialog-state VAE model.

        Creates the input placeholders, the shared word embedding, the
        sentence-level RNN encoder, the state-level VAE cell and its loss,
        then calls self.optimize and creates a saver over all global variables.

        Args:
            sess: session object, forwarded to self.optimize.
            config: configuration object. The attributes read here are
                n_state, cell_type, encoding_cell_size, state_cell_size,
                keep_prob, num_layer, max_utt_len, with_label_loss, init_lr,
                lr_decay and embed_size; it is also forwarded to VAECell and
                self.optimize.
            api: data API object; only len(api.vocab) is used here.
            log_dir: forwarded to self.optimize.
            scope: stored on the instance as self.scope; not otherwise used
                in this constructor.
        """
        self.sess = sess
        self.config = config
        self.n_state = config.n_state
        self.n_vocab = len(api.vocab)
        self.cell_type = config.cell_type
        self.encoding_cell_size = config.encoding_cell_size
        self.state_cell_size = config.state_cell_size
        self.keep_prob = config.keep_prob
        self.num_layer = config.num_layer
        self.max_utt_len = config.max_utt_len
        self.scope = scope

        with tf.name_scope("io"):
            self.global_t = tf.placeholder(dtype=tf.int32, name="global_t")
            self.usr_input_sent = tf.placeholder(dtype=tf.int32,
                                                 shape=(None, None,
                                                        self.max_utt_len),
                                                 name="user_input")
            self.sys_input_sent = tf.placeholder(dtype=tf.int32,
                                                 shape=(None, None,
                                                        self.max_utt_len),
                                                 name="sys_input")
            # NOTE: (None) is plain None, so this placeholder has an
            # unconstrained shape rather than a 1-D one.
            self.dialog_length_mask = tf.placeholder(dtype=tf.int32,
                                                     shape=(None),
                                                     name="dialog_length_mask")
            self.usr_full_mask = tf.placeholder(dtype=tf.int32,
                                                shape=(None, None,
                                                       self.max_utt_len),
                                                name="usr_full_mask")
            self.sys_full_mask = tf.placeholder(dtype=tf.int32,
                                                shape=(None, None,
                                                       self.max_utt_len),
                                                name="sys_full_mask")
            # Dialog length is only known at run time (second axis of the input).
            max_dialog_len = tf.shape(self.usr_input_sent)[1]

            self.learning_rate = tf.Variable(float(config.init_lr),
                                             trainable=False,
                                             name="learning_rate")
            self.learning_rate_decay_op = self.learning_rate.assign(
                tf.multiply(self.learning_rate, config.lr_decay))
            self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")

            if self.config.with_label_loss:
                with tf.name_scope("labeled_id"):
                    self.labeled_usr_input_sent = tf.placeholder(
                        dtype=tf.int32,
                        shape=(None, None, self.max_utt_len),
                        name="labeled_user_input"
                    )  #batch_size, dialog_len, max_utt_len
                    self.labeled_sys_input_sent = tf.placeholder(
                        dtype=tf.int32,
                        shape=(None, None, self.max_utt_len),
                        name="labeled_sys_input")
                    self.labeled_dialog_length_mask = tf.placeholder(
                        dtype=tf.int32,
                        shape=(None),
                        name="labeled_dialog_length_mask")
                    self.labeled_usr_full_mask = tf.placeholder(
                        dtype=tf.int32,
                        shape=(None, None, self.max_utt_len),
                        name="labeled_usr_full_mask")
                    self.labeled_sys_full_mask = tf.placeholder(
                        dtype=tf.int32,
                        shape=(None, None, self.max_utt_len),
                        name="labeled_sys_full_mask")
                    self.labeled_labels = tf.placeholder(tf.int32,
                                                         shape=(None, None),
                                                         name="labeled_labels")

        with variable_scope.variable_scope("sent_embedding"):
            self.W_embedding = tf.get_variable(
                "W_embedding", [self.n_vocab, config.embed_size],
                dtype=tf.float32)
            # Zero out the embedding row of token id 0 (presumably the padding
            # id - verify against the vocabulary).
            embedding_mask = tf.constant(
                [0 if i == 0 else 1 for i in range(self.n_vocab)],
                dtype=tf.float32,
                shape=[self.n_vocab, 1])
            W_embedding = self.W_embedding * embedding_mask

            # Token ids are flattened for the lookup, then regrouped into one
            # row per utterance.
            usr_input_embedding = tf.nn.embedding_lookup(
                W_embedding, tf.reshape(self.usr_input_sent,
                                        [-1]))  # (8000, 300)
            usr_input_embedding = tf.reshape(
                usr_input_embedding,
                [-1, self.max_utt_len, self.config.embed_size
                 ])  #(160, 50, 300)
            sys_input_embedding = tf.nn.embedding_lookup(
                W_embedding, tf.reshape(self.sys_input_sent,
                                        [-1]))  # (8000, 300)
            sys_input_embedding = tf.reshape(
                sys_input_embedding,
                [-1, self.max_utt_len, self.config.embed_size
                 ])  #(160, 50, 300)

            if self.config.with_label_loss:
                labeled_usr_input_embedding = tf.nn.embedding_lookup(
                    W_embedding, tf.reshape(self.labeled_usr_input_sent,
                                            [-1]))  # (8000, 300)
                labeled_usr_input_embedding = tf.reshape(
                    labeled_usr_input_embedding,
                    [-1, self.max_utt_len, self.config.embed_size
                     ])  # (160, 50, 300)
                labeled_sys_input_embedding = tf.nn.embedding_lookup(
                    W_embedding, tf.reshape(self.labeled_sys_input_sent,
                                            [-1]))  # (8000, 300)
                labeled_sys_input_embedding = tf.reshape(
                    labeled_sys_input_embedding,
                    [-1, self.max_utt_len, self.config.embed_size
                     ])  # (160, 50, 300)

        with variable_scope.variable_scope("sent_level"):
            # One RNN cell is shared by all utterance encoders below; every
            # call after the first passes reuse=True.
            self.encoding_cell = self.get_rnncell(self.cell_type,
                                                  self.encoding_cell_size,
                                                  self.keep_prob,
                                                  num_layer=self.num_layer)
            usr_input_embedding, usr_sent_size = get_rnn_encode(
                usr_input_embedding,
                self.encoding_cell,
                scope="sent_embedding_rnn")
            sys_input_embedding, sys_sent_size = get_rnn_encode(
                sys_input_embedding,
                self.encoding_cell,
                scope="sent_embedding_rnn",
                reuse=True)

            # Element [1] of the encoder result is regrouped per dialog
            # (presumably the hidden part of the final state - TODO confirm
            # against get_rnn_encode).
            usr_input_embedding = tf.reshape(
                usr_input_embedding[1], [-1, max_dialog_len, usr_sent_size[0]])
            sys_input_embedding = tf.reshape(
                sys_input_embedding[1], [-1, max_dialog_len, sys_sent_size[0]])

            if self.config.with_label_loss:
                labeled_usr_input_embedding, labeled_usr_sent_size = get_rnn_encode(
                    labeled_usr_input_embedding,
                    self.encoding_cell,
                    scope="sent_embedding_rnn",
                    reuse=True)
                labeled_sys_input_embedding, labeled_sys_sent_size = get_rnn_encode(
                    labeled_sys_input_embedding,
                    self.encoding_cell,
                    scope="sent_embedding_rnn",
                    reuse=True)

                labeled_usr_input_embedding = tf.reshape(
                    labeled_usr_input_embedding[1],
                    [-1, max_dialog_len, labeled_usr_sent_size[0]])
                labeled_sys_input_embedding = tf.reshape(
                    labeled_sys_input_embedding[1],
                    [-1, max_dialog_len, labeled_sys_sent_size[0]])

            if config.keep_prob < 1.0:
                usr_input_embedding = tf.nn.dropout(usr_input_embedding,
                                                    config.keep_prob)
                sys_input_embedding = tf.nn.dropout(sys_input_embedding,
                                                    config.keep_prob)
                if self.config.with_label_loss:
                    labeled_usr_input_embedding = tf.nn.dropout(
                        labeled_usr_input_embedding, config.keep_prob)
                    labeled_sys_input_embedding = tf.nn.dropout(
                        labeled_sys_input_embedding, config.keep_prob)

            joint_embedding = tf.concat(
                [usr_input_embedding, sys_input_embedding], 2,
                "joint_embedding"
            )  # (batch, dialog_len, embedding_size*2) (16, 10, 400)
            if self.config.with_label_loss:
                labeled_joint_embedding = tf.concat(
                    [labeled_usr_input_embedding, labeled_sys_input_embedding],
                    2, "labeled_joint_embedding"
                )  # (batch, dialog_len, embedding_size*2) (16, 10, 400)

        with variable_scope.variable_scope("state_level"):
            usr_state_vocab_matrix = tf.get_variable(
                "usr_state_vocab_distribution", [self.n_state, self.n_vocab],
                dtype=tf.float32,
                initializer=tf.random_uniform_initializer())
            sys_state_vocab_matrix = tf.get_variable(
                "sys_state_vocab_distribution", [self.n_state, self.n_vocab],
                dtype=tf.float32,
                initializer=tf.random_uniform_initializer())
            # Each row is normalised into a distribution over the vocabulary.
            self.usr_state_vocab_matrix = tf.nn.softmax(
                usr_state_vocab_matrix, -1)
            self.sys_state_vocab_matrix = tf.nn.softmax(
                sys_state_vocab_matrix, -1)

            # NOTE(review): sized with encoding_cell_size although
            # state_cell_size is stored above - confirm this is intended.
            self.state_cell = self.get_rnncell(self.cell_type,
                                               self.encoding_cell_size,
                                               self.keep_prob,
                                               num_layer=self.num_layer,
                                               activation=tf.nn.tanh)
            self.VAE_cell = VAECell(num_units=300,
                                    state_cell=self.state_cell,
                                    num_zt=self.config.n_state,
                                    vocab_size=self.n_vocab,
                                    max_utt_len=self.max_utt_len,
                                    config=config,
                                    use_peepholes=False,
                                    cell_clip=None,
                                    initializer=None,
                                    num_proj=None,
                                    proj_clip=None,
                                    num_unit_shards=None,
                                    num_proj_shards=None,
                                    forget_bias=1.0,
                                    state_is_tuple=True,
                                    activation=None,
                                    reuse=None,
                                    name=None)

            # dec_input_embeding = placeholder(float32, (16, max_dialog_len, 50, 300))
            # dec_seq_lens = placeholder(float32, (16, max_dialog_len))
            # output_tokens = ((16, max_dialog_len, 50), int32)
            # sequence_length = (tf.int32, (16))

            dec_input_embedding_usr = tf.nn.embedding_lookup(
                W_embedding, self.usr_input_sent)  # (16, 10, 50, 300)
            dec_input_embedding_sys = tf.nn.embedding_lookup(
                W_embedding, self.sys_input_sent)  # (16, 10, 50, 300)
            dec_input_embedding = [
                dec_input_embedding_usr, dec_input_embedding_sys
            ]

            # Utterance lengths: number of positive mask entries per utterance.
            dec_seq_lens_usr = tf.reduce_sum(tf.sign(self.usr_full_mask), 2)
            dec_seq_lens_sys = tf.reduce_sum(tf.sign(self.sys_full_mask), 2)
            dec_seq_lens = [dec_seq_lens_usr, dec_seq_lens_sys]

            # The decoder targets are the input token ids themselves.
            output_tokens_usr = self.usr_input_sent
            output_tokens_sys = self.sys_input_sent
            output_tokens = [output_tokens_usr, output_tokens_sys]

            if self.config.with_label_loss:
                labeled_dec_input_embedding_usr = tf.nn.embedding_lookup(
                    W_embedding,
                    self.labeled_usr_input_sent)  # (16, 10, 50, 300)
                labeled_dec_input_embedding_sys = tf.nn.embedding_lookup(
                    W_embedding,
                    self.labeled_sys_input_sent)  # (16, 10, 50, 300)
                labeled_dec_input_embedding = [
                    labeled_dec_input_embedding_usr,
                    labeled_dec_input_embedding_sys
                ]

                labeled_dec_seq_lens_usr = tf.reduce_sum(
                    tf.sign(self.labeled_usr_full_mask), 2)
                labeled_dec_seq_lens_sys = tf.reduce_sum(
                    tf.sign(self.labeled_sys_full_mask), 2)
                labeled_dec_seq_lens = [
                    labeled_dec_seq_lens_usr, labeled_dec_seq_lens_sys
                ]

                labeled_output_tokens_usr = self.labeled_usr_input_sent
                labeled_output_tokens_sys = self.labeled_sys_input_sent
                labeled_output_tokens = [
                    labeled_output_tokens_usr, labeled_output_tokens_sys
                ]

            with variable_scope.variable_scope(
                    "dynamic_VAE_loss") as dynamic_vae_scope:
                self.initial_prev_z = tf.placeholder(
                    tf.float32, (None, self.config.n_state), 'initial_prev_z')
                losses, z_ts, p_ts, bow_logits1, bow_logits2 = dynamic_vae(
                    self.VAE_cell,
                    joint_embedding,
                    dec_input_embedding,
                    dec_seq_lens,
                    output_tokens,
                    z_t_size=self.config.n_state,
                    sequence_length=self.dialog_length_mask,
                    initial_state=None,
                    dtype=tf.float32,
                    parallel_iterations=None,
                    swap_memory=False,
                    time_major=False,
                    scope=None,
                    initial_prev_z=self.initial_prev_z)

                if self.config.with_label_loss:
                    # The labeled pass shares every variable with the pass above.
                    dynamic_vae_scope.reuse_variables()
                    # NOTE(review): unlike the unlabeled call, initial_prev_z
                    # is not passed here - confirm this is intended.
                    labeled_losses, labeled_z_ts, labeled_pts, labeled_bow_logits1, labeled_bow_logits2 = dynamic_vae(
                        self.VAE_cell,
                        labeled_joint_embedding,
                        labeled_dec_input_embedding,
                        labeled_dec_seq_lens,
                        labeled_output_tokens,
                        z_t_size=self.config.n_state,
                        sequence_length=self.labeled_dialog_length_mask,
                        initial_state=None,
                        dtype=tf.float32,
                        parallel_iterations=None,
                        swap_memory=False,
                        time_major=False,
                        scope=None)
                    self.labeled_z_ts = labeled_z_ts
                    # 1.0 for every turn whose user mask has a positive sum, else 0.0.
                    self.labeled_z_ts_mask = tf.to_float(
                        tf.sign(tf.reduce_sum(self.labeled_usr_full_mask, 2)))

                    # Supervised loss: cross entropy between the state logits
                    # and the provided labels, restricted to unmasked turns.
                    labeled_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=self.labeled_z_ts, labels=self.labeled_labels)
                    labeled_loss = tf.reduce_sum(labeled_loss *
                                                 self.labeled_z_ts_mask)

                    # Normalised by the summed user and system full masks.
                    labeled_loss = labeled_loss / tf.to_float(
                        tf.reduce_sum(self.labeled_usr_full_mask) +
                        tf.reduce_sum(self.labeled_sys_full_mask))
                    self.labeled_loss = tf.identity(labeled_loss,
                                                    name="labeled_loss")

            z_ts = tf.nn.softmax(z_ts)  # (16, 10, 12)
            z_ts_mask = tf.to_float(
                tf.sign(tf.reduce_sum(self.usr_full_mask, 2)))  # (16, 10)
            z_ts_mask = tf.expand_dims(z_ts_mask, 2)  # (16, 10, 1)
            self.z_ts = z_ts * z_ts_mask
            self.p_ts = p_ts
            self.bow_logits1 = bow_logits1
            self.bow_logits2 = bow_logits2
            # Total loss divided by the summed user and system full masks.
            loss_avg = tf.reduce_sum(losses) / tf.to_float(
                tf.reduce_sum(self.usr_full_mask) +
                tf.reduce_sum(self.sys_full_mask))

            if self.config.with_label_loss:
                loss_avg = loss_avg + self.labeled_loss

            loss_avg = tf.identity(loss_avg, name="loss_average")

            self.basic_loss = loss_avg
            tf.summary.scalar("basic_loss", self.basic_loss)

            self.summary_op = tf.summary.merge_all()

            self.optimize(sess=sess,
                          config=config,
                          loss=self.basic_loss,
                          log_dir=log_dir)

        self.saver = tf.train.Saver(tf.global_variables(),
                                    write_version=tf.train.SaverDef.V2)
Exemple #47
0
def multi_encoder(encoder_inputs, encoders, encoder_input_length, other_inputs=None, **kwargs):
    """
    Build multiple encoders according to the configuration in `encoders`, reading from `encoder_inputs`,
    then blend the states of the `code` and `exemplar` encoders with a learned similarity gate.

    :param encoder_inputs: list of tensors of shape (batch_size, input_length), one tensor for each encoder.
    :param encoders: list of encoder configurations. Encoders named `code`, `exemplar_code` and `exemplar`
      must be present, because their states are needed to build the returned encoder state.
    :param encoder_input_length: list of tensors of shape (batch_size,) (one tensor for each encoder)
    :param other_inputs: optional tensor concatenated to the inputs of every non-`raw` encoder along axis 2
    :param kwargs: accepted for call compatibility; not used by this function
    :return:
      encoder outputs: a list with one tensor per encoder holding its per-time-step outputs (for `raw`
        encoders these are the embedded inputs themselves).
      encoder state: `code * (1 - sim_score) + exemplar * sim_score`, where `code` and `exemplar` are the
        final states of the encoders with those names.
      new_encoder_input_length: list of tensors of shape (batch_size,); in this implementation these are the
        entries of `encoder_input_length`, unchanged.
      sim_score: output of a bias-free, sigmoid-activated dense layer of size 1 applied to the concatenated
        `code` and `exemplar_code` states.
    :raises ValueError: if no encoder is named `code`, `exemplar_code` or `exemplar`.
    """
    encoder_states = []
    encoder_outputs = []

    # create embeddings in the global scope (allows sharing between encoder and decoder)
    embedding_variables = []
    for encoder in encoders:
        if encoder.binary:
            # binary encoders read feature vectors directly and need no embedding table
            embedding_variables.append(None)
            continue
        # inputs are token ids, which need to be mapped to vectors (embeddings)
        embedding_shape = [encoder.vocab_size, encoder.embedding_size]

        if encoder.embedding_initializer == 'sqrt3':
            initializer = tf.random_uniform_initializer(-math.sqrt(3), math.sqrt(3))
        else:
            initializer = None

        device = '/cpu:0' if encoder.embeddings_on_cpu else None
        with tf.device(device):  # embeddings can take a very large amount of memory, so
            # storing them in GPU memory can be impractical
            embedding = get_variable('embedding_{}'.format(encoder.name), shape=embedding_shape,
                                     initializer=initializer)
        embedding_variables.append(embedding)

    new_encoder_input_length = []

    for i, encoder in enumerate(encoders):
        if encoder.use_lstm is False:
            encoder.cell_type = 'GRU'

        with tf.variable_scope('encoder_{}'.format(encoder.name)):
            encoder_inputs_ = encoder_inputs[i]
            encoder_input_length_ = encoder_input_length[i]

            def get_cell(input_size=None, reuse=False):
                """Create one RNN cell for the current encoder, wrapped with dropout when configured."""
                if encoder.cell_type.lower() == 'lstm':
                    cell = CellWrapper(BasicLSTMCell(encoder.cell_size, reuse=reuse))
                elif encoder.cell_type.lower() == 'dropoutgru':
                    cell = DropoutGRUCell(encoder.cell_size, reuse=reuse, layer_norm=encoder.layer_norm,
                                          input_size=input_size, input_keep_prob=encoder.rnn_input_keep_prob,
                                          state_keep_prob=encoder.rnn_state_keep_prob)
                else:
                    cell = GRUCell(encoder.cell_size, reuse=reuse, layer_norm=encoder.layer_norm)

                # DropoutGRUCell already receives its keep probabilities, so it is not wrapped again
                if encoder.use_dropout and encoder.cell_type.lower() != 'dropoutgru':
                    cell = DropoutWrapper(cell, input_keep_prob=encoder.rnn_input_keep_prob,
                                          output_keep_prob=encoder.rnn_output_keep_prob,
                                          state_keep_prob=encoder.rnn_state_keep_prob,
                                          variational_recurrent=encoder.pervasive_dropout,
                                          dtype=tf.float32, input_size=input_size)
                return cell

            embedding = embedding_variables[i]

            batch_size = tf.shape(encoder_inputs_)[0]
            time_steps = tf.shape(encoder_inputs_)[1]

            if embedding is not None:
                # flatten the ids for the lookup, then restore the (batch, time, embedding) layout
                flat_inputs = tf.reshape(encoder_inputs_, [tf.multiply(batch_size, time_steps)])
                flat_inputs = tf.nn.embedding_lookup(embedding, flat_inputs)
                encoder_inputs_ = tf.reshape(flat_inputs,
                                             tf.stack([batch_size, time_steps, flat_inputs.get_shape()[1].value]))

            if encoder.cell_type.lower() == 'raw':
                # 'raw' encoders have no RNN: the inputs are the outputs and the state is all zeros
                encoder_outputs.append(encoder_inputs_)
                encoder_states.append(tf.zeros([batch_size, encoder.cell_size]))
                new_encoder_input_length.append(encoder_input_length_)
                continue

            if other_inputs is not None:
                encoder_inputs_ = tf.concat([encoder_inputs_, other_inputs], axis=2)

            if encoder.use_dropout:
                # word-level dropout: drops whole time steps (the last axis of the noise shape is 1)
                noise_shape = [1, time_steps, 1] if encoder.pervasive_dropout else [batch_size, time_steps, 1]
                encoder_inputs_ = tf.nn.dropout(encoder_inputs_, keep_prob=encoder.word_keep_prob,
                                                noise_shape=noise_shape)

                # embedding-level dropout: drops individual features
                size = tf.shape(encoder_inputs_)[2]
                noise_shape = [1, 1, size] if encoder.pervasive_dropout else [batch_size, time_steps, size]
                encoder_inputs_ = tf.nn.dropout(encoder_inputs_, keep_prob=encoder.embedding_keep_prob,
                                                noise_shape=noise_shape)

            if encoder.input_layers:
                for j, layer_size in enumerate(encoder.input_layers):
                    if encoder.input_layer_activation is not None and encoder.input_layer_activation.lower() == 'relu':
                        activation = tf.nn.relu
                    else:
                        activation = tf.tanh

                    encoder_inputs_ = dense(encoder_inputs_, layer_size, activation=activation, use_bias=True,
                                            name='layer_{}'.format(j))
                    if encoder.use_dropout:
                        encoder_inputs_ = tf.nn.dropout(encoder_inputs_, keep_prob=encoder.input_layer_keep_prob)

            # Contrary to Theano's RNN implementation, states after the sequence length are zero
            # (while Theano repeats last state)
            parameters = dict(
                inputs=encoder_inputs_, sequence_length=encoder_input_length_,
                dtype=tf.float32, parallel_iterations=encoder.parallel_iterations
            )

            input_size = encoder_inputs_.get_shape()[2].value
            # the LSTM state is sized as two cell_size-wide parts, hence the factor 2
            state_size = (encoder.cell_size * 2 if encoder.cell_type.lower() == 'lstm' else encoder.cell_size)

            def get_initial_state(name='initial_state'):
                """Return a trainable initial state tiled over the batch, or None when not configured."""
                if encoder.train_initial_states:
                    initial_state = get_variable(name, initializer=tf.zeros(state_size))
                    return tf.tile(tf.expand_dims(initial_state, axis=0), [batch_size, 1])
                else:
                    return None

            if encoder.bidir:
                rnn = lambda reuse: stack_bidirectional_dynamic_rnn(
                    cells_fw=[get_cell(input_size if j == 0 else 2 * encoder.cell_size, reuse=reuse)
                              for j in range(encoder.layers)],
                    cells_bw=[get_cell(input_size if j == 0 else 2 * encoder.cell_size, reuse=reuse)
                              for j in range(encoder.layers)],
                    initial_states_fw=[get_initial_state('initial_state_fw')] * encoder.layers,
                    initial_states_bw=[get_initial_state('initial_state_bw')] * encoder.layers,
                    time_pooling=encoder.time_pooling, pooling_avg=encoder.pooling_avg,
                    **parameters)

                initializer = CellInitializer(encoder.cell_size) if encoder.orthogonal_init else None
                with tf.variable_scope(tf.get_variable_scope(), initializer=initializer):
                    try:
                        encoder_outputs_, _, encoder_states_ = rnn(reuse=False)
                    except ValueError:   # Multi-task scenario where we're reusing the same RNN parameters
                        encoder_outputs_, _, encoder_states_ = rnn(reuse=True)
            else:
                if encoder.time_pooling or encoder.final_state == 'concat_last':
                    raise NotImplementedError

                if encoder.layers > 1:
                    cell = MultiRNNCell([get_cell(input_size if j == 0 else encoder.cell_size)
                                         for j in range(encoder.layers)])
                    initial_state = (get_initial_state(),) * encoder.layers
                else:
                    cell = get_cell(input_size)
                    initial_state = get_initial_state()

                encoder_outputs_, encoder_states_ = auto_reuse(tf.nn.dynamic_rnn)(cell=cell,
                                                                                  initial_state=initial_state,
                                                                                  **parameters)

            # backward half of the outputs at the first time step, forward half at the last valid time step
            last_backward = encoder_outputs_[:, 0, encoder.cell_size:]
            indices = tf.stack([tf.range(batch_size), encoder_input_length_ - 1], axis=1)
            last_forward = tf.gather_nd(encoder_outputs_[:, :, :encoder.cell_size], indices)
            last_forward.set_shape([None, encoder.cell_size])

            if encoder.final_state == 'concat_last': # concats last states of all backward layers (full LSTM states)
                encoder_state_ = tf.concat(encoder_states_, axis=1)
            elif encoder.final_state == 'average':
                # mean of the outputs over the valid (unpadded) time steps
                mask = tf.sequence_mask(encoder_input_length_, maxlen=tf.shape(encoder_outputs_)[1], dtype=tf.float32)
                mask = tf.expand_dims(mask, axis=2)
                encoder_state_ = tf.reduce_sum(mask * encoder_outputs_, axis=1) / tf.reduce_sum(mask, axis=1)
            elif encoder.final_state == 'average_inputs':
                # mean of the inputs over the valid (unpadded) time steps
                mask = tf.sequence_mask(encoder_input_length_, maxlen=tf.shape(encoder_inputs_)[1], dtype=tf.float32)
                mask = tf.expand_dims(mask, axis=2)
                encoder_state_ = tf.reduce_sum(mask * encoder_inputs_, axis=1) / tf.reduce_sum(mask, axis=1)
            elif encoder.bidir and encoder.final_state == 'last_both':
                encoder_state_ = tf.concat([last_forward, last_backward], axis=1)
            elif encoder.bidir and not encoder.final_state == 'last_forward':   # last backward hidden state
                encoder_state_ = last_backward
            else:  # last forward hidden state
                encoder_state_ = last_forward

            if encoder.bidir and encoder.bidir_projection:
                encoder_outputs_ = dense(encoder_outputs_, encoder.cell_size, use_bias=False, name='bidir_projection')

            encoder_outputs.append(encoder_outputs_)
            encoder_states.append(encoder_state_)
            new_encoder_input_length.append(encoder_input_length_)

    # Collect the final state of every encoder by name (a later encoder with the same name wins).
    states_by_name = {}
    for i, encoder in enumerate(encoders):
        res = encoder_states[i]
        if encoder.cell_type.lower() == 'raw':
            # 'raw' encoders carry an all-zero state, so use the mean of their outputs over the
            # time axis instead. tf.shape only yields integer tensors, hence reduce_mean rather
            # than a manual division by a float-typed shape.
            res = tf.reduce_mean(encoder_outputs[i], axis=1)
        states_by_name[encoder.name] = res

    missing = [name for name in ('code', 'exemplar_code', 'exemplar') if name not in states_by_name]
    if missing:
        raise ValueError('multi_encoder requires encoders named {}'.format(', '.join(missing)))

    code = states_by_name['code']
    exemplar_code = states_by_name['exemplar_code']
    exemplar = states_by_name['exemplar']

    # sim_score gates between the code state (score near 0) and the exemplar state (score near 1)
    activation = tf.nn.sigmoid
    sim_score = dense(tf.concat([code, exemplar_code], axis=1), 1, use_bias=False, activation=activation, name='sim_score')
    encoder_state = code * (1-sim_score) + exemplar * sim_score
    return encoder_outputs, encoder_state, new_encoder_input_length, sim_score
Exemple #48
0
    def _add_seq2seq(self):
        """Add the whole sequence-to-sequence model to the graph.

        Everything except the final decode-mode top-k step is built inside the
        'seq2seq' variable scope. In order, this method:

        * creates the initializers ``self.rand_unif_init`` and
          ``self.trunc_norm_init``;
        * creates a non-trainable, zero-initialised embedding variable and the
          op ``self.embedding_init`` that assigns ``self.word_embedding`` to it
          (the op has to be run for the embedding to hold real values);
        * computes ``self.topic_distribution``, ``self.topic_words`` and the
          per-step ``topic_additions`` through the ``_article_topic_distribution``,
          ``_extract_topic_words``, ``_map_term_frequency``,
          ``_cal_topic_representation`` and ``_topic_representation`` helpers;
        * adds the encoder (``self._enc_states``, ``self._dec_in_state``) and
          the decoder (``self._dec_out_state``, ``self.attn_dists``,
          ``self.p_gens``, ``self.coverage``);
        * projects every decoder output to vocabulary scores, adds the topic
          term, and applies softmax;
        * in 'train'/'eval' mode sets ``self._loss`` and, when ``hps.coverage``
          is set, ``self._coverage_loss`` and ``self._total_loss``;
        * in 'decode' mode sets ``self._topk_ids`` and ``self._topk_log_probs``.

        Reads ``self._hps``, ``self._vocab``, ``self._enc_batch``,
        ``self._dec_batch``, ``self._enc_lens``, ``self._target_batch``,
        ``self._dec_padding_mask`` and the global ``FLAGS.pointer_gen``.
        Returns nothing; all results are stored on ``self``.
        """
        hps = self._hps
        vsize = self._vocab.size()  # size of the vocabulary

        with tf.variable_scope('seq2seq'):
            # Some initializers
            self.rand_unif_init = tf.random_uniform_initializer(
                -hps.rand_unif_init_mag, hps.rand_unif_init_mag, seed=123)
            self.trunc_norm_init = tf.truncated_normal_initializer(
                stddev=hps.trunc_norm_init_std)

            # Add embedding matrix (shared by the encoder and decoder inputs)
            with tf.variable_scope('embedding'):
                # Previous (trainable) variant, kept for reference:
                # embedding = tf.get_variable('embedding', [vsize, hps.emb_dim], dtype=tf.float32, initializer=self.trunc_norm_init)
                # The variable starts as all zeros and is frozen (trainable=False);
                # its real content comes from running self.embedding_init below.
                embedding = tf.Variable(tf.constant(0.0,
                                                    shape=[vsize,
                                                           hps.emb_dim]),
                                        trainable=False,
                                        name="word_embedding_w")
                # Assign op copying self.word_embedding into the variable.
                self.embedding_init = embedding.assign(self.word_embedding)
                if hps.mode == "train":
                    self._add_emb_vis(embedding)  # add to tensorboard
                emb_enc_inputs = tf.nn.embedding_lookup(
                    embedding, self._enc_batch
                )  # tensor with shape (batch_size, max_enc_steps, emb_size)
                emb_dec_inputs = [
                    tf.nn.embedding_lookup(embedding, x)
                    for x in tf.unstack(self._dec_batch, axis=1)
                ]  # list length max_dec_steps containing shape (batch_size, emb_size)

            # Map article to topic distribution(batch_size, topic_num, 1)
            self.topic_distribution = self._article_topic_distribution(
                emb_enc_inputs, hps)

            # Extract topic words(batch_size, seq_num, 1)
            self.topic_words = self._extract_topic_words(
                emb_enc_inputs, self.topic_distribution, hps)

            # calculate final results based on topic_representation
            mapped_term_frequencies = self._map_term_frequency(hps)
            mu, log_sigma, kl_divergence = self._cal_topic_representation(
                mapped_term_frequencies, hps)

            # get article topic representation
            topic_additions = self._topic_representation(mu, log_sigma)

            # Add the encoder.
            enc_outputs, fw_st, bw_st = self._add_encoder(
                emb_enc_inputs, self._enc_lens)
            self._enc_states = enc_outputs

            # Our encoder is bidirectional and our decoder is unidirectional so we need to reduce the final encoder hidden state to the right size to be the initial decoder hidden state
            self._dec_in_state = self._reduce_states(fw_st, bw_st)

            # Add the decoder.
            with tf.variable_scope('decoder'):
                decoder_outputs, self._dec_out_state, self.attn_dists, self.p_gens, self.coverage = self._add_decoder(
                    emb_dec_inputs)

            # Add the output projection to obtain the vocabulary distribution
            with tf.variable_scope('output_projection'):
                w = tf.get_variable('w', [hps.hidden_dim, vsize],
                                    dtype=tf.float32,
                                    initializer=self.trunc_norm_init)
                v = tf.get_variable('v', [vsize],
                                    dtype=tf.float32,
                                    initializer=self.trunc_norm_init)
                vocab_scores = [
                ]  # vocab_scores is the vocabulary distribution before applying softmax. Each entry on the list corresponds to one decoder step
                for i, output in enumerate(decoder_outputs):
                    if i > 0:
                        tf.get_variable_scope().reuse_variables()
                    # Row i of topic_additions, shape (1, vsize).
                    # NOTE(review): this indexes topic_additions by decoder step -
                    # confirm its first axis really is the decoder step.
                    tmp_topic_addition = tf.slice(topic_additions, [i, 0],
                                                  [1, vsize])
                    # Column i of the squeezed topic_words, shape (batch_size, 1).
                    topic_filter = tf.slice(tf.squeeze(self.topic_words),
                                            [0, i], [hps.batch_size, 1])
                    logits = tf.nn.xw_plus_b(output, w, v)
                    # Broadcasts (batch_size, 1) * (1, vsize) -> (batch_size, vsize).
                    semantic = tf.multiply(topic_filter, tmp_topic_addition)
                    vocab_scores.append(logits +
                                        semantic)  # apply the linear layer

                vocab_dists = [
                    tf.nn.softmax(s) for s in vocab_scores
                ]  # The vocabulary distributions. List length max_dec_steps of (batch_size, vsize) arrays. The words are in the order they appear in the vocabulary file.

            # For pointer-generator model, calc final distribution from copy distribution and vocabulary distribution
            if FLAGS.pointer_gen:
                final_dists = self._calc_final_dist(vocab_dists,
                                                    self.attn_dists)
            else:  # final distribution is just vocabulary distribution
                final_dists = vocab_dists

            if hps.mode in ['train', 'eval']:
                # Calculate the loss
                with tf.variable_scope('loss'):
                    if FLAGS.pointer_gen:
                        # Calculate the loss per step
                        # This is fiddly; we use tf.gather_nd to pick out the probabilities of the gold target words
                        loss_per_step = [
                        ]  # will be list length max_dec_steps containing shape (batch_size)
                        batch_nums = tf.range(
                            0, limit=hps.batch_size)  # shape (batch_size)
                        for dec_step, dist in enumerate(final_dists):
                            targets = self._target_batch[:,
                                                         dec_step]  # The indices of the target words. shape (batch_size)
                            indices = tf.stack((batch_nums, targets),
                                               axis=1)  # shape (batch_size, 2)
                            gold_probs = tf.gather_nd(
                                dist, indices
                            )  # shape (batch_size). prob of correct words on this step
                            # No epsilon is added before the log, so a gold
                            # probability of exactly 0 yields an infinite loss.
                            losses = -tf.log(gold_probs)
                            loss_per_step.append(losses)

                        # Apply dec_padding_mask and get loss
                        self._loss = _mask_and_avg(loss_per_step,
                                                   self._dec_padding_mask)

                    else:  # baseline model
                        self._loss = tf.contrib.seq2seq.sequence_loss(
                            tf.stack(vocab_scores, axis=1), self._target_batch,
                            self._dec_padding_mask
                        )  # this applies softmax internally
                        # NOTE(review): kl_divergence is subtracted here, and only
                        # on this baseline branch; the pointer-generator branch
                        # above never uses it. Confirm both the sign and the
                        # asymmetry are intended.
                        self._loss = self._loss - kl_divergence

                    tf.summary.scalar('loss', self._loss)

                    # Calculate coverage loss from the attention distributions
                    # (self._coverage_loss / self._total_loss exist only when
                    # hps.coverage is set).
                    if hps.coverage:
                        with tf.variable_scope('coverage_loss'):
                            self._coverage_loss = _coverage_loss(
                                self.attn_dists, self._dec_padding_mask)
                            tf.summary.scalar('coverage_loss',
                                              self._coverage_loss)
                        self._total_loss = self._loss + hps.cov_loss_wt * self._coverage_loss
                        tf.summary.scalar('total_loss', self._total_loss)

        # This block sits outside the 'seq2seq' variable scope.
        if hps.mode == "decode":
            # We run decode beam search mode one decoder step at a time
            assert len(
                final_dists
            ) == 1  # final_dists is a singleton list containing shape (batch_size, extended_vsize)
            final_dists = final_dists[0]
            topk_probs, self._topk_ids = tf.nn.top_k(
                final_dists, hps.batch_size * 2
            )  # take the k largest probs. note batch_size=beam_size in decode mode
            self._topk_log_probs = tf.log(topk_probs)
Exemple #49
0
# Load the filter table: every non-blank line of the file is parsed into one
# list of ints and appended to `filtert`.
with open("../filtertWN18.txt", 'r') as f:
    idx = -1  # index of the row appended last (kept as a module-level name)
    for x in f:
        # split() with no argument never yields empty tokens, so blank lines
        # and repeated spaces no longer reach int('') and crash; the original
        # `len(x.strip().split(' ')) > 0` test was always true.
        tokens = x.split()
        if tokens:
            filtert.append([int(i) for i in tokens])
            idx += 1
        else:
            print('length:0')
# tf.placeholder()
trainable = []  # list of trainable parameters

# Both embedding tables are initialised uniformly in [-bound, bound].
bound = 6 / math.sqrt(embed_dim)
ent_embedding = tf.get_variable(
    "ent_embedding", [n_entity, embed_dim],
    initializer=tf.random_uniform_initializer(minval=-bound, maxval=bound, seed=345))
# Disabled entity projection table:
# ent_projecting = tf.get_variable(
#     "ent_projecting", [n_entity, embed_dim],
#     initializer=tf.random_uniform_initializer(minval=-bound, maxval=bound, seed=347))
trainable.append(ent_embedding)
# trainable.append(ent_projecting)

rel_embedding = tf.get_variable(
    "rel_embedding", [n_relation, embed_dim],
    initializer=tf.random_uniform_initializer(minval=-bound, maxval=bound, seed=346))
# Disabled relation projection table:
# rel_projecting = tf.get_variable(
#     "rel_projecting", [n_relation, embed_dim],
#     initializer=tf.random_uniform_initializer(minval=-bound, maxval=bound, seed=348))
trainable.append(rel_embedding)
# trainable.append(rel_projecting)
Exemple #50
0
def attention_decoder(decoder_inputs, initial_state, attention_states, encoders, decoder, encoder_input_length,
                      feed_previous=0.0, align_encoder_id=0, feed_argmax=True, sim_score=0.0, **kwargs):
    """
    Build an attention-based RNN decoder as a `tf.while_loop` over the output time steps.

    :param decoder_inputs: int32 tensor of shape (batch_size, output_length)
    :param initial_state: initial state of the decoder (usually the final state of the encoder),
      as a float32 tensor of shape (batch_size, initial_state_size). This state is mapped to the
      correct state size for the decoder.
    :param attention_states: list of tensors of shape (batch_size, input_length, encoder_cell_size),
      the hidden states of the encoder(s) (one tensor for each encoder).
    :param encoders: configuration of the encoders
    :param decoder: configuration of the decoder. Note that `decoder.cell_type` is overwritten
      with 'GRU' when `decoder.use_lstm is False`.
    :param encoder_input_length: list of int32 tensors of shape (batch_size,), tells for each encoder,
     the true length of each sequence in the batch (sequences in the same batch are padded to all have the same
     length).
    :param feed_previous: scalar tensor corresponding to the probability to use previous decoder output
      instead of the ground truth as input for the decoder (1 when decoding, between 0 and 1 when training)
    :param feed_argmax: boolean tensor, when True the greedy decoder outputs the word with the highest
    probability (argmax). When False, it samples a word from the probability distribution (softmax).
    :param align_encoder_id: outputs attention weights for this encoder. Also used when predicting edit operations
    (pred_edits), to specify which encoder reads the sequence to post-edit (MT).
    :param sim_score: forwarded unchanged to `multi_attention`
    :param kwargs: accepted for call-site compatibility; not used in this function

    :return: a 7-tuple `(outputs, weights, states, attns, samples, get_logits, initial_data)`:
      outputs: per-step results of `generate` (projection to `decoder.vocab_size`), batch-major
      weights: per-step attention weights of encoder `align_encoder_id`, batch-major
        (presumably (batch_size, output_length, input_length) - confirm against `multi_attention`)
      states: per-step decoder states, batch-major
      attns: per-step attention contexts, batch-major
      samples: int64 symbols fed as the next input at each step, batch-major
      get_logits: function `(state, ids, time) -> (new_state, logits)` performing one decoding step
        on a packed state (used for beam-search decoding)
      initial_data: packed initial state for `get_logits`, the concatenation along axis 1 of
        [initial_state, initial_context, initial_pos, initial_weights]
    """
    assert not decoder.pred_maxout_layer or decoder.cell_size % 2 == 0, 'cell size must be a multiple of 2'

    if decoder.use_lstm is False:
        decoder.cell_type = 'GRU'

    embedding_shape = [decoder.vocab_size, decoder.embedding_size]
    if decoder.embedding_initializer == 'sqrt3':
        initializer = tf.random_uniform_initializer(-math.sqrt(3), math.sqrt(3))
    else:
        initializer = None

    device = '/cpu:0' if decoder.embeddings_on_cpu else None
    # With share_emb set, the variable is named after share_emb instead of this decoder.
    embedding_owner = decoder.name if decoder.share_emb is None else decoder.share_emb
    with tf.device(device):
        embedding = get_variable('embedding_{}'.format(embedding_owner), shape=embedding_shape,
                                 initializer=initializer)

    input_shape = tf.shape(decoder_inputs)
    batch_size = input_shape[0]
    time_steps = input_shape[1]

    scope_name = 'decoder_{}'.format(decoder.name)
    scope_name += '/' + '_'.join(encoder.name for encoder in encoders)

    def embed(input_):
        """Look up the embeddings of `input_` and apply word/embedding dropout if configured."""
        embedded_input = tf.nn.embedding_lookup(embedding, input_)

        input_shape = tf.shape(embedded_input)
        batch_size = input_shape[0]

        if decoder.use_dropout and decoder.word_keep_prob is not None:
            # drops whole words: one mask value per batch entry (or a single one if pervasive)
            noise_shape = [1, 1] if decoder.pervasive_dropout else [batch_size, 1]
            embedded_input = tf.nn.dropout(embedded_input, keep_prob=decoder.word_keep_prob, noise_shape=noise_shape)
        if decoder.use_dropout and decoder.embedding_keep_prob is not None:
            size = tf.shape(embedded_input)[1]
            noise_shape = [1, size] if decoder.pervasive_dropout else [batch_size, size]
            embedded_input = tf.nn.dropout(embedded_input, keep_prob=decoder.embedding_keep_prob,
                                           noise_shape=noise_shape)

        return embedded_input

    def get_cell(input_size=None, reuse=False):
        """Create the (possibly multi-layer) RNN cell described by the decoder configuration."""
        cells = []

        for j in range(decoder.layers):
            input_size_ = input_size if j == 0 else decoder.cell_size

            if decoder.cell_type.lower() == 'lstm':
                cell = CellWrapper(BasicLSTMCell(decoder.cell_size, reuse=reuse))
            elif decoder.cell_type.lower() == 'dropoutgru':
                cell = DropoutGRUCell(decoder.cell_size, reuse=reuse, layer_norm=decoder.layer_norm,
                                      input_size=input_size_, input_keep_prob=decoder.rnn_input_keep_prob,
                                      state_keep_prob=decoder.rnn_state_keep_prob)
            else:
                cell = GRUCell(decoder.cell_size, reuse=reuse, layer_norm=decoder.layer_norm)

            # DropoutGRUCell receives its keep probabilities directly, so it is not wrapped
            if decoder.use_dropout and decoder.cell_type.lower() != 'dropoutgru':
                cell = DropoutWrapper(cell, input_keep_prob=decoder.rnn_input_keep_prob,
                                      output_keep_prob=decoder.rnn_output_keep_prob,
                                      state_keep_prob=decoder.rnn_state_keep_prob,
                                      variational_recurrent=decoder.pervasive_dropout,
                                      dtype=tf.float32, input_size=input_size_)
            cells.append(cell)

        if len(cells) == 1:
            return cells[0]
        else:
            return CellWrapper(MultiRNNCell(cells))

    def look(state, input_, prev_weights=None, pos=None):
        """Compute the attention context for `state`; returns (context, weights of `align_encoder_id`)."""
        # previous weights and position are only passed to the aligned encoder
        prev_weights_ = [prev_weights if i == align_encoder_id else None for i in range(len(encoders))]
        pos_ = None
        if decoder.pred_edits:
            pos_ = [pos if i == align_encoder_id else None for i in range(len(encoders))]
        if decoder.attn_prev_word:
            state = tf.concat([state, input_], axis=1)

        parameters = dict(hidden_states=attention_states, encoder_input_length=encoder_input_length,
                          encoders=encoders, aggregation_method=decoder.aggregation_method, sim_score=sim_score)
        context, new_weights = multi_attention(state, pos=pos_, prev_weights=prev_weights_, **parameters)

        if decoder.context_mapping:
            with tf.variable_scope(scope_name):
                activation = tf.nn.tanh if decoder.context_mapping_activation == 'tanh' else None
                use_bias = not decoder.context_mapping_no_bias
                context = dense(context, decoder.context_mapping, use_bias=use_bias, activation=activation,
                                name='context_mapping')

        return context, new_weights[align_encoder_id]

    def update(state, input_, context=None, symbol=None):
        """Run one RNN step; returns (output, new_state)."""
        if context is not None and decoder.rnn_feed_attn:
            input_ = tf.concat([input_, context], axis=1)
        input_size = input_.get_shape()[1].value

        initializer = CellInitializer(decoder.cell_size) if decoder.orthogonal_init else None
        with tf.variable_scope(tf.get_variable_scope(), initializer=initializer):
            try:
                output, new_state = get_cell(input_size)(input_, state)
            except ValueError:  # auto_reuse doesn't work with LSTM cells
                output, new_state = get_cell(input_size, reuse=True)(input_, state)

        if decoder.skip_update and decoder.pred_edits and symbol is not None:
            # keep the previous state for entries whose symbol is DEL
            is_del = tf.equal(symbol, utils.DEL_ID)
            new_state = tf.where(is_del, state, new_state)

        if decoder.cell_type.lower() == 'lstm' and decoder.use_lstm_full_state:
            output = new_state

        return output, new_state

    def update_pos(pos, symbol, max_pos=None):
        """Advance `pos` by one where `symbol` is KEEP or DEL (pred_edits only), capped at `max_pos`."""
        if not decoder.pred_edits:
            return pos

        is_keep = tf.equal(symbol, utils.KEEP_ID)
        is_del = tf.equal(symbol, utils.DEL_ID)
        is_not_ins = tf.logical_or(is_keep, is_del)

        pos = beam_search.resize_like(pos, symbol)
        # guard the resize as well: max_pos is optional and was previously
        # handed to resize_like before the None check below
        if max_pos is not None:
            max_pos = beam_search.resize_like(max_pos, symbol)

        pos += tf.to_float(is_not_ins)
        if max_pos is not None:
            pos = tf.minimum(pos, tf.to_float(max_pos))
        return pos

    def generate(state, input_, context):
        """Project (state, [input_], context) to scores over the target vocabulary."""
        if decoder.pred_use_lstm_state is False:  # for back-compatibility
            state = state[:,-decoder.cell_size:]

        projection_input = [state, context]
        if decoder.use_previous_word:
            projection_input.insert(1, input_)  # for back-compatibility

        output_ = tf.concat(projection_input, axis=1)

        if decoder.pred_deep_layer:
            deep_layer_size = decoder.pred_deep_layer_size or decoder.embedding_size
            if decoder.layer_norm:
                output_ = dense(output_, deep_layer_size, use_bias=False, name='deep_output')
                output_ = tf.contrib.layers.layer_norm(output_, activation_fn=tf.nn.tanh, scope='output_layer_norm')
            else:
                output_ = dense(output_, deep_layer_size, activation=tf.tanh, use_bias=True, name='deep_output')

            if decoder.use_dropout:
                size = tf.shape(output_)[1]
                noise_shape = [1, size] if decoder.pervasive_dropout else None
                output_ = tf.nn.dropout(output_, keep_prob=decoder.deep_layer_keep_prob, noise_shape=noise_shape)
        else:
            if decoder.pred_maxout_layer:
                maxout_size = decoder.maxout_size or decoder.cell_size
                output_ = dense(output_, maxout_size, use_bias=True, name='maxout')
                if decoder.old_maxout:  # for back-compatibility with old models
                    output_ = tf.nn.pool(tf.expand_dims(output_, axis=2), window_shape=[2], pooling_type='MAX',
                                         padding='SAME', strides=[2])
                    output_ = tf.squeeze(output_, axis=2)
                else:
                    output_ = tf.maximum(*tf.split(output_, num_or_size_splits=2, axis=1))

            if decoder.pred_embed_proj:
                # intermediate projection to embedding size (before projecting to vocabulary size)
                # this is useful to reduce the number of parameters, and
                # to use the output embeddings for output projection (tie_embeddings parameter)
                output_ = dense(output_, decoder.embedding_size, use_bias=False, name='softmax0')

        if decoder.tie_embeddings and (decoder.pred_embed_proj or decoder.pred_deep_layer):
            # reuse the (transposed) input embedding matrix as output projection
            bias = get_variable('softmax1/bias', shape=[decoder.vocab_size])
            output_ = tf.matmul(output_, tf.transpose(embedding)) + bias
        else:
            output_ = dense(output_, decoder.vocab_size, use_bias=True, name='softmax1')
        return output_

    # the state size is doubled for LSTM cells
    state_size = (decoder.cell_size * 2 if decoder.cell_type.lower() == 'lstm' else decoder.cell_size) * decoder.layers

    if decoder.use_dropout:
        initial_state = tf.nn.dropout(initial_state, keep_prob=decoder.initial_state_keep_prob)

    # map the given initial state to the decoder's state size
    with tf.variable_scope(scope_name):
        if decoder.layer_norm:
            initial_state = dense(initial_state, state_size, use_bias=False, name='initial_state_projection')
            initial_state = tf.contrib.layers.layer_norm(initial_state, activation_fn=tf.nn.tanh,
                                                         scope='initial_state_layer_norm')
        else:
            initial_state = dense(initial_state, state_size, use_bias=True, name='initial_state_projection',
                                  activation=tf.nn.tanh)

    if decoder.cell_type.lower() == 'lstm' and decoder.use_lstm_full_state:
        initial_output = initial_state
    else:
        initial_output = initial_state[:, -decoder.cell_size:]

    time = tf.constant(0, dtype=tf.int32, name='time')
    outputs = tf.TensorArray(dtype=tf.float32, size=time_steps)
    samples = tf.TensorArray(dtype=tf.int64, size=time_steps)
    # time-major copy of the decoder inputs, one entry per time step
    inputs = tf.TensorArray(dtype=tf.int64, size=time_steps).unstack(tf.to_int64(tf.transpose(decoder_inputs)))

    states = tf.TensorArray(dtype=tf.float32, size=time_steps)
    weights = tf.TensorArray(dtype=tf.float32, size=time_steps)
    attns = tf.TensorArray(dtype=tf.float32, size=time_steps)

    initial_symbol = inputs.read(0)  # first symbol is BOS
    initial_input = embed(initial_symbol)
    initial_pos = tf.zeros([batch_size], tf.float32)
    initial_weights = tf.zeros(tf.shape(attention_states[align_encoder_id])[:2])
    with tf.variable_scope('decoder_{}'.format(decoder.name)):
        initial_context, _ = look(initial_output, initial_input, pos=initial_pos, prev_weights=initial_weights)
    # packed state consumed by get_logits: [state | context | pos | attention weights]
    initial_data = tf.concat([initial_state, initial_context, tf.expand_dims(initial_pos, axis=1), initial_weights],
                             axis=1)
    context_size = initial_context.shape[1].value

    def get_logits(state, ids, time):  # for beam-search decoding
        """Perform one decoding step on a packed state; returns (new packed state, logits)."""
        with tf.variable_scope('decoder_{}'.format(decoder.name)):
            # unpack in the same layout as initial_data
            state, context, pos, prev_weights = tf.split(state, [state_size, context_size, 1, -1], axis=1)
            input_ = embed(ids)

            pos = tf.squeeze(pos, axis=1)
            pos = tf.cond(tf.equal(time, 0),
                          lambda: pos,
                          lambda: update_pos(pos, ids, encoder_input_length[align_encoder_id]))

            if decoder.cell_type.lower() == 'lstm' and decoder.use_lstm_full_state:
                output = state
            else:
                # output is always the right-most part of state. However, this only works at test time,
                # because different dropout operations can be used on state and output.
                output = state[:, -decoder.cell_size:]

            if decoder.conditional_rnn:
                with tf.variable_scope('conditional_1'):
                    output, state = update(state, input_)
            elif decoder.update_first:
                output, state = update(state, input_, None, ids)
            elif decoder.generate_first:
                output, state = tf.cond(tf.equal(time, 0),
                                        lambda: (output, state),
                                        lambda: update(state, input_, context, ids))

            context, new_weights = look(output, input_, pos=pos, prev_weights=prev_weights)

            if decoder.conditional_rnn:
                with tf.variable_scope('conditional_2'):
                    output, state = update(state, context)
            elif not decoder.generate_first:
                output, state = update(state, input_, context, ids)

            logits = generate(output, input_, context)

            pos = tf.expand_dims(pos, axis=1)
            state = tf.concat([state, context, pos, new_weights], axis=1)
            return state, logits

    def _time_step(time, input_, input_symbol, pos, state, output, outputs, states, weights, attns, prev_weights,
                   samples):
        """Body of the decoding while-loop: one step of update / look / generate, then pick the next input."""
        if decoder.conditional_rnn:
            with tf.variable_scope('conditional_1'):
                output, state = update(state, input_)
        elif decoder.update_first:
            output, state = update(state, input_, None, input_symbol)

        context, new_weights = look(output, input_, pos=pos, prev_weights=prev_weights)

        if decoder.conditional_rnn:
            with tf.variable_scope('conditional_2'):
                output, state = update(state, context)
        elif not decoder.generate_first:
            output, state = update(state, input_, context, input_symbol)

        output_ = generate(output, input_, context)

        # three ways of choosing the next input symbol
        argmax = lambda: tf.argmax(output_, 1)
        target = lambda: inputs.read(time + 1)
        softmax = lambda: tf.squeeze(tf.multinomial(tf.log(tf.nn.softmax(output_)), num_samples=1),
                                     axis=1)

        # the ground truth can only be used while a next input exists
        use_target = tf.logical_and(time < time_steps - 1, tf.random_uniform([]) >= feed_previous)
        predicted_symbol = tf.case([
            (use_target, target),
            (tf.logical_not(feed_argmax), softmax)],
            default=argmax)   # default case is useful for beam-search

        predicted_symbol.set_shape([None])
        predicted_symbol = tf.stop_gradient(predicted_symbol)
        samples = samples.write(time, predicted_symbol)

        input_ = embed(predicted_symbol)
        pos = update_pos(pos, predicted_symbol, encoder_input_length[align_encoder_id])

        attns = attns.write(time, context)
        weights = weights.write(time, new_weights)
        states = states.write(time, state)
        outputs = outputs.write(time, output_)

        if not decoder.conditional_rnn and not decoder.update_first and decoder.generate_first:
            output, state = update(state, input_, context, predicted_symbol)

        return (time + 1, input_, predicted_symbol, pos, state, output, outputs, states, weights, attns, new_weights,
                samples)

    with tf.variable_scope('decoder_{}'.format(decoder.name)):
        # loop_vars are matched positionally with the parameters of _time_step, so they are
        # listed in exactly that order (states before weights)
        _, _, _, _, _, _, outputs, states, weights, attns, _, samples = tf.while_loop(
            cond=lambda time, *_: time < time_steps,
            body=_time_step,
            loop_vars=(time, initial_input, initial_symbol, initial_pos, initial_state, initial_output, outputs,
                       states, weights, attns, initial_weights, samples),
            parallel_iterations=decoder.parallel_iterations,
            swap_memory=decoder.swap_memory)

    # stacked tensors are time-major: first dimension is the output time step
    outputs = outputs.stack()
    weights = weights.stack()
    states = states.stack()
    attns = attns.stack()
    samples = samples.stack()

    # put batch_size as first dimension
    outputs = tf.transpose(outputs, perm=(1, 0, 2))
    weights = tf.transpose(weights, perm=(1, 0, 2))
    states = tf.transpose(states, perm=(1, 0, 2))
    attns = tf.transpose(attns, perm=(1, 0, 2))
    samples = tf.transpose(samples)

    return outputs, weights, states, attns, samples, get_logits, initial_data
Exemple #51
0
def create_model(session, run_options, run_metadata):
    """Build the distributed seq2seq model and load or initialise its weights.

    The model class is picked by ``FLAGS.dynamic_rnn`` and every
    hyper-parameter is read from ``FLAGS``.  Parameters are restored from the
    checkpoint found in ``FLAGS.saved_model_dir`` when the run mode is
    DUMP_LSTM, BEAM_DECODE or FORCE_DECODE, or when ``FLAGS.fromScratch`` is
    false and a checkpoint exists; otherwise they are initialised from
    scratch.

    Args:
        session: session handed to the model's load/init methods and used to
            run the beam-search variable initialiser.
        run_options: forwarded unchanged to the model constructor.
        run_metadata: forwarded unchanged to the model constructor.

    Returns:
        The constructed ``SeqModelDistributed`` instance.

    Raises:
        ValueError: if the run mode requires a checkpoint but none is found
            in ``FLAGS.saved_model_dir``.
    """
    device_strs = FLAGS.NN.split(",")
    devices_per_model = [get_device_address(x) for x in device_strs]
    num_models = FLAGS.num_models
    dtype = FLAGS.tf_dtype

    # A zero FLAGS.p leaves the scope initializer unset.
    initializer = None
    if FLAGS.p != 0.0:
        initializer = tf.random_uniform_initializer(-FLAGS.p, FLAGS.p)

    # Both modules expose a class with the same name; import only the one
    # that is actually needed.
    if FLAGS.dynamic_rnn:
        from seqModelDistributed_dynamic import SeqModelDistributed
    else:
        from seqModelDistributed import SeqModelDistributed

    with tf.variable_scope("", initializer=initializer):
        model = SeqModelDistributed(FLAGS._buckets,
                                    FLAGS.size,
                                    FLAGS.real_vocab_size_from,
                                    FLAGS.real_vocab_size_to,
                                    FLAGS.num_layers,
                                    FLAGS.max_gradient_norm,
                                    FLAGS.batch_size,
                                    FLAGS.learning_rate,
                                    FLAGS.learning_rate_decay_factor,
                                    optimizer=FLAGS.optimizer,
                                    # NOTE(review): a keep probability is
                                    # passed as `dropoutRate` -- confirm the
                                    # model interprets it as keep_prob.
                                    dropoutRate=FLAGS.keep_prob,
                                    dtype=dtype,
                                    devices_per_model=devices_per_model,
                                    topk_n=FLAGS.beam_size,
                                    run_options=run_options,
                                    run_metadata=run_metadata,
                                    with_attention=FLAGS.attention,
                                    beam_search=FLAGS.beam_search,
                                    beam_buckets=_beam_buckets,
                                    with_sampled_softmax=FLAGS.with_sampled_softmax,
                                    n_samples=FLAGS.n_samples,
                                    attention_style=FLAGS.attention_style,
                                    attention_scale=FLAGS.attention_scale,
                                    num_models=num_models,
                                    tie_input_output_embedding=FLAGS.tie_input_output_embedding,
                                    variational_dropout=FLAGS.variational_dropout
                                    )

    ckpt = tf.train.get_checkpoint_state(FLAGS.saved_model_dir)

    # These modes always restore trained weights, so a missing checkpoint is
    # a configuration error; report it here instead of failing later with an
    # AttributeError on `ckpt.model_checkpoint_path`.
    restore_required = (FLAGS.mode == "DUMP_LSTM"
                        or FLAGS.mode == "BEAM_DECODE"
                        or FLAGS.mode == 'FORCE_DECODE')
    if restore_required and ckpt is None:
        raise ValueError(
            "Mode %s requires a trained model, but no checkpoint was found in %s"
            % (FLAGS.mode, FLAGS.saved_model_dir))

    if restore_required or ((not FLAGS.fromScratch) and ckpt):

        if FLAGS.load_from_best:
            # The best model is stored as "best-0" next to the regular
            # checkpoints.
            best_model_path = os.path.join(os.path.dirname(ckpt.model_checkpoint_path), "best-0")
            model.load_parameters(session, best_model_path)
        else:
            model.load_parameters(session, ckpt.model_checkpoint_path)

        if FLAGS.mode == 'BEAM_DECODE':
            # Beam-search variables are not loaded above; initialise them
            # explicitly.
            session.run(tf.variables_initializer(model.beam_search_vars))

    else:
        model.init_parameters_from_scratch(session)
    return model
Exemple #52
0
 def add_word_embedding(self):
     """Create the 'word_embedding' variable and look up `self.x` in it.

     The table has shape [vocab_size, embedding_dim] and is initialised
     uniformly in [-1, 1].  The lookup result is stored in `self.embedded`.
     """
     table_shape = [self.vocab_size, self.embedding_dim]
     self.embedding = tf.get_variable(
         name='word_embedding',
         shape=table_shape,
         dtype=tf.float32,
         initializer=tf.random_uniform_initializer(-1.0, 1.0))
     # Forward activation of the input network.
     embedded_inputs = tf.nn.embedding_lookup(self.embedding, self.x)
     self.embedded = embedded_inputs
Exemple #53
0
    def __init__(self, is_training, length, leaking_rate=0.2, initLen=50):
        """Build the graph of a leaky reservoir network with a linear readout.

        The input and recurrent weight matrices are fed through placeholders
        (`self._Win`, `self._W`); only the readout matrix "v" and bias "b"
        are variables.  The state is unrolled for `length` steps as
        ``s = (1 - leaking_rate) * s + leaking_rate * tanh(x_t Win + s W)``.

        Args:
            is_training: when true, the first `initLen` steps are excluded
                from the readout and a gradient-descent train op is built.
            length: number of time steps to unroll.
            leaking_rate: mixing factor between the previous state and the
                new activation.
            initLen: number of initial steps dropped from the readout during
                training.
        """
        batch_size = FLAGS.batch_size
        num_steps = length
        inSize = FLAGS.input_dim
        resSize = FLAGS.hidden_dim
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.inSize = inSize
        self.resSize = resSize

        self._input_data = tf.placeholder(
            tf.float32, [batch_size, length, FLAGS.input_dim])

        # Training targets skip the warm-up steps; evaluation targets do not.
        target_steps = length - initLen if is_training else length
        self._targets = tf.placeholder(
            tf.float32, [batch_size, target_steps, FLAGS.output_dim])

        Win = tf.placeholder(tf.float32, [inSize, resSize])
        self._Win = Win
        W = tf.placeholder(tf.float32, [resSize, resSize])
        self._W = W

        state_shape = array_ops.pack([batch_size, resSize])
        start_state = array_ops.zeros(state_shape, dtype=tf.float32)
        start_state.set_shape([None, resSize])
        self._initial_state = start_state

        readout_inputs = []
        state = self._initial_state

        with tf.variable_scope("ESN"):
            for step in range(num_steps):
                retained = (1 - leaking_rate) * state
                pre_activation = (
                    tf.matmul(self._input_data[:, step, :], Win)
                    + tf.matmul(state, W))
                state = retained + leaking_rate * tf.nn.tanh(pre_activation)
                # Collect [input, state] for every step, except the warm-up
                # steps while training.
                if not is_training or step >= initLen:
                    readout_inputs.append(
                        tf.concat(1, [self._input_data[:, step, :], state]))
        self._final_state = state

        V_size = inSize + resSize
        stacked = tf.concat(1, readout_inputs)
        hidden_output = tf.reshape(stacked, [-1, V_size])

        lower_bound = -tf.sqrt(1. / V_size)
        upper_bound = tf.sqrt(1. / V_size)
        V = tf.get_variable(
            "v",
            shape=[V_size, FLAGS.output_dim],
            dtype=tf.float32,
            initializer=tf.random_uniform_initializer(lower_bound,
                                                      upper_bound))
        b = tf.get_variable(
            "b",
            shape=[FLAGS.output_dim],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.1))
        logits = tf.add(tf.matmul(hidden_output, V), b)

        target = tf.reshape(self._targets, [-1, FLAGS.output_dim])
        # Half the summed squared error drives training; the mean squared
        # error is what gets reported as the cost.
        training_loss = tf.reduce_sum(tf.pow(logits - target, 2)) / 2
        self._cost = tf.reduce_mean(tf.pow(logits - target, 2))
        self._logits = logits

        if is_training:
            self._lr = tf.Variable(0.0, trainable=False)
            tvars = tf.trainable_variables()
            raw_grads = tf.gradients(training_loss, tvars)
            grads, _ = tf.clip_by_global_norm(raw_grads, FLAGS.max_grad_norm)
            # NOTE(review): `self.lr` is not defined in this method --
            # presumably a property returning `self._lr`; confirm.
            optimizer = tf.train.GradientDescentOptimizer(self.lr)
            self._train_op = optimizer.apply_gradients(zip(grads, tvars))
Exemple #54
0
 ################################################################################
 # Case 'padding_valid': a single conv2d layer (4 filters, 3x5 kernel, VALID
 # padding, ELU activation, random-normal bias initialiser) applied to a
 # [2, 4, 6, 5] placeholder.
 # NOTE(review): `dtype`, `prefix` and `save` are defined outside this
 # fragment; `save` presumably exports the (input, output) pair under the
 # given name -- confirm.
 inp = tf.placeholder(dtype, [2, 4, 6, 5], 'input')
 conv = tf.layers.conv2d(inputs=inp,
                         filters=4,
                         kernel_size=[3, 5],
                         padding='VALID',
                         activation=tf.nn.elu,
                         bias_initializer=tf.random_normal_initializer())
 save(inp, conv, prefix + 'padding_valid')
 ################################################################################
 # Case 'eltwise_add_mul': two 1x1 convolutions of the same input (tanh with a
 # uniform [0, 1) bias initialiser, and sigmoid with no bias initialiser) are
 # combined with the input through element-wise scale, add and multiply.
 inp = tf.placeholder(dtype, [3, 2, 3, 4], 'input')
 conv = tf.layers.conv2d(inputs=inp,
                         filters=4,
                         kernel_size=[1, 1],
                         activation=tf.nn.tanh,
                         bias_initializer=tf.random_uniform_initializer(
                             0, 1))
 conv2 = tf.layers.conv2d(inputs=inp,
                          filters=4,
                          kernel_size=[1, 1],
                          activation=tf.nn.sigmoid,
                          bias_initializer=None)
 eltwise_add_mul = (inp * 0.31 + 2 * conv) * conv2
 save(inp, eltwise_add_mul, prefix + 'eltwise_add_mul')
 ################################################################################
 # Third case: a VALID 3x1 convolution followed by padding and concatenation.
 # tf.pad appends two entries at the end of axis 1 before `padded` is
 # concatenated with the original input along axis 3.
 # NOTE(review): with default strides and channels-last layout the conv output
 # would be [1, 2, 5, 4], so the padding restores axis 1 to length 4 -- confirm.
 inp = tf.placeholder(dtype, [1, 4, 5, 1], 'input')
 conv = tf.layers.conv2d(inputs=inp,
                         filters=4,
                         kernel_size=[3, 1],
                         padding='VALID')
 padded = tf.pad(conv, [[0, 0], [0, 2], [0, 0], [0, 0]])
 merged = tf.concat([padded, inp], axis=3)
Exemple #55
0
    def _add_seq2seq(self):
        """Build the attention seq2seq graph: encoder, decoder, outputs, loss.

        Reads the input tensors `self._articles`, `self._abstracts`,
        `self._targets`, `self._loss_weights` and `self._article_lens`, and
        the hyper-parameters in `self._hps`.

        Sets `self._enc_top_states`, `self._dec_in_state`,
        `self._dec_out_state` and `self._loss`.  In 'decode' mode it also
        sets `self._outputs`, `self._topk_log_probs` and `self._topk_ids`.
        """
        hps = self._hps
        vsize = self._vocab.NumIds()

        with tf.variable_scope('seq2seq'):
            # Transpose then unstack: one tensor per position along the
            # second axis of each input.
            encoder_inputs = tf.unstack(tf.transpose(self._articles))
            decoder_inputs = tf.unstack(tf.transpose(self._abstracts))
            targets = tf.unstack(tf.transpose(self._targets))
            loss_weights = tf.unstack(tf.transpose(self._loss_weights))
            article_lens = self._article_lens

            # Embedding shared by the input and outputs.
            with tf.variable_scope('embedding'), tf.device('/cpu:0'):
                embedding = tf.get_variable(
                    'embedding', [vsize, hps.emb_dim],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=1e-4))
                emb_encoder_inputs = [
                    tf.nn.embedding_lookup(embedding, x)
                    for x in encoder_inputs
                ]
                emb_decoder_inputs = [
                    tf.nn.embedding_lookup(embedding, x)
                    for x in decoder_inputs
                ]

            # Stacked bidirectional encoder: each layer consumes the outputs
            # of the previous one.  `fw_state` keeps the forward state of the
            # last layer built.
            for layer_i in range(hps.enc_layers):
                with tf.variable_scope('encoder%d' % layer_i), tf.device(
                        self._next_device()):
                    cell_fw = tf.contrib.rnn.LSTMCell(
                        hps.num_hidden,
                        initializer=tf.random_uniform_initializer(-0.1,
                                                                  0.1,
                                                                  seed=123),
                        state_is_tuple=False)
                    cell_bw = tf.contrib.rnn.LSTMCell(
                        hps.num_hidden,
                        initializer=tf.random_uniform_initializer(-0.1,
                                                                  0.1,
                                                                  seed=113),
                        state_is_tuple=False)
                    (emb_encoder_inputs, fw_state,
                     _) = tf.contrib.rnn.static_bidirectional_rnn(
                         cell_fw,
                         cell_bw,
                         emb_encoder_inputs,
                         dtype=tf.float32,
                         sequence_length=article_lens)
            encoder_outputs = emb_encoder_inputs

            with tf.variable_scope('output_projection'):
                w = tf.get_variable(
                    'w', [hps.num_hidden, vsize],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=1e-4))
                # The sampled-softmax loss below takes the transposed weights.
                w_t = tf.transpose(w)
                v = tf.get_variable(
                    'v', [vsize],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=1e-4))

            with tf.variable_scope('decoder'), tf.device(self._next_device()):
                # When decoding, use model output from the previous step
                # for the next step.
                loop_function = None
                if hps.mode == 'decode':
                    loop_function = _extract_argmax_and_embed(
                        embedding, (w, v), update_embedding=False)

                cell = tf.contrib.rnn.LSTMCell(
                    hps.num_hidden,
                    initializer=tf.random_uniform_initializer(-0.1,
                                                              0.1,
                                                              seed=113),
                    state_is_tuple=False)

                # Forward and backward outputs are concatenated, hence the
                # 2 * num_hidden feature size per encoder step.
                encoder_outputs = [
                    tf.reshape(x, [hps.batch_size, 1, 2 * hps.num_hidden])
                    for x in encoder_outputs
                ]
                self._enc_top_states = tf.concat(axis=1,
                                                 values=encoder_outputs)
                self._dec_in_state = fw_state
                # During decoding, follow up _dec_in_state are fed from beam_search.
                # dec_out_state are stored by beam_search for next step feeding.
                initial_state_attention = (hps.mode == 'decode')
                decoder_outputs, self._dec_out_state = tf.contrib.legacy_seq2seq.attention_decoder(
                    emb_decoder_inputs,
                    self._dec_in_state,
                    self._enc_top_states,
                    cell,
                    num_heads=1,
                    loop_function=loop_function,
                    initial_state_attention=initial_state_attention)

            with tf.variable_scope('output'), tf.device(self._next_device()):
                model_outputs = []
                for i in range(len(decoder_outputs)):
                    if i > 0:
                        tf.get_variable_scope().reuse_variables()
                    model_outputs.append(
                        tf.nn.xw_plus_b(decoder_outputs[i], w, v))

            if hps.mode == 'decode':
                with tf.variable_scope('decode_output'), tf.device('/cpu:0'):
                    best_outputs = [tf.argmax(x, 1) for x in model_outputs]
                    tf.logging.info('best_outputs%s',
                                    best_outputs[0].get_shape())
                    self._outputs = tf.concat(axis=1,
                                              values=[
                                                  tf.reshape(
                                                      x, [hps.batch_size, 1])
                                                  for x in best_outputs
                                              ])

                    # log_softmax is computed in a numerically stable way;
                    # log(softmax(x)) underflows to -inf for unlikely tokens.
                    # NOTE(review): k = 2 * batch_size -- presumably the batch
                    # size equals the beam size in decode mode; confirm.
                    self._topk_log_probs, self._topk_ids = tf.nn.top_k(
                        tf.nn.log_softmax(model_outputs[-1]),
                        hps.batch_size * 2)

            with tf.variable_scope('loss'), tf.device(self._next_device()):

                def sampled_loss_func(inputs, labels):
                    """Sampled-softmax loss for one decoder step."""
                    with tf.device('/cpu:0'):  # Try gpu.
                        labels = tf.reshape(labels, [-1, 1])
                        return tf.nn.sampled_softmax_loss(
                            weights=w_t,
                            biases=v,
                            labels=labels,
                            inputs=inputs,
                            num_sampled=hps.num_softmax_samples,
                            num_classes=vsize)

                # Sampled softmax is used only for training; every other mode
                # uses the full-vocabulary sequence loss.
                if hps.num_softmax_samples != 0 and hps.mode == 'train':
                    self._loss = seq2seq_lib.sampled_sequence_loss(
                        decoder_outputs, targets, loss_weights,
                        sampled_loss_func)
                else:
                    self._loss = tf.contrib.legacy_seq2seq.sequence_loss(
                        model_outputs, targets, loss_weights)
                # The summary is capped at 12.0; the loss itself is not.
                tf.summary.scalar('loss', tf.minimum(12.0, self._loss))
    def __init__(self,
                 stochastic=False,
                 use_slope=True,
                 variational_dropout=False,
                 vocabulary_size=283,
                 label_size=50,
                 rnnSize=256,
                 n_layers=3,
                 dropout=0.5,
                 zoneout=0.1,
                 embedding_size=None,
                 dtype=tf.float32,
                 clip=0.35,
                 k_width=3,
                 name='hlstm',
                 conv_filter=3,
                 mid_filter=25,
                 batch_size=128):
        """Build a conv front-end + stacked SRU network trained with CTC.

        Graph layout: two conv2d/batch-norm/tanh blocks, then `n_layers`
        blocks of depthwise convolution -> SRU_layer -> dropout, then a dense
        projection to `label_size` logits.  The loss is the mean CTC loss,
        optimised by Adam with global-norm gradient clipping.

        `variational_dropout` and `embedding_size` are accepted but not used
        by this constructor.  The reshape after the convolutions uses the
        static `batch_size`, so fed batches must have exactly that size.

        Args:
            stochastic: stored on the instance.
            use_slope: stored on the instance.
            vocabulary_size: stored as `self.inputSize`.
            label_size: number of output classes of the dense layer.
            rnnSize: width of every SRU layer.
            n_layers: total number of SRU layers.
            dropout: value fed to `tf.nn.dropout` while training (1.0
                otherwise).
            zoneout: forwarded to every SRU_layer.
            dtype: dtype the dropout value is cast to.
            clip: global-norm threshold for gradient clipping.
            k_width: forwarded to every SRU_layer as `fwidth`.
            name: stored on the instance.
            conv_filter: side length of the square conv2d kernels.
            mid_filter: kernel length of the depthwise convolutions.
            batch_size: static batch size used for the reshape and SRU layers.
        """
        self.rnnSize = rnnSize
        self.inputSize = vocabulary_size
        self.outputSize = label_size
        self.stochastic = stochastic

        self.dtype = dtype
        self.dropout = dropout
        self.n_layers = n_layers
        self.clip = clip
        self.name = name
        self.use_slope = use_slope
        self.zoneout = zoneout
        self.batch_size = batch_size
        self.k_width = k_width
        self.conv_filter = conv_filter

        self.mid_filter = mid_filter

        # Graph inputs.
        self.x = tf.placeholder(tf.float32, [None, None, 40, 3], name='x')
        self.label = tf.sparse_placeholder(tf.int32, name='label')
        self.seq_len = tf.placeholder(tf.int32, [None], name='seq_len')

        self.is_train = tf.placeholder(tf.bool, [], name='train')
        self.lr = tf.placeholder(tf.float32, [], name='lr')
        keep_prob = tf.cast(tf.where(self.is_train, self.dropout, 1.0),
                            dtype=self.dtype)

        self.lstm_cells = []
        kernel_hw = (self.conv_filter, self.conv_filter)

        def conv_bn_tanh(inputs, strides, index):
            # conv2d (32 filters, no bias) -> batch norm -> tanh.
            convolved = tf.layers.conv2d(inputs,
                                         32,
                                         kernel_hw,
                                         strides,
                                         'same',
                                         use_bias=False,
                                         name='conv{}'.format(index))
            normalised = tf.contrib.layers.batch_norm(
                convolved,
                center=True,
                scale=True,
                is_training=self.is_train,
                decay=0.9,
                epsilon=1e-3,
                scope='bn{}'.format(index))
            return tf.nn.tanh(normalised, name='tanh{}'.format(index))

        def depthwise_mix(inputs, channels, index):
            # Depthwise convolution with a `mid_filter`-long kernel, then
            # removal of the singleton axis.
            kernel = tf.get_variable('QRNN_conv{}_filter'.format(index),
                                     shape=[mid_filter, 1, channels, 1],
                                     trainable=True)
            mixed = tf.nn.depthwise_conv2d(inputs,
                                           kernel, [1, 1, 1, 1],
                                           padding='SAME',
                                           name='QRNN_conv{}'.format(index))
            return tf.squeeze(mixed, axis=[-2])

        def make_sru(index, skip_embedding):
            return SRU_layer(self.rnnSize,
                             batch_size=self.batch_size,
                             fwidth=self.k_width,
                             pool_type='ifo',
                             zoneout=self.zoneout,
                             name='QRNN_layer{}'.format(index),
                             infer=tf.logical_not(self.is_train),
                             skip=True,
                             skip_embedding=skip_embedding)

        def shared_dropout(activations):
            # A noise shape of 1 on axis 1 reuses one mask along that axis.
            return tf.nn.dropout(
                activations,
                keep_prob,
                noise_shape=[tf.shape(activations)[0], 1,
                             tf.shape(activations)[2]])

        # Convolutional front-end.  Axis 1 is strided once (conv0); the
        # 40-wide axis is strided twice, leaving 10 positions x 32 channels
        # = 320 features.
        h = conv_bn_tanh(self.x, (2, 2), 0)
        h = conv_bn_tanh(h, (1, 2), 1)

        # Fold the feature axes: (batch, axis 1, 1, 320).
        conv_shape = tf.shape(h)
        h = tf.reshape(h, [batch_size, conv_shape[1], 1, 320])

        # First recurrent block works on the 320 conv features.
        h = depthwise_mix(h, 320, 0)
        sru_h, _ = make_sru(0, True)(h)
        sru_h = shared_dropout(sru_h)

        # Remaining blocks work on the rnnSize-wide SRU output.
        for layer_idx in range(1, self.n_layers):
            sru_h = tf.expand_dims(sru_h, -2)
            sru_h = depthwise_mix(sru_h, self.rnnSize, layer_idx)
            layer = make_sru(layer_idx, False)
            print(sru_h)
            sru_h, _ = layer(sru_h)
            sru_h = shared_dropout(sru_h)

        h_shape = tf.shape(sru_h)
        output_h = tf.reshape(sru_h, [-1, self.rnnSize])
        print(output_h)

        with tf.variable_scope('dense'):
            dense = tf.layers.dense(
                output_h,
                self.outputSize,
                kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
        self.logit = tf.reshape(dense,
                                [h_shape[0], h_shape[1], self.outputSize])
        self.logsoftmax = tf.nn.log_softmax(self.logit)

        # CTC loss on batch-major logits, averaged over the batch.
        per_example_loss = tf.nn.ctc_loss(inputs=self.logit,
                                          labels=self.label,
                                          sequence_length=self.seq_len,
                                          time_major=False)
        self.loss = per_example_loss
        self.loss = tf.reduce_mean(self.loss)
        train_loss = self.loss

        opt = tf.train.AdamOptimizer(self.lr)
        # Ops in UPDATE_OPS must run together with the training step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            grad, var = zip(*opt.compute_gradients(train_loss))
            clipped_gradients, _ = tf.clip_by_global_norm(grad, clip)
            self.optimizer = opt.apply_gradients(zip(clipped_gradients, var))

        # The greedy decoder takes time-major logits, hence the transpose.
        time_major_logit = tf.transpose(self.logit, (1, 0, 2))
        self.sentence, _ = tf.nn.ctc_greedy_decoder(time_major_logit,
                                                    self.seq_len)
        decoded = tf.cast(self.sentence[0], tf.int32)
        self.cer = tf.reduce_mean(tf.edit_distance(decoded, self.label))

        self.saver = tf.train.Saver()
Exemple #57
0
def main(_):
  """Train, validate and test a PTB language model, optionally resuming.

  When FLAGS.importmodeldir is set, the highest-numbered checkpoint under
  `<importmodeldir>/model/` is restored and training continues from the
  following epoch; otherwise variables are freshly initialised.  When
  FLAGS.exportmodeldir is set, a checkpoint is written after every epoch and
  once more after the final test run.

  Raises:
    ValueError: if --data_path is not set, or if FLAGS.importmodeldir is set
      but contains no `model.ckpt-*.meta` file.
  """
  if not FLAGS.data_path:
    raise ValueError("Must set --data_path to PTB data directory")

  raw_data = reader.ptb_raw_data(FLAGS.data_path)
  train_data, valid_data, test_data, _ = raw_data

  config = get_config()
  # Evaluation on the test set runs one token at a time.
  eval_config = get_config()
  eval_config.batch_size = 1
  eval_config.num_steps = 1

  with tf.Graph().as_default(), tf.Session() as session:
    resuming = FLAGS.importmodeldir is not None

    if resuming:
      # Find last executed epoch.
      import os
      from glob import glob

      ckpt_prefix, meta_suffix = 'model.ckpt-', '.meta'
      meta_paths = glob(FLAGS.importmodeldir+'/model/model.ckpt-*.meta')
      if not meta_paths:
        raise ValueError(
            "No checkpoint matching model.ckpt-*.meta found under %s/model"
            % FLAGS.importmodeldir)
      # Parse the epoch from the file name only, so that a '-' elsewhere in
      # the directory path cannot corrupt the result.
      last_epoch = max(
          int(os.path.basename(path)[len(ckpt_prefix):-len(meta_suffix)])
          for path in meta_paths)
      # Restored values overwrite the variables, so no scope initializer is
      # set when resuming.
      initializer = None
    else:
      initializer = tf.random_uniform_initializer(-config.init_scale,
                                                  config.init_scale)

    # Build the training model first, then the evaluation models sharing its
    # variables.
    with tf.variable_scope("model", reuse=None, initializer=initializer):
      m = PTBModel(is_training=True, config=config)
    with tf.variable_scope("model", reuse=True, initializer=initializer):
      mtest = PTBModel(is_training=False, config=eval_config)
      # Merge the summaries attached so far (before mvalid is built).
      merged_summaries_for_test = tf.merge_all_summaries()
      mvalid = PTBModel(is_training=False, config=config)

    if resuming:
      # Fill model variables with trained values.
      tf.train.Saver().restore(session, FLAGS.importmodeldir+'/model/model.ckpt-{}'.format(last_epoch))
      initial_epoch = last_epoch + 1
    else:
      tf.initialize_all_variables().run()
      initial_epoch = 0

    init_logs()
    init_model_persistance()

    train_writer = tf.train.SummaryWriter(FLAGS.logdir + "/train", session.graph)
    valid_writer = tf.train.SummaryWriter(FLAGS.logdir + "/valid", session.graph)
    test_writer = tf.train.SummaryWriter(FLAGS.logdir + "/test", session.graph)

    for i in range(initial_epoch, config.max_max_epoch):
      # The learning rate is constant for the first max_epoch epochs and then
      # decays geometrically.
      lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
      m.assign_lr(session, config.learning_rate * lr_decay)

      print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
      train_perplexity = run_epoch(session, m, train_data, m.train_op,
                                   verbose=True, summary_op=None, summary_writer=train_writer)
      print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
      valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op(), summary_op=None,summary_writer=None)
      print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))
      if FLAGS.exportmodeldir is not None:
        # NOTE(review): a new Saver per epoch means old checkpoints are never
        # pruned; sharing one Saver would change that -- keep unless pruning
        # is wanted.
        tf.train.Saver().save(session,FLAGS.exportmodeldir+"/model/model.ckpt",global_step=i)
    test_perplexity = run_epoch(session, mtest, test_data, tf.no_op(), summary_op=merged_summaries_for_test, summary_writer=test_writer)
    if FLAGS.exportmodeldir is not None:
      tf.train.Saver().save(session,FLAGS.exportmodeldir+"/model/model.ckpt",global_step=config.max_max_epoch)
    print("Test Perplexity: %.3f" % test_perplexity)
Exemple #58
0
 def add_word_embedding_layer(self):
     """Replace `self._cursor` with its lookup in a new embedding table.

     The table is the variable 'encoder' of shape
     [vocab_size, embedding_dims], initialised uniformly in [-1, 1].
     """
     table_shape = [self.vocab_size, self.embedding_dims]
     table = tf.get_variable(
         name='encoder',
         shape=table_shape,
         dtype=tf.float32,
         initializer=tf.random_uniform_initializer(-1.0, 1.0))
     looked_up = tf.nn.embedding_lookup(table, self._cursor)
     self._cursor = looked_up
Exemple #59
0
    def _build_word_char_embeddings(self):
        '''
        options contains key 'char_cnn': {

        'n_characters': 262,

        # includes the start / end characters
        'max_characters_per_token': 50,

        'filters': [
            [1, 32],
            [2, 32],
            [3, 64],
            [4, 128],
            [5, 256],
            [6, 512],
            [7, 512]
        ],
        'activation': 'tanh',

        # for the character embedding
        'embedding': {'dim': 16}

        # for highway layers
        # if omitted, then no highway layers
        'n_highway': 2,
        }
        '''
        batch_size = self.options['batch_size']
        unroll_steps = self.options['unroll_steps']
        projection_dim = self.options['lstm']['projection_dim']

        cnn_options = self.options['char_cnn']
        filters = cnn_options['filters']
        n_filters = sum(f[1] for f in filters)
        max_chars = cnn_options['max_characters_per_token']
        char_embed_dim = cnn_options['embedding']['dim']
        n_chars = cnn_options['n_characters']
        if cnn_options['activation'] == 'tanh':
            activation = tf.nn.tanh
        elif cnn_options['activation'] == 'relu':
            activation = tf.nn.relu

        # the input character ids
        self.tokens_characters = tf.placeholder(DTYPE_INT,
                                                shape=(batch_size,
                                                       unroll_steps,
                                                       max_chars),
                                                name='tokens_characters')
        # the character embeddings
        with tf.device("/cpu:0"):
            self.embedding_weights = tf.get_variable(
                "char_embed", [n_chars, char_embed_dim],
                dtype=DTYPE,
                initializer=tf.random_uniform_initializer(-1.0, 1.0))
            # shape (batch_size, unroll_steps, max_chars, embed_dim)
            self.char_embedding = tf.nn.embedding_lookup(
                self.embedding_weights, self.tokens_characters)

            if self.bidirectional:
                self.tokens_characters_reverse = tf.placeholder(
                    DTYPE_INT,
                    shape=(batch_size, unroll_steps, max_chars),
                    name='tokens_characters_reverse')
                self.char_embedding_reverse = tf.nn.embedding_lookup(
                    self.embedding_weights, self.tokens_characters_reverse)

        # the convolutions
        def make_convolutions(inp, reuse):
            with tf.variable_scope('CNN', reuse=reuse) as scope:
                convolutions = []
                for i, (width, num) in enumerate(filters):
                    if cnn_options['activation'] == 'relu':
                        # He initialization for ReLU activation
                        # with char embeddings init between -1 and 1
                        #w_init = tf.random_normal_initializer(
                        #    mean=0.0,
                        #    stddev=np.sqrt(2.0 / (width * char_embed_dim))
                        #)

                        # Kim et al 2015, +/- 0.05
                        w_init = tf.random_uniform_initializer(minval=-0.05,
                                                               maxval=0.05)
                    elif cnn_options['activation'] == 'tanh':
                        # glorot init
                        w_init = tf.random_normal_initializer(
                            mean=0.0,
                            stddev=np.sqrt(1.0 / (width * char_embed_dim)))
                    w = tf.get_variable("W_cnn_%s" % i,
                                        [1, width, char_embed_dim, num],
                                        initializer=w_init,
                                        dtype=DTYPE)
                    b = tf.get_variable(
                        "b_cnn_%s" % i, [num],
                        dtype=DTYPE,
                        initializer=tf.constant_initializer(0.0))

                    conv = tf.nn.conv2d(
                        inp, w, strides=[1, 1, 1, 1], padding="VALID") + b
                    # now max pool
                    conv = tf.nn.max_pool(conv,
                                          [1, 1, max_chars - width + 1, 1],
                                          [1, 1, 1, 1], 'VALID')

                    # activation
                    conv = activation(conv)
                    conv = tf.squeeze(conv, squeeze_dims=[2])

                    convolutions.append(conv)

            return tf.concat(convolutions, 2)

        # for first model, this is False, for others it's True
        reuse = tf.get_variable_scope().reuse
        embedding = make_convolutions(self.char_embedding, reuse)

        self.token_embedding_layers = [embedding]

        if self.bidirectional:
            # re-use the CNN weights from forward pass
            embedding_reverse = make_convolutions(self.char_embedding_reverse,
                                                  True)

        # for highway and projection layers:
        #   reshape from (batch_size, n_tokens, dim) to
        n_highway = cnn_options.get('n_highway')
        use_highway = n_highway is not None and n_highway > 0
        use_proj = n_filters != projection_dim

        if use_highway or use_proj:
            embedding = tf.reshape(embedding, [-1, n_filters])
            if self.bidirectional:
                embedding_reverse = tf.reshape(embedding_reverse,
                                               [-1, n_filters])

        # set up weights for projection
        if use_proj:
            # assert n_filters > projection_dim
            with tf.variable_scope('CNN_proj') as scope:
                W_proj_cnn = tf.get_variable(
                    "W_proj", [n_filters, projection_dim],
                    initializer=tf.random_normal_initializer(
                        mean=0.0, stddev=np.sqrt(1.0 / n_filters)),
                    dtype=DTYPE)
                b_proj_cnn = tf.get_variable(
                    "b_proj", [projection_dim],
                    initializer=tf.constant_initializer(0.0),
                    dtype=DTYPE)

        # apply highways layers
        def high(x, ww_carry, bb_carry, ww_tr, bb_tr):
            """Apply a single highway layer to ``x``.

            A sigmoid gate is computed from ``x`` with ``ww_carry`` and
            ``bb_carry``; a ReLU candidate is computed from ``x`` with
            ``ww_tr`` and ``bb_tr``. The result is the gate-weighted mix
            ``gate * candidate + (1 - gate) * x``.

            Note that, as written, the sigmoid gate multiplies the ReLU
            candidate and its complement multiplies the untouched input.
            """
            # Gate in (0, 1) deciding how much of the candidate to let through.
            gate_logits = tf.matmul(x, ww_carry) + bb_carry
            gate = tf.nn.sigmoid(gate_logits)

            # Non-linear candidate values computed from the same input.
            candidate_logits = tf.matmul(x, ww_tr) + bb_tr
            candidate = tf.nn.relu(candidate_logits)

            # Blend candidate and raw input; operand order matches the
            # original expression so graph ops are created in the same order.
            gated_candidate = gate * candidate
            gated_input = (1.0 - gate) * x
            return gated_candidate + gated_input

        if use_highway:
            highway_dim = n_filters

            for i in range(n_highway):
                with tf.variable_scope('CNN_high_%s' % i) as scope:
                    W_carry = tf.get_variable(
                        'W_carry',
                        [highway_dim, highway_dim],
                        # glorot init
                        initializer=tf.random_normal_initializer(
                            mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
                        dtype=DTYPE)
                    b_carry = tf.get_variable(
                        'b_carry', [highway_dim],
                        initializer=tf.constant_initializer(-2.0),
                        dtype=DTYPE)
                    W_transform = tf.get_variable(
                        'W_transform', [highway_dim, highway_dim],
                        initializer=tf.random_normal_initializer(
                            mean=0.0, stddev=np.sqrt(1.0 / highway_dim)),
                        dtype=DTYPE)
                    b_transform = tf.get_variable(
                        'b_transform', [highway_dim],
                        initializer=tf.constant_initializer(0.0),
                        dtype=DTYPE)

                embedding = high(embedding, W_carry, b_carry, W_transform,
                                 b_transform)
                if self.bidirectional:
                    embedding_reverse = high(embedding_reverse, W_carry,
                                             b_carry, W_transform, b_transform)
                self.token_embedding_layers.append(
                    tf.reshape(embedding,
                               [batch_size, unroll_steps, highway_dim]))

        # finally project down to projection dim if needed
        if use_proj:
            embedding = tf.matmul(embedding, W_proj_cnn) + b_proj_cnn
            if self.bidirectional:
                embedding_reverse = tf.matmul(embedding_reverse, W_proj_cnn) \
                    + b_proj_cnn
            self.token_embedding_layers.append(
                tf.reshape(embedding,
                           [batch_size, unroll_steps, projection_dim]))

        # reshape back to (batch_size, tokens, dim)
        if use_highway or use_proj:
            shp = [batch_size, unroll_steps, projection_dim]
            embedding = tf.reshape(embedding, shp)
            if self.bidirectional:
                embedding_reverse = tf.reshape(embedding_reverse, shp)

        # at last assign attributes for remainder of the model
        self.embedding = embedding
        if self.bidirectional:
            self.embedding_reverse = embedding_reverse

# 3x3 tic-tac-toe environment.
# The board is stored as a flat array of 9 squares; each square takes one of
# three values: -1 for X, 0 for empty, 1 for O.
environment = np.zeros((9), dtype=np.int8)

# Reward scheme:
#   - 0 while the game has not ended yet
#   - +10 once the game has ended
#   - -50 (a heavy penalty) for a wrong move


# Float32 inputs of shape [None, 9]: one for the predict network and one for
# the target network (presumably one flattened board per row -- confirm
# against the code that feeds them).
state_input = tf.placeholder(tf.float32, [None, 9], "state_input")
target_state_input = tf.placeholder(tf.float32, [None, 9], "target_state_input")
# First layer of the predict network: 9 inputs -> 80 units, leaky ReLU.
# NOTE(review): weight_stddev is computed here but is not passed to the
# initializer below, which is called with its default arguments -- confirm
# whether the weights were meant to be drawn with this standard deviation.
weight_stddev = (2.0/9)**0.5
predict_w1 = tf.get_variable("predict_w1", (9, 80), initializer=tf.random_uniform_initializer())
predict_b1 = tf.Variable(tf.zeros(80), name="predict_b1")
predict_layer_1_output = tf.nn.leaky_relu(tf.matmul(state_input, predict_w1)+predict_b1)



# First layer of the target network: same shapes and activation as the
# predict network's first layer, but with its own variables and fed from
# target_state_input.
# NOTE(review): weight_stddev is again assigned but unused by the initializer.
weight_stddev = (2.0/9)**0.5
target_w1 = tf.get_variable("target_w1", (9, 80), initializer=tf.random_uniform_initializer())
target_b1 = tf.Variable(tf.zeros(80), name="target_b1")
target_layer_1_output = tf.nn.leaky_relu(tf.matmul(target_state_input, target_w1)+target_b1)


# Second layer of the predict network: 80 inputs -> 50 units.
# NOTE(review): weight_stddev is not used by the initializer in the lines
# visible here -- confirm.
weight_stddev = (2.0/80)**0.5
# 9 available actions...
# NOTE(review): the weight below has 50 output units, not 9, so the comment
# above may refer to a later layer -- confirm.
predict_w2 = tf.get_variable("predict_w2", (80, 50), initializer=tf.random_uniform_initializer())