Exemple #1
0
    def __init__(self, args, infer=False):
        """Build the TF1 graph of a multi-layer RNN language model.

        Creates the recurrent cell stack, separate train and test
        placeholders, the softmax projection, the embedding matrix ``W``
        (loadable through ``embedding_placeholder`` / ``embedding_init``),
        the train and test losses, and two Adam train ops: ``train_op``
        updates every trainable variable, ``train_op_no_W`` leaves the
        embedding matrix untouched.

        Args:
            args: configuration object. This method reads ``model``,
                ``num_layers``, ``rnn_size``, ``batch_size``, ``seq_length``,
                ``vocab_size``, ``embedding_dim`` and ``grad_clip``.
            infer: when true, ``args.batch_size`` and ``args.seq_length``
                are overwritten with 1 on the caller's object, and the
                decoders are given ``loop`` so each step after the first is
                fed the embedding of the previous step's argmax prediction.

        Raises:
            Exception: if ``args.model`` is not 'rnn', 'gru' or 'lstm'.
        """
        self.args = args
        # Sampling mode: one sequence, one token at a time.
        if infer:
            args.batch_size = 1
            args.seq_length = 1

        # Select the recurrent cell class.
        if args.model == 'rnn':
            cell_fn = rnn.BasicRNNCell
        elif args.model == 'gru':
            cell_fn = rnn.GRUCell
        elif args.model == 'lstm':
            cell_fn = rnn.BasicLSTMCell
        else:
            raise Exception("model type not supported: {}".format(args.model))

        # Dropout rate; stays at 0 (keep everything) unless fed at run time.
        self.dropout = tf.placeholder_with_default(0., shape=())

        # Stack ``num_layers`` cells, each with dropout on its output.
        cells = []
        for _ in range(args.num_layers):
            cell = cell_fn(args.rnn_size)
            cell = rnn.DropoutWrapper(cell, output_keep_prob=1 - self.dropout)
            cells.append(cell)
        self.cell = cell = rnn.MultiRNNCell(cells)

        # Training placeholders, zero initial state and bookkeeping variables
        # (batch/epoch pointers and the time spent on a batch).
        self.input_data = tf.placeholder(tf.int32,
                                         [args.batch_size, args.seq_length])
        self.targets = tf.placeholder(tf.int32,
                                      [args.batch_size, args.seq_length])
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)
        self.batch_pointer = tf.Variable(0,
                                         name="batch_pointer",
                                         trainable=False,
                                         dtype=tf.int32)
        self.inc_batch_pointer_op = tf.assign(self.batch_pointer,
                                              self.batch_pointer + 1)
        self.epoch_pointer = tf.Variable(0,
                                         name="epoch_pointer",
                                         trainable=False)
        self.batch_time = tf.Variable(0.0, name="batch_time", trainable=False)

        # Test placeholders, same static shape as the training ones.
        self.test_x = tf.placeholder(tf.int32,
                                     shape=[args.batch_size, args.seq_length])
        self.test_y = tf.placeholder(tf.int32,
                                     [args.batch_size, args.seq_length])

        # Export the per-batch time as a scalar summary.
        tf.summary.scalar("time_batch", self.batch_time)

        def variable_summaries(var):
            """Attach mean / max / min scalar summaries to a tensor."""
            with tf.name_scope('summaries'):
                mean = tf.reduce_mean(var)
                tf.summary.scalar('mean', mean)
                tf.summary.scalar('max', tf.reduce_max(var))
                tf.summary.scalar('min', tf.reduce_min(var))

        with tf.variable_scope('rnnlm'):
            # Output projection: weights first, then biases.
            softmax_w = tf.get_variable(
                "softmax_w", [args.rnn_size, args.vocab_size],
                initializer=tf.truncated_normal_initializer(mean=0.,
                                                            stddev=.1,
                                                            seed=2018,
                                                            dtype=tf.float32))
            variable_summaries(softmax_w)
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size],
                                        initializer=tf.constant_initializer(
                                            np.repeat(0., args.vocab_size),
                                            tf.float32, args.vocab_size))
            variable_summaries(softmax_b)
            with tf.device("/cpu:0"):
                # W holds the word embeddings.  It starts as zeros and is
                # filled by running ``embedding_init`` with
                # ``embedding_placeholder`` fed.
                self.W = tf.Variable(tf.constant(
                    0.0, shape=[args.vocab_size, args.embedding_dim]),
                                     name="W")

                self.embedding_placeholder = tf.placeholder(
                    tf.float32, [args.vocab_size, args.embedding_dim])
                self.embedding_init = self.W.assign(self.embedding_placeholder)

                # Embed the training tokens and split them into a list of
                # per-time-step tensors.
                inputs = tf.split(
                    tf.nn.embedding_lookup(self.W, self.input_data),
                    args.seq_length, 1)
                inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

                # Same for the test tokens.
                test_inputs = tf.split(
                    tf.nn.embedding_lookup(self.W, self.test_x),
                    args.seq_length, 1)
                test_inputs = [
                    tf.squeeze(test_input_, [1]) for test_input_ in test_inputs
                ]

        def loop(prev, _):
            """Return the embedding of the argmax symbol of the previous output."""
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            # This model's embedding matrix is self.W.
            return tf.nn.embedding_lookup(self.W, prev_symbol)

        # Training path: decoder outputs, logits, probabilities and loss.
        outputs, last_state = legacy_seq2seq.rnn_decoder(
            inputs,
            self.initial_state,
            cell,
            loop_function=loop if infer else None,
            scope='rnnlm')
        output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        # Sampling temperature; defaults to 1 (plain softmax).
        self.temp = tf.placeholder_with_default(1., shape=())
        self.temped_logits = self.logits / self.temp
        self.probs = tf.nn.softmax(self.temped_logits)
        loss = legacy_seq2seq.sequence_loss_by_example(
            [self.logits], [tf.reshape(self.targets, [-1])],
            [tf.ones([args.batch_size * args.seq_length])], args.vocab_size)
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length

        # Test path: same cell and projection, fed from the test placeholders.
        test_outputs, test_last_state = legacy_seq2seq.rnn_decoder(
            test_inputs,
            self.initial_state,
            cell,
            loop_function=loop if infer else None,
            scope='rnnlm')
        test_output = tf.reshape(tf.concat(test_outputs, 1),
                                 [-1, args.rnn_size])
        self.test_logits = tf.matmul(test_output, softmax_w) + softmax_b
        self.test_probs = tf.nn.softmax(self.test_logits)
        # One weight per row of test_logits (batch_size * seq_length rows),
        # matching the training loss.
        test_loss = legacy_seq2seq.sequence_loss_by_example(
            [self.test_logits], [tf.reshape(self.test_y, [-1])],
            [tf.ones([args.batch_size * args.seq_length])], args.vocab_size)
        self.test_cost = tf.reduce_sum(
            test_loss) / args.batch_size / args.seq_length

        tf.summary.scalar("cost", self.cost)

        # Final recurrent states, exposed for retrieval by the caller.
        self.final_state = last_state
        self.test_final_state = test_last_state

        # Optimizer; the learning-rate variable starts at 0.0.
        self.lr = tf.Variable(0.0, trainable=False)
        optimizer = tf.train.AdamOptimizer(self.lr)

        # Two optimisation channels so the embeddings can be trained on
        # demand: one over all trainable variables, one that skips any
        # variable whose name contains "W:0".
        self.tvars = tf.trainable_variables()
        self.tvars_no_W = [
            var for var in tf.trainable_variables() if "W:0" not in var.name
        ]
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, self.tvars),
                                          args.grad_clip)
        grads_no_W, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, self.tvars_no_W), args.grad_clip)

        # Running either op applies one clipped-gradient Adam update.
        self.train_op = optimizer.apply_gradients(zip(grads, self.tvars))
        self.train_op_no_W = optimizer.apply_gradients(
            zip(grads_no_W, self.tvars_no_W))
Exemple #2
0
# Integer id placeholders for inputs and targets: any batch size,
# `sequence_length` ids per row.
X = tf.placeholder(tf.int32, [None, sequence_length])
Y = tf.placeholder(tf.int32, [None, sequence_length])

# One-hot encode the input ids over `num_classes` classes.
X_one_hot = tf.one_hot(X, num_classes)
print(X_one_hot)  # debug output: shows the tensor so its shape can be checked


# Make a lstm cell with hidden_size (each unit output vector size)
def lstm_cell():
    """Return a new BasicLSTMCell with `hidden_size` units and tuple state."""
    return rnn.BasicLSTMCell(hidden_size, state_is_tuple=True)


# Stack two independent LSTM cells into one multi-layer cell.
multi_cells = rnn.MultiRNNCell([lstm_cell() for _ in range(2)],
                               state_is_tuple=True)

# Run the stacked cell over the one-hot inputs.
# outputs: unfolding size x hidden size, state = hidden size
outputs, _states = tf.nn.dynamic_rnn(multi_cells, X_one_hot, dtype=tf.float32)

# FC layer: flatten to rows of `hidden_size`, then a linear layer
# (activation_fn=None) producing `num_classes` scores per row.
X_for_fc = tf.reshape(outputs, [-1, hidden_size])
outputs = tf.contrib.layers.fully_connected(X_for_fc,
                                            num_classes,
                                            activation_fn=None)

# reshape out for sequence_loss
# NOTE(review): this uses a fixed `batch_size` although X has a None batch
# dimension - confirm every fed batch has exactly `batch_size` rows.
outputs = tf.reshape(outputs, [batch_size, sequence_length, num_classes])

# All weights are 1 (equal weights)
weights = tf.ones([batch_size, sequence_length])
    with tf.device('/gpu:0'):

        input_data = tf.placeholder(tf.int32, shape=[batch_size, num_steps])
        target = tf.placeholder(tf.int32, shape=[batch_size, num_steps])
        keep_prob = tf.placeholder(tf.float32)
        embedding = tf.get_variable("embedding", [word_vocab_size, rnn_size])
        inputs = tf.nn.embedding_lookup(embedding, input_data)

        def rnn_cell():
            """Return a BasicLSTMCell wrapped in variational-recurrent output dropout."""
            lstm = rnn.BasicLSTMCell(num_hidden_units, reuse=False)
            return tf.contrib.rnn.DropoutWrapper(
                lstm,
                output_keep_prob=keep_prob,
                variational_recurrent=True,
                dtype=tf.float32)

        cells = rnn.MultiRNNCell(
            [rnn_cell() for _ in range(num_hidden_layers)])
        rnn_initial_state = cells.zero_state(batch_size, dtype=tf.float32)
        print(input_data.get_shape().as_list())
        outputs, final_state = tf.nn.dynamic_rnn(
            cells, inputs, initial_state=rnn_initial_state, dtype=tf.float32)

        outputs = tf.reshape(tf.concat(outputs, 1), [-1, rnn_size])
        softmax_w = tf.get_variable("softmax_w", [rnn_size, word_vocab_size])
        softmax_b = tf.get_variable("softmax_b", [word_vocab_size])

        logits = tf.matmul(outputs, softmax_w) + softmax_b
        logits = tf.reshape(logits, [batch_size, num_steps, word_vocab_size])

        loss = tf.contrib.seq2seq.sequence_loss(logits,
                                                target,
                                                tf.ones(
Exemple #4
0
    def network(self):
        """
        Build the RNN classifier graph.

        Stages: embedding lookup, uni- or bidirectional RNN, dense layer with
        softmax, cross-entropy loss plus L2 terms, and accuracy counters.

        :return: None; results are stored on ``self`` (``logits`` holds the
            softmax probabilities, plus ``loss``, ``predicted``, ``accuracy``
            and ``num_correct``).
        """
        # 1. embedding layer
        with tf.name_scope('embedding'):
            # NOTE(review): when embedding_mat is not None nothing is assigned
            # here; presumably embedded_chars is prepared elsewhere - confirm.
            if self.embedding_mat is None:
                self.Embedding = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_dims],
                                                               -1., 1.), name='Embedding')
                self.embedded_chars = tf.nn.embedding_lookup(self.Embedding, self.input_x)

        # 2. RNN hidden layer
        with tf.name_scope('rnn'):
            if self.cell.startswith("bi"):
                cell_fw, cell_bw = self.bi_dir_rnn()
                if self.num_layer > 1:
                    # NOTE(review): the same cell object is repeated for every
                    # layer; confirm the TF version in use accepts this.
                    cell_fw = rnn.MultiRNNCell([cell_fw] * self.num_layer, state_is_tuple=True)
                    cell_bw = rnn.MultiRNNCell([cell_bw] * self.num_layer, state_is_tuple=True)

                outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, self.embedded_chars,
                                                             dtype=tf.float32)

                # Concatenate forward and backward outputs along the feature
                # axis: [None, time_step, hidden_dims * 2]
                outputs = tf.concat(outputs, axis=2)
            else:
                cells = self.witch_cell()
                if self.num_layer > 1:
                    cells = rnn.MultiRNNCell([cells] * self.num_layer, state_is_tuple=True)

                # outputs: [batch, timestep_size, hidden_size]
                # state: [layer_num, 2, batch_size, hidden_size]
                outputs, _ = tf.nn.dynamic_rnn(cells, self.embedded_chars, dtype=tf.float32)
            # Output at the last time step; the slice drops the time axis, so
            # the result is [batch, features].
            h_state = outputs[:, -1, :]

        # 3. FC and softmax layer
        with tf.name_scope('output'):
            if self.cell.startswith('bi'):
                self.W = tf.Variable(tf.truncated_normal([self.hidden_unit * 2, self.num_tags], stddev=0.1),
                                     dtype=tf.float32, name='W')
            else:
                self.W = tf.Variable(tf.truncated_normal([self.hidden_unit, self.num_tags], stddev=0.1),
                                     dtype=tf.float32, name='W')
            self.b = tf.Variable(tf.constant(0.1, shape=[self.num_tags]), dtype=tf.float32, name='b')

            # Keep the unnormalised scores for the loss; self.logits stays the
            # softmax output, as callers expect.
            scores = tf.matmul(h_state, self.W) + self.b
            self.logits = tf.nn.softmax(scores)

        # 4. loss
        with tf.name_scope('loss'):
            # The cross-entropy op applies softmax itself, so it must be given
            # the unnormalised scores rather than probabilities.
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            # l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()
            #                     if 'bias' not in v.name]) * self.l2_reg_lambda

            # self.l2_loss is read before being updated, so it must already
            # exist on the instance.
            self.l2_loss += tf.nn.l2_loss(self.W)
            self.l2_loss += tf.nn.l2_loss(self.b)
            self.loss += self.l2_loss
        # 5. accuracy
        with tf.name_scope('accuracy'):
            # True where the predicted class equals the labelled class.
            self.predicted = tf.equal(tf.argmax(self.logits, 1),
                                      tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(self.predicted, dtype=tf.float32))

        with tf.name_scope('num_prediction'):
            self.num_correct = tf.reduce_sum(tf.cast(self.predicted, dtype=tf.float32), name='num_correct')
Exemple #5
0
    def __init__(self, args, training=True):
        """Assemble the RNN language-model graph.

        Args:
            args: configuration object; reads ``model``, ``num_layers``,
                ``rnn_size``, ``batch_size``, ``seq_length``, ``vocab_size``,
                ``input_keep_prob``, ``output_keep_prob`` and ``grad_clip``.
            training: when false, ``args.batch_size`` and ``args.seq_length``
                are overwritten with 1, no dropout is added, and the decoder
                feeds each step the embedding of the previous argmax symbol.

        Raises:
            Exception: if ``args.model`` is not 'rnn', 'gru', 'lstm' or 'nas'.
        """
        self.args = args
        if not training:
            # Sampling works on one sequence, one token at a time.
            args.batch_size = 1
            args.seq_length = 1

        if args.model == 'rnn':
            cell_fn = rnn.BasicRNNCell
        elif args.model == 'gru':
            cell_fn = rnn.GRUCell
        elif args.model == 'lstm':
            cell_fn = rnn.BasicLSTMCell
        elif args.model == 'nas':
            cell_fn = rnn.NASCell
        else:
            raise Exception("model type not supported: {}".format(args.model))

        def build_layer():
            """Create one recurrent layer, adding dropout only while training."""
            layer = cell_fn(args.rnn_size)
            if not training:
                return layer
            if args.output_keep_prob < 1.0 or args.input_keep_prob < 1.0:
                layer = rnn.DropoutWrapper(
                    layer,
                    input_keep_prob=args.input_keep_prob,
                    output_keep_prob=args.output_keep_prob)
            return layer

        stacked = rnn.MultiRNNCell(
            [build_layer() for _ in range(args.num_layers)],
            state_is_tuple=True)
        self.cell = stacked

        batch_shape = [args.batch_size, args.seq_length]
        self.input_data = tf.placeholder(tf.int32, batch_shape)
        self.targets = tf.placeholder(tf.int32, batch_shape)
        self.initial_state = stacked.zero_state(args.batch_size, tf.float32)

        # Output projection parameters live in the 'rnnlm' variable scope.
        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w",
                                        [args.rnn_size, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])

        embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
        embedded = tf.nn.embedding_lookup(embedding, self.input_data)

        # dropout beta testing: double check which one should affect next line
        if training and args.output_keep_prob:
            embedded = tf.nn.dropout(embedded, args.output_keep_prob)

        # One tensor per time step, with the time axis squeezed away.
        step_inputs = [
            tf.squeeze(step, [1])
            for step in tf.split(embedded, args.seq_length, 1)
        ]

        def feed_previous(prev, _):
            """Embed the argmax symbol of the previous decoder output."""
            scores = tf.matmul(prev, softmax_w) + softmax_b
            symbol = tf.stop_gradient(tf.argmax(scores, 1))
            return tf.nn.embedding_lookup(embedding, symbol)

        outputs, last_state = legacy_seq2seq.rnn_decoder(
            step_inputs,
            self.initial_state,
            stacked,
            loop_function=None if training else feed_previous,
            scope='rnnlm')
        flat_output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])

        self.logits = tf.matmul(flat_output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)

        flat_targets = tf.reshape(self.targets, [-1])
        unit_weights = tf.ones([args.batch_size * args.seq_length])
        loss = legacy_seq2seq.sequence_loss_by_example(
            [self.logits], [flat_targets], [unit_weights])
        with tf.name_scope('cost'):
            # Mean loss per token.
            self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        self.final_state = last_state

        # The learning-rate variable starts at 0.0.
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        clipped_grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, tvars), args.grad_clip)
        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(clipped_grads, tvars))

        # instrument tensorboard
        tf.summary.histogram('logits', self.logits)
        tf.summary.histogram('loss', loss)
        tf.summary.scalar('train_loss', self.cost)
Exemple #6
0
    def rnn_estimator(x, y):
        """Encode `x` with the configured RNN and hand the encoding to the target predictor.

        `x` is first passed through `input_op_fn`. The cell class is chosen
        by `cell_type`; each layer is wrapped in an attention cell when
        `attn_length` is set. Raises ValueError for an unsupported
        `cell_type`.
        """
        x = input_op_fn(x)

        # TODO(ipolosukhin): state_is_tuple=False is deprecated
        if cell_type == 'rnn':
            cell_fn = contrib_rnn.BasicRNNCell
        elif cell_type == 'gru':
            cell_fn = contrib_rnn.GRUCell
        elif cell_type == 'lstm':
            cell_fn = functools.partial(contrib_rnn.BasicLSTMCell,
                                        state_is_tuple=False)
        else:
            raise ValueError(
                'cell_type {} is not supported. '.format(cell_type))

        def build_layer():
            """Create one layer, attention-wrapped when attn_length is given."""
            base = cell_fn(rnn_size)
            if attn_length is None:
                return base
            return contrib_rnn.AttentionCellWrapper(
                base,
                attn_length=attn_length,
                attn_size=attn_size,
                attn_vec_size=attn_vec_size,
                state_is_tuple=False)

        def build_stack():
            """Create a `num_layers`-deep multi-layer cell of fresh layers."""
            layers = [build_layer() for _ in range(num_layers)]
            return contrib_rnn.MultiRNNCell(layers, state_is_tuple=False)

        if not bidirectional:
            _, encoding = contrib_rnn.static_rnn(
                build_stack(),
                x,
                dtype=dtypes.float32,
                sequence_length=sequence_length,
                initial_state=initial_state)
            return target_predictor_fn(encoding, y)

        # Forward stack is built first, then the backward stack.
        rnn_fw_cell = build_stack()
        rnn_bw_cell = build_stack()
        # pylint: disable=unexpected-keyword-arg, no-value-for-parameter
        _, encoding = bidirectional_rnn(rnn_fw_cell,
                                        rnn_bw_cell,
                                        x,
                                        dtype=dtypes.float32,
                                        sequence_length=sequence_length,
                                        initial_state_fw=initial_state,
                                        initial_state_bw=initial_state)
        return target_predictor_fn(encoding, y)
Exemple #7
0
# Transaction-fee constants (names suggest ETH and ETC fees).
eth_transaction_fee = tf.constant(0.001)
etc_transaction_fee = tf.constant(0.01)
# Hyper-parameters.
SEQLEN = 30
BATCHSIZE = 200
INTERNALSIZE = 512  # units per GRU layer
NLAYERS = 3  # number of stacked GRU layers
learning_rate = 0.001
# inputs/outputs
# NOTE(review): X is declared rank-1 ([None]) but is passed to dynamic_rnn
# below; confirm the intended input shape.
X = tf.placeholder(tf.float32, [None], name="X")
Y_ = tf.placeholder(tf.float32, [None], name="Y_")

# making the multirnn_gru_cell
# Hin is the incoming state; with state_is_tuple=False the per-layer states
# are concatenated, hence the width INTERNALSIZE * NLAYERS.
Hin = tf.placeholder(tf.float32, [None, INTERNALSIZE * NLAYERS], name='Hin')
# [ BATCHSIZE, INTERNALSIZE * NLAYERS]
cells = [rnn.GRUCell(INTERNALSIZE) for _ in range(NLAYERS)]
multicell = rnn.MultiRNNCell(cells, state_is_tuple=False)
Yr, H = tf.nn.dynamic_rnn(multicell, X, dtype=tf.float32, initial_state=Hin)
# Give the final state a stable name so it can be fetched as 'H'.
H = tf.identity(H, name='H')

# checkpoints dir
if not os.path.exists("checkpoints"):
    os.mkdir("checkpoints")
saver = tf.train.Saver(max_to_keep=1000)

# init
# initial zero input state
istate = np.zeros([BATCHSIZE, INTERNALSIZE * NLAYERS])
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
step = 0
Exemple #8
0
    def add_graph(self, noyear=False, feedforward=False):
        """
        Creates the graph for the model.

        Generates placeholders for X_word, X_year, Y_label and the embedding
        matrix, creates the year embedding, builds either a feed-forward
        layer or a stacked LSTM, then computes accuracy, log perplexity and
        loss and an Adam train step that minimizes the loss.

        parameters:
            noyear: a boolean; when true the year embedding is replaced by
                zeros, so year information does not reach the model
            feedforward: a boolean; true selects a single dense sigmoid layer
                of LAYERS[0] units, false a stacked LSTM with one layer per
                entry of LAYERS
        """
        # Creates placeholders for LSTM
        self.X_word = tf.placeholder(tf.int32, [None, MAX_SENT_LENGTH])
        self.X_year = tf.placeholder(tf.int32, [None])
        self.Y_label = tf.placeholder(tf.int32, [None, MAX_SENT_LENGTH])
        self.embedding_matrix = tf.placeholder(tf.float32,
                                               [MAX_THRESHOLD, EMBED_DIM])

        # Looks up embeddings for each word
        X_word = tf.nn.embedding_lookup(self.embedding_matrix, self.X_word)

        # Creates year embedding: years are shifted to start at 0 and the
        # index is repeated for every position of the sentence.
        new_years = tf.subtract(self.X_year, START_YEAR)
        unembedded_year = tf.tile(tf.expand_dims(new_years, axis=1),
                                  [1, MAX_SENT_LENGTH])
        self.year_embed_mat = tf.get_variable(
            name="year_embed_mat",
            shape=(NUM_YEAR, EMBED_DIM),
            initializer=tf.contrib.layers.xavier_initializer())

        embedded_year = tf.nn.embedding_lookup(self.year_embed_mat,
                                               unembedded_year)
        if noyear:
            embedded_year = tf.zeros_like(embedded_year)

        # Concatenates X_word and year embedding to get single combined input
        X = tf.concat([X_word, embedded_year], axis=2)

        if feedforward:
            # Implements Feed-Forward
            H = tf.layers.dense(inputs=X,
                                units=LAYERS[0],
                                activation=tf.nn.sigmoid)

        else:
            # Implements LSTM
            rnn_layers = [rnn.LSTMCell(size) for size in LAYERS]
            multi_rnn_cell = rnn.MultiRNNCell(rnn_layers)

            H, _ = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                     inputs=X,
                                     dtype=tf.float32)

        # POS tags
        # NOTE(review): activation_fn is left at the library default, which
        # the tf.contrib.layers docs give as ReLU, so these scores would be
        # non-negative - confirm that is intended for logits.
        self.Y = tf.contrib.layers.fully_connected(
            inputs=H,
            num_outputs=N_POS,
        )

        # Calculates accuracy
        equal = tf.equal(tf.cast(tf.argmax(self.Y, axis=2), tf.int32),
                         tf.cast(self.Y_label, tf.int32))
        self.acc = tf.reduce_mean(tf.cast(equal, tf.float32))
        self.vec_acc = tf.reduce_mean(tf.cast(equal, tf.float32), axis=1)

        # Calculates perplexity.  The log-probability of the labelled tag is
        # taken from log_softmax directly: computing log(softmax(...)) turns
        # into -inf as soon as a probability underflows to 0.
        mask = tf.cast(tf.one_hot(self.Y_label, N_POS), tf.float32)
        log_p = tf.reduce_sum(tf.nn.log_softmax(self.Y) * mask, axis=2)
        self.log_perp = -tf.reduce_sum(log_p, axis=1) / MAX_SENT_LENGTH
        self.perp = tf.exp(self.log_perp)

        # Calculates loss
        self.loss = tf.losses.sparse_softmax_cross_entropy(
            labels=self.Y_label,
            logits=self.Y,
        )

        # Sets train_step that uses AdamOptimizer to minimize loss
        self.train_step = tf.train.AdamOptimizer(LR).minimize(self.loss)
Exemple #9
0
# Trainable parameters: 'in' projects the raw features to n_hidden,
# 'out' maps the RNN outputs to n_outputs.
weights = {
    'in': tf.Variable(tf.random_normal([n_input, n_hidden])),
    'out': tf.Variable(tf.random_normal([n_hidden, n_outputs])),
}
biases = {
    'in': tf.Variable(tf.constant(0.1, shape=[n_hidden, ])),
    'out': tf.Variable(tf.constant(0.1, shape=[n_outputs, ])),
}

X = train_x
Y = train_y
test = test_x

# Input projection, applied to every time step at once.
# NOTE(review): `x` and `y` are not defined in this part of the script;
# presumably they are placeholders created earlier - confirm.
w_in = weights['in']
b_in = biases['in']
inputs = tf.reshape(x, [-1, n_input])
input_rnn = tf.matmul(inputs, w_in) + b_in
input_rnn = tf.reshape(input_rnn, [-1, time_step, n_hidden])

# Stacked LSTM over the projected inputs.  The projection must be what is
# fed to the RNN; otherwise weights['in'] / biases['in'] never affect the
# prediction and receive no gradient.
lstm_cells = [rnn.LSTMCell(n_hidden, forget_bias=1.0) for _ in range(n_layers)]
lstm = rnn.MultiRNNCell(lstm_cells)
outputs, states = tf.nn.dynamic_rnn(lstm,
                                    inputs=input_rnn,
                                    dtype=tf.float32,
                                    time_major=False)

# Output layer, applied to every time step.
outputs = tf.reshape(outputs, [-1, n_hidden])
w_out = weights['out']
b_out = biases['out']
pred = tf.matmul(outputs, w_out) + b_out

# Loss function: mean squared error between flattened predictions and targets.
loss = tf.reduce_mean(tf.square(tf.reshape(pred, [-1]) - tf.reshape(y, [-1])))
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
        # Repeat training (this comment originally said 10000 times; the loop below runs 1000 iterations)
    for i in range(1000):
        step=0
        start=0
        end=start+batch_size
Exemple #10
0
# Target placeholder: float32, any number of rows, 3 values per row.
labels = tf.placeholder(tf.float32, shape=(None, 3), name='labels')


def feed_dict(X, asp, lx, y):
    """Build the feed dictionary for one batch.

    Maps each per-time-step placeholder ``inputs[t]`` to ``X[t]`` for
    ``t`` in ``range(dm.max_seq_len)``, the ``asp_inputs`` placeholder to
    ``asp`` and the ``labels`` placeholder to ``y``.

    Args:
        X: indexable by time step.
        asp: value fed to ``asp_inputs``.
        lx: unused; kept so existing call sites keep working.
        y: value fed to ``labels``.

    Returns:
        dict mapping placeholders to the values to feed.
    """
    fd = {inputs[t]: X[t] for t in range(dm.max_seq_len)}
    fd[asp_inputs] = asp
    fd[labels] = y
    return fd


# Prototype LSTM layer with `cell_num` units; every stacked layer is an
# independent deep copy of it, and `cell` ends up as the multi-layer cell.
cell = rnn.BasicLSTMCell(cell_num)
cells = [deepcopy(cell) for _ in range(layer_num)]
cell = rnn.MultiRNNCell(cells)

with tf.name_scope('embedding'):
    if dm.use_pretrained_embedding:
        pre_trained_emb = embedding_frame.dropna().values.astype('float32')
        pre_trained_embedding = tf.get_variable(
            name="pre_trained_embedding",
            shape=pre_trained_emb.shape,
            initializer=tf.constant_initializer(pre_trained_emb),
            trainable=True)
        pad_embedding = tf.get_variable('pad_embedding',
                                        (dm.start_idx, embedding_size),
                                        dtype=tf.float32,
                                        initializer=initializer)
        embedding = tf.concat([pad_embedding, pre_trained_embedding],
                              axis=0,
Exemple #11
0
 def multi_cell():
     """Return a MultiRNNCell of `hyper.num_layer` freshly built single cells."""
     layers = [single_cell() for _ in range(hyper.num_layer)]
     return rnn.MultiRNNCell(layers)
# One int32 value per example; passed as `sequence_length` to the
# bidirectional RNN further down.
l = tf.placeholder(tf.int32, [None])
# Output layer parameters; the 2 * n_hidden input width matches the
# concatenated forward and backward outputs.
weights = tf.Variable(tf.random_normal([n_hidden * 2, n_classes]))
biases = tf.Variable(tf.random_normal([n_classes]))
'''构建Graph'''


def GRU_cell():
    """Return a GRU cell of `n_hidden` units with dropout applied to its output."""
    return rnn.DropoutWrapper(
        rnn.GRUCell(n_hidden, reuse=tf.get_variable_scope().reuse),
        output_keep_prob=keep_prob)


# Convert x into the list of n_steps per-time-step tensors that the static
# RNN API expects: move time to the front, flatten, then split.
# assumes x is (batch, n_steps, n_input) - TODO confirm
inputs = tf.transpose(x, [1, 0, 2])
inputs = tf.reshape(inputs, [-1, n_input])
inputs = tf.split(inputs, n_steps)
# ** 1. Build the forward and backward multi-layer cells (GRU cells here,
# **    although the original comment called them LSTM)
cell_fw = rnn.MultiRNNCell([GRU_cell() for _ in range(layer_num)],
                           state_is_tuple=True)
cell_bw = rnn.MultiRNNCell([GRU_cell() for _ in range(layer_num)],
                           state_is_tuple=True)
# ** 2. Initial (zero) states
initial_state_fw = cell_fw.zero_state(batch_size, tf.float32)
initial_state_bw = cell_bw.zero_state(batch_size, tf.float32)
# ** 3. Bidirectional RNN computation (TensorFlow's built-in wrapper)
outputs, _, _ = rnn.static_bidirectional_rnn(cell_fw,
                                             cell_bw,
                                             inputs,
                                             initial_state_fw=initial_state_fw,
                                             initial_state_bw=initial_state_bw,
                                             dtype=tf.float32,
                                             sequence_length=l)
# Flatten the per-step outputs to rows of 2 * n_hidden and apply the output layer.
output = tf.reshape(tf.concat(outputs, 1), [-1, 2 * n_hidden])
logits = tf.matmul(output, weights) + biases
Exemple #13
0
def main():
    """Build, train and evaluate a stacked-LSTM classifier on MNIST batches.

    The graph is a multi-layer LSTM unrolled by hand over ``n_steps`` time
    steps, followed by a softmax read-out of the last step's output. The
    function trains for 2000 iterations, printing the training loss and
    accuracy every 200 iterations, then reports loss and accuracy on the
    first ``n_batch_size`` test examples.

    Relies on module-level names: ``n_steps``, ``n_input``, ``n_classes``,
    ``n_hidden``, ``n_layers``, ``n_batch_size``, ``keep_prob``, ``lr``,
    ``batch_size`` and ``mnist``.
    """
    # Step 1: the RNN input has shape (n_batch_size, timestep_size, n_input).
    # Batches are fed flattened, one row of n_steps * n_input values each.
    X = tf.placeholder(tf.float32, shape=(None, n_steps * n_input), name="X")
    y = tf.placeholder(tf.float32, shape=(None, n_classes), name="y")
    # Restore the time axis so a single time step can be sliced out below.
    X_seq = tf.reshape(X, [-1, n_steps, n_input])

    def lstm_cell(n_hidden, keep_prob):
        """Return one LSTM cell of ``n_hidden`` units with output dropout."""
        # Step 2: define one LSTM cell; only n_hidden is needed, the input
        # width is matched automatically.
        cell = rnn.LSTMCell(num_units=n_hidden,
                            initializer=tf.random_uniform_initializer(-0.1,
                                                                      0.1,
                                                                      seed=2),
                            forget_bias=1.0,
                            state_is_tuple=True)
        # Step 3: add a dropout layer; usually only output_keep_prob is set.
        cell = rnn.DropoutWrapper(cell=cell,
                                  input_keep_prob=1.0,
                                  output_keep_prob=keep_prob)
        return cell

    # Step 4: use MultiRNNCell to build the multi-layer LSTM.
    cells = [lstm_cell(n_hidden, keep_prob) for _ in range(n_layers)]
    mlstm_cell = rnn.MultiRNNCell(cells, state_is_tuple=True)
    # Step 5: initialise the state with zeros via zero_state.
    init_state = mlstm_cell.zero_state(n_batch_size, dtype=tf.float32)

    # Step 6, option 1: let dynamic_rnn() run the network.
    # With time_major == False, outputs.shape = [n_batch_size, timestep_size, n_hidden],
    # so h_state = outputs[:, -1, :] can be taken as the final output.
    # state.shape = [layer_num, 2, n_batch_size, n_hidden],
    # so alternatively h_state = state[-1][1].
    # The final output has shape [n_batch_size, n_hidden].
    #
    # Step 6, option 2 (used here): unroll over the time steps by hand to
    # show how the LSTM iterates.
    outputs = list()
    # `state` holds the state of every LSTM layer and is carried from one
    # time step to the next, starting from the zero state.
    state = init_state
    with tf.variable_scope('RNN'):
        for timestep in range(n_steps):
            if timestep > 0:
                # Every time step after the first shares the same weights.
                tf.get_variable_scope().reuse_variables()
            # Feed only the current time step's slice together with the
            # state produced by the previous step.
            cell_output, state = mlstm_cell.call(X_seq[:, timestep, :], state)
            outputs.append(cell_output)
    h_state = outputs[-1]

    # The LSTM output above has width n_hidden; for classification a softmax
    # layer is added on top. First define its weight matrix and bias.
    W = tf.Variable(tf.truncated_normal([n_hidden, n_classes]),
                    dtype=tf.float32)
    bias = tf.Variable(tf.constant(0.1, shape=[n_classes]), dtype=tf.float32)
    logits = tf.matmul(h_state, W) + bias
    y_ = tf.nn.softmax(logits)

    # Loss and evaluation. The cross-entropy op applies softmax internally,
    # so it receives the unscaled logits rather than the probabilities y_.
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))
    train_op = tf.train.AdamOptimizer(lr).minimize(cost)

    correct_prediction = tf.equal(tf.argmax(y_, 1), tf.argmax(y, 1))
    acc = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    # Start training and testing.
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        for i in range(2000):

            train_X, train_y = mnist.train.next_batch(n_batch_size)
            session.run(train_op,
                        feed_dict={
                            X: train_X,
                            y: train_y,
                            keep_prob: 0.5
                        })
            if (i + 1) % 200 == 0:
                # Re-evaluate the same batch with dropout disabled.
                train_acc, train_loss = session.run([acc, cost],
                                                    feed_dict={
                                                        X: train_X,
                                                        y: train_y,
                                                        keep_prob: 1.0
                                                    })
                # Number of completed epochs: mnist.train.epochs_completed
                print("Iter {}, step {}, loss {:6f}, train acc {}".format(
                    mnist.train.epochs_completed, (i + 1), train_loss,
                    train_acc))
        print("\nevaluation model")
        # The zero state was built for exactly n_batch_size rows, so the test
        # slice uses the same size.
        test_X = mnist.test.images[:n_batch_size]
        test_y = mnist.test.labels[:n_batch_size]
        # Compute the accuracy on the test data.
        # NOTE(review): `batch_size` is not defined in this function and is
        # not fed during training; confirm it is a module-level placeholder.
        test_acc, test_loss = session.run([acc, cost],
                                          feed_dict={
                                              X: test_X,
                                              y: test_y,
                                              keep_prob: 1.0,
                                              batch_size: n_batch_size
                                          })
        print("test acc {},test loss {}".format(test_acc, test_loss))
Exemple #14
0
 def add_rnn(layer_count, hidden_size, cell=rnn.BasicLSTMCell, activation=tf.tanh):
     """Stack ``layer_count`` recurrent cells into one ``rnn.MultiRNNCell``.

     Args:
         layer_count: number of cells to stack.
         hidden_size: size passed to every cell constructor (the original
             note gives 5 as an example value).
         cell: cell class, called as ``cell(hidden_size, activation=...)``.
         activation: activation function handed to every cell.

     Returns:
         The ``rnn.MultiRNNCell`` wrapping the created cells.
     """
     stacked = []
     for _ in range(layer_count):
         stacked.append(cell(hidden_size, activation=activation))
     return rnn.MultiRNNCell(stacked)
Exemple #15
0
    def __init__(self, params, training=True):
        """Build the stacked-LSTM language-model graph.

        Args:
            params: hyper-parameter object; this method reads ``num_layers``,
                ``rnn_size``, ``batch_size``, ``seq_length``, ``vocab_size``
                and ``grad_clip``. When ``training`` is false, ``batch_size``
                and ``seq_length`` are overwritten with 1 on the object that
                was passed in.
            training: when false, the decoder feeds its own arg-max
                prediction back in as the next input (see ``loop``).

        Sets ``cell``, ``input_data``, ``targets``, ``initial_state``,
        ``logits``, ``probs``, ``cost``, ``final_state``, ``lr`` and
        ``train_op`` on the instance and registers three summaries.
        """
        if not training:
            params.batch_size = 1
            params.seq_length = 1

        cells = []
        for _ in range(params.num_layers):
            cell = rnn.BasicLSTMCell(params.rnn_size)
            cells.append(cell)

        self.cell = cell = rnn.MultiRNNCell(cells, state_is_tuple=True)

        self.input_data = tf.placeholder(
            tf.int32, [params.batch_size, params.seq_length])
        self.targets = tf.placeholder(tf.int32,
                                      [params.batch_size, params.seq_length])
        self.initial_state = cell.zero_state(params.batch_size, tf.float32)

        with tf.variable_scope('lstm_lm'):
            softmax_w = tf.get_variable("softmax_w",
                                        [params.rnn_size, params.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [params.vocab_size])

        embedding = tf.get_variable("embedding",
                                    [params.vocab_size, params.rnn_size])
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        # rnn_decoder expects a list with one tensor per time step: split
        # along axis 1 and drop the resulting size-1 axis.
        inputs = tf.split(inputs, params.seq_length, 1)
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        def loop(prev, _):
            """Map the previous output to the embedding of its arg-max symbol."""
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        outputs, last_state = legacy_seq2seq.rnn_decoder(
            inputs,
            self.initial_state,
            cell,
            loop_function=loop if not training else None,
            scope='lstm_lm')
        output = tf.reshape(tf.concat(outputs, 1), [-1, params.rnn_size])

        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        # All positions are weighted equally (weights of ones).
        loss = legacy_seq2seq.sequence_loss_by_example(
            [self.logits], [tf.reshape(self.targets, [-1])],
            [tf.ones([params.batch_size * params.seq_length])])
        # Summed loss divided by batch size and sequence length. It is built
        # once, inside the 'cost' name scope; the original built an identical
        # unused copy outside the scope first.
        with tf.name_scope('cost'):
            self.cost = (tf.reduce_sum(loss) /
                         params.batch_size) / params.seq_length
        self.final_state = last_state
        # Created at 0.0 and non-trainable, so the learning rate is not
        # changed by the optimizer itself.
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          params.grad_clip)
        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        tf.summary.histogram('logits', self.logits)
        tf.summary.histogram('loss', loss)
        tf.summary.scalar('train_loss', self.cost)
Exemple #16
0
# Sizes used by the X / Y placeholders defined later in this script.
numInputs = 2
numOutputs = 1
timesteps = 1

resultSet = []

# Output projection applied to the last RNN output in LSTM(); numHidden2 is
# the size of the top cell in the stack below.
w = tf.Variable(tf.truncated_normal([numHidden2, numOutputs]))

b = tf.Variable(tf.random_normal([numOutputs]))

lstm = rnn.LSTMCell(numHidden, state_is_tuple=True)
lstm2 = rnn.LSTMCell(numHidden2, state_is_tuple=True)

# NOTE(review): lstm3 is created but not part of the stack below and is not
# used anywhere in the visible code — confirm whether it is needed.
lstm3 = rnn.LSTMCell(numHidden2, state_is_tuple=True)
# Two-layer stack: lstm feeds lstm2.
cell = rnn.MultiRNNCell([lstm, lstm2])


def LSTM(X):
    """Run the module-level ``cell`` over ``X`` and project the last step.

    The ``dynamic_rnn`` outputs have their first two axes swapped so that
    index ``-1`` selects the final time step; that slice is multiplied by
    ``w``, offset by ``b`` and passed through ``tanh``.
    """
    rnn_out, _ = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)
    steps_first = tf.transpose(rnn_out, (1, 0, 2))
    last_step = steps_first[-1]
    return tf.tanh(tf.matmul(last_step, w) + b)


# Input placeholder: any number of rows, each `timesteps` x `numInputs`.
X = tf.placeholder(tf.float32, [None, timesteps, numInputs])
# Target placeholder: any number of rows, each with `numOutputs` values.
Y = tf.placeholder(tf.float32, [None, numOutputs])
def build_stacked_gru_model(
    embedding_layer,
    partial_sequence_length,
    gru_hidden_sizes,
    num_output_features,
    bidirectional):
  """Predicts next production rule from partial sequence with stacked GRUs.

  The final states of all GRU layers (of both directions when
  bidirectional) are concatenated along axis 1 and passed through a dense
  layer named 'logits'.

  Args:
    embedding_layer: Float32 tensor with shape
        [batch_size, max_length, num_features]. Input to the model.
    partial_sequence_length: Int32 tensor with shape [batch_size].
        This tensor is used for sequence_length in tf.nn.dynamic_rnn().
    gru_hidden_sizes: List of integers, number of units for each GRU layer.
    num_output_features: Integer, the number of output features.
    bidirectional: Boolean, whether to use bidirectional RNN.

  Returns:
    Float tensor with shape [batch_size, num_output_features]
  """

  def _stacked_gru():
    """Returns a new MultiRNNCell with one GRUCell per hidden size."""
    return contrib_rnn.MultiRNNCell(
        [tf.nn.rnn_cell.GRUCell(size) for size in gru_hidden_sizes])

  with tf.variable_scope('stacked_gru_model'):
    forward_stacked_gru = _stacked_gru()
    if not bidirectional:
      _, final_states = tf.nn.dynamic_rnn(
          cell=forward_stacked_gru,
          inputs=embedding_layer,
          sequence_length=partial_sequence_length,
          dtype=embedding_layer.dtype,
          time_major=False)
    else:
      backward_stacked_gru = _stacked_gru()
      _, (forward_states, backward_states) = tf.nn.bidirectional_dynamic_rnn(
          cell_fw=forward_stacked_gru,
          cell_bw=backward_stacked_gru,
          inputs=embedding_layer,
          sequence_length=partial_sequence_length,
          dtype=embedding_layer.dtype,
          time_major=False)
      # Each direction yields a tuple with one state per GRU layer; adding
      # the tuples flattens them into
      # (forward_gru_0, ..., backward_gru_0, ...).
      final_states = forward_states + backward_states

    concat_final_states = tf.concat(
        final_states, axis=1, name='concatenate_gru_final_states')

    return tf.layers.dense(
        concat_final_states, num_output_features, name='logits')
Exemple #18
0
def get_rnn_cell(hparams=None, mode=None):
    """Creates an RNN cell.

    See :func:`~texar.core.default_rnn_cell_hparams` for all
    hyperparameters and default values.

    Args:
        hparams (dict or HParams, optional): Cell hyperparameters. Missing
            hyperparameters are set to default values.
        mode (optional): A Tensor taking value in
            :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including
            `TRAIN`, `EVAL`, and `PREDICT`. If `None`, dropout will be
            controlled by :func:`texar.global_mode`.

    Returns:
        A cell instance: a ``MultiRNNCell`` when ``num_layers`` > 1,
        otherwise the single (possibly wrapped) cell.

    Raises:
        ValueError: If hparams["num_layers"]>1 and hparams["type"] is a class
            instance.
        ValueError: If ``variational_recurrent`` is set and the length of
            the dropout ``input_size`` differs from ``num_layers``.
        ValueError: The cell is not an
            :tf_main:`RNNCell <contrib/rnn/RNNCell>` instance.
    """
    if hparams is None or isinstance(hparams, dict):
        hparams = HParams(hparams, default_rnn_cell_hparams())

    dropout_hp = hparams["dropout"]
    if dropout_hp["variational_recurrent"] and \
            len(dropout_hp["input_size"]) != hparams["num_layers"]:
        raise ValueError(
            "If variational_recurrent=True, input_size must be a list of "
            "num_layers(%d) integers. Got len(input_size)=%d." %
            (hparams["num_layers"], len(dropout_hp["input_size"])))

    layers = []
    cell_kwargs = hparams["kwargs"].todict()
    num_layers = hparams["num_layers"]
    keep_prob_names = (
        "input_keep_prob", "output_keep_prob", "state_keep_prob")
    for layer_i in range(num_layers):
        # Build the bare cell. A ready-made cell instance cannot be reused
        # for several layers, so it is only accepted for a single layer.
        cell_type = hparams["type"]
        given_instance = \
            not is_str(cell_type) and not isinstance(cell_type, type)
        if given_instance and num_layers > 1:
            raise ValueError(
                "If 'num_layers'>1, then 'type' must be a cell class or "
                "its name/module path, rather than a cell instance.")
        cell_modules = ['tensorflow.contrib.rnn', 'texar.custom']
        layer = utils.check_or_get_instance(
            cell_type, cell_kwargs, cell_modules, rnn.RNNCell)

        # Wrap with dropout only if at least one keep probability is < 1.
        if any(dropout_hp[name] < 1.0 for name in keep_prob_names):
            wrapper_kwargs = {}
            if dropout_hp["variational_recurrent"]:
                wrapper_kwargs.update(
                    variational_recurrent=True,
                    input_size=dropout_hp["input_size"][layer_i],
                    dtype=tf.float32)
            keep_in = switch_dropout(dropout_hp["input_keep_prob"], mode)
            keep_out = switch_dropout(dropout_hp["output_keep_prob"], mode)
            keep_state = switch_dropout(dropout_hp["state_keep_prob"], mode)
            layer = rnn.DropoutWrapper(
                cell=layer,
                input_keep_prob=keep_in,
                output_keep_prob=keep_out,
                state_keep_prob=keep_state,
                **wrapper_kwargs)

        # Residual / highway connections are applied to every layer except
        # the first one.
        if layer_i > 0:
            if hparams["residual"]:
                layer = rnn.ResidualWrapper(layer)
            if hparams["highway"]:
                layer = rnn.HighwayWrapper(layer)

        layers.append(layer)

    if hparams["num_layers"] > 1:
        return rnn.MultiRNNCell(layers)
    return layers[0]
Exemple #19
0
    def __init__(self, args, infer=False):
        """Build the recurrent language-model graph.

        Args:
            args: hyper-parameter object; this method reads ``model``
                ('rnn', 'gru' or 'lstm'), ``num_layers``, ``rnn_size``,
                ``batch_size``, ``seq_length``, ``vocab_size`` and
                ``grad_clip``. When ``infer`` is true, ``batch_size`` and
                ``seq_length`` are overwritten with 1 on the object that
                was passed in.
            infer: when true, the decoder feeds its own arg-max prediction
                back in as the next input (see ``loop``).

        Raises:
            Exception: if ``args.model`` is not one of the supported types.
        """
        self.args = args
        if infer:
            args.batch_size = 1
            args.seq_length = 1

        if args.model == 'rnn':
            cell_fn = rnn.BasicRNNCell
        elif args.model == 'gru':
            cell_fn = rnn.GRUCell
        elif args.model == 'lstm':
            cell_fn = rnn.BasicLSTMCell
        else:
            raise Exception("model type not supported: {}".format(args.model))

        cells = []
        for _ in range(args.num_layers):
            cell = cell_fn(args.rnn_size)
            cells.append(cell)

        self.cell = cell = rnn.MultiRNNCell(cells)

        self.input_data = tf.placeholder(tf.int32,
                                         [args.batch_size, args.seq_length])
        self.targets = tf.placeholder(tf.int32,
                                      [args.batch_size, args.seq_length])
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)
        # Non-trainable bookkeeping variables kept in the graph.
        self.batch_pointer = tf.Variable(0,
                                         name="batch_pointer",
                                         trainable=False,
                                         dtype=tf.int32)
        self.inc_batch_pointer_op = tf.assign(self.batch_pointer,
                                              self.batch_pointer + 1)
        self.epoch_pointer = tf.Variable(0,
                                         name="epoch_pointer",
                                         trainable=False)
        self.batch_time = tf.Variable(0.0, name="batch_time", trainable=False)
        tf.summary.scalar("time_batch", self.batch_time)

        def variable_summaries(var):
            """Register mean, max and min scalar summaries for ``var``."""
            # stddev and histogram summaries were disabled in the original.
            with tf.name_scope('summaries'):
                mean = tf.reduce_mean(var)
                tf.summary.scalar('mean', mean)
                tf.summary.scalar('max', tf.reduce_max(var))
                tf.summary.scalar('min', tf.reduce_min(var))

        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w",
                                        [args.rnn_size, args.vocab_size])
            variable_summaries(softmax_w)
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
            variable_summaries(softmax_b)
            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding",
                                            [args.vocab_size, args.rnn_size])
                # rnn_decoder expects a list with one tensor per time step:
                # split along axis 1 and drop the resulting size-1 axis.
                inputs = tf.split(
                    tf.nn.embedding_lookup(embedding, self.input_data),
                    args.seq_length, 1)
                inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        def loop(prev, _):
            """Map the previous output to the embedding of its arg-max symbol."""
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        outputs, last_state = legacy_seq2seq.rnn_decoder(
            inputs,
            self.initial_state,
            cell,
            loop_function=loop if infer else None,
            scope='rnnlm')
        output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        # The fourth positional parameter of sequence_loss_by_example is
        # `average_across_timesteps`, not a vocabulary size. The original
        # passed args.vocab_size there, where it only acted as a truthy flag
        # (the default), so the stray argument is dropped.
        loss = legacy_seq2seq.sequence_loss_by_example(
            [self.logits], [tf.reshape(self.targets, [-1])],
            [tf.ones([args.batch_size * args.seq_length])])
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        tf.summary.scalar("cost", self.cost)
        self.final_state = last_state
        # Created at 0.0 and non-trainable, so the learning rate is not
        # changed by the optimizer itself.
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          args.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
Exemple #20
0
def getLayeredCell(layer_size, num_units, input_keep_prob,
        output_keep_prob=1.0):
    """Return a ``MultiRNNCell`` of ``layer_size`` dropout-wrapped LSTM cells.

    Every layer is a new ``BasicLSTMCell`` of ``num_units`` units wrapped
    in a ``DropoutWrapper`` with the given input/output keep probabilities.
    """
    layers = []
    for _ in range(layer_size):
        lstm = rnn.BasicLSTMCell(num_units)
        layers.append(rnn.DropoutWrapper(lstm,
                                         input_keep_prob=input_keep_prob,
                                         output_keep_prob=output_keep_prob))
    return rnn.MultiRNNCell(layers)
def main(_):
    """Train a character-level multi-layer GRU language model.

    Reads the training and validation text from ``FLAGS.text_dir``, builds
    the graph, then loops over minibatches. Training summaries are written
    every batch; a training-batch comparison, a validation pass, a sample of
    generated text and a checkpoint are produced periodically.

    Relies on module-level names ``FLAGS``, ``txt``, ``ALPHASIZE``,
    ``INTERNALSIZE`` and ``NLAYERS``. The positional argument is unused.
    """

    # load data, either shakespeare, or the Python source of Tensorflow itself
    shakedir = FLAGS.text_dir
    # shakedir = "../tensorflow/**/*.py"
    codetext, valitext, bookranges = txt.read_data_files(shakedir,
                                                         validation=True)

    # display some stats on the data
    epoch_size = len(codetext) // (FLAGS.train_batch_size * FLAGS.seqlen)
    txt.print_data_stats(len(codetext), len(valitext), epoch_size)

    #
    # the model (see FAQ in README.md)
    #
    lr = tf.placeholder(tf.float32, name='lr')  # learning rate
    pkeep = tf.placeholder(tf.float32, name='pkeep')  # dropout parameter
    batchsize = tf.placeholder(tf.int32, name='batchsize')

    # inputs
    X = tf.placeholder(tf.uint8, [None, None],
                       name='X')  # [ BATCHSIZE, FLAGS.seqlen ]
    Xo = tf.one_hot(X, ALPHASIZE, 1.0,
                    0.0)  # [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ]
    # expected outputs = same sequence shifted by 1 since we are trying to predict the next character
    Y_ = tf.placeholder(tf.uint8, [None, None],
                        name='Y_')  # [ BATCHSIZE, FLAGS.seqlen ]
    Yo_ = tf.one_hot(Y_, ALPHASIZE, 1.0,
                     0.0)  # [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ]
    # input state
    Hin = tf.placeholder(tf.float32, [None, INTERNALSIZE * NLAYERS],
                         name='Hin')  # [ BATCHSIZE, INTERNALSIZE * NLAYERS]

    # using a NLAYERS=3 layers of GRU cells, unrolled FLAGS.seqlen=30 times
    # dynamic_rnn infers FLAGS.seqlen from the size of the inputs Xo

    # Each layer gets its own cell object. Repeating a single instance with
    # `[cell] * NLAYERS` would place the same cell in every layer, although
    # the first layer receives ALPHASIZE-wide inputs and the other layers
    # receive INTERNALSIZE-wide inputs.
    cells = [rnn.GRUCell(INTERNALSIZE) for _ in range(NLAYERS)]
    dropcells = [
        rnn.DropoutWrapper(cell, input_keep_prob=pkeep) for cell in cells
    ]
    multicell = rnn.MultiRNNCell(dropcells, state_is_tuple=False)
    multicell = rnn.DropoutWrapper(multicell, output_keep_prob=pkeep)
    Yr, H = tf.nn.dynamic_rnn(multicell,
                              Xo,
                              dtype=tf.float32,
                              initial_state=Hin)
    # Yr: [ BATCHSIZE, FLAGS.seqlen, INTERNALSIZE ]
    # H:  [ BATCHSIZE, INTERNALSIZE*NLAYERS ] # this is the last state in the sequence

    H = tf.identity(H, name='H')  # just to give it a name

    # Softmax layer implementation:
    # Flatten the first two dimension of the output [ BATCHSIZE, FLAGS.seqlen, ALPHASIZE ] => [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    # then apply softmax readout layer. This way, the weights and biases are shared across unrolled time steps.
    # From the readout point of view, a value coming from a cell or a minibatch is the same thing

    Yflat = tf.reshape(
        Yr, [-1, INTERNALSIZE])  # [ BATCHSIZE x FLAGS.seqlen, INTERNALSIZE ]
    Ylogits = layers.linear(
        Yflat, ALPHASIZE)  # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    Yflat_ = tf.reshape(
        Yo_, [-1, ALPHASIZE])  # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    loss = tf.nn.softmax_cross_entropy_with_logits(
        logits=Ylogits, labels=Yflat_)  # [ BATCHSIZE x FLAGS.seqlen ]
    loss = tf.reshape(loss, [batchsize, -1])  # [ BATCHSIZE, FLAGS.seqlen ]
    Yo = tf.nn.softmax(Ylogits,
                       name='Yo')  # [ BATCHSIZE x FLAGS.seqlen, ALPHASIZE ]
    Y = tf.argmax(Yo, 1)  # [ BATCHSIZE x FLAGS.seqlen ]
    Y = tf.reshape(Y, [batchsize, -1], name="Y")  # [ BATCHSIZE, FLAGS.seqlen ]
    train_step = tf.train.AdamOptimizer(lr).minimize(loss)

    # stats for display
    seqloss = tf.reduce_mean(loss, 1)
    batchloss = tf.reduce_mean(seqloss)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(Y_, tf.cast(Y, tf.uint8)), tf.float32))
    loss_summary = tf.summary.scalar("batch_loss", batchloss)
    acc_summary = tf.summary.scalar("batch_accuracy", accuracy)
    summaries = tf.summary.merge([loss_summary, acc_summary])

    # Init Tensorboard stuff. This will save Tensorboard information into a different
    # folder at each run named 'log/<timestamp>/'. Two sets of data are saved so that
    # you can compare training and validation curves visually in Tensorboard.
    timestamp = str(math.trunc(time.time()))
    summary_writer = tf.summary.FileWriter(
        os.path.join(FLAGS.summaries_dir, timestamp + "-training"))
    validation_writer = tf.summary.FileWriter(
        os.path.join(FLAGS.summaries_dir, timestamp + "-validation"))

    # Init for saving models. They will be saved into a directory named 'checkpoints'.
    # Only the last checkpoint is kept.
    if not os.path.exists(FLAGS.checkpoint_dir):
        os.mkdir(FLAGS.checkpoint_dir)
    saver = tf.train.Saver(max_to_keep=1)

    # for display: init the progress bar
    DISPLAY_FREQ = 50
    # `step` counts characters (it grows by batch size * seqlen per batch),
    # so this is the number of characters seen in DISPLAY_FREQ batches.
    _50_BATCHES = DISPLAY_FREQ * FLAGS.train_batch_size * FLAGS.seqlen
    progress = txt.Progress(DISPLAY_FREQ,
                            size=111 + 2,
                            msg="Training on next " + str(DISPLAY_FREQ) +
                            " batches")

    # init
    istate = np.zeros([FLAGS.train_batch_size,
                       INTERNALSIZE * NLAYERS])  # initial zero input state
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    step = 0

    # training loop
    for x, y_, epoch in txt.rnn_minibatch_sequencer(codetext,
                                                    FLAGS.train_batch_size,
                                                    FLAGS.seqlen,
                                                    nb_epochs=1000):

        # train on one minibatch
        feed_dict = {
            X: x,
            Y_: y_,
            Hin: istate,
            lr: FLAGS.learning_rate,
            pkeep: FLAGS.dropout_pkeep,
            batchsize: FLAGS.train_batch_size
        }
        _, y, ostate, smm = sess.run([train_step, Y, H, summaries],
                                     feed_dict=feed_dict)

        # save training data for Tensorboard
        summary_writer.add_summary(smm, step)

        # display a visual validation of progress (every 50 batches)
        if step % _50_BATCHES == 0:
            feed_dict = {
                X: x,
                Y_: y_,
                Hin: istate,
                pkeep: 1.0,
                batchsize: FLAGS.train_batch_size
            }  # no dropout for validation
            y, l, bl, acc = sess.run([Y, seqloss, batchloss, accuracy],
                                     feed_dict=feed_dict)
            txt.print_learning_learned_comparison(x, y, l, bookranges, bl, acc,
                                                  epoch_size, step, epoch)

        # run a validation step every 50 batches
        # The validation text should be a single sequence but that's too slow (1s per 1024 chars!),
        # so we cut it up and batch the pieces (slightly inaccurate)
        # tested: validating with 5K sequences instead of 1K is only slightly more accurate, but a lot slower.
        if step % _50_BATCHES == 0 and len(valitext) > 0:
            VALI_SEQLEN = 1 * 1024  # Sequence length for validation. State will be wrong at the start of each sequence.
            bsize = len(valitext) // VALI_SEQLEN
            txt.print_validation_header(len(codetext), bookranges)
            vali_x, vali_y, _ = next(
                txt.rnn_minibatch_sequencer(valitext, bsize, VALI_SEQLEN,
                                            1))  # all data in 1 batch
            vali_nullstate = np.zeros([bsize, INTERNALSIZE * NLAYERS])
            feed_dict = {
                X: vali_x,
                Y_: vali_y,
                Hin: vali_nullstate,
                pkeep: 1.0,  # no dropout for validation
                batchsize: bsize
            }
            ls, acc, smm = sess.run([batchloss, accuracy, summaries],
                                    feed_dict=feed_dict)
            txt.print_validation_stats(ls, acc)
            # save validation data for Tensorboard
            validation_writer.add_summary(smm, step)

        # display a short text generated with the current weights and biases (every 150 batches)
        if step // 3 % _50_BATCHES == 0:
            txt.print_text_generation_header()
            ry = np.array([[txt.convert_from_alphabet(ord("K"))]])
            rh = np.zeros([1, INTERNALSIZE * NLAYERS])
            for k in range(1000):
                ryo, rh = sess.run([Yo, H],
                                   feed_dict={
                                       X: ry,
                                       pkeep: 1.0,
                                       Hin: rh,
                                       batchsize: 1
                                   })
                rc = txt.sample_from_probabilities(
                    ryo, topn=10 if epoch <= 1 else 2)
                print(chr(txt.convert_to_alphabet(rc)), end="")
                ry = np.array([[rc]])
            txt.print_text_generation_footer()

        # save a checkpoint (every 500 batches)
        if step // 10 % _50_BATCHES == 0:
            saver.save(sess,
                       FLAGS.checkpoint_dir + '/rnn_train_' + timestamp,
                       global_step=step)

        # display progress bar
        progress.step(reset=step % _50_BATCHES == 0)

        # loop state around
        istate = ostate
        step += FLAGS.train_batch_size * FLAGS.seqlen
Exemple #22
0
    def build_graph(self):
        configs = self.trainingManager.configs

        # shared between train and test
        self.keep_prop_tf = tf.placeholder(dtype=tf.float32, name="keep_prop_tf")

        # repeat it stacked_layers
        encoder_dropcells_fw = [tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(configs.internal_state_encoder), output_keep_prob=self.keep_prop_tf) for _ in range(configs.stacked_layers)]
        encoder_multi_cell_fw = rnn.MultiRNNCell(encoder_dropcells_fw, state_is_tuple=True)

        encoder_dropcells_bw = [tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(configs.internal_state_encoder), output_keep_prob=self.keep_prop_tf) for _ in range(configs.stacked_layers)]
        encoder_multi_cell_bw = rnn.MultiRNNCell(encoder_dropcells_bw, state_is_tuple=True)

        decoder_dropcells = [tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(configs.internal_state_decoder), output_keep_prob=self.keep_prop_tf) for _ in range(configs.stacked_layers)]
        decoder_multi_cell = rnn.MultiRNNCell(decoder_dropcells, state_is_tuple=True)

        with tf.variable_scope('train'):
            # input placeholders
            self.encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')  # [en_seq_len, batch  ]
            self.encoder_inputs_length = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_inputs_length')  # [batch]

            self.decoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_inputs')  # [de_seq_len, batch] starts with KOKO_START token
            self.decoder_inputs_length = tf.placeholder(shape=(None,), dtype=tf.int32, name='decoder_inputs_length')  # [batch]  IMPORTANT NOTE : decoder_inputs_length = counts the start token
            self.decoder_outputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_outputs')  # [de_seq_len, batch ] = ground truth padded at the end with zeros

            # embedding lookup or one hot encoding of my languages (encoder / decoder)
            if configs.use_embedding:
                encoder_embeddings = tf.Variable(tf.random_uniform([configs.vocabulary_size_encoder, configs.encoder_embedding_size], -1.0, 1.0), dtype=tf.float32)
                encoder_inputs_to_rnn = tf.nn.embedding_lookup(encoder_embeddings, self.encoder_inputs)  # [  sequence_length,batch_size, encoder_embedding_size ] # embedded

                decoder_embeddings = tf.Variable(tf.random_uniform([configs.vocabulary_size_decoder, configs.decoder_embedding_size], -1.0, 1.0), dtype=tf.float32)
                decoder_inputs_to_rnn = tf.nn.embedding_lookup(decoder_embeddings, self.decoder_inputs)  # [  sequence_length,batch_size, decoder_embedding_size ] # embedded

            else:
                encoder_inputs_to_rnn = tf.one_hot(self.encoder_inputs, configs.vocabulary_size_encoder, 1.0, 0.0)  # [  sequence_length,batch_size, vocabulary_size ]  # one hot encoded
                decoder_inputs_to_rnn = tf.one_hot(self.decoder_inputs, configs.vocabulary_size_decoder, 1.0, 0.0)  # [  sequence_length,batch_size, vocabulary_size ]  # one hot encoded

            (self.encoder_fw_outputs, self.encoder_bw_outputs), (encoder_fw_final_state, encoder_bw_final_state) = \
                tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_multi_cell_fw,
                                                cell_bw=encoder_multi_cell_bw,
                                                inputs=encoder_inputs_to_rnn,
                                                sequence_length=self.encoder_inputs_length,
                                                dtype=tf.float32,
                                                time_major=True)
            # outputs :[sequence_length, batch_size, internal_state_encoder]
            # final_state:[batch_size, internal_state_encoder] as tuple repeated stack times
            # this is my thought vector = [batch_size, internal_state_encoder(fw)+internal_state_encoder(bw)]

            # i will re-feed this thought vector at inference
            self.decoder_init_state_from_encoder = tuple([tf.concat((encoder_fw_final_state[i], encoder_bw_final_state[i]), axis=1) for i in range(configs.stacked_layers)])  # state is tuple for decoder input

            # decoder dynamic rnn
            self.decoder_states_outputs, self.decoder_final_state = tf.nn.dynamic_rnn(decoder_multi_cell,
                                                                                      inputs=decoder_inputs_to_rnn,
                                                                                      initial_state=self.decoder_init_state_from_encoder,
                                                                                      time_major=True,
                                                                                      sequence_length=self.decoder_inputs_length)
            # decoder_states_outputs :[sequence_length, batch_size, internal_state_decoder]
            # decoder_final_state :[batch_size, internal_state_decoder] as tuple repeated stack times

            decoder_logits = tf.layers.dense(self.decoder_states_outputs, units=configs.vocabulary_size_decoder, use_bias=True)  # projection on the vocabulary outputs : [sequence_length, batch_size, vocabulary_size_decoder]
            self.dec_probabilities = tf.nn.softmax(decoder_logits)  # [sequence_length, batch_size, vocabulary_size_decoder]

            # the triangle has the decoder shape not the encoder !!!!
            lower_triangular_ones = tf.constant(np.tril(np.ones([configs.max_seq_len_decoder, configs.max_seq_len_decoder])), dtype=tf.float32)  # lower triangle ones [max_seq_len_encoder,max_seq_len_encoder] >> [[1. 0.],[1. 1.]]
            _, batch_size_tf = tf.unstack(tf.shape(self.encoder_inputs))  # seq_length , batch_size

            seqlen_mask = tf.transpose(tf.slice(tf.gather(lower_triangular_ones, self.decoder_inputs_length - 1), begin=[0, 0], size=[batch_size_tf, tf.reduce_max(self.decoder_inputs_length)]))  # so you need to take length -1 due to lower triangle ones [sequence_length, batch_size]

            # connect outputs to
            with tf.name_scope("optimization"):
                # Loss function
                self.loss = tf.contrib.seq2seq.sequence_loss(decoder_logits, self.decoder_outputs, seqlen_mask)  # sparse softmax cross entropy

                # Optimizer
                self.train_step = tf.train.RMSPropOptimizer(configs.learning_rate).minimize(self.loss)

            # To calculate the number correct, this means we don't count the padded as correct
            correct = tf.cast(tf.equal(tf.cast(tf.argmax(decoder_logits, 2), tf.int32), self.decoder_outputs), dtype=tf.float32) * seqlen_mask
            self.accuracy = tf.reduce_sum(correct) / tf.reduce_sum(seqlen_mask)

            # summary tensors
            if not self.trainingManager.is_local_env:
                loss_summary = tf.summary.scalar("batch_loss", self.loss)
                acc_summary = tf.summary.scalar("batch_accuracy", self.accuracy)
                self.summaries = tf.summary.merge([loss_summary, acc_summary])
Exemple #23
0
# One-hot vector for the symbol `o` (index 3 of 4).
o = [0, 0, 0, 1]
# Batch of three sequences, five symbols each; h, e and l are the symbol
# vectors defined earlier in the script (presumably one-hot like `o` -- confirm).
x_data = np.asarray(
    [
        [h, e, l, l, o],
        [e, o, l, l, l],
        [l, l, e, e, l],
    ],
    dtype=np.float32,
)
# with tf.variable_scope('initial_state') as scope:
#     batch_size = 3

#     pp.pprint(x_data)
#
#     # One cell RNN input_dim (4) -> output_dim (5). sequence: 5, batch: 3
#     hidden_size = 2
#     cell = rnn.BasicLSTMCell(num_units=hidden_size, state_is_tuple=True)
#     initial_state = cell.zero_state(batch_size, tf.float32)
#     outputs, _states = tf.nn.dynamic_rnn(cell, x_data,
#                                          initial_state=initial_state, dtype=tf.float32)
#     sess.run(tf.global_variables_initializer())
#     pp.pprint(outputs.eval())

with tf.variable_scope('MultiRNNCell') as scope:
    def lstm_cell():
        """Return a new 5-unit BasicLSTMCell that keeps its state as a tuple."""
        return rnn.BasicLSTMCell(5, state_is_tuple=True)

    # Three stacked LSTM layers, each produced by its own lstm_cell() call.
    cells = rnn.MultiRNNCell([lstm_cell(), lstm_cell(), lstm_cell()],
                             state_is_tuple=True)
    # Unroll the stack over x_data; only the per-step outputs are inspected.
    outputs, _states = tf.nn.dynamic_rnn(cells, x_data, dtype=tf.float32)
    print("dynamic rnn: ", outputs)
    # Variables must be initialised before the outputs can be evaluated.
    sess.run(tf.global_variables_initializer())
    pp.pprint(outputs.eval())  # batch size, unrolling (time), hidden_size
Exemple #24
0
    def __init__(self, params):
        """Build the TF1 graph of a recurrent sequence classifier.

        The default graph is reset, then placeholders, the recurrent cell
        selected by ``params["cell_type"]``, a softmax projection of the last
        time step, the loss, the Adam training op and summaries are created.

        :param params: dict of hyper-parameters. Keys read here: ``n_steps``,
            ``n_input``, ``n_units``, ``n_classes``, ``batch_size``,
            ``n_layers``, ``cell_type``, ``learning_rate`` and, optionally,
            ``n_weights`` (only used when ``cell_type == 'Custom'``).
        :raises KeyError: if one of the required keys is missing.
        """

        self.params = params
        n_steps = params["n_steps"]
        n_input = params["n_input"]
        n_units = params["n_units"]
        n_classes = params["n_classes"]
        batch_size = params["batch_size"]
        # Example configuration:
        # "n_steps": 128,
        # "n_input": 128,
        # "n_units": 128,
        # "n_classes": 6,
        # "batch_size": 100,
        # "n_epochs": 50,
        # "learning_rate": 0.0003,
        # "display_step": 1,
        # "run_mode": "/cpu:0",
        # "split_png_data": "/Users/jw/Desktop/audio_data/1484131952_256_0.5/split_png_data/CASIA"

        tf.reset_default_graph()
        with tf.get_default_graph().as_default():
            with tf.name_scope("placeholder"):
                # Flat input rows are reshaped to [batch, n_steps, n_input].
                self.x = tf.placeholder("float", [None, n_steps * n_input],
                                        name="x")
                self.input = tf.reshape(self.x, [-1, n_steps, n_input])
                self.y = tf.placeholder("float", [None, n_classes], name="y")
                self.keep_prob = tf.placeholder(tf.float32)

            with tf.variable_scope("softmax"):
                # Projection from the last RNN output to class logits.
                weights = tf.Variable(tf.random_normal([n_units, n_classes]),
                                      name='weights')
                biases = tf.Variable(tf.random_normal([n_classes]),
                                     name='biases')

            # Every sequence in the batch is unrolled for the full n_steps.
            sequence_length = np.full([batch_size], n_steps, dtype=int)

            state_size = n_units
            num_layers = self.params["n_layers"]
            cell_type = self.params["cell_type"]
            num_weights_for_custom_cell = self.params.get("n_weights")

            if cell_type == 'Custom':
                # Fix: the custom cell used to be built and then immediately
                # overwritten by a stacked LSTM, so 'Custom' silently behaved
                # exactly like 'LSTM'. Keep the cell that was asked for.
                cell = CustomCell(state_size, num_weights_for_custom_cell)
            elif cell_type == 'GRU':
                cell = rnn.GRUCell(state_size)
            elif cell_type == 'LSTM':
                # num_layers LSTM layers, each with dropout on its inputs.
                cell = rnn.MultiRNNCell([
                    rnn.DropoutWrapper(rnn.LSTMCell(state_size,
                                                    state_is_tuple=True),
                                       input_keep_prob=self.keep_prob)
                    for _ in range(num_layers)
                ])
            elif cell_type == 'LN_LSTM':
                cell = LayerNormalizedLSTMCell(state_size)
            else:
                # Any other value falls back to a plain RNN cell.
                cell = rnn.BasicRNNCell(state_size)

            # Dropout on the outputs of whichever cell was selected.
            cell = rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)

            self.init_state = cell.zero_state(batch_size, dtype=tf.float32)
            outputs, self.final_state = tf.nn.dynamic_rnn(
                cell,
                self.input,
                dtype=tf.float32,
                initial_state=self.init_state,
                sequence_length=sequence_length)
            # outputs is [batch_size, time_step, state_size]; switch to
            # time-major so that outputs[-1] is the last time step.
            outputs = tf.transpose(outputs, [1, 0, 2])

            pred = tf.matmul(outputs[-1], weights) + biases
            self.cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(logits=pred,
                                                        labels=self.y))
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=params['learning_rate']).minimize(self.cost)

            # Fraction of examples whose arg-max prediction matches the label.
            correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(self.y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

            tf.summary.scalar("cost", self.cost)
            tf.summary.scalar("accuracy", self.accuracy)
            self.merge_summary_op = tf.summary.merge_all()

            logger.info("模型构建完毕")
Exemple #25
0
# View each flat input row as a 28 x 28 sequence (28 steps of 28 values).
X = tf.reshape(_X, shape=[-1, 28, 28])
# Target placeholder with one column per class.
y = tf.placeholder(dtype=tf.float32, shape=[None, class_num])
# Keep-probability fed to the dropout wrappers at run time.
keep_prob = tf.placeholder(dtype=tf.float32)


def unit_lstm():
    """Return one LSTM layer with dropout applied to its outputs only.

    The cell has ``hidden_size`` units; inputs are never dropped
    (``input_keep_prob=1.0``) while outputs are kept with probability
    ``keep_prob``.
    """
    base_cell = rnn.BasicLSTMCell(hidden_size,
                                  forget_bias=1.0,
                                  state_is_tuple=True)
    return rnn.DropoutWrapper(base_cell,
                              input_keep_prob=1.0,
                              output_keep_prob=keep_prob)


# Three stacked LSTM layers, each built by a separate unit_lstm() call.
mlstm_cell = rnn.MultiRNNCell([unit_lstm() for _ in range(3)],
                              state_is_tuple=True)

init_state = mlstm_cell.zero_state(batch_size, dtype=tf.float32)
outputs, state = tf.nn.dynamic_rnn(mlstm_cell,
                                   inputs=X,
                                   initial_state=init_state,
                                   time_major=False)
# Batch-major outputs: keep only the last time step for classification.
h_state = outputs[:, -1, :]

W = tf.Variable(tf.truncated_normal([hidden_size, class_num], stddev=0.1),
                dtype=tf.float32)
bias = tf.Variable(tf.constant(0.1, shape=[class_num]), dtype=tf.float32)
y_pre = tf.nn.softmax(tf.matmul(h_state, W) + bias)

# Clip the probabilities before the log: a softmax output that underflows to
# 0 would otherwise give log(0) = -inf and turn the loss into NaN.
cross_entropy = -tf.reduce_mean(
    y * tf.log(tf.clip_by_value(y_pre, 1e-10, 1.0)))
train_op = tf.train.AdamOptimizer(lr).minimize(cross_entropy)
Exemple #26
0
# Hyper-parameters of the experiment.
time_step = 1
input_num = 8
output_num = 1
epoch_num = 50
batch_size = 72
learning_rate = 0.001
with tf.device('/gpu:0'):
    x = tf.placeholder("float", [None, time_step, input_num])
    y = tf.placeholder("float", [None, output_num])

    def lstm_cell():
        """Return a fresh BasicLSTMCell with ``hidden_num`` units."""
        return rnn.BasicLSTMCell(hidden_num)

    with tf.variable_scope("lstm", reuse=None):
        # layer_num stacked LSTM layers, one new cell per layer.
        Multi_cell = rnn.MultiRNNCell([lstm_cell() for _ in range(layer_num)],
                                      state_is_tuple=True)
        outputs, _ = tf.nn.dynamic_rnn(Multi_cell, x, dtype=tf.float32)
        # Regress from the output of the last time step.
        prediction = tf.layers.dense(inputs=outputs[:, -1, :],
                                     units=output_num)
        # Mean absolute error between prediction and target.
        loss = tf.reduce_mean(tf.abs(prediction - y))
        # Use the configured learning_rate instead of relying on the
        # optimizer's built-in default, so changing the constant has an effect.
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        init = tf.global_variables_initializer()
# allow_soft_placement lets ops fall back to another device when the
# requested '/gpu:0' is not available.
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True)) as sess:
    sess.run(init)
    # Bookkeeping for the training loop that follows.
    loss_epoch = [0] * epoch_num
    xx = 0
    n = 0
    pre = []
    rmse = 0
    start_train = time.time()
      #'cnnoutscale': tf.Variable(tf.ones([2048])),
      #'featbeta': tf.Variable(tf.zeros([4096])),
      #'featscale': tf.Variable(tf.ones([4096])),
      #'gbeta': tf.Variable(tf.zeros([1000])),
      #'gscale': tf.Variable(tf.ones([1000]))
   }

   # question-embedding
   #embed_ques_W = tf.Variable(tf.random_uniform([vocabulary_size, input_embedding_size], -0.08, 0.08), name='embed_ques_W')

   # encoder: RNN body
   lstm_1 = rnn_cell.LSTMCell(rnn_size, input_embedding_size, use_peepholes=True, state_is_tuple=False)
   lstm_dropout_1 = rnn_cell.DropoutWrapper(lstm_1, output_keep_prob = 1 - dropout_rate)
   lstm_2 = rnn_cell.LSTMCell(rnn_size, rnn_size, use_peepholes=True, state_is_tuple=False)
   lstm_dropout_2 = rnn_cell.DropoutWrapper(lstm_2, output_keep_prob = 1 - dropout_rate)
   stacked_lstm = rnn_cell.MultiRNNCell([lstm_dropout_1, lstm_dropout_2], state_is_tuple=False)


   image = tf.placeholder(tf.float32, [batch_size, 2048])
   question = tf.placeholder(tf.int32, [batch_size, max_words_q])
   #answers_true = tf.placeholder(tf.float32, (batch_size, 1000))
   #noise = tf.placeholder(tf.float32, [batch_size, 4096])

   #answers_false = tf.placeholder(tf.float32, (None, 1000))
   #image_false = tf.placeholder(tf.float32, (None, 2048))
   #question_false = tf.placeholder(tf.int32, [batch_size, max_words_q])

      
   #state = tf.zeros([batch_size, stacked_lstm.state_size])
   state = stacked_lstm.zero_state(batch_size, tf.float32)
   loss = 0.0
Exemple #28
0
    def __init__(self,
                 num_emb,
                 batch_size,
                 emb_dim,
                 hidden_dim,
                 sequence_length,
                 start_token,
                 learning_rate=0.01,
                 reward_gamma=0.95):
        """Build the generator graph: sampling, pretraining and reward training.

        Three parts are created inside the ``generator`` variable scope and
        right after it:

        * a sampling loop that feeds every sampled token back as next input
          (``self.gen_x``, ``self.gen_o``);
        * a teacher-forced loop over ``self.x`` producing ``self.g_predictions``
          and ``self.g_logits``, with the supervised ``self.pretrain_loss``;
        * the reward-weighted loss ``self.g_loss`` and its update op.

        Args:
            num_emb: number of rows of the embedding matrix; also the depth of
                the one-hot encodings and the size of the output distribution.
            batch_size: fixed batch size of all placeholders.
            emb_dim: embedding dimension.
            hidden_dim: hidden size of the recurrent unit.
            sequence_length: number of tokens generated / consumed per sample.
            start_token: token id fed as the very first input of each sequence.
            learning_rate: initial value of the non-trainable learning-rate
                variable.
            reward_gamma: stored on the instance; not used in this method.
        """
        self.num_emb = num_emb
        self.batch_size = batch_size
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.sequence_length = sequence_length
        self.start_token = tf.constant([start_token] * self.batch_size,
                                       dtype=tf.int32)
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.reward_gamma = reward_gamma
        self.temperature = 1.0
        self.grad_clip = 5.0

        self.expected_reward = tf.Variable(tf.zeros([self.sequence_length]))

        with tf.variable_scope('generator') as scope:
            self.g_embeddings = tf.Variable(
                self.init_matrix([self.num_emb, self.emb_dim]))
            # maps h_tm1 to h_t for generator
            self.g_recurrent_unit = self.create_recurrent_unit()
            # maps h_t to o_t (output token logits)
            self.g_output_unit = self.create_output_unit()

            # placeholder definition
            # sequence of indices of true data, not including start token
            self.x = tf.placeholder(
                tf.int32, shape=[self.batch_size, self.sequence_length])

            # get from rollout policy and discriminator
            self.rewards = tf.placeholder(
                tf.float32, shape=[self.batch_size, self.sequence_length])

            # processed for batch
            with tf.device("/cpu:0"):
                inputs = tf.split(axis=1,
                                  num_or_size_splits=self.sequence_length,
                                  value=tf.nn.embedding_lookup(
                                      self.g_embeddings, self.x))
                self.processed_x = tf.stack([
                    tf.squeeze(input_, [1]) for input_ in inputs
                ])  # seq_length x batch_size x emb_dim

            cell = rnn.BasicLSTMCell(self.hidden_dim, state_is_tuple=True)
            self.cell = rnn.MultiRNNCell([cell] * 2, state_is_tuple=True)

            # Only the zero state of the stacked cell is used here; the two
            # earlier tf.zeros / tf.stack assignments to self.h0 were dead
            # stores that this line immediately overwrote, so they are gone.
            self.h0 = self.cell.zero_state(self.batch_size, tf.float32)

            gen_o = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                 size=self.sequence_length,
                                                 dynamic_size=False,
                                                 infer_shape=True)
            gen_x = tensor_array_ops.TensorArray(dtype=tf.int32,
                                                 size=self.sequence_length,
                                                 dynamic_size=False,
                                                 infer_shape=True)

            def _g_recurrence(x_t, h_tm1, gen_o, gen_x):
                """One sampling step; `i` is the index of the enclosing loop."""
                h_t = self.g_recurrent_unit(x_t, h_tm1)  # hidden_memory_tuple
                o_t = self.g_output_unit(
                    h_t)  # batch x vocab , logits not prob
                log_prob = tf.log(tf.nn.softmax(o_t))
                next_token = tf.cast(
                    tf.reshape(tf.multinomial(log_prob, 1), [self.batch_size]),
                    tf.int32)
                x_tp1 = tf.nn.embedding_lookup(self.g_embeddings,
                                               next_token)  # batch x emb_dim
                # Probability the model assigned to the token it sampled.
                gen_o = gen_o.write(i,
                                    tf.reduce_sum(
                                        tf.multiply(
                                            tf.one_hot(next_token,
                                                       self.num_emb, 1.0, 0.0),
                                            tf.nn.softmax(o_t)),
                                        1))  # [batch_size] , prob
                gen_x = gen_x.write(i, next_token)  # indices, batch_size
                return x_tp1, h_t, gen_o, gen_x

            # Sampling loop, unrolled in Python.
            initial_state = (tf.zeros([self.batch_size,
                                       self.hidden_dim]), self.h0)

            x_t, h_t, gen_o, gen_x = tf.nn.embedding_lookup(
                self.g_embeddings,
                self.start_token), initial_state, gen_o, gen_x
            for i in range(self.sequence_length):
                # Variables are created on the first step and shared afterwards.
                if i > 0:
                    scope.reuse_variables()
                x_t, h_t, gen_o, gen_x = _g_recurrence(x_t, h_t, gen_o, gen_x)
            self.gen_o, self.gen_x = gen_o, gen_x

            self.gen_x = self.gen_x.stack()  # seq_length x batch_size
            self.gen_x = tf.transpose(self.gen_x,
                                      perm=[1, 0])  # batch_size x seq_length

            # supervised pretraining for generator
            g_predictions = tensor_array_ops.TensorArray(
                dtype=tf.float32,
                size=self.sequence_length,
                dynamic_size=False,
                infer_shape=True)

            g_logits = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                    size=self.sequence_length,
                                                    dynamic_size=False,
                                                    infer_shape=True)

            ta_emb_x = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                    size=self.sequence_length)
            ta_emb_x = ta_emb_x.unstack(self.processed_x)

            def _pretrain_recurrence(x_t, h_tm1, g_predictions, g_logits):
                """One teacher-forced step; `i` is the enclosing loop index."""
                h_t = self.g_recurrent_unit(x_t, h_tm1)
                o_t = self.g_output_unit(h_t)
                g_predictions = g_predictions.write(
                    i, tf.nn.softmax(o_t))  # batch x vocab_size
                g_logits = g_logits.write(i, o_t)  # batch x vocab_size
                # The next input is the ground-truth token, not a sample.
                x_tp1 = ta_emb_x.read(i)
                return x_tp1, h_t, g_predictions, g_logits

            initial_state = (tf.zeros([self.batch_size,
                                       self.hidden_dim]), self.h0)
            # Fix: the state was previously bound to the misspelled name `ht`,
            # so the loop below started from the final state left over by the
            # sampling loop instead of from `initial_state`.
            x_t, h_t = tf.nn.embedding_lookup(self.g_embeddings,
                                              self.start_token), initial_state
            for i in range(self.sequence_length):
                if i > 0:
                    scope.reuse_variables()
                x_t, h_t, g_predictions, g_logits = _pretrain_recurrence(
                    x_t, h_t, g_predictions, g_logits)
            self.g_predictions, self.g_logits = g_predictions, g_logits

            self.g_predictions = tf.transpose(
                self.g_predictions.stack(),
                perm=[1, 0, 2])  # batch_size x seq_length x vocab_size

            self.g_logits = tf.transpose(
                self.g_logits.stack(),
                perm=[1, 0, 2])  # batch_size x seq_length x vocab_size

        # One-hot targets times clipped log-probabilities, one row per
        # (sample, position) pair; shared by both losses below.
        target_one_hot = tf.one_hot(tf.to_int32(tf.reshape(self.x, [-1])),
                                    self.num_emb, 1.0, 0.0)
        # Clipping keeps tf.log away from log(0).
        clipped_log_pred = tf.log(
            tf.clip_by_value(
                tf.reshape(self.g_predictions, [-1, self.num_emb]), 1e-20,
                1.0))
        target_log_likelihood = target_one_hot * clipped_log_pred

        # pretraining loss: mean negative log-likelihood per token
        self.pretrain_loss = -tf.reduce_sum(target_log_likelihood) / (
            self.sequence_length * self.batch_size)

        # training updates
        pretrain_opt = self.g_optimizer(self.learning_rate)

        tvars = tf.trainable_variables()
        g_params = [var for var in tvars if 'generator' in var.name]

        self.pretrain_grad, _ = tf.clip_by_global_norm(
            tf.gradients(self.pretrain_loss, g_params), self.grad_clip)
        self.pretrain_updates = pretrain_opt.apply_gradients(
            zip(self.pretrain_grad, g_params))

        #######################################################################################################
        #  Unsupervised Training
        #######################################################################################################
        # Per-token log-likelihood weighted by the reward of that token.
        self.g_loss = -tf.reduce_sum(
            tf.reduce_sum(target_log_likelihood, 1) *
            tf.reshape(self.rewards, [-1]))

        g_opt = self.g_optimizer(self.learning_rate)

        self.g_grad, _ = tf.clip_by_global_norm(
            tf.gradients(self.g_loss, g_params), self.grad_clip)
        self.g_updates = g_opt.apply_gradients(zip(self.g_grad, g_params))
    def __init__(self,
                 encoder_inputs,
                 encoder_lengths,
                 encoder_inputs_2,
                 encoder_lengths_2,
                 decoder_inputs,
                 decoder_lengths,
                 _embed_ph,
                 learn_rate,
                 start_token,
                 if_test=False,
                 temperature=None,
                 end_rate=1):
        self.start_tokens = tf.constant([start_token] * BATCH_SIZE,
                                        dtype=tf.int32)

        self.learn_rate = learn_rate
        keep_prob = 0.8

        with tf.variable_scope('generator',
                               initializer=tf.orthogonal_initializer()):
            with tf.variable_scope('embedding'):
                self.embedding = tf.get_variable(
                    name='embedding',
                    shape=[param.VOCAB_SIZE, INPUT_DIM],
                    trainable=True)
                self._embed_ph = _embed_ph
                self._embed_init = self.embedding.assign(self._embed_ph)
            self.encoder_inputs = encoder_inputs
            self.encoder_lengths = encoder_lengths
            if param.Use_VAE:
                self.encoder_inputs_2 = encoder_inputs_2
                self.encoder_lengths_2 = encoder_lengths_2

            self.decoder_lengths = decoder_lengths
            self.decoder_inputs = decoder_inputs[:, :-1]
            max_len = tf.shape(decoder_inputs)[-1]

            with tf.variable_scope('cell', initializer=xavier_initializer()):
                self.encoder_cell = rnn.MultiRNNCell([
                    rnn.BasicLSTMCell(_NUM_UNITS, activation=tf.tanh)
                    for _ in range(param.NUM_LAYERS)
                ])
                self.encoder_init_state = self.encoder_cell.zero_state(
                    BATCH_SIZE, dtype=tf.float32)

                _, self.encoder_final_state = tf.nn.dynamic_rnn(
                    cell=self.encoder_cell,
                    initial_state=self.encoder_init_state,
                    inputs=self.encoder_inputs,
                    sequence_length=self.encoder_lengths,
                    scope='encoder')
                if param.Use_VAE:
                    self.encoder_cell_2 = rnn.MultiRNNCell([
                        rnn.BasicLSTMCell(_NUM_UNITS, activation=tf.tanh)
                        for _ in range(param.NUM_LAYERS)
                    ])
                    self.encoder_init_state_2 = self.encoder_cell.zero_state(
                        BATCH_SIZE, dtype=tf.float32)
                    _, self.encoder_final_state_2 = tf.nn.dynamic_rnn(
                        cell=self.encoder_cell_2,
                        initial_state=self.encoder_init_state_2,
                        inputs=tf.nn.embedding_lookup(self.embedding,
                                                      self.decoder_inputs),
                        sequence_length=self.decoder_lengths,
                        scope='encoder_2')

                self.decoder_cell = rnn.MultiRNNCell([
                    rnn.BasicLSTMCell(_NUM_UNITS)
                    for _ in range(param.NUM_LAYERS)
                ])
                self.decoder_cell_drop = tf.contrib.rnn.DropoutWrapper(
                    self.decoder_cell, output_keep_prob=keep_prob)
                self.decoder_init_state = self.decoder_cell_drop.zero_state(
                    BATCH_SIZE, dtype=tf.float32)
                if param.Use_VAE:
                    self.decoder_input_state_2 = self.encoder_final_state_2
                    # self.decoder_input_state = (LSTMStateTuple(self.decoder_input_state[0,0],self.decoder_input_state[0,1]),
                    #                         LSTMStateTuple(self.decoder_input_state[1,0],self.decoder_input_state[1,1]))

                self.decoder_input_state = self.encoder_final_state

                if Use_relational_memory:
                    g_output_unit = create_output_unit(
                        _NUM_UNITS * NUM_LAYERS * 2, param.VOCAB_SIZE)
                    mem_slots = param.mem_slots
                    head_size = param.head_size
                    num_heads = param.num_heads
                    gen_mem = RelationalMemory(mem_slots=mem_slots,
                                               head_size=head_size,
                                               num_heads=num_heads)
                    self.decoder_inputs = tf.pad(self.decoder_inputs,
                                                 [[0, 0], [0, 1]])
                    x_emb = tf.transpose(
                        tf.nn.embedding_lookup(self.embedding,
                                               self.decoder_inputs),
                        perm=[1, 0, 2])  # seq_len x batch_size x emb_dim
                    g_predictions = tensor_array_ops.TensorArray(
                        dtype=tf.float32,
                        size=max_len,
                        dynamic_size=True,
                        infer_shape=True)
                    g_predictions = g_predictions.write(
                        0,
                        tf.one_hot(self.decoder_inputs[:, 0],
                                   param.VOCAB_SIZE))
                    ta_emb_x = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                            size=max_len)
                    ta_emb_x = ta_emb_x.unstack(x_emb)
                    with tf.variable_scope('postprocessing',
                                           initializer=xavier_initializer()):
                        if Post_with_state:
                            self.softmax_w = tf.get_variable(
                                'softmax_w', [
                                    head_size * num_heads + 2 * NUM_UNITS,
                                    param.VOCAB_SIZE
                                ])
                        else:
                            self.softmax_w = tf.get_variable(
                                'softmax_w',
                                [head_size * num_heads, param.VOCAB_SIZE])

                        self.softmax_b = tf.get_variable(
                            'softmax_b', [param.VOCAB_SIZE])
                    # the generator recurrent moddule used for pre-training
                    def _pretrain_recurrence(i, x_t, h_tm1, g_predictions):
                        mem_o_t, h_t = gen_mem(x_t, h_tm1)
                        if Post_with_state:
                            mem_o_t = tf.concat([
                                tf.reshape(mem_o_t,
                                           [-1, head_size * num_heads]),
                                self.decoder_input_state[:, 0, :2 * NUM_UNITS]
                            ], -1)
                        else:
                            mem_o_t = tf.reshape(mem_o_t,
                                                 [-1, head_size * num_heads])
                        o_t = tf.nn.bias_add(tf.matmul(mem_o_t,
                                                       self.softmax_w),
                                             bias=self.softmax_b)
                        #o_t = g_output_unit(mem_o_t)
                        g_predictions = g_predictions.write(
                            i, o_t)  # batch_size x vocab_size
                        x_tp1 = ta_emb_x.read(i)
                        return i + 1, x_tp1, h_t, g_predictions

                    self.decoder_input_state = tf.convert_to_tensor(
                        self.decoder_input_state)
                    self.decoder_input_state = tf.transpose(
                        self.decoder_input_state, perm=[2, 0, 1, 3])
                    self.decoder_input_state = tf.reshape(
                        self.decoder_input_state, [BATCH_SIZE, mem_slots, -1])

                    if param.Use_latent_z:
                        self.decoder_input_state = tf.concat([
                            tf.truncated_normal(
                                shape=self.decoder_input_state.shape),
                            self.decoder_input_state
                        ],
                                                             axis=-1)
                    elif Use_VAE:
                        self.decoder_input_state_2 = tf.convert_to_tensor(
                            self.decoder_input_state_2)
                        self.decoder_input_state_2 = tf.transpose(
                            self.decoder_input_state_2, perm=[2, 0, 1, 3])
                        self.decoder_input_state_2 = tf.reshape(
                            self.decoder_input_state_2, [BATCH_SIZE, -1])
                        self.mn = tf.layers.dense(self.decoder_input_state_2,
                                                  units=NUM_UNITS * 2)
                        self.sd = 0.5 * tf.layers.dense(
                            self.decoder_input_state_2, units=NUM_UNITS * 2)
                        epsilon = tf.random_normal(
                            tf.stack([
                                tf.shape(self.decoder_input_state_2)[0],
                                NUM_UNITS * 2
                            ]))
                        self.decoder_input_state_2 = self.mn + tf.multiply(
                            epsilon, tf.exp(self.sd))
                        self.decoder_input_state_2 = tf.reshape(
                            self.decoder_input_state_2,
                            [BATCH_SIZE, mem_slots, -1])

                        self.decoder_input_state = tf.concat([
                            self.decoder_input_state_2,
                            self.decoder_input_state
                        ],
                                                             axis=-1)

                    # build a graph for outputting sequential tokens
                    _, _, self.decoder_final_state, self.outputs = control_flow_ops.while_loop(
                        cond=lambda i, _1, _2, _3: i < max_len,
                        body=_pretrain_recurrence,
                        loop_vars=(tf.constant(1, dtype=tf.int32),
                                   tf.nn.embedding_lookup(
                                       self.embedding, self.decoder_inputs[:,
                                                                           0]),
                                   self.decoder_input_state, g_predictions))

                    self.logits = tf.transpose(self.outputs.stack()[1:, :, :],
                                               perm=[1, 0, 2])
                else:
                    self.outputs, self.decoder_final_state = tf.nn.dynamic_rnn(
                        cell=self.decoder_cell_drop,
                        initial_state=self.decoder_input_state,  #初始状态,h
                        inputs=tf.nn.embedding_lookup(
                            self.embedding, self.decoder_inputs),  #输入x
                        sequence_length=self.decoder_lengths,
                        dtype=tf.float32,
                        scope='decoder')

            # apply softmax directly here

            if Use_relational_memory:
                pass
            else:
                # self.logits = g_output_unit(tf.reshape(self.outputs, [-1, _NUM_UNITS]))
                with tf.variable_scope('cell/postprocessing',
                                       initializer=xavier_initializer()):
                    self.softmax_w = tf.get_variable(
                        'softmax_w', [_NUM_UNITS, param.VOCAB_SIZE])
                    self.softmax_b = tf.get_variable('softmax_b',
                                                     [param.VOCAB_SIZE])
                self.logits = tf.nn.bias_add(tf.matmul(
                    tf.reshape(self.outputs, [-1, _NUM_UNITS]),
                    self.softmax_w),
                                             bias=self.softmax_b)
            self.probs = tf.reshape(
                tf.nn.softmax(self.logits),
                [BATCH_SIZE, -1, param.VOCAB_SIZE])  #输出应该是一个one_shot向量
            self.labels = tf.one_hot(decoder_inputs[:, 1:],
                                     depth=param.VOCAB_SIZE,
                                     dtype=tf.int32)
            self.right_count = tf.reduce_sum(
                tf.reduce_sum(
                    tf.multiply(
                        tf.one_hot(tf.argmax(self.probs,
                                             -1), param.VOCAB_SIZE),
                        tf.to_float(self.labels)), -1),
                -1) / tf.to_float(max_len)

            self.logits = tf.reshape(self.logits,
                                     [BATCH_SIZE, -1, param.VOCAB_SIZE])
            loss = get_loss(LOSS_TYPE, self.logits, self.labels,
                            self.decoder_lengths, BATCH_SIZE)
            self.loss = loss
            self.original_loss = get_loss(1, self.logits, self.labels,
                                          self.decoder_lengths, BATCH_SIZE)
            # ---------- generate tokens and approximated one-hot results (Adversarial) ---------
            gen_o = tensor_array_ops.TensorArray(dtype=tf.float32,
                                                 size=0,
                                                 dynamic_size=True,
                                                 infer_shape=True)  #the prob
            gen_x = tensor_array_ops.TensorArray(
                dtype=tf.int32, size=0, dynamic_size=True,
                infer_shape=True)  # sampled token
            gen_x_onehot_adv = tensor_array_ops.TensorArray(
                dtype=tf.float32, size=0, dynamic_size=True,
                infer_shape=True)  # generator output (relaxed of gen_x)

            random_start_length = tf.constant(param.start_length,
                                              dtype=tf.int32)

            #            random_start_length = tf.random_uniform(shape = [],minval=0,maxval=sentence_min_len,dtype=tf.int32)
            def _start_recurrence(word_i, gen_o, gen_x, gen_x_onehot_adv):
                """Copy the given token at step `word_i` into all three arrays.

                Column `word_i` of `decoder_inputs` is written to `gen_x`, its
                one-hot encoding to `gen_o`, and the same one-hot multiplied by
                1000000 to `gen_x_onehot_adv`.

                Returns the incremented step index followed by the three
                updated arrays, in the same order as the arguments.
                """
                # Token ids supplied by the caller for this position
                gen_x = gen_x.write(word_i, decoder_inputs[:, word_i])

                # Hard one-hot stands in for the generator output at this step
                hard_onehot = tf.one_hot(decoder_inputs[:, word_i],
                                         param.VOCAB_SIZE, 1.0, 0.0)
                gen_o = gen_o.write(word_i, hard_onehot)

                # Same one-hot, scaled by a large constant
                scaled_onehot = tf.one_hot(decoder_inputs[:, word_i],
                                           param.VOCAB_SIZE, 1.0,
                                           0.0) * 1000000
                gen_x_onehot_adv = gen_x_onehot_adv.write(
                    word_i, scaled_onehot)

                return word_i + 1, gen_o, gen_x, gen_x_onehot_adv

            _, gen_o, gen_x, gen_x_onehot_adv = control_flow_ops.while_loop(
                cond=lambda i, _1, _2, _3: i < random_start_length + 1,
                body=_start_recurrence,
                loop_vars=(tf.constant(0), gen_o, gen_x, gen_x_onehot_adv))

            #temperature = param.temperature
            # the generator recurrent module used for adversarial training
            if Use_relational_memory:
                with tf.variable_scope('cell/postprocessing', reuse=True):
                    if Post_with_state:
                        self.softmax_w = tf.get_variable(
                            'softmax_w', [
                                param.head_size * param.num_heads +
                                2 * NUM_UNITS, param.VOCAB_SIZE
                            ])
                    else:
                        self.softmax_w = tf.get_variable(
                            'softmax_w', [
                                param.head_size * param.num_heads,
                                param.VOCAB_SIZE
                            ])

                    self.softmax_b = tf.get_variable('softmax_b',
                                                     [param.VOCAB_SIZE])
                if param.start_length >= 1:
                    x_emb = tf.transpose(tf.nn.embedding_lookup(
                        self.embedding, self.decoder_inputs),
                                         perm=[1, 0, 2])
                    ta_emb_gen_x = tensor_array_ops.TensorArray(
                        dtype=tf.float32, size=max_len)
                    ta_emb_gen_x = ta_emb_gen_x.unstack(x_emb)

                    # the generator recurrent module used for pre-training
                    def _start_rel_recurrence(i, x_t, h_tm1):
                        """Advance the memory by one teacher-forced step.

                        Feeds `x_t` and the previous state `h_tm1` to
                        `gen_mem`, then reads the embedding stored at index
                        `i` of `ta_emb_gen_x` as the next input.

                        Returns (i + 1, next input embedding, new state).
                        """
                        mem_out, next_state = gen_mem(x_t, h_tm1)

                        # Flatten the memory output; optionally append the
                        # first slot of the decoder input state.
                        features = tf.reshape(mem_out,
                                              [-1, head_size * num_heads])
                        if Post_with_state:
                            features = tf.concat([
                                features,
                                self.decoder_input_state[:, 0, :2 * NUM_UNITS]
                            ], -1)

                        # NOTE(review): this projection is built but never
                        # returned or written anywhere in this function.
                        o_t = tf.nn.bias_add(tf.matmul(features,
                                                       self.softmax_w),
                                             bias=self.softmax_b)

                        x_tp1 = ta_emb_gen_x.read(i)
                        return i + 1, x_tp1, next_state

                    self.decoder_start_word_state = \
                        tf.cond(random_start_length > 0,
                        lambda: (control_flow_ops.while_loop(
                        cond=lambda i, _1, _2: i < random_start_length + 1,
                        body=_start_rel_recurrence,
                        loop_vars=(
                        tf.constant(1, dtype=tf.int32),
                        tf.nn.embedding_lookup(self.embedding, self.decoder_inputs[:,0]),
                        self.decoder_input_state)))[2],
                        lambda: self.decoder_input_state)
                    # _, _, self.decoder_start_word_state = control_flow_ops.while_loop(
                    #     cond=lambda i, _1, _2: i < random_start_length + 1,
                    #     body=_start_rel_recurrence,
                    #     loop_vars=(
                    #     tf.constant(1, dtype=tf.int32),
                    #     tf.nn.embedding_lookup(self.embedding, self.decoder_inputs[:,0]),
                    #     self.decoder_input_state))
                else:
                    self.decoder_start_word_state = self.decoder_input_state
            else:
                with tf.variable_scope('cell', reuse=True):
                    _, self.decoder_start_word_state = tf.nn.dynamic_rnn(
                        cell=self.decoder_cell_drop,
                        initial_state=self.encoder_final_state,  # 初始状态,h
                        inputs=tf.nn.embedding_lookup(
                            self.embedding, self.decoder_inputs),  # 输入x
                        sequence_length=np.ones(BATCH_SIZE) *
                        random_start_length,
                        dtype=tf.float32,
                        scope='decoder')

            def _gen_recurrence(i, x_t, state, gen_o, gen_x, gen_x_onehot_adv):
                """One free-running generation step (body of the while_loop).

                Computes vocabulary logits from the current input embedding
                and recurrent state, picks the next token as the argmax of the
                perturbed logits, records the step in the three TensorArrays
                and returns the embedding of the picked token as next input.

                Args:
                    i: current step; index written in each TensorArray.
                    x_t: embedding of the token fed at this step.
                    state: recurrent state handed to `gen_mem` or the RNN cell.
                    gen_o: TensorArray that receives this step's logits.
                    gen_x: TensorArray that receives the picked token ids.
                    gen_x_onehot_adv: TensorArray that receives the softmax of
                        the temperature-scaled perturbed logits.

                Returns:
                    (i + 1, embedding of the picked token, new state, gen_o,
                    gen_x, gen_x_onehot_adv).
                """
                if Use_relational_memory:
                    mem_o_t, state = gen_mem(x_t, state)  # hidden_memory_tuple
                    # Flatten the memory output; with Post_with_state the first
                    # slot of the decoder input state is appended as features.
                    if Post_with_state:
                        mem_o_t = tf.concat([
                            tf.reshape(mem_o_t, [-1, head_size * num_heads]),
                            self.decoder_input_state[:, 0, :2 * NUM_UNITS]
                        ], -1)
                    else:
                        mem_o_t = tf.reshape(mem_o_t,
                                             [-1, head_size * num_heads])
                    logits = tf.nn.bias_add(tf.matmul(mem_o_t, self.softmax_w),
                                            bias=self.softmax_b)

                    # Multiplicative mask: 1 everywhere except vocabulary
                    # index 2, where it equals end_rate. Presumably index 2 is
                    # the end-of-sequence token -- TODO confirm.
                    pad = np.ones((BATCH_SIZE, VOCAB_SIZE))
                    pad_2 = tf.one_hot(tf.to_int32(
                        tf.constant(np.ones((BATCH_SIZE)) * 2)),
                                       depth=VOCAB_SIZE)
                    pad_2 = tf.multiply(pad_2, end_rate - 1)
                    pad = tf.constant(pad)
                    pad = tf.add(tf.to_float(pad), tf.to_float(pad_2))
                    logits = tf.multiply(logits, pad)

                else:
                    # Single RNN step: static_rnn over a one-element input list
                    with tf.variable_scope('cell', reuse=True):
                        outputs, state = rnn.static_rnn(
                            cell=self.decoder_cell_drop,
                            initial_state=state,
                            inputs=[x_t],  # input x
                            sequence_length=np.ones(BATCH_SIZE),
                            dtype=tf.float32,
                            scope='decoder')
                    # No variable is fetched inside this scope; the projection
                    # uses self.softmax_w / self.softmax_b directly.
                    with tf.variable_scope('postprocessing', reuse=True):
                        logits = tf.nn.bias_add(tf.matmul(
                            tf.reshape(outputs, [-1, _NUM_UNITS]),
                            self.softmax_w),
                                                bias=self.softmax_b)

                # logits = g_output_unit(tf.reshape(outputs, [-1, _NUM_UNITS]))
                # NOTE(review): `prob` is not referenced again in this function.
                prob = tf.reshape(
                    tf.nn.softmax(logits),
                    [BATCH_SIZE, param.VOCAB_SIZE])  # without length
                # add_gumbel is defined elsewhere; presumably it adds Gumbel
                # noise to its argument -- confirm. When if_test is set the
                # logits are scaled by 1.2 before being perturbed.
                if not if_test:
                    gumbel_t = add_gumbel(logits)
                else:
                    # gumbel_t = logits
                    gumbel_t = add_gumbel(tf.multiply(1.2, logits))

                # Picked token: argmax over axis 1, gradient blocked, int32
                next_token = tf.to_int32(
                    tf.stop_gradient(tf.argmax(gumbel_t, axis=1)))
                # NOTE(review): `next_token_onehot` is not used afterwards.
                next_token_onehot = tf.one_hot(next_token, param.VOCAB_SIZE,
                                               1.0, 0.0)
                x_onehot_appr = tf.multiply(
                    gumbel_t,
                    temperature)  # one-hot-like, [batch_size x vocab_size]

                # Record this step; note gen_o stores logits, not probabilities
                gen_o = gen_o.write(i, logits)
                gen_x = gen_x.write(i, next_token)
                gen_x_onehot_adv = gen_x_onehot_adv.write(
                    i, tf.nn.softmax(x_onehot_appr))
                # The picked token's embedding becomes the next step's input
                x_tp1 = tf.nn.embedding_lookup(self.embedding, next_token)
                return i + 1, x_tp1, state, gen_o, gen_x, gen_x_onehot_adv

            # build a graph for outputting sequential tokens

            _, _, _, self.gen_o, self.gen_x, self.gen_x_onehot_adv = control_flow_ops.while_loop(
                cond=lambda i, _1, _2, _3, _4, _5: i < max_len,
                body=_gen_recurrence,
                loop_vars=(random_start_length + 1,
                           tf.nn.embedding_lookup(
                               self.embedding,
                               decoder_inputs[:, random_start_length]),
                           self.decoder_start_word_state, gen_o, gen_x,
                           gen_x_onehot_adv))
            self.gen_o = tf.transpose(
                self.gen_o.stack(),
                perm=[1, 0, 2])  # batch_size x seq_len x vocab_size
            self.gen_x = tf.transpose(self.gen_x.stack(), perm=[1, 0])
            self.gen_x_onehot_adv = tf.transpose(self.gen_x_onehot_adv.stack(),
                                                 perm=[1, 0, 2])

            temp_list = tf.constant(list(range(100)))
            temp_list = temp_list[:max_len] - 1
            temp_list = tf.tile(tf.reshape(temp_list, [1, max_len]),
                                [BATCH_SIZE, 1])
            self.ifequal = tf.equal(tf.to_int32(tf.argmax(self.gen_o, -1)),
                                    tf.ones_like(self.gen_x))
            self.ifequal = tf.to_int32(self.ifequal[:, :-1])
            self.ifequal = tf.concat(
                [self.ifequal,
                 tf.ones((BATCH_SIZE, 1), tf.int32)], -1)
            self.total_length = tf.multiply(
                tf.to_int32(self.ifequal),
                temp_list) + 10000 * (1 - tf.to_int32(self.ifequal))
            self.gen_x_length = tf.reduce_mean(
                tf.to_float(tf.reduce_min(self.total_length, -1)))
    def __init__(self, args, infer=False):
        '''
        Build the TensorFlow graph of the trajectory-prediction LSTM.

        (x, y) points are embedded with a ReLU layer, passed through a stack
        of dropout-wrapped LSTM cells, and every step's output is mapped to
        the five parameters of a bivariate Gaussian. The cost is the summed
        negative log-likelihood of the targets divided by
        batch_size * seq_length, plus an L2 penalty on all trainable
        variables. It is minimised with RMSProp on globally clipped gradients.

        Params:
        args: hyper-parameter container. Fields read here: batch_size,
              seq_length, learning_rate, num_layers, rnn_size,
              embedding_size, lambda_param and grad_clip.
        infer: when True, batch_size and seq_length are overwritten with 1 on
               `args` itself, so one position is processed at a time.
        '''
        if infer:
            # Sampling mode: a single position per step
            args.batch_size = args.seq_length = 1

        self.args = args

        # ------------------------------------------------------------------
        # Placeholders and learning rate
        # ------------------------------------------------------------------
        # Input and target sequences of (x, y) points; batch dimension is open
        point_seq_shape = [None, args.seq_length, 2]
        self.input_data = tf.placeholder(tf.float32, point_seq_shape)
        self.target_data = tf.placeholder(tf.float32, point_seq_shape)
        # Fraction of LSTM outputs to drop; the placeholder has no default
        self.dropout = tf.placeholder(tf.float32)
        self.lr = tf.Variable(args.learning_rate,
                              trainable=False,
                              name="learning_rate")

        # ------------------------------------------------------------------
        # Recurrent core: num_layers LSTM cells, each with output dropout
        # ------------------------------------------------------------------
        layers = [
            rnn.DropoutWrapper(
                rnn.BasicLSTMCell(args.rnn_size, state_is_tuple=True),
                output_keep_prob=1.0 - self.dropout)
            for _ in range(args.num_layers)
        ]
        self.cell = rnn.MultiRNNCell(layers, state_is_tuple=True)
        # All-zero starting state
        self.initial_state = self.cell.zero_state(batch_size=args.batch_size,
                                                  dtype=tf.float32)

        # ------------------------------------------------------------------
        # Trainable projections
        # ------------------------------------------------------------------
        # Five outputs per step: 2 means, 2 std devs, 1 correlation
        output_size = 5

        # 2-D coordinate -> embedding_size
        with tf.variable_scope("coordinate_embedding"):
            embed_w = tf.get_variable("embedding_w",
                                      [2, args.embedding_size])
            embed_b = tf.get_variable("embedding_b", [args.embedding_size])

        # rnn_size -> distribution parameters
        with tf.variable_scope("rnnlm"):
            proj_w = tf.get_variable(
                "output_w", [args.rnn_size, output_size],
                initializer=tf.truncated_normal_initializer(stddev=0.01),
                trainable=True)
            proj_b = tf.get_variable(
                "output_b", [output_size],
                initializer=tf.constant_initializer(0.01),
                trainable=True)

        # ------------------------------------------------------------------
        # Forward pass
        # ------------------------------------------------------------------
        # One [numPoints x 2] tensor per time step
        steps = tf.split(self.input_data, args.seq_length, 1)
        steps = [tf.squeeze(step, [1]) for step in steps]

        # ReLU embedding of every step
        embedded_steps = [
            tf.nn.relu(tf.add(tf.matmul(step, embed_w), embed_b))
            for step in steps
        ]

        # Unroll the LSTM over the embedded steps; keep the final cell state
        step_outputs, self.final_state = \
            tf.contrib.legacy_seq2seq.rnn_decoder(
                embedded_steps,
                self.initial_state,
                self.cell,
                loop_function=None,
                scope="rnnlm")

        # Stack the per-step outputs into rows of width rnn_size and project
        flat_outputs = tf.reshape(tf.concat(step_outputs, 1),
                                  [-1, args.rnn_size])
        output = tf.nn.xw_plus_b(flat_outputs, proj_w, proj_b)

        # Flatten the targets the same way and separate x from y
        flat_targets = tf.reshape(self.target_data, [-1, 2])
        x_data, y_data = tf.split(flat_targets, 2, 1)

        # ------------------------------------------------------------------
        # Distribution parameters (eq 20 -> 22 of Graves (2013))
        # ------------------------------------------------------------------
        o_mux, o_muy, raw_sx, raw_sy, raw_corr = tf.split(
            output, output_size, 1)
        # exp keeps the std devs positive, tanh keeps the correlation
        # within [-1, 1]
        o_sx = tf.exp(raw_sx)
        o_sy = tf.exp(raw_sy)
        o_corr = tf.tanh(raw_corr)

        self.output = output
        self.mux, self.muy = o_mux, o_muy
        self.sx, self.sy = o_sx, o_sy
        self.corr = o_corr

        # ------------------------------------------------------------------
        # Bivariate normal density of the targets
        # (eq 24 & 25 of Graves (2013))
        # ------------------------------------------------------------------
        dx = tf.subtract(x_data, o_mux)
        dy = tf.subtract(y_data, o_muy)
        sxsy = tf.multiply(o_sx, o_sy)
        # Quadratic form in the exponent
        squares = tf.square(tf.div(dx, o_sx)) + tf.square(tf.div(dy, o_sy))
        cross = 2 * tf.div(tf.multiply(o_corr, tf.multiply(dx, dy)), sxsy)
        z = squares - cross
        one_minus_rho_sq = 1 - tf.square(o_corr)
        numerator = tf.exp(tf.div(-z, 2 * one_minus_rho_sq))
        normaliser = 2 * np.pi * tf.multiply(sxsy,
                                             tf.sqrt(one_minus_rho_sq))
        self.result = tf.div(numerator, normaliser)

        # ------------------------------------------------------------------
        # Cost: negative log-likelihood over all time steps + L2 penalty
        # ------------------------------------------------------------------
        # Floor the density before the log for numerical stability
        epsilon = 1e-20
        neg_log_pdf = -tf.log(tf.maximum(self.result, epsilon))
        lossfunc = tf.reduce_sum(neg_log_pdf)
        data_cost = tf.div(lossfunc, (args.batch_size * args.seq_length))

        tvars = tf.trainable_variables()
        l2 = args.lambda_param * sum(tf.nn.l2_loss(var) for var in tvars)
        self.cost = data_cost + l2

        # ------------------------------------------------------------------
        # Optimisation
        # ------------------------------------------------------------------
        # Gradients are clipped by global norm, as is usual for LSTMs; the
        # original notes say the Social LSTM paper does not mention this.
        self.gradients = tf.gradients(self.cost, tvars)
        grads, _ = tf.clip_by_global_norm(self.gradients, args.grad_clip)

        # RMSProp with the stored learning rate (the original notes prefer it
        # over Adam, following Social LSTM)
        optimizer = tf.train.RMSPropOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))