def test_model_var_inputs():
    # tests a train graph with additional inputs that do not need to be fed (e.g. variable state)
    n_features = 5
    embed_size = 4
    hidden_dim = 3
    seq_size = 3
    out_size = 2
    batch_size = 2

    x = tx.Input(np.random.random([batch_size, seq_size]), n_units=seq_size, dtype=tf.int32)
    y = tx.Input(np.random.random([batch_size, out_size]), n_units=out_size, dtype=tf.float32)
    lookup = tx.Lookup(x, seq_size=seq_size, embedding_shape=[n_features, embed_size])
    # seq = lookup.permute_batch_time()
    seq = tx.Transpose(lookup, [1, 0, 2])

    rnn1 = tx.RNN(seq, cell_config=tx.RNNCell.config(n_units=hidden_dim))
    y_ = tx.Linear(rnn1[seq_size - 1], n_units=out_size)
    # y_ = tx.Linear(tx.SeqConcat(lookup, seq_size=seq_size), n_units=out_size)

    # @tx.layer(n_units=2, dtype=tf.float32, name="loss")
    # def loss(pred, labels):
    #     return tx.mse(pred, labels)

    model = tx.Model(run_inputs=x,
                     run_outputs=y_,
                     train_inputs=[x, y],
                     train_outputs=y_,
                     train_loss=tx.MSE(y_, y))

    # model.draw("test.pdf")
    model.set_optimizer(tf.optimizers.SGD, lr=0.5)

    data1 = [[0, 1, 2], [2, 1, 0]]
    data2 = [[0., 1.], [1., 0.]]
    model.train_step(input_feed={x: data1, y: data2})
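# Minimal sketch (not part of the original tests) of the equivalent run/train step in plain
# tf.keras, assuming only standard Keras APIs; the function name is illustrative and `np`/`tf`
# are the imports already used in this module.
def keras_equivalent_train_step_sketch():
    n_features, embed_size, hidden_dim, out_size = 5, 4, 3, 2
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=n_features, output_dim=embed_size),  # like tx.Lookup
        tf.keras.layers.SimpleRNN(hidden_dim),   # last time step only, like rnn1[seq_size - 1]
        tf.keras.layers.Dense(out_size),         # like tx.Linear
    ])
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.5), loss="mse")
    data1 = np.array([[0, 1, 2], [2, 1, 0]])     # same integer inputs as the test above
    data2 = np.array([[0., 1.], [1., 0.]])       # same float targets
    model.train_on_batch(data1, data2)           # one SGD step, like model.train_step(...)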
def test_attention_rnn_shape():
    """ test attention and rnn layers integration with shape inference """
    x1 = tx.Input(tf.ones([1, 2, 3]), n_units=3, name="x1")
    rnn1 = tx.RNN(x1, cell_config=tx.LSTMCell.config(n_units=4), n_units=4, stateful=False)
    att = tx.MHAttention(rnn1, rnn1, rnn1, n_units=3)

    rnn1_res = rnn1()
    att_res = att()

    assert rnn1.n_units == 4
    assert rnn1.n_units == rnn1.cell.n_units
    assert tx.shape_equal(rnn1.shape[:-1], att.shape[:-1])
    assert att.shape[-1] == att.n_units
    assert tx.shape_equal(rnn1_res.shape[1:], rnn1.shape[1:])
    assert tx.shape_equal(att_res.shape[1:], att.shape[1:])
def test_module_with_attention():
    """ Module + Attention integration

    This also tests Graph indirectly to check if we can add layers
    whose input layers are the same object (e.g. in self-attention)
    """
    x1 = tx.Input(tf.ones([1, 2, 3]), n_units=3, name="x1")
    rnn1 = tx.RNN(x1, cell_config=tx.LSTMCell.config(n_units=4), n_units=4, stateful=False)
    att = tx.MHAttention(rnn1, rnn1, rnn1, n_units=3)

    m = tx.Module(inputs=x1, output=att, dependencies=rnn1.previous_state)
    g = tx.Graph.build(inputs=x1, outputs=m, add_missing_inputs=True)
    fn = g.as_function(ord_inputs=x1, ord_outputs=m)

    # this returns a tuple
    out1 = g.compute(tf.ones([1, 2, 3]))
    # this returns the function result
    out2 = fn(tf.ones([1, 2, 3]))

    assert tx.tensor_equal(out1[0], out2)
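# Minimal sketch (plain tf.keras, not the tested library) of the same "query/key/value are the
# same object" pattern using tf.keras.layers.MultiHeadAttention; it illustrates the self-attention
# case the docstring above refers to. Names and shapes are illustrative.
def keras_self_attention_sketch():
    x = tf.ones([1, 2, 3])  # [batch, time, features]
    mha = tf.keras.layers.MultiHeadAttention(num_heads=1, key_dim=3)
    y = mha(query=x, value=x, key=x)  # self-attention: all three arguments are the same tensor
    assert y.shape == (1, 2, 3)       # output is projected back to the query's feature dimension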
    kernel_u = tx.Concat(*kernel_u)
    tx_kernel = tx.Merge(kernel_w, kernel_u, merge_fn=lambda l: tf.concat(l, axis=0))
    # kernel = tx.Reshape(kernel, [-1, 4 * cell_units])

    tf_zero_state = tf_cell.zero_state(batch_size, dtype=tf.float32)
    tf_out, tf_state = tf_cell(t1.tensor, state=tf_zero_state)

    # inject the tensorx kernel (internal parameters) into the TensorFlow LSTM cell
    tf_cell._kernel = tx_kernel
    tf_out, tf_state = tf_cell(t1.tensor, state=tf_zero_state)

    tx_rnn = tx.RNN(seq, cell_proto=lambda x, **kwargs: tx_cell.reuse_with(x, **kwargs), stateful=False)
    tx_rnn = tx.Transpose(tx_rnn, [1, 0, 2])

    # time_major maintains the format of the input in the output:
    # if the input is time major, the output is time major;
    # if it is batch major, the output is batch major
    tf_rnn, tf_state = tf.nn.dynamic_rnn(
        cell=tf_cell,
        inputs=lookup.tensor,
        sequence_length=None,
        initial_state=tf_zero_state,
        time_major=False,
    )

    with tf.Session() as sess:
    def __init__(self,
                 inputs,
                 labels,
                 vocab_size,
                 embed_dim,
                 h_dim,
                 embed_init=tx.zeros_init(),
                 logit_init=tx.glorot_uniform(),
                 num_h=1,
                 h_activation=tx.tanh,
                 h_init=tx.glorot_uniform(),
                 w_dropconnect=None,
                 u_dropconnect=None,
                 r_dropout=0.4,
                 y_dropout=0.4,
                 embed_dropout=0.3,
                 other_dropout=0.3,
                 l2_loss=False,
                 l2_weight=1e-5,
                 use_f_predict=False,
                 f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 embed_share=False,
                 logit_bias=False,
                 use_nce=False,
                 nce_samples=10,
                 skip_connections=False):

        if not isinstance(inputs, tx.Input):
            raise TypeError("inputs must be an Input layer")
        self.inputs = inputs
        self.labels = labels
        if not isinstance(labels, tx.Input):
            raise TypeError("labels must be an Input layer")

        if inputs.dtype != tf.int32 and inputs.dtype != tf.int64:
            raise TypeError("Invalid dtype for input: expected int32 or int64, got {}".format(inputs.dtype))

        if num_h < 0:
            raise ValueError("num hidden should be >= 0")

        # ===============================================
        # RUN GRAPH
        # ===============================================
        var_reg = []

        with tf.name_scope("run"):
            # feature lookup
            embeddings = tx.Lookup(inputs,
                                   seq_size=None,
                                   lookup_shape=[vocab_size, embed_dim],
                                   weight_init=embed_init)
            var_reg.append(embeddings.weights)
            feature_lookup = embeddings.permute_batch_time()

            last_layer = feature_lookup

            cell_proto = tx.LSTMCell.proto(n_units=h_dim,
                                           activation=h_activation,
                                           gate_activation=tx.hard_sigmoid,
                                           w_init=h_init,
                                           u_init=h_init,
                                           w_dropconnect=w_dropconnect,
                                           u_dropconnect=u_dropconnect,
                                           r_dropout=r_dropout,
                                           x_dropout=None,
                                           y_dropout=y_dropout,
                                           regularized=False,
                                           name="cell")

            lstm_layers = []
            for i in range(num_h):
                lstm_layer = tx.RNN(last_layer,
                                    cell_proto=cell_proto,
                                    regularized=False,
                                    stateful=True,
                                    name="LSTM_{}".format(i + 1))

                lstm_layers.append(lstm_layer)

                var_reg += [wi.weights for wi in lstm_layer.cell.w]
                var_reg += [ui.weights for ui in lstm_layer.cell.u]

                last_layer = lstm_layer

            # the last time step is the state used to make the prediction
            # last_layer = tx.Reshape(last_layer, [-1, h_dim])

            # TODO this is not consistent with locked dropout for the last layer
            #  where the same mask should be applied across time steps.
            #  To do this I need either y_dropout to be available or some sort of map
            #  operation I can use with layers outputting 3D tensors,
            #  something equivalent to https://keras.io/layers/wrappers/ which applies
            #  a layer to every temporal slice of an input.
            #  They implement this the same way they implement an RNN
            #  (see the TimeDistributed sketch below)

            # feature prediction for Energy-Based Model
            if use_f_predict:
                last_layer = tx.Linear(last_layer, embed_dim, f_init, add_bias=True, name="f_predict")

                # proto = tx.GRUCell.proto(n_units=embed_dim,
                #                          activation=h_activation,
                #                          gate_activation=tx.hard_sigmoid,
                #                          w_init=h_init,
                #                          u_init=h_init,
                #                          w_dropconnect=w_dropconnect,
                #                          u_dropconnect=u_dropconnect,
                #                          r_dropout=r_dropout,
                #                          x_dropout=None,
                #                          y_dropout=y_dropout,
                #                          regularized=False)
                # last_layer1 = tx.RNN(last_layer, cell_proto=proto, regularized=False, stateful=False)
                # last_layer2 = last_layer1.reuse_with(last_layer, reverse=True)
                # last_layer = tx.Add(last_layer1, last_layer2)
                # last_layer = tx.Module(last_layer, last_layer)
                var_reg += last_layer.variables
                # var_reg.append(last_layer.weights)

            f_predict = last_layer

            shared_weights = feature_lookup.weights if embed_share else None
            transpose_weights = embed_share
            logit_init = logit_init if not embed_share else None
            run_logits = tx.Linear(last_layer,
                                   n_units=vocab_size,
                                   weight_init=logit_init,
                                   shared_weights=shared_weights,
                                   transpose_weights=transpose_weights,
                                   add_bias=logit_bias,
                                   name="logits")
            if not embed_share:
                var_reg.append(run_logits.weights)

            run_output = tx.Activation(run_logits, tx.softmax, name="run_output")

        # ===============================================
        # TRAIN GRAPH
        # ===============================================
        with tf.name_scope("train"):
            embeddings = embeddings.reuse_with(inputs)
            feature_lookup = embeddings.permute_batch_time()
            if embed_dropout:
                feature_lookup = tx.Dropout(feature_lookup, probability=embed_dropout, name="drop_features")

            last_layer = feature_lookup

            for i in range(num_h):
                lstm_layer = lstm_layers[i].reuse_with(last_layer, regularized=True)
                last_layer = lstm_layer

            # last_layer = tx.Reshape(last_layer, [-1, h_dim])

            # feature prediction for Energy-Based Model
            if use_f_predict:
                # last_layer = f_predict.reuse_with(last_layer)
                last_layer = f_predict.reuse_with(last_layer, regularized=True)

            last_layer = tx.Dropout(last_layer, probability=other_dropout, locked=False)

            train_logits = run_logits.reuse_with(last_layer, name="train_logits")
            train_output = tx.Activation(train_logits, tx.softmax, name="train_output")

            def categorical_loss(labels, logits):
                # labels come as a batch of classes [[1,2],[3,4]] -> [1,3,2,4];
                # time steps are ordered to match the logits
                labels = tx.Transpose(labels)
                labels = tx.Reshape(labels, [-1])
                labels = tx.dense_one_hot(labels, num_cols=vocab_size)
                loss = tx.categorical_cross_entropy(labels=labels, logits=logits)
                return tf.reduce_mean(loss)

            def nce_loss(labels, weights, bias, predict):
                noise = uniform_sampler(labels, 1, nce_samples, True, vocab_size)
                loss = tf.nn.nce_loss(weights=weights,
                                      biases=bias,
                                      inputs=predict,
                                      labels=labels,
                                      num_sampled=nce_samples,
                                      num_classes=vocab_size,
                                      num_true=1,
                                      sampled_values=noise)
                return tf.reduce_mean(loss)

            if use_nce:
                bias = tx.VariableLayer(var_shape=[vocab_size], name="nce_bias")

                # wraps a layer to expose its weights as a layer, with the wrapped layer as its input
                nce_weights = tx.WrapLayer(embeddings,
                                           n_units=embeddings.n_units,
                                           wrap_fn=lambda x: x.weights,
                                           layer_fn=True)
                train_loss = tx.LambdaLayer(labels, nce_weights, bias, last_layer,
                                            apply_fn=nce_loss,
                                            name="nce_loss")
            else:
                train_loss = tx.LambdaLayer(labels, train_logits,
                                            apply_fn=categorical_loss,
                                            name="train_loss")

            if l2_loss:
                l2_losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = tx.LambdaLayer(train_loss,
                                            apply_fn=lambda x: x + l2_weight * tf.add_n(l2_losses),
                                            name="train_loss_l2")
        # ===============================================
        # EVAL GRAPH
        # ===============================================
        with tf.name_scope("eval"):
            eval_loss = tx.LambdaLayer(labels, run_logits,
                                       apply_fn=categorical_loss,
                                       name="eval_loss")

        self.stateful_layers = lstm_layers

        # BUILD MODEL
        super().__init__(run_outputs=run_output,
                         run_inputs=inputs,
                         train_inputs=[inputs, labels],
                         train_outputs=train_output,
                         train_loss=train_loss,
                         eval_inputs=[inputs, labels],
                         eval_outputs=run_output,
                         eval_score=eval_loss)
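# Minimal sketch of the "apply a layer to every temporal slice" wrapper mentioned in the TODO
# inside the model above. This is plain tf.keras (TimeDistributed), not part of this model or of
# tensorx; the function name and shapes are illustrative only.
def time_distributed_sketch():
    seq = tf.ones([2, 3, 6])                                     # [batch, time, features]
    per_step_linear = tf.keras.layers.Dense(4)
    out = tf.keras.layers.TimeDistributed(per_step_linear)(seq)  # same Dense weights applied at every time step
    assert out.shape == (2, 3, 4)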
def test_rnn_layer():
    n_features = 5
    embed_size = 4
    hidden_dim = 3
    seq_size = 3
    batch_size = 2

    inputs = tx.Input(np.random.random([batch_size, seq_size]), n_units=seq_size, dtype=tf.int32)
    lookup = tx.Lookup(inputs, seq_size=seq_size, embedding_shape=[n_features, embed_size])
    seq = lookup.permute_batch_time()

    ones_state = tf.ones([batch_size, hidden_dim])
    zero_state = tf.zeros([batch_size, hidden_dim])

    rnn_proto = tx.RNNCell.config(n_units=hidden_dim)
    rnn1 = tx.RNN(seq, cell_config=rnn_proto, previous_state=ones_state, return_state=True)
    rnn2 = rnn1.reuse_with(seq)

    # the problem with the RNN layer is that it uses modules that require
    # all the params to output the right answer:
    # we need to supply default values for the rest of the inputs, or for all of them
    out1, last1 = rnn1()
    out2, last2 = rnn2()

    assert tx.tensor_equal(out1, out2)
    assert tx.tensor_equal(last1, last2)

    rnn3 = rnn1.reuse_with(seq, zero_state)
    rnn4 = rnn3.reuse_with(seq)
    rnn5 = rnn4.reuse_with(seq, ones_state)

    assert tx.tensor_equal(rnn2.previous_state, rnn1.previous_state)
    assert tx.tensor_equal(rnn3.previous_state, rnn4.previous_state)

    out3, last3 = rnn3()
    out4, last4 = rnn4()

    assert tx.tensor_equal(out3, out4)
    assert tx.tensor_equal(last3, last4)

    cell_state1 = rnn1.cell.previous_state[0]()
    cell_state2 = rnn2.cell.previous_state[0]()
    cell_state3 = rnn3.cell.previous_state[0]()
    cell_state4 = rnn4.cell.previous_state[0]()

    assert len(rnn1.cell.previous_state) == 1
    assert tx.tensor_equal(cell_state1, cell_state2)
    assert tx.tensor_equal(cell_state3, cell_state4)

    assert not tx.tensor_equal(out1, out3)

    out5, last5 = rnn5()

    assert tx.tensor_equal(out1, out5)
    assert tx.tensor_equal(last1, last5)
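# Minimal sketch (plain tf.keras, not the tensorx API) of the property the test above checks:
# a single recurrent layer (shared weights) gives equal outputs for equal initial states and
# different outputs when the initial state changes. Names are illustrative.
def keras_shared_state_sketch():
    batch_size, hidden_dim = 2, 3
    x = tf.ones([batch_size, 3, 4])  # [batch, time, features]
    rnn = tf.keras.layers.SimpleRNN(hidden_dim, return_state=True)
    ones_state = tf.ones([batch_size, hidden_dim])
    zero_state = tf.zeros([batch_size, hidden_dim])
    out_a, last_a = rnn(x, initial_state=ones_state)
    out_b, last_b = rnn(x, initial_state=ones_state)  # same weights, same state -> same output
    out_c, _ = rnn(x, initial_state=zero_state)       # same weights, different state -> different output
    assert np.allclose(out_a.numpy(), out_b.numpy())
    assert np.allclose(last_a.numpy(), last_b.numpy())
    assert not np.allclose(out_a.numpy(), out_c.numpy())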
def test_lstm_rnn_stateful():
    n_units = 4
    batch_size = 12
    seq_size = 3
    n_features = 16
    embed_size = 6

    feature_indices = np.random.randint(0, high=n_features, size=[batch_size, seq_size])

    inputs = tx.Input(init_value=feature_indices, n_units=seq_size, dtype=tf.int32)
    lookup = tx.Lookup(inputs, seq_size=seq_size, embedding_shape=[n_features, embed_size])
    seq = lookup.permute_batch_time()  # (N, T, M)
    # print(np.shape(seq()))

    lstm_cell = tx.LSTMCell.config(n_units=n_units,
                                   activation=tf.tanh,
                                   gate_activation=tf.sigmoid,
                                   forget_bias_init=tf.initializers.ones())

    # state0 = [s() for s in lstm0.previous_state]
    # inputs.value = tf.ones([batch_size, n_features])
    # res1 = lstm1(inputs, state0)
    # res1_ = lstm1(inputs, state0)

    lstm_layer = tx.RNN(input_seq=seq, cell_config=lstm_cell, stateful=True, return_state=True)
    state0 = [s() for s in lstm_layer.previous_state]

    lstm_layer()
    state1 = [s() for s in lstm_layer.previous_state]

    for i in range(len(state0)):
        assert not tx.tensor_equal(state0[i], state1[i])
    assert np.shape(state1[0]) == (batch_size, n_units)

    tx_cell = lstm_layer.cell
    kernel = tf.concat([w.weights.value() for w in tx_cell.w], axis=-1)
    recurrent_kernel = tf.concat([u.weights.value() for u in tx_cell.u], axis=-1)
    bias = tf.concat([w.bias.value() for w in tx_cell.w], axis=-1)

    # create a keras lstm and update it with the same cell state;
    # since the keras LSTM initializes the cell state internally, this was
    # the only way to initialize that state from the tensorx state
    class FromOther(tf.keras.initializers.Initializer):
        def __init__(self, value):
            self.value = value

        def __call__(self, shape, dtype=None):
            if not tf.TensorShape(shape).is_compatible_with(tf.shape(self.value)):
                raise Exception(f"init called with shape {shape} != value shape {tf.shape(self.value)}")
            else:
                return self.value

    # seq = lookup()
    # seq = tf.transpose(seq, [1, 0, 2])
    # lstm_cell = tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=n_units)
    # lstm_cell.build(np.shape(seq[0]))
    # full_kernel = tf.concat([kernel, recurrent_kernel], axis=0)
    # lstm_cell = (full_kernel, bias)
    # lstm_cell.weights[0] = full_kernel
    # lstm_cell.weights[1] = bias
    # print(type())
    # print(lstm_cell(seq[0], state=tuple(state1)))

    # rnn = tf.keras.layers.RNN(cell=lstm_cell,
    #                           dtype=tf.float32,
    #                           return_sequences=True,
    #                           time_major=True,
    #                           unroll=False)
    # print(rnn(seq))
    # print(lstm_layer())
    # tf_lstm_output = rnn(seq, tuple(state1))
    # tx_lstm_output = lstm_layer()

    keras_lstm = tf.keras.layers.LSTM(units=n_units,
                                      activation=tf.tanh,
                                      kernel_initializer=FromOther(kernel.numpy()),
                                      recurrent_initializer=FromOther(recurrent_kernel.numpy()),
                                      bias_initializer=FromOther(bias.numpy()),
                                      recurrent_activation=tf.sigmoid,
                                      unit_forget_bias=False,
                                      implementation=2,
                                      time_major=True,
                                      unroll=True,
                                      return_sequences=True,
                                      stateful=False)

    # lookup is of form [batch x features x input_dim] instead of [features x batch x input_dim]
    keras_lstm_output = keras_lstm(seq(), initial_state=tuple(state1))

    assert tx.tensor_equal(keras_lstm.cell.kernel.value(), kernel)
    assert tx.tensor_equal(keras_lstm.cell.recurrent_kernel.value(), recurrent_kernel)
    assert tx.tensor_equal(keras_lstm.cell.bias.value(), bias)

    tx_lstm_output = lstm_layer()[0]
    assert tx.tensor_all_close(keras_lstm_output, tx_lstm_output)
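# Hedged alternative sketch (standard tf.keras API only) to the FromOther initializer used above:
# build the keras LSTM first, then inject the tensorx weights with set_weights, which for
# tf.keras.layers.LSTM expects [kernel, recurrent_kernel, bias] in that order. The function name
# and the explicit build shape are illustrative assumptions.
def keras_lstm_from_weights_sketch(kernel, recurrent_kernel, bias, n_units, embed_size):
    lstm = tf.keras.layers.LSTM(units=n_units,
                                activation=tf.tanh,
                                recurrent_activation=tf.sigmoid,
                                unit_forget_bias=False,
                                time_major=True,
                                return_sequences=True)
    # [time, batch, input_dim] since time_major=True; only the last dimension matters for the kernel shapes
    lstm.build(tf.TensorShape([None, None, embed_size]))
    lstm.set_weights([kernel.numpy(), recurrent_kernel.numpy(), bias.numpy()])
    return lstm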